tokenise labels and constants! Now assembly highkey fails but ok

This commit is contained in:
andromeda
2026-03-30 16:09:25 +02:00
parent b1e7d2e3d5
commit f789d49e8a
3 changed files with 342 additions and 112 deletions

View File

@@ -2,16 +2,21 @@
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
STACK_ADDR equ 0x00030000 ; address to put the 64-bit stack at
AWAITING_LABEL_TABLE_ADDR equ 0x00030000 ; address to store pending labels at
AWAITING_LABEL_TABLE_SIZE equ 0x00010000
LABEL_TABLE_ADDR equ 0x00040000 ; address to store labels at
LABEL_TABLE_SIZE equ 0x00010000
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use
TEST_ARENA_SIZE equ 0x00010000 ; maximum size tests can use
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
TOKEN_TABLE_SIZE equ 0x00010000 ; max length of table
OUTPUT_ADDR equ 0x00070000 ; address of outputed binary
OUTPUT_SIZE equ 0x1000 ; max length of outputed binary
STACK_ADDR equ 0x00060000 ; address to put the 64-bit stack at
OUTPUT_SIZE equ 0x00010000 ; max length of outputed binary
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
UNRECOGNISED_ID_TYPE equ 0x0F ; type of an unrecognised id
@@ -27,6 +32,7 @@ E_WHITESPACE equ 1 << 2
E_COMMA equ 1 << 3
E_OPERATOR equ 1 << 4
E_OPERAND equ 1 << 5
E_LABEL equ 1 << 6
[bits 64]
[org LOAD_ADDR]
@@ -43,6 +49,7 @@ start:
call run_tests
call clear_token_table
call clear_label_tables
mov rdi, program ; -> program
mov rsi, [program.size] ; = size of program
@@ -98,6 +105,7 @@ assemble:
cmp al, 0x1 ; check if next tte is an operator
je .operator ; if so, handle
jmp .unexpected_token ; otherwise, fail
.operator: ; if next tte's type is an operator:
@@ -120,7 +128,6 @@ assemble:
je .operator_2 ; if so, handle case of two operands
jmp .unexpected_token
.operator_0:
mov rsi, .msg_operator_0
call print.debug
@@ -133,7 +140,6 @@ assemble:
call .write_byte
jmp .loop_next_token
.operator_1:
mov rsi, .msg_operator_1
call print.debug
@@ -167,12 +173,10 @@ assemble:
je .operator_1_register
jmp .unexpected_token
.operator_1_memory:
mov rsi, .msg_operator_1_memory
call print.debug
jmp .unsupported_memory_access
.operator_1_register:
mov rsi, .msg_operator_1_register
call print.debug
@@ -213,7 +217,6 @@ assemble:
call .write_byte
jmp .loop_next_token
.operator_2:
mov rsi, .msg_operator_2
call print.debug
@@ -240,7 +243,6 @@ assemble:
je .operator_2_register
jmp .unexpected_token
.operator_2_memory:
mov rsi, .msg_operator_2_memory
call print.debug
@@ -319,7 +321,6 @@ assemble:
cmp al, 11b ; 64 bit
je .operator_2_memory_register_64
.operator_2_memory_register_16:
mov al, 0x66
call .push_byte
@@ -337,7 +338,6 @@ assemble:
call .write_byte
jmp .loop_next_token
.operator_2_register:
mov rsi, .msg_operator_2_register
call print.debug
@@ -398,7 +398,6 @@ assemble:
je .operator_2_register_register ; if so, handle
jmp .unexpected_token
.operator_2_register_memory:
push rsi
mov rsi, .msg_operator_2_register_memory
@@ -441,7 +440,6 @@ assemble:
call .write_byte
jmp .loop_next_token
.operator_2_register_register:
push rsi
mov rsi, .msg_operator_2_register_register
@@ -543,7 +541,6 @@ assemble:
jmp .operator_2_register_register_continue
.operator_2_register_register_continue:
push rsi
mov esi, edi ; si = reg; src tte
pop rdi ; di = r/m; dst tte
@@ -658,7 +655,7 @@ assemble:
call .output_byte
mov byte [ecx], 0x00
jmp .flush_write_buffer_loop
.flush_write_buffer_break
.flush_write_buffer_break:
mov dword [.buffer_pointer], .buffer
pop rax
pop rcx
@@ -680,6 +677,7 @@ assemble:
.msg_operator_2_register db "operator_2_register", 0x0A, 0x00
.msg_operator_2_register_memory db "operator_2_register_memory", 0x0A, 0x00
.msg_operator_2_register_register db "operator_2_register_register", 0x0A, 0x00
.msg_potential_label db "potential_label", 0x0A, 0x00
; ------------------------------------------------------------------------------
; get_tte_type
@@ -956,9 +954,11 @@ tokenise:
pop rsi ; rsi -> last byte of program
jnz .skip_byte_whitespace
test byte [.expecting], E_OPERATOR ; check if an operator is expected
test byte [.expecting], E_LABEL ; check if a label is expected
jnz .label ; if so, handle it
test byte [.expecting], E_OPERATOR ; else, check if an operator is expected
jnz .operator ; if so, handle it
jmp .operand ; otherwise, handle as an operand
jmp .operand ; else, handle as an operand
.comment:
push rsi
@@ -1021,11 +1021,72 @@ tokenise:
test byte [.expecting], E_NEWLINE ; make sure a newline was expected
jz .unexpected_newline ; if not, error
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR | E_LABEL
inc rdi
jmp .loop
; --- label handling ---------------------------------------------------------
; Entered from the tokeniser dispatch when E_LABEL is set in .expecting.
; Scans forward from rdi looking for a ':' that terminates a label name.
;   rdi -> first byte of the candidate label
;   rsi -> last byte of program (per the comments on the pops below)
;   rax =  number of tokens processed (saved on the stack while rax is reused
;          as the byte counter)
; If a space, newline, NUL or ';' is seen before a ':', the text is not a
; label: E_LABEL is dropped from .expecting and control returns to .loop with
; rdi unchanged so the same bytes are re-examined.
.label:
push rax ; save token count; popped again in .label_break / .label_not_found
xor eax, eax ; rax = number of bytes in label
.label_loop:
mov dl, [rdi + rax] ; next byte
cmp dl, ":"
je .label_break ; colon found: rax = length of the label name
cmp dl, " "
je .label_not_found
cmp dl, 0x0A
je .label_not_found
cmp dl, 0x00
je .label_not_found
cmp dl, ";"
je .label_not_found
inc eax ; inc byte counter
; NOTE(review): rdi is never advanced inside this loop (only rax is), so this
; comparison gives the same result on every iteration; it looks like the
; intended bound check is on rdi + rax - confirm.
; NOTE(review): jge is a signed comparison on what are addresses, and the jump
; to .break happens with the token count still pushed; .break is not visible
; here, so confirm it tolerates the extra stack word.
cmp rdi, rsi
jge .break
jmp .label_loop
.label_break:
; debug output: prints the string at .found, then .msg_label
push rsi
mov rsi, .found
call print.debug
mov rsi, .msg_label
call print
pop rsi ; rsi -> last byte of program
; preserve length / position / end pointer across the helper calls
push rax
push rdi
push rsi
mov rsi, rdi ; rsi -> start of string
mov rdi, rax ; rdi = size of string
call djb2
; rax = hash
mov rdi, rax ; rdi = hash
call add_label_hash
; rax = index on label table
; build the token word: low 12 bits carry the label index, the top nibble is
; set to 0x3 (the same tagging .operand_label applies to label operands)
mov cx, ax
and cx, 0x0FFF
or cx, 0x3000
pop rsi ; rsi -> last byte of program
pop rdi ; rdi -> current byte of program
pop rax ; rax = number of bytes in label
add rdi, rax ; move on to next byte
inc rdi ; move past the colon
pop rax ; rax = number of tokens processed
mov [TOKEN_TABLE_ADDR + rax * 2], cx ; token table entries are 2 bytes each
inc rax ; the next token
; NOTE(review): neither E_OPERATOR nor E_LABEL is set here - confirm that an
; instruction following a label on the same line is meant to be handled
; elsewhere.
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE
jmp .loop
.label_not_found:
; not a label: restore the token count and retry the same text without E_LABEL
pop rax ; rax = number of tokens processed
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR
jmp .loop
.operator:
; debug message
push rsi
@@ -1038,7 +1099,6 @@ tokenise:
mov rcx, rax ; rcx = number of tokens processed
xor eax, eax ; eax = number of bytes in operator
mov [.pending_operator], eax ; zero pending operator
.operator_loop:
; TODO give this its own error
@@ -1063,7 +1123,6 @@ tokenise:
cmp rdi, rsi
jge .break
jmp .operator_loop ; and loop
.operator_break:
; rax already pushed from .operator
push rdi
@@ -1097,7 +1156,6 @@ tokenise:
push rax
push rdi
xor eax, eax ; rax = length of operand
.operand_loop:
mov dl, [rdi]
@@ -1113,7 +1171,6 @@ tokenise:
inc rax ; inc length counter
inc rdi ; inc byte pointer
jmp .operand_loop
.operand_break:
pop rdi ; rdi -> first byte of operand
push rdi
@@ -1121,9 +1178,11 @@ tokenise:
mov rsi, rax ; rsi = length of operand in bytes
mov cx, ax ; cx = length counter for safe keeping
push rcx
call evaluate_operand
; dl = return code
; rax = binary data
pop rcx
pop rsi
pop rdi ; rdi = first byte of operand
add di, cx ; rdi = last byte of operand
@@ -1131,31 +1190,48 @@ tokenise:
pop rax ; rax = number of tokens processed
; operand is some reg
; cx = token ID
cmp dl, 0x00
; cx = token ID
je .operand_register
; operand is some [reg]
; cx = token ID
cmp dl, 0x10
; cx = token ID
je .operand_addr_register
jmp .unexpected_operand
; operand is some constant
cmp dl, 0x20
; rcx = constant value
je .operand_constant
; cx = token ID
; operand is some label
cmp dl, 0x30
; rcx = index of label in LT
je .operand_label
jmp .unexpected_operand
.operand_register:
mov [TOKEN_TABLE_ADDR + rax * 2], cx
inc rax ; another token processed
jmp .operand_break_continue
; cx = token ID
.operand_addr_register:
mov word [TOKEN_TABLE_ADDR + rax * 2], 0x1000
inc rax ; 0x1000: addr reg token, next token is the register
mov [TOKEN_TABLE_ADDR + rax * 2], cx
inc rax ; the register as returned by evaluate_operand
jmp .operand_break_continue
.operand_constant:
mov word [TOKEN_TABLE_ADDR + rax * 2], 0x2000
inc rax ; another token processed
mov [TOKEN_TABLE_ADDR + rax * 2], rcx
add rax, 4
jmp .operand_break_continue
.operand_label:
and cx, 0x0FFF
or cx, 0x3000
mov [TOKEN_TABLE_ADDR + rax * 2], cx
inc rax
jmp .operand_break_continue
.operand_break_continue:
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_COMMA
jmp .loop
@@ -1164,8 +1240,7 @@ tokenise:
ret
; state
.expecting db E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR
.expecting db E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR | E_LABEL
.unexpected_whitespace:
mov rsi, .err_unexpected
@@ -1209,6 +1284,7 @@ tokenise:
.msg_comment db "comment.", 0x0A, 0x00
.msg_newline db "newline.", 0x0A, 0x00
.msg_comma db "comma.", 0x0A, 0x00
.msg_label db "label.", 0x0A, 0x00
.msg_operator db "operator.", 0x0A, 0x00
.msg_operand db "operand.", 0x0A, 0x00
.pending_operator dd 0 ; the operator token that is pending processing
@@ -1220,10 +1296,12 @@ tokenise:
; takes the location and length of an operand and evaluates it into binary data
; and a return code to interpret the binary data.
;
; | code | rsi contents | notes |
; | code | rax contents | notes |
; |------|----------------------|-------|
; | 0x00 | token ID of register | reg |
; | 0x10 | token ID of register | [reg] |
; | 0x20 | constant value | const |
; | 0x30 | index of label in LT | label |
; | 0xFF | - | error |
;
; parameters:
@@ -1239,6 +1317,7 @@ evaluate_operand:
push rdi ; rdi -> start of operand
; rsi = size of operand
call trim_trailing_whitespace
; rax = new size of operand
pop rdi ; rdi -> first byte of operand
mov rsi, rax ; rsi = size of operand w/o trailing whitespace
@@ -1249,7 +1328,7 @@ evaluate_operand:
cmp byte [rdi], '[' ; case: memory addressing
je .address
jmp .register ; otherwise: register
jmp .register ; otherwise: register (or constant, or label)
.address:
cmp byte [rdi + rsi - 1], ']' ; check if address is closed correctly
@@ -1262,12 +1341,13 @@ evaluate_operand:
cmp dl, 0x10 ; make sure return code isn't another memory reference
je .unrecognised ; if it is, fail
or dl, 0x10 ; flip bit for address return
shr edx, 4
or dl, 0x10 ; address return
ret
.register:
cmp rsi, 4
jg .unrecognised
jg .constant ; not a register: too long. Maybe constant?
push rdi
mov edi, [rdi] ; edi = register to be searched
@@ -1288,18 +1368,47 @@ evaluate_operand:
.register3:
and edi, 0xFFFFFF
.register4:
call identify_register
; ax = register's token ID or UNRECOGNISED_TOKEN_ID
pop rdi
pop rdi ; rdi -> first byte of operand
cmp ax, UNRECOGNISED_TOKEN_ID
je .unrecognised
cmp ax, UNRECOGNISED_TOKEN_ID ; if not a register, constant?
je .constant
mov dl, 0x00
ret
; --- constant / label fallback ------------------------------------------------
; Reached when the operand is not a register. The operand is first offered to
; evaluate_constant; if that reports 0xFF in dl the operand is treated as a
; label, hashed with djb2 and passed to add_label_hash. Every operand that
; arrives here therefore returns either 0x20 (constant) or 0x30 (label).
.constant:
; pushed as rdi, rsi and popped below as rdi, rsi: the two values come back
; swapped on purpose, which is the argument order djb2 is called with in .label
push rdi
push rsi
; rdi -> first byte of constant
; rsi = size of constant in bytes
call evaluate_constant
; dl = type of constant
; rax = hex value of constant
pop rdi ; rdi = size of label in bytes
pop rsi ; rsi -> first byte of label
cmp dl, 0xFF ; 0xFF: evaluate_constant did not recognise the operand
je .label
; rax = hex value of constant
mov dl, 0x20 ; return code: constant (the type reported in dl is discarded)
ret
.label:
; rdi = size of label in bytes
; rsi -> first byte of label
call djb2
; rax = hash
mov rdi, rax ; rdi = hash
call add_label_hash
; rax = index in LT of label
mov dl, 0x30 ; return code: label
ret
.unrecognised:
; error return: rax = 0, dl = 0xFF
xor eax, eax
mov dl, 0xFF
ret
@@ -1353,6 +1462,7 @@ evaluate_constant:
je .chr
pop rcx
push rcx ; waste value; .unrecognise expects something on the stack
jmp .unrecognised
.numeric:
@@ -1396,12 +1506,10 @@ evaluate_constant:
cmp dl, 9 ; if !digit:
jg .hex_alpha ; letter
jmp .hex_continue ; else loop
.hex_alpha:
sub dl, 7 ; map [('A'-'0')..('F'-'0')] to [0xA..0xF]
cmp dl, 0xF ; if not in the range [0xA..0xF]
jg .unrecognised ; then unrecognised
.hex_continue:
and dl, 0x0F ; mask
or al, dl ; and add newest nibble
@@ -1471,14 +1579,12 @@ evaluate_constant:
inc rdi
jmp .chr
.chr_break:
cmp rcx, 1 ; for each [1..rcx]
jle .chr_break_for_good
rol rax, 8 ; roll left to make up for the roll right earlier
dec rcx
jmp .chr_break
.chr_break_for_good:
mov dl, [rdi] ; make sure the chr is closed
cmp dl, '"'
@@ -1492,8 +1598,9 @@ evaluate_constant:
.unrecognised:
pop rdx
mov rdx, 0xFF ; unrecognised type
mov edx, 0xFF ; unrecognised type
ret
.msg db "evaluate_constant", 0x0A, 0x00
; ------------------------------------------------------------------------------
; identify_register
@@ -1750,6 +1857,38 @@ trim_trailing_whitespace:
mov rax, rsi
ret
; ------------------------------------------------------------------------------
; add_label_hash
;
; description:
; adds a label hash to the label table, or just finds it if already present.
; the table is probed linearly from the start in 16-byte strides; the first 8
; bytes of each slot hold the hash and a stored value of 0 marks a free slot.
;
; parameters
; rdi = 64-bit hash to be added
;
; returned
; rax = index in label table (slot number, i.e. byte offset / 16).
;       if the table is full and the hash is not present, nothing is written
;       and rax = LABEL_TABLE_SIZE / 16, which is one past the last valid index
;
; clobbers
; rcx, flags
;
; notes
; - a hash value of 0 cannot be told apart from a free slot
; - TODO bug if there's an empty slot before the entry, it won't be found
; ------------------------------------------------------------------------------
add_label_hash:
    xor eax, eax                        ; rax = byte offset of the slot being probed
.loop:
    cmp rax, LABEL_TABLE_SIZE
    jae .done                           ; past the end: table full, do NOT store
                                        ; (a store here would land outside the table)
    mov rcx, [LABEL_TABLE_ADDR + rax]   ; rcx = hash currently held by this slot
    test rcx, rcx                       ; empty slot?
    jz .store                           ; yes: claim it for this hash
    cmp rcx, rdi                        ; already present?
    je .done                            ; yes: slot already holds rdi, no store needed
    add rax, 16                         ; advance to the next slot
    jmp .loop
.store:
    mov [LABEL_TABLE_ADDR + rax], rdi   ; record the hash in the free slot
.done:
    shr rax, 4                          ; byte offset -> slot index (offset / 16)
    ; rax = index
    ret
; ------------------------------------------------------------------------------
; clear_token_table
;
@@ -1764,6 +1903,26 @@ clear_token_table:
rep stosd
ret
; ------------------------------------------------------------------------------
; clear_label_tables
;
; description:
; zero-fills two regions, in this order:
;   1. the label table          (LABEL_TABLE_ADDR, LABEL_TABLE_SIZE bytes)
;   2. the awaiting label table (AWAITING_LABEL_TABLE_ADDR,
;                                AWAITING_LABEL_TABLE_SIZE bytes)
;
; clobbers
; eax, ecx, edi, flags
;
; notes
; the fill uses rep stosd and does not set the direction flag itself
; ------------------------------------------------------------------------------
clear_label_tables:
    ; pass 1: label table
    mov edi, LABEL_TABLE_ADDR                   ; destination
    mov ecx, LABEL_TABLE_SIZE >> 2              ; dword count (bytes / 4)
    xor eax, eax                                ; fill value = 0
    rep stosd

    ; pass 2: awaiting label table
    ; eax is still zero here: rep stosd only reads it
    mov edi, AWAITING_LABEL_TABLE_ADDR          ; destination
    mov ecx, AWAITING_LABEL_TABLE_SIZE >> 2     ; dword count (bytes / 4)
    rep stosd
    ret
; ------------------------------------------------------------------------------
; clear_test_arena
;
@@ -1772,9 +1931,9 @@ clear_token_table:
; ------------------------------------------------------------------------------
clear_test_arena:
    ; zero-fills TEST_ARENA_SIZE bytes starting at TEST_ARENA_ADDR.
    ; clobbers eax, ecx, edi, flags.
    ; the earlier ecx/edi setup from TOKEN_TABLE_SIZE / TOKEN_TABLE_ADDR was
    ; overwritten before use and has been dropped; only the test arena is
    ; (and was) cleared.
    xor eax, eax                    ; value to write
    mov ecx, TEST_ARENA_SIZE / 4    ; number of double words
    mov edi, TEST_ARENA_ADDR        ; address to start
    rep stosd
    ret
@@ -2341,22 +2500,76 @@ whitespace_2 db " ", 0x0D
; test program
program:
db "xor eax, eax", 0x0A
db "mov rax, rax", 0x0A
db "mov rax, rbx", 0x0A
db "mov eax, ebx", 0x0A
db "mov ax, bx", 0x0A
db "inc rax ; inline comment", 0x0A
db "dec rax", 0x0A
db "; one line comment", 0x0A
db "mov rdx, [rax]", 0x0A
db "mov [rax], rdx", 0x0A
db "mov [rcx], rbx", 0x0A
db "mov rcx, [rbx]", 0x0A
db "mov rcx, [ebx]", 0x0A
db "mov ecx, [ebx]", 0x0A
db "mov cx, [ebx]", 0x0A
db "hlt", 0x0A
db "print:", 0x0A
db " push rdx", 0x0A
db " push rax", 0x0A
db " push rsi", 0x0A
db "", 0x0A
db " mov edx, 0x3F8", 0x0A
db " .loop:", 0x0A
db " mov al, [rsi]", 0x0A
db " cmp al, 0x00", 0x0A
db " je .done", 0x0A
db " out dx, al", 0x0A
db " inc rsi", 0x0A
db " jmp .loop", 0x0A
db " .done:", 0x0A
db " pop rsi", 0x0A
db " pop rax", 0x0A
db " pop rdx", 0x0A
db " ret", 0x0A
db " .debug:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .debug_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .error:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .error_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .test:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .test_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .warn:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .warn_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .debug_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[36m"', 0x0A
db ' db "[DEBUG]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
db " .error_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[1;31m"', 0x0A
db ' db "[ERROR]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
db " .test_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[1;33m"', 0x0A
db ' db "[TEST]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
db " .warn_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[1;35m"', 0x0A
db ' db "[WARN]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
.size dq $ - program
msg_end db "end of the binary ->|", 0x0A, 0x00