tokenise labels and constants! Now assembly highkey fails but ok

This commit is contained in:
andromeda
2026-03-30 16:09:25 +02:00
parent b1e7d2e3d5
commit f789d49e8a
3 changed files with 342 additions and 112 deletions

View File

@@ -6,7 +6,7 @@
LOAD_ADDR equ 0x7C00 LOAD_ADDR equ 0x7C00
KERNEL_START equ 2 ; first sector on disk to load kernel from; 1 indexed KERNEL_START equ 2 ; first sector on disk to load kernel from; 1 indexed
KERNEL_SIZE equ 16 ; length of kernel in sectors KERNEL_SIZE equ 32 ; length of kernel in sectors
KERNEL_LOAD_ADDR_ES equ 0x1000 ; kernel to be loaded at es * 0x10 + 0x0000 KERNEL_LOAD_ADDR_ES equ 0x1000 ; kernel to be loaded at es * 0x10 + 0x0000
PAGE_TABLE_LOAD_ADDR equ 0x1000 ; start of page table; 4 * pt size PAGE_TABLE_LOAD_ADDR equ 0x1000 ; start of page table; 4 * pt size

View File

@@ -22,11 +22,14 @@ tokeniser
------------------------ ------------------------
byte(s) -> next byte(s) byte(s) -> next byte(s)
------------------------ ------------------------
Newline -> Newline Newline -> Label
-> Newline
-> Komment -> Komment
-> Operator -> Operator
-> Directive -> Directive
Label -> Newline
Komment -> Newline Komment -> Newline
Operator -> Newline Operator -> Newline
@@ -45,37 +48,6 @@ Directive -> Newline
------------------------ ------------------------
``` ```
not yet implemented:
```
------------------------
operand parser
------------------------
byte(s) -> next byte(s)
------------------------
START -> '['
-> Register
-> Constant
'[' -> Register
-> Constant
']' -> END
Register -> IF #[, ']'
-> Operator
Constant -> IF #[, ']'
-> Operator
Operator -> IF NOT #R, Register
-> Constant
------------------------
:R: = whether a register has been found
:[: = whether a '[' has been found
------------------------
```
### memory map ### memory map
``` ```
@@ -88,6 +60,10 @@ Operator -> IF NOT #R, Register
+------ 0x00060000 ------+ +------ 0x00060000 ------+
| test arena | | test arena |
+------ 0x00050000 ------+ +------ 0x00050000 ------+
| label table |
+------ 0x00040000 ------+
| awaiting label table |
+------ 0x00030000 ------+
| stack (rsp) | | stack (rsp) |
+------------------------+ +------------------------+
| input | | input |
@@ -105,6 +81,7 @@ each word represents a token on the token table.
each token gets loaded into the token table with the following form: each token gets loaded into the token table with the following form:
``` ```
2 bytes
+----------+ +----------+
| 15 0 | | 15 0 |
+----------+ +----------+
@@ -112,6 +89,40 @@ each token gets loaded into the token table with the following form:
+----------+ +----------+
``` ```
#### label table (LT)
label definitions are stored in and recalled from this table. The memory addresses are relative to the start of the program
```
16 bytes
+---------+
| 127 64 |
+---------+
| address |
+---------+
| 63 0 |
+---------+
| hash |
+---------+
```
#### awaiting label table (ALT)
forward references are stored in this table to be filled in after assembly is otherwise complete. The memory addresses are relative to the start of the program
```
16 bytes
+----------+----------+------------------+---------+
| 127 105 | 104 104 | 103 96 | 95 64 |
+----------+----------+------------------+---------+
| reserved | abs flag | # bytes reserved | address |
+----------+----------+------------------+---------+
| 63 0 |
+--------------------------------------------------+
| hash |
+--------------------------------------------------+
```
### internal data structures ### internal data structures
#### `tokens.[operators|registers]` #### `tokens.[operators|registers]`
@@ -121,6 +132,7 @@ contains tokens by their type. Intended to be searched by token name to get the
each entry is in the following form: each entry is in the following form:
``` ```
6 bytes
+----------+--------------------------------+ +----------+--------------------------------+
| 47 32 | 31 0 | | 47 32 | 31 0 |
+----------+--------------------------------+ +----------+--------------------------------+
@@ -129,26 +141,16 @@ each entry is in the following form:
``` ```
example implementation:
```nasm
tokens
.registers:
dd "r8"
dw 0x0008
.by_name3: ; this is required for futureproofness; the caller can use this to
; find the size of registers.by_name2
```
note that tokens longer than 4 bytes are problematic :/ note that tokens longer than 4 bytes are problematic :/
#### `tokens.by_id` #### `tokens.by_id`
contains some tokens with their metadata. Some tokens have embedded information (`0x10XX` for instance). Those will not have entries in this table, being handled instead inside the assemble function itself. contains some tokens with their metadata. Some tokens have embedded information (`0x10XX` for instance). Those do not have entries in this table, being handled instead inside the assemble function itself.
metadata about some tokens in the following form: metadata about some tokens in the following form:
``` ```
4 bytes
+----------------+----------+-------+----------+ +----------------+----------+-------+----------+
| 31 24 | 23 20 | 19 16 | 15 0 | | 31 24 | 23 20 | 19 16 | 15 0 |
+----------------+----------+-------+----------+ +----------------+----------+-------+----------+
@@ -168,6 +170,7 @@ the `type` hex digit is defined as the following:
type metadata for the different types is as follows: type metadata for the different types is as follows:
``` ```
1 byte
+----------+ +----------+
| type 0x0 | | type 0x0 |
+----------+ +----------+
@@ -178,6 +181,7 @@ type metadata for the different types is as follows:
``` ```
``` ```
1 byte
+-------------------------------+ +-------------------------------+
| type 0x1 | | type 0x1 |
+----------+--------------------+ +----------+--------------------+
@@ -188,6 +192,7 @@ type metadata for the different types is as follows:
``` ```
``` ```
1 byte
+------------------------------+ +------------------------------+
| type 0x2 | | type 0x2 |
+----------+-----------+-------+ +----------+-----------+-------+
@@ -210,6 +215,7 @@ type metadata for the different types is as follows:
entries are as follows: entries are as follows:
``` ```
16 bytes
+------------------------------+ +------------------------------+
| 0 operand operators | | 0 operand operators |
+------------------------------+ +------------------------------+
@@ -230,6 +236,7 @@ entries are as follows:
| reserved | opcode | token ID | | reserved | opcode | token ID |
+----------+--------+----------+ +----------+--------+----------+
16 bytes
+-------------------------------------------------------------+ +-------------------------------------------------------------+
| 1 operand operators | | 1 operand operators |
+-------------------------------------------------------------+ +-------------------------------------------------------------+
@@ -252,6 +259,7 @@ entries are as follows:
| | dst=r/m | | | | dst=r/m | |
+----------+---------------+----------------------------------+ +----------+---------------+----------------------------------+
16 bytes
+----------------------------------------------+ +----------------------------------------------+
| 2 operand operators | | 2 operand operators |
+----------------------------------------------+ +----------------------------------------------+
@@ -389,14 +397,23 @@ supported tokens are listed below
| ret | 0x005A | | | ret | 0x005A | |
| cmp | 0x005B | | | cmp | 0x005B | |
| | 0x10XX | some memory address; `XX` is as specified below | | | 0x10XX | some memory address; `XX` is as specified below |
| | 0xFEXX | used to pass some raw value `XX` in place of a token id | | | 0x20XX | some constant; `XX` is as specified below |
| | 0x3XXX | some label definition; `XXX` is its entry index in the label table |
| | 0x4XXX | some label reference; `XXX` is its entry index in the label table |
| | 0xFEXX | used to pass some raw value `XX` in place of a token id to a couple of functions that mention this as a feature. If the function doesn't mention it, it will lead to undefined behaviour |
| | 0xFFFF | unrecognised token | | | 0xFFFF | unrecognised token |
values of `XX` in `0x10XX`: values of `XX` in `0x10XX`:
| XX | description | | XX | description |
|------|-------------| |------|-------------|
| 0x00 | following byte is the token ID of some register | | 0x00 | following word is the token ID of some register |
values of `XX` in `0x20XX`:
| XX | description |
|------|-------------|
| 0x00 | following 8 bytes are the constant's value |
### example program ### example program

View File

@@ -2,16 +2,21 @@
LOAD_ADDR equ 0x00010000 ; address this program is loaded at LOAD_ADDR equ 0x00010000 ; address this program is loaded at
STACK_ADDR equ 0x00030000 ; address to put the 64-bit stack at
AWAITING_LABEL_TABLE_ADDR equ 0x00030000 ; address to store pending labels at
AWAITING_LABEL_TABLE_SIZE equ 0x00010000
LABEL_TABLE_ADDR equ 0x00040000 ; address to store labels at
LABEL_TABLE_SIZE equ 0x00010000
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use TEST_ARENA_SIZE equ 0x00010000 ; maximum size tests can use
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table TOKEN_TABLE_SIZE equ 0x00010000 ; max length of table
OUTPUT_ADDR equ 0x00070000 ; address of outputed binary OUTPUT_ADDR equ 0x00070000 ; address of outputed binary
OUTPUT_SIZE equ 0x1000 ; max length of outputed binary OUTPUT_SIZE equ 0x00010000 ; max length of outputed binary
STACK_ADDR equ 0x00060000 ; address to put the 64-bit stack at
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
UNRECOGNISED_ID_TYPE equ 0x0F ; type of an unrecognised id UNRECOGNISED_ID_TYPE equ 0x0F ; type of an unrecognised id
@@ -27,6 +32,7 @@ E_WHITESPACE equ 1 << 2
E_COMMA equ 1 << 3 E_COMMA equ 1 << 3
E_OPERATOR equ 1 << 4 E_OPERATOR equ 1 << 4
E_OPERAND equ 1 << 5 E_OPERAND equ 1 << 5
E_LABEL equ 1 << 6
[bits 64] [bits 64]
[org LOAD_ADDR] [org LOAD_ADDR]
@@ -43,6 +49,7 @@ start:
call run_tests call run_tests
call clear_token_table call clear_token_table
call clear_label_tables
mov rdi, program ; -> program mov rdi, program ; -> program
mov rsi, [program.size] ; = size of program mov rsi, [program.size] ; = size of program
@@ -98,6 +105,7 @@ assemble:
cmp al, 0x1 ; check if next tte is an operator cmp al, 0x1 ; check if next tte is an operator
je .operator ; if so, handle je .operator ; if so, handle
jmp .unexpected_token ; otherwise, fail jmp .unexpected_token ; otherwise, fail
.operator: ; if next tte's type is an operator: .operator: ; if next tte's type is an operator:
@@ -120,7 +128,6 @@ assemble:
je .operator_2 ; if so, handle case of two operands je .operator_2 ; if so, handle case of two operands
jmp .unexpected_token jmp .unexpected_token
.operator_0: .operator_0:
mov rsi, .msg_operator_0 mov rsi, .msg_operator_0
call print.debug call print.debug
@@ -133,7 +140,6 @@ assemble:
call .write_byte call .write_byte
jmp .loop_next_token jmp .loop_next_token
.operator_1: .operator_1:
mov rsi, .msg_operator_1 mov rsi, .msg_operator_1
call print.debug call print.debug
@@ -167,12 +173,10 @@ assemble:
je .operator_1_register je .operator_1_register
jmp .unexpected_token jmp .unexpected_token
.operator_1_memory: .operator_1_memory:
mov rsi, .msg_operator_1_memory mov rsi, .msg_operator_1_memory
call print.debug call print.debug
jmp .unsupported_memory_access jmp .unsupported_memory_access
.operator_1_register: .operator_1_register:
mov rsi, .msg_operator_1_register mov rsi, .msg_operator_1_register
call print.debug call print.debug
@@ -213,7 +217,6 @@ assemble:
call .write_byte call .write_byte
jmp .loop_next_token jmp .loop_next_token
.operator_2: .operator_2:
mov rsi, .msg_operator_2 mov rsi, .msg_operator_2
call print.debug call print.debug
@@ -240,7 +243,6 @@ assemble:
je .operator_2_register je .operator_2_register
jmp .unexpected_token jmp .unexpected_token
.operator_2_memory: .operator_2_memory:
mov rsi, .msg_operator_2_memory mov rsi, .msg_operator_2_memory
call print.debug call print.debug
@@ -319,7 +321,6 @@ assemble:
cmp al, 11b ; 64 bit cmp al, 11b ; 64 bit
je .operator_2_memory_register_64 je .operator_2_memory_register_64
.operator_2_memory_register_16: .operator_2_memory_register_16:
mov al, 0x66 mov al, 0x66
call .push_byte call .push_byte
@@ -337,7 +338,6 @@ assemble:
call .write_byte call .write_byte
jmp .loop_next_token jmp .loop_next_token
.operator_2_register: .operator_2_register:
mov rsi, .msg_operator_2_register mov rsi, .msg_operator_2_register
call print.debug call print.debug
@@ -398,7 +398,6 @@ assemble:
je .operator_2_register_register ; if so, handle je .operator_2_register_register ; if so, handle
jmp .unexpected_token jmp .unexpected_token
.operator_2_register_memory: .operator_2_register_memory:
push rsi push rsi
mov rsi, .msg_operator_2_register_memory mov rsi, .msg_operator_2_register_memory
@@ -441,7 +440,6 @@ assemble:
call .write_byte call .write_byte
jmp .loop_next_token jmp .loop_next_token
.operator_2_register_register: .operator_2_register_register:
push rsi push rsi
mov rsi, .msg_operator_2_register_register mov rsi, .msg_operator_2_register_register
@@ -543,7 +541,6 @@ assemble:
jmp .operator_2_register_register_continue jmp .operator_2_register_register_continue
.operator_2_register_register_continue: .operator_2_register_register_continue:
push rsi push rsi
mov esi, edi ; si = reg; src tte mov esi, edi ; si = reg; src tte
pop rdi ; di = r/m; dst tte pop rdi ; di = r/m; dst tte
@@ -658,7 +655,7 @@ assemble:
call .output_byte call .output_byte
mov byte [ecx], 0x00 mov byte [ecx], 0x00
jmp .flush_write_buffer_loop jmp .flush_write_buffer_loop
.flush_write_buffer_break .flush_write_buffer_break:
mov dword [.buffer_pointer], .buffer mov dword [.buffer_pointer], .buffer
pop rax pop rax
pop rcx pop rcx
@@ -680,6 +677,7 @@ assemble:
.msg_operator_2_register db "operator_2_register", 0x0A, 0x00 .msg_operator_2_register db "operator_2_register", 0x0A, 0x00
.msg_operator_2_register_memory db "operator_2_register_memory", 0x0A, 0x00 .msg_operator_2_register_memory db "operator_2_register_memory", 0x0A, 0x00
.msg_operator_2_register_register db "operator_2_register_register", 0x0A, 0x00 .msg_operator_2_register_register db "operator_2_register_register", 0x0A, 0x00
.msg_potential_label db "potential_label", 0x0A, 0x00
; ------------------------------------------------------------------------------ ; ------------------------------------------------------------------------------
; get_tte_type ; get_tte_type
@@ -956,9 +954,11 @@ tokenise:
pop rsi ; rsi -> last byte of program pop rsi ; rsi -> last byte of program
jnz .skip_byte_whitespace jnz .skip_byte_whitespace
test byte [.expecting], E_OPERATOR ; check if an operator is expected test byte [.expecting], E_LABEL ; check if a label is expected
jnz .label ; if so, handle it
test byte [.expecting], E_OPERATOR ; else, check if an operator is expected
jnz .operator ; if so, handle it jnz .operator ; if so, handle it
jmp .operand ; otherwise, handle as an operand jmp .operand ; else, handle as an operand
.comment: .comment:
push rsi push rsi
@@ -1021,11 +1021,72 @@ tokenise:
test byte [.expecting], E_NEWLINE ; make sure a newline was expected test byte [.expecting], E_NEWLINE ; make sure a newline was expected
jz .unexpected_newline ; if not, error jz .unexpected_newline ; if not, error
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR | E_LABEL
inc rdi inc rdi
jmp .loop jmp .loop
.label:
push rax
xor eax, eax ; rax = number of bytes in label
.label_loop:
mov dl, [rdi + rax] ; next byte
cmp dl, ":"
je .label_break
cmp dl, " "
je .label_not_found
cmp dl, 0x0A
je .label_not_found
cmp dl, 0x00
je .label_not_found
cmp dl, ";"
je .label_not_found
inc eax ; inc byte counter
cmp rdi, rsi
jge .break
jmp .label_loop
.label_break:
push rsi
mov rsi, .found
call print.debug
mov rsi, .msg_label
call print
pop rsi ; rsi -> last byte of program
push rax
push rdi
push rsi
mov rsi, rdi ; rsi -> start of string
mov rdi, rax ; rdi = size of string
call djb2
; rax = hash
mov rdi, rax ; rdi = hash
call add_label_hash
; rax = index on label table
mov cx, ax
and cx, 0x0FFF
or cx, 0x3000
pop rsi ; rsi -> last byte of program
pop rdi ; rdi -> current byte of program
pop rax ; rax = number of bytes in label
add rdi, rax ; move on to next byte
inc rdi ; move past the colon
pop rax ; rax = number of tokens processed
mov [TOKEN_TABLE_ADDR + rax * 2], cx
inc rax ; the next token
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE
jmp .loop
.label_not_found:
pop rax ; rax = number of tokens processed
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR
jmp .loop
.operator: .operator:
; debug message ; debug message
push rsi push rsi
@@ -1038,7 +1099,6 @@ tokenise:
mov rcx, rax ; rcx = number of tokens processed mov rcx, rax ; rcx = number of tokens processed
xor eax, eax ; eax = number of bytes in operator xor eax, eax ; eax = number of bytes in operator
mov [.pending_operator], eax ; zero pending operator mov [.pending_operator], eax ; zero pending operator
.operator_loop: .operator_loop:
; TODO give this its own error ; TODO give this its own error
@@ -1063,7 +1123,6 @@ tokenise:
cmp rdi, rsi cmp rdi, rsi
jge .break jge .break
jmp .operator_loop ; and loop jmp .operator_loop ; and loop
.operator_break: .operator_break:
; rax already pushed from .operator ; rax already pushed from .operator
push rdi push rdi
@@ -1097,7 +1156,6 @@ tokenise:
push rax push rax
push rdi push rdi
xor eax, eax ; rax = length of operand xor eax, eax ; rax = length of operand
.operand_loop: .operand_loop:
mov dl, [rdi] mov dl, [rdi]
@@ -1113,7 +1171,6 @@ tokenise:
inc rax ; inc length counter inc rax ; inc length counter
inc rdi ; inc byte pointer inc rdi ; inc byte pointer
jmp .operand_loop jmp .operand_loop
.operand_break: .operand_break:
pop rdi ; rdi -> first byte of operand pop rdi ; rdi -> first byte of operand
push rdi push rdi
@@ -1121,9 +1178,11 @@ tokenise:
mov rsi, rax ; rsi = length of operand in bytes mov rsi, rax ; rsi = length of operand in bytes
mov cx, ax ; cx = length counter for safe keeping mov cx, ax ; cx = length counter for safe keeping
push rcx
call evaluate_operand call evaluate_operand
; dl = return code ; dl = return code
; rax = binary data ; rax = binary data
pop rcx
pop rsi pop rsi
pop rdi ; rdi = first byte of operand pop rdi ; rdi = first byte of operand
add di, cx ; rdi = last byte of operand add di, cx ; rdi = last byte of operand
@@ -1131,31 +1190,48 @@ tokenise:
pop rax ; rax = number of tokens processed pop rax ; rax = number of tokens processed
; operand is some reg ; operand is some reg
; cx = token ID
cmp dl, 0x00 cmp dl, 0x00
; cx = token ID
je .operand_register je .operand_register
; operand is some [reg] ; operand is some [reg]
; cx = token ID
cmp dl, 0x10 cmp dl, 0x10
; cx = token ID
je .operand_addr_register je .operand_addr_register
jmp .unexpected_operand ; operand is some constant
cmp dl, 0x20
; rcx = constant value
je .operand_constant
; cx = token ID ; operand is some label
cmp dl, 0x30
; rcx = index of label in LT
je .operand_label
jmp .unexpected_operand
.operand_register: .operand_register:
mov [TOKEN_TABLE_ADDR + rax * 2], cx mov [TOKEN_TABLE_ADDR + rax * 2], cx
inc rax ; another token processed inc rax ; another token processed
jmp .operand_break_continue jmp .operand_break_continue
; cx = token ID
.operand_addr_register: .operand_addr_register:
mov word [TOKEN_TABLE_ADDR + rax * 2], 0x1000 mov word [TOKEN_TABLE_ADDR + rax * 2], 0x1000
inc rax ; 0x1000: addr reg token, next token is the register inc rax ; 0x1000: addr reg token, next token is the register
mov [TOKEN_TABLE_ADDR + rax * 2], cx mov [TOKEN_TABLE_ADDR + rax * 2], cx
inc rax ; the register as returned by evaluate_operand inc rax ; the register as returned by evaluate_operand
jmp .operand_break_continue jmp .operand_break_continue
.operand_constant:
mov word [TOKEN_TABLE_ADDR + rax * 2], 0x2000
inc rax ; another token processed
mov [TOKEN_TABLE_ADDR + rax * 2], rcx
add rax, 4
jmp .operand_break_continue
.operand_label:
and cx, 0x0FFF
or cx, 0x3000
mov [TOKEN_TABLE_ADDR + rax * 2], cx
inc rax
jmp .operand_break_continue
.operand_break_continue: .operand_break_continue:
mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_COMMA mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_COMMA
jmp .loop jmp .loop
@@ -1164,8 +1240,7 @@ tokenise:
ret ret
; state ; state
.expecting db E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR | E_LABEL
.expecting db E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR
.unexpected_whitespace: .unexpected_whitespace:
mov rsi, .err_unexpected mov rsi, .err_unexpected
@@ -1209,6 +1284,7 @@ tokenise:
.msg_comment db "comment.", 0x0A, 0x00 .msg_comment db "comment.", 0x0A, 0x00
.msg_newline db "newline.", 0x0A, 0x00 .msg_newline db "newline.", 0x0A, 0x00
.msg_comma db "comma.", 0x0A, 0x00 .msg_comma db "comma.", 0x0A, 0x00
.msg_label db "label.", 0x0A, 0x00
.msg_operator db "operator.", 0x0A, 0x00 .msg_operator db "operator.", 0x0A, 0x00
.msg_operand db "operand.", 0x0A, 0x00 .msg_operand db "operand.", 0x0A, 0x00
.pending_operator dd 0 ; the operator token that is pending processing .pending_operator dd 0 ; the operator token that is pending processing
@@ -1220,10 +1296,12 @@ tokenise:
; takes the location and length of an operand and evaluates it into binary data ; takes the location and length of an operand and evaluates it into binary data
; and a return code to interpret the binary data. ; and a return code to interpret the binary data.
; ;
; | code | rsi contents | notes | ; | code | rax contents | notes |
; |------|----------------------|-------| ; |------|----------------------|-------|
; | 0x00 | token ID of register | reg | ; | 0x00 | token ID of register | reg |
; | 0x10 | token ID of register | [reg] | ; | 0x10 | token ID of register | [reg] |
; | 0x20 | constant value | const |
; | 0x30 | index of label in LT | label |
; | 0xFF | - | error | ; | 0xFF | - | error |
; ;
; parameters: ; parameters:
@@ -1239,6 +1317,7 @@ evaluate_operand:
push rdi ; rdi -> start of operand push rdi ; rdi -> start of operand
; rsi = size of operand ; rsi = size of operand
call trim_trailing_whitespace call trim_trailing_whitespace
; rax = new size of operand
pop rdi ; rdi -> first byte of operand pop rdi ; rdi -> first byte of operand
mov rsi, rax ; rsi = size of operand w/o trailing whitespace mov rsi, rax ; rsi = size of operand w/o trailing whitespace
@@ -1249,7 +1328,7 @@ evaluate_operand:
cmp byte [rdi], '[' ; case: memory addressing cmp byte [rdi], '[' ; case: memory addressing
je .address je .address
jmp .register ; otherwise: register jmp .register ; otherwise: register (or constant, or label)
.address: .address:
cmp byte [rdi + rsi - 1], ']' ; check if address is closed correctly cmp byte [rdi + rsi - 1], ']' ; check if address is closed correctly
@@ -1262,12 +1341,13 @@ evaluate_operand:
cmp dl, 0x10 ; make sure return code isn't another memory reference cmp dl, 0x10 ; make sure return code isn't another memory reference
je .unrecognised ; if it is, fail je .unrecognised ; if it is, fail
or dl, 0x10 ; flip bit for address return shr edx, 4
or dl, 0x10 ; address return
ret ret
.register: .register:
cmp rsi, 4 cmp rsi, 4
jg .unrecognised jg .constant ; not a register: too long. Maybe constant?
push rdi push rdi
mov edi, [rdi] ; edi = register to be searched mov edi, [rdi] ; edi = register to be searched
@@ -1288,18 +1368,47 @@ evaluate_operand:
.register3: .register3:
and edi, 0xFFFFFF and edi, 0xFFFFFF
.register4: .register4:
call identify_register call identify_register
; ax = register's token ID or UNRECOGNISED_TOKEN_ID ; ax = register's token ID or UNRECOGNISED_TOKEN_ID
pop rdi pop rdi ; rdi -> first byte of operand
cmp ax, UNRECOGNISED_TOKEN_ID cmp ax, UNRECOGNISED_TOKEN_ID ; if not a register, constant?
je .unrecognised je .constant
mov dl, 0x00 mov dl, 0x00
ret ret
.constant:
push rdi
push rsi
; rdi -> first byte of constant
; rsi = size of constant in bytes
call evaluate_constant
; dl = type of constant
; rax = hex value of constant
pop rdi ; rdi = size of label in bytes
pop rsi ; rsi -> first byte of label
cmp dl, 0xFF
je .label
; rax = hex value of constant
mov dl, 0x20
ret
.label:
; rdi = size of label in bytes
; rsi -> first byte of label
call djb2
; rax = hash
mov rdi, rax ; rdi = hash
call add_label_hash
; rax = index in LT of label
mov dl, 0x30
ret
.unrecognised: .unrecognised:
xor eax, eax
mov dl, 0xFF mov dl, 0xFF
ret ret
@@ -1353,6 +1462,7 @@ evaluate_constant:
je .chr je .chr
pop rcx pop rcx
push rcx ; waste value; .unrecognise expects something on the stack
jmp .unrecognised jmp .unrecognised
.numeric: .numeric:
@@ -1396,12 +1506,10 @@ evaluate_constant:
cmp dl, 9 ; if !digit: cmp dl, 9 ; if !digit:
jg .hex_alpha ; letter jg .hex_alpha ; letter
jmp .hex_continue ; else loop jmp .hex_continue ; else loop
.hex_alpha: .hex_alpha:
sub dl, 7 ; map [('A'-'0')..('F'-'0')] to [0xA..0xF] sub dl, 7 ; map [('A'-'0')..('F'-'0')] to [0xA..0xF]
cmp dl, 0xF ; if not in the range [0xA..0xF] cmp dl, 0xF ; if not in the range [0xA..0xF]
jg .unrecognised ; then unrecognised jg .unrecognised ; then unrecognised
.hex_continue: .hex_continue:
and dl, 0x0F ; mask and dl, 0x0F ; mask
or al, dl ; and add newest nibble or al, dl ; and add newest nibble
@@ -1471,14 +1579,12 @@ evaluate_constant:
inc rdi inc rdi
jmp .chr jmp .chr
.chr_break: .chr_break:
cmp rcx, 1 ; for each [1..rcx] cmp rcx, 1 ; for each [1..rcx]
jle .chr_break_for_good jle .chr_break_for_good
rol rax, 8 ; roll left to make up for the roll right earlier rol rax, 8 ; roll left to make up for the roll right earlier
dec rcx dec rcx
jmp .chr_break jmp .chr_break
.chr_break_for_good: .chr_break_for_good:
mov dl, [rdi] ; make sure the chr is closed mov dl, [rdi] ; make sure the chr is closed
cmp dl, '"' cmp dl, '"'
@@ -1492,8 +1598,9 @@ evaluate_constant:
.unrecognised: .unrecognised:
pop rdx pop rdx
mov rdx, 0xFF ; unrecognised type mov edx, 0xFF ; unrecognised type
ret ret
.msg db "evaluate_constant", 0x0A, 0x00
; ------------------------------------------------------------------------------ ; ------------------------------------------------------------------------------
; identify_register ; identify_register
@@ -1750,6 +1857,38 @@ trim_trailing_whitespace:
mov rax, rsi mov rax, rsi
ret ret
; ------------------------------------------------------------------------------
; add_label_hash
;
; description:
; adds a label hash to the label table, or just finds it if already present.
; The table is scanned in 16-byte steps; the first qword of each entry holds
; the hash, and a zero qword is treated as an empty slot. The scan stops at the
; first empty slot (the hash is stored there) or at the first matching hash.
; If the whole table is scanned without finding either, nothing is written.
;
; parameters
; rdi = 64-bit hash to be added
;
; returned
; rax = index in label table. When the table is full and the hash is not
;       present, rax = LABEL_TABLE_SIZE / 16 (one past the last valid index)
;
; clobbers
; rcx, flags
; ------------------------------------------------------------------------------
add_label_hash:
    xor eax, eax                        ; rax = byte offset into the label table
  .loop:
    cmp rax, LABEL_TABLE_SIZE
    jae .full                           ; offset is unsigned; past the end = full
    mov rcx, [LABEL_TABLE_ADDR + rax]   ; rcx = hash stored in this slot
    ; TODO bug if there's an empty slot before the entry, it won't be found
    test rcx, rcx                       ; empty slot
    jz .store
    cmp rcx, rdi                        ; already present
    je .found
    add rax, 16                         ; next 16-byte entry
    jmp .loop
  .store:
    mov [LABEL_TABLE_ADDR + rax], rdi   ; claim the empty slot for this hash
  .found:
  .full:                                ; never store here: rax may be out of bounds
    shr rax, 4                          ; rax / 16
    ; rax = index
    ret
; ------------------------------------------------------------------------------ ; ------------------------------------------------------------------------------
; clear_token_table ; clear_token_table
; ;
@@ -1764,6 +1903,26 @@ clear_token_table:
rep stosd rep stosd
ret ret
; ------------------------------------------------------------------------------
; clear_label_tables
;
; description:
; zero-fills two regions one double word at a time: first the label table
; (LABEL_TABLE_SIZE bytes at LABEL_TABLE_ADDR), then the awaiting label table
; (AWAITING_LABEL_TABLE_SIZE bytes at AWAITING_LABEL_TABLE_ADDR)
;
; clobbers
; rax, rcx, rdi, flags
;
; NOTE(review): relies on the direction flag being clear so that rep stosd
; walks upwards; nothing in this routine sets it - confirm at the call site
; ------------------------------------------------------------------------------
clear_label_tables:
    xor eax, eax                            ; fill value; stosd never changes it

    mov edi, LABEL_TABLE_ADDR               ; label table: where to start
    mov ecx, LABEL_TABLE_SIZE / 4           ;              how many double words
    rep stosd

    mov edi, AWAITING_LABEL_TABLE_ADDR      ; awaiting label table: where to start
    mov ecx, AWAITING_LABEL_TABLE_SIZE / 4  ;                       how many double words
    rep stosd
    ret
; ------------------------------------------------------------------------------ ; ------------------------------------------------------------------------------
; clear_test_arena ; clear_test_arena
; ;
@@ -1773,8 +1932,8 @@ clear_token_table:
clear_test_arena: clear_test_arena:
xor eax, eax ; value to write xor eax, eax ; value to write
mov ecx, TOKEN_TABLE_SIZE / 4 ; number of double words mov ecx, TEST_ARENA_SIZE / 4 ; number of double words
mov edi, TOKEN_TABLE_ADDR ; address to start mov edi, TEST_ARENA_ADDR ; address to start
rep stosd rep stosd
ret ret
@@ -2341,22 +2500,76 @@ whitespace_2 db " ", 0x0D
; test program ; test program
program: program:
db "xor eax, eax", 0x0A db "print:", 0x0A
db "mov rax, rax", 0x0A db " push rdx", 0x0A
db "mov rax, rbx", 0x0A db " push rax", 0x0A
db "mov eax, ebx", 0x0A db " push rsi", 0x0A
db "mov ax, bx", 0x0A db "", 0x0A
db "inc rax ; inline comment", 0x0A db " mov edx, 0x3F8", 0x0A
db "dec rax", 0x0A db " .loop:", 0x0A
db "; one line comment", 0x0A db " mov al, [rsi]", 0x0A
db "mov rdx, [rax]", 0x0A db " cmp al, 0x00", 0x0A
db "mov [rax], rdx", 0x0A db " je .done", 0x0A
db "mov [rcx], rbx", 0x0A db " out dx, al", 0x0A
db "mov rcx, [rbx]", 0x0A db " inc rsi", 0x0A
db "mov rcx, [ebx]", 0x0A db " jmp .loop", 0x0A
db "mov ecx, [ebx]", 0x0A db " .done:", 0x0A
db "mov cx, [ebx]", 0x0A db " pop rsi", 0x0A
db "hlt", 0x0A db " pop rax", 0x0A
db " pop rdx", 0x0A
db " ret", 0x0A
db " .debug:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .debug_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .error:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .error_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .test:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .test_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .warn:", 0x0A
db " push rsi", 0x0A
db " mov rsi, .warn_msg", 0x0A
db " call print", 0x0A
db " pop rsi", 0x0A
db " jmp print ; tail call", 0x0A
db " .debug_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[36m"', 0x0A
db ' db "[DEBUG]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
db " .error_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[1;31m"', 0x0A
db ' db "[ERROR]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
db " .test_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[1;33m"', 0x0A
db ' db "[TEST]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
db " .warn_msg:", 0x0A
db " db 0x1B", 0x0A
db ' db "[1;35m"', 0x0A
db ' db "[WARN]: "', 0x0A
db " db 0x1B", 0x0A
db ' db "[0m"', 0x0A
db " db 0x00", 0x0A
.size dq $ - program .size dq $ - program
msg_end db "end of the binary ->|", 0x0A, 0x00 msg_end db "end of the binary ->|", 0x0A, 0x00