clear up internal data structures, add to README

This commit is contained in:
andromeda
2026-03-08 16:03:24 +01:00
parent 76e9cc4cd7
commit 0b7526661c
2 changed files with 160 additions and 19 deletions

View File

@@ -48,6 +48,99 @@ each token gets loaded into the token table with the following form:
+----------+
```
### internal data structures
#### `tokens.by_name_X`
Contains every known token whose name is X bytes long, each followed by its 2-byte token ID (the labels are spelled `.by_name_1`, `.by_name_2`, … in the source). For any non-empty `tokens.by_name_X`, `tokens.by_name_<X+1> - tokens.by_name_X` is the size in bytes of `tokens.by_name_X`.
Each entry has the following form:
```
+----------+--------------------------------+
|[2 bytes] | 8 * token_length - 1 0 |
+----------+--------------------------------+
| token ID | string without null terminator |
+----------+--------------------------------+
```
example implementation:
```nasm
tokens:
.by_name_1:
db "+"
dw 0x0062
db "-"
dw 0x0063
.by_name_2:
db "r8"
dw 0x0008
.by_name_3: ; this label is required for future-proofing; the caller uses it to
; find the end (and therefore the size) of tokens.by_name_2
```
#### `tokens.by_id`
Contains metadata for some tokens. Tokens whose ID carries embedded information (`0x10XX`, for instance) do not have entries in this table; they are handled inside the `assemble` function itself.
Each entry has the following form:
```
+----------------+----------+-------+----------+
| 31 24 | 23 20 | 19 16 | 15 0 |
+----------------+----------+-------+----------+
| typed metadata | reserved | type | token ID |
+----------------+----------+-------+----------+
```
The `type` hex digit is defined as follows:
| hex | meaning | examples |
|-----|----------|-|
| 0x0 | ignored | `; this entire comment is 1 token` |
| 0x1 | operator | `mov`, `hlt` |
| 0x2 | register | `rsp`, `al` |
The layout of the `typed metadata` field (bits 31-24) for each type is as follows:
```
+----------+
| type 0x0 |
+----------+
| 31 24 |
+----------+
| reserved |
+----------+
```
```
+-------------------------------+
| type 0x1 |
+----------+--------------------+
| 31 26 | 25 24 |
+----------+--------------------+
| reserved | number of operands |
+----------+--------------------+
```
```
+------------------+
| type 0x2 |
+----------+-------+
| 31 26 | 25 24 |
+----------+-------+
| reserved | width |
+----------+-------+
; width:
00b ; 8 bit
01b ; 16 bit
10b ; 32 bit
11b ; 64 bit
```
### token IDs
supported tokens are listed below

View File

@@ -1,3 +1,5 @@
; TODO actually enforce any of these *_SIZE constants :p
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
@@ -34,8 +36,40 @@ start:
mov rsi, [program.size] ; = size of program
call tokenise
call clear_output_arena
call assemble
jmp halt
; ------------------------------------------------------------------------------
; assembling
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; assemble
; TODO write tests
;
; description:
; assembles the program from tokens located at TOKEN_TABLE_ADDR into a flat
; binary located at OUTPUT_ADDR. It's probably desirable to clear the output
; arena before calling this function.
;
; NOTE: this is currently a skeleton. It only steps an index across every entry
; of the token table; no token is read and nothing is written to the output yet.
;
; clobbers:
; rax, flags
; ------------------------------------------------------------------------------
assemble:
	xor eax, eax			; rax = index of the current token

	.loop:
		; valid indices are 0 .. (number of entries - 1), so stop as
		; soon as rax reaches the entry count. The count is unsigned,
		; hence jae rather than a signed jump.
		cmp rax, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE
		jae .break		; past the last entry, done

		; TODO process the token at index rax

		inc rax			; move to next token
		jmp .loop

	.break:
	ret
; ------------------------------------------------------------------------------
; tokenising
; ------------------------------------------------------------------------------
@@ -46,7 +80,7 @@ start:
;
; description:
; represents the program at the given address and puts it in the token table
; it's probably desirable to clear the token table before calling this function
; it's probably desirable to clear the token table before calling this function.
;
; parameters:
; rdi -> first byte of program
@@ -156,11 +190,11 @@ identify_token:
; length1
.start_length1:
mov rcx, tokens.length1 ; rcx -> list of known tokens
mov rcx, tokens.by_name_1 ; rcx -> list of known tokens
.loop_length1:
cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
jge .unrecognised ; if not, unrecognised
cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens
jge .unrecognised ; if not, unrecognised
mov r10b, [rcx] ; known token
mov r11b, [rdi] ; token
@@ -177,11 +211,11 @@ identify_token:
; length2
.start_length2:
mov rcx, tokens.length2 ; rcx -> list of known tokens
mov rcx, tokens.by_name_2 ; rcx -> list of known tokens
.loop_length2:
cmp rcx, tokens.length3 ; check if rcx still in the bounds of length2 tokens
jge .unrecognised ; if not, unrecognised
cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens
jge .unrecognised ; if not, unrecognised
mov r10w, [rcx] ; current entry in known tokens
mov r11w, [rdi] ; token
@@ -198,11 +232,11 @@ identify_token:
; length3
.start_length3:
mov rcx, tokens.length3 ; rcx -> list of known tokens
mov rcx, tokens.by_name_3 ; rcx -> list of known tokens
.loop_length3:
cmp rcx, tokens.length4 ; check if rcx still in bounds of length3 tokens
jge .unrecognised ; if not, unrecognised
cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens
jge .unrecognised ; if not, unrecognised
; TODO make this safe (it overreaches 1 byte)
mov r10d, [rcx] ; known token + next byte
@@ -224,11 +258,11 @@ identify_token:
; length4
.start_length4:
mov rcx, tokens.length4 ; rcx -> list of known tokens
mov rcx, tokens.by_name_4 ; rcx -> list of known tokens
.loop_length4:
cmp rcx, tokens.length5 ; check if rcx still in bounds of length3 tokens
jge .unrecognised ; if not, unrecognised
cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length4 tokens
jge .unrecognised ; if not, unrecognised
mov r10d, [rcx] ; known token
mov r11d, [rdi] ; token
@@ -483,6 +517,20 @@ clear_test_arena:
rep stosd
ret
; ------------------------------------------------------------------------------
; clear_output_arena
;
; description:
; zero-fills the output arena by storing OUTPUT_SIZE / 4 double words of zero,
; starting at OUTPUT_ADDR
;
; clobbers:
; rax, rcx, rdi, flags
;
; NOTE(review): rep stosd advances rdi according to the direction flag; this
; routine does not set it, so it presumably relies on DF being clear - confirm
; ------------------------------------------------------------------------------
clear_output_arena:
	mov rdi, OUTPUT_ADDR		; rdi -> start of the output arena
	mov rcx, OUTPUT_SIZE / 4	; rcx = how many double words to store
	xor eax, eax			; eax = 0, the fill value
	rep stosd			; store eax at [rdi], rcx times
	ret
; ------------------------------------------------------------------------------
; tests
; ------------------------------------------------------------------------------
@@ -865,7 +913,7 @@ test_identify_next_token:
; ------------------------------------------------------------------------------
tokens:
.length1:
.by_name_1:
db "["
dw 0x0051
db "]"
@@ -878,7 +926,7 @@ tokens:
dw 0x0064
db "/"
dw 0x0065
.length2:
.by_name_2:
db "r8"
dw 0x0008
db "r9"
@@ -933,7 +981,7 @@ tokens:
dw 0x005F
db "jl"
dw 0x0061
.length3:
.by_name_3:
db "rax"
dw 0x0000
db "rbx"
@@ -1032,7 +1080,7 @@ tokens:
dw 0x005E
db "jle"
dw 0x0060
.length4:
.by_name_4:
db "r10d"
dw 0x001A
db "r11d"
@@ -1073,8 +1121,8 @@ tokens:
dw 0x0050
db "call"
dw 0x0059
.length5:
.end:
.by_name_5:
.by_id:
msg_welcome db "Welcome to Twasm", 0x0A, 0x00
msg_halt db "halted.", 0x0A, 0x00