From 0b7526661c0c6cd39d744c68c565e77f4aa7196d Mon Sep 17 00:00:00 2001 From: andromeda Date: Sun, 8 Mar 2026 16:03:24 +0100 Subject: [PATCH] clear up internal data structures, add to README --- twasm/README.md | 93 ++++++++++++++++++++++++++++++++++++++++++++++ twasm/asm/main.asm | 86 ++++++++++++++++++++++++++++++++---------- 2 files changed, 160 insertions(+), 19 deletions(-) diff --git a/twasm/README.md b/twasm/README.md index 7d0309d..be3f5ad 100644 --- a/twasm/README.md +++ b/twasm/README.md @@ -48,6 +48,99 @@ each token gets loaded into the token table with the following form: +----------+ ``` +### internal data structures + +#### `tokens.by_name_X` + +contains all tokens of length X, each followed by its ID. For any non-empty `tokens.by_name_X`, `tokens.by_name_(X+1) - tokens.by_name_X` is the size in bytes of `tokens.by_name_X`. + +each entry is in the following form: + +``` ++----------+--------------------------------+ +|[2 bytes] | 8 * token_length - 1 0 | ++----------+--------------------------------+ +| token ID | string without null terminator | ++----------+--------------------------------+ + +``` + +example implementation: + +```nasm +tokens: + .by_name_1: + db "+" + dw 0x0062 + db "-" + dw 0x0063 + .by_name_2: + db "r8" + dw 0x0008 + .by_name_3: ; this is required for future-proofing; the caller can use this to + ; find the size of tokens.by_name_2 +``` + +#### `tokens.by_id` + +contains some tokens with their metadata. Some tokens have embedded information (`0x10XX` for instance). Those will not have entries in this table, being handled instead inside the assemble function itself. 
+ +metadata about some tokens in the following form: + +``` ++----------------+----------+-------+----------+ +| 31 24 | 23 20 | 19 16 | 15 0 | ++----------------+----------+-------+----------+ +| typed metadata | reserved | type | token ID | ++----------------+----------+-------+----------+ +``` + +the `type` hex digit is defined as the following: + +| hex | meaning | examples | +|-----|----------|-| +| 0x0 | ignored | `; this entire comment is 1 token` | +| 0x1 | operator | `mov`, `hlt` | +| 0x2 | register | `rsp`, `al` | + +type metadata for the different types is as follows: + +``` ++----------+ +| type 0x0 | ++----------+ +| 31 24 | ++----------+ +| reserved | ++----------+ +``` + +``` ++-------------------------------+ +| type 0x1 | ++----------+--------------------+ +| 31 26 | 25 24 | ++----------+--------------------+ +| reserved | number of operands | ++----------+--------------------+ +``` + +``` ++------------------+ +| type 0x2 | ++----------+-------+ +| 31 26 | 25 24 | ++----------+-------+ +| reserved | width | ++----------+-------+ + +; width: +00b ; 8 bit +01b ; 16 bit +10b ; 32 bit +11b ; 64 bit +``` + ### token IDs supported tokens are listed below diff --git a/twasm/asm/main.asm b/twasm/asm/main.asm index dce7bed..bc9b333 100644 --- a/twasm/asm/main.asm +++ b/twasm/asm/main.asm @@ -1,3 +1,5 @@ +; TODO actually enforce any of these *_SIZE constants :p + LOAD_ADDR equ 0x00010000 ; address this program is loaded at TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at @@ -34,8 +36,40 @@ start: mov rsi, [program.size] ; = size of program call tokenise + call clear_output_arena + call assemble + jmp halt +; ------------------------------------------------------------------------------ +; assembling +; ------------------------------------------------------------------------------ + +; ------------------------------------------------------------------------------ +; assemble +; TODO write tests +; +; description: +; assembles the program from tokens 
located at TOKEN_TABLE_ADDR into a flat +; binary located at OUTPUT_ADDR. It's probably desirable to clear the output +; arena before calling this function. +; ------------------------------------------------------------------------------ + +assemble: + xor rax, rax ; number of tokens processed + .loop: + cmp rax, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE ; check incrementer + ; against the number of + ; entries in the token + ; table + jae .break ; unsigned: stop once every entry (0..count-1) is processed + + + inc rax ; move to next token + jmp .loop + .break: + ret + ; ------------------------------------------------------------------------------ ; tokenising ; ------------------------------------------------------------------------------ @@ -46,7 +80,7 @@ start: ; ; description: ; represents the program at the given address and puts it in the token table -; it's probably desirable to clear the token table before calling this function +; it's probably desirable to clear the token table before calling this function. ; ; parameters: ; rdi -> first byte of program @@ -156,11 +190,11 @@ identify_token: ; length1 .start_length1: - mov rcx, tokens.length1 ; rcx -> list of known tokens + mov rcx, tokens.by_name_1 ; rcx -> list of known tokens .loop_length1: - cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens - jge .unrecognised ; if not, unrecognised + cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens + jge .unrecognised ; if not, unrecognised mov r10b, [rcx] ; known token mov r11b, [rdi] ; token @@ -177,11 +211,11 @@ identify_token: ; length2 .start_length2: - mov rcx, tokens.length2 ; rcx -> list of known tokens + mov rcx, tokens.by_name_2 ; rcx -> list of known tokens .loop_length2: - cmp rcx, tokens.length3 ; check if rcx still in the bounds of length2 tokens - jge .unrecognised ; if not, unrecognised + cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens + jge .unrecognised ; if not, unrecognised mov r10w, [rcx] ; current 
entry in known tokens mov r11w, [rdi] ; token @@ -198,11 +232,11 @@ identify_token: ; length3 .start_length3: - mov rcx, tokens.length3 ; rcx -> list of known tokens + mov rcx, tokens.by_name_3 ; rcx -> list of known tokens .loop_length3: - cmp rcx, tokens.length4 ; check if rcx still in bounds of length3 tokens - jge .unrecognised ; if not, unrecognised + cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens + jge .unrecognised ; if not, unrecognised ; TODO make this safe (it overreaches 1 byte) mov r10d, [rcx] ; known token + next byte @@ -224,11 +258,11 @@ identify_token: ; length4 .start_length4: - mov rcx, tokens.length4 ; rcx -> list of known tokens + mov rcx, tokens.by_name_4 ; rcx -> list of known tokens .loop_length4: - cmp rcx, tokens.length5 ; check if rcx still in bounds of length3 tokens - jge .unrecognised ; if not, unrecognised + cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length4 tokens + jge .unrecognised ; if not, unrecognised mov r10d, [rcx] ; known token mov r11d, [rdi] ; token @@ -483,6 +517,20 @@ clear_test_arena: rep stosd ret +; ------------------------------------------------------------------------------ +; clear_output_arena +; +; description: +; clears the output arena as specified by OUTPUT_SIZE and OUTPUT_ADDR +; ------------------------------------------------------------------------------ + +clear_output_arena: + xor eax, eax ; value to write + mov rcx, OUTPUT_SIZE / 4 ; number of double words + mov rdi, OUTPUT_ADDR ; address to start + rep stosd + ret + ; ------------------------------------------------------------------------------ ; tests ; ------------------------------------------------------------------------------ @@ -865,7 +913,7 @@ test_identify_next_token: ; ------------------------------------------------------------------------------ tokens: - .length1: + .by_name_1: db "[" dw 0x0051 db "]" @@ -878,7 +926,7 @@ tokens: dw 0x0064 db "/" dw 0x0065 - .length2: + .by_name_2: db "r8" dw 
0x0008 db "r9" @@ -933,7 +981,7 @@ tokens: dw 0x005F db "jl" dw 0x0061 - .length3: + .by_name_3: db "rax" dw 0x0000 db "rbx" @@ -1032,7 +1080,7 @@ tokens: dw 0x005E db "jle" dw 0x0060 - .length4: + .by_name_4: db "r10d" dw 0x001A db "r11d" @@ -1073,8 +1121,8 @@ tokens: dw 0x0050 db "call" dw 0x0059 - .length5: - .end: + .by_name_5: + .by_id: msg_welcome db "Welcome to Twasm", 0x0A, 0x00 msg_halt db "halted.", 0x0A, 0x00