From 0b7526661c0c6cd39d744c68c565e77f4aa7196d Mon Sep 17 00:00:00 2001
From: andromeda <andromeda@lenovo>
Date: Sun, 8 Mar 2026 16:03:24 +0100
Subject: [PATCH] clear up internal data structures, add to README

---
 twasm/README.md    | 93 ++++++++++++++++++++++++++++++++++++++++++++++
 twasm/asm/main.asm | 86 ++++++++++++++++++++++++++++++++----------
 2 files changed, 160 insertions(+), 19 deletions(-)

diff --git a/twasm/README.md b/twasm/README.md
index 7d0309d..be3f5ad 100644
--- a/twasm/README.md
+++ b/twasm/README.md
@@ -48,6 +48,99 @@ each token gets loaded into the token table with the following form:
 +----------+
 ```
 
+### internal data structures
+
+#### `tokens.by_name_X`
+
+contains every token whose name is `X` bytes long, each followed by its ID. For any non-empty `tokens.by_name_X`, `tokens.by_name_<X+1> - tokens.by_name_X` is the size in bytes of `tokens.by_name_X`.
+
+each entry is in the following form:
+
+```
++----------+--------------------------------+
+|[2 bytes] | 8 * token_length - 1         0 |
++----------+--------------------------------+
+| token ID | string without null terminator |
++----------+--------------------------------+
+
+```
+
+example implementation:
+
+```nasm
+tokens:
+  .by_name_1:
+    db "+"
+    dw 0x0062
+    db "-"
+    dw 0x0063
+  .by_name_2:
+    db "r8"
+    dw 0x0008
+  .by_name_3: ; this is required for future-proofing; the caller can use this to
+              ; find the size of tokens.by_name_2
+```
+
+#### `tokens.by_id`
+
+contains some tokens with their metadata. Some tokens have embedded information (`0x10XX` for instance). Those will not have entries in this table, being handled instead inside the assemble function itself.
+
+each entry holds the metadata for one token in the following form:
+
+```
++----------------+----------+-------+----------+
+| 31          24 | 23    20 | 19 16 | 15     0 |
++----------------+----------+-------+----------+
+| typed metadata | reserved | type  | token ID |
++----------------+----------+-------+----------+
+```
+
+the `type` hex digit is defined as follows:
+
+| hex | meaning  | examples |
+|-----|----------|-|
+| 0x0 | ignored  | `; this entire comment is 1 token` |
+| 0x1 | operator | `mov`, `hlt` |
+| 0x2 | register | `rsp`, `al` |
+
+the typed metadata for each type is laid out as follows:
+
+```
++----------+
+| type 0x0 |
++----------+
+| 31    24 |
++----------+
+| reserved |
++----------+
+```
+
+```
++-------------------------------+
+| type 0x1                      |
++----------+--------------------+
+| 31    26 | 25              24 |
++----------+--------------------+
+| reserved | number of operands |
++----------+--------------------+
+```
+
+```
++------------------+
+| type 0x2         |
++----------+-------+
+| 31    26 | 25 24 |
++----------+-------+
+| reserved | width |
++----------+-------+
+
+; width:
+00b ; 8 bit
+01b ; 16 bit
+10b ; 32 bit
+11b ; 64 bit
+```
+
 ### token IDs
 
 supported tokens are listed below
diff --git a/twasm/asm/main.asm b/twasm/asm/main.asm
index dce7bed..bc9b333 100644
--- a/twasm/asm/main.asm
+++ b/twasm/asm/main.asm
@@ -1,3 +1,5 @@
+; TODO actually enforce any of these *_SIZE constants :p
+
 LOAD_ADDR equ 0x00010000 ; address this program is loaded at
 
 TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
@@ -34,8 +36,40 @@ start:
   mov rsi, [program.size] ; = size of program
   call tokenise
 
+  call clear_output_arena
+  call assemble
+
   jmp halt
 
+; ------------------------------------------------------------------------------
+; assembling
+; ------------------------------------------------------------------------------
+
+; ------------------------------------------------------------------------------
+; assemble
+; TODO write tests
+;
+; description:
+; assembles the program from tokens located at TOKEN_TABLE_ADDR into a flat
+; binary located at OUTPUT_ADDR. It's probably desirable to clear the output
+; arena before calling this function.
+; ------------------------------------------------------------------------------
+
+assemble:
+  xor eax, eax ; rax = index of the next token table entry to process
+  .loop:
+    ; valid entry indices are 0 .. (TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE)
+    ; - 1, so stop as soon as rax reaches the entry count. The comparison is
+    ; unsigned because rax is a count and is never meant to be negative.
+    cmp rax, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE
+    jae .break
+
+    ; TODO assemble the token table entry at index rax
+    inc rax ; move to next token
+    jmp .loop
+  .break:
+    ret
+
 ; ------------------------------------------------------------------------------
 ; tokenising
 ; ------------------------------------------------------------------------------
@@ -46,7 +80,7 @@ start:
 ;
 ; description:
 ; represents the program at the given address and puts it in the token table
-; it's probably desirable to clear the token table before calling this function
+; it's probably desirable to clear the token table before calling this function.
 ;
 ; parameters:
 ; rdi -> first byte of program
@@ -156,11 +190,11 @@ identify_token:
 
   ; length1
   .start_length1:
-    mov rcx, tokens.length1 ; rcx -> list of known tokens
+    mov rcx, tokens.by_name_1 ; rcx -> list of known tokens
 
   .loop_length1:
-    cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
-    jge .unrecognised       ; if not, unrecognised
+    cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens
+    jge .unrecognised         ; if not, unrecognised
 
     mov r10b, [rcx] ; known token
     mov r11b, [rdi] ; token
@@ -177,11 +211,11 @@ identify_token:
 
   ; length2
   .start_length2:
-    mov rcx, tokens.length2 ; rcx -> list of known tokens
+    mov rcx, tokens.by_name_2 ; rcx -> list of known tokens
 
   .loop_length2:
-    cmp rcx, tokens.length3 ; check if rcx still in the bounds of length2 tokens
-    jge .unrecognised       ; if not, unrecognised
+    cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens
+    jge .unrecognised         ; if not, unrecognised
 
     mov r10w, [rcx] ; current entry in known tokens
     mov r11w, [rdi] ; token
@@ -198,11 +232,11 @@ identify_token:
 
   ; length3
   .start_length3:
-    mov rcx, tokens.length3 ; rcx -> list of known tokens
+    mov rcx, tokens.by_name_3 ; rcx -> list of known tokens
 
   .loop_length3:
-    cmp rcx, tokens.length4 ; check if rcx still in bounds of length3 tokens
-    jge .unrecognised       ; if not, unrecognised
+    cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens
+    jge .unrecognised         ; if not, unrecognised
 
     ; TODO make this safe (it overreaches 1 byte)
     mov r10d, [rcx] ; known token + next byte
@@ -224,11 +258,11 @@ identify_token:
 
   ; length4
   .start_length4:
-    mov rcx, tokens.length4 ; rcx -> list of known tokens
+    mov rcx, tokens.by_name_4 ; rcx -> list of known tokens
 
   .loop_length4:
-    cmp rcx, tokens.length5 ; check if rcx still in bounds of length3 tokens
-    jge .unrecognised       ; if not, unrecognised
+    cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length4 tokens
+    jge .unrecognised         ; if not, unrecognised
 
     mov r10d, [rcx] ; known token
     mov r11d, [rdi] ; token
@@ -483,6 +517,20 @@ clear_test_arena:
   rep stosd
   ret
 
+; ------------------------------------------------------------------------------
+; clear_output_arena
+;
+; description:
+; clears the output arena as specified by OUTPUT_SIZE and OUTPUT_ADDR
+; ------------------------------------------------------------------------------
+
+clear_output_arena:
+  mov rdi, OUTPUT_ADDR     ; rdi -> first byte of the output arena
+  mov rcx, OUTPUT_SIZE / 4 ; rcx = arena length in double words
+  xor eax, eax             ; eax = 0, the fill value used by stosd
+  rep stosd                ; write eax rcx times from [rdi] (assumes DF clear)
+  ret
+
 ; ------------------------------------------------------------------------------
 ; tests
 ; ------------------------------------------------------------------------------
@@ -865,7 +913,7 @@ test_identify_next_token:
 ; ------------------------------------------------------------------------------
 
 tokens:
-  .length1:
+  .by_name_1:
     db "["
     dw 0x0051
     db "]"
@@ -878,7 +926,7 @@ tokens:
     dw 0x0064
     db "/"
     dw 0x0065
-  .length2:
+  .by_name_2:
     db "r8"
     dw 0x0008
     db "r9"
@@ -933,7 +981,7 @@ tokens:
     dw 0x005F
     db "jl"
     dw 0x0061
-  .length3:
+  .by_name_3:
     db "rax"
     dw 0x0000
     db "rbx"
@@ -1032,7 +1080,7 @@ tokens:
     dw 0x005E
     db "jle"
     dw 0x0060
-  .length4:
+  .by_name_4:
     db "r10d"
     dw 0x001A
     db "r11d"
@@ -1073,8 +1121,8 @@ tokens:
     dw 0x0050
     db "call"
     dw 0x0059
-  .length5:
-  .end:
+  .by_name_5:
+  .by_id:
 
 msg_welcome db "Welcome to Twasm", 0x0A, 0x00
 msg_halt db "halted.", 0x0A, 0x00