From 0ee8ff7914683f492c04e12bbec2af8cc2bb4125 Mon Sep 17 00:00:00 2001
From: andromeda <andromeda@lenovo>
Date: Thu, 12 Mar 2026 23:03:29 +0100
Subject: [PATCH] some major architecture changes

---
 twasm/README.md     |  85 ++++-
 twasm/asm/main.asm  | 827 +++++++++++++++++++++-----------------------
 twasm/asm/tests.asm | 202 -----------
 3 files changed, 459 insertions(+), 655 deletions(-)

diff --git a/twasm/README.md b/twasm/README.md
index d6d2ec5..c2cc263 100644
--- a/twasm/README.md
+++ b/twasm/README.md
@@ -12,6 +12,70 @@ I want to compile Bootler and Twasm with the Twasm assembler
 - [opcodes,ModR/M,SIB](http://ref.x86asm.net/coder64.html) (no secure site available)
 - [calling conventions](https://wiki.osdev.org/Calling_Conventions); I try to use System V
 
+### tokeniser
+
+whitespace is ignored for the sake of readability; it can go between pretty much anything
+
+```
+------------------------
+tokeniser
+------------------------
+byte(s) -> next byte(s)
+------------------------
+Newline   -> Newline
+          -> Komment
+          -> Operator
+          -> Directive
+
+Komment   -> Newline
+
+Operator  -> Newline
+          -> Komment
+          -> Operand
+
+Operand   -> Newline
+          -> Komment
+          -> Comma
+
+Comma     -> Operand
+
+Directive -> Newline
+          -> Komment
+          -> Operator
+------------------------
+```
+
+not yet implemented:
+
+```
+------------------------
+operand parser
+------------------------
+byte(s) -> next byte(s)
+------------------------
+START    -> '['
+         -> Register
+         -> Constant
+
+'['      -> Register
+         -> Constant
+
+']'      -> END
+
+Register -> IF #[, ']'
+         -> Operator
+
+Constant -> IF #[, ']'
+         -> Operator
+
+Operator -> IF NOT #R, Register
+         -> Constant
+------------------------
+#R = whether a register has been found
+#[ = whether a '[' has been found
+------------------------
+```
+
 ### memory map
 
 ```
@@ -50,15 +114,15 @@ each token gets loaded into the token table with the following form:
 
 ### internal data structures
 
-#### `tokens.by_nameX`
+#### `tokens.[operators|registers]`
 
-contains all tokens of that length followed by their ID. For some non-empty `tokens.by_nameX`, it is true that `tokens.by_name<X+1> - tokens.by_nameX` is the size in bytes of `tokens.by_nameX`.
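+contains the known tokens grouped by type (operators, registers). Each table is meant to be searched by token name to obtain the token's ID; names shorter than 4 bytes are zero-padded to fill the 4-byte name field.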
 
 each entry is in the following form:
 
 ```
 +----------+--------------------------------+
-|[2 bytes] | 8 * token_length - 1         0 |
+| 47    32 | 31                           0 |
 +----------+--------------------------------+
 | token ID | string without null terminator |
 +----------+--------------------------------+
@@ -68,19 +132,16 @@ each entry is in the following form:
 example implementation:
 
 ```nasm
-tokens:
-  .by_name1:
-    db "+"
-    dw 0x0062
-    db "-"
-    dw 0x0063
-  .by_name2:
-    db "r8"
+tokens:
+  .registers:
+    dd "r8"
     dw 0x0008
   .by_name3: ; this is required for futureproofness; the caller can use this to
-             ; find the size of tokens.by_name2
+             ; find the size of registers.by_name2
 ```
 
+note that token names longer than 4 bytes do not fit in the 4-byte name field of an entry, so they cannot be stored in this format yet :/
+
 #### `tokens.by_id`
 
 contains some tokens with their metadata. Some tokens have embedded information (`0x10XX` for instance). Those will not have entries in this table, being handled instead inside the assemble function itself.
diff --git a/twasm/asm/main.asm b/twasm/asm/main.asm
index b2961b8..22a1084 100644
--- a/twasm/asm/main.asm
+++ b/twasm/asm/main.asm
@@ -22,6 +22,14 @@ UNRECOGNISED_ID_OPCODE equ 0x90   ; opcode of an unrecognised id (NOP)
 
 TEST_LINE_LENGTH equ 80 ; right border of test suite results
 
+; flags for expected values in tokeniser; OR-ed together in tokenise.expecting
+E_COMMENT equ 1 << 0    ; a ";" comment may come next
+E_NEWLINE equ 1 << 1    ; a newline (0x0A) may come next
+E_WHITESPACE equ 1 << 2 ; an ignored whitespace byte may come next
+E_COMMA equ 1 << 3      ; a "," may come next
+E_OPERATOR equ 1 << 4   ; an operator may come next
+E_OPERAND equ 1 << 5    ; an operand may come next
+
 [bits 64]
 [org LOAD_ADDR]
 [default abs]   ; TODO see if I actually need to do this
@@ -455,256 +463,201 @@ get_reg_bits:
 ; ------------------------------------------------------------------------------
 
 tokenise:
-  add rsi, rdi ; last byte of program
-  xor ecx, ecx ; number of tokens processed
+               ; rdi -> current byte of program
+  add rsi, rdi ; rsi -> last byte of program
+  xor eax, eax ; rax = number of tokens processed
+  xor edx, edx ; dl = current byte of program
+
   .loop:
     cmp rdi, rsi ; if current byte greater than last byte
     jg .break    ; then break
 
-    push rdi
-    push rsi
-    push rcx
+    mov dl, [rdi] ; dl = current byte
 
-    ; rdi -> current byte
-    call identify_next_token
-    ; ax = id of token
-    ; dx = length of token
+    cmp dl, ";" ; if current byte is the start of a comment
+    je .comment ; then handle the comment
 
-    pop rcx
-    pop rsi
-    pop rdi
+    cmp dl, 0x0A         ; if current byte is the end of a line
+    je .newline_mk_flags ; then reset relevant flags
 
-    ; deal with terminator character (reported as 0 length token)
-    cmp rdx, 0
-    je .token_length0
-    jne .continue0
-
-  .token_length0:
-    mov ax, 0xFE00 ; terminator character
-    mov al, [rdi]  ; byte of terminator
-    mov edx, 1     ; byte length is 1
-
-  .continue0:
-    add rdi, rdx ; current byte + length of token = next unread byte
-
-    mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax ; fill next entry
-                                                               ; in token table
-
-    ; TODO fix undefined behaviour when open brackets and closed brackets aren't
-    ; correctly paired or have too much distance between them
-    cmp ax, 0x0051             ; check if read token is an open bracket
-    je .open_bracket           ; if so, handle it
-    jne .continue_open_bracket ; if not, continue
-
-  .open_bracket:
-    ; TODO make brackets able to hold more
-    mov [.data_open_bracket], cl ; record which entry the open bracket is at
-
-  .continue_open_bracket:
-    cmp ax, 0x0052              ; check if read token is a closing bracket
-    je .close_bracket           ; if so, handle it
-    jne .continue_close_bracket ; if not, continue
-
-  .close_bracket:
-    ; rewrite open bracket token entry with a filled out one
-    push rcx
-
-    mov dl, [.data_open_bracket]
-    sub cl, dl
-    mov byte [TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], cl
-    mov byte [1 + TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], 0x10
-
-    pop rcx
-
-  .continue_close_bracket:
-    inc rcx ; +1 token processed
-    jmp .loop
-  .break:
-    mov rax, rcx
-    ret
-
-  .data_open_bracket db 0x00 ; represents the token # of the latest open bracket
-
-; ------------------------------------------------------------------------------
-; identify_token
-;
-; description:
-; returns the id of a given token. If there are multiple ways to represent a
-; given token, like the open-bracket, it returns the one that doesn't require
-; information about the surrounding tokens, because it has no such information.
-; In other words, if it isn't in the `tokens` data structure, this function
-; doesn't see it. If the first byte of the token points to a terminator
-; byte, this function returns it as an unrecognised token.
-;
-; parameters:
-; rdi -> first byte of token
-; rsi = size of token in bytes
-;
-; returned:
-; ax = id of token; the rest of rax is zeroed
-; ------------------------------------------------------------------------------
-
-identify_token:
-  cmp rsi, 1        ; if the token has length 1
-  je .start_length1 ; then enter the length 1 loop
-
-  cmp rsi, 2        ; if the token has length 2
-  je .start_length2 ; then enter the length 2 loop
-
-  cmp rsi, 3        ; if the token has length 3
-  je .start_length3 ; then enter the length 3 loop
-
-  cmp rsi, 4        ; if the token has length 4
-  je .start_length4 ; then enter the length 4 loop
-
-  jmp .unrecognised ; else unrecognised
-
-  ; length1
-  .start_length1:
-    mov rcx, tokens.by_name_1 ; rcx -> list of known tokens
-
-  .loop_length1:
-    cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens
-    jge .unrecognised         ; if not, unrecognised
-
-    mov r10b, [rcx] ; known token
-    mov r11b, [rdi] ; token
-    cmp r10b, r11b    ; if known token matches token
-    je .found_length1 ; exit loop
-
-    add rcx, 3 ; length of token + length of id
-    jmp .loop_length1
-
-  .found_length1:
-    xor eax, eax      ; make sure rest of rax is zeroed
-    mov ax, [rcx + 1] ; return id of token
-    ret
-
-  ; length2
-  .start_length2:
-    mov rcx, tokens.by_name_2 ; rcx -> list of known tokens
-
-  .loop_length2:
-    cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens
-    jge .unrecognised         ; if not, unrecognised
-
-    mov r10w, [rcx] ; current entry in known tokens
-    mov r11w, [rdi] ; token
-    cmp r10w, r11w  ; if current entry matches token,
-    je .found_length2 ; exit loop
-
-    add rcx, 4 ; length of token + length of id
-    jmp .loop_length2
-
-  .found_length2:
-    xor eax, eax      ; make sure rest of rax is zeroed
-    mov ax, [rcx + 2] ; return id of token
-    ret
-
-  ; length3
-  .start_length3:
-    mov rcx, tokens.by_name_3 ; rcx -> list of known tokens
-
-  .loop_length3:
-    cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens
-    jge .unrecognised         ; if not, unrecognised
-
-    ; TODO make this safe (it overreaches 1 byte)
-    mov r10d, [rcx] ; known token + next byte
-    mov r11d, [rdi] ; token + next byte
-
-    and r10d, 0x00FFFFFF ; mask for just the token
-    and r11d, 0x00FFFFFF
-
-    cmp r10d, r11d    ; if known token matches token,
-    je .found_length3 ; exit loop
-
-    add rcx, 5 ; length of token + length of id
-    jmp .loop_length3
-
-  .found_length3:
-    xor rax, rax      ; zero rax
-    mov ax, [rcx + 3] ; return id of token
-    ret
-
-  ; length4
-  .start_length4:
-    mov rcx, tokens.by_name_4 ; rcx -> list of known tokens
-
-  .loop_length4:
-    cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length3 tokens
-    jge .unrecognised         ; if not, unrecognised
-
-    mov r10d, [rcx] ; known token
-    mov r11d, [rdi] ; token
-    cmp r10d, r11d  ; if known token matches token,
-    je .found_length4 ; exit loop
-
-    add rcx, 6 ; length of token + length of id
-    jmp .loop_length4
-
-  .found_length4:
-    xor rax, rax      ; zero rax
-    mov ax, [rcx + 4] ; return id of token
-    ret
-
-  .unrecognised:
-    xor eax, eax
-    mov ax, UNRECOGNISED_TOKEN_ID
-    ret
-
-; ------------------------------------------------------------------------------
-; identify_next_token
-; description:
-; like identify_token, except it automatically finds the length. If the first
-; byte of the token points to a terminator byte, it returns a length of 0.
-;
-; parameters:
-; rdi -> first byte of token
-;
-; returned:
-; ax = id of token; the rest of rax is zeroed
-; dx = length of token in bytes; the rest of rdx is zeroed
-; ------------------------------------------------------------------------------
-
-identify_next_token:
-  push rdi
-
-  mov rsi, rdi ; rsi is the current byte
-  xor rdi, rdi ; rdi is the length
-  .loop:
-    xor edx, edx
-    mov dl, [rsi]
+    cmp dl, "," ; if current byte is a comma
+    je .comma   ; then handle the comma
 
     push rsi
     push rdi
+    push rax
     push rdx
-
-    mov rdi, 8                  ; length of terminator list
-    mov rsi, token_terminator_8 ; start of terminator list
+    mov rsi, whitespace_2 ; rsi -> list of whitespace (ignored) bytes
+    mov rdi,            2 ; rdi = size of list in bytes
+                          ; dl = current byte
     call elemb
-
+    ; al = 0 if not whitespace, 1 if whitespace
+    cmp al, 1  ; check if current byte is whitespace
     pop rdx
+    pop rax
     pop rdi
     pop rsi
+    je .skip_byte_whitespace
 
-    cmp rax, 1 ; check if the next character is a token terminator
-    je .break  ; if so, break
+    test byte [.expecting], E_OPERATOR ; check if an operator is expected
+    jnz .operator                      ; if so, handle it
+    jmp .operand                       ; otherwise, handle as an operand
 
-    inc rdi ; next character
-    inc rsi ; next byte of token
+  .comment: ; found ";": skip the rest of the line, leaving the newline unread
+    push rsi                          ; rsi is overwritten with message pointers
+    mov rsi, .found
+    call print.debug
+    mov rsi, .msg_comment
+    call print
+    pop rsi                           ; rsi -> last byte of program again
+    test byte [.expecting], E_COMMENT ; make sure a comment is expected
+    jz .unexpected_comment            ; if not, error
+  .comment_loop:
+    cmp rdi, rsi      ; if the comment runs past the last byte of the program
+    ja .comment_break ; then stop; .loop sees rdi > rsi and breaks
+    mov dl, [rdi]     ; dl = current byte
+    cmp dl, 0x0A      ; if current byte is a newline
+    je .comment_break ; then break; .loop handles the newline itself
+
+    inc rdi           ; point to next unread byte
+    jmp .comment_loop
+  .comment_break:
+    jmp .loop
+
+  .skip_byte_whitespace: ; current byte matched the whitespace_2 list
+    push rsi                             ; rsi is overwritten with message pointers
+    mov rsi, .found
+    call print.debug
+    mov rsi, .msg_whitespace
+    call print
+    pop rsi                              ; rsi -> last byte of program again
+
+    test byte [.expecting], E_WHITESPACE ; make sure a whitespace was expected
+    jz .unexpected_whitespace            ; if not, error
+    inc rdi                              ; step over the whitespace byte
+    jmp .loop                            ; else, loop
+
+  .comma: ; found comma: consume it and require an operand next
+    push rsi                                   ; rsi is overwritten with message pointers
+    mov rsi, .found
+    call print.debug
+    mov rsi, .msg_comma
+    call print
+    pop rsi                                    ; rsi -> last byte of program again
+
+    test byte [.expecting], E_COMMA                 ; make sure a comma was expected
+    jz .unexpected_comma                            ; if not, error
+    inc rdi                                         ; step over the comma
+    mov byte [.expecting], E_WHITESPACE | E_OPERAND ; else, make operand expected
+    jmp .loop                                       ;       and loop
+
+  .newline_mk_flags: ; found 0x0A: end the current line and reset the state
+    push rsi                          ; rsi is overwritten with message pointers
+    mov rsi, .found
+    call print.debug
+    mov rsi, .msg_newline
+    call print
+    pop rsi                           ; rsi -> last byte of program again
+
+    test byte [.expecting], E_NEWLINE ; make sure a newline was expected
+    jz .unexpected_newline            ; if not, error
+    ; a new line may start with a comment, newline, whitespace or operator
+    mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR
+
+    inc rdi                           ; step over the newline byte
+    jmp .loop
+
+  .operator: ; reached from .loop only when E_OPERATOR is set in .expecting
+    push rsi               ; rsi is overwritten with message pointers
+    mov rsi, .found
+    call print.debug
+    mov rsi, .msg_operator
+    call print
+    pop rsi                ; rsi -> last byte of program again
+  .operator_loop:
+    mov dl, [rdi]      ; dl = next byte of the operator
+    cmp dl, " "        ; a space ends the operator
+    je .operator_break
+    cmp dl, 0x0A       ; so does a newline
+    je .operator_break
+    cmp dl, ";"        ; so does the start of a comment
+    je .operator_break
+    cmp dl, 0x00       ; so does a NUL, as in .operand_loop; without this the
+    je .operator_break ; scan runs past a program that ends without a newline
+    inc rdi            ; inc byte counter
+    jmp .operator_loop ; and loop
+  .operator_break:
+    mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERAND
+    jmp .loop
+
+  .operand: ; reached from .loop when E_OPERATOR is not set in .expecting
+    push rsi                          ; rsi is overwritten with message pointers
+    mov rsi, .found
+    call print.debug
+    mov rsi, .msg_operand
+    call print
+    pop rsi                           ; rsi -> last byte of program again
+    test byte [.expecting], E_OPERAND ; make sure an operand was expected
+    jz .unexpected_operand            ; if not, error
+  .operand_loop: ; NOTE(review): ";" and " " do not end the scan, so an inline comment is scanned as operand
+    mov dl, [rdi]     ; dl = next byte of the operand
+    cmp dl, ","       ; a comma ends the operand
+    je .operand_break
+    cmp dl, 0x0A      ; so does a newline
+    je .operand_break
+    cmp dl, 0x00      ; so does a NUL byte
+    je .operand_break
+    inc rdi           ; point to next unread byte
+    jmp .operand_loop ; NOTE(review): rdi is not checked against rsi here
+  .operand_break: ; rdi is left on the terminating byte for .loop to handle
+    mov byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_COMMA
     jmp .loop
 
   .break:
-    mov rsi, rdi ; length of token
+    ret
 
-  pop rdi
+  ; state
 
-  push rsi
-  call identify_token
-  pop rsi
-  mov rdx, rsi ; length
-  ret
+  .expecting db E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR ; start-of-line state
+  ; NOTE(review): .expecting is not reset on entry to tokenise; confirm that is intended
+  .unexpected_whitespace:     ; whitespace found while E_WHITESPACE was clear
+    mov rsi, .err_unexpected
+    call print.error          ; presumably prints with an error prefix; see print.error
+    mov rsi, .msg_whitespace
+    call print
+    jmp halt                  ; jump to halt (defined elsewhere); no ret on this path
+  .unexpected_comment:        ; ";" found while E_COMMENT was clear
+    mov rsi, .err_unexpected
+    call print.error
+    mov rsi, .msg_comment
+    call print
+    jmp halt
+  .unexpected_newline:        ; 0x0A found while E_NEWLINE was clear
+    mov rsi, .err_unexpected
+    call print.error
+    mov rsi, .msg_newline
+    call print
+    jmp halt
+  .unexpected_comma:          ; "," found while E_COMMA was clear
+    mov rsi, .err_unexpected
+    call print.error
+    mov rsi, .msg_comma
+    call print
+    jmp halt
+  .unexpected_operand:        ; operand found while E_OPERAND was clear
+    mov rsi, .err_unexpected
+    call print.error
+    mov rsi, .msg_operand
+    call print
+    jmp halt
+  .err_unexpected db "unexpected ", 0x00        ; prefix of every error message
+  .found db "found ", 0x00                      ; prefix of every debug trace message
+  .msg_whitespace db "whitespace.", 0x0A, 0x00  ; NUL-terminated token kind names,
+  .msg_comment db "comment.", 0x0A, 0x00        ; shared by the trace and error paths
+  .msg_newline db "newline.", 0x0A, 0x00
+  .msg_comma db "comma.", 0x0A, 0x00
+  .msg_operator db "operator.", 0x0A, 0x00
+  .msg_operand db "operand.", 0x0A, 0x00
 
 ; ------------------------------------------------------------------------------
 ; utilities
@@ -789,7 +742,7 @@ halt:
 ; elemb
 ;
 ; description:
-; checks if given byte is element of the specified list
+; checks if given byte is element of the specified list.
 ;
 ; parameters:
 ; rdi = size of list
@@ -872,216 +825,9 @@ clear_output_arena:
 ; data
 ; ------------------------------------------------------------------------------
 
+
+
 tokens:
-  .by_name_1:
-    db "["
-    dw 0x0051
-    db "]"
-    dw 0x0052
-    db "+"
-    dw 0x0062
-    db "-"
-    dw 0x0063
-    db "*"
-    dw 0x0064
-    db "/"
-    dw 0x0065
-  .by_name_2:
-    db "r8"
-    dw 0x0008
-    db "r9"
-    dw 0x0009
-    db "ax"
-    dw 0x0020
-    db "bx"
-    dw 0x0021
-    db "cx"
-    dw 0x0022
-    db "dx"
-    dw 0x0023
-    db "si"
-    dw 0x0024
-    db "di"
-    dw 0x0025
-    db "sp"
-    dw 0x0026
-    db "bp"
-    dw 0x0027
-    db "al"
-    dw 0x0030
-    db "bl"
-    dw 0x0031
-    db "cl"
-    dw 0x0032
-    db "dl"
-    dw 0x0033
-    db "ah"
-    dw 0x0040
-    db "bh"
-    dw 0x0041
-    db "ch"
-    dw 0x0042
-    db "dh"
-    dw 0x0043
-    db "cs"
-    dw 0x0044
-    db "ds"
-    dw 0x0045
-    db "es"
-    dw 0x0046
-    db "fs"
-    dw 0x0047
-    db "gs"
-    dw 0x0048
-    db "ss"
-    dw 0x0049
-    db "je"
-    dw 0x005C
-    db "jg"
-    dw 0x005F
-    db "jl"
-    dw 0x0061
-  .by_name_3:
-    db "rax"
-    dw 0x0000
-    db "rbx"
-    dw 0x0001
-    db "rcx"
-    dw 0x0002
-    db "rdx"
-    dw 0x0003
-    db "rsi"
-    dw 0x0004
-    db "rdi"
-    dw 0x0005
-    db "rsp"
-    dw 0x0006
-    db "rbp"
-    dw 0x0007
-    db "r10"
-    dw 0x000A
-    db "r11"
-    dw 0x000B
-    db "r12"
-    dw 0x000C
-    db "r13"
-    dw 0x000D
-    db "r14"
-    dw 0x000E
-    db "r15"
-    dw 0x000F
-    db "eax"
-    dw 0x0010
-    db "ebx"
-    dw 0x0011
-    db "ecx"
-    dw 0x0012
-    db "edx"
-    dw 0x0013
-    db "esi"
-    dw 0x0014
-    db "edi"
-    dw 0x0015
-    db "esp"
-    dw 0x0016
-    db "ebp"
-    dw 0x0017
-    db "r8d"
-    dw 0x0018
-    db "r9d"
-    dw 0x0019
-    db "r8w"
-    dw 0x0028
-    db "r9w"
-    dw 0x0029
-    db "sil"
-    dw 0x0034
-    db "dil"
-    dw 0x0035
-    db "spl"
-    dw 0x0036
-    db "bpl"
-    dw 0x0037
-    db "r8b"
-    dw 0x0038
-    db "r9b"
-    dw 0x0039
-    db "cr0"
-    dw 0x004A
-    db "cr2"
-    dw 0x004B
-    db "cr3"
-    dw 0x004C
-    db "cr4"
-    dw 0x004D
-    db "cr8"
-    dw 0x004E
-    db "hlt"
-    dw 0x004F
-    db "xor"
-    dw 0x0053
-    db "inc"
-    dw 0x0054
-    db "dec"
-    dw 0x0055
-    db "mov"
-    dw 0x0056
-    db "add"
-    dw 0x0057
-    db "sub"
-    dw 0x0058
-    db "ret"
-    dw 0x005A
-    db "cmp"
-    dw 0x005B
-    db "jne"
-    dw 0x005D
-    db "jge"
-    dw 0x005E
-    db "jle"
-    dw 0x0060
-  .by_name_4:
-    db "r10d"
-    dw 0x001A
-    db "r11d"
-    dw 0x001B
-    db "r12d"
-    dw 0x001C
-    db "r13d"
-    dw 0x001D
-    db "r14d"
-    dw 0x001E
-    db "r15d"
-    dw 0x001F
-    db "r10w"
-    dw 0x002A
-    db "r11w"
-    dw 0x002B
-    db "r12w"
-    dw 0x002C
-    db "r13w"
-    dw 0x002D
-    db "r14w"
-    dw 0x002E
-    db "r15w"
-    dw 0x002F
-    db "r10b"
-    dw 0x003A
-    db "r11b"
-    dw 0x003B
-    db "r12b"
-    dw 0x003C
-    db "r13b"
-    dw 0x003D
-    db "r14b"
-    dw 0x003E
-    db "r15b"
-    dw 0x003F
-    db "int3"
-    dw 0x0050
-    db "call"
-    dw 0x0059
-  .by_name_5:
   .by_id:
     dw 0x0010    ; eax
     db 0x02      ; type: register
@@ -1114,6 +860,202 @@ tokens:
     db 0x01   ; type: operator
     db 0x00   ; # operands
   .by_id_end:
+  .operators:
+    dd "je"
+    dw 0x005C
+    dd "jg"
+    dw 0x005F
+    dd "jl"
+    dw 0x0061
+    dd "hlt"
+    dw 0x004F
+    dd "xor"
+    dw 0x0053
+    dd "inc"
+    dw 0x0054
+    dd "dec"
+    dw 0x0055
+    dd "mov"
+    dw 0x0056
+    dd "add"
+    dw 0x0057
+    dd "sub"
+    dw 0x0058
+    dd "ret"
+    dw 0x005A
+    dd "cmp"
+    dw 0x005B
+    dd "jne"
+    dw 0x005D
+    dd "jge"
+    dw 0x005E
+    dd "jle"
+    dw 0x0060
+    dd "int3"
+    dw 0x0050
+    dd "call"
+    dw 0x0059
+  .operators_end:
+  .registers:
+    dd "r8"
+    dw 0x0008
+    dd "r9"
+    dw 0x0009
+    dd "ax"
+    dw 0x0020
+    dd "bx"
+    dw 0x0021
+    dd "cx"
+    dw 0x0022
+    dd "dx"
+    dw 0x0023
+    dd "si"
+    dw 0x0024
+    dd "di"
+    dw 0x0025
+    dd "sp"
+    dw 0x0026
+    dd "bp"
+    dw 0x0027
+    dd "al"
+    dw 0x0030
+    dd "bl"
+    dw 0x0031
+    dd "cl"
+    dw 0x0032
+    dd "dl"
+    dw 0x0033
+    dd "ah"
+    dw 0x0040
+    dd "bh"
+    dw 0x0041
+    dd "ch"
+    dw 0x0042
+    dd "dh"
+    dw 0x0043
+    dd "cs"
+    dw 0x0044
+    dd "ds"
+    dw 0x0045
+    dd "es"
+    dw 0x0046
+    dd "fs"
+    dw 0x0047
+    dd "gs"
+    dw 0x0048
+    dd "ss"
+    dw 0x0049
+    dd "rax"
+    dw 0x0000
+    dd "rbx"
+    dw 0x0001
+    dd "rcx"
+    dw 0x0002
+    dd "rdx"
+    dw 0x0003
+    dd "rsi"
+    dw 0x0004
+    dd "rdi"
+    dw 0x0005
+    dd "rsp"
+    dw 0x0006
+    dd "rbp"
+    dw 0x0007
+    dd "r10"
+    dw 0x000A
+    dd "r11"
+    dw 0x000B
+    dd "r12"
+    dw 0x000C
+    dd "r13"
+    dw 0x000D
+    dd "r14"
+    dw 0x000E
+    dd "r15"
+    dw 0x000F
+    dd "eax"
+    dw 0x0010
+    dd "ebx"
+    dw 0x0011
+    dd "ecx"
+    dw 0x0012
+    dd "edx"
+    dw 0x0013
+    dd "esi"
+    dw 0x0014
+    dd "edi"
+    dw 0x0015
+    dd "esp"
+    dw 0x0016
+    dd "ebp"
+    dw 0x0017
+    dd "r8d"
+    dw 0x0018
+    dd "r9d"
+    dw 0x0019
+    dd "r8w"
+    dw 0x0028
+    dd "r9w"
+    dw 0x0029
+    dd "sil"
+    dw 0x0034
+    dd "dil"
+    dw 0x0035
+    dd "spl"
+    dw 0x0036
+    dd "bpl"
+    dw 0x0037
+    dd "r8b"
+    dw 0x0038
+    dd "r9b"
+    dw 0x0039
+    dd "cr0"
+    dw 0x004A
+    dd "cr2"
+    dw 0x004B
+    dd "cr3"
+    dw 0x004C
+    dd "cr4"
+    dw 0x004D
+    dd "cr8"
+    dw 0x004E
+    dd "r10d"
+    dw 0x001A
+    dd "r11d"
+    dw 0x001B
+    dd "r12d"
+    dw 0x001C
+    dd "r13d"
+    dw 0x001D
+    dd "r14d"
+    dw 0x001E
+    dd "r15d"
+    dw 0x001F
+    dd "r10w"
+    dw 0x002A
+    dd "r11w"
+    dw 0x002B
+    dd "r12w"
+    dw 0x002C
+    dd "r13w"
+    dw 0x002D
+    dd "r14w"
+    dw 0x002E
+    dd "r15w"
+    dw 0x002F
+    dd "r10b"
+    dw 0x003A
+    dd "r11b"
+    dw 0x003B
+    dd "r12b"
+    dw 0x003C
+    dd "r13b"
+    dw 0x003D
+    dd "r14b"
+    dw 0x003E
+    dd "r15b"
+    dw 0x003F
+  .registers_end:
 
 opcodes:
   .by_id:
@@ -1139,11 +1081,14 @@ msg_halt db "halted.", 0x0A, 0x00
 
 token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00
 
+whitespace_2 db " ", 0x0D
+
 ; test program
 program:
   db "xor eax, eax", 0x0A
-  db "inc rax", 0x0A
+  db "inc rax ; inline comment", 0x0A
+  db "; one line comment", 0x0A
   db "mov [ rax ], rdx", 0x0A
-  db "hlt", 0x0A
+  db "hlt"
   db 0x00 ; just for the sake of being able to print it, I made it a string
-  .size db $ - program - 1
+  .size db $ - program
diff --git a/twasm/asm/tests.asm b/twasm/asm/tests.asm
index fc9fcc7..6fffb84 100644
--- a/twasm/asm/tests.asm
+++ b/twasm/asm/tests.asm
@@ -16,12 +16,6 @@ run_tests:
   call clear_test_arena
   call test_elemb
 
-  call clear_test_arena
-  call test_identify_token
-
-  call clear_test_arena
-  call test_identify_next_token
-
   call clear_test_arena
   call test_get_tte_type
 
@@ -101,202 +95,6 @@ test_elemb:
     ret
   .msg db "test_elemb...", 0x00
 
-; ------------------------------------------------------------------------------
-; test_identify_token
-;
-; description:
-; tests identify_token described functionality
-; ------------------------------------------------------------------------------
-
-test_identify_token:
-  mov rsi, .msg
-  call print.test
-
-  ; length1 token that exists
-  mov byte [TEST_ARENA_ADDR], "*"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 1
-  call identify_token
-  cmp ax, 0x0064
-  jne .fail
-
-  ; length1 token that doesn't exist
-  mov byte [TEST_ARENA_ADDR], " "
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 1
-  call identify_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  ; length2 token that exists
-  mov word [TEST_ARENA_ADDR], "sp"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 2
-  call identify_token
-  cmp ax, 0x0026
-  jne .fail
-
-  ; length2 token that doesn't exist
-  mov word [TEST_ARENA_ADDR], "QQ"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 2
-  call identify_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  ; length3 token that exists
-  mov dword [TEST_ARENA_ADDR], "rax"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 3
-  call identify_token
-  cmp ax, 0x0000
-  jne .fail
-
-  ; length3 token that exists
-  mov dword [TEST_ARENA_ADDR], "cr0"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 3
-  call identify_token
-  cmp ax, 0x004A
-  jne .fail
-
-  ; length3 token that doesn't exist
-  mov dword [TEST_ARENA_ADDR], "r16"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 3
-  call identify_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  ; length4 token that exists
-  mov dword [TEST_ARENA_ADDR], "r10d"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 4
-  call identify_token
-  cmp ax, 0x001A
-  jne .fail
-
-  ; length4 token that exists
-  mov dword [TEST_ARENA_ADDR], "r15b"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 4
-  call identify_token
-  cmp ax, 0x003F
-  jne .fail
-
-  ; length4 token that doesn't exist
-  mov dword [TEST_ARENA_ADDR], "r15q"
-  mov rdi, TEST_ARENA_ADDR
-  mov rsi, 4
-  call identify_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  .pass:
-    mov rsi, msg_pass
-    call print
-    ret
-  .fail:
-    mov rsi, msg_fail
-    call print
-    ret
-  .msg db "test_identify_token...", 0x00
-
-; ------------------------------------------------------------------------------
-; test_identify_next_token
-;
-; description:
-; tests identify_next_token described functionality
-; ------------------------------------------------------------------------------
-
-test_identify_next_token:
-  mov rsi, .msg
-  call print.test
-
-  ; length1 token that exists
-  mov word [TEST_ARENA_ADDR], "* "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0x0064
-  jne .fail
-
-  ; length1 token that doesn't exist
-  mov word [TEST_ARENA_ADDR], "  "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  ; length2 token that exists
-  mov dword [TEST_ARENA_ADDR], "sp  "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0x0026
-  jne .fail
-
-  ; length2 token that doesn't exist
-  mov dword [TEST_ARENA_ADDR], "QQ  "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  ; length3 token that exists
-  mov dword [TEST_ARENA_ADDR], "rax "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0x0000
-  jne .fail
-
-  ; length3 token that exists
-  mov dword [TEST_ARENA_ADDR], "cr0 "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0x004A
-  jne .fail
-
-  ; length3 token that doesn't exist
-  mov dword [TEST_ARENA_ADDR], "r16 "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  ; length4 token that exists
-  mov dword [TEST_ARENA_ADDR], "r10d"
-  mov byte [TEST_ARENA_ADDR + 4], " "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0x001A
-  jne .fail
-
-  ; length4 token that exists
-  mov dword [TEST_ARENA_ADDR], "r15b"
-  mov byte [TEST_ARENA_ADDR + 4], " "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0x003F
-  jne .fail
-
-  ; length4 token that doesn't exist
-  mov dword [TEST_ARENA_ADDR], "r15q"
-  mov byte [TEST_ARENA_ADDR + 4], " "
-  mov rdi, TEST_ARENA_ADDR
-  call identify_next_token
-  cmp ax, 0xFFFF
-  jne .fail
-
-  .pass:
-    mov rsi, msg_pass
-    call print
-    ret
-  .fail:
-    mov rsi, msg_fail
-    call print
-    ret
-  .msg db "test_identify_next_token...", 0x00
-
-
 ; ------------------------------------------------------------------------------
 ; test_get_tte_type
 ;