get tokenising working a bit :p also some fixes and semantics

This commit is contained in:
andromeda
2026-03-08 10:56:20 +01:00
parent 63e3a1ea7e
commit 0d66e77976
2 changed files with 258 additions and 29 deletions

View File

@@ -21,8 +21,6 @@ I want to compile Bootler and Twasm with the Twasm assembler
| stack (rsp) |
+------------------------+
| input |
+------------------------+ <- this is lined up to a sector
| | <- and this is less than a sector
+------------------------+
| assembler |
+------ 0x00010000 ------+
@@ -37,11 +35,11 @@ each word represents a token on the token table.
each token gets loaded into the token table with the following form:
```
+----------+-----------------------+
| 31 16 | 15 0 |
+----------+-----------------------+
| reserved | token id |
+----------+-----------------------+
+----------+----------+
| 31 16 | 15 0 |
+----------+----------+
| reserved | token id |
+----------+----------+
```
### token IDs
@@ -152,4 +150,5 @@ supported tokens are listed below
| - | 0x0063 | |
| * | 0x0064 | |
| / | 0x0065 | |
| | 0xFEXX | token terminator byte as token, where `XX` is the byte |
| | 0xFFFF | unrecognised token |

View File

@@ -1,11 +1,13 @@
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
TOKEN_TABLE_ENTRY_SIZE equ 8 ; size of token table entry; a LOT of things
; may break if this ever changes
OUTPUT_ADDR equ 0x00070000 ; address of the output binary
OUTPUT_SIZE equ 0x1000 ; max length of the output binary
@@ -13,6 +15,8 @@ STACK_ADDR equ 0x00060000 ; address to put the 64-bit stack at
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
TEST_LINE_LENGTH equ 80 ; right border of test suite results
[bits 64]
[org LOAD_ADDR]
@@ -26,12 +30,70 @@ start:
call clear_token_table
mov rdi, program ; -> program
mov rsi, [program.size] ; = size of program
call tokenise
jmp halt
; ------------------------------------------------------------------------------
; tokenising
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
; walks the program at the given address one token at a time and writes one
; entry per token into the token table, starting at the table's first entry.
; a token terminator byte (reported by identify_next_token as a 0 length token)
; is recorded as the token 0xFEXX, where XX is the terminator byte itself.
; it's probably desirable to clear the token table before calling this function
;
; parameters:
; rdi -> first byte of program
; rsi = size of program in bytes
;
; NOTE(review): nothing here stops the write index from running past the end
; of the token table; confirm whether a bound check is wanted.
; ------------------------------------------------------------------------------
tokenise:
	add rsi, rdi			; rsi -> one past the last byte of program
	xor ecx, ecx			; rcx = number of tokens processed

.loop:
	cmp rdi, rsi			; addresses are unsigned, and rsi itself is
	jae .break			; already outside the program, so stop there

	push rdi			; identify_next_token overwrites these
	push rsi
	push rcx
	; rdi -> current byte
	call identify_next_token
	; ax = id of token
	; dx = length of token
	pop rcx
	pop rsi
	pop rdi

	test rdx, rdx			; 0 length means rdi sits on a terminator
	jnz .continue

	; deal with terminator character (reported as 0 length token)
	mov eax, 0xFE00			; terminator id; also zeroes the rest of rax
	mov al, [rdi]			; low byte = the terminator byte
	mov edx, 1			; a terminator is consumed as 1 byte

.continue:
	add rdi, rdx			; current byte + length of token = next unread byte

	mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], rax ; fill next entry
								   ; in token table
	inc rcx				; +1 token processed
	jmp .loop

.break:
	ret
; ------------------------------------------------------------------------------
; identify_token
;
@@ -61,10 +123,10 @@ identify_token:
jmp .unrecognised ; else unrecognised
.start_length1
.start_length1:
mov rcx, tokens.length1 ; rcx -> list of known tokens
.loop_length1
.loop_length1:
cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
jge .unrecognised ; if not, unrecognised
@@ -76,12 +138,12 @@ identify_token:
add rcx, 3 ; length of token + length of id
jmp .loop_length1
.found_length1
.found_length1:
xor eax, eax ; make sure rest of rax is zeroed
mov ax, [rcx + 1] ; return id of token
ret
.start_length2
.start_length2:
mov rcx, tokens.length2 ; rcx -> list of known tokens
.loop_length2:
@@ -151,6 +213,58 @@ identify_token:
mov ax, UNRECOGNISED_TOKEN_ID
ret
; ------------------------------------------------------------------------------
; identify_next_token
;
; description:
; like identify_token, except the caller does not supply the length: bytes are
; counted from rdi until one is found in the token terminator list, and that
; count is handed to identify_token as the token length
;
; parameters:
; rdi -> first byte of token
;
; returned:
; ax = id of token; the rest of rax is zeroed
; dx = length of token in bytes; the rest of rdx is zeroed
; ------------------------------------------------------------------------------
identify_next_token:
	push rdi			; keep -> first byte for identify_token
	xor esi, esi			; rsi = bytes counted so far

.scan:
	movzx edx, byte [rdi + rsi]	; dl = byte under inspection

	push rdi			; elemb takes its arguments in rdi and rsi
	push rsi
	mov edi, 8			; length of terminator list
	mov rsi, token_terminator_8	; start of terminator list
	call elemb
	pop rsi
	pop rdi

	cmp rax, 1			; 1 = this byte is a token terminator
	je .measured

	inc rsi				; not a terminator: it belongs to the token
	jmp .scan

.measured:
	pop rdi				; rdi -> first byte of token again
	push rsi			; keep the length across identify_token
	call identify_token
	pop rdx				; rdx = length of token
	mov rsi, rdx			; rsi also holds the length on return
	ret
; ------------------------------------------------------------------------------
; copy_token
;
@@ -168,7 +282,6 @@ identify_token:
copy_token:
.loop:
mov dl, [rdi] ; move bit to compare to current byte in read buffer
push rdi ; push incrementors to call elemb
@@ -190,7 +303,6 @@ copy_token:
inc rdi ; read pointer
inc rsi ; write pointer
jmp .loop
.break:
@@ -227,6 +339,7 @@ copy_byte:
;
; description:
; prints a null-terminated string
; probably doesn't change any registers for ease of debugging
;
; parameters:
; rsi -> start of null-terminated string
@@ -234,6 +347,9 @@ copy_byte:
print:
push rdx
push rax
push rsi
mov edx, 0x3F8
.loop:
mov al, [rsi]
@@ -243,6 +359,8 @@ print:
inc rsi
jmp .loop
.done:
pop rsi
pop rax
pop rdx
ret
@@ -276,7 +394,7 @@ halt:
; ------------------------------------------------------------------------------
elemb:
.loop
.loop:
cmp rdi, 0 ; check if remaining length 0
je .not_found ; if so, break; dl not an element of list
@@ -289,11 +407,11 @@ elemb:
jmp .loop
.not_found
.not_found:
xor eax, eax ; return 0; dl not an element of list
ret
.found
.found:
xor eax, eax
mov rax, 1 ; return 1; dl an element of list
ret
@@ -356,6 +474,9 @@ run_tests:
call clear_test_arena
call test_identify_token
call clear_test_arena
call test_identify_next_token
ret
.msg db "running test suite...", 0x0D, 0x0A, 0x00
@@ -609,12 +730,106 @@ test_identify_token:
ret
.msg db "test_identify_token...", 0x00
; ------------------------------------------------------------------------------
; test_identify_next_token
;
; description:
; tests identify_next_token described functionality. each case writes a token
; followed by a space into the test arena, calls identify_next_token on it, and
; checks both the returned id (ax) and the returned length (rdx). prints the
; pass message only if every case matches, otherwise the fail message.
; ------------------------------------------------------------------------------
test_identify_next_token:
	mov rsi, .msg
	call print

	; length1 token that exists
	mov word [TEST_ARENA_ADDR], "* "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x0064
	jne .fail
	cmp rdx, 1
	jne .fail

	; input starting on a terminator: 0 length, unrecognised
	mov word [TEST_ARENA_ADDR], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, UNRECOGNISED_TOKEN_ID
	jne .fail
	cmp rdx, 0
	jne .fail

	; length1 token that doesn't exist
	mov word [TEST_ARENA_ADDR], "Q "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, UNRECOGNISED_TOKEN_ID
	jne .fail
	cmp rdx, 1
	jne .fail

	; length2 token that exists
	mov dword [TEST_ARENA_ADDR], "sp "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x0026
	jne .fail
	cmp rdx, 2
	jne .fail

	; length2 token that doesn't exist
	mov dword [TEST_ARENA_ADDR], "QQ "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, UNRECOGNISED_TOKEN_ID
	jne .fail
	cmp rdx, 2
	jne .fail

	; length3 token that exists
	mov dword [TEST_ARENA_ADDR], "rax "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x0000
	jne .fail
	cmp rdx, 3
	jne .fail

	; length3 token that exists
	mov dword [TEST_ARENA_ADDR], "cr0 "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x004A
	jne .fail
	cmp rdx, 3
	jne .fail

	; length3 token that doesn't exist
	mov dword [TEST_ARENA_ADDR], "r16 "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, UNRECOGNISED_TOKEN_ID
	jne .fail
	cmp rdx, 3
	jne .fail

	; length4 token that exists
	mov dword [TEST_ARENA_ADDR], "r10d"
	mov byte [TEST_ARENA_ADDR + 4], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x001A
	jne .fail
	cmp rdx, 4
	jne .fail

	; length4 token that exists
	mov dword [TEST_ARENA_ADDR], "r15b"
	mov byte [TEST_ARENA_ADDR + 4], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x003F
	jne .fail
	cmp rdx, 4
	jne .fail

	; length4 token that doesn't exist
	mov dword [TEST_ARENA_ADDR], "r15q"
	mov byte [TEST_ARENA_ADDR + 4], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, UNRECOGNISED_TOKEN_ID
	jne .fail
	cmp rdx, 4
	jne .fail

.pass:
	mov rsi, msg_pass
	call print
	ret

.fail:
	mov rsi, msg_fail
	call print
	ret

.msg db "test_identify_next_token...", 0x00
; ------------------------------------------------------------------------------
; data
; ------------------------------------------------------------------------------
tokens:
.length1
.length1:
db "["
dw 0x0051
db "]"
@@ -627,7 +842,7 @@ tokens:
dw 0x0064
db "/"
dw 0x0065
.length2
.length2:
db "r8"
dw 0x0008
db "r9"
@@ -682,7 +897,7 @@ tokens:
dw 0x005F
db "jl"
dw 0x0061
.length3
.length3:
db "rax"
dw 0x0000
db "rbx"
@@ -781,7 +996,7 @@ tokens:
dw 0x005E
db "jle"
dw 0x0060
.length4
.length4:
db "r10d"
dw 0x001A
db "r11d"
@@ -822,20 +1037,35 @@ tokens:
dw 0x0050
db "call"
dw 0x0059
.length5
.end
.length5:
.end:
msg_welcome db "Welcome to Twasm", 0x0D, 0x0A, 0x00
msg_halt db "halted.", 0x0D, 0x0A, 0x00
msg_pass db "passed.", 0x0D, 0x0A, 0x00
msg_fail db "failed.", 0x0D, 0x0A, 0x00
; test result message: a line break, then "passed." padded on the left with
; spaces so that it ends at column TEST_LINE_LENGTH, then a line break
msg_pass:
	db 0x0D, 0x0A
	; .start - .end is minus the length of the text, so this emits
	; TEST_LINE_LENGTH - length spaces to right align it
	times (TEST_LINE_LENGTH + .start - .end) db " "
.start	db "passed."
.end	db 0x0D, 0x0A, 0x00
; test result message: a line break, then "failed." padded on the left with
; spaces so that it ends at column TEST_LINE_LENGTH, then a line break
msg_fail:
	db 0x0D, 0x0A
	; .start - .end is minus the length of the text, so this emits
	; TEST_LINE_LENGTH - length spaces to right align it
	times (TEST_LINE_LENGTH + .start - .end) db " "
.start	db "failed."
.end	db 0x0D, 0x0A, 0x00
test_byte db "Q" ; unterminated, just a byte chillin
test_token_null db "TestTokn", 0x00 ; followed by null terminator. Quad word
test_token_space db "TestTokn " ; followed by space. Quad word
test_elemb_0 ; [This Page Intentionally Left Blank]
test_elemb_0: ; [This Page Intentionally Left Blank]
test_elemb_5 db 0x54, 0x00, 0x21, 0x20, 0x34
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, 0x00, 0x00, 0x00, 0x00
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00
debug_string db "debug_string", 0x0D, 0x0A, 0x00
; sample program handed to tokenise at startup, followed by a null byte
program:
	db "xor eax, eax", 0x0D, 0x0A
	db "inc rax", 0x0D, 0x0A
	db "hlt", 0x0D, 0x0A
	db 0x00
; size of the program text in bytes, not counting the trailing null byte.
; stored as a quad word because it is loaded with `mov rsi, [program.size]`,
; which reads 8 bytes
.size	dq $ - program - 1