little optimisation, add single-token parsing function

This commit is contained in:
andromeda
2026-03-07 20:57:19 +01:00
parent 50964e945e
commit 19a3e4ff5b
2 changed files with 198 additions and 10 deletions

View File

@@ -16,11 +16,13 @@ I want to compile Bootler and Twasm with the Twasm assembler
+------ 0x00070000 ------+
| token table |
+------ 0x00060000 ------+
| test arena |
+------ 0x00050000 ------+
| stack (rsp) |
+------------------------+
| input |
+------------------------+ this is lined up to a sector
| | and this is less than a sector
+------------------------+ <- this is lined up to a sector
| | <- and this is less than a sector
+------------------------+
| assembler |
+------ 0x00010000 ------+

View File

@@ -3,7 +3,7 @@ LOAD_ADDR equ 0x00010000 ; address this program is loaded at
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
TEST_ARENA_ADDR equ 0x00060000 ; address to run tests at
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use
OUTPUT_ADDR equ 0x00070000 ; address of outputted binary
@@ -11,6 +11,8 @@ OUTPUT_SIZE equ 0x1000 ; max length of outputed binary
STACK_ADDR equ 0x00060000 ; address to put the 64-bit stack at
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
[bits 64]
[org LOAD_ADDR]
@@ -30,6 +32,102 @@ start:
; tokenising
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; identify_token
;
; description:
; returns the id of a given token. Only tokens of 2, 3 or 4 bytes are looked
; up; each length has its own list (tokens.length2/3/4) whose entries are the
; token bytes immediately followed by a 2-byte id. Any other length, or a token
; that is in none of the lists, yields UNRECOGNISED_TOKEN_ID.
;
; parameters:
; rdi -> first byte of token
; rsi = size of token in bytes
;
; returned:
; ax = id of token; the rest of rax is zeroed
;
; clobbers:
; rcx, r10, r11, flags (rdi and rsi are left untouched)
; ------------------------------------------------------------------------------
identify_token:
    cmp rsi, 2                      ; dispatch on the token length
    je .start_length2
    cmp rsi, 3
    je .start_length3
    cmp rsi, 4
    je .start_length4
    jmp .unrecognised               ; no list exists for any other length

.start_length2:
    movzx r11d, word [rdi]          ; r11 = token; loop-invariant, so load it once
    mov rcx, tokens.length2         ; rcx -> current entry in the length2 list
.loop_length2:
    cmp rcx, tokens.length3         ; the length2 list ends where length3 begins
    jae .unrecognised               ; addresses are unsigned, hence jae
    cmp r11w, [rcx]                 ; does this entry's token match?
    je .found_length2
    add rcx, 4                      ; next entry: 2 token bytes + 2 id bytes
    jmp .loop_length2
.found_length2:
    movzx eax, word [rcx + 2]       ; id follows the token; upper rax zeroed
    ret

.start_length3:
    ; build the 3-byte token from a byte and a word load so that nothing past
    ; the caller's token is ever read
    movzx r11d, byte [rdi + 2]      ; third byte of the token
    shl r11d, 16
    movzx r10d, word [rdi]          ; first two bytes of the token
    or r11d, r10d                   ; r11d = 0x00 : byte2 : byte1 : byte0
    mov rcx, tokens.length3         ; rcx -> current entry in the length3 list
.loop_length3:
    cmp rcx, tokens.length4         ; the length3 list ends where length4 begins
    jae .unrecognised
    mov r10d, [rcx]                 ; 3 token bytes + low id byte (stays inside
    and r10d, 0x00FFFFFF            ; the 5-byte entry); mask off the id byte
    cmp r10d, r11d                  ; does this entry's token match?
    je .found_length3
    add rcx, 5                      ; next entry: 3 token bytes + 2 id bytes
    jmp .loop_length3
.found_length3:
    movzx eax, word [rcx + 3]       ; id follows the token; upper rax zeroed
    ret

.start_length4:
    mov r11d, [rdi]                 ; r11d = token; loop-invariant
    mov rcx, tokens.length4         ; rcx -> current entry in the length4 list
.loop_length4:
    cmp rcx, tokens.length5         ; the length4 list ends where length5 begins
    jae .unrecognised
    cmp r11d, [rcx]                 ; does this entry's token match?
    je .found_length4
    add rcx, 6                      ; next entry: 4 token bytes + 2 id bytes
    jmp .loop_length4
.found_length4:
    movzx eax, word [rcx + 4]       ; id follows the token; upper rax zeroed
    ret

.unrecognised:
    mov eax, UNRECOGNISED_TOKEN_ID  ; 32-bit write zeroes the rest of rax
    ret
; ------------------------------------------------------------------------------
; copy_token
;
@@ -92,7 +190,7 @@ copy_token:
; ------------------------------------------------------------------------------
copy_byte:
xor rax, rax ; zero out so it returns fine
xor eax, eax ; zero out so it returns fine (a 32-bit write already clears all of rax)
mov al, [rdi] ; al = the byte rdi points at
mov [rsi], al ; store that byte where rsi points
ret ; rax holds the copied byte, zero-extended
@@ -113,7 +211,7 @@ copy_byte:
print:
push rdx
mov rdx, 0x3F8
mov edx, 0x3F8
.loop:
mov al, [rsi]
test al, al
@@ -169,11 +267,11 @@ elemb:
jmp .loop
.not_found
xor rax, rax ; return 0; dl not an element of list
xor eax, eax ; return 0; dl not an element of list
ret
.found
xor rax, rax
xor eax, eax
mov rax, 1 ; return 1; dl an element of list
ret
@@ -188,7 +286,7 @@ elemb:
; ------------------------------------------------------------------------------
clear_token_table:
xor rax, rax ; value to write
xor eax, eax ; value to write
mov rcx, TOKEN_TABLE_SIZE / 4 ; number of double words
mov rdi, TOKEN_TABLE_ADDR ; address to start
rep stosd
@@ -202,7 +300,7 @@ clear_token_table:
; ------------------------------------------------------------------------------
clear_test_arena:
    xor eax, eax                    ; value to write
    mov rcx, TEST_ARENA_SIZE / 4    ; number of double words in the test arena
    mov rdi, TEST_ARENA_ADDR        ; start of the test arena (not the token table)
    rep stosd                       ; zero TEST_ARENA_SIZE bytes from TEST_ARENA_ADDR
@@ -232,6 +330,9 @@ run_tests:
call clear_test_arena
call test_elemb
call clear_test_arena
call test_identify_token
ret
.msg db "running test suite...", 0x0D, 0x0A, 0x00
@@ -384,6 +485,91 @@ test_elemb:
ret
.msg db "test_elemb...", 0x00
; ------------------------------------------------------------------------------
; test_identify_token
;
; description:
; exercises identify_token with known and unknown tokens of length 2, 3 and 4.
; Each case writes the token at TEST_ARENA_ADDR, calls identify_token and checks
; the id returned in ax; the first mismatch prints msg_fail, otherwise msg_pass
; is printed.
; ------------------------------------------------------------------------------
test_identify_token:
    mov rsi, .msg
    call print

    ; length 2, known: "sp"
    mov edi, TEST_ARENA_ADDR
    mov word [rdi], "sp"
    mov esi, 2
    call identify_token
    cmp ax, 0x0026
    jne .fail

    ; length 2, unknown: "QQ"
    mov edi, TEST_ARENA_ADDR
    mov word [rdi], "QQ"
    mov esi, 2
    call identify_token
    cmp ax, 0xFFFF
    jne .fail

    ; length 3, known: "rax"
    mov edi, TEST_ARENA_ADDR
    mov dword [rdi], "rax"
    mov esi, 3
    call identify_token
    test ax, ax                     ; expected id is 0x0000
    jnz .fail

    ; length 3, known: "cr0"
    mov edi, TEST_ARENA_ADDR
    mov dword [rdi], "cr0"
    mov esi, 3
    call identify_token
    cmp ax, 0x004A
    jne .fail

    ; length 3, unknown: "r16"
    mov edi, TEST_ARENA_ADDR
    mov dword [rdi], "r16"
    mov esi, 3
    call identify_token
    cmp ax, 0xFFFF
    jne .fail

    ; length 4, known: "r10d"
    mov edi, TEST_ARENA_ADDR
    mov dword [rdi], "r10d"
    mov esi, 4
    call identify_token
    cmp ax, 0x001A
    jne .fail

    ; length 4, known: "r15b"
    mov edi, TEST_ARENA_ADDR
    mov dword [rdi], "r15b"
    mov esi, 4
    call identify_token
    cmp ax, 0x003F
    jne .fail

    ; length 4, unknown: "r15q"
    mov edi, TEST_ARENA_ADDR
    mov dword [rdi], "r15q"
    mov esi, 4
    call identify_token
    cmp ax, 0xFFFF
    jne .fail

.pass:
    mov rsi, msg_pass               ; every case matched
    jmp .report
.fail:
    mov rsi, msg_fail               ; some case returned the wrong id
.report:
    call print
    ret

.msg db "test_identify_token...", 0x00
; ------------------------------------------------------------------------------
; data
; ------------------------------------------------------------------------------
@@ -561,7 +747,7 @@ msg_fail db "failed.", 0x0D, 0x0A, 0x00
test_byte db "Q" ; unterminated, just a byte chillin
test_token_null db "TestTokn", 0x00 ; followed by null terminator. Quad word
test_token_space db "TestTokn " ; followed by space. Quad word
test_elemb_0
test_elemb_0 ; [This Page Intentionally Left Blank]
test_elemb_5 db 0x54, 0x00, 0x21, 0x20, 0x34
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, 0x00, 0x00, 0x00, 0x00