From 19a3e4ff5b9e303b3d2bfaa386d795a8d05fc7f6 Mon Sep 17 00:00:00 2001
From: andromeda
Date: Sat, 7 Mar 2026 20:57:19 +0100
Subject: [PATCH] little optimisation, add single-token parsing function

---
 twasm/README.md    |   6 +-
 twasm/asm/main.asm | 206 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 200 insertions(+), 12 deletions(-)

diff --git a/twasm/README.md b/twasm/README.md
index a8ad332..4607b01 100644
--- a/twasm/README.md
+++ b/twasm/README.md
@@ -16,11 +16,13 @@ I want to compile Bootler and Twasm with the Twasm assembler
 +------ 0x00070000 ------+
 |       token table      |
 +------ 0x00060000 ------+
+|       test arena       |
++------ 0x00050000 ------+
 |      stack (rsp)       |
 +------------------------+
 |         input          |
-+------------------------+  this is lined up to a sector
-|                        |  and this is less than a sector
++------------------------+  <- this is lined up to a sector
+|                        |  <- and this is less than a sector
 +------------------------+
 |       assembler        |
 +------ 0x00010000 ------+
diff --git a/twasm/asm/main.asm b/twasm/asm/main.asm
index e26afb0..e498f91 100644
--- a/twasm/asm/main.asm
+++ b/twasm/asm/main.asm
@@ -3,7 +3,7 @@ LOAD_ADDR equ 0x00010000		; address this program is loaded at
 TOKEN_TABLE_ADDR equ 0x00060000		; address the token table is loaded at
 TOKEN_TABLE_SIZE equ 0x1000		; max length of table
 
-TEST_ARENA_ADDR equ 0x00060000		; address to run tests at
+TEST_ARENA_ADDR equ 0x00050000		; address to run tests at
 TEST_ARENA_SIZE equ 0x1000		; maximum size tests can use
 
 OUTPUT_ADDR equ 0x00070000		; address of outputed binary
@@ -11,6 +11,8 @@ OUTPUT_SIZE equ 0x1000			; max length of outputed binary
 
 STACK_ADDR equ 0x00060000		; address to put the 64-bit stack at
 
+UNRECOGNISED_TOKEN_ID equ 0xFFFF	; id of an unrecognised token
+
 [bits 64]
 [org LOAD_ADDR]
 
@@ -30,6 +32,102 @@ start:
 ; tokenising
 ; ------------------------------------------------------------------------------
 
+; ------------------------------------------------------------------------------
+; identify_token
+;
+; description:
+;	returns the id of a given token by searching the list of known tokens
+;	of the same length; only tokens of 2, 3 or 4 bytes can be recognised
+;
+; parameters:
+;	rdi -> first byte of token
+;	rsi = size of token in bytes
+;
+; returned:
+;	ax = id of token, or UNRECOGNISED_TOKEN_ID if the token is not known;
+;	     the rest of rax is zeroed
+;
+; clobbers:
+;	rcx, r10, r11
+; ------------------------------------------------------------------------------
+
+identify_token:
+	cmp rsi, 2				; if the token has length 2
+	je .start_length2			; then enter the length 2 loop
+
+	cmp rsi, 3				; if the token has length 3
+	je .start_length3			; then enter the length 3 loop
+
+	cmp rsi, 4				; if the token has length 4
+	je .start_length4			; then enter the length 4 loop
+
+	jmp .unrecognised			; else unrecognised
+
+	.start_length2:
+	movzx r11d, word [rdi]			; r11w = token, loaded once for every entry
+	mov rcx, tokens.length2			; rcx -> list of known tokens
+
+	.loop_length2:
+	cmp rcx, tokens.length3			; check if rcx still in the bounds of length2 tokens
+	jae .unrecognised			; if not, unrecognised (addresses are unsigned)
+
+	cmp r11w, [rcx]				; if current entry matches token,
+	je .found_length2			; exit loop
+
+	add rcx, 4				; length of token + length of id
+	jmp .loop_length2
+
+	.found_length2:
+	movzx eax, word [rcx + 2]		; return id of token; rest of rax is zeroed
+	ret
+
+	.start_length3:
+	; build the token from a word and a byte so that nothing past its
+	; third byte is read
+	movzx r11d, word [rdi]			; first two bytes of token
+	movzx r10d, byte [rdi + 2]		; third byte of token
+	shl r10d, 16
+	or r11d, r10d				; r11d = token, top byte zero
+	mov rcx, tokens.length3			; rcx -> list of known tokens
+
+	.loop_length3:
+	cmp rcx, tokens.length4			; check if rcx still in bounds of length3 tokens
+	jae .unrecognised			; if not, unrecognised
+
+	mov r10d, [rcx]				; known token + low byte of its id
+	and r10d, 0x00FFFFFF			; mask for just the token
+	cmp r10d, r11d				; if known token matches token,
+	je .found_length3			; exit loop
+
+	add rcx, 5				; length of token + length of id
+	jmp .loop_length3
+
+	.found_length3:
+	movzx eax, word [rcx + 3]		; return id of token; rest of rax is zeroed
+	ret
+
+	.start_length4:
+	mov r11d, [rdi]				; r11d = token, loaded once for every entry
+	mov rcx, tokens.length4			; rcx -> list of known tokens
+
+	.loop_length4:
+	cmp rcx, tokens.length5			; check if rcx still in bounds of length4 tokens
+	jae .unrecognised			; if not, unrecognised
+
+	cmp r11d, [rcx]				; if known token matches token,
+	je .found_length4			; exit loop
+
+	add rcx, 6				; length of token + length of id
+	jmp .loop_length4
+
+	.found_length4:
+	movzx eax, word [rcx + 4]		; return id of token; rest of rax is zeroed
+	ret
+
+	.unrecognised:
+	mov eax, UNRECOGNISED_TOKEN_ID		; 32-bit write zeroes the rest of rax
+	ret
+
 ; ------------------------------------------------------------------------------
 ; copy_token
 ;
@@ -92,7 +190,7 @@ copy_token:
 ; ------------------------------------------------------------------------------
 
 copy_byte:
-	xor rax, rax				; zero out so it returns fine
+	xor eax, eax				; zero out so it returns fine
 	mov al, [rdi]
 	mov [rsi], al
 	ret
@@ -113,7 +211,7 @@ copy_byte:
 
 print:
 	push rdx
-	mov rdx, 0x3F8
+	mov edx, 0x3F8
 	.loop:
 	mov al, [rsi]
 	test al, al
@@ -169,11 +267,11 @@ elemb:
 	jmp .loop
 
 	.not_found
-	xor rax, rax				; return 0; dl not an element of list
+	xor eax, eax				; return 0; dl not an element of list
 	ret
 
 	.found
-	xor rax, rax
+	xor eax, eax
 	mov rax, 1				; return 1; dl an element of list
 	ret
 
@@ -188,7 +286,7 @@ elemb:
 ; ------------------------------------------------------------------------------
 
 clear_token_table:
-	xor rax, rax				; value to write
+	xor eax, eax				; value to write
 	mov rcx, TOKEN_TABLE_SIZE / 4		; number of double words
 	mov rdi, TOKEN_TABLE_ADDR		; address to start
 	rep stosd
@@ -202,7 +300,7 @@ clear_token_table:
 ; ------------------------------------------------------------------------------
 
 clear_test_arena:
-	xor rax, rax				; value to write
-	mov rcx, TOKEN_TABLE_SIZE / 4		; number of double words
-	mov rdi, TOKEN_TABLE_ADDR		; address to start
+	xor eax, eax				; value to write
+	mov rcx, TEST_ARENA_SIZE / 4		; number of double words
+	mov rdi, TEST_ARENA_ADDR		; address to start
 	rep stosd
@@ -232,6 +330,9 @@ run_tests:
 	call clear_test_arena
 	call test_elemb
 
+	call clear_test_arena
+	call test_identify_token
+
 	ret
 
 	.msg db "running test suite...", 0x0D, 0x0A, 0x00
@@ -384,6 +485,91 @@ test_elemb:
 	ret
 	.msg db "test_elemb...", 0x00
 
+; ------------------------------------------------------------------------------
+; test_identify_token
+;
+; description:
+;	tests identify_token described functionality, including zeroing of rax
+; ------------------------------------------------------------------------------
+
+test_identify_token:
+	mov rsi, .msg
+	call print
+
+	; length2 token that exists
+	mov word [TEST_ARENA_ADDR], "sp"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 2
+	call identify_token
+	cmp rax, 0x0026
+	jne .fail
+
+	; length2 token that doesn't exist
+	mov word [TEST_ARENA_ADDR], "QQ"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 2
+	call identify_token
+	cmp rax, UNRECOGNISED_TOKEN_ID
+	jne .fail
+
+	; length3 token that exists
+	mov dword [TEST_ARENA_ADDR], "rax"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 3
+	call identify_token
+	cmp rax, 0x0000
+	jne .fail
+
+	; length3 token that exists
+	mov dword [TEST_ARENA_ADDR], "cr0"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 3
+	call identify_token
+	cmp rax, 0x004A
+	jne .fail
+
+	; length3 token that doesn't exist
+	mov dword [TEST_ARENA_ADDR], "r16"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 3
+	call identify_token
+	cmp rax, UNRECOGNISED_TOKEN_ID
+	jne .fail
+
+	; length4 token that exists
+	mov dword [TEST_ARENA_ADDR], "r10d"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 4
+	call identify_token
+	cmp rax, 0x001A
+	jne .fail
+
+	; length4 token that exists
+	mov dword [TEST_ARENA_ADDR], "r15b"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 4
+	call identify_token
+	cmp rax, 0x003F
+	jne .fail
+
+	; length4 token that doesn't exist
+	mov dword [TEST_ARENA_ADDR], "r15q"
+	mov rdi, TEST_ARENA_ADDR
+	mov rsi, 4
+	call identify_token
+	cmp rax, UNRECOGNISED_TOKEN_ID
+	jne .fail
+
+	.pass:
+	mov rsi, msg_pass
+	call print
+	ret
+	.fail:
+	mov rsi, msg_fail
+	call print
+	ret
+	.msg db "test_identify_token...", 0x00
+
 ; ------------------------------------------------------------------------------
 ; data
 ; ------------------------------------------------------------------------------
@@ -561,7 +747,7 @@ msg_fail db "failed.", 0x0D, 0x0A, 0x00
 test_byte db "Q"			; unterminated, just a byte chillin
 test_token_null db "TestTokn", 0x00	; followed by null terminator. Quad word
 test_token_space db "TestTokn "		; followed by space. Quad word
-test_elemb_0
+test_elemb_0				; [This Page Intentionally Left Blank]
 test_elemb_5 db 0x54, 0x00, 0x21, 0x20, 0x34
 
 token_terminator_8 db 0x00, " ", 0x0A, 0x0D, 0x00, 0x00, 0x00, 0x00