get tokenising working a bit :p also some fixes and semantics
This commit is contained in:
@@ -21,8 +21,6 @@ I want to compile Bootler and Twasm with the Twasm assembler
|
||||
| stack (rsp) |
|
||||
+------------------------+
|
||||
| input |
|
||||
+------------------------+ <- this is lined up to a sector
|
||||
| | <- and this is less than a sector
|
||||
+------------------------+
|
||||
| assembler |
|
||||
+------ 0x00010000 ------+
|
||||
@@ -37,11 +35,11 @@ each word represents a token on the token table.
|
||||
each token gets loaded into the token table with the following form:
|
||||
|
||||
```
|
||||
+----------+-----------------------+
|
||||
| 31 16 | 15 0 |
|
||||
+----------+-----------------------+
|
||||
| reserved | token id |
|
||||
+----------+-----------------------+
|
||||
+----------+----------+
|
||||
| 31 16 | 15 0 |
|
||||
+----------+----------+
|
||||
| reserved | token id |
|
||||
+----------+----------+
|
||||
```
|
||||
|
||||
### token IDs
|
||||
@@ -152,4 +150,5 @@ supported tokens are listed below
|
||||
| - | 0x0063 | |
|
||||
| * | 0x0064 | |
|
||||
| / | 0x0065 | |
|
||||
| | 0xFEXX | token terminator byte as token, where `XX` is the byte |
|
||||
| | 0xFFFF | unrecognised token |
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
|
||||
|
||||
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
|
||||
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
|
||||
|
||||
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
|
||||
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use
|
||||
|
||||
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
|
||||
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
|
||||
TOKEN_TABLE_ENTRY_SIZE equ 8 ; size of token table entry; a LOT of things
|
||||
; may break if this ever changes
|
||||
|
||||
OUTPUT_ADDR equ 0x00070000 ; address of outputed binary
|
||||
OUTPUT_SIZE equ 0x1000 ; max length of outputed binary
|
||||
|
||||
@@ -13,6 +15,8 @@ STACK_ADDR equ 0x00060000 ; address to put the 64-bit stack at
|
||||
|
||||
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
|
||||
|
||||
TEST_LINE_LENGTH equ 80 ; right border of test suite results
|
||||
|
||||
[bits 64]
|
||||
[org LOAD_ADDR]
|
||||
|
||||
@@ -26,12 +30,70 @@ start:
|
||||
|
||||
call clear_token_table
|
||||
|
||||
mov rdi, program ; -> program
|
||||
mov rsi, [program.size] ; = size of program
|
||||
call tokenise
|
||||
|
||||
jmp halt
|
||||
|
||||
; ------------------------------------------------------------------------------
|
||||
; tokenising
|
||||
; ------------------------------------------------------------------------------
|
||||
|
||||
; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
;   walks the program at the given address one token at a time and writes each
;   token's id into the next free entry of the token table
;   a byte that is itself a token terminator (reported by identify_next_token
;   as a token of length 0) is stored as 0xFEXX, where XX is that byte, and
;   consumes exactly one byte
;   stops at the end of the program, or early (silently) once the table holds
;   TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE entries
;   NOTE(review): this assumes TOKEN_TABLE_SIZE is a size in bytes - confirm
;   it's probably desirable to clear the token table before calling this function
;
; parameters:
;   rdi -> first byte of program
;   rsi = size of program in bytes
;
; modifies:
;   rax, rcx, rdx, rsi, rdi and flags; the functions called from here may
;   modify more
; ------------------------------------------------------------------------------

tokenise:
    add rsi, rdi            ; rsi -> one past the last byte of program
    xor ecx, ecx            ; rcx = number of tokens processed

  .loop:
    cmp rdi, rsi            ; addresses are unsigned, so compare them unsigned
    jae .break              ; no unread bytes left

    cmp rcx, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE
    jae .break              ; table full; never write past its end

    push rdi                ; the callee does not preserve these
    push rsi
    push rcx

    ; rdi -> current byte
    call identify_next_token
    ; ax = id of token
    ; dx = length of token

    pop rcx
    pop rsi
    pop rdi

    test rdx, rdx           ; length 0 means rdi sits on a terminator byte
    jnz .store

    movzx eax, byte [rdi]   ; al = byte of terminator, rest of rax zeroed
    or eax, 0xFE00          ; id = 0xFEXX
    mov edx, 1              ; a terminator is always 1 byte long

  .store:
    add rdi, rdx            ; current byte + length of token = next unread byte

    mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], rax ; fill next entry
                                                               ; in token table
    inc rcx                 ; +1 token processed
    jmp .loop

  .break:
    ret
|
||||
|
||||
; ------------------------------------------------------------------------------
|
||||
; identify_token
|
||||
;
|
||||
@@ -61,10 +123,10 @@ identify_token:
|
||||
|
||||
jmp .unrecognised ; else unrecognised
|
||||
|
||||
.start_length1
|
||||
.start_length1:
|
||||
mov rcx, tokens.length1 ; rcx -> list of known tokens
|
||||
|
||||
.loop_length1
|
||||
.loop_length1:
|
||||
cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
|
||||
@@ -76,12 +138,12 @@ identify_token:
|
||||
add rcx, 3 ; length of token + length of id
|
||||
jmp .loop_length1
|
||||
|
||||
.found_length1
|
||||
.found_length1:
|
||||
xor eax, eax ; make sure rest of rax is zeroed
|
||||
mov ax, [rcx + 1] ; return id of token
|
||||
ret
|
||||
|
||||
.start_length2
|
||||
.start_length2:
|
||||
mov rcx, tokens.length2 ; rcx -> list of known tokens
|
||||
|
||||
.loop_length2:
|
||||
@@ -151,6 +213,58 @@ identify_token:
|
||||
mov ax, UNRECOGNISED_TOKEN_ID
|
||||
ret
|
||||
|
||||
; ------------------------------------------------------------------------------
; identify_next_token
;
; description:
;   like identify_token, except the length is measured here: the token runs
;   from rdi up to (not including) the first byte listed in token_terminator_8
;   a token that starts on a terminator therefore has length 0
;
; parameters:
;   rdi -> first byte of token
;
; returned:
;   ax = id of token; the rest of rax is zeroed
;   dx = length of token in bytes; the rest of rdx is zeroed
;   (rsi also holds the length on return)
; ------------------------------------------------------------------------------

identify_next_token:
    push rdi                    ; remember where the token starts

    mov rsi, rdi                ; rsi -> byte currently being examined

  .scan:
    movzx edx, byte [rsi]       ; dl = candidate byte, rest of rdx zeroed

    push rsi                    ; elemb takes its arguments in rdi and rsi

    mov rdi, 8                  ; length of terminator list
    mov rsi, token_terminator_8 ; start of terminator list
    call elemb                  ; rax = 1 if dl is a terminator

    pop rsi

    cmp rax, 1
    je .measure                 ; terminator reached; the token ends here

    inc rsi                     ; otherwise the byte belongs to the token
    jmp .scan

  .measure:
    pop rdi                     ; rdi -> first byte of token again
    sub rsi, rdi                ; rsi = bytes scanned = length of token

    push rsi                    ; keep the length across the call
    call identify_token         ; rdi -> token, rsi = length; id comes back in ax
    pop rsi

    mov rdx, rsi                ; return the length
    ret
|
||||
|
||||
; ------------------------------------------------------------------------------
|
||||
; copy_token
|
||||
;
|
||||
@@ -168,7 +282,6 @@ identify_token:
|
||||
|
||||
copy_token:
|
||||
.loop:
|
||||
|
||||
mov dl, [rdi] ; move bit to compare to current byte in read buffer
|
||||
|
||||
push rdi ; push incrementors to call elemb
|
||||
@@ -190,7 +303,6 @@ copy_token:
|
||||
|
||||
inc rdi ; read pointer
|
||||
inc rsi ; write pointer
|
||||
|
||||
jmp .loop
|
||||
|
||||
.break:
|
||||
@@ -227,6 +339,7 @@ copy_byte:
|
||||
;
|
||||
; description:
|
||||
; prints a null-terminated string
|
||||
; probably doesn't change any registers for ease of debugging
|
||||
;
|
||||
; parameters:
|
||||
; rsi -> start of null-terminated string
|
||||
@@ -234,6 +347,9 @@ copy_byte:
|
||||
|
||||
print:
|
||||
push rdx
|
||||
push rax
|
||||
push rsi
|
||||
|
||||
mov edx, 0x3F8
|
||||
.loop:
|
||||
mov al, [rsi]
|
||||
@@ -243,6 +359,8 @@ print:
|
||||
inc rsi
|
||||
jmp .loop
|
||||
.done:
|
||||
pop rsi
|
||||
pop rax
|
||||
pop rdx
|
||||
ret
|
||||
|
||||
@@ -276,7 +394,7 @@ halt:
|
||||
; ------------------------------------------------------------------------------
|
||||
|
||||
elemb:
|
||||
.loop
|
||||
.loop:
|
||||
cmp rdi, 0 ; check if remaining length 0
|
||||
je .not_found ; if so, break; dl not an element of list
|
||||
|
||||
@@ -289,11 +407,11 @@ elemb:
|
||||
|
||||
jmp .loop
|
||||
|
||||
.not_found
|
||||
.not_found:
|
||||
xor eax, eax ; return 0; dl not an element of list
|
||||
ret
|
||||
|
||||
.found
|
||||
.found:
|
||||
xor eax, eax
|
||||
mov rax, 1 ; return 1; dl an element of list
|
||||
ret
|
||||
@@ -356,6 +474,9 @@ run_tests:
|
||||
call clear_test_arena
|
||||
call test_identify_token
|
||||
|
||||
call clear_test_arena
|
||||
call test_identify_next_token
|
||||
|
||||
ret
|
||||
.msg db "running test suite...", 0x0D, 0x0A, 0x00
|
||||
|
||||
@@ -609,12 +730,106 @@ test_identify_token:
|
||||
ret
|
||||
.msg db "test_identify_token...", 0x00
|
||||
|
||||
; ------------------------------------------------------------------------------
; test_identify_next_token
;
; description:
;   tests the functionality described for identify_next_token: for every case
;   both the returned id (ax) and the returned length (dx) are checked
;   each case writes a terminated token into the test arena first
;   prints msg_pass if every case succeeds, msg_fail at the first failure
; ------------------------------------------------------------------------------

test_identify_next_token:
    mov rsi, .msg
    call print

    ; length1 token that exists
    mov word [TEST_ARENA_ADDR], "* "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, 0x0064
    jne .fail
    cmp dx, 1
    jne .fail

    ; length0 token: the first byte is already a terminator
    mov word [TEST_ARENA_ADDR], " "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, UNRECOGNISED_TOKEN_ID
    jne .fail
    cmp dx, 0
    jne .fail

    ; length1 token that doesn't exist
    mov word [TEST_ARENA_ADDR], "Q "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, UNRECOGNISED_TOKEN_ID
    jne .fail
    cmp dx, 1
    jne .fail

    ; length2 token that exists
    mov dword [TEST_ARENA_ADDR], "sp "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, 0x0026
    jne .fail
    cmp dx, 2
    jne .fail

    ; length2 token that doesn't exist
    mov dword [TEST_ARENA_ADDR], "QQ "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, UNRECOGNISED_TOKEN_ID
    jne .fail
    cmp dx, 2
    jne .fail

    ; length3 token that exists (first id in the table)
    mov dword [TEST_ARENA_ADDR], "rax "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, 0x0000
    jne .fail
    cmp dx, 3
    jne .fail

    ; length3 token that exists (contains a digit)
    mov dword [TEST_ARENA_ADDR], "cr0 "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, 0x004A
    jne .fail
    cmp dx, 3
    jne .fail

    ; length3 token that doesn't exist
    mov dword [TEST_ARENA_ADDR], "r16 "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, UNRECOGNISED_TOKEN_ID
    jne .fail
    cmp dx, 3
    jne .fail

    ; length4 token that exists (dword register)
    mov dword [TEST_ARENA_ADDR], "r10d"
    mov byte [TEST_ARENA_ADDR + 4], " "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, 0x001A
    jne .fail
    cmp dx, 4
    jne .fail

    ; length4 token that exists (byte register)
    mov dword [TEST_ARENA_ADDR], "r15b"
    mov byte [TEST_ARENA_ADDR + 4], " "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, 0x003F
    jne .fail
    cmp dx, 4
    jne .fail

    ; length4 token that doesn't exist
    mov dword [TEST_ARENA_ADDR], "r15q"
    mov byte [TEST_ARENA_ADDR + 4], " "
    mov rdi, TEST_ARENA_ADDR
    call identify_next_token
    cmp ax, UNRECOGNISED_TOKEN_ID
    jne .fail
    cmp dx, 4
    jne .fail

  .pass:
    mov rsi, msg_pass
    call print
    ret
  .fail:
    mov rsi, msg_fail
    call print
    ret
  .msg db "test_identify_next_token...", 0x00
|
||||
|
||||
; ------------------------------------------------------------------------------
|
||||
; data
|
||||
; ------------------------------------------------------------------------------
|
||||
|
||||
tokens:
|
||||
.length1
|
||||
.length1:
|
||||
db "["
|
||||
dw 0x0051
|
||||
db "]"
|
||||
@@ -627,7 +842,7 @@ tokens:
|
||||
dw 0x0064
|
||||
db "/"
|
||||
dw 0x0065
|
||||
.length2
|
||||
.length2:
|
||||
db "r8"
|
||||
dw 0x0008
|
||||
db "r9"
|
||||
@@ -682,7 +897,7 @@ tokens:
|
||||
dw 0x005F
|
||||
db "jl"
|
||||
dw 0x0061
|
||||
.length3
|
||||
.length3:
|
||||
db "rax"
|
||||
dw 0x0000
|
||||
db "rbx"
|
||||
@@ -781,7 +996,7 @@ tokens:
|
||||
dw 0x005E
|
||||
db "jle"
|
||||
dw 0x0060
|
||||
.length4
|
||||
.length4:
|
||||
db "r10d"
|
||||
dw 0x001A
|
||||
db "r11d"
|
||||
@@ -822,20 +1037,35 @@ tokens:
|
||||
dw 0x0050
|
||||
db "call"
|
||||
dw 0x0059
|
||||
.length5
|
||||
.end
|
||||
.length5:
|
||||
.end:
|
||||
|
||||
msg_welcome db "Welcome to Twasm", 0x0D, 0x0A, 0x00
|
||||
msg_halt db "halted.", 0x0D, 0x0A, 0x00
|
||||
msg_pass db "passed.", 0x0D, 0x0A, 0x00
|
||||
msg_fail db "failed.", 0x0D, 0x0A, 0x00
|
||||
msg_pass:
|
||||
db 0x0D, 0x0A
|
||||
times (TEST_LINE_LENGTH + .start - .end) db " ", ; right align
|
||||
.start db "passed."
|
||||
.end db 0x0D, 0x0A, 0x00
|
||||
msg_fail:
|
||||
db 0x0D, 0x0A
|
||||
times (TEST_LINE_LENGTH + .start - .end) db " ",
|
||||
.start db "failed."
|
||||
.end db 0x0D, 0x0A, 0x00
|
||||
|
||||
test_byte db "Q" ; unterminated, just a byte chillin
|
||||
test_token_null db "TestTokn", 0x00 ; followed by null terminator. Quad word
|
||||
test_token_space db "TestTokn " ; followed by space. Quad word
|
||||
test_elemb_0 ; [This Page Intentionally Left Blank]
|
||||
test_elemb_0: ; [This Page Intentionally Left Blank]
|
||||
test_elemb_5 db 0x54, 0x00, 0x21, 0x20, 0x34
|
||||
|
||||
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, 0x00, 0x00, 0x00, 0x00
|
||||
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00
|
||||
|
||||
debug_string db "debug_string", 0x0D, 0x0A, 0x00
|
||||
|
||||
; sample program that start hands to tokenise
program:
    db "xor eax, eax", 0x0D, 0x0A
    db "inc rax", 0x0D, 0x0A
    db "hlt", 0x0D, 0x0A
    db 0x00
  .size dq $ - program - 1 ; size in bytes, not counting the trailing null
                           ; a quad word, because start loads it with a 64-bit
                           ; mov; as a single byte the load would pull the 7
                           ; bytes that follow into the size
|
||||
|
||||
Reference in New Issue
Block a user