get tokenising working a bit :p also some fixes and semantics
This commit is contained in:
@@ -21,8 +21,6 @@ I want to compile Bootler and Twasm with the Twasm assembler
|
|||||||
| stack (rsp) |
|
| stack (rsp) |
|
||||||
+------------------------+
|
+------------------------+
|
||||||
| input |
|
| input |
|
||||||
+------------------------+ <- this is lined up to a sector
|
|
||||||
| | <- and this is less than a sector
|
|
||||||
+------------------------+
|
+------------------------+
|
||||||
| assembler |
|
| assembler |
|
||||||
+------ 0x00010000 ------+
|
+------ 0x00010000 ------+
|
||||||
@@ -37,11 +35,11 @@ each word represents a token on the token table.
|
|||||||
each token gets loaded into the token table with the following form:
|
each token gets loaded into the token table with the following form:
|
||||||
|
|
||||||
```
|
```
|
||||||
+----------+-----------------------+
|
+----------+----------+
|
||||||
| 31 16 | 15 0 |
|
| 31 16 | 15 0 |
|
||||||
+----------+-----------------------+
|
+----------+----------+
|
||||||
| reserved | token id |
|
| reserved | token id |
|
||||||
+----------+-----------------------+
|
+----------+----------+
|
||||||
```
|
```
|
||||||
|
|
||||||
### token IDs
|
### token IDs
|
||||||
@@ -152,4 +150,5 @@ supported tokens are listed below
|
|||||||
| - | 0x0063 | |
|
| - | 0x0063 | |
|
||||||
| * | 0x0064 | |
|
| * | 0x0064 | |
|
||||||
| / | 0x0065 | |
|
| / | 0x0065 | |
|
||||||
|
| | 0xFEXX | token terminator byte as token, where `XX` is the byte |
|
||||||
| | 0xFFFF | unrecognised token |
|
| | 0xFFFF | unrecognised token |
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
|
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
|
||||||
|
|
||||||
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
|
|
||||||
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
|
|
||||||
|
|
||||||
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
|
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
|
||||||
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use
|
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use
|
||||||
|
|
||||||
|
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
|
||||||
|
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
|
||||||
|
TOKEN_TABLE_ENTRY_SIZE equ 8 ; size of token table entry; a LOT of things
|
||||||
|
; may break if this ever changes
|
||||||
|
|
||||||
OUTPUT_ADDR equ 0x00070000 ; address of outputed binary
|
OUTPUT_ADDR equ 0x00070000 ; address of outputed binary
|
||||||
OUTPUT_SIZE equ 0x1000 ; max length of outputed binary
|
OUTPUT_SIZE equ 0x1000 ; max length of outputed binary
|
||||||
|
|
||||||
@@ -13,6 +15,8 @@ STACK_ADDR equ 0x00060000 ; address to put the 64-bit stack at
|
|||||||
|
|
||||||
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
|
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
|
||||||
|
|
||||||
|
TEST_LINE_LENGTH equ 80 ; right border of test suite results
|
||||||
|
|
||||||
[bits 64]
|
[bits 64]
|
||||||
[org LOAD_ADDR]
|
[org LOAD_ADDR]
|
||||||
|
|
||||||
@@ -26,12 +30,70 @@ start:
|
|||||||
|
|
||||||
call clear_token_table
|
call clear_token_table
|
||||||
|
|
||||||
|
mov rdi, program ; -> program
|
||||||
|
mov rsi, [program.size] ; = size of program
|
||||||
|
call tokenise
|
||||||
|
|
||||||
jmp halt
|
jmp halt
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; tokenising
|
; tokenising
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
; walks the program at the given address one token at a time and writes one
; entry per token into the token table, starting at TOKEN_TABLE_ADDR
; it's probably desirable to clear the token table before calling this function
;
; each entry is TOKEN_TABLE_ENTRY_SIZE bytes wide and holds the value of rax:
; the token id sits in the low 16 bits. a byte on which identify_next_token
; reports a length of 0 (a token terminator) is stored as 0xFEXX, where XX is
; the byte itself, and is consumed as a single byte
;
; the end bound is inclusive: the byte at rdi + rsi is tokenised as well
;
; NOTE(review): rcx is never checked against the capacity of the token table
;
; parameters:
; rdi -> first byte of program
; rsi = size of program in bytes
;
; clobbers:
; rax, rcx (= number of entries written), rdx, rdi, rsi
; ------------------------------------------------------------------------------

tokenise:
	add rsi, rdi			; rsi -> rdi + size, the last byte visited
	xor ecx, ecx			; rcx = number of tokens processed

.loop:
	cmp rdi, rsi			; both are addresses, so compare unsigned
	ja .break			; cursor is past the last byte: done

	push rdi			; identify_next_token overwrites rdi and
	push rsi			; rsi; rcx is saved along with them
	push rcx

	; rdi -> current byte
	call identify_next_token
	; ax = id of token
	; dx = length of token

	pop rcx
	pop rsi
	pop rdi

	test rdx, rdx			; length 0 means rdi is on a terminator
	jnz .store

	; terminator byte: build 0xFEXX from scratch so the upper bits of rax
	; never depend on what the callee left behind
	movzx eax, byte [rdi]		; XX = the terminator byte
	or eax, 0xFE00			; terminator marker in the high byte
	mov edx, 1			; a terminator is 1 byte long

.store:
	add rdi, rdx			; current byte + length = next unread byte

	; fill the next entry in the token table
	mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], rax
	inc rcx				; +1 token processed
	jmp .loop

.break:
	ret
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; identify_token
|
; identify_token
|
||||||
;
|
;
|
||||||
@@ -61,10 +123,10 @@ identify_token:
|
|||||||
|
|
||||||
jmp .unrecognised ; else unrecognised
|
jmp .unrecognised ; else unrecognised
|
||||||
|
|
||||||
.start_length1
|
.start_length1:
|
||||||
mov rcx, tokens.length1 ; rcx -> list of known tokens
|
mov rcx, tokens.length1 ; rcx -> list of known tokens
|
||||||
|
|
||||||
.loop_length1
|
.loop_length1:
|
||||||
cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
|
cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
|
||||||
jge .unrecognised ; if not, unrecognised
|
jge .unrecognised ; if not, unrecognised
|
||||||
|
|
||||||
@@ -76,12 +138,12 @@ identify_token:
|
|||||||
add rcx, 3 ; length of token + length of id
|
add rcx, 3 ; length of token + length of id
|
||||||
jmp .loop_length1
|
jmp .loop_length1
|
||||||
|
|
||||||
.found_length1
|
.found_length1:
|
||||||
xor eax, eax ; make sure rest of rax is zeroed
|
xor eax, eax ; make sure rest of rax is zeroed
|
||||||
mov ax, [rcx + 1] ; return id of token
|
mov ax, [rcx + 1] ; return id of token
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.start_length2
|
.start_length2:
|
||||||
mov rcx, tokens.length2 ; rcx -> list of known tokens
|
mov rcx, tokens.length2 ; rcx -> list of known tokens
|
||||||
|
|
||||||
.loop_length2:
|
.loop_length2:
|
||||||
@@ -151,6 +213,58 @@ identify_token:
|
|||||||
mov ax, UNRECOGNISED_TOKEN_ID
|
mov ax, UNRECOGNISED_TOKEN_ID
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
; ------------------------------------------------------------------------------
; identify_next_token
;
; description:
; like identify_token, except the length does not have to be supplied: bytes
; are counted from rdi up to (not including) the first byte that elemb finds
; in token_terminator_8, and that count is handed to identify_token as rsi
;
; parameters:
; rdi -> first byte of token
;
; returned:
; ax = id of token; the rest of rax is zeroed
; dx = length of token in bytes; the rest of rdx is zeroed
; ------------------------------------------------------------------------------

identify_next_token:
	push rdi			; -> first byte, wanted again at the end

	mov rsi, rdi			; rsi -> byte under inspection
	xor edi, edi			; rdi = bytes counted so far
	jmp .probe

.advance:
	inc rdi				; one more byte belongs to the token
	inc rsi				; step to the next byte

.probe:
	movzx edx, byte [rsi]		; dl = byte under inspection

	push rsi			; keep cursor, count and byte across elemb
	push rdi
	push rdx

	mov rdi, 8			; length of terminator list
	mov rsi, token_terminator_8	; start of terminator list
	call elemb			; rax = 1 if dl is in the list

	pop rdx
	pop rdi
	pop rsi

	cmp rax, 1			; keep counting until a terminator turns up
	jne .advance

	mov rsi, rdi			; rsi = length of token
	pop rdi				; rdi -> first byte of token

	push rsi			; keep the length across identify_token
	call identify_token		; ax = id of token
	pop rdx				; rdx = length of token
	mov rsi, rdx			; rsi leaves holding the length as well
	ret
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; copy_token
|
; copy_token
|
||||||
;
|
;
|
||||||
@@ -168,7 +282,6 @@ identify_token:
|
|||||||
|
|
||||||
copy_token:
|
copy_token:
|
||||||
.loop:
|
.loop:
|
||||||
|
|
||||||
mov dl, [rdi] ; move bit to compare to current byte in read buffer
|
mov dl, [rdi] ; move bit to compare to current byte in read buffer
|
||||||
|
|
||||||
push rdi ; push incrementors to call elemb
|
push rdi ; push incrementors to call elemb
|
||||||
@@ -190,7 +303,6 @@ copy_token:
|
|||||||
|
|
||||||
inc rdi ; read pointer
|
inc rdi ; read pointer
|
||||||
inc rsi ; write pointer
|
inc rsi ; write pointer
|
||||||
|
|
||||||
jmp .loop
|
jmp .loop
|
||||||
|
|
||||||
.break:
|
.break:
|
||||||
@@ -227,6 +339,7 @@ copy_byte:
|
|||||||
;
|
;
|
||||||
; description:
|
; description:
|
||||||
; prints a null-terminated string
|
; prints a null-terminated string
|
||||||
|
; probably doesn't change any registers for ease of debugging
|
||||||
;
|
;
|
||||||
; parameters:
|
; parameters:
|
||||||
; rsi -> start of null-terminated string
|
; rsi -> start of null-terminated string
|
||||||
@@ -234,6 +347,9 @@ copy_byte:
|
|||||||
|
|
||||||
print:
|
print:
|
||||||
push rdx
|
push rdx
|
||||||
|
push rax
|
||||||
|
push rsi
|
||||||
|
|
||||||
mov edx, 0x3F8
|
mov edx, 0x3F8
|
||||||
.loop:
|
.loop:
|
||||||
mov al, [rsi]
|
mov al, [rsi]
|
||||||
@@ -243,6 +359,8 @@ print:
|
|||||||
inc rsi
|
inc rsi
|
||||||
jmp .loop
|
jmp .loop
|
||||||
.done:
|
.done:
|
||||||
|
pop rsi
|
||||||
|
pop rax
|
||||||
pop rdx
|
pop rdx
|
||||||
ret
|
ret
|
||||||
|
|
||||||
@@ -276,7 +394,7 @@ halt:
|
|||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
|
|
||||||
elemb:
|
elemb:
|
||||||
.loop
|
.loop:
|
||||||
cmp rdi, 0 ; check if remaining length 0
|
cmp rdi, 0 ; check if remaining length 0
|
||||||
je .not_found ; if so, break; dl not an element of list
|
je .not_found ; if so, break; dl not an element of list
|
||||||
|
|
||||||
@@ -289,11 +407,11 @@ elemb:
|
|||||||
|
|
||||||
jmp .loop
|
jmp .loop
|
||||||
|
|
||||||
.not_found
|
.not_found:
|
||||||
xor eax, eax ; return 0; dl not an element of list
|
xor eax, eax ; return 0; dl not an element of list
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.found
|
.found:
|
||||||
xor eax, eax
|
xor eax, eax
|
||||||
mov rax, 1 ; return 1; dl an element of list
|
mov rax, 1 ; return 1; dl an element of list
|
||||||
ret
|
ret
|
||||||
@@ -356,6 +474,9 @@ run_tests:
|
|||||||
call clear_test_arena
|
call clear_test_arena
|
||||||
call test_identify_token
|
call test_identify_token
|
||||||
|
|
||||||
|
call clear_test_arena
|
||||||
|
call test_identify_next_token
|
||||||
|
|
||||||
ret
|
ret
|
||||||
.msg db "running test suite...", 0x0D, 0x0A, 0x00
|
.msg db "running test suite...", 0x0D, 0x0A, 0x00
|
||||||
|
|
||||||
@@ -609,12 +730,106 @@ test_identify_token:
|
|||||||
ret
|
ret
|
||||||
.msg db "test_identify_token...", 0x00
|
.msg db "test_identify_token...", 0x00
|
||||||
|
|
||||||
|
; ------------------------------------------------------------------------------
; test_identify_next_token
;
; description:
; tests identify_next_token described functionality
;
; every case writes a token followed by a space (a token terminator) to the
; start of the test arena, calls identify_next_token on it, and checks both
; documented return values: the id in ax and the length in rdx
; prints msg_pass if every case holds, msg_fail on the first one that doesn't
; ------------------------------------------------------------------------------

test_identify_next_token:
	mov rsi, .msg
	call print

	; length1 token that exists
	mov word [TEST_ARENA_ADDR], "* "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x0064
	jne .fail
	cmp rdx, 1
	jne .fail

	; length0 token: the first byte is already a terminator (space), so no
	; bytes are counted and the id comes back as unrecognised
	mov word [TEST_ARENA_ADDR], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0xFFFF
	jne .fail
	cmp rdx, 0
	jne .fail

	; length2 token that exists
	mov dword [TEST_ARENA_ADDR], "sp "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x0026
	jne .fail
	cmp rdx, 2
	jne .fail

	; length2 token that doesn't exist
	mov dword [TEST_ARENA_ADDR], "QQ "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0xFFFF
	jne .fail
	cmp rdx, 2
	jne .fail

	; length3 token that exists
	mov dword [TEST_ARENA_ADDR], "rax "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x0000
	jne .fail
	cmp rdx, 3
	jne .fail

	; another length3 token that exists
	mov dword [TEST_ARENA_ADDR], "cr0 "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x004A
	jne .fail
	cmp rdx, 3
	jne .fail

	; length3 token that doesn't exist
	mov dword [TEST_ARENA_ADDR], "r16 "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0xFFFF
	jne .fail
	cmp rdx, 3
	jne .fail

	; length4 token that exists; the terminator goes in the fifth byte
	mov dword [TEST_ARENA_ADDR], "r10d"
	mov byte [TEST_ARENA_ADDR + 4], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x001A
	jne .fail
	cmp rdx, 4
	jne .fail

	; another length4 token that exists
	mov dword [TEST_ARENA_ADDR], "r15b"
	mov byte [TEST_ARENA_ADDR + 4], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0x003F
	jne .fail
	cmp rdx, 4
	jne .fail

	; length4 token that doesn't exist
	mov dword [TEST_ARENA_ADDR], "r15q"
	mov byte [TEST_ARENA_ADDR + 4], " "
	mov rdi, TEST_ARENA_ADDR
	call identify_next_token
	cmp ax, 0xFFFF
	jne .fail
	cmp rdx, 4
	jne .fail

.pass:
	mov rsi, msg_pass
	call print
	ret

.fail:
	mov rsi, msg_fail
	call print
	ret

.msg db "test_identify_next_token...", 0x00
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; data
|
; data
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
|
|
||||||
tokens:
|
tokens:
|
||||||
.length1
|
.length1:
|
||||||
db "["
|
db "["
|
||||||
dw 0x0051
|
dw 0x0051
|
||||||
db "]"
|
db "]"
|
||||||
@@ -627,7 +842,7 @@ tokens:
|
|||||||
dw 0x0064
|
dw 0x0064
|
||||||
db "/"
|
db "/"
|
||||||
dw 0x0065
|
dw 0x0065
|
||||||
.length2
|
.length2:
|
||||||
db "r8"
|
db "r8"
|
||||||
dw 0x0008
|
dw 0x0008
|
||||||
db "r9"
|
db "r9"
|
||||||
@@ -682,7 +897,7 @@ tokens:
|
|||||||
dw 0x005F
|
dw 0x005F
|
||||||
db "jl"
|
db "jl"
|
||||||
dw 0x0061
|
dw 0x0061
|
||||||
.length3
|
.length3:
|
||||||
db "rax"
|
db "rax"
|
||||||
dw 0x0000
|
dw 0x0000
|
||||||
db "rbx"
|
db "rbx"
|
||||||
@@ -781,7 +996,7 @@ tokens:
|
|||||||
dw 0x005E
|
dw 0x005E
|
||||||
db "jle"
|
db "jle"
|
||||||
dw 0x0060
|
dw 0x0060
|
||||||
.length4
|
.length4:
|
||||||
db "r10d"
|
db "r10d"
|
||||||
dw 0x001A
|
dw 0x001A
|
||||||
db "r11d"
|
db "r11d"
|
||||||
@@ -822,20 +1037,35 @@ tokens:
|
|||||||
dw 0x0050
|
dw 0x0050
|
||||||
db "call"
|
db "call"
|
||||||
dw 0x0059
|
dw 0x0059
|
||||||
.length5
|
.length5:
|
||||||
.end
|
.end:
|
||||||
|
|
||||||
msg_welcome db "Welcome to Twasm", 0x0D, 0x0A, 0x00
|
msg_welcome db "Welcome to Twasm", 0x0D, 0x0A, 0x00
|
||||||
msg_halt db "halted.", 0x0D, 0x0A, 0x00
|
msg_halt db "halted.", 0x0D, 0x0A, 0x00
|
||||||
msg_pass db "passed.", 0x0D, 0x0A, 0x00
|
; test result string for a passing test: a line break, then "passed." padded
; on the left with spaces so that it ends at column TEST_LINE_LENGTH
msg_pass:
	db 0x0D, 0x0A
	times (TEST_LINE_LENGTH - (.end - .start)) db " "	; right align
.start	db "passed."
.end	db 0x0D, 0x0A, 0x00
|
||||||
|
; test result string for a failing test: a line break, then "failed." padded
; on the left with spaces so that it ends at column TEST_LINE_LENGTH
msg_fail:
	db 0x0D, 0x0A
	times (TEST_LINE_LENGTH - (.end - .start)) db " "	; right align
.start	db "failed."
.end	db 0x0D, 0x0A, 0x00
|
||||||
|
|
||||||
test_byte db "Q" ; unterminated, just a byte chillin
|
test_byte db "Q" ; unterminated, just a byte chillin
|
||||||
test_token_null db "TestTokn", 0x00 ; followed by null terminator. Quad word
|
test_token_null db "TestTokn", 0x00 ; followed by null terminator. Quad word
|
||||||
test_token_space db "TestTokn " ; followed by space. Quad word
|
test_token_space db "TestTokn " ; followed by space. Quad word
|
||||||
test_elemb_0 ; [This Page Intentionally Left Blank]
|
test_elemb_0: ; [This Page Intentionally Left Blank]
|
||||||
test_elemb_5 db 0x54, 0x00, 0x21, 0x20, 0x34
|
test_elemb_5 db 0x54, 0x00, 0x21, 0x20, 0x34
|
||||||
|
|
||||||
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, 0x00, 0x00, 0x00, 0x00
|
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00
|
||||||
|
|
||||||
debug_string db "debug_string", 0x0D, 0x0A, 0x00
|
debug_string db "debug_string", 0x0D, 0x0A, 0x00
|
||||||
|
|
||||||
|
; sample program that start hands to tokenise
program:
	db "xor eax, eax", 0x0D, 0x0A
	db "inc rax", 0x0D, 0x0A
	db "hlt", 0x0D, 0x0A
	db 0x00				; null terminator; not counted in .size
	; .size is a quad word because start reads it with a 64-bit load
	; (mov rsi, [program.size]); a single byte here would leave the upper
	; 56 bits of rsi filled with whatever follows in memory
.size	dq $ - program - 1
|
||||||
|
|||||||
Reference in New Issue
Block a user