parse brackets, improve docs
This commit is contained in:
@@ -129,7 +129,7 @@ supported tokens are listed below
|
|||||||
| cr8 | 0x004E | |
|
| cr8 | 0x004E | |
|
||||||
| hlt | 0x004F | |
|
| hlt | 0x004F | |
|
||||||
| int3 | 0x0050 | |
|
| int3 | 0x0050 | |
|
||||||
| [ | 0x0051 | |
|
| [ | 0x0051 | open bracket placeholder; 0x10XX should be used in contexts where the surrounding tokens can be known |
|
||||||
| ] | 0x0052 | |
|
| ] | 0x0052 | |
|
||||||
| xor | 0x0053 | |
|
| xor | 0x0053 | |
|
||||||
| inc | 0x0054 | |
|
| inc | 0x0054 | |
|
||||||
@@ -150,5 +150,50 @@ supported tokens are listed below
|
|||||||
| - | 0x0063 | |
|
| - | 0x0063 | |
|
||||||
| * | 0x0064 | |
|
| * | 0x0064 | |
|
||||||
| / | 0x0065 | |
|
| / | 0x0065 | |
|
||||||
|
| [ | 0x10XX | open bracket, where `XX` is the number of tokens from the open bracket up to and including the closing bracket |
|
||||||
| | 0xFEXX | token terminator byte as token, where `XX` is the byte |
|
| | 0xFEXX | token terminator byte as token, where `XX` is the byte |
|
||||||
| | 0xFFFF | unrecognised token |
|
| | 0xFFFF | unrecognised token |
|
||||||
|
|
||||||
|
### example program
|
||||||
|
|
||||||
|
#### program in assembly
|
||||||
|
|
||||||
|
this program doesn't do anything useful, it's just a test
|
||||||
|
|
||||||
|
```nasm
|
||||||
|
xor eax, eax
|
||||||
|
inc rax
|
||||||
|
mov [ rax ], rdx
|
||||||
|
hlt
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### tokenization
|
||||||
|
|
||||||
|
```nasm
|
||||||
|
0x0053 ; xor
|
||||||
|
0xFE20 ; space
|
||||||
|
0x0010 ; eax
|
||||||
|
0xFE2C ; comma
|
||||||
|
0xFE20 ; space
|
||||||
|
0x0010 ; eax
|
||||||
|
0xFE0A ; newline
|
||||||
|
0x0054 ; inc
|
||||||
|
0xFE20 ; space
|
||||||
|
0x0000 ; rax
|
||||||
|
0xFE0A ; newline
|
||||||
|
0x0056 ; mov
|
||||||
|
0xFE20 ; space
|
||||||
|
0x1004 ; open bracket (4)
|
||||||
|
0xFE20 ; space |1
|
||||||
|
0x0000 ; rax |2
|
||||||
|
0xFE20 ; space |3
|
||||||
|
0x0052 ; close bracket |4
|
||||||
|
0xFE2C ; comma
|
||||||
|
0xFE20 ; space
|
||||||
|
0x0003 ; rdx
|
||||||
|
0xFE0A ; newline
|
||||||
|
0x004F ; hlt
|
||||||
|
0xFE0A ; newline
|
||||||
|
0xFE00 ; null terminator
|
||||||
|
```
|
||||||
|
|||||||
@@ -76,28 +76,60 @@ tokenise:
|
|||||||
; deal with terminator character (reported as 0 length token)
|
; deal with terminator character (reported as 0 length token)
|
||||||
cmp rdx, 0
|
cmp rdx, 0
|
||||||
je .token_length0
|
je .token_length0
|
||||||
jne .continue
|
jne .continue0
|
||||||
|
|
||||||
.token_length0:
|
.token_length0:
|
||||||
mov ax, 0xFE00 ; terminator character
|
mov ax, 0xFE00 ; terminator character
|
||||||
mov al, [rdi] ; byte of terminator
|
mov al, [rdi] ; byte of terminator
|
||||||
mov edx, 1 ; byte length is 1
|
mov edx, 1 ; byte length is 1
|
||||||
|
|
||||||
.continue:
|
.continue0:
|
||||||
add rdi, rdx ; current byte + length of token = next unread byte
|
add rdi, rdx ; current byte + length of token = next unread byte
|
||||||
|
|
||||||
mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax ; fill next entry
|
mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax ; fill next entry
|
||||||
; in token table
|
; in token table
|
||||||
|
|
||||||
|
; TODO fix undefined behaviour when open brackets and closed brackets aren't
|
||||||
|
; correctly paired or have too much distance between them
|
||||||
|
cmp ax, 0x0051 ; check if read token is an open bracket
|
||||||
|
je .open_bracket ; if so, handle it
|
||||||
|
jne .continue_open_bracket ; if not, continue
|
||||||
|
|
||||||
|
.open_bracket:
|
||||||
|
; TODO make brackets able to hold more
|
||||||
|
mov [.data_open_bracket], cl ; record which entry the open bracket is at
|
||||||
|
|
||||||
|
.continue_open_bracket:
|
||||||
|
cmp ax, 0x0052 ; check if read token is a closing bracket
|
||||||
|
je .close_bracket ; if so, handle it
|
||||||
|
jne .continue_close_bracket ; if not, continue
|
||||||
|
|
||||||
|
.close_bracket:
|
||||||
|
; rewrite open bracket token entry with a filled out one
|
||||||
|
mov dl, [.data_open_bracket]
|
||||||
|
sub cl, dl
|
||||||
|
mov byte [TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], cl
|
||||||
|
mov byte [1 + TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], 0x10
|
||||||
|
add cl, dl
|
||||||
|
|
||||||
|
.continue_close_bracket:
|
||||||
inc rcx ; +1 token processed
|
inc rcx ; +1 token processed
|
||||||
jmp .loop
|
jmp .loop
|
||||||
.break:
|
.break:
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
.data_open_bracket db 0x00 ; represents the token # of the latest open bracket
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; identify_token
|
; identify_token
|
||||||
;
|
;
|
||||||
; description:
|
; description:
|
||||||
; returns the id of a given token
|
; returns the id of a given token. If there are multiple ways to represent a
|
||||||
|
; given token, like the open-bracket, it returns the one that doesn't require
|
||||||
|
; information about the surrounding tokens, because it has no such information.
|
||||||
|
; In other words, if it isn't in the `tokens` data structure, this function
|
||||||
|
; doesn't see it. If the first byte of the token is itself a terminator
|
||||||
|
; byte, this function returns it as an unrecognised token.
|
||||||
;
|
;
|
||||||
; parameters:
|
; parameters:
|
||||||
; rdi -> first byte of token
|
; rdi -> first byte of token
|
||||||
@@ -122,6 +154,7 @@ identify_token:
|
|||||||
|
|
||||||
jmp .unrecognised ; else unrecognised
|
jmp .unrecognised ; else unrecognised
|
||||||
|
|
||||||
|
; length1
|
||||||
.start_length1:
|
.start_length1:
|
||||||
mov rcx, tokens.length1 ; rcx -> list of known tokens
|
mov rcx, tokens.length1 ; rcx -> list of known tokens
|
||||||
|
|
||||||
@@ -142,6 +175,7 @@ identify_token:
|
|||||||
mov ax, [rcx + 1] ; return id of token
|
mov ax, [rcx + 1] ; return id of token
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
; length2
|
||||||
.start_length2:
|
.start_length2:
|
||||||
mov rcx, tokens.length2 ; rcx -> list of known tokens
|
mov rcx, tokens.length2 ; rcx -> list of known tokens
|
||||||
|
|
||||||
@@ -162,6 +196,7 @@ identify_token:
|
|||||||
mov ax, [rcx + 2] ; return id of token
|
mov ax, [rcx + 2] ; return id of token
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
; length3
|
||||||
.start_length3:
|
.start_length3:
|
||||||
mov rcx, tokens.length3 ; rcx -> list of known tokens
|
mov rcx, tokens.length3 ; rcx -> list of known tokens
|
||||||
|
|
||||||
@@ -187,6 +222,7 @@ identify_token:
|
|||||||
mov ax, [rcx + 3] ; return id of token
|
mov ax, [rcx + 3] ; return id of token
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
; length4
|
||||||
.start_length4:
|
.start_length4:
|
||||||
mov rcx, tokens.length4 ; rcx -> list of known tokens
|
mov rcx, tokens.length4 ; rcx -> list of known tokens
|
||||||
|
|
||||||
@@ -215,7 +251,8 @@ identify_token:
|
|||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; identify_next_token
|
; identify_next_token
|
||||||
; description:
|
; description:
|
||||||
; like identify_token, except it automatically finds the length
|
; like identify_token, except it automatically finds the length. If the first
|
||||||
|
; byte of the token is itself a terminator byte, it returns a length of 0.
|
||||||
;
|
;
|
||||||
; parameters:
|
; parameters:
|
||||||
; rdi -> first byte of token
|
; rdi -> first byte of token
|
||||||
@@ -1062,9 +1099,11 @@ token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00
|
|||||||
|
|
||||||
debug_string db "debug_string", 0x0A, 0x00
|
debug_string db "debug_string", 0x0A, 0x00
|
||||||
|
|
||||||
|
; test program
|
||||||
program:
|
program:
|
||||||
db "xor eax, eax", 0x0A
|
db "xor eax, eax", 0x0A
|
||||||
db "inc rax", 0x0A
|
db "inc rax", 0x0A
|
||||||
|
db "mov [ rax ], rdx", 0x0A
|
||||||
db "hlt", 0x0A
|
db "hlt", 0x0A
|
||||||
db 0x00
|
db 0x00 ; just for the sake of being able to print it, I made it a string
|
||||||
.size db $ - program - 1
|
.size db $ - program - 1
|
||||||
|
|||||||
Reference in New Issue
Block a user