diff --git a/twasm/README.md b/twasm/README.md index 83dab90..b414770 100644 --- a/twasm/README.md +++ b/twasm/README.md @@ -129,7 +129,7 @@ supported tokens are listed below | cr8 | 0x004E | | | hlt | 0x004F | | | int3 | 0x0050 | | -| [ | 0x0051 | | +| [ | 0x0051 | open bracket placeholder; 0x10XX should be used in contexts where the surrounding tokens can be known | | ] | 0x0052 | | | xor | 0x0053 | | | inc | 0x0054 | | @@ -150,5 +150,50 @@ supported tokens are listed below | - | 0x0063 | | | * | 0x0064 | | | / | 0x0065 | | +| [ | 0x10XX | open bracket, where `XX` is the number of tokens up to and including the closing bracket | | | 0xFEXX | token terminator byte as token, where `XX` is the byte | | | 0xFFFF | unrecognised token | + +### example program + +#### program in assembly + +this program doesn't do anything useful, it's just a test + +```nasm +xor eax, eax +inc rax +mov [ rax ], rdx +hlt + +``` + +#### tokenization + +```nasm +0x0053 ; xor +0xFE20 ; space +0x0010 ; eax +0xFE2C ; comma +0xFE20 ; space +0x0010 ; eax +0xFE0A ; newline +0x0054 ; inc +0xFE20 ; space +0x0000 ; rax +0xFE0A ; newline +0x0056 ; mov +0xFE20 ; space +0x1004 ; open bracket (4) +0xFE20 ; space |1 +0x0000 ; rax |2 +0xFE20 ; space |3 +0x0052 ; close bracket |4 +0xFE2C ; comma +0xFE20 ; space +0x0003 ; rdx +0xFE0A ; newline +0x004F ; hlt +0xFE0A ; newline +0xFE00 ; null terminator +``` diff --git a/twasm/asm/main.asm b/twasm/asm/main.asm index 73abda4..dce7bed 100644 --- a/twasm/asm/main.asm +++ b/twasm/asm/main.asm @@ -76,28 +76,60 @@ tokenise: ; deal with terminator character (reported as 0 length token) cmp rdx, 0 je .token_length0 - jne .continue + jne .continue0 .token_length0: mov ax, 0xFE00 ; terminator character mov al, [rdi] ; byte of terminator mov edx, 1 ; byte length is 1 - .continue: + .continue0: add rdi, rdx ; current byte + length of token = next unread byte mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax ; fill next entry ; in token table + + ; TODO fix undefined behaviour when open brackets 
and closed brackets aren't + ; correctly paired or have too much distance between them + cmp ax, 0x0051 ; check if read token is an open bracket + je .open_bracket ; if so, handle it + jne .continue_open_bracket ; if not, continue + + .open_bracket: + ; TODO make brackets able to hold more + mov [.data_open_bracket], cl ; record which entry the open bracket is at + + .continue_open_bracket: + cmp ax, 0x0052 ; check if read token is a closing bracket + je .close_bracket ; if so, handle it + jne .continue_close_bracket ; if not, continue + + .close_bracket: + ; rewrite open bracket token entry with a filled out one + mov dl, [.data_open_bracket] + sub cl, dl + mov byte [TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], cl + mov byte [1 + TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], 0x10 + add cl, dl + + .continue_close_bracket: inc rcx ; +1 token processed jmp .loop .break: ret + .data_open_bracket db 0x00 ; represents the token # of the latest open bracket + ; ------------------------------------------------------------------------------ ; identify_token ; ; description: -; returns the id of a given token +; returns the id of a given token. If there are multiple ways to represent a +; given token, like the open-bracket, it returns the one that doesn't require +; information about the surrounding tokens, because it has no such information. +; In other words, if it isn't in the `tokens` data structure, this function +; doesn't see it. If the first byte of the token points to a terminator +; byte, this function returns it as an unrecognised token. 
; ; parameters: ; rdi -> first byte of token @@ -122,6 +154,7 @@ identify_token: jmp .unrecognised ; else unrecognised + ; length1 .start_length1: mov rcx, tokens.length1 ; rcx -> list of known tokens @@ -142,6 +175,7 @@ identify_token: mov ax, [rcx + 1] ; return id of token ret + ; length2 .start_length2: mov rcx, tokens.length2 ; rcx -> list of known tokens @@ -162,6 +196,7 @@ identify_token: mov ax, [rcx + 2] ; return id of token ret + ; length3 .start_length3: mov rcx, tokens.length3 ; rcx -> list of known tokens @@ -187,6 +222,7 @@ identify_token: mov ax, [rcx + 3] ; return id of token ret + ; length4 .start_length4: mov rcx, tokens.length4 ; rcx -> list of known tokens @@ -215,7 +251,8 @@ identify_token: ; ------------------------------------------------------------------------------ ; identify_next_token ; description: -; like identify_token, except it automatically finds the length +; like identify_token, except it automatically finds the length. If the first +; byte of the token points to a terminator byte, it returns a length of 0. ; ; parameters: ; rdi -> first byte of token @@ -1062,9 +1099,11 @@ token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00 debug_string db "debug_string", 0x0A, 0x00 +; test program program: db "xor eax, eax", 0x0A db "inc rax", 0x0A + db "mov [ rax ], rdx", 0x0A db "hlt", 0x0A - db 0x00 + db 0x00 ; just for the sake of being able to print it, I made it a string .size db $ - program - 1