; ==============================================================================
; Syntax: NASM (Intel operand order), 64-bit flat binary assembled at LOAD_ADDR.
;
; Register conventions are file-internal (no platform ABI is followed): each
; routine documents its own inputs, outputs and clobbered registers.
; ==============================================================================

; ------------------------------------------------------------------------------
; memory layout
; ------------------------------------------------------------------------------

LOAD_ADDR               equ 0x00010000  ; address this program is loaded at
TEST_ARENA_ADDR         equ 0x00050000  ; address to run tests at
TEST_ARENA_SIZE         equ 0x1000      ; maximum size tests can use
TOKEN_TABLE_ADDR        equ 0x00060000  ; address the token table is loaded at
TOKEN_TABLE_SIZE        equ 0x1000      ; max length of table
TOKEN_TABLE_ENTRY_SIZE  equ 2           ; size of token table entry; things may
                                        ; break if this ever changes
OUTPUT_ADDR             equ 0x00070000  ; address of outputted binary
OUTPUT_SIZE             equ 0x1000      ; max length of outputted binary
STACK_ADDR              equ 0x00060000  ; address to put the 64-bit stack at

; ------------------------------------------------------------------------------
; token ids and table geometry
; ------------------------------------------------------------------------------

UNRECOGNISED_TOKEN_ID   equ 0xFFFF      ; id of an unrecognised token
TOKEN_OPEN_BRACKET      equ 0x0051      ; id of "[" in the `tokens` table
TOKEN_CLOSE_BRACKET     equ 0x0052      ; id of "]" in the `tokens` table
TOKEN_TERMINATOR        equ 0xFE00      ; terminator entry; low byte is replaced
                                        ; by the terminator character itself
TOKEN_OPEN_BRACKET_FILLED equ 0x10      ; high byte of an open-bracket entry
                                        ; once its close bracket has been seen
TOKEN_ID_SIZE           equ 2           ; bytes of id following each known token
TERMINATOR_COUNT        equ 8           ; entries in token_terminator_8

; ------------------------------------------------------------------------------
; miscellaneous
; ------------------------------------------------------------------------------

TEST_LINE_LENGTH        equ 80          ; right border of test suite results
SERIAL_PORT             equ 0x3F8       ; I/O port `print` writes to
                                        ; (conventional PC COM1 base)

[bits 64]
[org LOAD_ADDR]

; ------------------------------------------------------------------------------
; start
;
; description:
;   entry point: sets up the stack, prints the welcome message, runs the test
;   suite, clears the token table, tokenises the built-in test program and halts
; ------------------------------------------------------------------------------

start:
        mov     rsp, STACK_ADDR         ; we might need more stack space, let's
                                        ; just be safe
        mov     rsi, msg_welcome
        call    print

        call    run_tests

        call    clear_token_table
        mov     rdi, program            ; -> program
        mov     rsi, [program.size]     ; = size of program (stored as a qword)
        call    tokenise

        jmp     halt

; ------------------------------------------------------------------------------
; tokenising
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
;   represents the program at the given address and puts it in the token table
;   it's probably desirable to clear the token table before calling this function
;
;   Every token becomes one TOKEN_TABLE_ENTRY_SIZE-byte entry holding its id.
;   A terminator character becomes TOKEN_TERMINATOR with the character in the
;   low byte. When a close bracket is read, the entry of the most recent open
;   bracket is rewritten: low byte = number of entries between the two brackets
;   (close index - open index), high byte = TOKEN_OPEN_BRACKET_FILLED.
;
;   NOTE: the end bound is inclusive, so the byte at (rdi + rsi) is tokenised as
;   well. `start` passes a size that excludes the program's trailing null, which
;   means that null is recorded as the final terminator entry.
;
;   Tokenising stops early once the token table is full.
;
; parameters:
;   rdi -> first byte of program
;   rsi = size of program in bytes
;
; clobbers:
;   rax, rcx, rdx, rdi, rsi, r10, r11, flags
; ------------------------------------------------------------------------------

tokenise:
        add     rsi, rdi                ; rsi -> byte just past the counted bytes
        xor     ecx, ecx                ; rcx = number of tokens processed

.loop:
        cmp     rdi, rsi                ; pointers are unsigned, so use `ja`
        ja      .break                  ; past the (inclusive) end: done
        cmp     rcx, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE
        jae     .break                  ; table full: never write past its end

        push    rdi
        push    rsi
        push    rcx
        ; rdi -> current byte
        call    identify_next_token     ; ax = id of token, rdx = its length
        pop     rcx
        pop     rsi
        pop     rdi

        ; a terminator character is reported as a 0-length token
        test    rdx, rdx
        jnz     .store
        mov     ax, TOKEN_TERMINATOR    ; terminator entry...
        mov     al, [rdi]               ; ...carrying the terminator byte
        mov     edx, 1                  ; it still consumes one byte of input

.store:
        add     rdi, rdx                ; rdi -> next unread byte
        mov     [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax

        ; TODO fix undefined behaviour when open brackets and closed brackets
        ; aren't correctly paired or have too much distance between them
        cmp     ax, TOKEN_OPEN_BRACKET
        jne     .check_close_bracket
        ; TODO make brackets able to hold more (only the low byte of the entry
        ; index is remembered)
        mov     [.data_open_bracket], cl ; remember where the open bracket is

.check_close_bracket:
        cmp     ax, TOKEN_CLOSE_BRACKET
        jne     .next
        ; rewrite open bracket token entry with a filled out one
        movzx   edx, byte [.data_open_bracket] ; rdx = index of open bracket
        sub     cl, dl                  ; cl = distance from open to close
        mov     byte [TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], cl
        mov     byte [1 + TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], TOKEN_OPEN_BRACKET_FILLED
        add     cl, dl                  ; restore the token counter

.next:
        inc     rcx                     ; +1 token processed
        jmp     .loop

.break:
        ret

.data_open_bracket db 0x00              ; token # of the latest open bracket

; ------------------------------------------------------------------------------
; identify_token
;
; description:
;   returns the id of a given token. If there are multiple ways to represent a
;   given token, like the open-bracket, it returns the one that doesn't require
;   information about the surrounding tokens, because it has no such information.
;   In other words, if it isn't in the `tokens` data structure, this function
;   doesn't see it. If the first byte of the token points to a terminator
;   byte, this function returns it as an unrecognised token.
;
;   Only lengths 1 to 4 are looked up; any other length is unrecognised.
;   Exactly rsi bytes are read from the token.
;
; parameters:
;   rdi -> first byte of token
;   rsi = size of token in bytes
;
; returned:
;   ax = id of token; the rest of rax is zeroed
;
; clobbers:
;   rcx, r10, r11, flags
; ------------------------------------------------------------------------------

identify_token:
        cmp     rsi, 1
        je      .start_length1
        cmp     rsi, 2
        je      .start_length2
        cmp     rsi, 3
        je      .start_length3
        cmp     rsi, 4
        je      .start_length4
        jmp     .unrecognised

        ; length1 -------------------------------------------------------------
.start_length1:
        mov     r11b, [rdi]             ; r11b = token (loaded once)
        mov     rcx, tokens.length1     ; rcx -> current known token
.loop_length1:
        cmp     rcx, tokens.length2     ; still inside the length1 list?
        jae     .unrecognised           ; if not, unrecognised
        cmp     r11b, [rcx]
        je      .found_length1
        add     rcx, 1 + TOKEN_ID_SIZE  ; length of token + length of id
        jmp     .loop_length1
.found_length1:
        movzx   eax, word [rcx + 1]     ; id follows the token text
        ret

        ; length2 -------------------------------------------------------------
.start_length2:
        mov     r11w, [rdi]             ; r11w = token
        mov     rcx, tokens.length2
.loop_length2:
        cmp     rcx, tokens.length3     ; still inside the length2 list?
        jae     .unrecognised
        cmp     r11w, [rcx]
        je      .found_length2
        add     rcx, 2 + TOKEN_ID_SIZE
        jmp     .loop_length2
.found_length2:
        movzx   eax, word [rcx + 2]
        ret

        ; length3 -------------------------------------------------------------
.start_length3:
        ; build the 24-bit token from a word and a byte so that no byte beyond
        ; the token is read
        movzx   r11d, word [rdi]        ; bytes 0..1
        movzx   eax, byte [rdi + 2]     ; byte 2
        shl     eax, 16
        or      r11d, eax               ; r11d = token, top byte zero
        mov     rcx, tokens.length3
.loop_length3:
        cmp     rcx, tokens.length4     ; still inside the length3 list?
        jae     .unrecognised
        mov     r10d, [rcx]             ; known token + first byte of its id
        and     r10d, 0x00FFFFFF        ; keep just the token
        cmp     r10d, r11d
        je      .found_length3
        add     rcx, 3 + TOKEN_ID_SIZE
        jmp     .loop_length3
.found_length3:
        movzx   eax, word [rcx + 3]
        ret

        ; length4 -------------------------------------------------------------
.start_length4:
        mov     r11d, [rdi]             ; r11d = token
        mov     rcx, tokens.length4
.loop_length4:
        cmp     rcx, tokens.length5     ; still inside the length4 list?
        jae     .unrecognised
        cmp     r11d, [rcx]
        je      .found_length4
        add     rcx, 4 + TOKEN_ID_SIZE
        jmp     .loop_length4
.found_length4:
        movzx   eax, word [rcx + 4]
        ret

.unrecognised:
        mov     eax, UNRECOGNISED_TOKEN_ID ; 32-bit write zeroes the rest of rax
        ret

; ------------------------------------------------------------------------------
; identify_next_token
;
; description:
;   like identify_token, except it automatically finds the length by scanning
;   forward to the first byte listed in token_terminator_8. If the first byte
;   of the token points to a terminator byte, it returns a length of 0 (and
;   the id is UNRECOGNISED_TOKEN_ID, since identify_token rejects length 0).
;
; parameters:
;   rdi -> first byte of token
;
; returned:
;   ax = id of token; the rest of rax is zeroed
;   rdx = length of token in bytes
;
; clobbers:
;   rcx, rsi (left holding the length), r10, r11, flags; rdi is preserved
; ------------------------------------------------------------------------------

identify_next_token:
        push    rdi                     ; keep token start for identify_token
        mov     rsi, rdi                ; rsi -> byte being examined
        xor     edi, edi                ; rdi = length so far

.loop:
        movzx   edx, byte [rsi]         ; dl = byte to classify
        push    rsi                     ; elemb consumes rdi and rsi
        push    rdi
        mov     edi, TERMINATOR_COUNT   ; length of terminator list
        mov     rsi, token_terminator_8 ; start of terminator list
        call    elemb                   ; rax = 1 if dl is a terminator
        pop     rdi
        pop     rsi

        cmp     rax, 1
        je      .break                  ; terminator reached: token ends here

        inc     rdi                     ; one more byte in the token
        inc     rsi                     ; next byte of input
        jmp     .loop

.break:
        mov     rsi, rdi                ; rsi = length of token
        pop     rdi                     ; rdi -> first byte of token
        push    rsi
        call    identify_token          ; ax = id
        pop     rdx                     ; rdx = length
        ret

; ------------------------------------------------------------------------------
; copy_token
;
; description:
;   copies a token from one spot in memory to another; copying stops at (and
;   does not copy) the first byte listed in token_terminator_8
;
; parameters:
;   rdi -> start of buffer to be read
;   rsi -> start of buffer to be written
;
; returned:
;   rax -> terminator byte in the read buffer (one past the last byte read)
;   rdx -> one past the last byte written
;
; clobbers:
;   rdi, rsi (both advanced to the returned positions), flags
; ------------------------------------------------------------------------------

copy_token:
.loop:
        mov     dl, [rdi]               ; dl = candidate byte from read buffer

        push    rdi                     ; elemb consumes rdi and rsi
        push    rsi
        mov     edi, TERMINATOR_COUNT   ; length of terminator list
        mov     rsi, token_terminator_8 ; start of terminator list
        call    elemb                   ; rax = 1 if dl is a terminator
        pop     rsi
        pop     rdi

        cmp     rax, 1
        je      .break                  ; terminator: leave it uncopied

        call    copy_byte               ; [rsi] = [rdi]
        inc     rdi                     ; read pointer
        inc     rsi                     ; write pointer
        jmp     .loop

.break:
        mov     rax, rdi                ; -> terminator in read buffer
        mov     rdx, rsi                ; -> next free byte in write buffer
        ret

; ------------------------------------------------------------------------------
; copy_byte
;
; description:
;   copies a byte from one spot in memory to another
;
; parameters:
;   rdi -> byte to be read
;   rsi -> byte to be written
;
; returned:
;   al = byte that was read; the rest of rax is zeroed
; ------------------------------------------------------------------------------

copy_byte:
        movzx   eax, byte [rdi]         ; zero-extends, so rax = the byte
        mov     [rsi], al
        ret

; ------------------------------------------------------------------------------
; utilities
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; print
;
; description:
;   prints a null-terminated string by writing each byte to SERIAL_PORT
;   rdx, rax and rsi are saved and restored for ease of debugging; flags are
;   not preserved
;
; parameters:
;   rsi -> start of null-terminated string
; ------------------------------------------------------------------------------

print:
        push    rdx
        push    rax
        push    rsi

        mov     edx, SERIAL_PORT

.loop:
        mov     al, [rsi]
        test    al, al
        jz      .done                   ; null terminator: stop
        out     dx, al
        inc     rsi
        jmp     .loop

.done:
        pop     rsi
        pop     rax
        pop     rdx
        ret

; ------------------------------------------------------------------------------
; halt
;
; description:
;   halts the program, silly :)
;   if execution resumes after `hlt`, the message is printed and the processor
;   is halted again
; ------------------------------------------------------------------------------

halt:
        mov     rsi, msg_halt
        call    print
        hlt
        jmp     halt

; ------------------------------------------------------------------------------
; elemb
;
; description:
;   checks if given byte is element of the specified list
;
; parameters:
;   rdi = size of list
;   rsi -> start of list
;   dl = given byte
;
; returned:
;   rax = 0: is not an element
;         1: is an element
;
; clobbers:
;   rdi, rsi, flags
; ------------------------------------------------------------------------------

elemb:
.loop:
        test    rdi, rdi                ; any entries left?
        jz      .not_found              ; no: dl not an element of list
        cmp     dl, [rsi]               ; current entry the desired byte?
        je      .found
        inc     rsi                     ; move to next byte
        dec     rdi                     ; and reduce remaining length
        jmp     .loop

.not_found:
        xor     eax, eax                ; return 0; dl not an element of list
        ret

.found:
        mov     eax, 1                  ; return 1; dl an element of list
        ret

; debug strings; nothing in this file references them
.f      db "found", 0x0A, 0x00
.nf     db "not found", 0x0A, 0x00

; ------------------------------------------------------------------------------
; clear_token_table
;
; description:
;   clears the token table as specified by TOKEN_TABLE_SIZE and TOKEN_TABLE_ADDR
;
; clobbers:
;   rax, rcx, rdi, direction flag (cleared)
; ------------------------------------------------------------------------------

clear_token_table:
        cld                             ; make sure stosd walks upwards
        xor     eax, eax                ; value to write
        mov     ecx, TOKEN_TABLE_SIZE / 4 ; number of double words
        mov     rdi, TOKEN_TABLE_ADDR   ; address to start
        rep stosd
        ret

; ------------------------------------------------------------------------------
; clear_test_arena
;
; description:
;   clears the test arena as specified by TEST_ARENA_SIZE and TEST_ARENA_ADDR
;
; clobbers:
;   rax, rcx, rdi, direction flag (cleared)
; ------------------------------------------------------------------------------

clear_test_arena:
        cld                             ; make sure stosd walks upwards
        xor     eax, eax                ; value to write
        mov     ecx, TEST_ARENA_SIZE / 4 ; number of double words
        mov     rdi, TEST_ARENA_ADDR    ; address to start
        rep stosd
        ret

; ------------------------------------------------------------------------------
; tests
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; run_tests
;
; description:
;   runs all tests, clearing the test arena before each one
; ------------------------------------------------------------------------------

run_tests:
        mov     rsi, .msg
        call    print

        call    clear_test_arena
        call    test_copy_byte

        call    clear_test_arena
        call    test_copy_token

        call    clear_test_arena
        call    test_elemb

        call    clear_test_arena
        call    test_identify_token

        call    clear_test_arena
        call    test_identify_next_token

        ret

.msg    db "running test suite...", 0x0A, 0x00

; ------------------------------------------------------------------------------
; test_copy_byte
;
; description:
;   tests copy_byte described functionality
; ------------------------------------------------------------------------------

test_copy_byte:
        mov     rsi, .msg
        call    print

        mov     rdi, test_byte          ; byte to be copied
        mov     rsi, TEST_ARENA_ADDR    ; location of test
        call    copy_byte

        cmp     al, [rsi]               ; returned byte vs. byte in the arena
        jne     .fail
        cmp     al, [test_byte]         ; returned byte vs. expected byte
        jne     .fail

.pass:
        mov     rsi, msg_pass
        call    print
        ret

.fail:
        mov     rsi, msg_fail
        call    print
        ret

.msg    db "test_copy_byte...", 0x00

; ------------------------------------------------------------------------------
; test_copy_token
;
; description:
;   tests copy_token described functionality
; ------------------------------------------------------------------------------

test_copy_token:
        mov     rsi, .msg
        call    print

        ; test case: space terminated
        mov     rdi, test_token_space   ; read buffer
        mov     rsi, TEST_ARENA_ADDR    ; write buffer
        call    copy_token

        ; check reported final indices with the expected final indices
        cmp     rax, test_token_space + 8
        jne     .fail
        cmp     rdx, TEST_ARENA_ADDR + 8
        jne     .fail

        mov     rsi, TEST_ARENA_ADDR
        mov     rcx, [rsi]
        cmp     rcx, [test_token_space] ; copied token matches expected token?
        jne     .fail

        ; both cases copy the same text, so wipe the arena to make sure the
        ; second case cannot pass on the first case's leftovers
        call    clear_test_arena

        ; test case: null terminated
        mov     rdi, test_token_null    ; read buffer
        mov     rsi, TEST_ARENA_ADDR    ; write buffer
        call    copy_token

        ; check reported final indices with the expected final indices
        cmp     rax, test_token_null + 8
        jne     .fail
        cmp     rdx, TEST_ARENA_ADDR + 8
        jne     .fail

        mov     rsi, TEST_ARENA_ADDR
        mov     rcx, [rsi]
        cmp     rcx, [test_token_null]  ; copied token matches expected token?
        jne     .fail

.pass:
        mov     rsi, msg_pass
        call    print
        ret

.fail:
        mov     rsi, msg_fail
        call    print
        ret

.msg    db "test_copy_token...", 0x00

; ------------------------------------------------------------------------------
; test_elemb
;
; description:
;   tests elemb described functionality
; ------------------------------------------------------------------------------

test_elemb:
        mov     rsi, .msg
        call    print

        ; [0]
        mov     rdi, 5
        mov     rsi, test_elemb_5
        mov     dl, [test_elemb_5]
        call    elemb
        cmp     al, 1
        jne     .fail

        ; [n - 1]
        mov     rdi, 5
        mov     rsi, test_elemb_5
        mov     dl, [test_elemb_5 + 4]
        call    elemb
        cmp     al, 1
        jne     .fail

        ; [1]
        mov     rdi, 5
        mov     rsi, test_elemb_5
        mov     dl, [test_elemb_5 + 1]
        call    elemb
        cmp     al, 1
        jne     .fail

        ; not present
        mov     rdi, 5
        mov     rsi, test_elemb_5
        mov     dl, 0xDA
        call    elemb
        cmp     al, 0
        jne     .fail

        ; 0 length list
        mov     rdi, 0
        mov     rsi, test_elemb_0
        mov     dl, 0x34
        call    elemb
        cmp     al, 0
        jne     .fail

.pass:
        mov     rsi, msg_pass
        call    print
        ret

.fail:
        mov     rsi, msg_fail
        call    print
        ret

.msg    db "test_elemb...", 0x00

; ------------------------------------------------------------------------------
; test_identify_token
;
; description:
;   tests identify_token described functionality
; ------------------------------------------------------------------------------

test_identify_token:
        mov     rsi, .msg
        call    print

        ; length1 token that exists
        mov     byte [TEST_ARENA_ADDR], "*"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 1
        call    identify_token
        cmp     ax, 0x0064
        jne     .fail

        ; length1 token that doesn't exist
        mov     byte [TEST_ARENA_ADDR], " "
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 1
        call    identify_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

        ; length2 token that exists
        mov     word [TEST_ARENA_ADDR], "sp"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 2
        call    identify_token
        cmp     ax, 0x0026
        jne     .fail

        ; length2 token that doesn't exist
        mov     word [TEST_ARENA_ADDR], "QQ"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 2
        call    identify_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

        ; length3 token that exists
        mov     dword [TEST_ARENA_ADDR], "rax"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 3
        call    identify_token
        cmp     ax, 0x0000
        jne     .fail

        ; length3 token that exists
        mov     dword [TEST_ARENA_ADDR], "cr0"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 3
        call    identify_token
        cmp     ax, 0x004A
        jne     .fail

        ; length3 token that doesn't exist
        mov     dword [TEST_ARENA_ADDR], "r16"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 3
        call    identify_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

        ; length4 token that exists
        mov     dword [TEST_ARENA_ADDR], "r10d"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 4
        call    identify_token
        cmp     ax, 0x001A
        jne     .fail

        ; length4 token that exists
        mov     dword [TEST_ARENA_ADDR], "r15b"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 4
        call    identify_token
        cmp     ax, 0x003F
        jne     .fail

        ; length4 token that doesn't exist
        mov     dword [TEST_ARENA_ADDR], "r15q"
        mov     rdi, TEST_ARENA_ADDR
        mov     rsi, 4
        call    identify_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

.pass:
        mov     rsi, msg_pass
        call    print
        ret

.fail:
        mov     rsi, msg_fail
        call    print
        ret

.msg    db "test_identify_token...", 0x00

; ------------------------------------------------------------------------------
; test_identify_next_token
;
; description:
;   tests identify_next_token described functionality
; ------------------------------------------------------------------------------

test_identify_next_token:
        mov     rsi, .msg
        call    print

        ; length1 token that exists
        mov     word [TEST_ARENA_ADDR], "* "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, 0x0064
        jne     .fail

        ; length1 token that doesn't exist
        mov     word [TEST_ARENA_ADDR], " "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

        ; length2 token that exists
        mov     dword [TEST_ARENA_ADDR], "sp "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, 0x0026
        jne     .fail

        ; length2 token that doesn't exist
        mov     dword [TEST_ARENA_ADDR], "QQ "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

        ; length3 token that exists
        mov     dword [TEST_ARENA_ADDR], "rax "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, 0x0000
        jne     .fail

        ; length3 token that exists
        mov     dword [TEST_ARENA_ADDR], "cr0 "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, 0x004A
        jne     .fail

        ; length3 token that doesn't exist
        mov     dword [TEST_ARENA_ADDR], "r16 "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

        ; length4 token that exists
        mov     dword [TEST_ARENA_ADDR], "r10d"
        mov     byte [TEST_ARENA_ADDR + 4], " "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, 0x001A
        jne     .fail

        ; length4 token that exists
        mov     dword [TEST_ARENA_ADDR], "r15b"
        mov     byte [TEST_ARENA_ADDR + 4], " "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, 0x003F
        jne     .fail

        ; length4 token that doesn't exist
        mov     dword [TEST_ARENA_ADDR], "r15q"
        mov     byte [TEST_ARENA_ADDR + 4], " "
        mov     rdi, TEST_ARENA_ADDR
        call    identify_next_token
        cmp     ax, UNRECOGNISED_TOKEN_ID
        jne     .fail

.pass:
        mov     rsi, msg_pass
        call    print
        ret

.fail:
        mov     rsi, msg_fail
        call    print
        ret

.msg    db "test_identify_next_token...", 0x00

; ------------------------------------------------------------------------------
; data
; ------------------------------------------------------------------------------

; TOKEN text, id
;   emits one known-token record: the token's text immediately followed by its
;   TOKEN_ID_SIZE-byte id. identify_token walks these records by fixed stride,
;   so every record in a .lengthN list must have text of exactly N bytes.
%macro TOKEN 2
        db      %1
        dw      %2
%endmacro

tokens:
.length1:
        TOKEN "[", 0x0051
        TOKEN "]", 0x0052
        TOKEN "+", 0x0062
        TOKEN "-", 0x0063
        TOKEN "*", 0x0064
        TOKEN "/", 0x0065
.length2:
        TOKEN "r8", 0x0008
        TOKEN "r9", 0x0009
        TOKEN "ax", 0x0020
        TOKEN "bx", 0x0021
        TOKEN "cx", 0x0022
        TOKEN "dx", 0x0023
        TOKEN "si", 0x0024
        TOKEN "di", 0x0025
        TOKEN "sp", 0x0026
        TOKEN "bp", 0x0027
        TOKEN "al", 0x0030
        TOKEN "bl", 0x0031
        TOKEN "cl", 0x0032
        TOKEN "dl", 0x0033
        TOKEN "ah", 0x0040
        TOKEN "bh", 0x0041
        TOKEN "ch", 0x0042
        TOKEN "dh", 0x0043
        TOKEN "cs", 0x0044
        TOKEN "ds", 0x0045
        TOKEN "es", 0x0046
        TOKEN "fs", 0x0047
        TOKEN "gs", 0x0048
        TOKEN "ss", 0x0049
        TOKEN "je", 0x005C
        TOKEN "jg", 0x005F
        TOKEN "jl", 0x0061
.length3:
        TOKEN "rax", 0x0000
        TOKEN "rbx", 0x0001
        TOKEN "rcx", 0x0002
        TOKEN "rdx", 0x0003
        TOKEN "rsi", 0x0004
        TOKEN "rdi", 0x0005
        TOKEN "rsp", 0x0006
        TOKEN "rbp", 0x0007
        TOKEN "r10", 0x000A
        TOKEN "r11", 0x000B
        TOKEN "r12", 0x000C
        TOKEN "r13", 0x000D
        TOKEN "r14", 0x000E
        TOKEN "r15", 0x000F
        TOKEN "eax", 0x0010
        TOKEN "ebx", 0x0011
        TOKEN "ecx", 0x0012
        TOKEN "edx", 0x0013
        TOKEN "esi", 0x0014
        TOKEN "edi", 0x0015
        TOKEN "esp", 0x0016
        TOKEN "ebp", 0x0017
        TOKEN "r8d", 0x0018
        TOKEN "r9d", 0x0019
        TOKEN "r8w", 0x0028
        TOKEN "r9w", 0x0029
        TOKEN "sil", 0x0034
        TOKEN "dil", 0x0035
        TOKEN "spl", 0x0036
        TOKEN "bpl", 0x0037
        TOKEN "r8b", 0x0038
        TOKEN "r9b", 0x0039
        TOKEN "cr0", 0x004A
        TOKEN "cr2", 0x004B
        TOKEN "cr3", 0x004C
        TOKEN "cr4", 0x004D
        TOKEN "cr8", 0x004E
        TOKEN "hlt", 0x004F
        TOKEN "xor", 0x0053
        TOKEN "inc", 0x0054
        TOKEN "dec", 0x0055
        TOKEN "mov", 0x0056
        TOKEN "add", 0x0057
        TOKEN "sub", 0x0058
        TOKEN "ret", 0x005A
        TOKEN "cmp", 0x005B
        TOKEN "jne", 0x005D
        TOKEN "jge", 0x005E
        TOKEN "jle", 0x0060
.length4:
        TOKEN "r10d", 0x001A
        TOKEN "r11d", 0x001B
        TOKEN "r12d", 0x001C
        TOKEN "r13d", 0x001D
        TOKEN "r14d", 0x001E
        TOKEN "r15d", 0x001F
        TOKEN "r10w", 0x002A
        TOKEN "r11w", 0x002B
        TOKEN "r12w", 0x002C
        TOKEN "r13w", 0x002D
        TOKEN "r14w", 0x002E
        TOKEN "r15w", 0x002F
        TOKEN "r10b", 0x003A
        TOKEN "r11b", 0x003B
        TOKEN "r12b", 0x003C
        TOKEN "r13b", 0x003D
        TOKEN "r14b", 0x003E
        TOKEN "r15b", 0x003F
        TOKEN "int3", 0x0050
        TOKEN "call", 0x0059
.length5:                               ; end marker for the length4 list
.end:

msg_welcome db "Welcome to Twasm", 0x0A, 0x00
msg_halt    db "halted.", 0x0A, 0x00

; result messages: a newline, then padding so that the word ends at column
; TEST_LINE_LENGTH (.start - .end is minus the length of the word)
msg_pass:
        db      0x0A
        times (TEST_LINE_LENGTH + .start - .end) db " " ; right align
.start  db      "passed."
.end    db      0x0A, 0x00

msg_fail:
        db      0x0A
        times (TEST_LINE_LENGTH + .start - .end) db " " ; right align
.start  db      "failed."
.end    db      0x0A, 0x00

test_byte           db "Q"              ; unterminated, just a byte chillin
test_token_null     db "TestTokn", 0x00 ; followed by null terminator. Quad word
test_token_space    db "TestTokn "      ; followed by space. Quad word
test_elemb_0:                           ; [This Page Intentionally Left Blank]
test_elemb_5        db 0x54, 0x00, 0x21, 0x20, 0x34

; bytes that end a token; the trailing zeros pad the list to TERMINATOR_COUNT
token_terminator_8  db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00

debug_string        db "debug_string", 0x0A, 0x00

; test program
program:
        db      "xor eax, eax", 0x0A
        db      "inc rax", 0x0A
        db      "mov [ rax ], rdx", 0x0A
        db      "hlt", 0x0A
        db      0x00                    ; just for the sake of being able to
                                        ; print it, I made it a string
; size of the program text, excluding the trailing null; a full qword because
; `start` loads it with a 64-bit mov
.size   dq      $ - program - 1