Files
bootler/twasm/asm/main.asm
2026-03-09 11:00:59 +01:00

919 lines
21 KiB
NASM

; TODO actually enforce any of these *_SIZE constants :p
;
; Fixed memory map and sentinel values used throughout this file. All addresses
; are absolute; the *_SIZE values are byte counts.
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
TEST_ARENA_SIZE equ 0x1000 ; maximum size tests can use
TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000 ; max length of table
TOKEN_TABLE_ENTRY_SIZE equ 2 ; size of token table entry; things may break
; if this ever changes (it is used directly as an index scale factor)
OUTPUT_ADDR equ 0x00070000 ; address of the output binary
OUTPUT_SIZE equ 0x1000 ; max length of the output binary
STACK_ADDR equ 0x00060000 ; initial rsp for the 64-bit stack; same value as
; TOKEN_TABLE_ADDR, but pushes write below this address while the token table
; is written at and above it
UNRECOGNISED_TOKEN_ID equ 0xFFFF ; id of an unrecognised token
UNRECOGNISED_ID_TYPE equ 0x0F ; type of an unrecognised id
UNRECOGNISED_ID_METADATA equ 0xFF ; metadata of an unrecognised id
TEST_LINE_LENGTH equ 80 ; right border of test suite results
; assemble as 64-bit code, with every label resolved relative to LOAD_ADDR
[bits 64]
[org LOAD_ADDR]
; ------------------------------------------------------------------------------
; start
;
; description:
;   entry point. Sets up the stack, prints the welcome message, runs the test
;   suite, then tokenises and assembles the embedded test program and halts.
;   Never returns.
; ------------------------------------------------------------------------------
start:
    mov     rsp, STACK_ADDR             ; we might need more stack space, let's just be safe

    mov     rsi, msg_welcome
    call    print

    call    run_tests

    call    clear_token_table
    mov     rdi, program                ; rdi -> program
    ; program.size is declared with `db`, so it is exactly one byte wide. Load
    ; it zero-extended; a qword load would also pick up the 7 bytes after it.
    movzx   esi, byte [program.size]    ; rsi = size of program
    call    tokenise                    ; rax = number of tokens processed

    push    rax                         ; clear_output_arena overwrites rax, rcx and rdi
    call    clear_output_arena
    pop     rdi                         ; rdi = number of tokens

    call    assemble
    jmp     halt
; ------------------------------------------------------------------------------
; assembling
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; assemble
; TODO write tests
; TODO make it work :/ putting the cart before the horse
;
; description:
;   assembles the program from tokens located at TOKEN_TABLE_ADDR into a flat
;   binary located at OUTPUT_ADDR. It's probably desirable to clear the output
;   arena before calling this function.
;
;   For now it only walks the token table and prints a line for every operator
;   and every register it finds; nothing is written to OUTPUT_ADDR yet.
;
; parameters:
;   rdi = number of tokens in the token table
;
; notes:
;   rdi and rsi are preserved. rax is overwritten, as is anything that
;   get_tte_type overwrites.
; ------------------------------------------------------------------------------
assemble:
    xor     eax, eax                    ; rax = index of the token being examined
.loop:
    cmp     rax, rdi                    ; all tokens examined?
    jae     .break                      ; unsigned compare: these are counts

    push    rdi                         ; save total number of tokens
    push    rax                         ; save index
    movzx   edi, word [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; rdi = next tte
    call    get_tte_type                ; al = type of the entry

    cmp     al, 0x01                    ; operator?
    je      .operator
    cmp     al, 0x02                    ; register?
    je      .register
    jmp     .next

.operator:
    push    rsi
    mov     rsi, .msg_found_operator
    call    print
    pop     rsi
    jmp     .next

.register:
    push    rsi
    mov     rsi, .msg_found_register
    call    print
    pop     rsi

.next:
    pop     rax                         ; index
    pop     rdi                         ; total number of tokens
    inc     rax                         ; move to next token
    jmp     .loop

.break:
    ret

.msg_found_operator db "found operator", 0x0A, 0x00
.msg_found_register db "found register", 0x0A, 0x00
; ------------------------------------------------------------------------------
; get_tte_type
;
; description:
;   given a token table entry, returns the declared type in `tokens.by_id`. If
;   there is no entry, returns UNRECOGNISED_ID_TYPE
;
;   `tokens.by_id` is searched linearly; each entry is 4 bytes: a 16-bit id, a
;   type byte and a metadata byte.
;
; parameters:
;   di = token table entry
;
; returned:
;   al = type of token, or UNRECOGNISED_ID_TYPE. The upper 4 bits of al are
;        zeroed; the rest of rax is zeroed.
;
; notes:
;   rdi is masked down to its low 16 bits.
; ------------------------------------------------------------------------------
get_tte_type:
    and     edi, 0xFFFF                 ; mask input so it behaves as expected
    xor     eax, eax                    ; rax = index into tokens.by_id
.loop:
    ; valid indices are 0 .. count - 1, so stop as soon as rax reaches count
    cmp     rax, (tokens.by_id_end - tokens.by_id) / 4
    jae     .not_found
    cmp     di, [tokens.by_id + rax * 4] ; id of this entry
    je      .found
    inc     rax
    jmp     .loop

.not_found:
    mov     eax, UNRECOGNISED_ID_TYPE
    and     eax, 0xF                    ; mask as expected
    ret

.found:
    movzx   eax, byte [2 + tokens.by_id + rax * 4] ; type byte, rest of rax zeroed
    and     eax, 0xF                    ; mask as expected
    ret
; ------------------------------------------------------------------------------
; get_tte_typed_metadata
;
; description:
;   given a token table entry, returns the declared typed metadata in
;   `tokens.by_id`. If there is no entry, returns UNRECOGNISED_ID_METADATA
;
;   `tokens.by_id` is searched linearly; each entry is 4 bytes: a 16-bit id, a
;   type byte and a metadata byte.
;
; parameters:
;   di = token table entry
;
; returned:
;   al = typed metadata of token, or UNRECOGNISED_ID_METADATA; the rest of rax
;        is zeroed.
;
; notes:
;   rdi is masked down to its low 16 bits.
; ------------------------------------------------------------------------------
get_tte_typed_metadata:
    and     edi, 0xFFFF                 ; mask input so it behaves as expected
    xor     eax, eax                    ; rax = index into tokens.by_id
.loop:
    ; valid indices are 0 .. count - 1, so stop as soon as rax reaches count
    cmp     rax, (tokens.by_id_end - tokens.by_id) / 4
    jae     .not_found
    cmp     di, [tokens.by_id + rax * 4] ; id of this entry
    je      .found
    inc     rax
    jmp     .loop

.not_found:
    mov     eax, UNRECOGNISED_ID_METADATA
    ret

.found:
    movzx   eax, byte [3 + tokens.by_id + rax * 4] ; metadata byte, rest of rax zeroed
    ret
; ------------------------------------------------------------------------------
; tokenising
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
;   represents the program at the given address as tokens and puts them in the
;   token table. It's probably desirable to clear the token table before
;   calling this function.
;
;   Terminator bytes (see token_terminator_8) are recorded as their own tokens
;   with id 0xFE00 + the byte. When a closing bracket is read, the entry of the
;   latest open bracket is rewritten to 0x10NN, where NN is the number of
;   entries between the two brackets.
;
;   Tokenising stops at the end of the program (only the bytes in
;   [rdi, rdi + rsi) are used as token starts) or when the token table holds
;   TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE entries, whichever comes first.
;
; parameters:
;   rdi -> first byte of program
;   rsi = size of program in bytes
;
; returned:
;   rax = number of tokens processed
; ------------------------------------------------------------------------------
tokenise:
    add     rsi, rdi                    ; rsi -> one past the last byte of program
    xor     ecx, ecx                    ; rcx = number of tokens processed
.loop:
    cmp     rdi, rsi                    ; reached the end of the program?
    jae     .break                      ; unsigned: these are addresses
    cmp     rcx, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE
    jae     .break                      ; token table is full

    push    rdi
    push    rsi
    push    rcx
    ; rdi -> current byte
    call    identify_next_token
    ; ax = id of token
    ; rdx = length of token
    pop     rcx
    pop     rsi
    pop     rdi

    ; deal with terminator character (reported as 0 length token)
    test    rdx, rdx
    jnz     .store
    movzx   eax, byte [rdi]             ; byte of terminator
    or      eax, 0xFE00                 ; ax = 0xFE00 + terminator byte
    mov     edx, 1                      ; byte length is 1

.store:
    add     rdi, rdx                    ; current byte + length of token = next unread byte
    mov     [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax ; fill next entry

    ; TODO fix undefined behaviour when open brackets and closed brackets aren't
    ; correctly paired or have too much distance between them
    cmp     ax, 0x0051                  ; open bracket?
    jne     .check_close_bracket
    ; TODO make brackets able to hold more
    mov     [.data_open_bracket], cl    ; record which entry the open bracket is at
    jmp     .next

.check_close_bracket:
    cmp     ax, 0x0052                  ; closing bracket?
    jne     .next
    ; rewrite open bracket token entry with a filled out one
    movzx   edx, byte [.data_open_bracket] ; rdx = entry # of the open bracket
    mov     eax, ecx
    sub     al, dl                      ; al = entries from open to closing bracket
    mov     byte [TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], al
    mov     byte [1 + TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], 0x10

.next:
    inc     rcx                         ; +1 token processed
    jmp     .loop

.break:
    mov     rax, rcx
    ret

.data_open_bracket db 0x00 ; represents the token # of the latest open bracket
; ------------------------------------------------------------------------------
; identify_token
;
; description:
;   returns the id of a given token. If there are multiple ways to represent a
;   given token, like the open-bracket, it returns the one that doesn't require
;   information about the surrounding tokens, because it has no such
;   information. In other words, if it isn't in the `tokens` data structure,
;   this function doesn't see it. If the first byte of the token points to a
;   terminator byte, this function returns it as an unrecognised token.
;
;   Only lengths 1 to 4 are looked up; every other length is unrecognised.
;   Exactly rsi bytes are read from the token.
;
; parameters:
;   rdi -> first byte of token
;   rsi = size of token in bytes
;
; returned:
;   ax = id of token; the rest of rax is zeroed
;
; notes:
;   overwrites rcx and r10.
; ------------------------------------------------------------------------------
identify_token:
    cmp     rsi, 1                      ; if the token has length 1
    je      .start_length1              ; then enter the length 1 loop
    cmp     rsi, 2                      ; if the token has length 2
    je      .start_length2              ; then enter the length 2 loop
    cmp     rsi, 3                      ; if the token has length 3
    je      .start_length3              ; then enter the length 3 loop
    cmp     rsi, 4                      ; if the token has length 4
    je      .start_length4              ; then enter the length 4 loop
    jmp     .unrecognised               ; else unrecognised

    ; length1: entries are 1 name byte + 2 id bytes
.start_length1:
    mov     rcx, tokens.by_name_1       ; rcx -> list of known tokens
.loop_length1:
    cmp     rcx, tokens.by_name_2       ; still in the bounds of length1 tokens?
    jae     .unrecognised               ; if not, unrecognised
    mov     r10b, [rcx]                 ; known token
    cmp     r10b, [rdi]                 ; if known token matches token
    je      .found_length1              ; exit loop
    add     rcx, 3                      ; length of token + length of id
    jmp     .loop_length1
.found_length1:
    movzx   eax, word [rcx + 1]         ; return id of token
    ret

    ; length2: entries are 2 name bytes + 2 id bytes
.start_length2:
    mov     rcx, tokens.by_name_2       ; rcx -> list of known tokens
.loop_length2:
    cmp     rcx, tokens.by_name_3       ; still in the bounds of length2 tokens?
    jae     .unrecognised               ; if not, unrecognised
    mov     r10w, [rcx]                 ; known token
    cmp     r10w, [rdi]                 ; if known token matches token
    je      .found_length2              ; exit loop
    add     rcx, 4                      ; length of token + length of id
    jmp     .loop_length2
.found_length2:
    movzx   eax, word [rcx + 2]         ; return id of token
    ret

    ; length3: entries are 3 name bytes + 2 id bytes
.start_length3:
    mov     rcx, tokens.by_name_3       ; rcx -> list of known tokens
.loop_length3:
    cmp     rcx, tokens.by_name_4       ; still in the bounds of length3 tokens?
    jae     .unrecognised               ; if not, unrecognised
    ; compare as a word plus a byte so that no byte past the token is read
    mov     r10w, [rcx]                 ; first two bytes of known token
    cmp     r10w, [rdi]
    jne     .next_length3
    mov     r10b, [rcx + 2]             ; third byte of known token
    cmp     r10b, [rdi + 2]
    je      .found_length3              ; all three bytes match, exit loop
.next_length3:
    add     rcx, 5                      ; length of token + length of id
    jmp     .loop_length3
.found_length3:
    movzx   eax, word [rcx + 3]         ; return id of token
    ret

    ; length4: entries are 4 name bytes + 2 id bytes
.start_length4:
    mov     rcx, tokens.by_name_4       ; rcx -> list of known tokens
.loop_length4:
    cmp     rcx, tokens.by_name_5       ; still in the bounds of length4 tokens?
    jae     .unrecognised               ; if not, unrecognised
    mov     r10d, [rcx]                 ; known token
    cmp     r10d, [rdi]                 ; if known token matches token
    je      .found_length4              ; exit loop
    add     rcx, 6                      ; length of token + length of id
    jmp     .loop_length4
.found_length4:
    movzx   eax, word [rcx + 4]         ; return id of token
    ret

.unrecognised:
    mov     eax, UNRECOGNISED_TOKEN_ID
    ret
; ------------------------------------------------------------------------------
; identify_next_token
;
; description:
;   like identify_token, except it automatically finds the length by scanning
;   forward to the first byte listed in token_terminator_8. If the first byte
;   of the token points to a terminator byte, it returns a length of 0.
;
; parameters:
;   rdi -> first byte of token
;
; returned:
;   ax = id of token; the rest of rax is zeroed
;   dx = length of token in bytes; the rest of rdx is zeroed
; ------------------------------------------------------------------------------
identify_next_token:
    push    rdi                         ; remember where the token starts
.scan:
    movzx   edx, byte [rdi]             ; dl = byte under the cursor
    push    rdi                         ; elemb uses rdi and rsi as its own cursor
    mov     edi, 8                      ; length of terminator list
    mov     rsi, token_terminator_8     ; start of terminator list
    call    elemb
    pop     rdi
    cmp     rax, 1                      ; is the byte a token terminator?
    je      .measured                   ; if so, the token ends here
    inc     rdi                         ; next byte of token
    jmp     .scan

.measured:
    mov     rsi, rdi                    ; rsi -> terminator byte
    pop     rdi                         ; rdi -> first byte of token
    sub     rsi, rdi                    ; rsi = length of token
    push    rsi
    call    identify_token              ; ax = id of token
    pop     rdx                         ; rdx = length of token
    mov     rsi, rdx
    ret
; ------------------------------------------------------------------------------
; copy_token
;
; description:
;   copies a token from one spot in memory to another. Bytes are copied one at
;   a time until a byte listed in token_terminator_8 is read; that terminator
;   byte itself is not copied.
;
; parameters:
;   rdi -> start of buffer to be read
;   rsi -> start of buffer to be written
;
; returned:
;   rax -> last byte read (the terminator that ended the token)
;   rdx -> byte after the last byte written
; ------------------------------------------------------------------------------
copy_token:
.next_byte:
    mov     dl, [rdi]                   ; dl = byte under the read cursor
    push    rdi                         ; elemb uses rdi and rsi as its own cursor
    push    rsi
    mov     edi, 8                      ; length of terminator list
    mov     rsi, token_terminator_8     ; start of terminator list
    call    elemb
    pop     rsi
    pop     rdi
    cmp     rax, 1                      ; is the byte a token terminator?
    je      .finished                   ; if so, the token is complete
    mov     [rsi], dl                   ; otherwise copy it; elemb leaves dl alone
    inc     rdi                         ; read pointer
    inc     rsi                         ; write pointer
    jmp     .next_byte

.finished:
    mov     rax, rdi
    mov     rdx, rsi
    ret
; ------------------------------------------------------------------------------
; copy_byte
;
; description:
;   copies a single byte from one spot in memory to another
;
; parameters:
;   rdi -> byte to be read
;   rsi -> byte to be written
;
; returned:
;   al = byte that was read; the rest of rax is zeroed
; ------------------------------------------------------------------------------
copy_byte:
    movzx   eax, byte [rdi]             ; load the byte, zeroing the rest of rax
    mov     [rsi], al
    ret
; ------------------------------------------------------------------------------
; utilities
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; print
;
; description:
;   prints a null-terminated string by writing it byte by byte to I/O port
;   0x3F8. rax, rdx and rsi are saved and restored, for ease of debugging.
;
; parameters:
;   rsi -> start of null-terminated string
; ------------------------------------------------------------------------------
print:
    push    rdx
    push    rax
    push    rsi

    mov     edx, 0x3F8                  ; dx = output port
    jmp     .fetch
.emit:
    out     dx, al
    inc     rsi
.fetch:
    mov     al, [rsi]
    test    al, al                      ; terminating null reached?
    jnz     .emit

    pop     rsi
    pop     rax
    pop     rdx
    ret
; ------------------------------------------------------------------------------
; halt
;
; description:
;   halts the program, silly :)
;   Prints msg_halt once and then parks the CPU in a `hlt` loop. Never returns.
; ------------------------------------------------------------------------------
halt:
    mov     rsi, msg_halt
    call    print
.sleep:
    hlt
    ; execution resumes here if something wakes the CPU up; go straight back to
    ; sleep instead of jumping to the top and printing the message again
    jmp     .sleep
; ------------------------------------------------------------------------------
; elemb
;
; description:
;   checks if given byte is element of the specified list
;
; parameters:
;   rdi = size of list
;   rsi -> start of list
;   dl = given byte
;
; returned:
;   rax = 0: is not an element
;         1: is an element
;
; notes:
;   rdi and rsi are used as the scan cursor and are not restored.
; ------------------------------------------------------------------------------
elemb:
    jmp     .check
.advance:
    inc     rsi                         ; move to next byte
    dec     rdi                         ; and reduce remaining length
.check:
    test    rdi, rdi                    ; anything left to look at?
    jz      .absent                     ; if not, dl is not in the list
    cmp     [rsi], dl                   ; is the current list byte the desired byte?
    jne     .advance
    mov     eax, 1                      ; return 1; dl an element of list
    ret
.absent:
    xor     eax, eax                    ; return 0; dl not an element of list
    ret

; debug strings; nothing in elemb itself refers to them
.f db "found", 0x0A, 0x00
.nf db "not found", 0x0A, 0x00
; ------------------------------------------------------------------------------
; clear_token_table
;
; description:
;   fills the token table, TOKEN_TABLE_SIZE bytes starting at TOKEN_TABLE_ADDR,
;   with zeroes
;
; notes:
;   overwrites rax, rcx and rdi.
; ------------------------------------------------------------------------------
clear_token_table:
    mov     edi, TOKEN_TABLE_ADDR       ; rdi -> first double word to clear
    mov     ecx, TOKEN_TABLE_SIZE / 4   ; rcx = number of double words
    xor     eax, eax                    ; eax = value to write
    rep stosd
    ret
; ------------------------------------------------------------------------------
; clear_test_arena
;
; description:
;   clears the test arena as specified by TEST_ARENA_SIZE and TEST_ARENA_ADDR
;
; notes:
;   overwrites rax, rcx and rdi.
; ------------------------------------------------------------------------------
clear_test_arena:
    xor     eax, eax                    ; value to write
    mov     ecx, TEST_ARENA_SIZE / 4    ; number of double words
    mov     edi, TEST_ARENA_ADDR        ; address to start
    rep stosd
    ret
; ------------------------------------------------------------------------------
; clear_output_arena
;
; description:
;   fills the output arena, OUTPUT_SIZE bytes starting at OUTPUT_ADDR, with
;   zeroes
;
; notes:
;   overwrites rax, rcx and rdi.
; ------------------------------------------------------------------------------
clear_output_arena:
    mov     edi, OUTPUT_ADDR            ; rdi -> first double word to clear
    mov     ecx, OUTPUT_SIZE / 4        ; rcx = number of double words
    xor     eax, eax                    ; eax = value to write
    rep stosd
    ret
%include "asm/tests.asm"
; ------------------------------------------------------------------------------
; data
; ------------------------------------------------------------------------------
; Token lookup tables. The by_name_N tables map the N-character spelling of a
; token to its 16-bit id: every entry is the N name bytes immediately followed
; by the id (`dw`), so entries are N + 2 bytes long. Each table ends where the
; next label begins; identify_token uses those labels as bounds.
tokens:
; one-character tokens (3 bytes per entry)
.by_name_1:
db "["
dw 0x0051
db "]"
dw 0x0052
db "+"
dw 0x0062
db "-"
dw 0x0063
db "*"
dw 0x0064
db "/"
dw 0x0065
; two-character tokens: 2 name bytes followed by a 16-bit id (4 bytes per entry)
.by_name_2:
db "r8"
dw 0x0008
db "r9"
dw 0x0009
db "ax"
dw 0x0020
db "bx"
dw 0x0021
db "cx"
dw 0x0022
db "dx"
dw 0x0023
db "si"
dw 0x0024
db "di"
dw 0x0025
db "sp"
dw 0x0026
db "bp"
dw 0x0027
db "al"
dw 0x0030
db "bl"
dw 0x0031
db "cl"
dw 0x0032
db "dl"
dw 0x0033
db "ah"
dw 0x0040
db "bh"
dw 0x0041
db "ch"
dw 0x0042
db "dh"
dw 0x0043
db "cs"
dw 0x0044
db "ds"
dw 0x0045
db "es"
dw 0x0046
db "fs"
dw 0x0047
db "gs"
dw 0x0048
db "ss"
dw 0x0049
db "je"
dw 0x005C
db "jg"
dw 0x005F
db "jl"
dw 0x0061
; three-character tokens: 3 name bytes followed by a 16-bit id (5 bytes per
; entry)
.by_name_3:
db "rax"
dw 0x0000
db "rbx"
dw 0x0001
db "rcx"
dw 0x0002
db "rdx"
dw 0x0003
db "rsi"
dw 0x0004
db "rdi"
dw 0x0005
db "rsp"
dw 0x0006
db "rbp"
dw 0x0007
db "r10"
dw 0x000A
db "r11"
dw 0x000B
db "r12"
dw 0x000C
db "r13"
dw 0x000D
db "r14"
dw 0x000E
db "r15"
dw 0x000F
db "eax"
dw 0x0010
db "ebx"
dw 0x0011
db "ecx"
dw 0x0012
db "edx"
dw 0x0013
db "esi"
dw 0x0014
db "edi"
dw 0x0015
db "esp"
dw 0x0016
db "ebp"
dw 0x0017
db "r8d"
dw 0x0018
db "r9d"
dw 0x0019
db "r8w"
dw 0x0028
db "r9w"
dw 0x0029
db "sil"
dw 0x0034
db "dil"
dw 0x0035
db "spl"
dw 0x0036
db "bpl"
dw 0x0037
db "r8b"
dw 0x0038
db "r9b"
dw 0x0039
db "cr0"
dw 0x004A
db "cr2"
dw 0x004B
db "cr3"
dw 0x004C
db "cr4"
dw 0x004D
db "cr8"
dw 0x004E
db "hlt"
dw 0x004F
db "xor"
dw 0x0053
db "inc"
dw 0x0054
db "dec"
dw 0x0055
db "mov"
dw 0x0056
db "add"
dw 0x0057
db "sub"
dw 0x0058
db "ret"
dw 0x005A
db "cmp"
dw 0x005B
db "jne"
dw 0x005D
db "jge"
dw 0x005E
db "jle"
dw 0x0060
; four-character tokens: 4 name bytes followed by a 16-bit id (6 bytes per
; entry)
.by_name_4:
db "r10d"
dw 0x001A
db "r11d"
dw 0x001B
db "r12d"
dw 0x001C
db "r13d"
dw 0x001D
db "r14d"
dw 0x001E
db "r15d"
dw 0x001F
db "r10w"
dw 0x002A
db "r11w"
dw 0x002B
db "r12w"
dw 0x002C
db "r13w"
dw 0x002D
db "r14w"
dw 0x002E
db "r15w"
dw 0x002F
db "r10b"
dw 0x003A
db "r11b"
dw 0x003B
db "r12b"
dw 0x003C
db "r13b"
dw 0x003D
db "r14b"
dw 0x003E
db "r15b"
dw 0x003F
db "int3"
dw 0x0050
db "call"
dw 0x0059
; no five-character tokens are defined; this label only marks the end of the
; four-character table
.by_name_5:
; Per-id properties, searched by get_tte_type and get_tte_typed_metadata. Each
; entry is 4 bytes: the 16-bit id, a type byte (0x01 = operator, 0x02 =
; register) and a metadata byte whose meaning depends on the type (number of
; operands for an operator, width for a register). Ids without an entry here
; are reported as unrecognised by those functions.
.by_id:
dw 0x0053 ; xor
db 0x01 ; type: operator
db 0x02 ; # operands
dw 0x0010 ; eax
db 0x02 ; type: register
db 0x02 ; width: 32 bit
dw 0x0054 ; inc
db 0x01 ; type: operator
db 0x01 ; # operands
dw 0x0000 ; rax
db 0x02 ; type: register
db 0x03 ; width: 64 bit
dw 0x0056 ; mov
db 0x01 ; type: operator
db 0x02 ; # operands
dw 0x0003 ; rdx
db 0x02 ; type: register
db 0x03 ; width: 64 bit
dw 0x004F ; hlt
db 0x01 ; type: operator
db 0x00 ; # operands
; end marker; (by_id_end - by_id) / 4 is the number of entries
.by_id_end:
; null-terminated messages
msg_welcome db "Welcome to Twasm", 0x0A, 0x00
msg_halt db "halted.", 0x0A, 0x00
; bytes that end a token: null, space, line feed, carriage return and comma,
; padded with extra nulls to the 8 entries that callers pass to elemb
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00
debug_string db "debug_string", 0x0A, 0x00
; test program
program:
db "xor eax, eax", 0x0A
db "inc rax", 0x0A
db "mov [ rax ], rdx", 0x0A
db "hlt", 0x0A
db 0x00 ; just for the sake of being able to print it, I made it a string
; length of the program text, not counting the trailing null. NOTE: this is a
; single byte, so it must be read with a byte-sized load and can only describe
; programs of up to 255 bytes
.size db $ - program - 1