Files
bootler/twasm/asm/main.asm
2026-03-23 23:37:39 +01:00

1830 lines
41 KiB
NASM

; TODO actually enforce any of these *_SIZE constants :p

; memory layout
LOAD_ADDR               equ 0x00010000  ; where this program is loaded
TEST_ARENA_ADDR         equ 0x00050000  ; where tests are run
TEST_ARENA_SIZE         equ 0x1000      ; most bytes a test may use
TOKEN_TABLE_ADDR        equ 0x00060000  ; where the token table is kept
TOKEN_TABLE_SIZE        equ 0x1000      ; max length of the token table
TOKEN_TABLE_ENTRY_SIZE  equ 2           ; bytes per token table entry; things
                                        ; may break if this ever changes
OUTPUT_ADDR             equ 0x00070000  ; where the assembled binary is written
OUTPUT_SIZE             equ 0x1000      ; max length of the assembled binary
STACK_ADDR              equ 0x00060000  ; initial rsp of the 64-bit stack (same
                                        ; value as TOKEN_TABLE_ADDR; pushes
                                        ; store below rsp, the table sits above)

; values reported for ids that are not in the lookup tables
UNRECOGNISED_TOKEN_ID    equ 0xFFFF     ; id of an unrecognised token
UNRECOGNISED_ID_TYPE     equ 0x0F       ; type of an unrecognised id
UNRECOGNISED_ID_METADATA equ 0xFF       ; metadata of an unrecognised id
UNRECOGNISED_ID_OPCODE   equ 0x90       ; opcode of an unrecognised id (NOP)

TEST_LINE_LENGTH        equ 80          ; right border of test suite results

; bit flags: what the tokeniser is willing to see next
E_COMMENT               equ 1 << 0
E_NEWLINE               equ 1 << 1
E_WHITESPACE            equ 1 << 2
E_COMMA                 equ 1 << 3
E_OPERATOR              equ 1 << 4
E_OPERAND               equ 1 << 5

[bits 64]
[org LOAD_ADDR]
[default abs]   ; TODO see if I actually need to do this
                ; afaik absolute addressing is not harmful on bare metal
                ; reasoning: stops annoying warning =D
start:
    mov     rsp, STACK_ADDR         ; we might need more stack space, let's
                                    ; just be safe

    mov     rsi, msg_welcome
    call    print

    call    run_tests

    ; tokenise the embedded program into a freshly cleared token table
    call    clear_token_table
    mov     rsi, [program.size]     ; = size of program
    mov     rdi, program            ; -> program
    call    tokenise
    ; rax = number of tokens processed

    ; clear_output_arena overwrites rax, rcx and rdi, so park the token count
    ; on the stack and pick it up straight into assemble's parameter register
    push    rax
    call    clear_output_arena
    pop     rdi                     ; rdi = number of tokens in the token table
    call    assemble

    jmp     halt
; ------------------------------------------------------------------------------
; assembling
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; assemble
; TODO write tests
; TODO make it work :/ putting the cart before the horse
;
; description:
; assembles the program from tokens located at TOKEN_TABLE_ADDR into a flat
; binary located at OUTPUT_ADDR. Output bytes are appended at the address held
; in `.next_output_byte`; that address is never reset here, so a second call
; keeps appending after the first call's output.
;
; The token at the start of every instruction must be an operator listed in
; `tokens.by_id`; anything else prints "unexpected token" and halts. A token
; count of zero (or less) emits nothing.
;
; Behaviour is undefined when:
; - tokens are in an impossible order
;     0x1000 ; memory address, following byte should be a register
;     0x1000 ; not a register
; - operator tokens followed by the wrong number of arguments
;   TODO enforce this in `tokenise`
;     0x004F ; hlt, expects 0 arguments
;     0x0000 ; rax, an argument
; - an undefined token is included, like 0x0051
;
; parameters:
; rdi = number of tokens in the token table
;
; clobbers:
; rax, rcx, rdx, rsi, rdi, and bl (through get_ModRM)
; ------------------------------------------------------------------------------
assemble:
    xor     eax, eax                ; rax = number of tokens processed
    test    rdi, rdi                ; rdi = number of tokens in table
    jle     .break                  ; empty table: nothing to assemble

    ; Invariant for everything below: whenever a token is being examined the
    ; stack holds [rsp] = tokens processed, [rsp + 8] = total tokens. Both are
    ; popped again before every `jge .break`, so `.break` sees a clean stack.
.loop:
    push    rdi
    xor     edi, edi
    mov     di, [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; next tte
    push    rax
    ; di = tte of operator

.operator:
    ; the typed metadata of a register can look like an operand count, so make
    ; sure this really is an operator before interpreting its metadata
    push    rdi
    ; di = tte
    call    get_tte_type
    ; al = type of token
    pop     rdi
    cmp     al, 0x01                ; type: operator
    jne     .unexpected_token

    push    rdi
    ; di = tte of operator
    call    get_tte_typed_metadata
    ; al = tte typed metadata
    pop     rdi

    and     al, 11b                 ; mask for # operands
    cmp     al, 0                   ; check if operator has no operands
    je      .operator_0             ; if so, handle case of no operands
    cmp     al, 1                   ; check if operator has one operand
    je      .operator_1             ; if so, handle case of one operand
    cmp     al, 2                   ; check if operator has two operands
    je      .operator_2             ; if so, handle case of two operands
    jmp     .unexpected_token       ; three operands are not supported

; di = tte of operator
.operator_0:
    push    rsi
    mov     rsi, .msg_operator_0
    call    print.debug
    pop     rsi

    push    rdi
    push    rsi
    ; di = tte of operator
    mov     sil, 0b                 ; standard opcode
    call    get_opcode
    ; al = opcode
    call    .output_byte
    pop     rsi
    pop     rdi

    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

; di = tte of operator
.operator_1:
    push    rsi
    mov     rsi, .msg_operator_1
    call    print.debug
    pop     rsi

    push    rdi
    push    rsi
    ; di = tte of operator
    mov     sil, 0b                 ; dst=r/m
    call    get_opcode
    ; al = opcode
    call    .output_byte
    pop     rsi
    pop     rdi                     ; di = tte of operator

    ; advance to the operand
    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    push    rdi
    push    rax

    xor     edi, edi
    mov     di, [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; di = next tte

    push    rdi
    and     di, 0xFF00
    cmp     di, 0x1000              ; check if token is a memory address
    pop     rdi                     ; di = next tte (pop leaves flags alone)
    je      .operator_1_memory_access

    ; di = next tte
    call    get_tte_type
    ; al = type of token
    cmp     al, 0x02                ; type: register
    je      .operator_1_register

    ; neither a memory access nor a register: the operand is skipped
    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

; TODO figure out if this is relevant
.operator_1_memory_access:
    push    rsi
    mov     rsi, .msg_operator_1_memory_access
    call    print.debug
    pop     rsi
    jmp     .unsupported_memory_access

.operator_1_register:
    push    rsi
    mov     rsi, .msg_operator_1_register
    call    print.debug
    pop     rsi

    mov     si, di                  ; si = `R/M` tte
    mov     di, 0x0000              ; di = `reg` tte (token 0x0000)
    mov     dl, 11b                 ; dl = mod bits
    call    get_ModRM
    ; al = Mod R/M byte
    call    .output_byte

    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

; di = tte of operator
.operator_2:
    push    rsi
    mov     rsi, .msg_operator_2
    call    print.debug
    pop     rsi

    mov     cx, di                  ; cx = tte of operator

    ; advance to the first operand
    pop     rax
    pop     rdi
    inc     rax
    cmp     rax, rdi
    jge     .break
    push    rdi
    push    rax

    xor     edi, edi
    mov     di, [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; di = next tte

    push    rdi
    and     di, 0xFF00
    cmp     di, 0x1000              ; check if token is a memory address
    pop     rdi                     ; di = next tte
    je      .operator_2_memory_access

    push    rcx                     ; get_tte_type overwrites cx
    ; di = next tte
    call    get_tte_type
    ; al = type of token
    pop     rcx                     ; cx = tte of operator
    cmp     al, 0x02                ; type: register
    je      .operator_2_register

    ; neither a memory access nor a register: the operand is skipped
    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

; cx = tte of operator
; di = first operand's tte (a memory address token)
.operator_2_memory_access:
    push    rsi
    mov     rsi, .msg_operator_2_memory_access
    call    print.debug
    pop     rsi

    cmp     di, 0x1000              ; check if token is addressing a register
    jne     .unsupported_memory_access ; if not, unsupported :/

    push    rdi
    mov     di, cx                  ; di = tte of operator
    mov     sil, 0                  ; dst = r/m
    call    get_opcode
    ; al = opcode
    call    .output_byte
    pop     rdi

    ; advance to the register named inside the brackets
    pop     rax
    pop     rdi
    inc     rax
    cmp     rax, rdi
    jge     .break
    push    rdi
    push    rax

    xor     edi, edi
    mov     di, [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; di = next tte
    mov     si, di                  ; si = dst tte

    ; advance to the second operand
    pop     rax
    pop     rdi
    inc     rax
    cmp     rax, rdi
    jge     .break
    push    rdi
    push    rax

    xor     edi, edi
    mov     di, [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; di = next tte

    push    rdi
    and     di, 0xFF00
    cmp     di, 0x1000              ; check if token is a memory address
    pop     rdi                     ; di = next tte
    je      .unsupported_memory_access ; no case of *],[* in asm

    ; di = next tte
    call    get_tte_type
    ; al = type of token
    cmp     al, 0x02
    je      .operator_2_memory_access_register

    ; second operand is not a register: it is skipped
    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

.operator_2_memory_access_register:
    push    rsi
    mov     rsi, .msg_operator_2_memory_access_register
    call    print.debug
    pop     rsi

    ; si = r/m; dst tte
    ; di = reg; src tte
    mov     dl, 00b                 ; dl = mod bits
    call    get_ModRM
    ; al = Mod R/M byte
    call    .output_byte

    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

; cx = tte of operator
; di = first operand's tte (a register)
.operator_2_register:
    push    rsi
    mov     rsi, .msg_operator_2_register
    call    print.debug
    pop     rsi

    push    rdi
    mov     di, cx                  ; di = tte of operator
    mov     sil, 1                  ; dst = reg
    call    get_opcode
    ; al = opcode
    call    .output_byte
    pop     rdi

    mov     si, di                  ; si = dst tte

    ; advance to the second operand
    pop     rax
    pop     rdi
    inc     rax
    cmp     rax, rdi
    jge     .break
    push    rdi
    push    rax

    xor     edi, edi
    mov     di, [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; di = next tte

    push    rdi
    and     di, 0xFF00
    cmp     di, 0x1000              ; check if token is a memory address
    pop     rdi                     ; di = next tte
    je      .operator_2_register_memory_access

    ; di = next tte
    call    get_tte_type
    ; al = type of token
    cmp     al, 0x02
    je      .operator_2_register_register

    ; second operand is not a register: it is skipped
    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

; si = dst tte (a register)
; di = second operand's tte (a memory address token)
.operator_2_register_memory_access:
    push    rsi
    mov     rsi, .msg_operator_2_register_memory_access
    call    print.debug
    pop     rsi

    cmp     di, 0x1000              ; check if token is addressing to a register
    jne     .unsupported_memory_access ; if not, unsupported :/

    ; advance to the register named inside the brackets
    pop     rax
    pop     rdi
    inc     rax
    cmp     rax, rdi
    jge     .break
    push    rdi
    push    rax

    xor     edi, edi
    mov     di, [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; di = next tte

    ; swap so that get_ModRM receives
    ; di = `reg` tte: the dst register
    ; si = `R/M` tte: the register inside the brackets
    push    rsi
    mov     si, di
    pop     rdi
    mov     dl, 00b                 ; dl = mod bits
    call    get_ModRM
    ; al = Mod R/M byte
    call    .output_byte

    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

; si = dst tte (a register)
; di = src tte (a register)
.operator_2_register_register:
    push    rsi
    mov     rsi, .msg_operator_2_register_register
    call    print.debug
    pop     rsi

    ; the opcode emitted above is the dst=reg variant, so the dst register goes
    ; in the `reg` field (di) and the src register in the `R/M` field (si)
    push    rsi
    mov     si, di                  ; si = `R/M` tte; src
    pop     rdi                     ; di = `reg` tte; dst
    mov     dl, 11b                 ; dl = mod bits
    call    get_ModRM
    ; al = Mod R/M byte
    call    .output_byte

    pop     rax                     ; rax = number of tokens processed
    pop     rdi                     ; rdi = total number of tokens
    inc     rax
    cmp     rax, rdi
    jge     .break
    jmp     .loop

.break:
    ret

.unexpected_token:
    mov     rsi, .msg_unexpected_token
    call    print.error
    jmp     halt

.unsupported_memory_access:
    mov     rsi, .msg_unsupported_memory_access
    call    print.error
    jmp     halt

; procedures

; appends one byte to the output and advances `.next_output_byte`
; al = byte to write
; clobbers edx
.output_byte:
    mov     edx, [.next_output_byte] ; get output byte's address
    mov     [edx], al               ; write byte to that address
    inc     edx                     ; increment address
    mov     [.next_output_byte], edx ; put output byte's address
    ret

.next_output_byte dd OUTPUT_ADDR ; next empty byte in output
                                 ; TODO get rid of this sketchy bit of state
.msg_unexpected_token db "unexpected token, aborting", 0x0A, 0x00
.msg_unsupported_memory_access db "unsupported memory access, aborting", 0x0A, 0x00
.msg_operator_0 db "operator_0", 0x0A, 0x00
.msg_operator_1 db "operator_1", 0x0A, 0x00
.msg_operator_1_memory_access db "operator_1_memory_access", 0x0A, 0x00
.msg_operator_1_register db "operator_1_register", 0x0A, 0x00
.msg_operator_2 db "operator_2", 0x0A, 0x00
.msg_operator_2_memory_access db "operator_2_memory_access", 0x0A, 0x00
.msg_operator_2_memory_access_register db "operator_2_memory_access_register", 0x0A, 0x00
.msg_operator_2_register db "operator_2_register", 0x0A, 0x00
.msg_operator_2_register_memory_access db "operator_2_register_memory_access", 0x0A, 0x00
.msg_operator_2_register_register db "operator_2_register_register", 0x0A, 0x00
; ------------------------------------------------------------------------------
; get_tte_type
;
; description:
; given a token table entry, returns the declared type in `tokens.by_id`. If
; there is no entry, returns UNRECOGNISED_ID_TYPE
;
; `tokens.by_id` is searched linearly; each entry is 4 bytes:
;   +0 dw token ID, +2 db type, +3 db typed metadata
;
; parameters:
; di = token table entry
;
; returned:
; al = type of token, or UNRECOGNISED_ID_TYPE. The upper 4 bits of al are
;      zeroed; the rest of rax is zeroed.
;
; clobbers:
; cx; the upper 48 bits of rdi are zeroed
; ------------------------------------------------------------------------------
get_tte_type:
    and     edi, 0xFFFF             ; mask input so it behaves as expected
    xor     eax, eax                ; rax = index of the entry being examined

.loop:
    ; valid indices are 0 .. count - 1, so stop as soon as rax reaches count;
    ; going one further would compare against whatever follows the table
    cmp     rax, (tokens.by_id_end - tokens.by_id) / 4
    jae     .not_found
    mov     cx, [tokens.by_id + rax * 4] ; next entry in tokens.by_id
    cmp     cx, di
    je      .found
    inc     rax
    jmp     .loop

.not_found:
    mov     eax, UNRECOGNISED_ID_TYPE & 0xF ; mask as expected
    ret

.found:
    movzx   eax, byte [2 + tokens.by_id + rax * 4]
    and     eax, 0xF                ; mask as expected
    ret
; ------------------------------------------------------------------------------
; get_tte_typed_metadata
;
; description:
; given a token table entry, returns the declared typed metadata in
; `tokens.by_id`. If there is no entry, returns UNRECOGNISED_ID_METADATA
;
; `tokens.by_id` is searched linearly; each entry is 4 bytes:
;   +0 dw token ID, +2 db type, +3 db typed metadata
;
; parameters:
; di = token table entry
;
; returned:
; al = typed metadata of token, or UNRECOGNISED_ID_METADATA; the rest of rax is
;      zeroed.
;
; clobbers:
; cx; the upper 48 bits of rdi are zeroed
; ------------------------------------------------------------------------------
get_tte_typed_metadata:
    and     edi, 0xFFFF             ; mask input so it behaves as expected
    xor     eax, eax                ; rax = index of the entry being examined

.loop:
    ; valid indices are 0 .. count - 1, so stop as soon as rax reaches count;
    ; going one further would compare against whatever follows the table
    cmp     rax, (tokens.by_id_end - tokens.by_id) / 4
    jae     .not_found
    mov     cx, [tokens.by_id + rax * 4] ; next entry in tokens.by_id
    cmp     cx, di
    je      .found
    inc     rax
    jmp     .loop

.not_found:
    mov     eax, UNRECOGNISED_ID_METADATA
    ret

.found:
    movzx   eax, byte [3 + tokens.by_id + rax * 4]
    ret
; ------------------------------------------------------------------------------
; get_ModRM
;
; description:
; builds a ModR/M byte out of two register tokens and the mod bits:
;
;   bits 7..6 = mod, bits 5..3 = reg, bits 2..0 = R/M
;
; parameters:
; di = token table entry `reg`
; si = token table entry `R/M`
; dl = lower 2 bits: mod bits. The rest is ignored
;
; returned:
; al = ModR/M byte; the rest of rax is zeroed
;
; clobbers:
; bl, cx, dl, rdi
; ------------------------------------------------------------------------------
get_ModRM:
    ; an 8-bit shift by 6 only keeps the two lowest input bits, which is the
    ; "rest is ignored" part of the contract
    shl     dl, 6                   ; dl = mod bits, in position

    ; di = `reg` tte
    call    get_reg_bits
    ; al = reg bits
    mov     bl, al
    shl     bl, 3                   ; bl = reg bits, in position

    mov     rdi, rsi                ; do the other one
    ; di = `R/M` tte
    call    get_reg_bits
    ; al = R/M bits, already in position
    mov     cl, al

    or      al, bl                  ; reg bits
    or      al, dl                  ; mod bits
    movzx   eax, al                 ; mask for byte
    ret
; ------------------------------------------------------------------------------
; get_opcode
;
; description:
; given an operator token, returns its opcode. For operators with multiple
; opcodes, the variant can be specified. If the operator has no entry in
; `opcodes.by_id`, returns UNRECOGNISED_ID_OPCODE.
;
; `opcodes.by_id` is searched linearly in 4-byte entries: the token ID is read
; as a word at +0, and the opcode byte for variant 0 / 1 at +2 / +3.
;
; parameters:
; di = token table entry
; sil = lower bit: 0: dst=r/m or only opcode
;                  1: dst=reg or 0x00
;
; returned:
; al = opcode; the rest of rax is zeroed
;
; clobbers:
; cx; rdi is masked to 16 bits and rsi to its lowest bit
; ------------------------------------------------------------------------------
get_opcode:
    and     edi, 0xFFFF             ; only the low word is the token
    and     esi, 1                  ; only the low bit selects the variant
    xor     eax, eax                ; rax = index of the entry being examined

.loop:
    ; valid indices are 0 .. count - 1, so stop as soon as rax reaches count;
    ; going one further would compare against whatever follows the table
    cmp     rax, (opcodes.by_id_end - opcodes.by_id) / 4
    jae     .not_found
    mov     cx, [opcodes.by_id + rax * 4] ; next entry in opcodes.by_id
    cmp     cx, di
    je      .found
    inc     rax
    jmp     .loop

.not_found:
    mov     eax, UNRECOGNISED_ID_OPCODE
    ret

.found:
    movzx   eax, byte [rsi + 2 + opcodes.by_id + rax * 4]
    ret
; ------------------------------------------------------------------------------
; get_reg_bits
;
; description:
; given a register token, returns the 3 reg bits stored in bits 4..2 of its
; typed metadata
;
; parameters:
; di = token table entry
;
; returned:
; al = reg bits of the register; the rest of rax, including the upper 5 bits
;      of al, are zeroed.
; ------------------------------------------------------------------------------
get_reg_bits:
    ; di = tte
    call    get_tte_typed_metadata
    ; eax = typed metadata, zero-extended
    shr     eax, 2                  ; drop the two low metadata bits
    and     eax, 111b               ; keep the reg bits only
    ret
; ------------------------------------------------------------------------------
; tokenising
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
; scans the program text at the given address and stores one token table entry
; per operator and per operand in the token table at TOKEN_TABLE_ADDR (an
; operand of the form [reg] is stored as 0x1000 followed by the register).
; it's probably desirable to clear the token table before calling this function.
;
; `.expecting` holds the E_* flags describing what may come next; anything else
; prints an "unexpected ..." error and halts. `.expecting` is not reset on
; entry.
;
; parameters:
; rdi -> first byte of program
; rsi = size of program in bytes
;
; returned:
; rax = number of tokens processed
; ------------------------------------------------------------------------------
tokenise:
    ; rdi -> current byte of program
    add     rsi, rdi                ; rsi -> first byte after the program
    xor     eax, eax                ; rax = number of tokens processed
    xor     edx, edx                ; dl = current byte of program

.loop:
    cmp     rdi, rsi                ; if current byte is past the program
    jge     .break                  ; then break

    mov     dl, [rdi]               ; dl = current byte

    cmp     dl, ";"                 ; if current byte is the start of a comment
    je      .comment                ; then handle the comment

    cmp     dl, 0x0A                ; if current byte is the end of a line
    je      .newline_mk_flags       ; then reset relevant flags

    cmp     dl, ","                 ; if current byte is a comma
    je      .comma                  ; then handle the comma

    push    rsi
    push    rdi
    push    rax
    push    rdx
    ; TODO probably should not ignore null bytes
    mov     rsi, whitespace_2       ; rsi -> list of whitespace (ignored) bytes
    mov     rdi, 2                  ; rdi = size of list in bytes
    ; dl = current byte
    call    elemb
    ; al = 0 if not whitespace, 1 if whitespace
    cmp     al, 1                   ; check if current byte is whitespace
    pop     rdx                     ; (pops leave the flags of the cmp intact)
    pop     rax
    pop     rdi
    pop     rsi
    je      .skip_byte_whitespace

    test    byte [.expecting], E_OPERATOR ; check if an operator is expected
    jnz     .operator               ; if so, handle it
    jmp     .operand                ; otherwise, handle as an operand

.comment:
    push    rsi
    mov     rsi, .found
    call    print.debug
    mov     rsi, .msg_comment
    call    print
    pop     rsi

    test    byte [.expecting], E_COMMENT ; make sure a comment is expected
    jz      .unexpected_comment     ; if not, error

.comment_loop:
    cmp     rdi, rsi                ; a comment on the last line may have no
    jge     .comment_break          ; newline: stop at the end of the program
    mov     dl, [rdi]               ; dl = current byte
    cmp     dl, 0x0A                ; if current byte is a newline
    je      .comment_break          ; then break
    inc     rdi                     ; point to next unread byte
    jmp     .comment_loop

.comment_break:
    jmp     .loop                   ; the newline itself is handled by .loop

.skip_byte_whitespace:
    push    rsi
    mov     rsi, .found
    call    print.debug
    mov     rsi, .msg_whitespace
    call    print
    pop     rsi

    test    byte [.expecting], E_WHITESPACE ; make sure a whitespace was expected
    jz      .unexpected_whitespace  ; if not, error
    inc     rdi
    jmp     .loop                   ; else, loop

.comma:                             ; found comma
    push    rsi
    mov     rsi, .found
    call    print.debug
    mov     rsi, .msg_comma
    call    print
    pop     rsi

    test    byte [.expecting], E_COMMA ; make sure a comma was expected
    jz      .unexpected_comma       ; if not, error
    inc     rdi
    ; the store needs an explicit size: `.expecting` is a single byte
    mov     byte [.expecting], E_WHITESPACE | E_OPERAND ; else, make operand expected
    jmp     .loop                   ; and loop

.newline_mk_flags:
    push    rsi
    mov     rsi, .found
    call    print.debug
    mov     rsi, .msg_newline
    call    print
    pop     rsi

    test    byte [.expecting], E_NEWLINE ; make sure a newline was expected
    jz      .unexpected_newline     ; if not, error
    mov     byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR
    inc     rdi
    jmp     .loop

.operator:
    ; debug message
    push    rsi
    mov     rsi, .found
    call    print.debug
    mov     rsi, .msg_operator
    call    print
    pop     rsi

    push    rax                     ; save number of tokens processed
    xor     eax, eax                ; eax = number of bytes in operator
    mov     [.pending_operator], eax ; zero pending operator

.operator_loop:
    cmp     rdi, rsi                ; the end of the program also ends the
    jge     .operator_break         ; operator

    mov     dl, [rdi]               ; next byte

    ; TODO have better check for operator end
    cmp     dl, " "
    je      .operator_break
    cmp     dl, 0x0A
    je      .operator_break
    cmp     dl, 0x00
    je      .operator_break
    cmp     dl, ";"
    je      .operator_break

    ; `.pending_operator` holds 4 bytes; refuse a 5th one before storing it
    ; TODO give this its own error
    ; TODO make this pop rax
    cmp     eax, 4                  ; check that operator is short enough
    jge     .unexpected_operator    ; if not, error

    mov     [.pending_operator + eax], dl
    inc     rax                     ; inc byte counter
    inc     rdi                     ; inc byte pointer
    jmp     .operator_loop          ; and loop

.operator_break:
    ; rax already pushed from .operator
    push    rdi
    mov     edi, [.pending_operator] ; edi = operator to be searched
    call    identify_operator
    ; ax = operator's token ID
    mov     cx, ax                  ; cx = operator's token ID for safe keeping
    pop     rdi                     ; rdi -> byte that ended the operator
    pop     rax                     ; rax = tokens processed

    mov     [TOKEN_TABLE_ADDR + rax * TOKEN_TABLE_ENTRY_SIZE], cx
    inc     rax                     ; plus 1 token processed

    mov     byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERAND
    jmp     .loop

.operand:
    ; debug message
    push    rsi
    mov     rsi, .found
    call    print.debug
    mov     rsi, .msg_operand
    call    print
    pop     rsi

    test    byte [.expecting], E_OPERAND ; make sure an operand was expected
    jz      .unexpected_operand     ; if not, error

    push    rax                     ; save number of tokens processed
    push    rdi                     ; save -> first byte of operand
    xor     eax, eax                ; rax = length of operand

.operand_loop:
    cmp     rdi, rsi                ; the end of the program also ends the
    jge     .operand_break          ; operand

    mov     dl, [rdi]

    cmp     dl, ","
    je      .operand_break
    cmp     dl, 0x0A
    je      .operand_break
    cmp     dl, 0x00
    je      .operand_break
    cmp     dl, ";"
    je      .operand_break

    inc     rax                     ; inc length counter
    inc     rdi                     ; inc byte pointer
    jmp     .operand_loop

.operand_break:
    ; rdi already points at the byte that ended the operand; keep that full
    ; 64-bit pointer on the stack instead of rebuilding it with 16-bit maths
    mov     rcx, rdi                ; rcx -> byte that ended the operand
    pop     rdi                     ; rdi -> first byte of operand
    push    rsi                     ; save -> end of program
    push    rcx                     ; save resume position
    mov     rsi, rax                ; rsi = length of operand in bytes
    call    evaluate_operand
    ; dl = return code
    ; rax = binary data
    pop     rdi                     ; rdi -> byte that ended the operand
    pop     rsi                     ; rsi -> end of program
    mov     rcx, rax                ; rcx = evaluate_operand's binary return data
    pop     rax                     ; rax = number of tokens processed

    ; operand is some reg
    ; cx = token ID
    cmp     dl, 0x00
    je      .operand_register

    ; operand is some [reg]
    ; cx = token ID
    cmp     dl, 0x10
    je      .operand_addr_register

    jmp     .unexpected_operand

; cx = token ID
.operand_register:
    mov     [TOKEN_TABLE_ADDR + rax * TOKEN_TABLE_ENTRY_SIZE], cx
    inc     rax                     ; another token processed
    jmp     .operand_break_continue

; cx = token ID
.operand_addr_register:
    mov     word [TOKEN_TABLE_ADDR + rax * TOKEN_TABLE_ENTRY_SIZE], 0x1000
    inc     rax                     ; 0x1000: addr reg token, next token is the register
    mov     [TOKEN_TABLE_ADDR + rax * TOKEN_TABLE_ENTRY_SIZE], cx
    inc     rax                     ; the register as returned by evaluate_operand
    jmp     .operand_break_continue

.operand_break_continue:
    mov     byte [.expecting], E_COMMENT | E_NEWLINE | E_WHITESPACE | E_COMMA
    jmp     .loop

.break:
    ret

; state
.expecting db E_COMMENT | E_NEWLINE | E_WHITESPACE | E_OPERATOR

.unexpected_whitespace:
    mov     rsi, .err_unexpected
    call    print.error
    mov     rsi, .msg_whitespace
    call    print
    jmp     halt

.unexpected_comment:
    mov     rsi, .err_unexpected
    call    print.error
    mov     rsi, .msg_comment
    call    print
    jmp     halt

.unexpected_newline:
    mov     rsi, .err_unexpected
    call    print.error
    mov     rsi, .msg_newline
    call    print
    jmp     halt

.unexpected_comma:
    mov     rsi, .err_unexpected
    call    print.error
    mov     rsi, .msg_comma
    call    print
    jmp     halt

.unexpected_operand:
    mov     rsi, .err_unexpected
    call    print.error
    mov     rsi, .msg_operand
    call    print
    jmp     halt

.unexpected_operator:
    mov     rsi, .err_unexpected
    call    print.error
    mov     rsi, .msg_operator
    call    print
    jmp     halt

.err_unexpected db "unexpected ", 0x00
.found db "found ", 0x00
.msg_whitespace db "whitespace.", 0x0A, 0x00
.msg_comment db "comment.", 0x0A, 0x00
.msg_newline db "newline.", 0x0A, 0x00
.msg_comma db "comma.", 0x0A, 0x00
.msg_operator db "operator.", 0x0A, 0x00
.msg_operand db "operand.", 0x0A, 0x00
.pending_operator dd 0 ; the operator token that is pending processing
; ------------------------------------------------------------------------------
; evaluate_operand
;
; description:
; takes the location and length of an operand and evaluates it into binary data
; plus a return code that says how to interpret that data. Trailing whitespace
; is ignored. An operand wrapped in [ ] is evaluated by recursing on what is
; between the brackets.
;
; | code | rax contents         | notes |
; |------|----------------------|-------|
; | 0x00 | token ID of register | reg   |
; | 0x10 | token ID of register | [reg] |
; | 0xFF | -                    | error |
;
; parameters:
; rdi -> first byte of operand
; rsi = size of operand in bytes
;
; returned:
; rax = binary data corresponding to the operand
; dl = return code
; ------------------------------------------------------------------------------
evaluate_operand:
    push    rdi                     ; keep -> first byte of operand
    xchg    rdi, rsi                ; rdi = size, rsi -> start, as trim wants
    call    trim_trailing_whitespace
    ; rax = size of operand w/o trailing whitespace
    pop     rdi                     ; rdi -> first byte of operand
    mov     rsi, rax

    test    rsi, rsi                ; nothing left after trimming
    jz      .unrecognised

    cmp     byte [rdi], '['         ; anything not opening a bracket is
    jne     .register               ; treated as a register

.address:
    cmp     byte [rdi + rsi - 1], ']' ; the bracket has to be closed again
    jne     .unrecognised
    inc     rdi                     ; rdi -> enclosed operand
    sub     rsi, 2                  ; rsi = length of enclosed operand
    call    evaluate_operand
    ; rax = binary data
    ; dl = return code
    cmp     dl, 0x10                ; a memory reference inside a memory
    je      .unrecognised           ; reference is not accepted
    or      dl, 0x10                ; flip bit for address return
    ret

.register:
    cmp     rsi, 4                  ; names longer than 4 bytes cannot match
    jg      .unrecognised

    push    rdi
    mov     edi, [rdi]              ; edi = 4 bytes starting at the name

    ; keep only the first rsi bytes of the name. The masks are arranged from
    ; narrowest to widest so that landing on one and falling through the wider
    ; ones still leaves exactly the narrowest applied.
    cmp     rsi, 3
    jg      .lookup                 ; 4 bytes: nothing to mask
    je      .keep_3
    cmp     rsi, 2
    je      .keep_2
    and     edi, 0xFF               ; 1 byte
.keep_2:
    and     edi, 0xFFFF
.keep_3:
    and     edi, 0xFFFFFF

.lookup:
    ; edi = register to be searched
    call    identify_register
    ; ax = register's token ID or UNRECOGNISED_TOKEN_ID
    pop     rdi
    cmp     ax, UNRECOGNISED_TOKEN_ID
    je      .unrecognised
    mov     dl, 0x00                ; return code: reg
    ret

.unrecognised:
    mov     dl, 0xFF                ; return code: error
    ret
; ------------------------------------------------------------------------------
; evaluate_constant
;
; description:
; takes a constant and returns its value. Currently the following constants are
; supported:
;
; | type | p. | description  |
; |------|----|--------------|
; | 0x00 | 0x | hexadecimal  |
; | 0x01 | 0q | octal        |
; | 0x02 | 0b | binary       |
; | 0x03 | "  | char         |
; | 0xFF |    | unrecognised |
;
; where `p.` is the prefix or otherwise indicator
;
; Hexadecimal digits are 0-9 and upper case A-F. A char constant holds at most
; 4 printable characters (0x20..0x7E) between double quotes; the first
; character ends up in the lowest byte of rax. Digits shifted beyond 64 bits
; are silently lost.
;
; parameters:
; rdi -> first byte of constant
; rsi = size of constant in bytes
;
; returned:
; rax = value of the constant (meaningless when dl = 0xFF)
; dl = type of constant; the rest of rdx is zeroed
;
; clobbers:
; rcx (char constants only), rsi, rdi
; ------------------------------------------------------------------------------
evaluate_constant:
    ; rsi = number of bytes left
    ; rdi -> current byte of constant
    xor     eax, eax                ; rax = value of constant

    cmp     rsi, 1                  ; an empty constant has no prefix to read
    jl      .bad_prefix

    mov     dl, [rdi]
    dec     rsi                     ; one fewer byte left
    inc     rdi                     ; point to next byte

    ; all numeric prefixes further handled in .numeric
    cmp     dl, '0'
    je      .numeric

    ; chr case
    cmp     dl, '"'
    jne     .bad_prefix

    ; From here on the type to return sits on the stack: `.break` pops it into
    ; rdx and `.unrecognised` discards it. Nothing is pushed before the kind of
    ; constant is known, so `.bad_prefix` must not pop.
    push    qword 0x03              ; type: char
    xor     ecx, ecx                ; rcx = number of times right-rolled
    jmp     .chr

.numeric:
    cmp     rsi, 1                  ; a lone '0' has no base letter
    jl      .bad_prefix

    mov     dl, [rdi]
    dec     rsi                     ; one fewer byte left
    inc     rdi                     ; point to next byte

    cmp     dl, 'x'
    je      .hex
    cmp     dl, 'q'
    je      .oct
    cmp     dl, 'b'
    je      .bin
    jmp     .bad_prefix

.hex:
    push    qword 0x00              ; type: hexadecimal
.hex_loop:
    test    rsi, rsi                ; make sure we're in range
    jz      .break                  ; if not, break
    shl     rax, 4                  ; make room for next hex digit
    mov     dl, [rdi]               ; dl = next byte of constant
    ; the compares below are unsigned, so a byte below the start of a range
    ; wraps to a large value and is rejected along with bytes above it
    sub     dl, '0'                 ; '0'..'9' -> 0..9
    cmp     dl, 9
    jbe     .hex_continue
    sub     dl, 'A' - '0'           ; 'A'..'F' -> 0..5
    cmp     dl, 0xF - 0xA
    ja      .unrecognised
    add     dl, 0xA                 ; 0..5 -> 0xA..0xF
.hex_continue:
    or      al, dl                  ; add newest nibble
    dec     rsi                     ; one fewer byte left
    inc     rdi                     ; point to next byte
    jmp     .hex_loop               ; and loop

.oct:
    push    qword 0x01              ; type: octal
.oct_loop:
    test    rsi, rsi                ; make sure we're in range
    jz      .break                  ; if not, break
    shl     rax, 3                  ; make room for next octal digit
    mov     dl, [rdi]               ; dl = next byte of constant
    sub     dl, '0'
    cmp     dl, 7                   ; unsigned: also rejects bytes below '0'
    ja      .unrecognised
    or      al, dl                  ; add newest 3-bit group
    dec     rsi                     ; one fewer byte left
    inc     rdi                     ; point to next byte
    jmp     .oct_loop               ; and loop

.bin:
    push    qword 0x02              ; type: binary
.bin_loop:
    test    rsi, rsi                ; range check
    jz      .break
    shl     rax, 1                  ; make room for next bit
    mov     dl, [rdi]
    sub     dl, '0'
    cmp     dl, 1                   ; unsigned: also rejects bytes below '0'
    ja      .unrecognised
    or      al, dl                  ; add newest bit
    dec     rsi
    inc     rdi
    jmp     .bin_loop

.chr:
    cmp     rcx, 4                  ; ensure char is only 4 bytes long
    jg      .unrecognised
    cmp     rsi, 1                  ; exactly one byte left: the closing quote
    je      .chr_break
    jl      .unrecognised           ; ran out of input before a closing quote

    ror     rax, 8                  ; park earlier characters in the top bytes
    inc     rcx

    mov     dl, [rdi]

    ; bound check byte as printable char
    cmp     dl, 0x20
    jl      .unrecognised
    cmp     dl, 0x7E
    jg      .unrecognised

    or      al, dl

    dec     rsi
    inc     rdi
    jmp     .chr

.chr_break:
    cmp     rcx, 1                  ; for each [1..rcx]
    jle     .chr_break_for_good
    rol     rax, 8                  ; roll left to make up for the roll right earlier
    dec     rcx
    jmp     .chr_break

.chr_break_for_good:
    mov     dl, [rdi]               ; make sure the chr is closed
    cmp     dl, '"'
    jne     .unrecognised
    ; fall through

.break:
    pop     rdx                     ; rdx = type pushed when the kind was found
    ret

.unrecognised:
    pop     rdx                     ; discard the pushed type
.bad_prefix:
    mov     edx, 0xFF               ; unrecognised type
    ret
; ------------------------------------------------------------------------------
; identify_register
;
; description:
; looks an ascii-encoded register name up in `tokens.registers` and returns its
; token ID, or UNRECOGNISED_TOKEN_ID if there is no such entry. The table is
; walked in 6-byte steps: 4 bytes of name followed by a 2-byte token ID.
;
; parameters:
; edi = register to be searched
;
; returned:
; ax = register's token ID or UNRECOGNISED_TOKEN_ID
; ------------------------------------------------------------------------------
identify_register:
    xor     eax, eax                ; eax = byte offset into tokens.registers
    jmp     .in_range

.next_entry:
    add     eax, 6                  ; step over name + token ID

.in_range:
    cmp     eax, (tokens.registers_end - tokens.registers)
    jge     .not_found              ; walked off the end of the table
    cmp     [tokens.registers + eax], edi ; does this entry carry the name?
    jne     .next_entry

    mov     ax, [tokens.registers + eax + 4] ; token ID follows the name
    ret

.not_found:
    mov     ax, UNRECOGNISED_TOKEN_ID
    ret
; ------------------------------------------------------------------------------
; identify_operator
; TODO combine with identify_register
;
; description:
; looks an ascii-encoded operator name up in `tokens.operators` and returns its
; token ID, or UNRECOGNISED_TOKEN_ID if there is no such entry. The table is
; walked in 6-byte steps: 4 bytes of name followed by a 2-byte token ID.
;
; parameters:
; edi = operator to be searched
;
; returned:
; ax = operator's token ID or UNRECOGNISED_TOKEN_ID
; ------------------------------------------------------------------------------
identify_operator:
    xor     eax, eax                ; eax = byte offset into tokens.operators
    jmp     .in_range

.next_entry:
    add     eax, 6                  ; step over name + token ID

.in_range:
    cmp     eax, (tokens.operators_end - tokens.operators)
    jge     .not_found              ; walked off the end of the table
    cmp     [tokens.operators + eax], edi ; does this entry carry the name?
    jne     .next_entry

    mov     ax, [tokens.operators + eax + 4] ; token ID follows the name
    ret

.not_found:
    mov     ax, UNRECOGNISED_TOKEN_ID
    ret
; ------------------------------------------------------------------------------
; utilities
; ------------------------------------------------------------------------------
; ------------------------------------------------------------------------------
; print
;
; description:
; writes a null-terminated string, byte by byte, to I/O port 0x3F8
; (presumably the COM1 serial port - confirm against the target machine).
; rax, rdx and rsi are saved and restored for ease of debugging; the flags are
; not preserved.
;
; `print.debug`, `print.error`, `print.test` and `print.warn` take the same
; parameter and write a "[DEBUG]: " / "[ERROR]: " / "[TEST]: " / "[WARN]: "
; prefix before the string.
;
; parameters:
; rsi -> start of null-terminated string
; ------------------------------------------------------------------------------
print:
    push    rdx
    push    rax
    push    rsi

    mov     edx, 0x3F8              ; dx = port every byte is written to
    jmp     .fetch

.emit:
    out     dx, al
    inc     rsi
.fetch:
    mov     al, [rsi]
    test    al, al                  ; the terminator itself is not written
    jnz     .emit

    pop     rsi
    pop     rax
    pop     rdx
    ret

.debug:
    push    rsi
    mov     rsi, .debug_msg
    jmp     .prefixed

.error:
    push    rsi
    mov     rsi, .error_msg
    jmp     .prefixed

.test:
    push    rsi
    mov     rsi, .test_msg
    jmp     .prefixed

.warn:
    push    rsi
    mov     rsi, .warn_msg
    ; fall through

; rsi -> prefix to write first
; [rsp] = caller's string pointer, pushed by the entry point above
.prefixed:
    call    print                   ; the prefix
    pop     rsi
    jmp     print                   ; the caller's string; its ret returns to
                                    ; whoever called print.debug & co.

.debug_msg db "[DEBUG]: ", 0x00
.error_msg db "[ERROR]: ", 0x00
.test_msg db "[TEST]: ", 0x00
.warn_msg db "[WARN]: ", 0x00
; ------------------------------------------------------------------------------
; halt
;
; description:
; prints `msg_halt` and stops the processor with `hlt`. Should execution ever
; continue past the `hlt`, the message is printed and the processor halted
; again; this never returns.
; ------------------------------------------------------------------------------
halt:
    lea     rsi, [msg_halt]         ; rsi -> message to print
    call    print
    hlt
    jmp     halt                    ; woken up again: go back to sleep
; ------------------------------------------------------------------------------
; elemb
;
; description:
; checks if given byte is element of the specified list. The list is scanned
; front to back and the scan stops at the first match.
;
; parameters:
; rdi = size of list
; rsi -> start of list
; dl = given byte
;
; returned:
; rax = 0: is not an element
;       1: is an element
;
; rsi and rdi are advanced while scanning: on a match rsi points at the
; matching byte, otherwise just past the list with rdi = 0.
; ------------------------------------------------------------------------------
elemb:
.scan:
    test    rdi, rdi                ; anything left to look at?
    jz      .absent
    mov     al, [rsi]               ; al = current byte in list
    cmp     dl, al
    je      .present
    inc     rsi                     ; move to next byte
    dec     rdi                     ; and reduce remaining length
    jmp     .scan

.present:
    mov     eax, 1                  ; return 1; dl an element of list
    ret

.absent:
    xor     eax, eax                ; return 0; dl not an element of list
    ret
; ------------------------------------------------------------------------------
; djb2
;
; description:
; gets the 64-bit djb2 hash of a given string: starting from 5381, every byte
; turns the hash into hash * 33 + byte (wrapping at 64 bits)
;
; parameters:
; rdi = size of string
; rsi -> start of string
;
; returned:
; rax = hash
;
; clobbers:
; rcx, and rdx when the string is not empty
; ------------------------------------------------------------------------------
djb2:
    mov     eax, 5381               ; rax = hash
    xor     ecx, ecx                ; rcx = index
    jmp     .more

.mix:
    movzx   edx, byte [rsi + rcx]   ; rdx = current byte
    imul    rax, rax, 33            ; hash * 33 ...
    add     rax, rdx                ; ... + byte
    inc     rcx
.more:
    cmp     rcx, rdi
    jl      .mix
    ret
; ------------------------------------------------------------------------------
; trim_trailing_whitespace
;
; description:
; works out how long the given byte array is once whitespace at its end is
; left off. A byte counts as whitespace when it is one of the 2 bytes at
; `whitespace_2`. The array itself is not modified.
;
; parameters:
; rdi = size of list
; rsi -> start of list
;
; returned:
; rax = new size of list (also left in rdi)
;
; clobbers:
; dl when the list is not empty
; ------------------------------------------------------------------------------
trim_trailing_whitespace:
.check_last:
    test    rdi, rdi                ; list of length zero
    jz      .done                   ; already trimmed

    push    rdi
    push    rsi
    mov     dl, [rsi + rdi - 1]     ; last element of given list
    mov     rsi, whitespace_2       ; pointer of whitespace list
    mov     edi, 2                  ; length of whitespace list
    call    elemb
    ; al = 1 if the last element is whitespace, 0 if not
    pop     rsi                     ; rsi -> start of list
    pop     rdi                     ; rdi = size of list

    test    al, al                  ; last element is not whitespace:
    jz      .done                   ; nothing more to trim

    dec     rdi                     ; otherwise one shorter
    jmp     .check_last             ; and look at the new last element

.done:
    mov     rax, rdi
    ret
; ------------------------------------------------------------------------------
; clear_token_table
;
; description:
; fills the token table, TOKEN_TABLE_SIZE bytes starting at TOKEN_TABLE_ADDR,
; with zeroes
;
; clobbers:
; rax, rcx, rdi
; ------------------------------------------------------------------------------
clear_token_table:
    mov     edi, TOKEN_TABLE_ADDR   ; address to start
    mov     ecx, TOKEN_TABLE_SIZE / 4 ; number of double words
    xor     eax, eax                ; value to write
    rep stosd                       ; NOTE(review): walks upwards only while
                                    ; the direction flag is clear - nothing in
                                    ; view sets or clears it
    ret
; ------------------------------------------------------------------------------
; clear_test_arena
;
; description:
;   clears the test arena as specified by TEST_ARENA_SIZE and TEST_ARENA_ADDR
;   by filling it with zeroes, one double word at a time
;
; clobbers:
;   rax, rcx, rdi, flags
; ------------------------------------------------------------------------------
clear_test_arena:
  xor eax, eax                    ; value to write
  mov rcx, TEST_ARENA_SIZE / 4    ; number of double words in the test arena
  mov rdi, TEST_ARENA_ADDR        ; address to start (the test arena, not the
                                  ; token table)
  rep stosd
  ret
; ------------------------------------------------------------------------------
; clear_output_arena
;
; description:
;   zero-fills the output arena, i.e. the OUTPUT_SIZE bytes that start at
;   OUTPUT_ADDR, one double word at a time
;
; clobbers:
;   rax, rcx, rdi, flags
; ------------------------------------------------------------------------------
clear_output_arena:
  mov rdi, OUTPUT_ADDR            ; rdi -> first double word of the arena
  mov rcx, OUTPUT_SIZE / 4        ; rcx = how many double words to store
  xor eax, eax                    ; eax = fill value (zero)
  rep stosd                       ; store eax at [rdi], rcx times
  ret
%include "asm/tests.asm"
; ------------------------------------------------------------------------------
; data
; ------------------------------------------------------------------------------
; token metadata, one 4-byte entry per token id:
;   dw id
;   db type      ; 0x01 = operator, 0x02 = register
;   db metadata  ; operator: number of operands
;                ; register: register number in bits 2-4, width in bits 0-1
;                ;           (10b = 32 bits, 11b = 64 bits)
; the table spans .by_id up to (not including) .by_id_end
; NOTE(review): only some of the ids named in .operators / .registers have an
; entry here; how a missing id is handled is up to the lookup code — confirm
tokens:
.by_id:
dw 0x0010 ; eax
db 0x02 ; type: register
db 00000010b ; reg: 000b
; width: 10b (32 bits)
dw 0x0000 ; rax
db 0x02 ; type: register
db 00000011b ; reg: 000b
; width: 11b (64 bits)
dw 0x0003 ; rdx
db 0x02 ; type: register
db 00001011b ; reg: 010b
; width: 11b (64 bits)
dw 0x0053 ; xor
db 0x01 ; type: operator
db 0x02 ; # operands
dw 0x0054 ; inc
db 0x01 ; type: operator
db 0x01 ; # operands
dw 0x0056 ; mov
db 0x01 ; type: operator
db 0x02 ; # operands
dw 0x004F ; hlt
db 0x01 ; type: operator
db 0x00 ; # operands
.by_id_end:
; operator name -> token id, one 6-byte entry per operator:
;   dd "name"  ; up to 4 characters; NASM zero-pads shorter strings to 4 bytes
;   dw id
; entries are grouped by name length (2, then 3, then 4 characters)
; NOTE(review): whether the lookup relies on that grouping is not visible
; here — confirm before reordering
; the table spans .operators up to (not including) .operators_end
.operators:
dd "je"
dw 0x005C
dd "jg"
dw 0x005F
dd "jl"
dw 0x0061
dd "hlt"
dw 0x004F
dd "xor"
dw 0x0053
dd "inc"
dw 0x0054
dd "dec"
dw 0x0055
dd "mov"
dw 0x0056
dd "add"
dw 0x0057
dd "sub"
dw 0x0058
dd "ret"
dw 0x005A
dd "cmp"
dw 0x005B
dd "jne"
dw 0x005D
dd "jge"
dw 0x005E
dd "jle"
dw 0x0060
dd "int3"
dw 0x0050
dd "call"
dw 0x0059
.operators_end:
; register name -> token id, one 6-byte entry per register:
;   dd "name"  ; up to 4 characters; NASM zero-pads shorter strings to 4 bytes
;   dw id
; entries are grouped by name length (2, then 3, then 4 characters)
; id ranges as laid out below:
;   0x0000-0x000F  64-bit general purpose (rax .. r15)
;   0x0010-0x001F  32-bit general purpose (eax .. r15d)
;   0x0020-0x002F  16-bit general purpose (ax .. r15w)
;   0x0030-0x003F  8-bit low registers    (al .. r15b)
;   0x0040-0x0043  8-bit high registers   (ah .. dh)
;   0x0044-0x0049  segment registers      (cs .. ss)
;   0x004A-0x004E  control registers      (cr0 .. cr8)
; the table spans .registers up to (not including) .registers_end
.registers:
dd "r8"
dw 0x0008
dd "r9"
dw 0x0009
dd "ax"
dw 0x0020
dd "bx"
dw 0x0021
dd "cx"
dw 0x0022
dd "dx"
dw 0x0023
dd "si"
dw 0x0024
dd "di"
dw 0x0025
dd "sp"
dw 0x0026
dd "bp"
dw 0x0027
dd "al"
dw 0x0030
dd "bl"
dw 0x0031
dd "cl"
dw 0x0032
dd "dl"
dw 0x0033
dd "ah"
dw 0x0040
dd "bh"
dw 0x0041
dd "ch"
dw 0x0042
dd "dh"
dw 0x0043
dd "cs"
dw 0x0044
dd "ds"
dw 0x0045
dd "es"
dw 0x0046
dd "fs"
dw 0x0047
dd "gs"
dw 0x0048
dd "ss"
dw 0x0049
dd "rax"
dw 0x0000
dd "rbx"
dw 0x0001
dd "rcx"
dw 0x0002
dd "rdx"
dw 0x0003
dd "rsi"
dw 0x0004
dd "rdi"
dw 0x0005
dd "rsp"
dw 0x0006
dd "rbp"
dw 0x0007
dd "r10"
dw 0x000A
dd "r11"
dw 0x000B
dd "r12"
dw 0x000C
dd "r13"
dw 0x000D
dd "r14"
dw 0x000E
dd "r15"
dw 0x000F
dd "eax"
dw 0x0010
dd "ebx"
dw 0x0011
dd "ecx"
dw 0x0012
dd "edx"
dw 0x0013
dd "esi"
dw 0x0014
dd "edi"
dw 0x0015
dd "esp"
dw 0x0016
dd "ebp"
dw 0x0017
dd "r8d"
dw 0x0018
dd "r9d"
dw 0x0019
dd "r8w"
dw 0x0028
dd "r9w"
dw 0x0029
dd "sil"
dw 0x0034
dd "dil"
dw 0x0035
dd "spl"
dw 0x0036
dd "bpl"
dw 0x0037
dd "r8b"
dw 0x0038
dd "r9b"
dw 0x0039
dd "cr0"
dw 0x004A
dd "cr2"
dw 0x004B
dd "cr3"
dw 0x004C
dd "cr4"
dw 0x004D
dd "cr8"
dw 0x004E
dd "r10d"
dw 0x001A
dd "r11d"
dw 0x001B
dd "r12d"
dw 0x001C
dd "r13d"
dw 0x001D
dd "r14d"
dw 0x001E
dd "r15d"
dw 0x001F
dd "r10w"
dw 0x002A
dd "r11w"
dw 0x002B
dd "r12w"
dw 0x002C
dd "r13w"
dw 0x002D
dd "r14w"
dw 0x002E
dd "r15w"
dw 0x002F
dd "r10b"
dw 0x003A
dd "r11b"
dw 0x003B
dd "r12b"
dw 0x003C
dd "r13b"
dw 0x003D
dd "r14b"
dw 0x003E
dd "r15b"
dw 0x003F
.registers_end:
; opcode bytes per operator token id, one 4-byte entry per operator:
;   dw id
;   db opcode  ; form whose destination is r/m (or the only form)
;   db opcode  ; form whose destination is reg
; NOTE(review): inc and hlt carry 0x00 in the second byte; presumably that
; means "no second form" — confirm against the code that reads this table
; the table spans .by_id up to (not including) .by_id_end
opcodes:
.by_id:
dw 0x0053 ; xor
db 0x31 ; r/m <- reg
db 0x33 ; reg <- r/m
dw 0x0054 ; inc
db 0xFF ; r/m
db 0x00 ; no reg <- r/m form listed
dw 0x0056 ; mov
db 0x89 ; r/m <- reg
db 0x8B ; reg <- r/m
dw 0x004F ; hlt
db 0xF4 ; takes no operands
db 0x00 ; no second form listed
.by_id_end:
; NUL-terminated messages, each ending in a line feed (0x0A)
msg_welcome db "Welcome to Twasm", 0x0A, 0x00 ; printed by start
msg_halt db "halted.", 0x0A, 0x00
; the 2 bytes treated as whitespace by trim_trailing_whitespace: space and
; carriage return (0x0D). not NUL-terminated; users pass the length (2)
whitespace_2 db " ", 0x0D
; test program
; source text that start hands to tokenise; lines are separated by line feeds
; (0x0A) and the text is not NUL-terminated — its length lives in program.size
program:
db "xor eax, eax", 0x0A
db "inc rax ; inline comment", 0x0A
db "; one line comment", 0x0A
db "mov rdx, [rax]", 0x0A
db "mov [rax], rdx", 0x0A
db "hlt", 0x0A
; qword holding the byte count of the text above ($ - program is evaluated
; before .size itself is emitted); start loads it with mov rsi, [program.size]
.size dq $ - program
; NUL-terminated marker string; judging by its text it flags the end of the
; assembled image (e.g. when inspecting a dump) — TODO confirm nothing is
; meant to be emitted after it
msg_end db "end of the binary ->|", 0x0A, 0x00