; bootler/twasm/asm/main.asm

; TODO actually enforce any of these *_SIZE constants :p

; Fixed memory layout and sentinel values used throughout this file. All
; addresses are absolute.

LOAD_ADDR equ 0x00010000 ; address this program is loaded at

TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
TEST_ARENA_SIZE equ 0x1000     ; maximum size tests can use

TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000     ; max length of table, in bytes
TOKEN_TABLE_ENTRY_SIZE equ 2    ; size of token table entry; things may break
                                ; if this ever changes

OUTPUT_ADDR equ 0x00070000 ; address of the outputted binary
OUTPUT_SIZE equ 0x1000     ; max length of the outputted binary, in bytes

STACK_ADDR equ 0x00060000 ; initial rsp; same value as TOKEN_TABLE_ADDR. Pushes
                          ; store below this address, while token table entries
                          ; are stored at this address and above

UNRECOGNISED_TOKEN_ID equ 0xFFFF  ; id of an unrecognised token
UNRECOGNISED_ID_TYPE equ 0x0F     ; type of an unrecognised id
UNRECOGNISED_ID_METADATA equ 0xFF ; metadata of an unrecognised id

TEST_LINE_LENGTH equ 80 ; right border of test suite results

[bits 64]        ; assemble as 64-bit code
[org LOAD_ADDR]  ; labels resolve as if the image starts at LOAD_ADDR

; entry point: sets up the stack, prints the welcome message, calls run_tests,
; then tokenises and assembles the embedded `program` and jumps to `halt`.
; Never returns.
start:
  mov rsp, STACK_ADDR ; we might need more stack space, let's just be safe

  mov rsi, msg_welcome
  call print

  call run_tests

  call clear_token_table

  mov rdi, program               ; -> program
  movzx esi, byte [program.size] ; = size of program. `.size` is declared with
                                 ; `db`, so exactly one byte is loaded and
                                 ; zero-extended; a 64-bit load would also pick
                                 ; up the 7 bytes that follow it
  call tokenise
  ; rax = number of tokens processed

  push rax                ; clear_output_arena overwrites rax
  call clear_output_arena
  pop rdi                 ; = number of tokens in the token table
  call assemble

  jmp halt

; ------------------------------------------------------------------------------
; assembling
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; assemble
; TODO write tests
; TODO make it work :/ putting the cart before the horse
;
; description:
; walks the token table located at TOKEN_TABLE_ADDR. It is meant to produce a
; flat binary at OUTPUT_ADDR, but for now it only looks up the type of every
; token and prints a line for each operator and each register it finds. It's
; probably desirable to clear the output arena before calling this function.
;
; parameters:
; rdi = number of tokens in the token table
;
; returned:
; nothing meaningful; rax ends up equal to rdi
; ------------------------------------------------------------------------------

assemble:
  xor eax, eax ; rax = index of the token being processed
  .loop:
    cmp rax, rdi ; counts are unsigned, so use an unsigned comparison
    jae .break   ; every token has been visited

    push rdi ; total number of tokens; rdi is reused as an argument below
    push rax ; index; rax is overwritten by the return value below

    movzx edi, word [TOKEN_TABLE_ADDR + rax * TOKEN_TABLE_ENTRY_SIZE] ; next tte
    call get_tte_type
    ; al = type of the token; print preserves rax, so al stays valid below

    cmp al, 0x01 ; check if it's an operator
    jne .continue_operator

    push rsi
    mov rsi, .msg_found_operator
    call print
    pop rsi

  .continue_operator:
    cmp al, 0x02 ; check if it's a register
    jne .continue_register

    push rsi
    mov rsi, .msg_found_register
    call print
    pop rsi

  .continue_register:
    pop rax ; index
    pop rdi ; total number of tokens

    inc rax ; move to next token
    jmp .loop

  .break:
    ret
  .msg_found_operator db "found operator", 0x0A, 0x00
  .msg_found_register db "found register", 0x0A, 0x00

; ------------------------------------------------------------------------------
; get_tte_type
;
; description:
; given a token table entry, returns the declared type in `tokens.by_id`. If
; there is no entry, returns UNRECOGNISED_ID_TYPE. `tokens.by_id` is searched
; linearly; each of its entries is 4 bytes: a 2-byte id, a type byte and a
; metadata byte.
;
; parameters:
; di = token table entry
;
; returned:
; al = type of token, or UNRECOGNISED_ID_TYPE. The upper 4 bits of al are
;      zeroed; the rest of rax is zeroed.
;
; clobbers:
; rdi (masked to its low 16 bits), flags
; ------------------------------------------------------------------------------

get_tte_type:
  and edi, 0xFFFF ; mask input so it behaves as expected
  xor eax, eax    ; rax = index of the entry being examined

  .loop:
    ; valid indices are 0 .. count - 1, so index == count is already out of
    ; range and must not be dereferenced
    cmp rax, (tokens.by_id_end - tokens.by_id) / 4
    jae .not_found

    cmp [tokens.by_id + rax * 4], di ; does this entry's id match?
    je .found

    inc rax
    jmp .loop
  .not_found:
    mov eax, UNRECOGNISED_ID_TYPE
    and eax, 0xF ; mask as expected
    ret
  .found:
    movzx eax, byte [2 + tokens.by_id + rax * 4] ; type byte follows the id
    and eax, 0xF ; mask as expected
    ret

; ------------------------------------------------------------------------------
; get_tte_typed_metadata
;
; description:
; given a token table entry, returns the declared typed metadata in
; `tokens.by_id`. If there is no entry, returns UNRECOGNISED_ID_METADATA.
; `tokens.by_id` is searched linearly; each of its entries is 4 bytes: a 2-byte
; id, a type byte and a metadata byte.
;
; parameters:
; di = token table entry
;
; returned:
; al = typed metadata of token, or UNRECOGNISED_ID_METADATA; the rest of rax is
;      zeroed.
;
; clobbers:
; rdi (masked to its low 16 bits), flags
; ------------------------------------------------------------------------------

get_tte_typed_metadata:
  and edi, 0xFFFF ; mask input so it behaves as expected
  xor eax, eax    ; rax = index of the entry being examined

  .loop:
    ; valid indices are 0 .. count - 1, so index == count is already out of
    ; range and must not be dereferenced
    cmp rax, (tokens.by_id_end - tokens.by_id) / 4
    jae .not_found

    cmp [tokens.by_id + rax * 4], di ; does this entry's id match?
    je .found

    inc rax
    jmp .loop
  .not_found:
    mov eax, UNRECOGNISED_ID_METADATA
    ret
  .found:
    ; zero-extending load so the rest of rax is zeroed whatever the index was
    movzx eax, byte [3 + tokens.by_id + rax * 4] ; metadata is the last byte
    ret

; ------------------------------------------------------------------------------
; tokenising
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
; tokenises the program at the given address and writes one 2-byte entry per
; token into the token table at TOKEN_TABLE_ADDR. It's probably desirable to
; clear the token table before calling this function.
;
; entries written:
; - a recognised token is stored as the id returned by identify_next_token
; - a terminator byte is stored as 0xFE00 + the byte itself
; - when a closing bracket (0x0052) is read, the entry of the most recent open
;   bracket (0x0051) is rewritten as 0x1000 + the number of entries between the
;   two brackets (low 8 bits only)
;
; Only bytes inside [rdi, rdi + rsi) start a token, and tokenising stops early
; if the token table is full.
;
; parameters:
; rdi -> first byte of program
; rsi = size of program in bytes
;
; returned:
; rax = number of tokens processed
; ------------------------------------------------------------------------------

tokenise:
  add rsi, rdi ; rsi -> one past the last byte of the program
  xor ecx, ecx ; rcx = number of tokens processed
  .loop:
    cmp rdi, rsi ; addresses are unsigned; rdi == rsi is already outside the
    jae .break   ; program, so stop there instead of reading one byte too many

    cmp rcx, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE ; stop rather than write
    jae .break                                          ; past the token table

    push rdi
    push rsi
    push rcx

    ; rdi -> current byte
    call identify_next_token
    ; ax = id of token
    ; dx = length of token

    pop rcx
    pop rsi
    pop rdi

    ; deal with terminator character (reported as 0 length token)
    test rdx, rdx
    jnz .continue0

    mov ax, 0xFE00 ; terminator character
    mov al, [rdi]  ; byte of terminator
    mov edx, 1     ; byte length is 1

  .continue0:
    add rdi, rdx ; current byte + length of token = next unread byte

    mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax ; fill next entry
                                                               ; in token table

    ; TODO fix undefined behaviour when open brackets and closed brackets aren't
    ; correctly paired or have too much distance between them
    cmp ax, 0x0051 ; check if read token is an open bracket
    jne .continue_open_bracket

    ; TODO make brackets able to hold more
    mov [.data_open_bracket], cl ; record which entry the open bracket is at

  .continue_open_bracket:
    cmp ax, 0x0052 ; check if read token is a closing bracket
    jne .continue_close_bracket

    ; rewrite open bracket token entry with a filled out one. rax and rdx are
    ; free to use as scratch here: both are reloaded on the next iteration
    movzx edx, byte [.data_open_bracket] ; rdx = entry # of the open bracket
    mov al, cl
    sub al, dl                           ; al = entries between the brackets
    mov [TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], al
    mov byte [1 + TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], 0x10

  .continue_close_bracket:
    inc rcx ; +1 token processed
    jmp .loop
  .break:
    mov rax, rcx
    ret

  .data_open_bracket db 0x00 ; represents the token # of the latest open bracket

; ------------------------------------------------------------------------------
; identify_token
;
; description:
; returns the id of a given token. If there are multiple ways to represent a
; given token, like the open-bracket, it returns the one that doesn't require
; information about the surrounding tokens, because it has no such information.
; In other words, if it isn't in the `tokens` data structure, this function
; doesn't see it. If the first byte of the token points to a terminator
; byte, this function returns it as an unrecognised token.
;
; Only lengths 1 to 4 are looked up, each in its own `tokens.by_name_N` table;
; any other length is unrecognised. Exactly `rsi` bytes of the token are read.
;
; parameters:
; rdi -> first byte of token
; rsi = size of token in bytes
;
; returned:
; ax = id of token, or UNRECOGNISED_TOKEN_ID; the rest of rax is zeroed
;
; clobbers:
; rcx, r10, r11, flags
; ------------------------------------------------------------------------------

identify_token:
  cmp rsi, 1        ; if the token has length 1
  je .start_length1 ; then enter the length 1 loop

  cmp rsi, 2        ; if the token has length 2
  je .start_length2 ; then enter the length 2 loop

  cmp rsi, 3        ; if the token has length 3
  je .start_length3 ; then enter the length 3 loop

  cmp rsi, 4        ; if the token has length 4
  je .start_length4 ; then enter the length 4 loop

  jmp .unrecognised ; else unrecognised

  ; length1
  .start_length1:
    mov r10b, [rdi]           ; token; loaded once, it never changes in the loop
    mov rcx, tokens.by_name_1 ; rcx -> list of known tokens

  .loop_length1:
    cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens
    jae .unrecognised         ; if not, unrecognised (addresses are unsigned)

    cmp [rcx], r10b   ; if known token matches token
    je .found_length1 ; exit loop

    add rcx, 3 ; length of token + length of id
    jmp .loop_length1

  .found_length1:
    movzx eax, word [rcx + 1] ; return id of token, rest of rax zeroed
    ret

  ; length2
  .start_length2:
    mov r10w, [rdi]           ; token
    mov rcx, tokens.by_name_2 ; rcx -> list of known tokens

  .loop_length2:
    cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens
    jae .unrecognised         ; if not, unrecognised

    cmp [rcx], r10w   ; if current entry matches token,
    je .found_length2 ; exit loop

    add rcx, 4 ; length of token + length of id
    jmp .loop_length2

  .found_length2:
    movzx eax, word [rcx + 2] ; return id of token, rest of rax zeroed
    ret

  ; length3
  .start_length3:
    ; the token is compared as a word plus a byte so that no byte beyond its
    ; three characters is ever read
    mov r10w, [rdi]           ; first two bytes of token
    mov r11b, [rdi + 2]       ; third byte of token
    mov rcx, tokens.by_name_3 ; rcx -> list of known tokens

  .loop_length3:
    cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens
    jae .unrecognised         ; if not, unrecognised

    cmp [rcx], r10w ; first two bytes differ?
    jne .next_length3
    cmp [rcx + 2], r11b ; if the third byte matches as well,
    je .found_length3   ; exit loop

  .next_length3:
    add rcx, 5 ; length of token + length of id
    jmp .loop_length3

  .found_length3:
    movzx eax, word [rcx + 3] ; return id of token, rest of rax zeroed
    ret

  ; length4
  .start_length4:
    mov r10d, [rdi]           ; token
    mov rcx, tokens.by_name_4 ; rcx -> list of known tokens

  .loop_length4:
    cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length4 tokens
    jae .unrecognised         ; if not, unrecognised

    cmp [rcx], r10d   ; if known token matches token,
    je .found_length4 ; exit loop

    add rcx, 6 ; length of token + length of id
    jmp .loop_length4

  .found_length4:
    movzx eax, word [rcx + 4] ; return id of token, rest of rax zeroed
    ret

  .unrecognised:
    mov eax, UNRECOGNISED_TOKEN_ID ; 32-bit write zeroes the rest of rax
    ret

; ------------------------------------------------------------------------------
; identify_next_token
;
; description:
; measures the token starting at rdi by scanning forward until a byte listed in
; token_terminator_8 is met, then hands the token and its measured length to
; identify_token. A token whose very first byte is a terminator is reported
; with a length of 0.
;
; parameters:
; rdi -> first byte of token
;
; returned:
; ax = id of token; the rest of rax is zeroed
; dx = length of token in bytes; the rest of rdx is zeroed
; ------------------------------------------------------------------------------

identify_next_token:
  push rdi ; -> first byte of token; needed again for identify_token

  mov rsi, rdi ; rsi -> byte being examined
  xor edi, edi ; rdi = number of non-terminator bytes seen so far
  .scan:
    movzx edx, byte [rsi] ; dl = byte to look up; the rest of rdx is zeroed

    ; elemb consumes rdi and rsi, so park the cursor and the running length
    push rsi
    push rdi
    push rdx

    mov edi, 8                  ; length of terminator list
    mov rsi, token_terminator_8 ; start of terminator list
    call elemb

    pop rdx
    pop rdi
    pop rsi

    cmp rax, 1   ; did elemb find the byte among the terminators?
    je .measured ; then the token ends just before it

    inc rsi ; step to the next byte of the token
    inc rdi ; and count the one just passed
    jmp .scan

  .measured:
    mov rsi, rdi ; rsi = length of token
    pop rdi      ; rdi -> first byte of token again

    push rsi     ; keep the length across the lookup
    call identify_token
    pop rdx      ; rdx = length
    mov rsi, rdx
    ret

; ------------------------------------------------------------------------------
; copy_token
;
; description:
; copies a token from one spot in memory to another, byte by byte, stopping at
; the first byte listed in token_terminator_8. The terminator itself is not
; copied.
;
; parameters:
; rdi -> start of buffer to be read
; rsi -> start of buffer to be written
;
; returned:
; rax -> the terminator byte that ended the copy (one past the last byte copied)
; rdx -> one past the last byte written
; ------------------------------------------------------------------------------

copy_token:
  .next_byte:
    mov dl, [rdi] ; dl = candidate byte from the read buffer

    ; elemb consumes rdi and rsi, so keep both cursors on the stack
    push rdi
    push rsi

    mov rsi, token_terminator_8 ; start of terminator list
    mov rdi, 8                  ; length of terminator list
    call elemb

    pop rsi
    pop rdi

    cmp rax, 1   ; is the candidate a token terminator?
    je .finished ; then the token is complete

    mov al, [rdi] ; otherwise move the byte across
    mov [rsi], al

    inc rdi ; advance read cursor
    inc rsi ; advance write cursor
    jmp .next_byte

  .finished:
    mov rax, rdi ; -> byte that stopped the copy
    mov rdx, rsi ; -> next free byte of the write buffer
    ret

; ------------------------------------------------------------------------------
; copy_byte
;
; description:
; copies a single byte from one spot in memory to another
;
; parameters:
; rdi -> byte to be read
; rsi -> byte to be written
;
; returned:
; al = byte that was read; the rest of rax is zeroed
; ------------------------------------------------------------------------------

copy_byte:
  movzx eax, byte [rdi] ; load the byte zero-extended so rax returns clean
  mov [rsi], al         ; and store it at the destination
  ret

; ------------------------------------------------------------------------------
; utilities
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; print
;
; description:
; writes a null-terminated string, one byte at a time, to I/O port 0x3F8 (the
; conventional COM1 serial data port on PC hardware). The terminating zero is
; not written. rdx, rax and rsi are restored before returning, which keeps
; debugging easy; flags are not preserved.
;
; parameters:
; rsi -> start of null-terminated string
; ------------------------------------------------------------------------------

print:
  push rdx
  push rax
  push rsi

  mov edx, 0x3F8 ; dx = port to write to
  jmp .fetch

  .emit:
    out dx, al ; send the current character
    inc rsi    ; and step to the next one
  .fetch:
    mov al, [rsi]
    test al, al
    jnz .emit  ; keep going until the terminating zero

  pop rsi
  pop rax
  pop rdx
  ret

; ------------------------------------------------------------------------------
; halt
;
; description:
; prints msg_halt and executes hlt. Should execution ever resume after the hlt,
; the message is printed and the processor halted again, forever. Never
; returns.
; ------------------------------------------------------------------------------

halt:
  .again:
    lea rsi, [msg_halt] ; rsi -> "halted." message
    call print
    hlt
    jmp .again ; woken up: report and halt once more

; ------------------------------------------------------------------------------
; elemb
;
; description:
; linear search: reports whether the given byte occurs in the specified list
;
; parameters:
; rdi = size of list
; rsi -> start of list
; dl =  given byte
;
; returned:
; rax = 0: is not an element
;       1: is an element
;
; clobbers:
; rdi (remaining length), rsi (-> matching byte, or the end of the list)
; ------------------------------------------------------------------------------

elemb:
  .scan:
    test rdi, rdi ; nothing left to look at?
    jz .absent    ; then dl is not an element of the list

    cmp [rsi], dl ; does the current list byte equal the desired byte?
    je .present   ; then dl is an element of the list

    inc rsi ; move to next byte
    dec rdi ; and reduce remaining length
    jmp .scan

  .absent:
    xor eax, eax ; return 0; dl not an element of list
    ret

  .present:
    mov eax, 1 ; return 1; dl an element of list
    ret

  ; the two strings below are not referenced by elemb itself
  .f db "found", 0x0A, 0x00
  .nf db "not found", 0x0A, 0x00

; ------------------------------------------------------------------------------
; clear_token_table
;
; description:
; zero-fills the TOKEN_TABLE_SIZE bytes of the token table starting at
; TOKEN_TABLE_ADDR, one double word at a time
;
; clobbers:
; rax, rcx, rdi
; ------------------------------------------------------------------------------

clear_token_table:
  mov edi, TOKEN_TABLE_ADDR     ; rdi -> first double word to overwrite
  mov ecx, TOKEN_TABLE_SIZE / 4 ; rcx = how many double words to overwrite
  xor eax, eax                  ; eax = fill value
  rep stosd
  ret

; ------------------------------------------------------------------------------
; clear_test_arena
;
; description:
; clears the test arena as specified by TEST_ARENA_SIZE and TEST_ARENA_ADDR,
; writing zeroes one double word at a time
;
; clobbers:
; rax, rcx, rdi
; ------------------------------------------------------------------------------

clear_test_arena:
  xor eax, eax                 ; value to write
  mov rcx, TEST_ARENA_SIZE / 4 ; number of double words
  mov rdi, TEST_ARENA_ADDR     ; address to start; this is the test arena, not
                               ; the token table
  rep stosd
  ret

; ------------------------------------------------------------------------------
; clear_output_arena
;
; description:
; zero-fills the OUTPUT_SIZE bytes of the output arena starting at OUTPUT_ADDR,
; one double word at a time
;
; clobbers:
; rax, rcx, rdi
; ------------------------------------------------------------------------------

clear_output_arena:
  mov edi, OUTPUT_ADDR     ; rdi -> first double word to overwrite
  mov ecx, OUTPUT_SIZE / 4 ; rcx = how many double words to overwrite
  xor eax, eax             ; eax = fill value
  rep stosd
  ret

%include "asm/tests.asm"

; ------------------------------------------------------------------------------
; data
; ------------------------------------------------------------------------------

; token lookup tables. The `.by_name_N` tables map an N-character name to a
; 2-byte token id and are searched by identify_token.
tokens:
  .by_name_1:
    ; one-character names: 1 name byte followed by a 2-byte id, i.e. the 3-byte
    ; stride identify_token walks this table with
    db "["
    dw 0x0051
    db "]"
    dw 0x0052
    db "+"
    dw 0x0062
    db "-"
    dw 0x0063
    db "*"
    dw 0x0064
    db "/"
    dw 0x0065
  .by_name_2:
    ; two-character names: 2 name bytes followed by a 2-byte id, i.e. the
    ; 4-byte stride identify_token walks this table with. This label also marks
    ; the end of the one-character table.
    db "r8"
    dw 0x0008
    db "r9"
    dw 0x0009
    db "ax"
    dw 0x0020
    db "bx"
    dw 0x0021
    db "cx"
    dw 0x0022
    db "dx"
    dw 0x0023
    db "si"
    dw 0x0024
    db "di"
    dw 0x0025
    db "sp"
    dw 0x0026
    db "bp"
    dw 0x0027
    db "al"
    dw 0x0030
    db "bl"
    dw 0x0031
    db "cl"
    dw 0x0032
    db "dl"
    dw 0x0033
    db "ah"
    dw 0x0040
    db "bh"
    dw 0x0041
    db "ch"
    dw 0x0042
    db "dh"
    dw 0x0043
    db "cs"
    dw 0x0044
    db "ds"
    dw 0x0045
    db "es"
    dw 0x0046
    db "fs"
    dw 0x0047
    db "gs"
    dw 0x0048
    db "ss"
    dw 0x0049
    db "je"
    dw 0x005C
    db "jg"
    dw 0x005F
    db "jl"
    dw 0x0061
  .by_name_3:
    ; three-character names: 3 name bytes followed by a 2-byte id, i.e. the
    ; 5-byte stride identify_token walks this table with. This label also marks
    ; the end of the two-character table.
    db "rax"
    dw 0x0000
    db "rbx"
    dw 0x0001
    db "rcx"
    dw 0x0002
    db "rdx"
    dw 0x0003
    db "rsi"
    dw 0x0004
    db "rdi"
    dw 0x0005
    db "rsp"
    dw 0x0006
    db "rbp"
    dw 0x0007
    db "r10"
    dw 0x000A
    db "r11"
    dw 0x000B
    db "r12"
    dw 0x000C
    db "r13"
    dw 0x000D
    db "r14"
    dw 0x000E
    db "r15"
    dw 0x000F
    db "eax"
    dw 0x0010
    db "ebx"
    dw 0x0011
    db "ecx"
    dw 0x0012
    db "edx"
    dw 0x0013
    db "esi"
    dw 0x0014
    db "edi"
    dw 0x0015
    db "esp"
    dw 0x0016
    db "ebp"
    dw 0x0017
    db "r8d"
    dw 0x0018
    db "r9d"
    dw 0x0019
    db "r8w"
    dw 0x0028
    db "r9w"
    dw 0x0029
    db "sil"
    dw 0x0034
    db "dil"
    dw 0x0035
    db "spl"
    dw 0x0036
    db "bpl"
    dw 0x0037
    db "r8b"
    dw 0x0038
    db "r9b"
    dw 0x0039
    db "cr0"
    dw 0x004A
    db "cr2"
    dw 0x004B
    db "cr3"
    dw 0x004C
    db "cr4"
    dw 0x004D
    db "cr8"
    dw 0x004E
    db "hlt"
    dw 0x004F
    db "xor"
    dw 0x0053
    db "inc"
    dw 0x0054
    db "dec"
    dw 0x0055
    db "mov"
    dw 0x0056
    db "add"
    dw 0x0057
    db "sub"
    dw 0x0058
    db "ret"
    dw 0x005A
    db "cmp"
    dw 0x005B
    db "jne"
    dw 0x005D
    db "jge"
    dw 0x005E
    db "jle"
    dw 0x0060
  .by_name_4:
    ; four-character names: 4 name bytes followed by a 2-byte id, i.e. the
    ; 6-byte stride identify_token walks this table with. This label also marks
    ; the end of the three-character table.
    db "r10d"
    dw 0x001A
    db "r11d"
    dw 0x001B
    db "r12d"
    dw 0x001C
    db "r13d"
    dw 0x001D
    db "r14d"
    dw 0x001E
    db "r15d"
    dw 0x001F
    db "r10w"
    dw 0x002A
    db "r11w"
    dw 0x002B
    db "r12w"
    dw 0x002C
    db "r13w"
    dw 0x002D
    db "r14w"
    dw 0x002E
    db "r15w"
    dw 0x002F
    db "r10b"
    dw 0x003A
    db "r11b"
    dw 0x003B
    db "r12b"
    dw 0x003C
    db "r13b"
    dw 0x003D
    db "r14b"
    dw 0x003E
    db "r15b"
    dw 0x003F
    db "int3"
    dw 0x0050
    db "call"
    dw 0x0059
  ; no five-character names exist; this label only marks the end of the
  ; four-character table
  .by_name_5:
  ; per-id information, searched linearly by get_tte_type and
  ; get_tte_typed_metadata. Each entry is 4 bytes: a 2-byte token id, a type
  ; byte and a byte of type-specific metadata.
  .by_id:
    dw 0x0053 ; xor
    db 0x01   ; type: operator
    db 0x02   ; # operands

    dw 0x0010 ; eax
    db 0x02   ; type: register
    db 0x02   ; width: 32 bit

    dw 0x0054 ; inc
    db 0x01   ; type: operator
    db 0x01   ; # operands

    dw 0x0000 ; rax
    db 0x02   ; type: register
    db 0x03   ; width: 64 bit

    dw 0x0056 ; mov
    db 0x01   ; type: operator
    db 0x02   ; # operands

    dw 0x0003 ; rdx
    db 0x02   ; type: register
    db 0x03   ; width: 64 bit

    dw 0x004F ; hlt
    db 0x01   ; type: operator
    db 0x00   ; # operands
  ; one past the last entry; the entry count is derived from this label
  .by_id_end:

; null-terminated strings for `print`
msg_welcome db "Welcome to Twasm", 0x0A, 0x00 ; printed once by `start`
msg_halt db "halted.", 0x0A, 0x00             ; printed by `halt`

; bytes that end a token: NUL, space, line feed, carriage return and comma.
; Searched with elemb using a list length of 8, so the three trailing 0x00
; bytes only pad the list to that length.
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00

; not referenced anywhere in this part of the file
debug_string db "debug_string", 0x0A, 0x00

; test program: the source text that `start` tokenises and assembles
program:
  db "xor eax, eax", 0x0A
  db "inc rax", 0x0A
  db "mov [ rax ], rdx", 0x0A
  db "hlt", 0x0A
  db 0x00 ; just for the sake of being able to print it, I made it a string
  ; length of the program text, not counting the trailing 0x00 above. It is
  ; stored in a single byte, so readers must load exactly one byte and the text
  ; cannot exceed 255 bytes
  .size db $ - program - 1