; bootler/twasm/asm/main.asm

; TODO actually enforce any of these *_SIZE constants :p
;
; Fixed memory layout and sentinel values used throughout this file. Nothing
; below checks the *_SIZE limits yet (see the TODO above), so they currently
; only size the clear_* routines.

LOAD_ADDR equ 0x00010000 ; address this program is loaded at

TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
TEST_ARENA_SIZE equ 0x1000     ; maximum size tests can use

TOKEN_TABLE_ADDR equ 0x00060000 ; address the token table is loaded at
TOKEN_TABLE_SIZE equ 0x1000     ; max length of table, in bytes
TOKEN_TABLE_ENTRY_SIZE equ 2    ; size of token table entry; things may break
                                ; if this ever changes

OUTPUT_ADDR equ 0x00070000 ; address of the outputted binary
OUTPUT_SIZE equ 0x1000     ; max length of the outputted binary, in bytes

STACK_ADDR equ 0x00060000 ; address to put the 64-bit stack at. NOTE: this is
                          ; the same value as TOKEN_TABLE_ADDR; pushes store
                          ; below this address while the token table is filled
                          ; upwards from it, so the two regions abut

UNRECOGNISED_TOKEN_ID equ 0xFFFF  ; id of an unrecognised token
UNRECOGNISED_ID_TYPE equ 0x0F     ; type of an unrecognised id
UNRECOGNISED_ID_METADATA equ 0xFF ; metadata of an unrecognised id
UNRECOGNISED_ID_OPCODE equ 0x90   ; opcode of an unrecognised id (NOP)

TEST_LINE_LENGTH equ 80 ; right border of test suite results

[bits 64]
[org LOAD_ADDR]
[default abs]   ; TODO see if I actually need to do this
                ; afaik absolute addressing is not harmful on bare metal
                ; reasoning: stops annoying warning =D

; ------------------------------------------------------------------------------
; start
;
; description:
; entry point. Sets up the stack, prints the welcome message, runs the test
; suite, then tokenises and assembles the built-in test `program` before
; halting. Never returns.
; ------------------------------------------------------------------------------

start:
  mov rsp, STACK_ADDR ; we might need more stack space, let's just be safe

  mov rsi, msg_welcome
  call print

  call run_tests ; not defined in this file; presumably from asm/tests.asm

  call clear_token_table

  mov rdi, program               ; -> program
  movzx esi, byte [program.size] ; = size of program. `.size` is a single
                                 ; byte (db), so it must be zero-extended; a
                                 ; 64-bit load would also pick up the 7 bytes
                                 ; that happen to follow it in memory
  call tokenise
  ; rax = number of tokens processed

  push rax                ; clear_output_arena overwrites rax, rcx and rdi
  call clear_output_arena
  pop rdi                 ; rdi = number of tokens in the token table
  call assemble

  jmp halt

; ------------------------------------------------------------------------------
; assembling
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; assemble
; TODO write tests
; TODO make it work :/ putting the cart before the horse
;
; description:
; assembles the program from tokens located at TOKEN_TABLE_ADDR into a flat
; binary located at OUTPUT_ADDR. It's probably desirable to clear the output
; arena before calling this function.
;
; Only operator (type 0x01) and register (type 0x02) tokens are acted on; every
; other token is skipped. An operator with no operands is emitted as its opcode
; byte. An operator with operands is held as "pending" until its last register
; argument arrives, and is then emitted as opcode + register-direct ModR/M:
;   2 operands: first operand -> R/M field, second operand -> reg field
;   1 operand:  the operand   -> R/M field, reg field = 000b
; A register that arrives when no operator is waiting for it is dropped with a
; warning.
;
; The write position (.next_output_byte) is only initialised at load time, so
; repeated calls keep appending after the previous output.
;
; parameters:
; rdi = number of tokens in the token table
; ------------------------------------------------------------------------------

assemble:
  xor eax, eax ; rax = index of the next token to process
  .loop:
    cmp rax, rdi ; check incrementer against the number of tokens in the token
    jae .break   ; table (unsigned: both are counts). If reached, break

    push rdi ; total number of tokens
    push rax ; incrementer

    movzx edi, word [rax * TOKEN_TABLE_ENTRY_SIZE + TOKEN_TABLE_ADDR] ; next tte

    ; di = next tte
    call get_tte_type
    ; al = type of token
    cmp al, 0x01           ; check if next tte's type is an operator
    jne .continue_operator ; if not, jump past the case

  .operator: ; if next tte's type is an operator:
    push rdi
    ; di = tte
    call get_tte_typed_metadata
    ; al = tte typed metadata
    pop rdi

    and al, 11b             ; mask for # operands
    jnz .operator_with_args ; one or more operands: wait for the arguments

  .operator_0: ; no operands: the opcode is the whole instruction
    ; di = tte
    call get_opcode
    ; al = opcode
    call .output_byte
    jmp .continue_register ; an operator token can't also be a register

  .operator_with_args:
    mov [.pending_operator_num_args], al ; save # args fttb

    ; a new operator starts with no arguments collected; without this a
    ; leftover first argument of an unfinished operator would be mistaken for
    ; this operator's first argument
    mov word [.first_argument], UNRECOGNISED_TOKEN_ID

    ; di = tte
    call get_opcode
    ; al = opcode
    mov [.pending_operator_opcode], al ; save opcode fttb
    jmp .continue_register

  .continue_operator:
    cmp al, 0x02           ; check if next tte's type is a register
    jne .continue_register ; if not, jump past the case

  .register: ; if next tte's type is a register:
    call .dec_num_args ; because we've found an argument, we need 1 fewer

    cmp byte [.pending_operator_num_args], 1 ; check if this is 1st of 2 args
    je .register_one_of_two                  ; if so, jump to handler

    cmp byte [.pending_operator_num_args], 0 ; check if this is the last arg
    je .register_last                        ; if so, jump to handler
                                             ; note: not necessarily the last
                                             ; of 2 args, it could also be the
                                             ; last of 1

    ; otherwise, discard the token, reset things, and keep going :/
    push rsi
    mov rsi, .warn_unexpected_register
    call print.warn
    pop rsi
    call .reset_state
    jmp .continue_register

  .register_one_of_two: ; if it's the first of 2 arguments:
    mov [.first_argument], di ; remember it until the second one arrives
    jmp .continue_register

  .register_last: ; if it's the last argument:
    ; di = this (last) argument
    mov si, [.first_argument]     ; si = earlier argument, if there was one
    cmp si, UNRECOGNISED_TOKEN_ID ; check if an earlier argument was recorded
    jne .operator_finalise_2      ; if so, there are 2 arguments
                                  ; if not, there is just 1

  .operator_finalise_1:
    ; A lone operand is encoded in ModR/M.rm; ModR/M.reg carries the opcode
    ; extension instead of a register. Only extension /0 is produced here, by
    ; passing rax's token id (reg bits 000b) as `reg`.
    mov si, di   ; R/M = the operand
    xor edi, edi ; reg = id of rax. reg bits 000b

  .operator_finalise_2:
    ; di = tte for the reg field (second operand, or rax for 1 operand)
    ; si = tte for the R/M field (first operand)
    call get_direct_addressing_ModRM
    ; al = ModR/M byte
    push rax
    mov al, [.pending_operator_opcode]
    call .output_byte ; output operator's opcode
    pop rax

    call .output_byte ; output ModR/M byte

    call .reset_state ; reset all the state parts of this function

  .continue_register:
    pop rax ; incrementer
    pop rdi ; total number of tokens

    inc rax ; move to next token
    jmp .loop

  .break:
    ret

  ; constants

  .warn_unexpected_register db "ignoring unexpected register", 0x0A, 0x00

  ; procedures

  ; al = byte to write
  ; clobbers rdx
  .output_byte:
    mov edx, [.next_output_byte] ; get output byte's address (zero-extended)
    mov [rdx], al                ; write byte to that address
    inc edx                      ; increment address
    mov [.next_output_byte], edx ; put output byte's address
    ret

  ; runs dec on .pending_operator_num_args
  .dec_num_args:
    dec byte [.pending_operator_num_args]
    ret

  .reset_state:
    ; The size keywords are required: NASM does not record the size of a
    ; label's data. They must also match the variable's real size; a word
    ; store to the one-byte opcode slot would spill into the byte after it.
    mov byte [.pending_operator_opcode], UNRECOGNISED_ID_OPCODE
    mov byte [.pending_operator_num_args], 0x00
    mov word [.first_argument], UNRECOGNISED_TOKEN_ID
    ret

  ; state variables

  .pending_operator_opcode db 0x00   ; the operator seeking args
  .pending_operator_num_args db 0x00 ; # of args it still needs

  .first_argument dw UNRECOGNISED_TOKEN_ID ; first argument if there are two

  .next_output_byte dd OUTPUT_ADDR             ; next empty byte in output

; ------------------------------------------------------------------------------
; get_tte_type
;
; description:
; given a token table entry, returns the declared type in `tokens.by_id`. If
; there is no entry, returns UNRECOGNISED_ID_TYPE. The table is searched
; linearly; each entry is 4 bytes (id word, type byte, metadata byte).
;
; parameters:
; di = token table entry
;
; returned:
; al = type of token, or UNRECOGNISED_ID_TYPE. The upper 4 bits of al are
;      zeroed; the rest of rax is zeroed.
; rdi = input masked to its low 16 bits
; ------------------------------------------------------------------------------

get_tte_type:
  and edi, 0xFFFF ; mask input so it behaves as expected
  xor eax, eax    ; rax = index of the entry being examined

  .loop:
    ; valid indices are 0 .. count-1, so index == count is already out of
    ; range (comparing with "greater than" would read one entry too many)
    cmp rax, (tokens.by_id_end - tokens.by_id) / 4
    jae .not_found

    cmp [tokens.by_id + rax * 4], di ; does this entry's id match?
    je .found

    inc rax
    jmp .loop
  .not_found:
    mov eax, UNRECOGNISED_ID_TYPE & 0xF ; mask as expected
    ret
  .found:
    movzx eax, byte [2 + tokens.by_id + rax * 4] ; type byte follows the id
    and eax, 0xF ; mask as expected
    ret

; ------------------------------------------------------------------------------
; get_tte_typed_metadata
;
; description:
; given a token table entry, returns the declared typed metadata in
; `tokens.by_id`. If there is no entry, returns UNRECOGNISED_ID_METADATA. The
; table is searched linearly; each entry is 4 bytes (id word, type byte,
; metadata byte).
;
; parameters:
; di = token table entry
;
; returned:
; al = typed metadata of token, or UNRECOGNISED_ID_METADATA; the rest of rax is
;      zeroed.
; rdi = input masked to its low 16 bits
; ------------------------------------------------------------------------------

get_tte_typed_metadata:
  and edi, 0xFFFF ; mask input so it behaves as expected
  xor eax, eax    ; rax = index of the entry being examined

  .loop:
    ; valid indices are 0 .. count-1, so index == count is already out of
    ; range (comparing with "greater than" would read one entry too many)
    cmp rax, (tokens.by_id_end - tokens.by_id) / 4
    jae .not_found

    cmp [tokens.by_id + rax * 4], di ; does this entry's id match?
    je .found

    inc rax
    jmp .loop
  .not_found:
    mov eax, UNRECOGNISED_ID_METADATA
    ret
  .found:
    movzx eax, byte [3 + tokens.by_id + rax * 4] ; metadata is the 4th byte
    ret

; ------------------------------------------------------------------------------
; get_direct_addressing_ModRM
;
; description:
; builds the ModR/M byte for two register tokens with mod = 11b, i.e. both
; operands are registers (direct addressing). Thin wrapper around get_ModRM.
;
; parameters:
; di = token table entry `reg`
; si = token table entry `R/M`
;
; returned:
; al = ModR/M byte; the rest of rax is zeroed
; ------------------------------------------------------------------------------

get_direct_addressing_ModRM:
  mov dl, 11b   ; mod bits for get_ModRM
  jmp get_ModRM ; tail call: get_ModRM's ret returns straight to our caller

; ------------------------------------------------------------------------------
; get_ModRM
;
; description:
; packs the mod bits and the reg bits of two register tokens into one ModR/M
; byte: mod in bits 7..6, `reg` in bits 5..3, `R/M` in bits 2..0.
;
; parameters:
; di = token table entry `reg`
; si = token table entry `R/M`
; dl = lower 2 bits: mod bits. The rest is ignored
;
; returned:
; al = ModR/M byte; the rest of rax is zeroed
;
; also overwrites bl, cl, dl and rdi
; ------------------------------------------------------------------------------

get_ModRM:
  and dl, 11b
  shl dl, 6 ; dl = mod, moved up to bits 7..6

  ; di = tte for the reg field
  call get_reg_bits
  ; al = reg bits
  shl al, 3
  mov bl, al ; bl = reg, moved up to bits 5..3

  mov rdi, rsi ; now the R/M token
  ; di = tte for the R/M field
  call get_reg_bits
  ; al = reg bits
  mov cl, al ; cl = R/M, already in bits 2..0

  movzx eax, dl ; start from the mod bits
  or al, bl     ; merge reg bits
  or al, cl     ; merge R/M bits
  and eax, 0xFF ; keep just the byte
  ret

; ------------------------------------------------------------------------------
; get_opcode
;
; description:
; given an operator token, returns its opcode from `opcodes.by_id`. If there is
; no entry, returns UNRECOGNISED_ID_OPCODE. The table is searched linearly;
; each entry is 4 bytes (id word, opcode byte, reserved byte).
;
; parameters:
; di = token table entry
;
; returned:
; al = opcode; the rest of rax is zeroed
; rdi = input masked to its low 16 bits
; ------------------------------------------------------------------------------

get_opcode:
  and edi, 0xFFFF ; mask input so it behaves as expected
  xor eax, eax    ; rax = index of the entry being examined

  .loop:
    ; valid indices are 0 .. count-1, so index == count is already out of
    ; range (comparing with "greater than" would read one entry too many)
    cmp rax, (opcodes.by_id_end - opcodes.by_id) / 4
    jae .not_found

    cmp [opcodes.by_id + rax * 4], di ; does this entry's id match?
    je .found

    inc rax
    jmp .loop
  .not_found:
    mov eax, UNRECOGNISED_ID_OPCODE
    ret
  .found:
    movzx eax, byte [2 + opcodes.by_id + rax * 4] ; opcode follows the id
    ret

; ------------------------------------------------------------------------------
; get_reg_bits
;
; description:
; looks up a register token's typed metadata and extracts the 3-bit register
; number stored in bits 4..2 of it
;
; parameters:
; di = token table entry
;
; returned:
; al = reg bits of the register token; the rest of rax, including the upper 5
;      bits of al, are zeroed.
; ------------------------------------------------------------------------------

get_reg_bits:
  ; di = tte
  call get_tte_typed_metadata
  ; eax = typed metadata byte, zero-extended
  shr eax, 2    ; drop the low 2 bits (the width field of register entries)
  and eax, 111b ; keep only the reg bits
  ret

; ------------------------------------------------------------------------------
; tokenising
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; tokenise
; TODO write tests
;
; description:
; walks the program at the given address token by token and writes one 2-byte
; entry per token into the token table at TOKEN_TABLE_ADDR. It's probably
; desirable to clear the token table before calling this function.
;
; Entries written:
;   recognised token    -> its id from `tokens`
;   unrecognised token  -> UNRECOGNISED_TOKEN_ID
;   terminator byte     -> 0xFE in the high byte, the byte itself in the low
;   closing bracket     -> additionally rewrites the entry of the latest open
;                          bracket: high byte 0x10, low byte = number of
;                          entries between the two brackets
;
; Only bytes inside [rdi, rdi + rsi) start a token. The last token must still
; be followed by a terminator byte, because identify_next_token scans until it
; finds one.
;
; parameters:
; rdi -> first byte of program
; rsi = size of program in bytes
;
; returned:
; rax = number of tokens processed
; ------------------------------------------------------------------------------

tokenise:
  add rsi, rdi ; rsi -> one past the last byte of program
  xor ecx, ecx ; number of tokens processed
  .loop:
    cmp rdi, rsi ; if current byte is no longer inside the program
    jae .break   ; then break (unsigned compare: these are addresses)

    push rdi
    push rsi
    push rcx

    ; rdi -> current byte
    call identify_next_token
    ; ax = id of token
    ; dx = length of token

    pop rcx
    pop rsi
    pop rdi

    ; deal with terminator character (reported as 0 length token)
    test rdx, rdx
    jnz .continue0

  .token_length0:
    mov ax, 0xFE00 ; terminator character
    mov al, [rdi]  ; byte of terminator
    mov edx, 1     ; byte length is 1

  .continue0:
    add rdi, rdx ; current byte + length of token = next unread byte

    mov [TOKEN_TABLE_ADDR + rcx * TOKEN_TABLE_ENTRY_SIZE], ax ; fill next entry
                                                               ; in token table

    ; TODO fix undefined behaviour when open brackets and closed brackets aren't
    ; correctly paired or have too much distance between them
    cmp ax, 0x0051             ; check if read token is an open bracket
    jne .continue_open_bracket ; if not, continue

  .open_bracket:
    ; TODO make brackets able to hold more
    mov [.data_open_bracket], cl ; record which entry the open bracket is at

  .continue_open_bracket:
    cmp ax, 0x0052              ; check if read token is a closing bracket
    jne .continue_close_bracket ; if not, continue

  .close_bracket:
    ; rewrite open bracket token entry with a filled out one
    push rcx

    movzx edx, byte [.data_open_bracket] ; rdx = entry # of the open bracket
    sub cl, dl                           ; cl = entries since the open bracket
    mov byte [TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], cl
    mov byte [1 + TOKEN_TABLE_ADDR + rdx * TOKEN_TABLE_ENTRY_SIZE], 0x10

    pop rcx

  .continue_close_bracket:
    inc rcx ; +1 token processed
    jmp .loop
  .break:
    mov rax, rcx
    ret

  .data_open_bracket db 0x00 ; represents the token # of the latest open bracket

; ------------------------------------------------------------------------------
; identify_token
;
; description:
; returns the id of a given token. If there are multiple ways to represent a
; given token, like the open-bracket, it returns the one that doesn't require
; information about the surrounding tokens, because it has no such information.
; In other words, if it isn't in the `tokens` data structure, this function
; doesn't see it. Tokens whose length is not 1 to 4 bytes (including length 0,
; which is how a terminator byte is reported) are returned as unrecognised.
;
; Each `tokens.by_name_N` table holds entries of N name bytes followed by a
; 2-byte id, and ends where the next table begins. Exactly `rsi` bytes of the
; token are read.
;
; parameters:
; rdi -> first byte of token
; rsi = size of token in bytes
;
; returned:
; ax = id of token; the rest of rax is zeroed
;
; also overwrites rcx and r10
; ------------------------------------------------------------------------------

identify_token:
  cmp rsi, 1        ; if the token has length 1
  je .start_length1 ; then enter the length 1 loop

  cmp rsi, 2        ; if the token has length 2
  je .start_length2 ; then enter the length 2 loop

  cmp rsi, 3        ; if the token has length 3
  je .start_length3 ; then enter the length 3 loop

  cmp rsi, 4        ; if the token has length 4
  je .start_length4 ; then enter the length 4 loop

  jmp .unrecognised ; else unrecognised

  ; length1
  .start_length1:
    mov rcx, tokens.by_name_1 ; rcx -> list of known tokens

  .loop_length1:
    cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens
    jae .unrecognised         ; if not, unrecognised

    mov r10b, [rcx]   ; known token
    cmp r10b, [rdi]   ; if known token matches token
    je .found_length1 ; exit loop

    add rcx, 3 ; length of token + length of id
    jmp .loop_length1

  .found_length1:
    movzx eax, word [rcx + 1] ; return id of token, rest of rax zeroed
    ret

  ; length2
  .start_length2:
    mov rcx, tokens.by_name_2 ; rcx -> list of known tokens

  .loop_length2:
    cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens
    jae .unrecognised         ; if not, unrecognised

    mov r10w, [rcx]   ; current entry in known tokens
    cmp r10w, [rdi]   ; if current entry matches token,
    je .found_length2 ; exit loop

    add rcx, 4 ; length of token + length of id
    jmp .loop_length2

  .found_length2:
    movzx eax, word [rcx + 2] ; return id of token, rest of rax zeroed
    ret

  ; length3
  .start_length3:
    mov rcx, tokens.by_name_3 ; rcx -> list of known tokens

  .loop_length3:
    cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens
    jae .unrecognised         ; if not, unrecognised

    ; compare as word + byte so that no byte beyond the 3-byte token is read
    mov r10w, [rcx]     ; first two bytes of known token
    cmp r10w, [rdi]     ; if they differ from the token's,
    jne .next_length3   ; try the next entry

    mov r10b, [rcx + 2] ; third byte of known token
    cmp r10b, [rdi + 2] ; if it matches as well,
    je .found_length3   ; exit loop

  .next_length3:
    add rcx, 5 ; length of token + length of id
    jmp .loop_length3

  .found_length3:
    movzx eax, word [rcx + 3] ; return id of token, rest of rax zeroed
    ret

  ; length4
  .start_length4:
    mov rcx, tokens.by_name_4 ; rcx -> list of known tokens

  .loop_length4:
    cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length4 tokens
    jae .unrecognised         ; if not, unrecognised

    mov r10d, [rcx]   ; known token
    cmp r10d, [rdi]   ; if known token matches token,
    je .found_length4 ; exit loop

    add rcx, 6 ; length of token + length of id
    jmp .loop_length4

  .found_length4:
    movzx eax, word [rcx + 4] ; return id of token, rest of rax zeroed
    ret

  .unrecognised:
    mov eax, UNRECOGNISED_TOKEN_ID
    ret

; ------------------------------------------------------------------------------
; identify_next_token
; description:
; measures the token starting at rdi by counting bytes until one of the 8 bytes
; in `token_terminator_8` is met, then identifies it with identify_token. A
; token that starts on a terminator byte therefore has a length of 0.
;
; parameters:
; rdi -> first byte of token
;
; returned:
; ax = id of token; the rest of rax is zeroed
; dx = length of token in bytes; the rest of rdx is zeroed
;
; rdi is preserved; rsi also ends up holding the length
; ------------------------------------------------------------------------------

identify_next_token:
  push rdi ; keep -> first byte of token for identify_token

  mov rsi, rdi ; rsi -> byte being inspected
  xor edi, edi ; rdi = number of bytes counted so far
  .loop:
    movzx edx, byte [rsi] ; dl = byte being inspected

    ; elemb takes its arguments in rdi/rsi, so park the scan state meanwhile
    push rsi
    push rdi
    push rdx

    mov edi, 8                  ; length of terminator list
    mov rsi, token_terminator_8 ; start of terminator list
    call elemb
    ; rax = 1 if dl is a terminator, 0 otherwise

    pop rdx
    pop rdi
    pop rsi

    test rax, rax ; did we land on a token terminator?
    jnz .break    ; if so, the token ends here

    inc rsi ; next byte of token
    inc rdi ; one more byte counted
    jmp .loop

  .break:
    mov rsi, rdi ; rsi = length of token

  pop rdi ; rdi -> first byte of token again

  push rsi ; the length has to survive the call
  call identify_token
  ; ax = id of token
  pop rdx      ; rdx = length
  mov rsi, rdx ; rsi = length as well
  ret

; ------------------------------------------------------------------------------
; utilities
; ------------------------------------------------------------------------------

; ------------------------------------------------------------------------------
; print
;
; description:
; writes a null-terminated string, one byte at a time, to I/O port 0x3F8
; (conventionally the first serial port's data register). rdx, rax and rsi are
; saved and restored for ease of debugging.
;
; print.debug, print.error, print.test and print.warn write a fixed prefix
; ("[DEBUG]: ", "[ERROR]: ", "[TEST]: ", "[WARN]: ") and then the string.
;
; parameters:
; rsi -> start of null-terminated string
; ------------------------------------------------------------------------------

print:
  push rdx
  push rax
  push rsi

  mov edx, 0x3F8 ; port to write to
  jmp .next
  .emit:
    out dx, al ; send the byte
    inc rsi    ; and step to the one after it
  .next:
    mov al, [rsi]
    test al, al ; a zero byte ends the string
    jnz .emit

  pop rsi
  pop rax
  pop rdx
  ret

  ; each variant below prints its prefix first, then tail-calls print for the
  ; caller's string, so print's ret returns to the original caller
  .debug:
    push rsi
    mov rsi, .debug_msg
    call print
    pop rsi
    jmp print
  .error:
    push rsi
    mov rsi, .error_msg
    call print
    pop rsi
    jmp print
  .test:
    push rsi
    mov rsi, .test_msg
    call print
    pop rsi
    jmp print
  .warn:
    push rsi
    mov rsi, .warn_msg
    call print
    pop rsi
    jmp print

  .debug_msg db "[DEBUG]: ", 0x00
  .error_msg db "[ERROR]: ", 0x00
  .test_msg db "[TEST]: ", 0x00
  .warn_msg db "[WARN]: ", 0x00

; ------------------------------------------------------------------------------
; halt
;
; description:
; prints the "halted." message and executes hlt. If execution ever resumes
; after the hlt, it prints and halts again. Never returns.
; ------------------------------------------------------------------------------

halt:
  .again:
    mov rsi, msg_halt
    call print
    hlt
    jmp .again ; resumed after hlt: report and halt once more

; ------------------------------------------------------------------------------
; elemb
;
; description:
; linear search: reports whether a byte occurs in a list of bytes
;
; parameters:
; rdi = size of list
; rsi -> start of list
; dl =  given byte
;
; returned:
; rax = 0: is not an element
;       1: is an element
;
; rsi and rdi are advanced/decremented while searching
; ------------------------------------------------------------------------------

elemb:
  jmp .check
  .advance:
    inc rsi ; move to next byte
    dec rdi ; and reduce remaining length
  .check:
    test rdi, rdi ; nothing left to look at?
    jz .not_found ; then dl is not an element of list

    cmp [rsi], dl ; is the current byte in list the desired byte?
    jne .advance  ; if not, keep looking

  .found:
    mov eax, 1 ; return 1; dl an element of list
    ret

  .not_found:
    xor eax, eax ; return 0; dl not an element of list
    ret

  ; not referenced by elemb itself
  .f db "found", 0x0A, 0x00
  .nf db "not found", 0x0A, 0x00

; ------------------------------------------------------------------------------
; clear_token_table
;
; description:
; zero-fills the TOKEN_TABLE_SIZE bytes starting at TOKEN_TABLE_ADDR, one
; double word at a time. Overwrites rax, rcx and rdi.
; ------------------------------------------------------------------------------

clear_token_table:
  mov rdi, TOKEN_TABLE_ADDR     ; where to start writing
  mov ecx, TOKEN_TABLE_SIZE / 4 ; how many double words to write
  xor eax, eax                  ; what to write
  rep stosd                     ; assumes the direction flag is clear
  ret

; ------------------------------------------------------------------------------
; clear_test_arena
;
; description:
; clears the test arena as specified by TEST_ARENA_SIZE and TEST_ARENA_ADDR.
; Overwrites rax, rcx and rdi.
; ------------------------------------------------------------------------------

clear_test_arena:
  xor eax, eax                 ; value to write
  mov rcx, TEST_ARENA_SIZE / 4 ; number of double words
  mov rdi, TEST_ARENA_ADDR     ; address to start (the test arena, not the
                               ; token table)
  rep stosd
  ret

; ------------------------------------------------------------------------------
; clear_output_arena
;
; description:
; zero-fills the OUTPUT_SIZE bytes starting at OUTPUT_ADDR, one double word at
; a time. Overwrites rax, rcx and rdi.
; ------------------------------------------------------------------------------

clear_output_arena:
  mov rdi, OUTPUT_ADDR     ; where to start writing
  mov ecx, OUTPUT_SIZE / 4 ; how many double words to write
  xor eax, eax             ; what to write
  rep stosd                ; assumes the direction flag is clear
  ret

%include "asm/tests.asm"

; ------------------------------------------------------------------------------
; data
; ------------------------------------------------------------------------------

; `tokens` holds the name -> id tables (.by_name_N, one per name length N)
; followed by the id -> type/metadata table (.by_id).
tokens:
  ; 1-character tokens. Each entry is 1 name byte followed by a 2-byte id (3
  ; bytes); identify_token steps through the entries until it reaches
  ; .by_name_2. tokenise gives 0x0051 and 0x0052 (the brackets) special
  ; treatment.
  .by_name_1:
    db "["
    dw 0x0051
    db "]"
    dw 0x0052
    db "+"
    dw 0x0062
    db "-"
    dw 0x0063
    db "*"
    dw 0x0064
    db "/"
    dw 0x0065
  ; 2-character tokens. Each entry is 2 name bytes followed by a 2-byte id (4
  ; bytes); identify_token steps through the entries until it reaches
  ; .by_name_3.
  .by_name_2:
    db "r8"
    dw 0x0008
    db "r9"
    dw 0x0009
    db "ax"
    dw 0x0020
    db "bx"
    dw 0x0021
    db "cx"
    dw 0x0022
    db "dx"
    dw 0x0023
    db "si"
    dw 0x0024
    db "di"
    dw 0x0025
    db "sp"
    dw 0x0026
    db "bp"
    dw 0x0027
    db "al"
    dw 0x0030
    db "bl"
    dw 0x0031
    db "cl"
    dw 0x0032
    db "dl"
    dw 0x0033
    db "ah"
    dw 0x0040
    db "bh"
    dw 0x0041
    db "ch"
    dw 0x0042
    db "dh"
    dw 0x0043
    db "cs"
    dw 0x0044
    db "ds"
    dw 0x0045
    db "es"
    dw 0x0046
    db "fs"
    dw 0x0047
    db "gs"
    dw 0x0048
    db "ss"
    dw 0x0049
    db "je"
    dw 0x005C
    db "jg"
    dw 0x005F
    db "jl"
    dw 0x0061
  ; 3-character tokens. Each entry is 3 name bytes followed by a 2-byte id (5
  ; bytes); identify_token steps through the entries until it reaches
  ; .by_name_4.
  .by_name_3:
    db "rax"
    dw 0x0000
    db "rbx"
    dw 0x0001
    db "rcx"
    dw 0x0002
    db "rdx"
    dw 0x0003
    db "rsi"
    dw 0x0004
    db "rdi"
    dw 0x0005
    db "rsp"
    dw 0x0006
    db "rbp"
    dw 0x0007
    db "r10"
    dw 0x000A
    db "r11"
    dw 0x000B
    db "r12"
    dw 0x000C
    db "r13"
    dw 0x000D
    db "r14"
    dw 0x000E
    db "r15"
    dw 0x000F
    db "eax"
    dw 0x0010
    db "ebx"
    dw 0x0011
    db "ecx"
    dw 0x0012
    db "edx"
    dw 0x0013
    db "esi"
    dw 0x0014
    db "edi"
    dw 0x0015
    db "esp"
    dw 0x0016
    db "ebp"
    dw 0x0017
    db "r8d"
    dw 0x0018
    db "r9d"
    dw 0x0019
    db "r8w"
    dw 0x0028
    db "r9w"
    dw 0x0029
    db "sil"
    dw 0x0034
    db "dil"
    dw 0x0035
    db "spl"
    dw 0x0036
    db "bpl"
    dw 0x0037
    db "r8b"
    dw 0x0038
    db "r9b"
    dw 0x0039
    db "cr0"
    dw 0x004A
    db "cr2"
    dw 0x004B
    db "cr3"
    dw 0x004C
    db "cr4"
    dw 0x004D
    db "cr8"
    dw 0x004E
    db "hlt"
    dw 0x004F
    db "xor"
    dw 0x0053
    db "inc"
    dw 0x0054
    db "dec"
    dw 0x0055
    db "mov"
    dw 0x0056
    db "add"
    dw 0x0057
    db "sub"
    dw 0x0058
    db "ret"
    dw 0x005A
    db "cmp"
    dw 0x005B
    db "jne"
    dw 0x005D
    db "jge"
    dw 0x005E
    db "jle"
    dw 0x0060
  ; 4-character tokens. Each entry is 4 name bytes followed by a 2-byte id (6
  ; bytes); identify_token steps through the entries until it reaches
  ; .by_name_5.
  .by_name_4:
    db "r10d"
    dw 0x001A
    db "r11d"
    dw 0x001B
    db "r12d"
    dw 0x001C
    db "r13d"
    dw 0x001D
    db "r14d"
    dw 0x001E
    db "r15d"
    dw 0x001F
    db "r10w"
    dw 0x002A
    db "r11w"
    dw 0x002B
    db "r12w"
    dw 0x002C
    db "r13w"
    dw 0x002D
    db "r14w"
    dw 0x002E
    db "r15w"
    dw 0x002F
    db "r10b"
    dw 0x003A
    db "r11b"
    dw 0x003B
    db "r12b"
    dw 0x003C
    db "r13b"
    dw 0x003D
    db "r14b"
    dw 0x003E
    db "r15b"
    dw 0x003F
    db "int3"
    dw 0x0050
    db "call"
    dw 0x0059
  ; .by_name_5 is empty; it only marks where the 4-character table ends.
  .by_name_5:
  ; id -> type and typed metadata. Each entry is 4 bytes:
  ;   word id, byte type, byte typed metadata
  ; and is searched linearly by get_tte_type and get_tte_typed_metadata. An id
  ; without an entry here is reported with the UNRECOGNISED_ID_* values.
  ;
  ; type 0x01 (operator): the low 2 bits of the metadata are the # of operands
  ; type 0x02 (register): metadata bits 4..2 are the reg bits, bits 1..0 the
  ;                       width
  .by_id:
    dw 0x0010    ; eax
    db 0x02      ; type: register
    db 00000010b ; reg: 000b
                 ; width: 10b (32 bits)

    dw 0x0000    ; rax
    db 0x02      ; type: register
    db 00000011b ; reg: 000b
                 ; width: 11b (64 bits)

    dw 0x0003    ; rdx
    db 0x02      ; type: register
    db 00001011b ; reg: 010b
                 ; width: 11b (64 bits)

    dw 0x0053 ; xor
    db 0x01   ; type: operator
    db 0x02   ; # operands

    dw 0x0054 ; inc
    db 0x01   ; type: operator
    db 0x01   ; # operands

    dw 0x0056 ; mov
    db 0x01   ; type: operator
    db 0x02   ; # operands

    dw 0x004F ; hlt
    db 0x01   ; type: operator
    db 0x00   ; # operands
  .by_id_end:

; operator id -> opcode byte. Each entry is 4 bytes:
;   word id, byte opcode, byte reserved
; and is searched linearly by get_opcode. An id without an entry here is
; reported as UNRECOGNISED_ID_OPCODE.
opcodes:
  .by_id:
    dw 0x0053 ; xor
    db 0x31
    db 0x00   ; reserved

    dw 0x0054 ; inc
    db 0xFF
    db 0x00   ; reserved

    dw 0x0056 ; mov
    db 0x89
    db 0x00   ; reserved

    dw 0x004F ; hlt
    db 0xF4
    db 0x00   ; reserved
  .by_id_end:

; null-terminated strings for print
msg_welcome db "Welcome to Twasm", 0x0A, 0x00 ; printed by start
msg_halt db "halted.", 0x0A, 0x00             ; printed by halt

; bytes that end a token: null, space, line feed, carriage return and comma.
; identify_next_token always searches 8 bytes, so the list is padded with nulls.
token_terminator_8 db 0x00, " ", 0x0A, 0x0D, ",", 0x00, 0x00, 0x00

debug_string db "debug_string", 0x0A, 0x00 ; not referenced in this file

; test program
; the source text that start tokenises and assembles
program:
  db "xor eax, eax", 0x0A
  db "inc rax", 0x0A
  db "mov [ rax ], rdx", 0x0A
  db "hlt", 0x0A
  db 0x00 ; just for the sake of being able to print it, I made it a string
  ; length of the text above, not counting the final 0x00. NOTE: stored as a
  ; single byte, so it has to be read as a byte and cannot describe more than
  ; 255 bytes of program
  .size db $ - program - 1