clear up internal data structures, add to README
This commit is contained in:
@@ -48,6 +48,99 @@ each token gets loaded into the token table with the following form:
|
|||||||
+----------+
|
+----------+
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### internal data structures
|
||||||
|
|
||||||
|
#### `tokens.by_nameX`
|
||||||
|
|
||||||
|
contains all tokens of that length, each followed by its ID. For any non-empty `tokens.by_nameX`, `tokens.by_name<X+1> - tokens.by_nameX` is the size in bytes of `tokens.by_nameX`.
|
||||||
|
|
||||||
|
each entry is in the following form:
|
||||||
|
|
||||||
|
```
|
||||||
|
+----------+--------------------------------+
|
||||||
|
|[2 bytes] | 8 * token_length - 1 0 |
|
||||||
|
+----------+--------------------------------+
|
||||||
|
| token ID | string without null terminator |
|
||||||
|
+----------+--------------------------------+
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
example implementation:
|
||||||
|
|
||||||
|
```nasm
|
||||||
|
tokens:
|
||||||
|
.by_name1:
|
||||||
|
db "+"
|
||||||
|
dw 0x0062
|
||||||
|
db "-"
|
||||||
|
dw 0x0063
|
||||||
|
.by_name2:
|
||||||
|
db "r8"
|
||||||
|
dw 0x0008
|
||||||
|
.by_name3: ; this is required for futureproofness; the caller can use this to
|
||||||
|
; find the size of tokens.by_name2
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `tokens.by_id`
|
||||||
|
|
||||||
|
contains metadata for some tokens. Tokens that carry embedded information (`0x10XX`, for instance) do not have entries in this table; they are handled inside the `assemble` function itself.
|
||||||
|
|
||||||
|
each entry holds the metadata for one token, in the following form:
|
||||||
|
|
||||||
|
```
|
||||||
|
+----------------+----------+-------+----------+
|
||||||
|
| 31 24 | 23 20 | 19 16 | 15 0 |
|
||||||
|
+----------------+----------+-------+----------+
|
||||||
|
| typed metadata | reserved | type | token ID |
|
||||||
|
+----------------+----------+-------+----------+
|
||||||
|
```
|
||||||
|
|
||||||
|
the `type` hex digit is defined as follows:
|
||||||
|
|
||||||
|
| hex | meaning | examples |
|
||||||
|
|-----|----------|-|
|
||||||
|
| 0x0 | ignored | `; this entire comment is 1 token` |
|
||||||
|
| 0x1 | operator | `mov`, `hlt` |
|
||||||
|
| 0x2 | register | `rsp`, `al` |
|
||||||
|
|
||||||
|
the typed metadata for each type is laid out as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
+----------+
|
||||||
|
| type 0x0 |
|
||||||
|
+----------+
|
||||||
|
| 31 24 |
|
||||||
|
+----------+
|
||||||
|
| reserved |
|
||||||
|
+----------+
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
+-------------------------------+
|
||||||
|
| type 0x1 |
|
||||||
|
+----------+--------------------+
|
||||||
|
| 31 26 | 25 24 |
|
||||||
|
+----------+--------------------+
|
||||||
|
| reserved | number of operands |
|
||||||
|
+----------+--------------------+
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
+------------------+
|
||||||
|
| type 0x2 |
|
||||||
|
+----------+-------+
|
||||||
|
| 31 26 | 25 24 |
|
||||||
|
+----------+-------+
|
||||||
|
| reserved | width |
|
||||||
|
+----------+-------+
|
||||||
|
|
||||||
|
; width:
|
||||||
|
00b ; 8 bit
|
||||||
|
01b ; 16 bit
|
||||||
|
10b ; 32 bit
|
||||||
|
11b ; 64 bit
|
||||||
|
```
|
||||||
|
|
||||||
### token IDs
|
### token IDs
|
||||||
|
|
||||||
supported tokens are listed below
|
supported tokens are listed below
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
; TODO actually enforce any of these *_SIZE constants :p
|
||||||
|
|
||||||
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
|
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
|
||||||
|
|
||||||
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
|
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
|
||||||
@@ -34,8 +36,40 @@ start:
|
|||||||
mov rsi, [program.size] ; = size of program
|
mov rsi, [program.size] ; = size of program
|
||||||
call tokenise
|
call tokenise
|
||||||
|
|
||||||
|
call clear_output_arena
|
||||||
|
call assemble
|
||||||
|
|
||||||
jmp halt
|
jmp halt
|
||||||
|
|
||||||
|
; ------------------------------------------------------------------------------
|
||||||
|
; assembling
|
||||||
|
; ------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
; ------------------------------------------------------------------------------
; assemble
; TODO write tests
;
; description:
; assembles the program from tokens located at TOKEN_TABLE_ADDR into a flat
; binary located at OUTPUT_ADDR. It's probably desirable to clear the output
; arena before calling this function.
;
; NOTE: at the moment this only walks the token table entry by entry; the loop
; has no per-token work in it yet.
;
; parameters:
; none
;
; clobbers:
; rax, flags
; ------------------------------------------------------------------------------

assemble:
    xor eax, eax                    ; rax = index of the next token to process
                                    ; (a 32-bit write zeroes all of rax)
.loop:
    cmp rax, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE  ; check the index
                                                        ; against the number of
                                                        ; entries in the token
                                                        ; table
    jae .break                      ; valid indices are 0..count-1, so stop as
                                    ; soon as rax reaches the count; unsigned
                                    ; compare because rax is a count

    ; TODO process the token table entry at index rax

    inc rax                         ; move to next token
    jmp .loop
.break:
    ret
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; tokenising
|
; tokenising
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
@@ -46,7 +80,7 @@ start:
|
|||||||
;
|
;
|
||||||
; description:
|
; description:
|
||||||
; represents the program at the given address and puts it in the token table
|
; represents the program at the given address and puts it in the token table
|
||||||
; it's probably desirable to clear the token table before calling this function
|
; it's probably desirable to clear the token table before calling this function.
|
||||||
;
|
;
|
||||||
; parameters:
|
; parameters:
|
||||||
; rdi -> first byte of program
|
; rdi -> first byte of program
|
||||||
@@ -156,10 +190,10 @@ identify_token:
|
|||||||
|
|
||||||
; length1
|
; length1
|
||||||
.start_length1:
|
.start_length1:
|
||||||
mov rcx, tokens.length1 ; rcx -> list of known tokens
|
mov rcx, tokens.by_name_1 ; rcx -> list of known tokens
|
||||||
|
|
||||||
.loop_length1:
|
.loop_length1:
|
||||||
cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
|
cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens
|
||||||
jge .unrecognised ; if not, unrecognised
|
jge .unrecognised ; if not, unrecognised
|
||||||
|
|
||||||
mov r10b, [rcx] ; known token
|
mov r10b, [rcx] ; known token
|
||||||
@@ -177,10 +211,10 @@ identify_token:
|
|||||||
|
|
||||||
; length2
|
; length2
|
||||||
.start_length2:
|
.start_length2:
|
||||||
mov rcx, tokens.length2 ; rcx -> list of known tokens
|
mov rcx, tokens.by_name_2 ; rcx -> list of known tokens
|
||||||
|
|
||||||
.loop_length2:
|
.loop_length2:
|
||||||
cmp rcx, tokens.length3 ; check if rcx still in the bounds of length2 tokens
|
cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens
|
||||||
jge .unrecognised ; if not, unrecognised
|
jge .unrecognised ; if not, unrecognised
|
||||||
|
|
||||||
mov r10w, [rcx] ; current entry in known tokens
|
mov r10w, [rcx] ; current entry in known tokens
|
||||||
@@ -198,10 +232,10 @@ identify_token:
|
|||||||
|
|
||||||
; length3
|
; length3
|
||||||
.start_length3:
|
.start_length3:
|
||||||
mov rcx, tokens.length3 ; rcx -> list of known tokens
|
mov rcx, tokens.by_name_3 ; rcx -> list of known tokens
|
||||||
|
|
||||||
.loop_length3:
|
.loop_length3:
|
||||||
cmp rcx, tokens.length4 ; check if rcx still in bounds of length3 tokens
|
cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens
|
||||||
jge .unrecognised ; if not, unrecognised
|
jge .unrecognised ; if not, unrecognised
|
||||||
|
|
||||||
; TODO make this safe (it overreaches 1 byte)
|
; TODO make this safe (it overreaches 1 byte)
|
||||||
@@ -224,10 +258,10 @@ identify_token:
|
|||||||
|
|
||||||
; length4
|
; length4
|
||||||
.start_length4:
|
.start_length4:
|
||||||
mov rcx, tokens.length4 ; rcx -> list of known tokens
|
mov rcx, tokens.by_name_4 ; rcx -> list of known tokens
|
||||||
|
|
||||||
.loop_length4:
|
.loop_length4:
|
||||||
cmp rcx, tokens.length5 ; check if rcx still in bounds of length3 tokens
|
cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length3 tokens
|
||||||
jge .unrecognised ; if not, unrecognised
|
jge .unrecognised ; if not, unrecognised
|
||||||
|
|
||||||
mov r10d, [rcx] ; known token
|
mov r10d, [rcx] ; known token
|
||||||
@@ -483,6 +517,20 @@ clear_test_arena:
|
|||||||
rep stosd
|
rep stosd
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
; ------------------------------------------------------------------------------
; clear_output_arena
;
; description:
; zero-fills the output arena: writes OUTPUT_SIZE / 4 double words of zero,
; starting at OUTPUT_ADDR
;
; clobbers:
; rax, rcx, rdi, flags
;
; NOTE(review): rep stosd only walks upwards from rdi when the direction flag
; is clear; nothing in this routine clears it, so that is assumed of the
; caller - TODO confirm
; ------------------------------------------------------------------------------

clear_output_arena:
    mov rdi, OUTPUT_ADDR            ; rdi -> first double word to overwrite
    mov rcx, OUTPUT_SIZE / 4        ; rcx = how many double words to store
    xor eax, eax                    ; eax = fill value (zero)
    rep stosd                       ; store eax at [rdi], rcx times
    ret
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
; tests
|
; tests
|
||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
@@ -865,7 +913,7 @@ test_identify_next_token:
|
|||||||
; ------------------------------------------------------------------------------
|
; ------------------------------------------------------------------------------
|
||||||
|
|
||||||
tokens:
|
tokens:
|
||||||
.length1:
|
.by_name_1:
|
||||||
db "["
|
db "["
|
||||||
dw 0x0051
|
dw 0x0051
|
||||||
db "]"
|
db "]"
|
||||||
@@ -878,7 +926,7 @@ tokens:
|
|||||||
dw 0x0064
|
dw 0x0064
|
||||||
db "/"
|
db "/"
|
||||||
dw 0x0065
|
dw 0x0065
|
||||||
.length2:
|
.by_name_2:
|
||||||
db "r8"
|
db "r8"
|
||||||
dw 0x0008
|
dw 0x0008
|
||||||
db "r9"
|
db "r9"
|
||||||
@@ -933,7 +981,7 @@ tokens:
|
|||||||
dw 0x005F
|
dw 0x005F
|
||||||
db "jl"
|
db "jl"
|
||||||
dw 0x0061
|
dw 0x0061
|
||||||
.length3:
|
.by_name_3:
|
||||||
db "rax"
|
db "rax"
|
||||||
dw 0x0000
|
dw 0x0000
|
||||||
db "rbx"
|
db "rbx"
|
||||||
@@ -1032,7 +1080,7 @@ tokens:
|
|||||||
dw 0x005E
|
dw 0x005E
|
||||||
db "jle"
|
db "jle"
|
||||||
dw 0x0060
|
dw 0x0060
|
||||||
.length4:
|
.by_name_4:
|
||||||
db "r10d"
|
db "r10d"
|
||||||
dw 0x001A
|
dw 0x001A
|
||||||
db "r11d"
|
db "r11d"
|
||||||
@@ -1073,8 +1121,8 @@ tokens:
|
|||||||
dw 0x0050
|
dw 0x0050
|
||||||
db "call"
|
db "call"
|
||||||
dw 0x0059
|
dw 0x0059
|
||||||
.length5:
|
.by_name_5:
|
||||||
.end:
|
.by_id:
|
||||||
|
|
||||||
msg_welcome db "Welcome to Twasm", 0x0A, 0x00
|
msg_welcome db "Welcome to Twasm", 0x0A, 0x00
|
||||||
msg_halt db "halted.", 0x0A, 0x00
|
msg_halt db "halted.", 0x0A, 0x00
|
||||||
|
|||||||
Reference in New Issue
Block a user