clear up internal data structures, add to README
This commit is contained in:
@@ -48,6 +48,99 @@ each token gets loaded into the token table with the following form:
|
||||
+----------+
|
||||
```
|
||||
|
||||
### internal data structures
|
||||
|
||||
#### `tokens.by_nameX`
|
||||
|
||||
contains all tokens of length `X`, each followed by its ID. For every non-empty `tokens.by_nameX`, `tokens.by_name<X+1> - tokens.by_nameX` is the size in bytes of `tokens.by_nameX`.
|
||||
|
||||
each entry is in the following form:
|
||||
|
||||
```
|
||||
+----------+--------------------------------+
|
||||
|[2 bytes] | 8 * token_length - 1 0 |
|
||||
+----------+--------------------------------+
|
||||
| token ID | string without null terminator |
|
||||
+----------+--------------------------------+
|
||||
|
||||
```
|
||||
|
||||
example implementation:
|
||||
|
||||
```nasm
|
||||
tokens:
|
||||
.by_name1:
|
||||
db "+"
|
||||
dw 0x0062
|
||||
db "-"
|
||||
dw 0x0063
|
||||
.by_name2:
|
||||
db "r8"
|
||||
dw 0x0008
|
||||
.by_name3: ; this is required for future-proofing; the caller can use this to
|
||||
; find the size of tokens.by_name2
|
||||
```
|
||||
|
||||
#### `tokens.by_id`
|
||||
|
||||
contains metadata for some tokens. Tokens with embedded information (`0x10XX`, for instance) do not have entries in this table; they are handled inside the `assemble` function itself instead.
|
||||
|
||||
each entry is in the following form:
|
||||
|
||||
```
|
||||
+----------------+----------+-------+----------+
|
||||
| 31 24 | 23 20 | 19 16 | 15 0 |
|
||||
+----------------+----------+-------+----------+
|
||||
| typed metadata | reserved | type | token ID |
|
||||
+----------------+----------+-------+----------+
|
||||
```
|
||||
|
||||
the `type` hex digit is defined as follows:
|
||||
|
||||
| hex | meaning | examples |
|
||||
|-----|----------|-|
|
||||
| 0x0 | ignored | `; this entire comment is 1 token` |
|
||||
| 0x1 | operator | `mov`, `hlt` |
|
||||
| 0x2 | register | `rsp`, `al` |
|
||||
|
||||
the `typed metadata` field for each type is laid out as follows:
|
||||
|
||||
```
|
||||
+----------+
|
||||
| type 0x0 |
|
||||
+----------+
|
||||
| 31 24 |
|
||||
+----------+
|
||||
| reserved |
|
||||
+----------+
|
||||
```
|
||||
|
||||
```
|
||||
+-------------------------------+
|
||||
| type 0x1 |
|
||||
+----------+--------------------+
|
||||
| 31 26 | 25 24 |
|
||||
+----------+--------------------+
|
||||
| reserved | number of operands |
|
||||
+----------+--------------------+
|
||||
```
|
||||
|
||||
```
|
||||
+------------------+
|
||||
| type 0x2 |
|
||||
+----------+-------+
|
||||
| 31 26 | 25 24 |
|
||||
+----------+-------+
|
||||
| reserved | width |
|
||||
+----------+-------+
|
||||
|
||||
; width:
|
||||
00b ; 8 bit
|
||||
01b ; 16 bit
|
||||
10b ; 32 bit
|
||||
11b ; 64 bit
|
||||
```
|
||||
|
||||
### token IDs
|
||||
|
||||
supported tokens are listed below
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
; TODO actually enforce any of these *_SIZE constants :p
|
||||
|
||||
LOAD_ADDR equ 0x00010000 ; address this program is loaded at
|
||||
|
||||
TEST_ARENA_ADDR equ 0x00050000 ; address to run tests at
|
||||
@@ -34,8 +36,40 @@ start:
|
||||
mov rsi, [program.size] ; = size of program
|
||||
call tokenise
|
||||
|
||||
call clear_output_arena
|
||||
call assemble
|
||||
|
||||
jmp halt
|
||||
|
||||
; ------------------------------------------------------------------------------
|
||||
; assembling
|
||||
; ------------------------------------------------------------------------------
|
||||
|
||||
; ------------------------------------------------------------------------------
; assemble
; TODO write tests
;
; description:
; assembles the program from tokens located at TOKEN_TABLE_ADDR into a flat
; binary located at OUTPUT_ADDR. It's probably desirable to clear the output
; arena before calling this function.
;
; NOTE: at present this only walks the token table index; nothing is read from
; the table or written to the output yet.
;
; clobbers:
; rax, flags
; ------------------------------------------------------------------------------

assemble:
	xor eax, eax		; rax = index of the next token to process (zeroes all of rax)
.loop:
	cmp rax, TOKEN_TABLE_SIZE / TOKEN_TABLE_ENTRY_SIZE	; compare the index against
								; the number of entries
								; in the token table
	jae .break		; valid indices are 0 .. entries - 1, so stop as soon as
				; the index reaches the entry count; the compare is
				; unsigned because an index is never negative

	; TODO process the token at index rax

	inc rax			; move to next token
	jmp .loop
.break:
	ret
|
||||
|
||||
; ------------------------------------------------------------------------------
|
||||
; tokenising
|
||||
; ------------------------------------------------------------------------------
|
||||
@@ -46,7 +80,7 @@ start:
|
||||
;
|
||||
; description:
|
||||
; represents the program at the given address and puts it in the token table
|
||||
; it's probably desirable to clear the token table before calling this function
|
||||
; it's probably desirable to clear the token table before calling this function.
|
||||
;
|
||||
; parameters:
|
||||
; rdi -> first byte of program
|
||||
@@ -156,11 +190,11 @@ identify_token:
|
||||
|
||||
; length1
|
||||
.start_length1:
|
||||
mov rcx, tokens.length1 ; rcx -> list of known tokens
|
||||
mov rcx, tokens.by_name_1 ; rcx -> list of known tokens
|
||||
|
||||
.loop_length1:
|
||||
cmp rcx, tokens.length2 ; check if rcx still in the bounds of length1 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
cmp rcx, tokens.by_name_2 ; check if rcx still in the bounds of length1 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
|
||||
mov r10b, [rcx] ; known token
|
||||
mov r11b, [rdi] ; token
|
||||
@@ -177,11 +211,11 @@ identify_token:
|
||||
|
||||
; length2
|
||||
.start_length2:
|
||||
mov rcx, tokens.length2 ; rcx -> list of known tokens
|
||||
mov rcx, tokens.by_name_2 ; rcx -> list of known tokens
|
||||
|
||||
.loop_length2:
|
||||
cmp rcx, tokens.length3 ; check if rcx still in the bounds of length2 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
cmp rcx, tokens.by_name_3 ; check if rcx still in the bounds of length2 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
|
||||
mov r10w, [rcx] ; current entry in known tokens
|
||||
mov r11w, [rdi] ; token
|
||||
@@ -198,11 +232,11 @@ identify_token:
|
||||
|
||||
; length3
|
||||
.start_length3:
|
||||
mov rcx, tokens.length3 ; rcx -> list of known tokens
|
||||
mov rcx, tokens.by_name_3 ; rcx -> list of known tokens
|
||||
|
||||
.loop_length3:
|
||||
cmp rcx, tokens.length4 ; check if rcx still in bounds of length3 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
cmp rcx, tokens.by_name_4 ; check if rcx still in bounds of length3 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
|
||||
; TODO make this safe (it overreaches 1 byte)
|
||||
mov r10d, [rcx] ; known token + next byte
|
||||
@@ -224,11 +258,11 @@ identify_token:
|
||||
|
||||
; length4
|
||||
.start_length4:
|
||||
mov rcx, tokens.length4 ; rcx -> list of known tokens
|
||||
mov rcx, tokens.by_name_4 ; rcx -> list of known tokens
|
||||
|
||||
.loop_length4:
|
||||
cmp rcx, tokens.length5 ; check if rcx still in bounds of length3 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
cmp rcx, tokens.by_name_5 ; check if rcx still in bounds of length4 tokens
|
||||
jge .unrecognised ; if not, unrecognised
|
||||
|
||||
mov r10d, [rcx] ; known token
|
||||
mov r11d, [rdi] ; token
|
||||
@@ -483,6 +517,20 @@ clear_test_arena:
|
||||
rep stosd
|
||||
ret
|
||||
|
||||
; ------------------------------------------------------------------------------
; clear_output_arena
;
; description:
; zero-fills the output arena: writes OUTPUT_SIZE / 4 zeroed double words
; starting at OUTPUT_ADDR
;
; clobbers:
; rax, rcx, rdi, flags
; ------------------------------------------------------------------------------

clear_output_arena:
	mov rdi, OUTPUT_ADDR		; rdi -> first double word of the arena
	mov rcx, OUTPUT_SIZE / 4	; rcx = how many double words to store
	xor eax, eax			; eax = 0, the fill value
	rep stosd			; store eax at [rdi], rcx times
					; NOTE(review): assumes the direction flag is
					; clear so rdi advances upwards - confirm
	ret
|
||||
|
||||
; ------------------------------------------------------------------------------
|
||||
; tests
|
||||
; ------------------------------------------------------------------------------
|
||||
@@ -865,7 +913,7 @@ test_identify_next_token:
|
||||
; ------------------------------------------------------------------------------
|
||||
|
||||
tokens:
|
||||
.length1:
|
||||
.by_name_1:
|
||||
db "["
|
||||
dw 0x0051
|
||||
db "]"
|
||||
@@ -878,7 +926,7 @@ tokens:
|
||||
dw 0x0064
|
||||
db "/"
|
||||
dw 0x0065
|
||||
.length2:
|
||||
.by_name_2:
|
||||
db "r8"
|
||||
dw 0x0008
|
||||
db "r9"
|
||||
@@ -933,7 +981,7 @@ tokens:
|
||||
dw 0x005F
|
||||
db "jl"
|
||||
dw 0x0061
|
||||
.length3:
|
||||
.by_name_3:
|
||||
db "rax"
|
||||
dw 0x0000
|
||||
db "rbx"
|
||||
@@ -1032,7 +1080,7 @@ tokens:
|
||||
dw 0x005E
|
||||
db "jle"
|
||||
dw 0x0060
|
||||
.length4:
|
||||
.by_name_4:
|
||||
db "r10d"
|
||||
dw 0x001A
|
||||
db "r11d"
|
||||
@@ -1073,8 +1121,8 @@ tokens:
|
||||
dw 0x0050
|
||||
db "call"
|
||||
dw 0x0059
|
||||
.length5:
|
||||
.end:
|
||||
.by_name_5:
|
||||
.by_id:
|
||||
|
||||
msg_welcome db "Welcome to Twasm", 0x0A, 0x00
|
||||
msg_halt db "halted.", 0x0A, 0x00
|
||||
|
||||
Reference in New Issue
Block a user