improve data structures

This commit is contained in:
andromeda
2026-03-24 20:49:52 +01:00
parent 74fc57cdfc
commit 121a0df8e4
3 changed files with 322 additions and 70 deletions

View File

@@ -212,33 +212,83 @@ entries are as follows:
```
+------------------------------+
| 0 operand operators |
+------------------------------+
| 127 96 |
+------------------------------+
| reserved |
+------------------------------+
| 95 64 |
+------------------------------+
| reserved |
+------------------------------+
| 63 32 |
+------------------------------+
| reserved |
+----------+--------+----------+
| 31 24 | 23 16 | 15 0 |
+----------+--------+----------+
| reserved | opcode | token ID |
+----------+--------+----------+
+-------+----------+---------+----------+
| 1 operand operators |
+-------+----------+---------+----------+
| 31 27 | 26 24 | 23 16 | 15 0 |
+-------+----------+---------+----------+
| zeros | reg bits | opcode | token ID |
| | | dst=r/m | |
+-------+----------+---------+----------+
+-------------------------------------------------------------+
| 1 operand operators |
+-------------------------------------------------------------+
| 127 96 |
+-------------------------------------------------------------+
| reserved |
+----------+-------+-------+-------+-------+----------+-------+
| 95 88 | 87 84 | 83 80 | 79 76 | 75 72 | 71 68 | 67 64 |
+----------+-------+-------+-------+-------+----------+-------+
| reserved | op5&8 | op4&8 | op3&8 | op2&8 | reserved | op0&8 |
+----------+-------+-------+-------+-------+----------+-------+
| 63 56 | 55 48 | 47 40 | 39 32 |
+----------+---------------+---------------+------------------+
| opcode | opcode | opcode | opcode |
| dst=rel8 | dst=rel | dst=imm8 | dst=imm |
+----------+---------------+---------------+------------------+
| 31 24 | 23 16 | 15 0 |
+----------+---------------+----------------------------------+
| reserved | opcode | token ID |
| | dst=r/m | |
+----------+---------------+----------------------------------+
+------------------------------+
| 2 operand operators |
+---------+---------+----------+
| 31 24 | 23 16 | 15 0 |
+---------+---------+----------+
| opcode | opcode | token ID |
| dst=reg | dst=r/m | |
| src=r/m | src=reg | |
+---------+---------+----------+
+----------------------------------------------+
| 2 operand operators |
+----------------------------------------------+
| 127 96 |
+----------------------------------------------+
| reserved |
+-------------------+-------+-------+----------+
| 95 80 | 79 76 | 75 72 | 71 64 |
+-------------------+-------+-------+----------+
| reserved | op3&8 | op2&8 | reserved |
+-------------------+-------+-------+----------+
| 63 48 | 47 40 | 39 32 |
+-------------------+---------------+----------+
| reserved | opcode | opcode |
| | dst=r/m | dst=r/m |
| | src=imm8 | src=imm |
+---------+---------+---------------+----------+
| 31 24 | 23 16 | 15 0 |
+---------+---------+--------------------------+
| opcode | opcode | token ID |
| dst=r | dst=r/m | |
| src=r/m | src=r | |
+---------+---------+--------------------------+
; key:
r/m ; r/m 16/32/64
r ; r 16/32/64
imm ; imm 16/32
imm8 ; imm 8
rel ; rel 16/32
rel8 ; rel 8
opX&8 ; 4-bit field; its low 3 bits are the operator flag (placed in the ModR/M
; reg field) that goes with the opcode at offset X from the first opcode in
; the table entry
```
note the lack of support for multiple-byte opcodes or multiple opcodes for one token ID; these features will likely be added at some point after the parser accumulates too much jank.
Note that the entry layout leaves plenty of room to expand. If an opcode doesn't exist for a given operand form, its field should be 0x00.
### token IDs
@@ -339,6 +389,7 @@ supported tokens are listed below
| ret | 0x005A | |
| cmp | 0x005B | |
| | 0x10XX | some memory address; `XX` is as specified below |
| | 0xFEXX | used to pass some raw value `XX` in place of a token id |
| | 0xFFFF | unrecognised token |
values of `XX` in `0x10XX`:

View File

@@ -129,6 +129,7 @@ assemble:
mov sil, 0b ; standard opcode
call get_opcode
; al = opcode
; dl = op flag (none)
call .output_byte
pop rsi
pop rdi
@@ -153,7 +154,10 @@ assemble:
mov sil, 0b ; dst=r/m
call get_opcode
; al = opcode
; dl = op flag
push rdx
call .output_byte
pop rdx ; dl = op flag
pop rsi
pop rdi ; di = tte of operator
@@ -173,9 +177,11 @@ assemble:
pop rdi ; di = next tte
je .operator_1_memory_access
push rdx
; di = next tte
call get_tte_type
; al = type of token
pop rdx ; dl = op flag
cmp al, 0x02 ; type: register
je .operator_1_register
@@ -202,8 +208,10 @@ assemble:
pop rsi
mov si, di ; si = `R/M` tte
mov di, 0x0000 ; di = `reg` tte
mov dl, 11b ; dl bits
and edx, 0xFF
or dx, 0xFE00 ; pass di as direct value
mov di, dx ; di = op flag
mov dl, 11b ; dl = mod bits
call get_ModRM
; al = Mod R/M byte
call .output_byte
@@ -270,6 +278,8 @@ assemble:
mov sil, 0 ; dst = r/m
call get_opcode
; al = opcode
; dl = op flag
; TODO act accordingly if the op flag is present
call .output_byte
pop rdi
@@ -346,6 +356,8 @@ assemble:
mov sil, 1 ; dst = reg
call get_opcode
; al = opcode
; dl = op flag
; TODO do something if the op flag is present
call .output_byte
pop rdi
@@ -561,7 +573,7 @@ get_tte_typed_metadata:
; given 2 register tokens and the mod bits, returns the ModR/M byte
;
; parameters:
; di = token table entry `reg`
; di = token table entry `reg`. 0xFEXX passes the low 3 bits of XX as op flag
; si = token table entry `R/M`
; dl = lower 2 bits: mod bits. The rest is ignored
;
@@ -570,14 +582,40 @@ get_tte_typed_metadata:
; ------------------------------------------------------------------------------
get_ModRM:
push rbx
and dl, 11b ; mask for mod bits
shl dl, 6
push rdi
shr di, 8
cmp dil, 0xFE
pop rdi
je .pass_di_as_op_flag
; di = tte
call get_reg_bits
; al = reg bits
mov bl, al
shl bl, 3
push rsi
mov rsi, .msg_normal_ModRM
call print.debug
pop rsi
mov bl, al ; bl = reg bits
jmp .continue
.pass_di_as_op_flag:
push rsi
mov rsi, .msg_op_flag
call print.debug
pop rsi
mov bl, dil ; bl = op flag
and bl, 111b ; mask
.continue:
shl bl, 3
mov rdi, rsi ; do the other one
@@ -592,8 +630,12 @@ get_ModRM:
or al, bl ; reg bits
or al, cl ; R/M bits
and rax, 0xFF ; mask for byte
pop rbx
ret
.msg_op_flag db "get_ModRM op_flag", 0x0A, 0x00
.msg_normal_ModRM db "get_ModRM normal_ModRM", 0x0A, 0x00
; ------------------------------------------------------------------------------
; get_opcode
;
@@ -603,23 +645,30 @@ get_ModRM:
;
; parameters:
; di = token table entry
; sil = lower bit: 0: dst=r/m or only opcode
; 1: dst=reg or 0x00
; sil = offset within opcode entry. 0 is the first opcode, 1 the second, and so
; on
;
; returned:
; al = opcode; the rest of rax is zeroed
; al = opcode; the rest of rax is zeroed.
; dl = lower 3 bits: op flag, if applicable. The rest of rdx is zeroed.
; ------------------------------------------------------------------------------
get_opcode:
and rdi, 0xFFFF
and rsi, 1
add rsi, 2
and rsi, 111b
sub rsi, 2
xor eax, eax
.loop:
cmp rax, (opcodes.by_id_end - opcodes.by_id) / 4 ; make sure it's still in range
cmp rax, (opcodes.by_id_end - opcodes.by_id) / 16 ; make sure it's still in range
jg .not_found
mov cx, [opcodes.by_id + rax * 4] ; next entry in opcodes.by_id
shl rax, 4
mov cx, [opcodes.by_id + rax] ; next entry in opcodes.by_id
shr rax, 4
cmp cx, di
je .found
@@ -631,8 +680,24 @@ get_opcode:
mov al, UNRECOGNISED_ID_OPCODE
ret
.found:
mov al, [rsi + 2 + opcodes.by_id + rax * 4]
shl rax, 4
push rsi
shr rsi, 1
mov dl, [rsi + 8 + opcodes.by_id + rax]
pop rsi
push rsi
and rsi, 1
cmp esi, 1 ; check if offset is odd
pop rsi
jne .found_continue
shr dl, 4 ; if so, actually 1 further on dl byte
.found_continue
mov al, [rsi + 2 + opcodes.by_id + rax]
and rax, 0xFF ; mask
and rdx, 0x0F ; mask
ret
; ------------------------------------------------------------------------------
@@ -1558,6 +1623,7 @@ clear_output_arena:
; data
; ------------------------------------------------------------------------------
align 4
tokens:
.by_id:
dw 0x0000 ; rax
@@ -1869,53 +1935,148 @@ tokens:
dw 0x003F
.registers_end:
align 16
opcodes:
.by_id:
dw 0x004F ; hlt
db 0xF4 ; .
db 0x00 ;
; hlt
dw 0x004F
db 0xF4 ; opcode
db 0x00 ; reserved
dd 0x00000000
dd 0x00000000
dd 0x00000000
dw 0x0050 ; int3
db 0xCC ;
db 0x00 ;
; int3
dw 0x0050
db 0xCC ; opcode
db 0x00 ; reserved
dd 0x00000000
dd 0x00000000
dd 0x00000000
dw 0x0053 ; xor
db 0x31 ; r/m <- reg
db 0x33 ; reg <- r/m
; xor
dw 0x0053
db 0x31 ; r/m <- r
db 0x33 ; r <- r/m
dw 0x0054 ; inc
db 0xFF ; r/m <-
db 0x00 ; reg bits
db 0x81 ; r/m <- imm16/32
db 0x83 ; r/m <- imm8
dw 0x0000
dw 0x0055 ; dec
db 0xFF ; r/m <-
db 0x01 ; reg bits
dd 0x00006600 ; 00:
; 6: r/m <- imm16/32 op flag
; 6: r/m <- imm8 op flag
; 0x0000:
dw 0x0056 ; mov
db 0x89 ; r/m <- reg
db 0x8B ; reg <- r/m
dd 0x00000000 ; reserved
dw 0x0057 ; add
db 0x01 ; r/m <- reg
db 0x03 ; reg <- r/m
; inc
dw 0x0054
db 0xFF ; r/m
db 0x00
dw 0x0058 ; sub
db 0x29 ; r/m <- reg
db 0x2B ; reg <- r/m
dd 0x00000000
; TODO deal with rel values, differentiate 16/32 and 64 for call
dw 0x0059 ; call
db 0xFF ; r/m <-
db 0x02 ; reg bits
dd 0x00000000 ; 0: r/m op flag
; 0000000:
; TODO deal with optional parameter
dw 0x005A ; ret
db 0xC3 ; opcode
db 0x00 ; reserved
dd 0x00000000
dw 0x005B ; cmp
db 0x39 ; r/m <- reg
db 0x3B ; reg <- r/m
; dec
dw 0x0055
db 0xFF ; r/m
db 0x00
dd 0x00000000
dd 0x00000001 ; 1: r/m op flag
; 0000000:
dd 0x00000000
; mov
dw 0x0056
db 0x89 ; r/m <- r
db 0x8B ; r <- r/m
db 0xC7 ; r/m <- imm16/32
db 0x00
dw 0x0000
dd 0x00000000 ; 00:
; 0: r/m <- imm16/32 op flag
; 00000:
dd 0x00000000
; add
dw 0x0057
db 0x01 ; r/m <- r
db 0x03 ; r <- r/m
db 0x81 ; r/m <- imm16/32
db 0x83 ; r/m <- imm8
dw 0x0000
dd 0x00000000 ; 00:
; 0: r/m <- imm16/32 op flag
; 0: r/m <- imm8 op flag
; 0000:
dd 0x00000000
; sub
dw 0x0058
db 0x29 ; r/m <- r
db 0x2B ; r <- r/m
db 0x81 ; r/m <- imm16/32
db 0x83 ; r/m <- imm8
dw 0x0000
dd 0x00005500 ; 00:
; 5: r/m <- imm16/32 op flag
; 5: r/m <- imm8 op flag
; 0000:
dd 0x00000000
; call
dw 0x0059
db 0xFF ; r/m
db 0x00
dw 0x0000
db 0x00 ; rel16/32
db 0x00
dd 0x00000002 ; 2: r/m op flag
; 0000000:
dd 0x00000000
; retn
dw 0x005A
db 0xC3 ; opcode
db 0x00 ; reserved
dd 0x00000000
dd 0x00000000
dd 0x00000000
; cmp
dw 0x005B
db 0x39 ; r/m <- r
db 0x3B ; r <- r/m
db 0x81 ; r/m <- imm16/32
db 0x83 ; r/m <- imm8
dw 0x0000
dd 0x00007700 ; 00:
; 7: r/m <- imm16/32 op flag
; 7: r/m <- imm8 op flag
; 0000:
dd 0x00000000
.by_id_end:
msg_welcome db "Welcome to Twasm", 0x0A, 0x00
@@ -1927,6 +2088,7 @@ whitespace_2 db " ", 0x0D
program:
db "xor eax, eax", 0x0A
db "inc rax ; inline comment", 0x0A
db "dec rax", 0x0A
db "; one line comment", 0x0A
db "mov rdx, [rax]", 0x0A
db "mov [rax], rdx", 0x0A

View File

@@ -294,32 +294,71 @@ test_get_opcode:
call print.test
mov di, 0x0053 ; xor
mov sil, 0b
mov sil, 0
call get_opcode
cmp al, 0x31
jne .fail
cmp dl, 0q0
jne .fail
mov di, 0x0053 ; xor
mov sil, 1b
mov sil, 1
call get_opcode
cmp al, 0x33
jne .fail
cmp dl, 0q0
jne .fail
mov di, 0x0053 ; xor
mov sil, 2
call get_opcode
cmp al, 0x81
jne .fail
cmp dl, 0q6
jne .fail
mov di, 0x0053 ; xor
mov sil, 3
call get_opcode
cmp al, 0x83
jne .fail
cmp dl, 0q6
jne .fail
mov di, 0x0054 ; inc
mov sil, 0b
mov sil, 0
call get_opcode
cmp al, 0xFF
jne .fail
cmp dl, 0q0
jne .fail
mov di, 0x0055 ; dec
mov sil, 0
call get_opcode
cmp al, 0xFF
jne .fail
cmp dl, 0q1
jne .fail
mov di, 0x004F ; hlt
mov sil, 0b
mov sil, 0
call get_opcode
cmp al, 0xF4
jne .fail
cmp dl, 0q0
jne .fail
mov di, 0x0059 ; call
mov sil, 0q0
call get_opcode
cmp al, 0xFF
jne .fail
cmp dl, 0q2
jne .fail
mov di, 0x0003 ; rdx (not an operator)
mov sil, 0b
mov sil, 0q0
call get_opcode
cmp al, UNRECOGNISED_ID_OPCODE
jne .fail