From 121a0df8e462babe3b6bf86bde2d900e4feea2b5 Mon Sep 17 00:00:00 2001 From: andromeda Date: Tue, 24 Mar 2026 20:49:52 +0100 Subject: [PATCH] improve data structures --- twasm/README.md | 87 +++++++++++---- twasm/asm/main.asm | 256 ++++++++++++++++++++++++++++++++++++-------- twasm/asm/tests.asm | 49 ++++++++- 3 files changed, 322 insertions(+), 70 deletions(-) diff --git a/twasm/README.md b/twasm/README.md index 72ec2e3..c479b33 100644 --- a/twasm/README.md +++ b/twasm/README.md @@ -212,33 +212,83 @@ entries are as follows: ``` +------------------------------+ | 0 operand operators | ++------------------------------+ +| 127 96 | ++------------------------------+ +| reserved | ++------------------------------+ +| 95 64 | ++------------------------------+ +| reserved | ++------------------------------+ +| 63 32 | ++------------------------------+ +| reserved | +----------+--------+----------+ | 31 24 | 23 16 | 15 0 | +----------+--------+----------+ | reserved | opcode | token ID | +----------+--------+----------+ -+-------+----------+---------+----------+ -| 1 operand operators | -+-------+----------+---------+----------+ -| 31 27 | 26 24 | 23 16 | 15 0 | -+-------+----------+---------+----------+ -| zeros | reg bits | opcode | token ID | -| | | dst=r/m | | -+-------+----------+---------+----------+ ++-------------------------------------------------------------+ +| 1 operand operators | ++-------------------------------------------------------------+ +| 127 96 | ++-------------------------------------------------------------+ +| reserved | ++----------+-------+-------+-------+-------+----------+-------+ +| 95 88 | 87 84 | 83 80 | 79 76 | 75 72 | 71 68 | 67 64 | ++----------+-------+-------+-------+-------+----------+-------+ +| reserved | op5&8 | op4&8 | op3&8 | op2&8 | reserved | op0&8 | ++----------+-------+-------+-------+-------+----------+-------+ +| 63 56 | 55 48 | 47 40 | 39 32 | ++----------+---------------+---------------+------------------+ +| opcode | 
opcode | opcode | opcode | +| dst=rel8 | dest=rel | dst=imm8 | dst=imm | ++----------+---------------+---------------+------------------+ +| 31 24 | 23 16 | 15 0 | ++----------+---------------+----------------------------------+ +| reserved | opcode | token ID | +| | dst=r/m | | ++----------+---------------+----------------------------------+ -+------------------------------+ -| 2 operand operators | -+---------+---------+----------+ -| 31 24 | 23 16 | 15 0 | -+---------+---------+----------+ -| opcode | opcode | token ID | -| dst=reg | dst=r/m | | -| src=r/m | src=reg | | -+---------+---------+----------+ ++----------------------------------------------+ +| 2 operand operators | ++----------------------------------------------+ +| 127 96 | ++----------------------------------------------+ +| reserved | ++-------------------+-------+-------+----------+ +| 95 80 | 79 76 | 75 72 | 71 64 | ++-------------------+-------+-------+----------+ +| reserved | op3&8 | op2&8 | reserved | ++-------------------+-------+-------+----------+ +| 63 48 | 47 40 | 39 32 | ++-------------------+---------------+----------+ +| reserved | opcode | opcode | +| | dst=r/m | dst=r/m | +| | src=imm8 | src=imm | ++---------+---------+---------------+----------+ +| 31 24 | 23 16 | 15 0 | ++---------+---------+--------------------------+ +| opcode | opcode | token ID | +| dst=r | dst=r/m | | +| src=r/m | src=r | | ++---------+---------+--------------------------+ + +; key: +r/m ; r/m 16/32/64 +r ; r 16/32/64 +imm ; imm 16/32 +imm8 ; imm 8 +rel ; rel 16/32 +rel8 ; rel 8 + +opX&8 ; low 8 bits are the operator flag that goes with opcode at offset X from + ; the first opcode in the table entry ``` -note the lack of support for multiple-byte opcodes or multiple opcodes for one token ID; these features will likely be added at some point after the parser accumulates too much jank. +note much room to expand. 
If an opcode doesn't exist, it should be 0x00 ### token IDs @@ -339,6 +389,7 @@ supported tokens are listed below | ret | 0x005A | | | cmp | 0x005B | | | | 0x10XX | some memory address; `XX` is as specified below | +| | 0xFEXX | used to pass some raw value `XX` in place of a token id | | | 0xFFFF | unrecognised token | values of `XX` in `0x10XX`: diff --git a/twasm/asm/main.asm b/twasm/asm/main.asm index 89d079f..f3f99e2 100644 --- a/twasm/asm/main.asm +++ b/twasm/asm/main.asm @@ -129,6 +129,7 @@ assemble: mov sil, 0b ; standard opcode call get_opcode ; al = opcode + ; dl = op flag (none) call .output_byte pop rsi pop rdi @@ -153,7 +154,10 @@ assemble: mov sil, 0b ; dst=r/m call get_opcode ; al = opcode + ; dl = op flag + push rdx call .output_byte + pop rdx ; dl = op flag pop rsi pop rdi ; di = tte of operator @@ -173,9 +177,11 @@ assemble: pop rdi ; di = next tte je .operator_1_memory_access + push rdx ; di = next tte call get_tte_type ; al = type of token + pop rdx ; dl = op flag cmp al, 0x02 ; type: register je .operator_1_register @@ -202,8 +208,10 @@ assemble: pop rsi mov si, di ; si = `R/M` tte - mov di, 0x0000 ; di = `reg` tte - mov dl, 11b ; dl bits + and edx, 0xFF + or dx, 0xFE00 ; pass di as direct value + mov di, dx ; di = op flag + mov dl, 11b ; dl = mod bits call get_ModRM ; al = Mod R/M byte call .output_byte @@ -270,6 +278,8 @@ assemble: mov sil, 0 ; dst = r/m call get_opcode ; al = opcode + ; dl = op flag + ; TODO act accordingly if the op flag is present call .output_byte pop rdi @@ -346,6 +356,8 @@ assemble: mov sil, 1 ; dst = reg call get_opcode ; al = opcode + ; dl = op flag + ; TODO do something if the op flag is present call .output_byte pop rdi @@ -561,7 +573,7 @@ get_tte_typed_metadata: ; given 2 register tokens and the mod bits, returns the ModR/M byte ; ; parameters: -; di = token table entry `reg` +; di = token table entry `reg`. 0xFEXX passes low 3 bits as op flag ; si = token table entry `R/M` ; dl = lower 2 bits: mod bits. 
The rest is ignored ; @@ -570,14 +582,40 @@ get_tte_typed_metadata: ; ------------------------------------------------------------------------------ get_ModRM: + push rbx + and dl, 11b ; mask for mod bits shl dl, 6 + push rdi + shr di, 8 + cmp dil, 0xFE + pop rdi + je .pass_di_as_op_flag + ; di = tte call get_reg_bits ; al = reg bits - mov bl, al - shl bl, 3 + + push rsi + mov rsi, .msg_normal_ModRM + call print.debug + pop rsi + + mov bl, al ; bl = reg bits + jmp .continue + + .pass_di_as_op_flag: + push rsi + mov rsi, .msg_op_flag + call print.debug + pop rsi + + mov bl, dil ; bl = op flag + and bl, 111b ; mask + + .continue: + shl bl, 3 mov rdi, rsi ; do the other one @@ -592,8 +630,12 @@ get_ModRM: or al, bl ; reg bits or al, cl ; R/M bits and rax, 0xFF ; mask for byte + pop rbx ret + .msg_op_flag db "get_ModRM op_flag", 0x0A, 0x00 + .msg_normal_ModRM db "get_ModRM normal_ModRM", 0x0A, 0x00 + ; ------------------------------------------------------------------------------ ; get_opcode ; @@ -603,23 +645,30 @@ get_ModRM: ; ; parameters: ; di = token table entry -; sil = lower bit: 0: dst=r/m or only opcode -; 1: dst=reg or 0x00 +; sil = offset within opcode entry. 0 is the first opcode, 1 the second, and so +; on ; ; returned: -; al = opcode; the rest of rax is zeroed +; al = opcode; the rest of rax is zeroed. +; dl = lower 3 bits: op flag, if applicable. The rest of rdx is zeroed. 
; ------------------------------------------------------------------------------ get_opcode: and rdi, 0xFFFF - and rsi, 1 + + add rsi, 2 + and rsi, 111b + sub rsi, 2 + xor eax, eax .loop: - cmp rax, (opcodes.by_id_end - opcodes.by_id) / 4 ; make sure it's still in range + cmp rax, (opcodes.by_id_end - opcodes.by_id) / 16 ; make sure it's still in range jg .not_found - mov cx, [opcodes.by_id + rax * 4] ; next entry in opcodes.by_id + shl rax, 4 + mov cx, [opcodes.by_id + rax] ; next entry in opcodes.by_id + shr rax, 4 cmp cx, di je .found @@ -631,8 +680,24 @@ get_opcode: mov al, UNRECOGNISED_ID_OPCODE ret .found: - mov al, [rsi + 2 + opcodes.by_id + rax * 4] + shl rax, 4 + push rsi + shr rsi, 1 + mov dl, [rsi + 8 + opcodes.by_id + rax] + pop rsi + + push rsi + and rsi, 1 + cmp esi, 1 ; check if offset is odd + pop rsi + jne .found_continue + + shr dl, 4 ; if so, actually 1 further on dl byte + + .found_continue + mov al, [rsi + 2 + opcodes.by_id + rax] and rax, 0xFF ; mask + and rdx, 0x0F ; mask ret ; ------------------------------------------------------------------------------ @@ -1558,6 +1623,7 @@ clear_output_arena: ; data ; ------------------------------------------------------------------------------ +align 4 tokens: .by_id: dw 0x0000 ; rax @@ -1869,53 +1935,148 @@ tokens: dw 0x003F .registers_end: +align 16 opcodes: .by_id: - dw 0x004F ; hlt - db 0xF4 ; . 
- db 0x00 ; + ; hlt + dw 0x004F + db 0xF4 ; opcode + db 0x00 ; reserved + dd 0x00000000 + dd 0x00000000 + dd 0x00000000 - dw 0x0050 ; int3 - db 0xCC ; - db 0x00 ; + ; int3 + dw 0x0050 + db 0xCC ; opcode + db 0x00 ; reserved + dd 0x00000000 + dd 0x00000000 + dd 0x00000000 - dw 0x0053 ; xor - db 0x31 ; r/m <- reg - db 0x33 ; reg <- r/m + ; xor + dw 0x0053 + db 0x31 ; r/m <- r + db 0x33 ; r <- r/m - dw 0x0054 ; inc - db 0xFF ; r/m <- - db 0x00 ; reg bits + db 0x81 ; r/m <- imm16/32 + db 0x83 ; r/m <- imm8 + dw 0x0000 - dw 0x0055 ; dec - db 0xFF ; r/m <- - db 0x01 ; reg bits + dd 0x00006600 ; 00: + ; 6: r/m <- imm16/32 op flag + ; 6: r/m <- imm8 op flag + ; 0x0000: - dw 0x0056 ; mov - db 0x89 ; r/m <- reg - db 0x8B ; reg <- r/m + dd 0x00000000 ; reserved - dw 0x0057 ; add - db 0x01 ; r/m <- reg - db 0x03 ; reg <- r/m + ; inc + dw 0x0054 + db 0xFF ; r/m + db 0x00 - dw 0x0058 ; sub - db 0x29 ; r/m <- reg - db 0x2B ; reg <- r/m + dd 0x00000000 - ; TODO deal with rel values, differentiate 16/32 and 64 for call - dw 0x0059 ; call - db 0xFF ; r/m <- - db 0x02 ; reg bits + dd 0x00000000 ; 0: r/m op flag + ; 0000000: - ; TODO deal with optional parameter - dw 0x005A ; ret - db 0xC3 ; opcode - db 0x00 ; reserved + dd 0x00000000 - dw 0x005B ; cmp - db 0x39 ; r/m <- reg - db 0x3B ; reg <- r/m + ; dec + dw 0x0055 + db 0xFF ; r/m + db 0x00 + + dd 0x00000000 + + dd 0x00000001 ; 1: r/m op flag + ; 0000000: + dd 0x00000000 + + ; mov + dw 0x0056 + db 0x89 ; r/m <- r + db 0x8B ; r <- r/m + + db 0xC7 ; r/m <- imm16/32 + db 0x00 + dw 0x0000 + + dd 0x00000000 ; 00: + ; 0: r/m <- imm16/32 op flag + ; 00000: + + dd 0x00000000 + + ; add + dw 0x0057 + db 0x01 ; r/m <- r + db 0x03 ; r <- r/m + + db 0x81 ; r/m <- imm16/32 + db 0x83 ; r/m <- imm8 + dw 0x0000 + + dd 0x00000000 ; 00: + ; 0: r/m <- imm16/32 op flag + ; 0: r/m <- imm8 op flag + ; 0000: + + dd 0x00000000 + + ; sub + dw 0x0058 + db 0x29 ; r/m <- r + db 0x2B ; r <- r/m + + db 0x81 ; r/m <- imm16/32 + db 0x83 ; r/m <- imm8 + dw 0x0000 + 
+ dd 0x00005500 ; 00: + ; 5: r/m <- imm16/32 op flag + ; 5: r/m <- imm8 op flag + ; 0000: + + dd 0x00000000 + + ; call + dw 0x0059 + db 0xFF ; r/m + db 0x00 + + dw 0x0000 + db 0x00 ; rel16/32 + db 0x00 + + dd 0x00000002 ; 2: r/m op flag + ; 0000000: + + dd 0x00000000 + + ; retn + dw 0x005A + db 0xC3 ; opcode + db 0x00 ; reserved + dd 0x00000000 + dd 0x00000000 + dd 0x00000000 + + ; cmp + dw 0x005B + db 0x39 ; r/m <- r + db 0x3B ; r <- r/m + + db 0x81 ; r/m <- imm16/32 + db 0x83 ; r/m <- imm8 + dw 0x0000 + + dd 0x00007700 ; 00: + ; 7: r/m <- imm16/32 op flag + ; 7: r/m <- imm8 op flag + ; 0000: + + dd 0x00000000 .by_id_end: msg_welcome db "Welcome to Twasm", 0x0A, 0x00 @@ -1927,6 +2088,7 @@ whitespace_2 db " ", 0x0D program: db "xor eax, eax", 0x0A db "inc rax ; inline comment", 0x0A + db "dec rax", 0x0A db "; one line comment", 0x0A db "mov rdx, [rax]", 0x0A db "mov [rax], rdx", 0x0A diff --git a/twasm/asm/tests.asm b/twasm/asm/tests.asm index bc12ecd..bc52041 100644 --- a/twasm/asm/tests.asm +++ b/twasm/asm/tests.asm @@ -294,32 +294,71 @@ test_get_opcode: call print.test mov di, 0x0053 ; xor - mov sil, 0b + mov sil, 0 call get_opcode cmp al, 0x31 jne .fail + cmp dl, 0q0 + jne .fail mov di, 0x0053 ; xor - mov sil, 1b + mov sil, 1 call get_opcode cmp al, 0x33 jne .fail + cmp dl, 0q0 + jne .fail + mov di, 0x0053 ; xor + mov sil, 2 + call get_opcode + cmp al, 0x81 + jne .fail + cmp dl, 0q6 + jne .fail + + mov di, 0x0053 ; xor + mov sil, 3 + call get_opcode + cmp al, 0x83 + jne .fail + cmp dl, 0q6 + jne .fail mov di, 0x0054 ; inc - mov sil, 0b + mov sil, 0 call get_opcode cmp al, 0xFF jne .fail + cmp dl, 0q0 + jne .fail + + mov di, 0x0055 ; dec + mov sil, 0 + call get_opcode + cmp al, 0xFF + jne .fail + cmp dl, 0q1 + jne .fail mov di, 0x004F ; hlt - mov sil, 0b + mov sil, 0 call get_opcode cmp al, 0xF4 jne .fail + cmp dl, 0q0 + jne .fail + + mov di, 0x0059 ; call + mov sil, 0q0 + call get_opcode + cmp al, 0xFF + jne .fail + cmp dl, 0q2 + jne .fail mov di, 0x0003 ; rdx 
(not an operator) - mov sil, 0b + mov sil, 0q0 call get_opcode cmp al, UNRECOGNISED_ID_OPCODE jne .fail