// Hand-optimized TASM baseline: std.crypto.keccak256
//
// First principles rewrite. Keccak-f[1600] on Triton VM.
//
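// State: 25 lanes × 2 U32 = 50 words in RAM[0..49].
// Lane[x][y]: lo at 2*(5y+x), hi at 2*(5y+x)+1.
// Temp: RAM[50..59] for column parities C, RAM[60..69] for theta D values,
// RAM[70..119] for pi/chi temp, RAM[120..122] for rotation scratch,
// RAM[130..131] for the saved round constant.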
//
// Key optimizations vs previous baseline:
// - Single __round function (was 24 copies)
// - Single __theta_col called 5× (was 5 copies)
// - __chi_row subroutine called 5× (was 5 inline copies)
// - read_mem 2 for lane reads where possible
// - Minimal lane helper subroutines
// ===================================================================
// LANE HELPERS (shared)
// ===================================================================
// xor_lane: [a_lo a_hi b_lo b_hi] -> [r_lo r_hi] (b on top)
// Lane-wise XOR of two 64-bit lanes held as (lo, hi) U32 pairs:
//   r_lo = a_lo ^ b_lo, r_hi = a_hi ^ b_hi.
// The halves must be paired lo-with-lo and hi-with-hi; pairing a_hi with
// b_lo (as `swap 2; xor` would) mixes the two 32-bit halves.
std_crypto_keccak256__xor_lane:
// Bring a_lo up next to b_lo: [b_hi a_hi b_lo a_lo]
swap 3
xor
// Stack: b_hi a_hi r_lo -> park r_lo at the bottom: [r_lo a_hi b_hi]
swap 2
xor
// Stack: r_lo r_hi
return
// and_lane: [a_lo a_hi b_lo b_hi] -> [r_lo r_hi] (b on top)
// Lane-wise AND of two 64-bit lanes held as (lo, hi) U32 pairs:
//   r_lo = a_lo & b_lo, r_hi = a_hi & b_hi.
std_crypto_keccak256__and_lane:
// Bring a_lo up next to b_lo: [b_hi a_hi b_lo a_lo]
swap 3
and
// Stack: b_hi a_hi r_lo -> park r_lo at the bottom: [r_lo a_hi b_hi]
swap 2
and
// Stack: r_lo r_hi
return
// not_lane: [lo hi] -> [~lo ~hi]
// Bitwise complement of both 32-bit halves, done as XOR with 0xFFFFFFFF.
std_crypto_keccak256__not_lane:
// Complement the low half first: [hi lo] -> [hi ~lo]
swap 1
push 4294967295
xor
// Then the high half: [~lo hi] -> [~lo ~hi]
swap 1
push 4294967295
xor
return
// chi_lane: [a_lo a_hi b_lo b_hi c_lo c_hi] -> [r_lo r_hi]
// r = a ^ (~b & c), evaluated independently on the lo and hi halves.
std_crypto_keccak256__chi_lane:
// Stack: a_lo a_hi b_lo b_hi c_lo c_hi (c_hi on top)
// Complement b in place (XOR with 0xFFFFFFFF), one half at a time.
swap 2
push 4294967295
xor
swap 2
swap 3
push 4294967295
xor
swap 3
// Stack: a_lo a_hi ~b_lo ~b_hi c_lo c_hi
// t = ~b & c, pairing lo with lo and hi with hi.
swap 3
and
swap 2
and
// Stack: a_lo a_hi t_lo t_hi
// r = a ^ t, again half by half.
swap 3
xor
swap 2
xor
// Stack: r_lo r_hi
return
// read_lane: [addr] -> [lo hi] reads lane from addr (lo at addr, hi at addr+1)
// Uses one two-word read. read_mem walks downwards from the pointer, so it
// is started at addr+1: hi is pushed first, then lo, then the decremented
// pointer.
__read_lane:
push 1
add
read_mem 2
// Stack: hi lo (addr-1)
pop 1
// Restore the lane layout used everywhere else: lo below hi.
swap 1
return
// write_lane: [lo hi addr] -> [] writes lane to addr (addr on top)
// Stores lo at addr and hi at addr+1. The address is taken from the top of
// the stack because every call site pushes it after the lane
// (`push ADDR; call __write_lane`).
__write_lane:
// write_mem 2 stores the element just below the pointer at addr and the
// next one at addr+1, so reorder [lo hi addr] into [hi lo addr].
swap 1
swap 2
swap 1
write_mem 2
// Drop the advanced pointer (addr+2) left behind by write_mem.
pop 1
return
// ===================================================================
// THETA
// ===================================================================
// theta: [] -> [] modify state in RAM in-place
// C[x] = s[x][0] ^ s[x][1] ^ s[x][2] ^ s[x][3] ^ s[x][4] stored at [50+2x, 51+2x]
// D[x] = C[x-1 mod 5] ^ rot1(C[x+1 mod 5]) stored at [60+2x, 61+2x]
// s[x][y] ^= D[x]
// Every store below pushes the destination address after the lane, i.e.
// __write_lane is entered with [lo hi addr] (addr on top).
std_crypto_keccak256__theta:
// Compute 5 column parities C[0..4] -> RAM[50..59]
// C0: s00^s01^s02^s03^s04 = mem[0,1]^mem[10,11]^mem[20,21]^mem[30,31]^mem[40,41]
push 0
call __read_lane
push 10
call __read_lane
call std_crypto_keccak256__xor_lane
push 20
call __read_lane
call std_crypto_keccak256__xor_lane
push 30
call __read_lane
call std_crypto_keccak256__xor_lane
push 40
call __read_lane
call std_crypto_keccak256__xor_lane
push 50
call __write_lane
// C1: lanes at 2, 12, 22, 32, 42 -> RAM[52,53]
push 2
call __read_lane
push 12
call __read_lane
call std_crypto_keccak256__xor_lane
push 22
call __read_lane
call std_crypto_keccak256__xor_lane
push 32
call __read_lane
call std_crypto_keccak256__xor_lane
push 42
call __read_lane
call std_crypto_keccak256__xor_lane
push 52
call __write_lane
// C2: lanes at 4, 14, 24, 34, 44 -> RAM[54,55]
push 4
call __read_lane
push 14
call __read_lane
call std_crypto_keccak256__xor_lane
push 24
call __read_lane
call std_crypto_keccak256__xor_lane
push 34
call __read_lane
call std_crypto_keccak256__xor_lane
push 44
call __read_lane
call std_crypto_keccak256__xor_lane
push 54
call __write_lane
// C3: lanes at 6, 16, 26, 36, 46 -> RAM[56,57]
push 6
call __read_lane
push 16
call __read_lane
call std_crypto_keccak256__xor_lane
push 26
call __read_lane
call std_crypto_keccak256__xor_lane
push 36
call __read_lane
call std_crypto_keccak256__xor_lane
push 46
call __read_lane
call std_crypto_keccak256__xor_lane
push 56
call __write_lane
// C4: lanes at 8, 18, 28, 38, 48 -> RAM[58,59]
push 8
call __read_lane
push 18
call __read_lane
call std_crypto_keccak256__xor_lane
push 28
call __read_lane
call std_crypto_keccak256__xor_lane
push 38
call __read_lane
call std_crypto_keccak256__xor_lane
push 48
call __read_lane
call std_crypto_keccak256__xor_lane
push 58
call __write_lane
// D[x] = C[x-1] ^ rot1(C[x+1])
// In each group the second lane read is on top of the stack, so
// __rot1_lane rotates C[x+1] and leaves C[x-1] untouched below it.
// D0 = C4 ^ rot1(C1)
push 58
call __read_lane
push 52
call __read_lane
call __rot1_lane
call std_crypto_keccak256__xor_lane
push 60
call __write_lane
// D1 = C0 ^ rot1(C2)
push 50
call __read_lane
push 54
call __read_lane
call __rot1_lane
call std_crypto_keccak256__xor_lane
push 62
call __write_lane
// D2 = C1 ^ rot1(C3)
push 52
call __read_lane
push 56
call __read_lane
call __rot1_lane
call std_crypto_keccak256__xor_lane
push 64
call __write_lane
// D3 = C2 ^ rot1(C4)
push 54
call __read_lane
push 58
call __read_lane
call __rot1_lane
call std_crypto_keccak256__xor_lane
push 66
call __write_lane
// D4 = C3 ^ rot1(C0)
push 56
call __read_lane
push 50
call __read_lane
call __rot1_lane
call std_crypto_keccak256__xor_lane
push 68
call __write_lane
// XOR D[x] into each of the 25 state lanes (all five columns are handled
// inside __theta_apply_d).
call __theta_apply_d
return
// __theta_apply_d: [] -> []
// Apply D values to all 25 state lanes
// Per column: load D[x] once, duplicate it for the first four rows, and
// let the fifth row consume the original copy so the stack ends balanced.
__theta_apply_d:
// For each column x (0..4), for each row y (0..4):
// s[x][y] ^= D[x]
// D[x] at addr 60+2x, s[x][y] at addr 2*(5y+x)
// Col 0: D0 at 60. Lanes: 0, 10, 20, 30, 40
push 60
call __read_lane
// XOR into 5 lanes (keep D on stack, dup for each)
// `dup 1; dup 1` copies the two-word lane [D_lo D_hi] on top of itself.
dup 1
dup 1
push 0
call __read_lane
call std_crypto_keccak256__xor_lane
push 0
call __write_lane
dup 1
dup 1
push 10
call __read_lane
call std_crypto_keccak256__xor_lane
push 10
call __write_lane
dup 1
dup 1
push 20
call __read_lane
call std_crypto_keccak256__xor_lane
push 20
call __write_lane
dup 1
dup 1
push 30
call __read_lane
call std_crypto_keccak256__xor_lane
push 30
call __write_lane
// Last row: no dup, this XOR consumes D0.
push 40
call __read_lane
call std_crypto_keccak256__xor_lane
push 40
call __write_lane
// Col 1: D1 at 62. Lanes: 2, 12, 22, 32, 42
push 62
call __read_lane
dup 1
dup 1
push 2
call __read_lane
call std_crypto_keccak256__xor_lane
push 2
call __write_lane
dup 1
dup 1
push 12
call __read_lane
call std_crypto_keccak256__xor_lane
push 12
call __write_lane
dup 1
dup 1
push 22
call __read_lane
call std_crypto_keccak256__xor_lane
push 22
call __write_lane
dup 1
dup 1
push 32
call __read_lane
call std_crypto_keccak256__xor_lane
push 32
call __write_lane
// Last row consumes D1.
push 42
call __read_lane
call std_crypto_keccak256__xor_lane
push 42
call __write_lane
// Col 2: D2 at 64. Lanes: 4, 14, 24, 34, 44
push 64
call __read_lane
dup 1
dup 1
push 4
call __read_lane
call std_crypto_keccak256__xor_lane
push 4
call __write_lane
dup 1
dup 1
push 14
call __read_lane
call std_crypto_keccak256__xor_lane
push 14
call __write_lane
dup 1
dup 1
push 24
call __read_lane
call std_crypto_keccak256__xor_lane
push 24
call __write_lane
dup 1
dup 1
push 34
call __read_lane
call std_crypto_keccak256__xor_lane
push 34
call __write_lane
// Last row consumes D2.
push 44
call __read_lane
call std_crypto_keccak256__xor_lane
push 44
call __write_lane
// Col 3: D3 at 66. Lanes: 6, 16, 26, 36, 46
push 66
call __read_lane
dup 1
dup 1
push 6
call __read_lane
call std_crypto_keccak256__xor_lane
push 6
call __write_lane
dup 1
dup 1
push 16
call __read_lane
call std_crypto_keccak256__xor_lane
push 16
call __write_lane
dup 1
dup 1
push 26
call __read_lane
call std_crypto_keccak256__xor_lane
push 26
call __write_lane
dup 1
dup 1
push 36
call __read_lane
call std_crypto_keccak256__xor_lane
push 36
call __write_lane
// Last row consumes D3.
push 46
call __read_lane
call std_crypto_keccak256__xor_lane
push 46
call __write_lane
// Col 4: D4 at 68. Lanes: 8, 18, 28, 38, 48
push 68
call __read_lane
dup 1
dup 1
push 8
call __read_lane
call std_crypto_keccak256__xor_lane
push 8
call __write_lane
dup 1
dup 1
push 18
call __read_lane
call std_crypto_keccak256__xor_lane
push 18
call __write_lane
dup 1
dup 1
push 28
call __read_lane
call std_crypto_keccak256__xor_lane
push 28
call __write_lane
dup 1
dup 1
push 38
call __read_lane
call std_crypto_keccak256__xor_lane
push 38
call __write_lane
// Last row consumes D4.
push 48
call __read_lane
call std_crypto_keccak256__xor_lane
push 48
call __write_lane
return
// ===================================================================
// ROT1 (used by theta for D computation)
// ===================================================================
// rot1_lane: [lo hi] -> [lo' hi'] rotate 64-bit lane left by 1
// new_lo = ((lo << 1) mod 2^32) | (hi >> 31)
// new_hi = ((hi << 1) mod 2^32) | (lo >> 31)
// The shift is done inside __do_rot as a field multiplication by the shift
// factor followed by `split`; the factor is handed over through RAM[120].
// Shift factor for a 1-bit rotation is 2^1 = 2.
__rot1_lane:
push 2
push 120
// write_mem takes the pointer on top of the value: [.. 2 120] stores
// 2 at RAM[120]. (A `swap 1` here would store 120 at RAM[2].)
write_mem 1
pop 1
call __do_rot
return
// ===================================================================
// RHO (lane rotations by fixed offsets)
// ===================================================================
// rho: [] -> [] rotates 24 state lanes in RAM in place.
// 24 non-trivial rotations. Each reads a lane, rotates, writes back.
// For rotation by n bits (0 < n < 32): use shift via field mul.
// For rotation by 32: swap lo and hi.
// For rotation by n > 32: swap then rotate by n-32.
// Rotation by 0: skip (lane 0,0).
// Lane names below are sXY = s[x][y], stored at addr 2*(5y+x).
std_crypto_keccak256__rho:
// s10 (addr 2): rot 1
push 2
call __read_lane
call __rot_1
push 2
call __write_lane
// s20 (addr 4): rot 62 = swap + rot 30
push 4
call __read_lane
call __rot_62
push 4
call __write_lane
// s30 (addr 6): rot 28
push 6
call __read_lane
call __rot_28
push 6
call __write_lane
// s40 (addr 8): rot 27
push 8
call __read_lane
call __rot_27
push 8
call __write_lane
// s01 (addr 10): rot 36 = swap + rot 4
push 10
call __read_lane
call __rot_36
push 10
call __write_lane
// s11 (addr 12): rot 44 = swap + rot 12
push 12
call __read_lane
call __rot_44
push 12
call __write_lane
// s21 (addr 14): rot 6
push 14
call __read_lane
call __rot_6
push 14
call __write_lane
// s31 (addr 16): rot 55 = swap + rot 23
push 16
call __read_lane
call __rot_55
push 16
call __write_lane
// s41 (addr 18): rot 20
push 18
call __read_lane
call __rot_20
push 18
call __write_lane
// s02 (addr 20): rot 3
push 20
call __read_lane
call __rot_3
push 20
call __write_lane
// s12 (addr 22): rot 10
push 22
call __read_lane
call __rot_10
push 22
call __write_lane
// s22 (addr 24): rot 43 = swap + rot 11
push 24
call __read_lane
call __rot_43
push 24
call __write_lane
// s32 (addr 26): rot 25
push 26
call __read_lane
call __rot_25
push 26
call __write_lane
// s42 (addr 28): rot 39 = swap + rot 7
push 28
call __read_lane
call __rot_39
push 28
call __write_lane
// s03 (addr 30): rot 41 = swap + rot 9
push 30
call __read_lane
call __rot_41
push 30
call __write_lane
// s13 (addr 32): rot 45 = swap + rot 13
push 32
call __read_lane
call __rot_45
push 32
call __write_lane
// s23 (addr 34): rot 15
push 34
call __read_lane
call __rot_15
push 34
call __write_lane
// s33 (addr 36): rot 21
push 36
call __read_lane
call __rot_21
push 36
call __write_lane
// s43 (addr 38): rot 8
push 38
call __read_lane
call __rot_8
push 38
call __write_lane
// s04 (addr 40): rot 18
push 40
call __read_lane
call __rot_18
push 40
call __write_lane
// s14 (addr 42): rot 2
push 42
call __read_lane
call __rot_2
push 42
call __write_lane
// s24 (addr 44): rot 61 = swap + rot 29
push 44
call __read_lane
call __rot_61
push 44
call __write_lane
// s34 (addr 46): rot 56 = swap + rot 24
push 46
call __read_lane
call __rot_56
push 46
call __write_lane
// s44 (addr 48): rot 14
push 48
call __read_lane
call __rot_14
push 48
call __write_lane
return
// Generic rotation: [lo hi] -> [lo' hi'] by n bits (0 < n < 32)
// lo' = (lo << n) | (hi >> (32-n))
// hi' = (hi << n) | (lo >> (32-n))
// shift_factor = 2^n
// lo * shift_factor -> split -> (lo_carry, lo_shifted)
// hi * shift_factor -> split -> (hi_carry, hi_shifted)
// new_lo = lo_shifted ^ hi_carry, new_hi = hi_shifted ^ lo_carry
// (shifted and carry parts occupy disjoint bits, so XOR acts as OR here.)
// __do_rot: [lo hi] -> [lo' hi'], shift_factor at mem[120]
// Scratch: mem[121] holds hi_shifted, mem[122] holds hi_carry.
__do_rot:
// Read shift factor from mem[120]
push 120
read_mem 1
pop 1
// Stack: lo hi factor
// hi * factor -> split (split leaves the low 32 bits on top)
swap 1
dup 1
mul
split
// Stack: lo factor hi_carry hi_shifted
// Store hi_shifted at mem[121]. write_mem expects the pointer on top of
// the value, so the address is pushed last and not swapped underneath.
push 121
write_mem 1
pop 1
// Store hi_carry at mem[122]
push 122
write_mem 1
pop 1
// Stack: lo factor
mul
split
// Stack: lo_carry lo_shifted
// new_lo = lo_shifted ^ hi_carry (mem[122])
push 122
read_mem 1
pop 1
xor
// Stack: lo_carry new_lo
swap 1
// new_hi = hi_shifted (mem[121]) ^ lo_carry
push 121
read_mem 1
pop 1
xor
// Stack: new_lo new_hi
return
// Rotation helpers: [lo hi] -> [lo' hi'], rotate left by n bits (n < 32).
// Each stores its shift factor 2^n at mem[120], then calls __do_rot.
// write_mem takes the pointer on top of the value, so the sequence is
// `push FACTOR; push 120; write_mem 1; pop 1` with no swap in between
// (swapping would store 120 at address FACTOR).
// For n >= 32 see the compound helpers, which swap lo/hi first and then
// rotate by n-32.
// n = 1, factor 2^1
__rot_1:
push 2
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 2, factor 2^2
__rot_2:
push 4
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 3, factor 2^3
__rot_3:
push 8
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 6, factor 2^6
__rot_6:
push 64
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 7, factor 2^7
__rot_7:
push 128
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 8, factor 2^8
__rot_8:
push 256
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 9, factor 2^9
__rot_9:
push 512
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 10, factor 2^10
__rot_10:
push 1024
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 11, factor 2^11
__rot_11:
push 2048
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 12, factor 2^12
__rot_12:
push 4096
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 13, factor 2^13
__rot_13:
push 8192
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 14, factor 2^14
__rot_14:
push 16384
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 15, factor 2^15
__rot_15:
push 32768
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 18, factor 2^18
__rot_18:
push 262144
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 20, factor 2^20
__rot_20:
push 1048576
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 21, factor 2^21
__rot_21:
push 2097152
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 23, factor 2^23
__rot_23:
push 8388608
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 24, factor 2^24
__rot_24:
push 16777216
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 25, factor 2^25
__rot_25:
push 33554432
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 27, factor 2^27
__rot_27:
push 134217728
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 28, factor 2^28
__rot_28:
push 268435456
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 29, factor 2^29
__rot_29:
push 536870912
push 120
write_mem 1
pop 1
call __do_rot
return
// n = 30, factor 2^30
__rot_30:
push 1073741824
push 120
write_mem 1
pop 1
call __do_rot
return
// Compound rotations for n >= 32: [lo hi] -> [lo' hi'].
// Rotating a 64-bit lane left by 32 exchanges its halves, so each helper
// swaps lo/hi and then rotates by the remaining n-32 bits.
// rot 36 = swap + rot 4
__rot_36:
swap 1
call __rot_4
return
// n = 4, factor 2^4 (only used via __rot_36 in this file).
// The factor is stored at mem[120] with the pointer on top of the value,
// as write_mem requires; no swap between the two pushes.
__rot_4:
push 16
push 120
write_mem 1
pop 1
call __do_rot
return
// rot 39 = swap + rot 7
__rot_39:
swap 1
call __rot_7
return
// rot 41 = swap + rot 9
__rot_41:
swap 1
call __rot_9
return
// rot 43 = swap + rot 11
__rot_43:
swap 1
call __rot_11
return
// rot 44 = swap + rot 12
__rot_44:
swap 1
call __rot_12
return
// rot 45 = swap + rot 13
__rot_45:
swap 1
call __rot_13
return
// rot 55 = swap + rot 23
__rot_55:
swap 1
call __rot_23
return
// rot 56 = swap + rot 24
__rot_56:
swap 1
call __rot_24
return
// rot 61 = swap + rot 29
__rot_61:
swap 1
call __rot_29
return
// rot 62 = swap + rot 30
__rot_62:
swap 1
call __rot_30
return
// ===================================================================
// PI (lane permutation)
// ===================================================================
// pi: [] -> [] permutes the 25 state lanes in RAM.
// new[y][2x+3y mod 5] = old[x][y]
// Copy state to temp[70..119], then write back in permuted order.
std_crypto_keccak256__pi:
// Copy all 50 words from state[0..49] to temp[70..119], five at a time.
// read_mem 5 walks downwards from the pointer, so it starts at the last
// word of each group; the word at the lowest address ends up just under
// the returned pointer. write_mem 5 then stores that word first and walks
// upwards, so the group keeps its order. Each leftover pointer is popped.
// state[0..4] -> temp[70..74]
push 4
read_mem 5
pop 1
push 70
write_mem 5
pop 1
// state[5..9] -> temp[75..79]
push 9
read_mem 5
pop 1
push 75
write_mem 5
pop 1
// state[10..14] -> temp[80..84]
push 14
read_mem 5
pop 1
push 80
write_mem 5
pop 1
// state[15..19] -> temp[85..89]
push 19
read_mem 5
pop 1
push 85
write_mem 5
pop 1
// state[20..24] -> temp[90..94]
push 24
read_mem 5
pop 1
push 90
write_mem 5
pop 1
// state[25..29] -> temp[95..99]
push 29
read_mem 5
pop 1
push 95
write_mem 5
pop 1
// state[30..34] -> temp[100..104]
push 34
read_mem 5
pop 1
push 100
write_mem 5
pop 1
// state[35..39] -> temp[105..109]
push 39
read_mem 5
pop 1
push 105
write_mem 5
pop 1
// state[40..44] -> temp[110..114]
push 44
read_mem 5
pop 1
push 110
write_mem 5
pop 1
// state[45..49] -> temp[115..119]
push 49
read_mem 5
pop 1
push 115
write_mem 5
pop 1
// Now write back in permuted order.
// Mapping: new[x][y] <- old[src_x][src_y] at temp[70 + 2*(5*src_y + src_x)]
// s00 <- s00 (temp 70,71) -> addr 0,1
push 70
call __read_lane
push 0
call __write_lane
// s10 <- s11 (temp 82,83) -> addr 2,3
push 82
call __read_lane
push 2
call __write_lane
// s20 <- s22 (temp 94,95) -> addr 4,5
push 94
call __read_lane
push 4
call __write_lane
// s30 <- s33 (temp 106,107) -> addr 6,7
push 106
call __read_lane
push 6
call __write_lane
// s40 <- s44 (temp 118,119) -> addr 8,9
push 118
call __read_lane
push 8
call __write_lane
// s01 <- s30 (temp 76,77) -> addr 10,11
push 76
call __read_lane
push 10
call __write_lane
// s11 <- s41 (temp 88,89) -> addr 12,13
push 88
call __read_lane
push 12
call __write_lane
// s21 <- s02 (temp 90,91) -> addr 14,15
push 90
call __read_lane
push 14
call __write_lane
// s31 <- s13 (temp 102,103) -> addr 16,17
push 102
call __read_lane
push 16
call __write_lane
// s41 <- s24 (temp 114,115) -> addr 18,19
push 114
call __read_lane
push 18
call __write_lane
// s02 <- s10 (temp 72,73) -> addr 20,21
push 72
call __read_lane
push 20
call __write_lane
// s12 <- s21 (temp 84,85) -> addr 22,23
push 84
call __read_lane
push 22
call __write_lane
// s22 <- s32 (temp 96,97) -> addr 24,25
push 96
call __read_lane
push 24
call __write_lane
// s32 <- s43 (temp 108,109) -> addr 26,27
push 108
call __read_lane
push 26
call __write_lane
// s42 <- s04 (temp 110,111) -> addr 28,29
push 110
call __read_lane
push 28
call __write_lane
// s03 <- s40 (temp 78,79) -> addr 30,31
push 78
call __read_lane
push 30
call __write_lane
// s13 <- s01 (temp 80,81) -> addr 32,33
push 80
call __read_lane
push 32
call __write_lane
// s23 <- s12 (temp 92,93) -> addr 34,35
push 92
call __read_lane
push 34
call __write_lane
// s33 <- s23 (temp 104,105) -> addr 36,37
push 104
call __read_lane
push 36
call __write_lane
// s43 <- s34 (temp 116,117) -> addr 38,39
push 116
call __read_lane
push 38
call __write_lane
// s04 <- s20 (temp 74,75) -> addr 40,41
push 74
call __read_lane
push 40
call __write_lane
// s14 <- s31 (temp 86,87) -> addr 42,43
push 86
call __read_lane
push 42
call __write_lane
// s24 <- s42 (temp 98,99) -> addr 44,45
push 98
call __read_lane
push 44
call __write_lane
// s34 <- s03 (temp 100,101) -> addr 46,47
push 100
call __read_lane
push 46
call __write_lane
// s44 <- s14 (temp 112,113) -> addr 48,49
push 112
call __read_lane
push 48
call __write_lane
return
// ===================================================================
// CHI (nonlinear row mixing)
// ===================================================================
// chi: [] -> [] applies the chi step to the state in RAM, one row at a time.
// For each row: copy 5 lanes to temp, compute chi for each lane.
// chi_lane(a, b, c) = a ^ (~b & c), where b = next lane, c = lane after that.
// Each call passes the row's base address (10 words per row) to __chi_row.
std_crypto_keccak256__chi:
// Row 0: lanes at addrs 0,2,4,6,8 -> temp 70,72,74,76,78
push 0
call __chi_row
// Row 1: lanes at addrs 10,12,14,16,18 -> temp 70..
push 10
call __chi_row
// Row 2: lanes at addrs 20,22,24,26,28
push 20
call __chi_row
// Row 3: lanes at addrs 30,32,34,36,38
push 30
call __chi_row
// Row 4: lanes at addrs 40,42,44,46,48
push 40
call __chi_row
return
// __chi_row: [base_addr] -> []
// Processes 5 lanes starting at base_addr (stride 2).
// Copies to temp[70..79], then writes chi results back:
//   row[i] = t[i] ^ (~t[i+1] & t[i+2]), indices mod 5, t = the temp copy.
// The temp copy is needed because every output lane reads two neighbours
// that are themselves overwritten during the row update.
// base_addr stays at the bottom of the working stack throughout; each
// write-back fetches it with `dup 2` from underneath the two result words.
__chi_row:
// Copy 5 lanes (10 words) to temp[70..79]
dup 0
call __read_lane
push 70
call __write_lane
dup 0
push 2
add
call __read_lane
push 72
call __write_lane
dup 0
push 4
add
call __read_lane
push 74
call __write_lane
dup 0
push 6
add
call __read_lane
push 76
call __write_lane
dup 0
push 8
add
call __read_lane
push 78
call __write_lane
// Stack: base
// chi_lane(temp[70], temp[72], temp[74]) -> write to base+0
push 70
call __read_lane
push 72
call __read_lane
push 74
call __read_lane
call __chi3
// Stack: base r_lo r_hi -> [base r_lo r_hi base] for __write_lane
dup 2
call __write_lane
// chi_lane(temp[72], temp[74], temp[76]) -> write to base+2
push 72
call __read_lane
push 74
call __read_lane
push 76
call __read_lane
call __chi3
dup 2
push 2
add
call __write_lane
// chi_lane(temp[74], temp[76], temp[78]) -> write to base+4
push 74
call __read_lane
push 76
call __read_lane
push 78
call __read_lane
call __chi3
dup 2
push 4
add
call __write_lane
// chi_lane(temp[76], temp[78], temp[70]) -> write to base+6
push 76
call __read_lane
push 78
call __read_lane
push 70
call __read_lane
call __chi3
dup 2
push 6
add
call __write_lane
// chi_lane(temp[78], temp[70], temp[72]) -> write to base+8
push 78
call __read_lane
push 70
call __read_lane
push 72
call __read_lane
call __chi3
dup 2
push 8
add
call __write_lane
// Drop base_addr so the routine leaves the stack as documented.
pop 1
return
// __chi3: [a_lo a_hi b_lo b_hi c_lo c_hi] -> [r_lo r_hi]
// r = a ^ (~b & c), evaluated independently on the lo and hi halves.
// Works entirely on the operand stack; no RAM scratch is used.
__chi3:
// Stack: a_lo a_hi b_lo b_hi c_lo c_hi (c_hi on top)
// High half first: t_hi = ~b_hi & c_hi
swap 2
// Stack: a_lo a_hi b_lo c_hi c_lo b_hi
push 4294967295
xor
// Stack: a_lo a_hi b_lo c_hi c_lo ~b_hi
swap 1
swap 2
// Stack: a_lo a_hi b_lo c_lo ~b_hi c_hi
and
// Stack: a_lo a_hi b_lo c_lo t_hi
// Low half: t_lo = ~b_lo & c_lo
swap 2
// Stack: a_lo a_hi t_hi c_lo b_lo
push 4294967295
xor
and
// Stack: a_lo a_hi t_hi t_lo
swap 1
// Stack: a_lo a_hi t_lo t_hi
// r = a ^ t, pairing lo with lo and hi with hi
swap 3
xor
swap 2
xor
// Stack: r_lo r_hi
return
// ===================================================================
// IOTA
// ===================================================================
// iota: [rc_lo rc_hi] -> []
// XOR round constant into lane (0,0).
// Stack: [rc_lo rc_hi] (rc_hi on top)
std_crypto_keccak256__iota:
// Load lane (0,0) from addr 0 on top of the round constant.
push 0
call __read_lane
call std_crypto_keccak256__xor_lane
// Store the result back to addr 0 (address pushed after the lane).
push 0
call __write_lane
return
// ===================================================================
// KECCAK ROUND
// ===================================================================
// keccak_round: [rc_lo rc_hi] -> []
// Applies theta, rho, pi, chi, iota with given round constant.
// The constant is parked in RAM while the first four steps run, since
// they are all called with nothing of this routine's left on the stack.
std_crypto_keccak256__keccak_round:
// Save rc to memory[130,131]
push 130
call __write_lane
call std_crypto_keccak256__theta
call std_crypto_keccak256__rho
call std_crypto_keccak256__pi
call std_crypto_keccak256__chi
// Reload rc and apply iota
push 130
call __read_lane
call std_crypto_keccak256__iota
return
// ===================================================================
// FULL PERMUTATION (24 rounds)
// ===================================================================
// keccak_f1600: [] -> [] runs 24 rounds on the state in RAM.
// Each round pushes its 64-bit round constant as (lo, hi) U32 words, hi on
// top, and calls keccak_round. The hex value in each comment is hi:lo.
std_crypto_keccak256__keccak_f1600:
// Round 0: 0x0000000000000001
push 1
push 0
call std_crypto_keccak256__keccak_round
// Round 1: 0x0000000000008082
push 32898
push 0
call std_crypto_keccak256__keccak_round
// Round 2: 0x800000000000808A
push 32906
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 3: 0x8000000080008000
push 2147516416
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 4: 0x000000000000808B
push 32907
push 0
call std_crypto_keccak256__keccak_round
// Round 5: 0x0000000080000001
push 2147483649
push 0
call std_crypto_keccak256__keccak_round
// Round 6: 0x8000000080008081
push 2147516545
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 7: 0x8000000000008009
push 32777
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 8: 0x000000000000008A
push 138
push 0
call std_crypto_keccak256__keccak_round
// Round 9: 0x0000000000000088
push 136
push 0
call std_crypto_keccak256__keccak_round
// Round 10: 0x0000000080008009
push 2147516425
push 0
call std_crypto_keccak256__keccak_round
// Round 11: 0x000000008000000A
push 2147483658
push 0
call std_crypto_keccak256__keccak_round
// Round 12: 0x000000008000808B
push 2147516555
push 0
call std_crypto_keccak256__keccak_round
// Round 13: 0x800000000000008B
push 139
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 14: 0x8000000000008089
push 32905
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 15: 0x8000000000008003
push 32771
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 16: 0x8000000000008002
push 32770
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 17: 0x8000000000000080
push 128
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 18: 0x000000000000800A
push 32778
push 0
call std_crypto_keccak256__keccak_round
// Round 19: 0x800000008000000A
push 2147483658
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 20: 0x8000000080008081
push 2147516545
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 21: 0x8000000000008080
push 32896
push 2147483648
call std_crypto_keccak256__keccak_round
// Round 22: 0x0000000080000001
push 2147483649
push 0
call std_crypto_keccak256__keccak_round
// Round 23: 0x8000000080008008
push 2147516424
push 2147483648
call std_crypto_keccak256__keccak_round
return
// ===================================================================
// ZERO STATE
// ===================================================================
std_crypto_keccak256__zero_state:
// Push 50 zeros (25 lanes ร 2 words)
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
return
// ===================================================================
// EXTRACT 256
// ===================================================================
// extract256: [] -> [s00_lo s00_hi s10_lo s10_hi s20_lo s20_hi s30_lo s30_hi]
// Returns first 4 lanes (256 bits) as 8 U32 values.
// Reads s00, s10, s20, s30 from memory (addrs 0, 2, 4, 6); s30_hi ends up
// on top of the stack. The state in RAM is not modified.
std_crypto_keccak256__extract256:
push 0
call __read_lane
push 2
call __read_lane
push 4
call __read_lane
push 6
call __read_lane
return
// ===================================================================
// REMAINING .tri API FUNCTIONS (trivial wrappers)
// ===================================================================
// zero_lane: [] -> [0 0]
// Pushes an all-zero lane (lo = 0, hi = 0).
std_crypto_keccak256__zero_lane:
push 0
push 0
return
// make_lane
// Returns immediately without touching the stack.
// NOTE(review): presumably the caller's (lo, hi) arguments are already in
// lane layout, making this a no-op — confirm against the .tri signature.
std_crypto_keccak256__make_lane:
return
// Round constant functions (rc0..rc23) — return (lo, hi) pairs
// Each pushes the low U32 word first and the high U32 word second (hi on
// top). The values match, round for round, the constants pushed inline in
// std_crypto_keccak256__keccak_f1600.
std_crypto_keccak256__rc0:
push 1
push 0
return
std_crypto_keccak256__rc1:
push 32898
push 0
return
std_crypto_keccak256__rc2:
push 32906
push 2147483648
return
std_crypto_keccak256__rc3:
push 2147516416
push 2147483648
return
std_crypto_keccak256__rc4:
push 32907
push 0
return
std_crypto_keccak256__rc5:
push 2147483649
push 0
return
std_crypto_keccak256__rc6:
push 2147516545
push 2147483648
return
std_crypto_keccak256__rc7:
push 32777
push 2147483648
return
std_crypto_keccak256__rc8:
push 138
push 0
return
std_crypto_keccak256__rc9:
push 136
push 0
return
std_crypto_keccak256__rc10:
push 2147516425
push 0
return
std_crypto_keccak256__rc11:
push 2147483658
push 0
return
std_crypto_keccak256__rc12:
push 2147516555
push 0
return
std_crypto_keccak256__rc13:
push 139
push 2147483648
return
std_crypto_keccak256__rc14:
push 32905
push 2147483648
return
std_crypto_keccak256__rc15:
push 32771
push 2147483648
return
std_crypto_keccak256__rc16:
push 32770
push 2147483648
return
std_crypto_keccak256__rc17:
push 128
push 2147483648
return
std_crypto_keccak256__rc18:
push 32778
push 0
return
std_crypto_keccak256__rc19:
push 2147483658
push 2147483648
return
std_crypto_keccak256__rc20:
push 2147516545
push 2147483648
return
std_crypto_keccak256__rc21:
push 32896
push 2147483648
return
std_crypto_keccak256__rc22:
push 2147483649
push 0
return
std_crypto_keccak256__rc23:
push 2147516424
push 2147483648
return