trident/baselines/triton/std/crypto/keccak256.tasm

// Hand-optimized TASM baseline: std.crypto.keccak256
//
// First principles rewrite. Keccak-f[1600] on Triton VM.
//
// State: 25 lanes × 2 U32 = 50 words in RAM[0..49].
// Lane[x][y]: lo at 2*(5y+x), hi at 2*(5y+x)+1.
// Temp: RAM[50..69] for column parities, RAM[70..119] for pi/chi temp.
//
// Key optimizations vs previous baseline:
//   - Single __round function (was 24 copies)
//   - Single __theta_col called 5× (was 5 copies)
//   - __chi_row subroutine called 5× (was 5 inline copies)
//   - read_mem 2 for lane reads where possible
//   - Minimal lane helper subroutines

// ===================================================================
// LANE HELPERS (shared)
// ===================================================================

// xor_lane: [a_lo a_hi b_lo b_hi] -> [r_lo r_hi]   (rightmost = top of stack)
// Word-wise XOR of two 64-bit lanes stored as (lo, hi) u32 pairs:
//   r_lo = a_lo ^ b_lo,  r_hi = a_hi ^ b_hi
// `xor` consumes st0 and st1, so the matching halves of a and b have to be
// made adjacent on top of the stack before each xor.
std_crypto_keccak256__xor_lane:
    // a_lo a_hi b_lo b_hi -> a_lo a_hi b_hi b_lo
    swap 1
    // -> a_lo b_lo b_hi a_hi
    swap 2
    // -> a_lo b_lo r_hi
    xor
    // -> r_hi b_lo a_lo
    swap 2
    // -> r_hi r_lo
    xor
    // -> r_lo r_hi
    swap 1
    return

// and_lane: [a_lo a_hi b_lo b_hi] -> [r_lo r_hi]   (rightmost = top of stack)
// Word-wise AND of two 64-bit lanes stored as (lo, hi) u32 pairs:
//   r_lo = a_lo & b_lo,  r_hi = a_hi & b_hi
// `and` consumes st0 and st1, so the matching halves of a and b have to be
// made adjacent on top of the stack before each and.
std_crypto_keccak256__and_lane:
    // a_lo a_hi b_lo b_hi -> a_lo a_hi b_hi b_lo
    swap 1
    // -> a_lo b_lo b_hi a_hi
    swap 2
    // -> a_lo b_lo r_hi
    and
    // -> r_hi b_lo a_lo
    swap 2
    // -> r_hi r_lo
    and
    // -> r_lo r_hi
    swap 1
    return

// not_lane: [lo hi] -> [~lo ~hi]   (hi on top)
// Bitwise complement of both u32 halves, done as XOR with 0xFFFFFFFF.
// The low half is flipped first, then the high half, so the pair ends up
// in its original order without a trailing swap.
std_crypto_keccak256__not_lane:
    // lo hi -> hi lo
    swap 1
    push 4294967295
    // -> hi ~lo
    xor
    // -> ~lo hi
    swap 1
    push 4294967295
    // -> ~lo ~hi
    xor
    return

// chi_lane: [a_lo a_hi b_lo b_hi c_lo c_hi] -> [r_lo r_hi]   (c_hi on top)
// Keccak chi on one lane triple, applied to each u32 half independently:
//   r = a ^ (~b & c)
// The high halves are combined first, then the low halves, then both are
// XORed into a. Complement is XOR with 0xFFFFFFFF.
std_crypto_keccak256__chi_lane:
    // a_lo a_hi b_lo b_hi c_lo c_hi -> a_lo a_hi b_lo c_hi c_lo b_hi
    swap 2
    push 4294967295
    // -> a_lo a_hi b_lo c_hi c_lo ~b_hi
    xor
    // -> a_lo a_hi b_lo c_hi ~b_hi c_lo
    swap 1
    // -> a_lo a_hi b_lo c_lo ~b_hi c_hi
    swap 2
    // -> a_lo a_hi b_lo c_lo t_hi          (t = ~b & c)
    and
    // -> a_lo a_hi t_hi c_lo b_lo
    swap 2
    push 4294967295
    // -> a_lo a_hi t_hi c_lo ~b_lo
    xor
    // -> a_lo a_hi t_hi t_lo
    and
    // -> a_lo t_lo t_hi a_hi
    swap 2
    // -> a_lo t_lo r_hi
    xor
    // -> r_hi t_lo a_lo
    swap 2
    // -> r_hi r_lo
    xor
    // -> r_lo r_hi
    swap 1
    return

// read_lane: [addr] -> [lo hi]   (hi on top)
// Loads one lane: lo from RAM[addr], hi from RAM[addr+1].
// read_mem walks downwards from its pointer, so a single two-word read is
// started at addr+1; it leaves hi below lo, hence the final swap.
__read_lane:
    push 1
    add
    // addr+1 -> hi lo addr-1
    read_mem 2
    pop 1
    // hi lo -> lo hi
    swap 1
    return

// write_lane: [lo hi addr] -> []   (addr on top of the stack)
// Stores one lane: lo to RAM[addr], hi to RAM[addr+1].
// Every call site in this file pushes the address last
// (`push <addr>; call __write_lane`), so that is the contract implemented.
// write_mem 2 stores st1 at the pointer and st2 at pointer+1, so lo and hi
// are exchanged underneath the address before the store.
__write_lane:
    // lo hi addr -> lo addr hi
    swap 1
    // -> hi addr lo
    swap 2
    // -> hi lo addr
    swap 1
    // RAM[addr] = lo, RAM[addr+1] = hi; leaves addr+2
    write_mem 2
    pop 1
    return



// ===================================================================
// THETA
// ===================================================================

// theta: [] -> []   (state in RAM[0..49] is updated in place)
//   C[x] = s[x][0] ^ s[x][1] ^ s[x][2] ^ s[x][3] ^ s[x][4]   -> RAM[50+2x, 51+2x]
//   D[x] = C[x-1 mod 5] ^ rot1(C[x+1 mod 5])                  -> RAM[60+2x, 61+2x]
//   s[x][y] ^= D[x]                                           (done by __theta_apply_d)
std_crypto_keccak256__theta:
    // Step 1: column parities. The argument is the address of the row-0
    // lane of the column (2x); the parity lands at that address + 50.
    push 0
    call __theta_col
    push 2
    call __theta_col
    push 4
    call __theta_col
    push 6
    call __theta_col
    push 8
    call __theta_col

    // Step 2: D[x] = C[x-1] ^ rot1(C[x+1]), indices mod 5.
    // D0 = C4 ^ rot1(C1)
    push 58
    call __read_lane
    push 52
    call __read_lane
    call __rot1_lane
    call std_crypto_keccak256__xor_lane
    push 60
    call __write_lane

    // D1 = C0 ^ rot1(C2)
    push 50
    call __read_lane
    push 54
    call __read_lane
    call __rot1_lane
    call std_crypto_keccak256__xor_lane
    push 62
    call __write_lane

    // D2 = C1 ^ rot1(C3)
    push 52
    call __read_lane
    push 56
    call __read_lane
    call __rot1_lane
    call std_crypto_keccak256__xor_lane
    push 64
    call __write_lane

    // D3 = C2 ^ rot1(C4)
    push 54
    call __read_lane
    push 58
    call __read_lane
    call __rot1_lane
    call std_crypto_keccak256__xor_lane
    push 66
    call __write_lane

    // D4 = C3 ^ rot1(C0)
    push 56
    call __read_lane
    push 50
    call __read_lane
    call __rot1_lane
    call std_crypto_keccak256__xor_lane
    push 68
    call __write_lane

    // Step 3: fold D[x] into all 25 state lanes.
    call __theta_apply_d
    return

// __theta_col: [base] -> []
// base is the address of a column's row-0 lane. XORs the five lanes at
// base, base+10, base+20, base+30, base+40 (in that order) and stores the
// result at base+50.
__theta_col:
    dup 0
    call __read_lane
    // base acc_lo acc_hi
    dup 2
    push 10
    add
    call __read_lane
    call std_crypto_keccak256__xor_lane
    dup 2
    push 20
    add
    call __read_lane
    call std_crypto_keccak256__xor_lane
    dup 2
    push 30
    add
    call __read_lane
    call std_crypto_keccak256__xor_lane
    dup 2
    push 40
    add
    call __read_lane
    call std_crypto_keccak256__xor_lane
    // base c_lo c_hi -> c_lo c_hi base
    swap 1
    swap 2
    push 50
    add
    call __write_lane
    return

// __theta_apply_d: [] -> []
// Final theta step: s[x][y] ^= D[x] for every column x and row y.
// D[x] is read from RAM[60+2x, 61+2x]; lane s[x][y] lives at 2*(5y+x).
// Each column loads its D once and keeps it on the stack while the five
// lanes of the column are updated one after another.
__theta_apply_d:
    // Column 0: D0 at 60, lanes 0, 10, 20, 30, 40
    push 60
    call __read_lane
    push 0
    call __theta_xor_into
    push 10
    call __theta_xor_into
    push 20
    call __theta_xor_into
    push 30
    call __theta_xor_into
    push 40
    call __theta_xor_into
    pop 2

    // Column 1: D1 at 62, lanes 2, 12, 22, 32, 42
    push 62
    call __read_lane
    push 2
    call __theta_xor_into
    push 12
    call __theta_xor_into
    push 22
    call __theta_xor_into
    push 32
    call __theta_xor_into
    push 42
    call __theta_xor_into
    pop 2

    // Column 2: D2 at 64, lanes 4, 14, 24, 34, 44
    push 64
    call __read_lane
    push 4
    call __theta_xor_into
    push 14
    call __theta_xor_into
    push 24
    call __theta_xor_into
    push 34
    call __theta_xor_into
    push 44
    call __theta_xor_into
    pop 2

    // Column 3: D3 at 66, lanes 6, 16, 26, 36, 46
    push 66
    call __read_lane
    push 6
    call __theta_xor_into
    push 16
    call __theta_xor_into
    push 26
    call __theta_xor_into
    push 36
    call __theta_xor_into
    push 46
    call __theta_xor_into
    pop 2

    // Column 4: D4 at 68, lanes 8, 18, 28, 38, 48
    push 68
    call __read_lane
    push 8
    call __theta_xor_into
    push 18
    call __theta_xor_into
    push 28
    call __theta_xor_into
    push 38
    call __theta_xor_into
    push 48
    call __theta_xor_into
    pop 2
    return

// __theta_xor_into: [d_lo d_hi addr] -> [d_lo d_hi]
// Replaces the lane at addr with D ^ lane and leaves D on the stack for
// the next lane of the same column.
__theta_xor_into:
    // d_lo d_hi addr -> d_lo d_hi addr d_lo d_hi addr
    dup 2
    dup 2
    dup 2
    call __read_lane
    // d_lo d_hi addr d_lo d_hi s_lo s_hi -> d_lo d_hi addr r_lo r_hi
    call std_crypto_keccak256__xor_lane
    // -> d_lo d_hi r_lo r_hi addr
    swap 1
    swap 2
    call __write_lane
    return

// ===================================================================
// ROT1 (used by theta for D computation)
// ===================================================================
// rot1_lane: [lo hi] -> [lo' hi']   rotate the 64-bit lane left by 1
//   lo' = (lo << 1) | (hi >> 31)
//   hi' = (hi << 1) | (lo >> 31)
// Stores shift_factor = 2 in RAM[120] and lets __do_rot do the work.
// write_mem takes the value below the pointer, so the factor is pushed
// first and the address second; no swap may sit between them and the store.
__rot1_lane:
    push 2
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return

// ===================================================================
// RHO (lane rotations by fixed offsets)
// ===================================================================
// rho: rotates 24 of the 25 state lanes in place by a fixed per-lane offset.
// Every lane follows the same five-instruction pattern:
//   push <addr>; call __read_lane; call __rot_<n>; push <addr>; call __write_lane
// i.e. load the lane, rotate it left by n bits, store it back at the same
// address. Lane s00 (addr 0) has offset 0 and is not touched.
// "sXY" below is lane (x, y), stored at address 2*(5y+x).
// Offsets of 32 or more go through the compound helpers, which swap the
// two halves (a rotation by 32) and then rotate by n-32.
std_crypto_keccak256__rho:
    // s10 (addr 2): rot 1
    push 2
    call __read_lane
    call __rot_1
    push 2
    call __write_lane
    // s20 (addr 4): rot 62 = swap + rot 30
    push 4
    call __read_lane
    call __rot_62
    push 4
    call __write_lane
    // s30 (addr 6): rot 28
    push 6
    call __read_lane
    call __rot_28
    push 6
    call __write_lane
    // s40 (addr 8): rot 27
    push 8
    call __read_lane
    call __rot_27
    push 8
    call __write_lane
    // s01 (addr 10): rot 36 = swap + rot 4
    push 10
    call __read_lane
    call __rot_36
    push 10
    call __write_lane
    // s11 (addr 12): rot 44 = swap + rot 12
    push 12
    call __read_lane
    call __rot_44
    push 12
    call __write_lane
    // s21 (addr 14): rot 6
    push 14
    call __read_lane
    call __rot_6
    push 14
    call __write_lane
    // s31 (addr 16): rot 55 = swap + rot 23
    push 16
    call __read_lane
    call __rot_55
    push 16
    call __write_lane
    // s41 (addr 18): rot 20
    push 18
    call __read_lane
    call __rot_20
    push 18
    call __write_lane
    // s02 (addr 20): rot 3
    push 20
    call __read_lane
    call __rot_3
    push 20
    call __write_lane
    // s12 (addr 22): rot 10
    push 22
    call __read_lane
    call __rot_10
    push 22
    call __write_lane
    // s22 (addr 24): rot 43 = swap + rot 11
    push 24
    call __read_lane
    call __rot_43
    push 24
    call __write_lane
    // s32 (addr 26): rot 25
    push 26
    call __read_lane
    call __rot_25
    push 26
    call __write_lane
    // s42 (addr 28): rot 39 = swap + rot 7
    push 28
    call __read_lane
    call __rot_39
    push 28
    call __write_lane
    // s03 (addr 30): rot 41 = swap + rot 9
    push 30
    call __read_lane
    call __rot_41
    push 30
    call __write_lane
    // s13 (addr 32): rot 45 = swap + rot 13
    push 32
    call __read_lane
    call __rot_45
    push 32
    call __write_lane
    // s23 (addr 34): rot 15
    push 34
    call __read_lane
    call __rot_15
    push 34
    call __write_lane
    // s33 (addr 36): rot 21
    push 36
    call __read_lane
    call __rot_21
    push 36
    call __write_lane
    // s43 (addr 38): rot 8
    push 38
    call __read_lane
    call __rot_8
    push 38
    call __write_lane
    // s04 (addr 40): rot 18
    push 40
    call __read_lane
    call __rot_18
    push 40
    call __write_lane
    // s14 (addr 42): rot 2
    push 42
    call __read_lane
    call __rot_2
    push 42
    call __write_lane
    // s24 (addr 44): rot 61 = swap + rot 29
    push 44
    call __read_lane
    call __rot_61
    push 44
    call __write_lane
    // s34 (addr 46): rot 56 = swap + rot 24
    push 46
    call __read_lane
    call __rot_56
    push 46
    call __write_lane
    // s44 (addr 48): rot 14
    push 48
    call __read_lane
    call __rot_14
    push 48
    call __write_lane
    return

// __do_rot: [lo hi] -> [lo' hi']   (hi on top)
// Rotates the 64-bit lane (hi:lo) left by n bits, 0 < n < 32. The caller
// must have stored shift_factor = 2^n in RAM[120] beforehand.
//   lo' = (lo << n) | (hi >> (32-n))
//   hi' = (hi << n) | (lo >> (32-n))
// Multiplying a u32 half by 2^n gives a product below 2^63; `split` then
// yields the bits shifted out of the half ("carry", upper word) and the
// shifted half (lower word, on top). Carry and shifted half never share a
// set bit, so XOR is used as OR when recombining.
// All intermediates stay on the stack; RAM[120] is the only cell touched.
__do_rot:
    push 120
    read_mem 1
    pop 1
    // lo hi f -> lo hi f f
    dup 0
    // -> lo f f hi
    swap 2
    mul
    // -> lo f hi_carry hi_shifted
    split
    // -> lo hi_shifted hi_carry f
    swap 2
    // -> lo hi_shifted f hi_carry
    swap 1
    // -> hi_carry hi_shifted f lo
    swap 3
    mul
    // -> hi_carry hi_shifted lo_carry lo_shifted
    split
    // -> hi_carry lo_shifted lo_carry hi_shifted
    swap 2
    // -> hi_carry lo_shifted new_hi        (new_hi = hi_shifted ^ lo_carry)
    xor
    // -> new_hi lo_shifted hi_carry
    swap 2
    // -> new_hi new_lo                     (new_lo = lo_shifted ^ hi_carry)
    xor
    // -> new_lo new_hi
    swap 1
    return

// Rotation helpers: push shift factor to mem[120], call __do_rot
// For n < 32: shift_factor = 2^n
// For n >= 32: swap lo/hi first, then rotate by n-32

// __rot_1: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 1 bit.
// Stores shift_factor 2^1 = 2 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_1:
    push 2
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_2: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 2 bits.
// Stores shift_factor 2^2 = 4 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_2:
    push 4
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_3: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 3 bits.
// Stores shift_factor 2^3 = 8 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_3:
    push 8
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_6: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 6 bits.
// Stores shift_factor 2^6 = 64 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_6:
    push 64
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_7: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 7 bits.
// Stores shift_factor 2^7 = 128 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_7:
    push 128
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_8: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 8 bits.
// Stores shift_factor 2^8 = 256 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_8:
    push 256
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_9: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 9 bits.
// Stores shift_factor 2^9 = 512 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_9:
    push 512
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_10: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 10 bits.
// Stores shift_factor 2^10 = 1024 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_10:
    push 1024
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_11: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 11 bits.
// Stores shift_factor 2^11 = 2048 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_11:
    push 2048
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_12: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 12 bits.
// Stores shift_factor 2^12 = 4096 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_12:
    push 4096
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_13: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 13 bits.
// Stores shift_factor 2^13 = 8192 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_13:
    push 8192
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_14: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 14 bits.
// Stores shift_factor 2^14 = 16384 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_14:
    push 16384
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_15: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 15 bits.
// Stores shift_factor 2^15 = 32768 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_15:
    push 32768
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_18: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 18 bits.
// Stores shift_factor 2^18 = 262144 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_18:
    push 262144
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_20: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 20 bits.
// Stores shift_factor 2^20 = 1048576 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_20:
    push 1048576
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_21: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 21 bits.
// Stores shift_factor 2^21 = 2097152 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_21:
    push 2097152
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_23: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 23 bits.
// Stores shift_factor 2^23 = 8388608 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_23:
    push 8388608
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_24: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 24 bits.
// Stores shift_factor 2^24 = 16777216 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_24:
    push 16777216
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_25: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 25 bits.
// Stores shift_factor 2^25 = 33554432 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_25:
    push 33554432
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_27: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 27 bits.
// Stores shift_factor 2^27 = 134217728 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_27:
    push 134217728
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_28: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 28 bits.
// Stores shift_factor 2^28 = 268435456 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_28:
    push 268435456
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_29: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 29 bits.
// Stores shift_factor 2^29 = 536870912 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_29:
    push 536870912
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_30: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 30 bits.
// Stores shift_factor 2^30 = 1073741824 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_30:
    push 1073741824
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return

// Compound rotations for n >= 32: swap halves then rotate by n-32
// __rot_36: [lo hi] -> [lo' hi'], rotate left by 36 bits.
// Exchanging the two halves is a rotation by 32; __rot_4 adds the other 4.
__rot_36:
    swap 1
    call __rot_4
    return
// __rot_4: [lo hi] -> [lo' hi'], rotate the 64-bit lane left by 4 bits.
// Stores shift_factor 2^4 = 16 in RAM[120], then calls __do_rot.
// write_mem wants the value below the pointer: push value, push address, store.
__rot_4:
    push 16
    push 120
    write_mem 1
    pop 1
    call __do_rot
    return
// __rot_39: [lo hi] -> [lo' hi'], rotate left by 39 bits.
// Exchanging the two halves is a rotation by 32; __rot_7 adds the other 7.
__rot_39:
    swap 1
    call __rot_7
    return
// __rot_41: [lo hi] -> [lo' hi'], rotate left by 41 bits.
// Exchanging the two halves is a rotation by 32; __rot_9 adds the other 9.
__rot_41:
    swap 1
    call __rot_9
    return
// __rot_43: [lo hi] -> [lo' hi'], rotate left by 43 bits.
// Exchanging the two halves is a rotation by 32; __rot_11 adds the other 11.
__rot_43:
    swap 1
    call __rot_11
    return
// __rot_44: [lo hi] -> [lo' hi'], rotate left by 44 bits.
// Exchanging the two halves is a rotation by 32; __rot_12 adds the other 12.
__rot_44:
    swap 1
    call __rot_12
    return
// __rot_45: [lo hi] -> [lo' hi'], rotate left by 45 bits.
// Exchanging the two halves is a rotation by 32; __rot_13 adds the other 13.
__rot_45:
    swap 1
    call __rot_13
    return
// __rot_55: [lo hi] -> [lo' hi'], rotate left by 55 bits.
// Exchanging the two halves is a rotation by 32; __rot_23 adds the other 23.
__rot_55:
    swap 1
    call __rot_23
    return
// __rot_56: [lo hi] -> [lo' hi'], rotate left by 56 bits.
// Exchanging the two halves is a rotation by 32; __rot_24 adds the other 24.
__rot_56:
    swap 1
    call __rot_24
    return
// __rot_61: [lo hi] -> [lo' hi'], rotate left by 61 bits.
// Exchanging the two halves is a rotation by 32; __rot_29 adds the other 29.
__rot_61:
    swap 1
    call __rot_29
    return
// __rot_62: [lo hi] -> [lo' hi'], rotate left by 62 bits.
// Exchanging the two halves is a rotation by 32; __rot_30 adds the other 30.
__rot_62:
    swap 1
    call __rot_30
    return

// ===================================================================
// PI (lane permutation)
// ===================================================================
// pi: [] -> []   (state in RAM[0..49] is permuted in place)
//   new[y][2x+3y mod 5] = old[x][y]
// Phase 1 snapshots state[0..49] into temp[70..119]; phase 2 copies every
// lane from the snapshot to its permuted position in the state.
std_crypto_keccak256__pi:
    // Phase 1: snapshot, five words per step.
    // read_mem 5 starts at the LAST word of a group and walks downwards,
    // leaving the lowest-address word on top under the spent pointer.
    // After dropping that pointer, write_mem 5 at the group's first
    // destination address stores the words back in ascending order.
    // state[0..4] -> temp[70..74]
    push 4
    read_mem 5
    pop 1
    push 70
    write_mem 5
    pop 1

    // state[5..9] -> temp[75..79]
    push 9
    read_mem 5
    pop 1
    push 75
    write_mem 5
    pop 1

    // state[10..14] -> temp[80..84]
    push 14
    read_mem 5
    pop 1
    push 80
    write_mem 5
    pop 1

    // state[15..19] -> temp[85..89]
    push 19
    read_mem 5
    pop 1
    push 85
    write_mem 5
    pop 1

    // state[20..24] -> temp[90..94]
    push 24
    read_mem 5
    pop 1
    push 90
    write_mem 5
    pop 1

    // state[25..29] -> temp[95..99]
    push 29
    read_mem 5
    pop 1
    push 95
    write_mem 5
    pop 1

    // state[30..34] -> temp[100..104]
    push 34
    read_mem 5
    pop 1
    push 100
    write_mem 5
    pop 1

    // state[35..39] -> temp[105..109]
    push 39
    read_mem 5
    pop 1
    push 105
    write_mem 5
    pop 1

    // state[40..44] -> temp[110..114]
    push 44
    read_mem 5
    pop 1
    push 110
    write_mem 5
    pop 1

    // state[45..49] -> temp[115..119]
    push 49
    read_mem 5
    pop 1
    push 115
    write_mem 5
    pop 1

    // Phase 2: write back in permuted order.
    // Destination lane (X, Y) takes source lane ((X + 3Y) mod 5, X), which
    // sits in the snapshot at temp[70 + 2*(5*src_y + src_x)].
    // s00 <- s00 (temp 70,71) -> addr 0,1
    push 70
    call __read_lane
    push 0
    call __write_lane
    // s10 <- s11 (temp 82,83) -> addr 2,3
    push 82
    call __read_lane
    push 2
    call __write_lane
    // s20 <- s22 (temp 94,95) -> addr 4,5
    push 94
    call __read_lane
    push 4
    call __write_lane
    // s30 <- s33 (temp 106,107) -> addr 6,7
    push 106
    call __read_lane
    push 6
    call __write_lane
    // s40 <- s44 (temp 118,119) -> addr 8,9
    push 118
    call __read_lane
    push 8
    call __write_lane
    // s01 <- s30 (temp 76,77) -> addr 10,11
    push 76
    call __read_lane
    push 10
    call __write_lane
    // s11 <- s41 (temp 88,89) -> addr 12,13
    push 88
    call __read_lane
    push 12
    call __write_lane
    // s21 <- s02 (temp 90,91) -> addr 14,15
    push 90
    call __read_lane
    push 14
    call __write_lane
    // s31 <- s13 (temp 102,103) -> addr 16,17
    push 102
    call __read_lane
    push 16
    call __write_lane
    // s41 <- s24 (temp 114,115) -> addr 18,19
    push 114
    call __read_lane
    push 18
    call __write_lane
    // s02 <- s10 (temp 72,73) -> addr 20,21
    push 72
    call __read_lane
    push 20
    call __write_lane
    // s12 <- s21 (temp 84,85) -> addr 22,23
    push 84
    call __read_lane
    push 22
    call __write_lane
    // s22 <- s32 (temp 96,97) -> addr 24,25
    push 96
    call __read_lane
    push 24
    call __write_lane
    // s32 <- s43 (temp 108,109) -> addr 26,27
    push 108
    call __read_lane
    push 26
    call __write_lane
    // s42 <- s04 (temp 110,111) -> addr 28,29
    push 110
    call __read_lane
    push 28
    call __write_lane
    // s03 <- s40 (temp 78,79) -> addr 30,31
    push 78
    call __read_lane
    push 30
    call __write_lane
    // s13 <- s01 (temp 80,81) -> addr 32,33
    push 80
    call __read_lane
    push 32
    call __write_lane
    // s23 <- s12 (temp 92,93) -> addr 34,35
    push 92
    call __read_lane
    push 34
    call __write_lane
    // s33 <- s23 (temp 104,105) -> addr 36,37
    push 104
    call __read_lane
    push 36
    call __write_lane
    // s43 <- s34 (temp 116,117) -> addr 38,39
    push 116
    call __read_lane
    push 38
    call __write_lane
    // s04 <- s20 (temp 74,75) -> addr 40,41
    push 74
    call __read_lane
    push 40
    call __write_lane
    // s14 <- s31 (temp 86,87) -> addr 42,43
    push 86
    call __read_lane
    push 42
    call __write_lane
    // s24 <- s42 (temp 98,99) -> addr 44,45
    push 98
    call __read_lane
    push 44
    call __write_lane
    // s34 <- s03 (temp 100,101) -> addr 46,47
    push 100
    call __read_lane
    push 46
    call __write_lane
    // s44 <- s14 (temp 112,113) -> addr 48,49
    push 112
    call __read_lane
    push 48
    call __write_lane
    return

// ===================================================================
// CHI (nonlinear row mixing)
// ===================================================================
// chi: applies the chi step to the five rows of the state, one
// __chi_row call per row.
// The value pushed before each call is the address of the row's first
// lane: row y starts at address 10*y (5 lanes of 2 words per row).
// chi_lane(a, b, c) = a ^ (~b & c), where b = next lane, c = lane after that.
std_crypto_keccak256__chi:
    // Row 0: lanes at addrs 0,2,4,6,8
    push 0
    call __chi_row
    // Row 1: lanes at addrs 10,12,14,16,18
    push 10
    call __chi_row
    // Row 2: lanes at addrs 20,22,24,26,28
    push 20
    call __chi_row
    // Row 3: lanes at addrs 30,32,34,36,38
    push 30
    call __chi_row
    // Row 4: lanes at addrs 40,42,44,46,48
    push 40
    call __chi_row
    return

// __chi_row: [base] -> []
// Applies chi to the five lanes of one row, stored at base, base+2, ...,
// base+8. The row is first copied to temp[70..79] because every output
// lane needs the ORIGINAL values of its two right-hand neighbours:
//   out[i] = in[i] ^ (~in[i+1 mod 5] & in[i+2 mod 5])
// base stays at the bottom of the working stack for the whole routine.
// After each __chi3 the stack is [base r_lo r_hi]; `dup 2` fetches base so
// that __write_lane receives the [lo hi addr] triple used by the other
// call sites in this file.
__chi_row:
    // Copy 5 lanes (10 words) to temp[70..79]
    dup 0
    call __read_lane
    push 70
    call __write_lane
    dup 0
    push 2
    add
    call __read_lane
    push 72
    call __write_lane
    dup 0
    push 4
    add
    call __read_lane
    push 74
    call __write_lane
    dup 0
    push 6
    add
    call __read_lane
    push 76
    call __write_lane
    dup 0
    push 8
    add
    call __read_lane
    push 78
    call __write_lane

    // out[0] = chi(temp[70], temp[72], temp[74]) -> base+0
    push 70
    call __read_lane
    push 72
    call __read_lane
    push 74
    call __read_lane
    call __chi3
    // base r_lo r_hi -> base r_lo r_hi base
    dup 2
    call __write_lane

    // out[1] = chi(temp[72], temp[74], temp[76]) -> base+2
    push 72
    call __read_lane
    push 74
    call __read_lane
    push 76
    call __read_lane
    call __chi3
    dup 2
    push 2
    add
    call __write_lane

    // out[2] = chi(temp[74], temp[76], temp[78]) -> base+4
    push 74
    call __read_lane
    push 76
    call __read_lane
    push 78
    call __read_lane
    call __chi3
    dup 2
    push 4
    add
    call __write_lane

    // out[3] = chi(temp[76], temp[78], temp[70]) -> base+6
    push 76
    call __read_lane
    push 78
    call __read_lane
    push 70
    call __read_lane
    call __chi3
    dup 2
    push 6
    add
    call __write_lane

    // out[4] = chi(temp[78], temp[70], temp[72]) -> base+8
    push 78
    call __read_lane
    push 70
    call __read_lane
    push 72
    call __read_lane
    call __chi3
    // Last use of base: rotate it to the top instead of duplicating it.
    // base r_lo r_hi -> base r_hi r_lo -> r_lo r_hi base
    swap 1
    swap 2
    push 8
    add
    call __write_lane
    return

// __chi3: [a_lo a_hi b_lo b_hi c_lo c_hi] -> [r_lo r_hi]   (c_hi on top)
// r = a ^ (~b & c), applied to the lo and hi u32 halves independently.
// Works entirely on the stack; no RAM scratch cells are used.
// Complement is XOR with 0xFFFFFFFF.
__chi3:
    // a_lo a_hi b_lo b_hi c_lo c_hi -> a_lo a_hi b_lo c_hi c_lo b_hi
    swap 2
    push 4294967295
    // -> a_lo a_hi b_lo c_hi c_lo ~b_hi
    xor
    // -> a_lo a_hi b_lo c_hi ~b_hi c_lo
    swap 1
    // -> a_lo a_hi b_lo c_lo ~b_hi c_hi
    swap 2
    // -> a_lo a_hi b_lo c_lo t_hi          (t = ~b & c)
    and
    // -> a_lo a_hi t_hi c_lo b_lo
    swap 2
    push 4294967295
    // -> a_lo a_hi t_hi c_lo ~b_lo
    xor
    // -> a_lo a_hi t_hi t_lo
    and
    // -> a_lo t_lo t_hi a_hi
    swap 2
    // -> a_lo t_lo r_hi
    xor
    // -> r_hi t_lo a_lo
    swap 2
    // -> r_hi r_lo
    xor
    // -> r_lo r_hi
    swap 1
    return

// ===================================================================
// IOTA
// ===================================================================
// iota: XOR a round constant into lane (0,0).
// Stack on entry: [rc_lo rc_hi] (rc_hi on top).
// Reads the lane at address 0 on top of rc, combines the two with
// xor_lane, and writes the result back to address 0.
std_crypto_keccak256__iota:
    // rc -> rc s00
    push 0
    call __read_lane
    call std_crypto_keccak256__xor_lane
    // store the combined lane back at address 0
    push 0
    call __write_lane
    return

// ===================================================================
// KECCAK ROUND
// ===================================================================
// keccak_round: [rc_lo rc_hi] -> []
// One Keccak round on the RAM-resident state: theta, rho, pi, chi, then
// iota with the round constant passed on the stack.
// The constant is parked in RAM[130,131] while the first four steps run
// and is reloaded just before iota.
// NOTE(review): if theta/rho/pi/chi never touch the caller's stack, rc
// could simply stay on the stack and the spill be dropped — confirm.
std_crypto_keccak256__keccak_round:
    // Save rc to memory[130,131]
    push 130
    call __write_lane
    call std_crypto_keccak256__theta
    call std_crypto_keccak256__rho
    call std_crypto_keccak256__pi
    call std_crypto_keccak256__chi
    // Reload rc and apply iota
    push 130
    call __read_lane
    call std_crypto_keccak256__iota
    return

// ===================================================================
// FULL PERMUTATION (24 rounds)
// ===================================================================
// keccak_f1600: [] -> []
// Runs the 24 rounds of the permutation on the state in RAM. Round i takes
// its (lo, hi) round constant from the rc<i> accessor defined in this
// file, which pushes the same pair of literals the round expects.
std_crypto_keccak256__keccak_f1600:
    // rounds 0..5
    call std_crypto_keccak256__rc0
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc1
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc2
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc3
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc4
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc5
    call std_crypto_keccak256__keccak_round
    // rounds 6..11
    call std_crypto_keccak256__rc6
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc7
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc8
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc9
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc10
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc11
    call std_crypto_keccak256__keccak_round
    // rounds 12..17
    call std_crypto_keccak256__rc12
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc13
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc14
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc15
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc16
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc17
    call std_crypto_keccak256__keccak_round
    // rounds 18..23
    call std_crypto_keccak256__rc18
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc19
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc20
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc21
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc22
    call std_crypto_keccak256__keccak_round
    call std_crypto_keccak256__rc23
    call std_crypto_keccak256__keccak_round
    return

// ===================================================================
// ZERO STATE
// ===================================================================
// zero_state: [] -> [0 x 50]
// Leaves 50 zero words on the stack (25 lanes × 2 words). One zero is
// pushed and then duplicated 49 times; the grouping below is one row of
// five lanes (10 words) per paragraph.
std_crypto_keccak256__zero_state:
    // row 0
    push 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    // row 1
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    // row 2
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    // row 3
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    // row 4
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    dup 0
    return

// ===================================================================
// EXTRACT 256
// ===================================================================
// extract256: [] -> [s00_lo s00_hi s10_lo s10_hi s20_lo s20_hi s30_lo s30_hi]
// Pushes the first four state lanes (RAM[0..7], 256 bits) as eight u32
// words, s30_hi on top.
// Each lane is fetched with one two-word read that starts at the lane's
// hi word; the read leaves hi below lo, so the pair is swapped into
// (lo, hi) order.
std_crypto_keccak256__extract256:
    // s00: RAM[0], RAM[1]
    push 1
    read_mem 2
    pop 1
    swap 1
    // s10: RAM[2], RAM[3]
    push 3
    read_mem 2
    pop 1
    swap 1
    // s20: RAM[4], RAM[5]
    push 5
    read_mem 2
    pop 1
    swap 1
    // s30: RAM[6], RAM[7]
    push 7
    read_mem 2
    pop 1
    swap 1
    return

// ===================================================================
// REMAINING .tri API FUNCTIONS (trivial wrappers)
// ===================================================================

// zero_lane: [] -> [0 0]
// Pushes an all-zero lane as a (lo, hi) pair of words.
std_crypto_keccak256__zero_lane:
    push 0
    dup 0
    return

// make_lane: returns immediately and leaves the stack untouched.
// NOTE(review): presumably the caller has already pushed (lo, hi), so the
// two words on top are the lane as-is — confirm against the .tri API.
std_crypto_keccak256__make_lane:
    return

// Round constant functions (rc0..rc23) — return (lo, hi) pairs
// rc0: pushes lo = 1 (0x00000001), hi = 0.
std_crypto_keccak256__rc0:
    push 1
    push 0
    return
// rc1: pushes lo = 32898 (0x00008082), hi = 0.
std_crypto_keccak256__rc1:
    push 32898
    push 0
    return
// rc2: pushes lo = 32906 (0x0000808A), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc2:
    push 32906
    push 2147483648
    return
// rc3: pushes lo = 2147516416 (0x80008000), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc3:
    push 2147516416
    push 2147483648
    return
// rc4: pushes lo = 32907 (0x0000808B), hi = 0.
std_crypto_keccak256__rc4:
    push 32907
    push 0
    return
// rc5: pushes lo = 2147483649 (0x80000001), hi = 0.
std_crypto_keccak256__rc5:
    push 2147483649
    push 0
    return
// rc6: pushes lo = 2147516545 (0x80008081), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc6:
    push 2147516545
    push 2147483648
    return
// rc7: pushes lo = 32777 (0x00008009), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc7:
    push 32777
    push 2147483648
    return
// rc8: pushes lo = 138 (0x0000008A), hi = 0.
std_crypto_keccak256__rc8:
    push 138
    push 0
    return
// rc9: pushes lo = 136 (0x00000088), hi = 0.
std_crypto_keccak256__rc9:
    push 136
    push 0
    return
// rc10: pushes lo = 2147516425 (0x80008009), hi = 0.
std_crypto_keccak256__rc10:
    push 2147516425
    push 0
    return
// rc11: pushes lo = 2147483658 (0x8000000A), hi = 0.
std_crypto_keccak256__rc11:
    push 2147483658
    push 0
    return
// rc12: pushes lo = 2147516555 (0x8000808B), hi = 0.
std_crypto_keccak256__rc12:
    push 2147516555
    push 0
    return
// rc13: pushes lo = 139 (0x0000008B), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc13:
    push 139
    push 2147483648
    return
// rc14: pushes lo = 32905 (0x00008089), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc14:
    push 32905
    push 2147483648
    return
// rc15: pushes lo = 32771 (0x00008003), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc15:
    push 32771
    push 2147483648
    return
// rc16: pushes lo = 32770 (0x00008002), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc16:
    push 32770
    push 2147483648
    return
// rc17: pushes lo = 128 (0x00000080), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc17:
    push 128
    push 2147483648
    return
// rc18: pushes lo = 32778 (0x0000800A), hi = 0.
std_crypto_keccak256__rc18:
    push 32778
    push 0
    return
// rc19: pushes lo = 2147483658 (0x8000000A), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc19:
    push 2147483658
    push 2147483648
    return
// rc20: pushes lo = 2147516545 (0x80008081), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc20:
    push 2147516545
    push 2147483648
    return
// rc21: pushes lo = 32896 (0x00008080), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc21:
    push 32896
    push 2147483648
    return
// rc22: pushes lo = 2147483649 (0x80000001), hi = 0.
std_crypto_keccak256__rc22:
    push 2147483649
    push 0
    return
// rc23: pushes lo = 2147516424 (0x80008008), hi = 2147483648 (0x80000000).
std_crypto_keccak256__rc23:
    push 2147516424
    push 2147483648
    return

Neighbours