// Hand-optimized TASM baseline: std.crypto.poseidon
//
// First principles rewrite. Key decisions:
//   - sbox (x^5): 6 insns — dup/dup/mul/dup/mul/mul
//   - mix2: 12 insns — S=a+b, new_a=S+a, new_b=S+2b, pure stack
//   - mix4: 26 insns — S=a+b+c+d, pure stack swap-replace (no memory)
//   - hash2/hash4: call subroutines (call is ~1 cycle on Triton VM)
//   - round2/round4: call sbox/mix as subroutines from the round body
//
// Formulas:
//   sbox(x) = x^5
//   mix2(a, b) = (2a+b, a+3b)
//   mix4(a,b,c,d) = (S+a, S+b, S+c, S+d) where S=a+b+c+d


// sbox: x -> x^5
// Input: x  Output: x^5
// Keeps the original x at the bottom, squares a copy twice (x^2, x^4),
// then multiplies the two remaining values: x * x^4 = x^5.
// x -> x,x -> x,x,x -> x,x^2 -> x,x^2,x^2 -> x,x^4 -> x^5
std_crypto_poseidon__sbox:
    // Stack: x
    dup 0
    // Stack: x x
    dup 0
    // Stack: x x x
    mul
    // Stack: x x^2
    dup 0
    // Stack: x x^2 x^2
    mul
    // Stack: x x^4
    mul
    // Stack: x^5
    return


// mix2: linear mixing layer for the 2-element state
//   (a, b) -> (2a+b, a+3b)
// Input: a b  (b on top)  Output: new_a new_b  (new_b on top)
// Both outputs are derived from the shared sum S = a + b:
//   new_a = a + S = 2a + b
//   new_b = S + b + b = a + 3b
std_crypto_poseidon__mix2:
    // --- S = a + b ---
    dup 1
    // Stack: a b a
    dup 1
    // Stack: a b a b
    add
    // Stack: a b S
    // --- new_a = a + S, written over the slot holding a ---
    dup 2
    // Stack: a b S a
    dup 1
    // Stack: a b S a S
    add
    // Stack: a b S new_a
    swap 3
    // Stack: new_a b S a
    pop 1
    // Stack: new_a b S
    // --- new_b = S + 2b ---
    swap 1
    // Stack: new_a S b
    dup 0
    // Stack: new_a S b b
    add
    // Stack: new_a S 2b
    add
    // Stack: new_a new_b
    return


// mix4: (a,b,c,d) -> (S+a, S+b, S+c, S+d) where S=a+b+c+d
// Input: a b c d  (d on top)  Output: na nb nc nd  (nd on top)
// Pure stack — no memory operations.
// S is accumulated on top of the stack; each output then overwrites its
// own input slot via "dup S / dup x / add / swap / pop".
std_crypto_poseidon__mix4:
    // Compute S = a+b+c+d.
    // Every add shrinks the stack by one, so the depth of the next operand
    // drops by one each time: a and b sit at depth 3, then c at 2, d at 1.
    dup 3
    // Stack: a b c d a
    dup 3
    // Stack: a b c d a b
    add
    // Stack: a b c d (a+b)
    dup 2
    // Stack: a b c d (a+b) c
    add
    // Stack: a b c d (a+b+c)
    dup 1
    // Stack: a b c d (a+b+c) d
    add
    // Stack: a b c d S
    // na = S + a
    dup 0
    dup 5
    add
    // Stack: a b c d S na
    swap 5
    pop 1
    // Stack: na b c d S
    // nb = S + b
    dup 0
    dup 4
    add
    // Stack: na b c d S nb
    swap 4
    pop 1
    // Stack: na nb c d S
    // nc = S + c
    dup 0
    dup 3
    add
    // Stack: na nb c d S nc
    swap 3
    pop 1
    // Stack: na nb nc d S
    // nd = S + d (S is no longer needed afterwards, so it is consumed)
    dup 1
    add
    // Stack: na nb nc d nd
    swap 1
    pop 1
    // Stack: na nb nc nd
    return


// round2: one full round for 2-element state
// Input: a b rc0 rc1  (rc1 on top)  Output: new_a new_b  (new_b on top)
// Steps: a1 = a + rc0, b1 = b + rc1; apply sbox to both; then mix2.
std_crypto_poseidon__round2:
    // Add round constants.
    // Stack: a b rc0 rc1
    // Bring a next to rc0 by exchanging it with rc1.
    swap 3
    // Stack: rc1 b rc0 a
    add
    // Stack: rc1 b a1          (a1 = a + rc0)
    // Move a1 back to the bottom slot; this also lifts rc1 next to b.
    swap 2
    // Stack: a1 b rc1
    add
    // Stack: a1 b1             (b1 = b + rc1)
    // sbox both: sbox works on the top element only, so swap between calls
    call std_crypto_poseidon__sbox
    // Stack: a1 b2
    swap 1
    // Stack: b2 a1
    call std_crypto_poseidon__sbox
    // Stack: b2 a2
    swap 1
    // Stack: a2 b2
    // mix2
    call std_crypto_poseidon__mix2
    // Stack: new_a new_b
    return


// round4: one full round for the 4-element state
// Input: a b c d rc0 rc1 rc2 rc3  (rc3 on top)  Output: na nb nc nd
// Phases: add round constant rc_i to each state element, raise each
// element to the 5th power with sbox, then apply mix4.
std_crypto_poseidon__round4:
    // ---- Phase 1: add round constants ----
    // The constant on top always pairs with the state element 4 slots
    // below it. Each group copies that element, adds, swaps the sum into
    // the element's slot and drops the stale copy.
    // Stack: a b c d rc0 rc1 rc2 rc3
    dup 4
    add
    swap 4
    pop 1
    // Stack: a b c d1 rc0 rc1 rc2
    dup 4
    add
    swap 4
    pop 1
    // Stack: a b c1 d1 rc0 rc1
    dup 4
    add
    swap 4
    pop 1
    // Stack: a b1 c1 d1 rc0
    dup 4
    add
    swap 4
    pop 1
    // Stack: a1 b1 c1 d1
    // ---- Phase 2: sbox every element ----
    // sbox only touches the top of the stack, so walk downwards by
    // swapping each untreated element to the top, then undo the
    // resulting rotation once at the end.
    call std_crypto_poseidon__sbox
    // Stack: a1 b1 c1 d2
    swap 1
    // Stack: a1 b1 d2 c1
    call std_crypto_poseidon__sbox
    // Stack: a1 b1 d2 c2
    swap 2
    // Stack: a1 c2 d2 b1
    call std_crypto_poseidon__sbox
    // Stack: a1 c2 d2 b2
    swap 3
    // Stack: b2 c2 d2 a1
    call std_crypto_poseidon__sbox
    // Stack: b2 c2 d2 a2
    swap 3
    // Stack: a2 c2 d2 b2
    swap 2
    // Stack: a2 b2 d2 c2
    swap 1
    // Stack: a2 b2 c2 d2
    // ---- Phase 3: linear mixing ----
    call std_crypto_poseidon__mix4
    return


// hash2: Poseidon hash of 2 field elements -> 1 field element
// Input: a b  (b on top)  Output: result
// 4 rounds with constants (rc0, rc1) = (3,7), (11,13), (17,19), (23,29)
// round2 takes its operands as "a b rc0 rc1" with rc1 on top, so rc0 is
// pushed first and rc1 last.
std_crypto_poseidon__hash2:
    // Round 1: rc0=3, rc1=7
    push 3
    push 7
    call std_crypto_poseidon__round2
    // Round 2: rc0=11, rc1=13
    push 11
    push 13
    call std_crypto_poseidon__round2
    // Round 3: rc0=17, rc1=19
    push 17
    push 19
    call std_crypto_poseidon__round2
    // Round 4: rc0=23, rc1=29
    push 23
    push 29
    call std_crypto_poseidon__round2
    // Stack: s0 s1 -- return the first element by dropping s1
    pop 1
    return


// hash4: Poseidon hash of 4 field elements -> 1 field element
// Input: a b c d  (d on top)  Output: result
// 4 rounds with constants (rc0, rc1, rc2, rc3) =
//   (3,7,11,13), (17,19,23,29), (31,37,41,43), (47,53,59,61)
// round4 takes its operands as "a b c d rc0 rc1 rc2 rc3" with rc3 on top,
// so the constants are pushed in rc0..rc3 order.
std_crypto_poseidon__hash4:
    // Round 1: (3, 7, 11, 13)
    push 3
    push 7
    push 11
    push 13
    call std_crypto_poseidon__round4
    // Round 2: (17, 19, 23, 29)
    push 17
    push 19
    push 23
    push 29
    call std_crypto_poseidon__round4
    // Round 3: (31, 37, 41, 43)
    push 31
    push 37
    push 41
    push 43
    call std_crypto_poseidon__round4
    // Round 4: (47, 53, 59, 61)
    push 47
    push 53
    push 59
    push 61
    call std_crypto_poseidon__round4
    // Stack: s0 s1 s2 s3 -- return the first element by dropping s1..s3
    pop 3
    return


// hash1: hash single field element (domain tag = 1)
// Input: a  Output: result
// Pushes the constant 1 as the second input and delegates to hash2,
// i.e. result = hash2(a, 1).
std_crypto_poseidon__hash1:
    // Stack: a
    push 1
    // Stack: a 1
    call std_crypto_poseidon__hash2
    // Stack: result
    return


// hash3: hash 3 field elements via chained hash2
// Input: a b c  (c on top)  Output: result
// Computes result = hash2(hash2(a, b), c).
std_crypto_poseidon__hash3:
    // Stack: a b c
    // Rotate c to the bottom so that a b sit on top in hash2's input order.
    swap 2
    // Stack: c b a
    swap 1
    // Stack: c a b
    call std_crypto_poseidon__hash2
    // Stack: c h          (h = hash2(a, b))
    // Put h first and c second for the chained call.
    swap 1
    // Stack: h c
    call std_crypto_poseidon__hash2
    // Stack: result
    return

// Neighbours