// Hand-optimized TASM baseline: std.crypto.poseidon
//
// First principles rewrite. Key decisions:
// - sbox (x^5): 6 insns — dup/dup/mul/dup/mul/mul
// - mix2: 12 insns — S=a+b, new_a=S+a, new_b=S+2b, pure stack
// - mix4: 26 insns — S=a+b+c+d, pure stack swap-replace (no memory)
// - hash2/hash4: call subroutines (call is ~1 cycle on Triton VM)
// - round2/round4: call sbox/mix inline
//
// Formulas:
// sbox(x) = x^5
// mix2(a, b) = (2a+b, a+3b)
// mix4(a,b,c,d) = (S+a, S+b, S+c, S+d) where S=a+b+c+d
// sbox: raise the top stack element to the fifth power.
// Input:  x   (top of stack)
// Output: x^5 (replaces x; elements below are untouched)
// Three multiplications: x*x = x^2, x^2*x^2 = x^4, x^4*x = x^5.
std_crypto_poseidon__sbox:
dup 0
// Stack: x x  (the bottom copy is kept for the final multiply)
dup 0
mul
// Stack: x x^2
dup 0
mul
// Stack: x x^4
mul
// Stack: x^5
return
// mix2: linear mixing layer for a 2-element state.
// (a, b) -> (2a+b, a+3b)
// Input:  a b         (b on top)
// Output: new_a new_b (new_b on top)
// With S = a + b:  new_a = S + a = 2a+b,  new_b = S + 2b = a+3b.
// Pure stack: 12 instructions plus return.
std_crypto_poseidon__mix2:
// S = b + a
dup 0
dup 2
add
// Stack: a b S
// new_a = S + a (S is duplicated first because it is needed again for new_b)
dup 0
dup 3
add
// Stack: a b S new_a
// Overwrite the original a with new_a, then drop the stale a.
swap 3
pop 1
// Stack: new_a b S
// new_b = S + (b + b)
swap 1
// Stack: new_a S b
dup 0
add
// Stack: new_a S 2b
add
// Stack: new_a new_b
return
// mix4: linear mixing layer for a 4-element state.
// (a, b, c, d) -> (S+a, S+b, S+c, S+d) where S = a+b+c+d
// Input:  a b c d     (d on top)
// Output: na nb nc nd (nd on top)
// Pure stack: no memory operations. 26 instructions plus return.
std_crypto_poseidon__mix4:
// Compute S = a+b+c+d.
// The running sum sits on top of the four inputs, so after the first add
// the next operand is reached with dup 2 (c) and then dup 1 (d).
dup 3
// Stack: a b c d a
dup 3
// Stack: a b c d a b
add
// Stack: a b c d (a+b)
dup 2
// Stack: a b c d (a+b) c
add
// Stack: a b c d (a+b+c)
dup 1
// Stack: a b c d (a+b+c) d
add
// Stack: a b c d S
// na = S + a; swap it into a's slot and drop the old a.
dup 0
dup 5
add
swap 5
pop 1
// Stack: na b c d S
// nb = S + b
dup 0
dup 4
add
swap 4
pop 1
// Stack: na nb c d S
// nc = S + c
dup 0
dup 3
add
swap 3
pop 1
// Stack: na nb nc d S
// nd = S + d; S is consumed here because it is no longer needed.
dup 1
add
// Stack: na nb nc d nd
swap 1
pop 1
// Stack: na nb nc nd
return
// round2: one full round for a 2-element state:
// add round constants, apply sbox to both elements, then mix2.
// Input:  a b rc0 rc1 (rc1 on top)
// Output: new_a new_b (new_b on top)
// where (new_a, new_b) = mix2((a + rc0)^5, (b + rc1)^5).
std_crypto_poseidon__round2:
// b1 = b + rc1: move b next to rc1, then add.
swap 1
// Stack: a b rc1 rc0
swap 2
// Stack: a rc0 rc1 b
add
// Stack: a rc0 b1
// a1 = a + rc0: move a next to rc0, then add.
swap 2
// Stack: b1 rc0 a
add
// Stack: b1 a1
// sbox both elements. a1 is already on top, so a single swap between the
// two calls both exposes b1 and restores the (a, b) order mix2 expects.
call std_crypto_poseidon__sbox
// Stack: b1 a2
swap 1
// Stack: a2 b1
call std_crypto_poseidon__sbox
// Stack: a2 b2
// mix2
call std_crypto_poseidon__mix2
return
// round4: one full round for a 4-element state:
// add round constants, apply sbox to all four elements, then mix4.
// Input:  a b c d rc0 rc1 rc2 rc3 (rc3 on top)
// Output: na nb nc nd             (nd on top)
std_crypto_poseidon__round4:
// Each constant is consumed from the top. The matching state element is
// always 4 slots below it, so the same dup 4 / add / swap 4 / pop 1 pattern
// repeats: compute the sum, swap it into the element's slot, drop the old value.
// d1 = d + rc3
dup 4
add
swap 4
pop 1
// Stack: a b c d1 rc0 rc1 rc2
// c1 = c + rc2
dup 4
add
swap 4
pop 1
// Stack: a b c1 d1 rc0 rc1
// b1 = b + rc1
dup 4
add
swap 4
pop 1
// Stack: a b1 c1 d1 rc0
// a1 = a + rc0
dup 4
add
swap 4
pop 1
// Stack: a1 b1 c1 d1
// sbox all 4: sbox works on the top element only, so each remaining element
// is swapped to the top, transformed, and swapped back to its slot.
call std_crypto_poseidon__sbox
// Stack: a1 b1 c1 d2
swap 1
call std_crypto_poseidon__sbox
swap 1
// Stack: a1 b1 c2 d2
swap 2
call std_crypto_poseidon__sbox
swap 2
// Stack: a1 b2 c2 d2
swap 3
call std_crypto_poseidon__sbox
swap 3
// Stack: a2 b2 c2 d2
call std_crypto_poseidon__mix4
return
// hash2: Poseidon hash of 2 field elements -> 1 field element
// Input:  a b (b on top)
// Output: result (the first element of the state after the last round)
// 4 rounds with constants (3,7), (11,13), (17,19), (23,29)
// Each round pushes its two constants and calls round2, which consumes them
// and leaves the updated 2-element state on the stack.
// NOTE(review): each pair is pushed second-listed constant first, so the
// first-listed constant ends up on top. Under round2's documented input
// layout (a b rc0 rc1, rc1 on top) that passes e.g. rc0=7, rc1=3 in round 1.
// Confirm this is the intended pairing of constants to state elements.
std_crypto_poseidon__hash2:
// Round 1
push 7
push 3
call std_crypto_poseidon__round2
// Round 2
push 13
push 11
call std_crypto_poseidon__round2
// Round 3
push 19
push 17
call std_crypto_poseidon__round2
// Round 4
push 29
push 23
call std_crypto_poseidon__round2
// Stack: new_a new_b
// Return first element: drop new_b, leaving new_a as the result.
pop 1
return
// hash4: Poseidon hash of 4 field elements -> 1 field element
// Input:  a b c d (d on top)
// Output: result (the first element of the state after the last round)
// 4 rounds with constants (3,7,11,13), (17,19,23,29), (31,37,41,43), (47,53,59,61)
// Each round pushes its four constants and calls round4, which consumes them
// and leaves the updated 4-element state on the stack.
// NOTE(review): each tuple is pushed last-listed constant first, so the
// first-listed constant ends up on top. Under round4's documented input
// layout (a b c d rc0 rc1 rc2 rc3, rc3 on top) that passes e.g.
// rc0=13, rc1=11, rc2=7, rc3=3 in round 1. Confirm this is the intended
// pairing of constants to state elements.
std_crypto_poseidon__hash4:
// Round 1
push 13
push 11
push 7
push 3
call std_crypto_poseidon__round4
// Round 2
push 29
push 23
push 19
push 17
call std_crypto_poseidon__round4
// Round 3
push 43
push 41
push 37
push 31
call std_crypto_poseidon__round4
// Round 4
push 61
push 59
push 53
push 47
call std_crypto_poseidon__round4
// Stack: na nb nc nd
// Return first element: drop nd, nc and nb, leaving na as the result.
pop 1
pop 1
pop 1
return
// hash1: hash a single field element (domain tag = 1)
// Input:  a
// Output: result = hash2(a, 1)
std_crypto_poseidon__hash1:
// Push the constant 1 as hash2's second input.
push 1
// Stack: a 1
call std_crypto_poseidon__hash2
return
// hash3: hash 3 field elements via chained hash2
// Input:  a b c (c on top)
// Output: result = hash2(hash2(a, b), c)
std_crypto_poseidon__hash3:
// Move c below a and b so the first hash2 call sees (a, b) on top.
swap 2
// Stack: c b a
swap 1
// Stack: c a b
call std_crypto_poseidon__hash2
// Stack: c h   where h = hash2(a, b)
// Reorder so h is the first input and c the second.
swap 1
// Stack: h c
call std_crypto_poseidon__hash2
return