// Hand-optimized TASM baseline: std.crypto.poseidon2
//
// Poseidon2 permutation over the Goldilocks field (p = 2^64 - 2^32 + 1).
// - State width t = 8, rate = 4, capacity = 4
// - S-box: x^7
// - 8 full rounds (4 initial + 4 final), 22 partial rounds
// - External linear layer: circulant(2,1,1,...,1)
// new[i] = state[i] + sum(state)
// - Internal linear layer: diag(2,3,5,9,17,33,65,129) + ones_matrix
// new[i] = d_i * state[i] + sum(state)
// - 86 round constants divined from nondeterministic input
//
// Stack convention:
// State = 8 field elements: s0 on top, s7 deepest.
// [s0, s1, s2, s3, s4, s5, s6, s7] (s0 = top = index 0)
// Rate = s0..s3, capacity = s4..s7.
//
// Round constants are divined per-round (divine 1 per constant).
// The nondeterministic input stream supplies constants in round order:
// 4 full (8 each) + 22 partial (1 each) + 4 full (8 each) = 86 total.
//
// Optimization vs compiler output:
// - sbox (x^7) in 8 ops as subroutine (compiler: function call overhead
// per invocation + redundant stack copies + struct packing)
// - Linear layers use in-place swap-chain to distribute sum (compiler:
// per-field struct reconstruction with full state copies)
// - full_round and partial_round as subroutines: body counted once in
// static instruction count, called 8x and 22x respectively
// - divine-and-add per element avoids stack gymnastics for 86 constants
// - Entire state on operational stack (no memory access)
//
// Instruction count rules:
// - Comments (// ...) are NOT counted
// - Labels (ending with :) are NOT counted
// - halt is NOT counted
// - Blank lines are NOT counted
// - Everything else IS counted (including return)
//
// Static instruction count summary:
// Entry point (hash2 demo) : 13
// __sbox : 9
// __external_linear : 46
// __internal_linear : 62
// __full_round : 54
// __partial_round : 5
// __permute : 31
// __zero_state : 9
// __absorb1 : 2
// __absorb2 : 7
// __absorb4 : 17
// __squeeze1 : 2
// __squeeze4 : 5
// __hash1 : 13
// __hash2 : 15
// __hash4 : 21
// ----------------------------------------
// Total : 311
// ===========================================================================
// ENTRY POINT: hash2(a, b) -> Field
// ===========================================================================
// Reads 2 field elements from public input, computes Poseidon2 hash with
// domain tag 2, writes 1 field element output.
// Nondeterministic input: 86 round constants.
//
// State: s0=a, s1=b, s2..s3=0, s4=2(domain), s5..s7=0.
// Build state bottom-up: push s7..s2 first, read inputs last.
// 13 counted instructions (halt excluded).
push 0
push 0
push 0
push 2
push 0
push 0
read_io 1
read_io 1
swap 1
// Stack: [s0=a, s1=b, s2=0, s3=0, s4=2, s5=0, s6=0, s7=0]
call __permute
// Stack: [s0', s1', s2', s3', s4', s5', s6', s7']
write_io 1
pop 5
pop 2
halt
// ===========================================================================
// CORE SUBROUTINES
// ===========================================================================
// ---------------------------------------------------------------------------
// __sbox: x -> x^7
// ---------------------------------------------------------------------------
// S-box for Poseidon2: x^7 is a permutation over Goldilocks since
// gcd(7, p-1) = 1 for p = 2^64 - 2^32 + 1.
//
// Decomposition: x^2, x^3 = x^2*x, x^6 = x^3^2, x^7 = x^6*x.
//
// Stack trace:
// [x] -> dup 0 -> [x, x] -> dup 0 -> [x, x, x]
// -> mul -> [x, x^2] -> dup 1 -> [x, x^2, x]
// -> mul -> [x, x^3] -> dup 0 -> [x, x^3, x^3]
// -> mul -> [x, x^6] -> mul -> [x^7]
//
// 9 counted instructions.
__sbox:
dup 0
dup 0
mul
dup 1
mul
dup 0
mul
mul
return
// ---------------------------------------------------------------------------
// __external_linear: state -> state
// ---------------------------------------------------------------------------
// Circulant(2,1,1,...,1): new[i] = state[i] + sum(state).
//
// Phase 1 (15 ops): Compute SUM on top via dup-chain.
// Phase 2 (30 ops): Add SUM to each element via swap-chain.
// For s_i (i=7 down to 1): dup 0, swap(d+1), add, swap(d)
// where d = 9-element depth of s_i.
// For s_0 (last): swap 1, add (consumes SUM).
//
// 46 counted instructions.
__external_linear:
// --- Phase 1: compute SUM ---
dup 0
dup 2
add
dup 3
add
dup 4
add
dup 5
add
dup 6
add
dup 7
add
dup 8
add
// Stack: [SUM, s0, s1, s2, s3, s4, s5, s6, s7]
// --- Phase 2: SUM + s_i for each element ---
// s7 (depth 8)
dup 0
swap 9
add
swap 8
// s6 (depth 7)
dup 0
swap 8
add
swap 7
// s5 (depth 6)
dup 0
swap 7
add
swap 6
// s4 (depth 5)
dup 0
swap 6
add
swap 5
// s3 (depth 4)
dup 0
swap 5
add
swap 4
// s2 (depth 3)
dup 0
swap 4
add
swap 3
// s1 (depth 2)
dup 0
swap 3
add
swap 2
// s0 (depth 1, last: consume SUM)
swap 1
add
return
// ---------------------------------------------------------------------------
// __internal_linear: state -> state
// ---------------------------------------------------------------------------
// Diagonal matrix diag(2,3,5,9,17,33,65,129) + ones_matrix:
// new[i] = d_i * state[i] + sum(state)
// where d_i = 1 + 2^i: [2, 3, 5, 9, 17, 33, 65, 129].
//
// Phase 1 (15 ops): Compute SUM (same dup-chain).
// Phase 2 (46 ops): For each element, multiply by d_i, add SUM.
// For s_i (i=7 down to 1): dup 0, swap(d+1), push d_i, mul, add, swap(d)
// For s_0 (last, d_0=2): swap 1, push 2, mul, add.
//
// 62 counted instructions.
__internal_linear:
// --- Phase 1: compute SUM ---
dup 0
dup 2
add
dup 3
add
dup 4
add
dup 5
add
dup 6
add
dup 7
add
dup 8
add
// Stack: [SUM, s0, s1, s2, s3, s4, s5, s6, s7]
// --- Phase 2: d_i * s_i + SUM ---
// s7: d_7 = 129
dup 0
swap 9
push 129
mul
add
swap 8
// s6: d_6 = 65
dup 0
swap 8
push 65
mul
add
swap 7
// s5: d_5 = 33
dup 0
swap 7
push 33
mul
add
swap 6
// s4: d_4 = 17
dup 0
swap 6
push 17
mul
add
swap 5
// s3: d_3 = 9
dup 0
swap 5
push 9
mul
add
swap 4
// s2: d_2 = 5
dup 0
swap 4
push 5
mul
add
swap 3
// s1: d_1 = 3
dup 0
swap 3
push 3
mul
add
swap 2
// s0: d_0 = 2 (last, consume SUM)
swap 1
push 2
mul
add
return
// ---------------------------------------------------------------------------
// __full_round: state -> state (divines 8 round constants)
// ---------------------------------------------------------------------------
// Full round: add 8 round constants, apply sbox to all 8, external linear.
//
// Phase 1 (30 ops): divine and add constants.
// s0: divine 1, add (2 ops).
// s_i (i=1..7): swap i, divine 1, add, swap i (4 ops each).
//
// Phase 2 (22 ops): apply sbox to all 8.
// s0: call __sbox (1 op).
// s_i (i=1..7): swap i, call __sbox, swap i (3 ops each).
//
// Phase 3 (1 op): call __external_linear.
//
// 54 counted instructions.
__full_round:
// --- Phase 1: add round constants ---
divine 1
add
swap 1
divine 1
add
swap 1
swap 2
divine 1
add
swap 2
swap 3
divine 1
add
swap 3
swap 4
divine 1
add
swap 4
swap 5
divine 1
add
swap 5
swap 6
divine 1
add
swap 6
swap 7
divine 1
add
swap 7
// --- Phase 2: sbox all 8 ---
call __sbox
swap 1
call __sbox
swap 1
swap 2
call __sbox
swap 2
swap 3
call __sbox
swap 3
swap 4
call __sbox
swap 4
swap 5
call __sbox
swap 5
swap 6
call __sbox
swap 6
swap 7
call __sbox
swap 7
// --- Phase 3: linear layer ---
call __external_linear
return
// ---------------------------------------------------------------------------
// __partial_round: state -> state (divines 1 round constant)
// ---------------------------------------------------------------------------
// Partial round: add constant to s0, sbox s0, internal linear.
//
// 5 counted instructions.
__partial_round:
divine 1
add
call __sbox
call __internal_linear
return
// ---------------------------------------------------------------------------
// __permute: state -> state (divines 86 round constants)
// ---------------------------------------------------------------------------
// Full Poseidon2 permutation: 4 full + 22 partial + 4 full rounds.
//
// 31 counted instructions.
__permute:
// 4 initial full rounds (32 constants)
call __full_round
call __full_round
call __full_round
call __full_round
// 22 partial rounds (22 constants)
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
call __partial_round
// 4 final full rounds (32 constants)
call __full_round
call __full_round
call __full_round
call __full_round
return
// ===========================================================================
// SPONGE API
// ===========================================================================
// ---------------------------------------------------------------------------
// __zero_state: -> state
// ---------------------------------------------------------------------------
// Pushes 8 zeros onto the stack.
// 9 counted instructions.
__zero_state:
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
return
// ---------------------------------------------------------------------------
// __absorb1: (a, state) -> state
// ---------------------------------------------------------------------------
// Adds 1 field element to s0. Caller permutes after.
// Stack in: [a, s0, s1, s2, s3, s4, s5, s6, s7]
// Stack out: [s0+a, s1, s2, s3, s4, s5, s6, s7]
//
// 2 counted instructions.
__absorb1:
add
return
// ---------------------------------------------------------------------------
// __absorb2: (b, a, state) -> state
// ---------------------------------------------------------------------------
// Adds a to s0, b to s1.
// Stack in: [b, a, s0, s1, s2, s3, s4, s5, s6, s7]
// Stack out: [s0+a, s1+b, s2, s3, s4, s5, s6, s7]
//
// Trace:
// [b, a, s0, s1, ...]
// swap 3 -> [s1, a, s0, b, ...] bring s1 to top
// swap 1 -> [a, s1, s0, b, ...] bring a to top
// swap 3 -> [b, s1, s0, a, ...] bring b to top
// add -> [s1+b, s0, a, ...] s1 done
// swap 2 -> [a, s0, s1', ...] bring a adjacent to s0
// add -> [s0+a, s1', s2, ...] s0 done
//
// 7 counted instructions.
__absorb2:
swap 3
swap 1
swap 3
add
swap 2
add
return
// ---------------------------------------------------------------------------
// __absorb4: (d, c, b, a, state) -> state
// ---------------------------------------------------------------------------
// Adds a to s0, b to s1, c to s2, d to s3.
// Stack in: [d, c, b, a, s0, s1, s2, s3, s4, s5, s6, s7]
// Stack out: [s0+a, s1+b, s2+c, s3+d, s4, s5, s6, s7]
//
// Process from deepest target (s3) to shallowest (s0).
// Each step: swap-chain to pair input and state element, add, swap result
// into position.
//
// 17 counted instructions.
__absorb4:
// s3 += d: d@0, s3@7 -> pair via swap-chain
swap 7
swap 1
swap 7
add
swap 6
// [c, b, a, s0, s1, s2, s3', s4, s5, s6, s7]
// s2 += c: c@0, s2@5
swap 5
swap 1
swap 5
add
swap 4
// [b, a, s0, s1, s2', s3', s4, s5, s6, s7]
// s1 += b: b@0, s1@3
swap 3
swap 1
swap 3
add
swap 2
// [a, s0, s1', s2', s3', s4, s5, s6, s7]
// s0 += a: a@0, s0@1 (adjacent)
add
// [s0+a, s1', s2', s3', s4, s5, s6, s7]
return
// ---------------------------------------------------------------------------
// __squeeze1: state -> (s0_copy, state)
// ---------------------------------------------------------------------------
// Non-destructive: copies s0 to top, state remains below.
// 2 counted instructions.
__squeeze1:
dup 0
return
// ---------------------------------------------------------------------------
// __squeeze4: state -> (s3, s2, s1, s0, state)
// ---------------------------------------------------------------------------
// Non-destructive: copies rate elements to top, state remains below.
// Return order matches tuple convention (first element deepest).
// 5 counted instructions.
__squeeze4:
dup 0
dup 2
dup 4
dup 6
return
// ===========================================================================
// HASH FUNCTIONS
// ===========================================================================
// ---------------------------------------------------------------------------
// __hash1: (input) -> hash (divines 86 constants)
// ---------------------------------------------------------------------------
// Hash 1 field element to 1 field element.
// Domain tag s4 = 1.
// Stack in: [input]
// Stack out: [hash]
//
// Build state: push s6..s1 as padding (s4=1 domain tag among them),
// then push 0 on top and swap with input at bottom to place input as s0.
//
// Pushes (1st..7th): s6=0, s5=0, s4=1, s3=0, s2=0, s1=0, swap_target=0
// These occupy indices 6,5,4,3,2,1,0 after pushing (input at idx 7).
// swap 7 brings input to idx 0 (s0), pushes 0 to idx 7 (s7).
//
// 13 counted instructions.
__hash1:
push 0
push 0
push 1
push 0
push 0
push 0
push 0
swap 7
// Stack: [input, 0, 0, 0, 1, 0, 0, 0]
call __permute
// Stack: [s0', s1', s2', s3', s4', s5', s6', s7']
swap 7
pop 5
pop 2
// Stack: [s0']
return
// ---------------------------------------------------------------------------
// __hash2: (a, b) -> hash (divines 86 constants)
// ---------------------------------------------------------------------------
// Hash 2 field elements to 1 field element.
// Domain tag s4 = 2.
// Stack in: [a, b] (a on top)
// Stack out: [hash]
//
// Build state: push s6..s2 padding, swap to place a at s0 and b at s1.
//
// After 6 pushes on [a, b]:
// indices: 0=p6, 1=p5, 2=p4, 3=p3, 4=p2, 5=p1, 6=a, 7=b
// Push order: s6=0, s5=0, s4=2, s3=0, s2=0, s1_swap=0
// Then: swap 6 -> a to idx 0; swap 1,swap 7,swap 1 -> b to idx 1.
//
// 15 counted instructions.
__hash2:
push 0
push 2
push 0
push 0
push 0
push 0
// [0, 0, 0, 0, 2, 0, a, b]
swap 6
swap 1
swap 7
swap 1
// [a, b, 0, 0, 2, 0, 0, 0]
call __permute
// [s0', s1', s2', s3', s4', s5', s6', s7']
swap 7
pop 5
pop 2
return
// ---------------------------------------------------------------------------
// __hash4: (a, b, c, d) -> (s0, s1, s2, s3) (divines 86 constants)
// ---------------------------------------------------------------------------
// Hash 4 field elements to 4 field elements (full-rate I/O).
// Domain tag s4 = 4.
// Stack in: [a, b, c, d] (a on top)
// Stack out: [s0', s1', s2', s3']
//
// Build state: push s7=0,s6=0,s5=0,s4=4 on top, then block-swap the two
// halves to place inputs a,b,c,d at s0..s3.
//
// After 4 pushes: [4, 0, 0, 0, a, b, c, d]
// Block-swap via: swap 7,swap 3 moves d; swap 6,swap 2 moves c;
// swap 5,swap 1 moves b; swap 4 moves a.
//
// After permute, remove capacity s4..s7 by cycling them to top and popping.
//
// 21 counted instructions.
__hash4:
push 0
push 0
push 4
push 0
// [0, 4, 0, 0, a, b, c, d]
swap 7
swap 3
swap 6
swap 2
swap 5
swap 1
swap 4
// [a, b, c, d, 4, 0, 0, 0]
call __permute
// [s0', s1', s2', s3', s4', s5', s6', s7']
swap 4
pop 1
swap 4
pop 1
swap 4
pop 1
swap 4
pop 1
// [s0', s1', s2', s3']
return