// Hand-optimized TASM baseline: std.trinity.inference
//
// Rosetta Stone unification: one lookup table, four readers.
// Real LWE encryption over Goldilocks, lookup-table ReLU activation,
// LUT sponge hash (S-box from LUT), Poseidon2 hash commitment,
// PBS demo (test polynomial from LUT), 2-qubit Bell pair commitment.
//
// Reader 1: lut.apply    in __dense_layer    — neural activation
// Reader 2: lut.read     in __lut_sponge     — crypto S-box
// Reader 3: lut.read     in __pbs_build_test — FHE test polynomial
// Reader 4: STARK LogUp                      — proof authentication (upstream)
//
// Pitch parameters: LWE dim 8, 8 inputs, 16 neurons,
// ring dim 64, domain 1024, Bell commitment.
//
// Stack convention:
//   Arguments pushed left-to-right (first arg deepest on stack).
//   Return values left on top of stack after return.
//
// Instruction count rules:
//   - Comments (// ...) are NOT counted
//   - Labels (ending with :) are NOT counted
//   - halt is NOT counted
//   - Blank lines are NOT counted
//   - Everything else IS counted (including return)
//
// Static instruction count summary:
//   __decrypt_loop     :  24
//   __dense_layer      :  17
//   __sum_loop         :  13
//   __hash_commit      :  15
//   __lut_hash_commit  :  17
//   __quantum_commit   :   3
//   __trinity          :  78
//   ----------------------------------------
//   Total              : 167


// ===========================================================================
// PHASE 1b: DECRYPT OUTPUTS (loop)
// ===========================================================================


// Decrypt loop: counts neurons down to 0.
// Each iteration decrements the counter to i, decrypts the ciphertext at
// ct_out_addr + i * (lwe_n + 1) and stores the plaintext at result_addr + i.
//
// Stack on entry and at every `recurse` (st0 first):
// Stack: [counter, lwe_n, delta, result_addr, s_addr, ct_out_addr]
// On return st0 holds 0 and the six words are still on the stack; the caller
// pops them. This relies on __lwe_decrypt consuming the four words pushed for
// it and leaving exactly one result word (see the stack comments below).
//
// 24 counted instructions.
__decrypt_loop:
    // Exit test: return once counter == 0 (skiz skips `return` otherwise).
    dup 0
    push 0
    eq
    skiz
    return
    // i = counter - 1
    push -1
    add
    // Stack: [i, lwe_n, delta, result_addr, s_addr, ct_out_addr]
    // ct_addr = ct_out_addr + i * (lwe_n + 1)
    dup 0
    dup 2
    push 1
    add
    mul
    dup 6
    add
    // Stack: [ct_addr, i, lwe_n, delta, result_addr, s_addr, ct_out_addr]
    // lwe.decrypt(ct_addr, s_addr, delta, lwe_n) -> m
    // ct_addr is already on top; push s_addr, delta, lwe_n after it.
    dup 5
    dup 4
    dup 4
    call __lwe_decrypt
    // Stack: [m, i, lwe_n, delta, result_addr, s_addr, ct_out_addr]
    // mem.write(result_addr + i, m)
    dup 1
    dup 5
    add
    // Store m at the computed address, then drop the word write_mem leaves.
    write_mem 1
    pop 1
    // Stack: [i, lwe_n, delta, result_addr, s_addr, ct_out_addr]
    recurse


// ===========================================================================
// PHASE 2: DENSE NEURAL LAYER (with lookup-table activation)
// ===========================================================================


// ---------------------------------------------------------------------------
// __dense_layer: (w_addr, x_addr, b_addr, out_addr, tmp_addr, lut_addr, neurons)
// ---------------------------------------------------------------------------
// Dense layer: out = lut_relu(W * x + b).  [Reader 1: lut.apply]
//
// Stack: [w, x, b, out, tmp, lut, neurons]
//
// Makes three calls in sequence and returns: __tensor_matvec (5 words
// pushed), __tensor_bias_add (4 words pushed), __lut_apply (4 words pushed).
// Nothing is popped in this block, so the seven arguments are still on the
// stack at `return` (__trinity issues `pop 7` after the call). This assumes
// each callee consumes exactly the words pushed for it and leaves none.
//
// NOTE(review): under that assumption the dups below reach entry slots
// st0, st1, st3, st4, st5 and st6 only. Slot st2 is never duplicated, so the
// argument stored there (b or tmp, depending on which end of the Stack list
// is st0) is never forwarded to any callee. Confirm the depths against the
// signatures of the three callees, which are not visible in this file.
//
// 17 counted instructions.
__dense_layer:
    // Five words for __tensor_matvec.
    dup 6
    dup 7
    dup 7
    dup 4
    dup 4
    call __tensor_matvec
    // Four words for __tensor_bias_add.
    dup 6
    dup 4
    dup 5
    dup 7
    call __tensor_bias_add
    // Four words for __lut_apply; the second and third are the same entry slot.
    dup 6
    dup 4
    dup 5
    dup 8
    call __lut_apply
    return


// ===========================================================================
// PHASE 3a: LUT SPONGE HASH COMMITMENT (Rosetta Stone Reader #2)
// ===========================================================================


// ---------------------------------------------------------------------------
// __sum_loop: (addr, counter, accumulator) -> (addr_end, 0, sum)
// ---------------------------------------------------------------------------
// Recursive loop that returns as soon as the word at st1 equals 0.
// Intended effect, per the signature: read `counter` words from memory
// starting at `addr` and add them into the accumulator.
//
// NOTE(review): verify against the target VM's read_mem definition. If
// `read_mem 1` takes its pointer from st0 and leaves [ptr', value, ...]
// (Triton VM behaviour), then:
//   * addr has to be at st0 on entry, yet both callers in this file push the
//     accumulator (`push 0`) last;
//   * after `swap 2` the pointer is back on top, so `push -1; add` adjusts
//     the pointer and the counter at st1 is never decremented;
//   * the final `swap 1` exchanges pointer and counter, so the next
//     iteration's exit test inspects the pointer.
// A corrected loop appears to need one more instruction (or `addi`), which
// would change the count recorded in the file header; left as-is pending
// confirmation.
//
// 13 counted instructions.
__sum_loop:
    // Exit test on st1.
    dup 1
    push 0
    eq
    skiz
    return
    // Read one word, then fold it into the word three slots down.
    read_mem 1
    swap 3
    add
    // Rotate back and apply the -1 step (see NOTE above for which word it hits).
    swap 2
    push -1
    add
    swap 1
    recurse


// ---------------------------------------------------------------------------
// __lut_hash_commit: (activated, neurons, w_dig, key_dig, class, lut_addr, domain, sponge_rc) -> digest
// ---------------------------------------------------------------------------
// LUT sponge hash: S-box reads from lut_addr (Reader #2).
// Computes output_digest = sum(activated, neurons), then returns
//   lut_sponge.hash4_to_digest(w_dig, key_dig, output_digest, class,
//                              lut_addr, domain, sponge_rc).
//
// Stack on entry (st0 first):
// Stack: [activated, neurons, w_dig, key_dig, class, lut_addr, domain, sponge_rc]
//   st0: activated      st4: class
//   st1: neurons        st5: lut_addr
//   st2: w_dig          st6: domain
//   st3: key_dig        st7: sponge_rc
// Stack on return: [digest]. All eight arguments and the intermediate
// output_digest are removed.
//
// Assumes __sum_loop leaves three words of which the deepest is the sum, and
// that __lut_sponge_hash4_to_digest consumes its seven words and leaves one.
//
// NOTE(review): this layout puts the first argument on top, whereas the
// file-level convention (and the call site in __trinity) pushes the first
// argument deepest. Confirm which side is authoritative.
//
// 17 counted instructions.
__lut_hash_commit:
    // --- sum(activated, neurons) -> output_digest ---
    dup 0
    dup 2
    push 0
    call __sum_loop
    pop 2
    // Stack: [output_digest, activated, neurons, w_dig, key_dig, class, lut_addr, domain, sponge_rc]
    // --- lut_sponge.hash4_to_digest(w_dig, key_dig, output_digest, class, lut_addr, domain, sponge_rc) ---
    // Depth = slot in the stack line above + words already pushed for this call.
    // w_dig (3 + 0), key_dig (4 + 1)
    dup 3
    dup 5
    // output_digest (0 + 2); depth 3 here would re-push `activated`
    dup 2
    // class (5 + 3), lut_addr (6 + 4), domain (7 + 5), sponge_rc (8 + 6)
    dup 8
    dup 10
    dup 12
    dup 14
    call __lut_sponge_hash4_to_digest
    // Stack: [digest, output_digest, activated, neurons, w_dig, key_dig,
    //         class, lut_addr, domain, sponge_rc]  (10 words)
    // Park the digest in the deepest slot, then drop the nine words above it.
    swap 9
    pop 5
    pop 4
    return


// ===========================================================================
// PHASE 3b: POSEIDON2 HASH COMMITMENT (production binding)
// ===========================================================================


// ---------------------------------------------------------------------------
// __hash_commit: (activated_addr, neurons, weights_digest, key_digest, class, rc_addr) -> digest
// ---------------------------------------------------------------------------
// Poseidon2 binding: computes output_digest = sum(activated_addr, neurons),
// then returns
//   poseidon2.hash4_to_digest(weights_digest, key_digest, output_digest,
//                             class, rc_addr).
//
// Stack on entry (st0 first), as implied by the dup depths below:
//   [activated_addr, neurons, weights_digest, key_digest, class, rc_addr]
// Stack on return: [digest]. The six arguments and the intermediate
// output_digest are removed.
//
// Assumes __sum_loop leaves three words of which the deepest is the sum, and
// that __poseidon2_hash4_to_digest consumes its five words and leaves one.
//
// NOTE(review): this layout puts the first argument on top, whereas the
// file-level convention (and the call site in __trinity) pushes the first
// argument deepest. Confirm which side is authoritative.
//
// 15 counted instructions.
__hash_commit:
    // --- sum(activated_addr, neurons) -> output_digest ---
    dup 0
    dup 2
    push 0
    call __sum_loop
    pop 2
    // Stack: [output_digest, activated_addr, neurons, weights_digest, key_digest, class, rc_addr]
    // Depth = slot in the stack line above + words already pushed for this call.
    // weights_digest (3 + 0), key_digest (4 + 1)
    dup 3
    dup 5
    // output_digest (0 + 2); depth 3 here would re-push `activated_addr`
    // and leave the freshly computed sum unused
    dup 2
    // class (5 + 3), rc_addr (6 + 4)
    dup 8
    dup 10
    call __poseidon2_hash4_to_digest
    // Stack: [digest, output_digest, activated_addr, neurons, weights_digest,
    //         key_digest, class, rc_addr]  (8 words)
    // Park the digest in the deepest slot, then drop the seven words above it.
    swap 7
    pop 5
    pop 2
    return


// ===========================================================================
// PHASE 5: QUANTUM COMMITMENT (2-qubit Bell pair)
// ===========================================================================


// ---------------------------------------------------------------------------
// __quantum_commit: (class: Field) -> Bool
// ---------------------------------------------------------------------------
// Replaces `class` on top of the stack with the result of `class == 0`.
// This block contains only that comparison; nothing else is computed here.
//
// 3 counted instructions.
__quantum_commit:
    // class == 0 ?
    push 0
    eq
    return


// ===========================================================================
// FULL TRINITY PIPELINE — ROSETTA STONE UNIFICATION
// ===========================================================================


// ---------------------------------------------------------------------------
// __trinity: (cts_addr, s_addr, w_priv_addr, ct_out_addr, tmp_addr,
//             result_addr, delta, lwe_n, input_dim, neurons,
//             dense_w_addr, dense_b_addr, activated_addr,
//             lut_addr, expected_class,
//             rc_addr, weights_digest, key_digest, expected_digest,
//             domain, sponge_rc_addr, expected_lut_digest,
//             pbs_sample_ct, pbs_out_addr, ring_n,
//             pbs_acc_addr, pbs_test_addr, pbs_tmp_addr,
//             pbs_expected_m) -> Bool
// ---------------------------------------------------------------------------
// One table (lut_addr), four readers (1-3 demonstrated, 4 upstream):
//   Phase 2:  lut.apply (Reader 1 — NN activation)
//   Phase 3a: lut.read  (Reader 2 — crypto S-box)
//   Phase 4:  lut.read  (Reader 3 — FHE test polynomial)
//
// Stack on entry (st0 = top, 29 args):
//   st0:  pbs_expected_m
//   st1:  pbs_tmp_addr
//   st2:  pbs_test_addr
//   st3:  pbs_acc_addr
//   st4:  ring_n
//   st5:  pbs_out_addr
//   st6:  pbs_sample_ct
//   st7:  expected_lut_digest
//   st8:  sponge_rc_addr
//   st9:  domain
//   st10: expected_digest
//   st11: key_digest
//   st12: weights_digest
//   st13: rc_addr
//   st14: expected_class
//   st15: lut_addr
//   st16: activated_addr
//   st17: dense_b_addr
//   st18: dense_w_addr
//   st19: neurons
//   st20: input_dim
//   st21: lwe_n
//   st22: delta
//   st23: result_addr
//   st24: tmp_addr
//   st25: ct_out_addr
//   st26: w_priv_addr
//   st27: s_addr
//   st28: cts_addr
//
// Stack on return: [Bool]. All 29 arguments are removed. The three
// intermediate `assert`s abort execution on a class or digest mismatch.
//
// Depth bookkeeping: a value documented at entry slot stK is fetched with
// `dup K + p`, where p is the number of words currently sitting above the
// entry frame (earlier pushes for the same call, plus `class` from the argmax
// onwards). Every callee is assumed to consume exactly the words pushed for
// it; the two local helpers that keep their frame (__decrypt_loop,
// __dense_layer) are followed by an explicit pop.
//
// NOTE(review): arguments are pushed left-to-right (first argument deepest),
// as the file-level stack convention states. The local helpers
// __decrypt_loop, __lut_hash_commit and __hash_commit index their frames with
// the first listed word at st0. Confirm which side is authoritative before
// executing this baseline.
// NOTE(review): several depths exceed 15; Triton VM's `dup`/`swap` address
// st0..st15 only, so this listing is presumably a counting baseline rather
// than directly executable code. Confirm.
//
// 78 counted instructions.
__trinity:
    // --- Phase 1: lwe.private_linear(cts, w_priv, ct_out, tmp, lwe_n, input_dim, neurons) ---
    // cts 28+0, w_priv 26+1, ct_out 25+2, tmp 24+3, lwe_n 21+4, input_dim 20+5, neurons 19+6
    dup 28
    dup 27
    dup 27
    dup 27
    dup 25
    dup 25
    dup 25
    call __lwe_private_linear
    // --- Phase 1b: decrypt_outputs ---
    // __decrypt_loop: [neurons, lwe_n, delta, result_addr, s_addr, ct_out_addr]
    // neurons 19+0, lwe_n 21+1, delta 22+2, result_addr 23+3, s_addr 27+4, ct_out_addr 25+5
    dup 19
    dup 22
    dup 24
    dup 26
    dup 31
    dup 30
    call __decrypt_loop
    // The loop leaves its six-word frame on the stack.
    pop 6
    // --- Phase 2: dense_layer(dense_w, result, dense_b, activated, tmp, lut, neurons) ---
    // [Reader 1: lut.apply]
    // dense_w 18+0, result 23+1, dense_b 17+2, activated 16+3, tmp 24+4, lut 15+5, neurons 19+6
    dup 18
    dup 24
    dup 19
    dup 19
    dup 28
    dup 20
    dup 25
    call __dense_layer
    // __dense_layer does not pop its seven arguments.
    pop 7
    // --- argmax(activated_addr, neurons) + assert class == expected ---
    // activated 16+0, neurons 19+1
    dup 16
    dup 20
    call __tensor_argmax
    // Stack: [class, pbs_exp_m, pbs_tmp, pbs_test, pbs_acc, ring_n, pbs_out, pbs_ct, exp_lut_dig, sponge_rc, domain, exp_dig, key_dig, w_dig, rc, exp_class, lut, activated, ...]
    // From here until Phase 5 `class` stays on top: entry slot stK is at K+1.
    // Compare a copy of class with expected_class (14+1, +1 for the copy).
    dup 0
    dup 16
    eq
    assert
    // --- Phase 3a: lut_hash_commit(activated, neurons, w_dig, key_dig, class, lut, domain, sponge_rc) ---
    // [Reader 2: lut.read in S-box]
    // activated 17+0, neurons 20+1, w_dig 13+2, key_dig 12+3,
    // class 0+4, lut 16+5, domain 10+6, sponge_rc 9+7
    dup 17
    dup 21
    dup 15
    dup 15
    dup 4
    dup 21
    dup 16
    dup 16
    call __lut_hash_commit
    // Stack: [lut_digest, class, ...]
    // assert lut_digest == expected_lut_digest  (8+1 below the digest)
    dup 9
    eq
    assert
    // --- Phase 3b: hash_commit(activated, neurons, w_dig, key_dig, class, rc_addr) ---
    // activated 17+0, neurons 20+1, w_dig 13+2, key_dig 12+3, class 0+4, rc_addr 14+5
    dup 17
    dup 21
    dup 15
    dup 15
    dup 4
    dup 19
    call __hash_commit
    // Stack: [digest, class, ...]
    // assert digest == expected_digest  (11+1 below the digest)
    dup 12
    eq
    assert
    // --- Phase 4: pbs_demo ---
    // [Reader 3: lut.read in build_test_poly]
    // pbs.bootstrap(ct, s, lut, out, delta, lwe_n, ring_n, domain, acc, test, tmp) -> m
    // ct/out/acc/test/tmp are taken to be the pbs_* entry arguments of the
    // same name (pbs_sample_ct, pbs_out_addr, pbs_acc_addr, pbs_test_addr,
    // pbs_tmp_addr).
    // ct 7+0, s 28+1, lut 16+2, out 6+3, delta 23+4, lwe_n 22+5,
    // ring_n 5+6, domain 10+7, acc 4+8, test 3+9, tmp 2+10
    dup 7
    dup 29
    dup 18
    dup 9
    dup 27
    dup 27
    dup 11
    dup 17
    dup 12
    dup 12
    dup 12
    call __pbs_bootstrap
    // Stack: [m, class, pbs_exp_m, ...]
    // assert m == expected_m  (pbs_expected_m is two below the top)
    dup 2
    eq
    assert
    // --- Phase 5: quantum_commit(class) ---
    // class is still on top of the stack from the argmax; the call replaces
    // it with the Bool result.
    call __quantum_commit
    // Cleanup: stack is [result, 29 args]. Move the result into the deepest
    // argument slot, then drop the 29 words above it.
    swap 29
    pop 5
    pop 5
    pop 5
    pop 5
    pop 5
    pop 4
    return

Neighbours