// Hand-optimized TASM baseline: std.crypto.bigint
// 256-bit integer arithmetic over 8 U32 limbs (little-endian).
//
// U256 on stack (8 elements): [l7 l6 l5 l4 l3 l2 l1 l0]
// l7 = MSB (st0), l0 = LSB (st7)
//
// Calling convention: first arg deepest.
// fn(a, b): [b on top | a below]
//
// Memory layout (non-overlapping scratch regions):
// 0..7 slot A (store/load helpers, inline carry chains)
// 10..17 slot B (store/load helpers, inline carry chains)
// 20..27 accumulator (mul256_low)
// 30 scalar (mul256_by_u32, shl_limbs counter)
// 40..47 orig A (mul256_low, mod_reduce_once, sub_mod)
// 50..57 orig B (mul256_low, mod_reduce_once, sub_mod)
// 60..67 modulus (add_mod, sub_mod, mul_mod)
//
// Triton VM memory ops:
// write_mem N: st0=addr, st1..stN=vals -> writes st1..[addr] etc -> [addr+N]
// read_mem 1: st0=addr -> [mem[addr], addr-1]
// split: [x] -> st0=lo(32bit), st1=hi(32bit)
// skiz: pops st0; if nonzero, executes next instruction
//
// Counting: comments, labels (:), halt, blanks NOT counted.
// ===================================================================
// MEMORY HELPERS
// ===================================================================
// __store_a: store top U256 to mem[0..7]. 5 ops.
__store_a:
push 0
write_mem 5
write_mem 3
pop 1
return
// __store_b: store top U256 to mem[10..17]. 5 ops.
__store_b:
push 10
write_mem 5
write_mem 3
pop 1
return
// __load_a: load U256 from mem[0..7] as [l7..l0]. 25 ops.
// Reads addr 7 down to 0; each pushes on top in correct order.
__load_a:
push 7
read_mem 1
pop 1
push 6
read_mem 1
pop 1
push 5
read_mem 1
pop 1
push 4
read_mem 1
pop 1
push 3
read_mem 1
pop 1
push 2
read_mem 1
pop 1
push 1
read_mem 1
pop 1
push 0
read_mem 1
pop 1
return
// __load_b: load U256 from mem[10..17]. 25 ops.
__load_b:
push 17
read_mem 1
pop 1
push 16
read_mem 1
pop 1
push 15
read_mem 1
pop 1
push 14
read_mem 1
pop 1
push 13
read_mem 1
pop 1
push 12
read_mem 1
pop 1
push 11
read_mem 1
pop 1
push 10
read_mem 1
pop 1
return
// __store_orig_a: store top U256 to mem[40..47]. 5 ops.
__store_orig_a:
push 40
write_mem 5
write_mem 3
pop 1
return
// __load_orig_a: load U256 from mem[40..47]. 25 ops.
__load_orig_a:
push 47
read_mem 1
pop 1
push 46
read_mem 1
pop 1
push 45
read_mem 1
pop 1
push 44
read_mem 1
pop 1
push 43
read_mem 1
pop 1
push 42
read_mem 1
pop 1
push 41
read_mem 1
pop 1
push 40
read_mem 1
pop 1
return
// __store_orig_b: store top U256 to mem[50..57]. 5 ops.
__store_orig_b:
push 50
write_mem 5
write_mem 3
pop 1
return
// __load_orig_b: load U256 from mem[50..57]. 25 ops.
__load_orig_b:
push 57
read_mem 1
pop 1
push 56
read_mem 1
pop 1
push 55
read_mem 1
pop 1
push 54
read_mem 1
pop 1
push 53
read_mem 1
pop 1
push 52
read_mem 1
pop 1
push 51
read_mem 1
pop 1
push 50
read_mem 1
pop 1
return
// __load_mod: load U256 from mem[60..67]. 25 ops.
__load_mod:
push 67
read_mem 1
pop 1
push 66
read_mem 1
pop 1
push 65
read_mem 1
pop 1
push 64
read_mem 1
pop 1
push 63
read_mem 1
pop 1
push 62
read_mem 1
pop 1
push 61
read_mem 1
pop 1
push 60
read_mem 1
pop 1
return
// __load_accum: load U256 from mem[20..27]. 25 ops.
__load_accum:
push 27
read_mem 1
pop 1
push 26
read_mem 1
pop 1
push 25
read_mem 1
pop 1
push 24
read_mem 1
pop 1
push 23
read_mem 1
pop 1
push 22
read_mem 1
pop 1
push 21
read_mem 1
pop 1
push 20
read_mem 1
pop 1
return
// ===================================================================
// CONSTANTS
// ===================================================================
// zero256() -> U256. 9 ops.
__zero256:
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
return
// one256() -> U256. 9 ops.
__one256:
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 1
return
// from_u32(x: U32) -> U256. 9 ops.
// Entry: [x] Exit: [0 0 0 0 0 0 0 x]
__from_u32:
push 0
push 0
push 0
push 0
push 0
push 0
push 0
swap 7
return
// ===================================================================
// U32 HELPERS
// ===================================================================
// add_u32_carry(a, b, carry_in) -> (result, carry_out). 5 ops.
// Entry: [carry_in, b, a] Exit: [carry_out, result]
__add_u32_carry:
add
add
split
swap 1
return
// sub_u32_borrow(a, b, borrow_in) -> (result, borrow_out). 14 ops.
// Entry: [borrow_in, b, a] Exit: [borrow_out, result]
__sub_u32_borrow:
push -1
mul
swap 1
push -1
mul
add
add
push 4294967296
add
split
swap 1
push -1
mul
push 1
add
return
// ===================================================================
// 256-BIT ADDITION
// ===================================================================
// add256(a: U256, b: U256) -> (U256, U32 carry). 84 ops.
// Entry: [b.l7..b.l0, a.l7..a.l0] Exit: [carry, r.l7..r.l0]
// Stores both to mem, carry-chains limb 0 through 7.
__add256:
call __store_b
call __store_a
push 0
push 7
read_mem 1
pop 1
push 17
read_mem 1
pop 1
add
add
split
swap 1
push 6
read_mem 1
pop 1
push 16
read_mem 1
pop 1
add
add
split
swap 1
push 5
read_mem 1
pop 1
push 15
read_mem 1
pop 1
add
add
split
swap 1
push 4
read_mem 1
pop 1
push 14
read_mem 1
pop 1
add
add
split
swap 1
push 3
read_mem 1
pop 1
push 13
read_mem 1
pop 1
add
add
split
swap 1
push 2
read_mem 1
pop 1
push 12
read_mem 1
pop 1
add
add
split
swap 1
push 1
read_mem 1
pop 1
push 11
read_mem 1
pop 1
add
add
split
swap 1
push 0
read_mem 1
pop 1
push 10
read_mem 1
pop 1
add
add
split
swap 1
return
// ===================================================================
// 256-BIT SUBTRACTION
// ===================================================================
// sub256(a: U256, b: U256) -> U256. 116 ops.
// Entry: [b.l7..b.l0, a.l7..a.l0] Exit: [r.l7..r.l0]
// Uses inverted borrow: hi=1 means no borrow, hi=0 means borrow.
// Per limb: (2^32 - 1) + a_i - b_i + hi_prev; split -> [hi, result].
// Initial hi = 1 (no borrow). Final hi discarded.
__sub256:
call __store_b
call __store_a
push 1
// Limb 0: a.l0=mem[7], b.l0=mem[17]
push 17
read_mem 1
pop 1
push -1
mul
push 7
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
// Limb 1
push 16
read_mem 1
pop 1
push -1
mul
push 6
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
// Limb 2
push 15
read_mem 1
pop 1
push -1
mul
push 5
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
// Limb 3
push 14
read_mem 1
pop 1
push -1
mul
push 4
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
// Limb 4
push 13
read_mem 1
pop 1
push -1
mul
push 3
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
// Limb 5
push 12
read_mem 1
pop 1
push -1
mul
push 2
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
// Limb 6
push 11
read_mem 1
pop 1
push -1
mul
push 1
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
// Limb 7 (discard final hi)
push 10
read_mem 1
pop 1
push -1
mul
push 0
read_mem 1
pop 1
add
add
push 4294967295
add
split
swap 1
pop 1
return
// ===================================================================
// COMPARISON
// ===================================================================
// eq256(a: U256, b: U256) -> Bool. 59 ops.
// Entry: [b.l7..b.l0, a.l7..a.l0] Exit: [bool]
__eq256:
call __store_b
call __store_a
push 0
read_mem 1
pop 1
push 10
read_mem 1
pop 1
eq
push 1
read_mem 1
pop 1
push 11
read_mem 1
pop 1
eq
and
push 2
read_mem 1
pop 1
push 12
read_mem 1
pop 1
eq
and
push 3
read_mem 1
pop 1
push 13
read_mem 1
pop 1
eq
and
push 4
read_mem 1
pop 1
push 14
read_mem 1
pop 1
eq
and
push 5
read_mem 1
pop 1
push 15
read_mem 1
pop 1
eq
and
push 6
read_mem 1
pop 1
push 16
read_mem 1
pop 1
eq
and
push 7
read_mem 1
pop 1
push 17
read_mem 1
pop 1
eq
and
return
// lt256(a: U256, b: U256) -> Bool. 116 ops.
// Entry: [b.l7..b.l0, a.l7..a.l0] Exit: [bool (a < b)]
// Inverted borrow chain: hi=1 no borrow, hi=0 borrow.
// Per limb: (2^32-1) + a_i - b_i + hi; split; discard result, keep hi.
// Final: a < b iff final hi = 0, so result = 1 - hi.
__lt256:
call __store_b
call __store_a
push 1
// Limb 0: split gives [lo,hi]; pop lo to keep hi only.
push 17
read_mem 1
pop 1
push -1
mul
push 7
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
// Limb 1
push 16
read_mem 1
pop 1
push -1
mul
push 6
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
// Limb 2
push 15
read_mem 1
pop 1
push -1
mul
push 5
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
// Limb 3
push 14
read_mem 1
pop 1
push -1
mul
push 4
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
// Limb 4
push 13
read_mem 1
pop 1
push -1
mul
push 3
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
// Limb 5
push 12
read_mem 1
pop 1
push -1
mul
push 2
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
// Limb 6
push 11
read_mem 1
pop 1
push -1
mul
push 1
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
// Limb 7: split gives [lo,hi]; pop lo, keep hi.
// hi=1 means a>=b, hi=0 means a<b. Result = 1-hi.
push 10
read_mem 1
pop 1
push -1
mul
push 0
read_mem 1
pop 1
add
add
push 4294967295
add
split
pop 1
push -1
mul
push 1
add
return
// is_zero(a: U256) -> Bool. 10 ops.
// Entry: [l7..l0] Exit: [bool]
// Sum all 8 U32 limbs (max sum = 8*(2^32-1) << p, so sum=0 iff all zero).
__is_zero:
add
add
add
add
add
add
add
push 0
eq
return
// ===================================================================
// MULTIPLICATION HELPERS
// ===================================================================
// mul256_by_u32(a: U256, scalar: U32) -> (U256, U32 carry). 72 ops.
// Entry: [scalar, a.l7..a.l0] Exit: [carry, r.l7..r.l0]
// Stores scalar to mem[30], a to mem[0..7], processes with carry chain.
__mul256_by_u32:
push 30
swap 1
write_mem 1
pop 1
call __store_a
push 0
push 7
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
push 6
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
push 5
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
push 4
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
push 3
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
push 2
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
push 1
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
push 0
read_mem 1
pop 1
push 30
read_mem 1
pop 1
mul
add
split
swap 1
return
// shl_one_limb(a: U256) -> U256. 10 ops.
// Entry: [l7..l0] Exit: [l6 l5 l4 l3 l2 l1 l0 0]
__shl_one_limb:
pop 1
push 0
swap 1
swap 2
swap 3
swap 4
swap 5
swap 6
swap 7
return
// shl_limbs(a: U256, n: Field) -> U256
// Entry: [n, l7..l0] Exit: [r.l7..r.l0]
// Saves n to mem[30], loops calling shl_one_limb.
__shl_limbs:
push 30
swap 1
write_mem 1
pop 1
call __shl_loop
return
// __shl_loop: check counter, shift if nonzero, recurse.
// n is stored at mem[30]. Stack: [l7..l0].
__shl_loop:
push 30
read_mem 1
pop 1
// Stack: [n, l7..l0]
push 0
eq
// Stack: [n==0, l7..l0] -- n consumed by eq
skiz
return
// n > 0: reload n, decrement, save, shift, recurse
push 30
read_mem 1
pop 1
push -1
add
push 30
swap 1
write_mem 1
pop 1
call __shl_one_limb
recurse
// ===================================================================
// MUL256_LOW (SCHOOLBOOK)
// ===================================================================
// __mul_accum: inline-add partial product to accumulator in mem[20..27].
// Entry: [pp.l7..pp.l0] Exit: []
// Stores pp to mem[0..7], carry-chain adds with accum, writes back.
// Does NOT touch mem[40..57]. 108 ops.
__mul_accum:
push 0
write_mem 5
write_mem 3
pop 1
push 0
// Limb 0: pp.l0=mem[7], acc.l0=mem[27]
push 7
read_mem 1
pop 1
push 27
read_mem 1
pop 1
add
add
split
swap 1
swap 1
push 27
swap 1
write_mem 1
pop 1
// Limb 1
push 6
read_mem 1
pop 1
push 26
read_mem 1
pop 1
add
add
split
swap 1
swap 1
push 26
swap 1
write_mem 1
pop 1
// Limb 2
push 5
read_mem 1
pop 1
push 25
read_mem 1
pop 1
add
add
split
swap 1
swap 1
push 25
swap 1
write_mem 1
pop 1
// Limb 3
push 4
read_mem 1
pop 1
push 24
read_mem 1
pop 1
add
add
split
swap 1
swap 1
push 24
swap 1
write_mem 1
pop 1
// Limb 4
push 3
read_mem 1
pop 1
push 23
read_mem 1
pop 1
add
add
split
swap 1
swap 1
push 23
swap 1
write_mem 1
pop 1
// Limb 5
push 2
read_mem 1
pop 1
push 22
read_mem 1
pop 1
add
add
split
swap 1
swap 1
push 22
swap 1
write_mem 1
pop 1
// Limb 6
push 1
read_mem 1
pop 1
push 21
read_mem 1
pop 1
add
add
split
swap 1
swap 1
push 21
swap 1
write_mem 1
pop 1
// Limb 7 (discard carry)
push 0
read_mem 1
pop 1
push 20
read_mem 1
pop 1
add
add
split
pop 1
push 20
swap 1
write_mem 1
pop 1
return
// mul256_low(a: U256, b: U256) -> U256
// Entry: [b.l7..b.l0, a.l7..a.l0] Exit: [r.l7..r.l0]
// Stores originals to safe memory (40..57), zeroes accumulator (20..27),
// then for each b limb: load orig_a, mul by b_k, shift k, accum.
// mul256_by_u32 uses mem[0..7] and mem[30] as scratch -- safe.
// __mul_accum uses mem[0..7] as scratch -- safe.
__mul256_low:
call __store_orig_b
call __store_orig_a
// Zero accumulator
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 0
push 20
write_mem 5
write_mem 3
pop 1
// k=0: a * b.l0
call __load_orig_a
push 57
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __mul_accum
// k=1: a * b.l1, shift 1
call __load_orig_a
push 56
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __shl_one_limb
call __mul_accum
// k=2: a * b.l2, shift 2
call __load_orig_a
push 55
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __shl_one_limb
call __shl_one_limb
call __mul_accum
// k=3
call __load_orig_a
push 54
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __mul_accum
// k=4
call __load_orig_a
push 53
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __mul_accum
// k=5
call __load_orig_a
push 52
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __mul_accum
// k=6
call __load_orig_a
push 51
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __mul_accum
// k=7
call __load_orig_a
push 50
read_mem 1
pop 1
call __mul256_by_u32
pop 1
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __shl_one_limb
call __mul_accum
call __load_accum
return
// ===================================================================
// MODULAR ARITHMETIC
// ===================================================================
// mod_reduce_once(val: U256, m: U256) -> U256
// Entry: [m.l7..m.l0, val.l7..val.l0] Exit: [r.l7..r.l0]
// If val < m return val, else return val - m.
// Strategy: store both, compute val >= m. Use dual-skiz pattern:
// first skiz calls subtraction helper, second skiz (negated) is a no-op.
// Both paths store result to orig_a, which is loaded at the end.
__mod_reduce_once:
call __store_orig_b
call __store_orig_a
// Compute val >= m = NOT(val < m)
call __load_orig_a
call __load_orig_b
call __lt256
push -1
mul
push 1
add
// Stack: [val >= m]
// If val >= m: call helper to compute val-m and store to orig_a
skiz
call __mod_do_sub
// Load result: orig_a holds val (unchanged) or val-m (from helper)
call __load_orig_a
return
__mod_do_sub:
call __load_orig_a
call __load_orig_b
call __sub256
call __store_orig_a
return
// add_mod(a: U256, b: U256, m: U256) -> U256
// Entry: [m, b, a] Exit: [r]
__add_mod:
push 60
write_mem 5
write_mem 3
pop 1
call __add256
pop 1
call __load_mod
call __mod_reduce_once
return
// sub_mod(a: U256, b: U256, m: U256) -> U256
// Entry: [m, b, a] Exit: [r]
// If a >= b: a - b. If a < b: m - (b - a).
// Strategy: store all three. Save condition to mem[31].
// Use dual-skiz: first checks a>=b, second checks a<b.
// Both helpers store result to orig_a. Load at end.
__sub_mod:
push 60
write_mem 5
write_mem 3
pop 1
// Stack: [b, a]
call __store_orig_b
call __store_orig_a
// Compute a < b
call __load_orig_a
call __load_orig_b
call __lt256
// Stack: [a < b]
// Save condition to mem[31] for second check
dup 0
push 31
swap 1
write_mem 1
pop 1
// Negate: a >= b
push 31
read_mem 1
pop 1
push -1
mul
push 1
add
// If a >= b: compute a-b, store to orig_a
skiz
call __sub_mod_ge
// Reload condition (a < b)
push 31
read_mem 1
pop 1
// If a < b: compute m-(b-a), store to orig_a
skiz
call __sub_mod_lt
// Load result
call __load_orig_a
return
__sub_mod_ge:
// a >= b: a - b
call __load_orig_a
call __load_orig_b
call __sub256
call __store_orig_a
return
__sub_mod_lt:
// a < b: m - (b - a)
call __load_orig_b
call __load_orig_a
call __sub256
// Stack: [b - a]
call __store_orig_a
call __load_mod
call __load_orig_a
call __sub256
call __store_orig_a
return
// mul_mod(a: U256, b: U256, m: U256) -> U256
// Entry: [m, b, a] Exit: [r]
__mul_mod:
push 60
write_mem 5
write_mem 3
pop 1
call __mul256_low
call __load_mod
call __mod_reduce_once
return
// ===================================================================
// BITWISE
// ===================================================================
// and256(a: U256, b: U256) -> U256. 59 ops.
__and256:
call __store_b
call __store_a
push 0
read_mem 1
pop 1
push 10
read_mem 1
pop 1
and
push 1
read_mem 1
pop 1
push 11
read_mem 1
pop 1
and
push 2
read_mem 1
pop 1
push 12
read_mem 1
pop 1
and
push 3
read_mem 1
pop 1
push 13
read_mem 1
pop 1
and
push 4
read_mem 1
pop 1
push 14
read_mem 1
pop 1
and
push 5
read_mem 1
pop 1
push 15
read_mem 1
pop 1
and
push 6
read_mem 1
pop 1
push 16
read_mem 1
pop 1
and
push 7
read_mem 1
pop 1
push 17
read_mem 1
pop 1
and
return
// xor256(a: U256, b: U256) -> U256. 59 ops.
__xor256:
call __store_b
call __store_a
push 0
read_mem 1
pop 1
push 10
read_mem 1
pop 1
xor
push 1
read_mem 1
pop 1
push 11
read_mem 1
pop 1
xor
push 2
read_mem 1
pop 1
push 12
read_mem 1
pop 1
xor
push 3
read_mem 1
pop 1
push 13
read_mem 1
pop 1
xor
push 4
read_mem 1
pop 1
push 14
read_mem 1
pop 1
xor
push 5
read_mem 1
pop 1
push 15
read_mem 1
pop 1
xor
push 6
read_mem 1
pop 1
push 16
read_mem 1
pop 1
xor
push 7
read_mem 1
pop 1
push 17
read_mem 1
pop 1
xor
return