// Hand-optimized TASM baseline: std.crypto.ecdsa
//
// First principles rewrite. Key decisions:
//   - read_u256/divine_u256: batch I/O + split chain = 19 insns each
//   - read/divine_signature: call subroutines, 3 insns each
//   - write_u256: reverse + batch write = 13 insns
//   - write_signature: store s to mem, write r, reload s, write s
//   - is_zero_u256: OR chain (fewer insns than eq+and chain)
//   - lt256_sub: store b at addr 0, borrow chain, 8 limbs
//   - valid_range: store all 3 U256s, run 4 checks
//   - Memory: contiguous blocks, write_mem 5 + write_mem 3 for batch store
//
// U256 on stack: [l0 l1 l2 l3 l4 l5 l6 l7], l7 on top, l0 deepest.
// l0 is the least-significant 32-bit limb, l7 the most significant.
// Signature: [r.l0..r.l7 s.l0..s.l7], s.l7 on top.

// ===================================================================
// SUBROUTINES
// ===================================================================

// is_zero_u256: [l0..l7] -> [flag]
// flag = 1 when every limb is 0, otherwise 0. Consumes all eight limbs.
//
// The limbs are summed instead of OR-ed: `or` is not part of the Triton VM
// instruction set (its bitwise instructions are `and` and `xor`). Each limb is
// a U32, so the sum of eight limbs is below 2^35 and cannot wrap around the
// field modulus; the sum is therefore 0 exactly when all limbs are 0.
// Same instruction count as the OR chain mentioned in the file header.
__is_zero:
    add
    add
    add
    add
    add
    add
    add
    // sum == 0  <=>  all limbs were 0
    push 0
    eq
    return

// not_flag: [b] -> [!b]
// Logical negation by comparing with 0: the result is 1 when b is 0 and 0 for
// any other value, so a 0/1 flag comes back inverted.
__not:
    push 0
    eq
    return

// store_u256: [l0..l7 addr] -> []   (addr on top, l7 directly below it)
// Writes the eight limbs to RAM[addr..addr+7] and consumes all nine elements.
//
// write_mem takes its RAM pointer from the top of the stack (st0) and writes
// st1..stN upwards from that address, so the address must stay on top. Callers
// already leave it there with `push <addr>`, hence no swap is needed. The
// previous `swap 8` put l0 on top, which made write_mem use l0 as the pointer
// and write the address itself as data.
//
// Resulting layout: RAM[addr] = l7, RAM[addr+1] = l6, ..., RAM[addr+7] = l0.
// __load_u256 walks the same block downwards from addr+7 and so restores the
// limbs in their original stack order.
__store_u256:
    // st1..st5 = l7..l3 -> RAM[addr..addr+4]; pointer becomes addr+5
    write_mem 5
    // st1..st3 = l2..l0 -> RAM[addr+5..addr+7]; pointer becomes addr+8
    write_mem 3
    // drop the spent pointer
    pop 1
    return

// load_u256: [addr] -> [l0..l7]
// Pushes the eight words RAM[addr..addr+7] onto the stack and consumes addr.
// read_mem 5 reads [p-4..p] into st1..st5 and leaves p-5 on top, i.e. it walks
// downwards through memory, so the pointer is first advanced to addr+7, the
// highest word of the block.
__load_u256:
    // pointer = addr + 7
    push 7
    add
    // words addr+7 .. addr+3; pointer is now addr+2
    read_mem 5
    // words addr+2 .. addr; pointer is now addr-1
    read_mem 3
    // drop the spent pointer
    pop 1
    return

// lt256_sub: [a.l0..a.l7 b.l0..b.l7] -> [flag]
// Unsigned 256-bit comparison: flag = 1 if a < b, else 0. Limbs are U32 and
// l0 is the least-significant limb.
//
// Method: subtract b from a limb by limb, least-significant limb first, and
// propagate the borrow. For every limb
//     diff   = 2^32 + a_i - b_i - borrow_in        (always in [0, 2^33))
//     hi     = upper word of split(diff)           (1 = no borrow, 0 = borrow)
//     borrow = 1 - hi
// The borrow out of limb 7 is the result.
//
// Scratch memory: b is spilled to RAM[0..7]. write_mem stores the top-most
// stack element at the lowest address, so RAM[0] = b.l7 ... RAM[7] = b.l0 and
// limb i of b is read back from RAM[7 - i]. The spill is done inline here so
// that the address layout and the reads below are defined in one place.
__lt256:
    // Spill b: pointer 0 on top, limbs b.l7..b.l0 below it.
    push 0
    write_mem 5
    write_mem 3
    pop 1
    // Stack: a.l0..a.l7 (l7 on top)
    // Reverse a so a.l0 is on top (process LSB first)
    swap 7
    swap 1
    swap 6
    swap 1
    swap 2
    swap 5
    swap 2
    swap 3
    swap 4
    swap 3
    // Stack: a7 a6 a5 a4 a3 a2 a1 a0 (a0 on top)

    // Limb 0 (no incoming borrow): b.l0 = RAM[7]
    push 7
    read_mem 1
    pop 1
    // diff = a0 + (2^32 - b0)
    push -1
    mul
    push 4294967296
    add
    add
    // split leaves [hi lo] with lo on top; drop lo, keep hi
    split
    pop 1
    // borrow = 1 - hi
    push -1
    mul
    push 1
    add

    // Limbs 1-7: same pattern, additionally subtracting the incoming borrow.
    // Stack at the start of each step: ... a_i borrow

    // Limb 1: b.l1 = RAM[6]
    swap 1
    push 6
    read_mem 1
    pop 1
    push -1
    mul
    push 4294967296
    add
    add
    // bring the borrow to the top and subtract it
    swap 1
    push -1
    mul
    add
    split
    pop 1
    push -1
    mul
    push 1
    add

    // Limb 2: b.l2 = RAM[5]
    swap 1
    push 5
    read_mem 1
    pop 1
    push -1
    mul
    push 4294967296
    add
    add
    swap 1
    push -1
    mul
    add
    split
    pop 1
    push -1
    mul
    push 1
    add

    // Limb 3: b.l3 = RAM[4]
    swap 1
    push 4
    read_mem 1
    pop 1
    push -1
    mul
    push 4294967296
    add
    add
    swap 1
    push -1
    mul
    add
    split
    pop 1
    push -1
    mul
    push 1
    add

    // Limb 4: b.l4 = RAM[3]
    swap 1
    push 3
    read_mem 1
    pop 1
    push -1
    mul
    push 4294967296
    add
    add
    swap 1
    push -1
    mul
    add
    split
    pop 1
    push -1
    mul
    push 1
    add

    // Limb 5: b.l5 = RAM[2]
    swap 1
    push 2
    read_mem 1
    pop 1
    push -1
    mul
    push 4294967296
    add
    add
    swap 1
    push -1
    mul
    add
    split
    pop 1
    push -1
    mul
    push 1
    add

    // Limb 6: b.l6 = RAM[1]
    swap 1
    push 1
    read_mem 1
    pop 1
    push -1
    mul
    push 4294967296
    add
    add
    swap 1
    push -1
    mul
    add
    split
    pop 1
    push -1
    mul
    push 1
    add

    // Limb 7 (most significant): b.l7 = RAM[0]
    swap 1
    push 0
    read_mem 1
    pop 1
    push -1
    mul
    push 4294967296
    add
    add
    swap 1
    push -1
    mul
    add
    split
    pop 1
    push -1
    mul
    push 1
    add
    // Final borrow == (a < b)
    return

// reverse_8: [x0..x7] -> [x7..x0]
// Reverses the order of the top 8 stack elements by exchanging the pairs
// st0<->st7, st1<->st6, st2<->st5 and st3<->st4. Only st0 can be swapped
// directly, so each inner pair is exchanged through st0 with three swaps.
__rev8:
    // st0 <-> st7
    swap 7
    // st1 <-> st6
    swap 1
    swap 6
    swap 1
    // st2 <-> st5
    swap 2
    swap 5
    swap 2
    // st3 <-> st4
    swap 3
    swap 4
    swap 3
    return

// split_8: range-check the top 8 field elements as U32 values
// [f0..f7] (f7 on top) -> [u0..u7] (u7 on top), order unchanged.
//
// `split` replaces the top element with [hi lo], lo on top. For a value that
// fits in 32 bits hi is 0 and lo is the value itself, so each element is
// checked with
//     split / swap 1 / push 0 / eq / assert
// which asserts hi == 0 and leaves lo. Execution fails if any element does not
// fit in 32 bits.
//
// (The earlier `split / pop 1` dropped lo and kept hi, turning every valid
// limb into 0, and the swap chain followed by a reversal left the elements in
// the order [u0 u7 u6 .. u1].)
//
// Element i below the top is handled as `swap i`, check, `swap i`, which brings
// it to st0 and puts it back in its own slot afterwards.
__split8:
    // st0 (f7)
    split
    swap 1
    push 0
    eq
    assert
    // st1 (f6)
    swap 1
    split
    swap 1
    push 0
    eq
    assert
    swap 1
    // st2 (f5)
    swap 2
    split
    swap 1
    push 0
    eq
    assert
    swap 2
    // st3 (f4)
    swap 3
    split
    swap 1
    push 0
    eq
    assert
    swap 3
    // st4 (f3)
    swap 4
    split
    swap 1
    push 0
    eq
    assert
    swap 4
    // st5 (f2)
    swap 5
    split
    swap 1
    push 0
    eq
    assert
    swap 5
    // st6 (f1)
    swap 6
    split
    swap 1
    push 0
    eq
    assert
    swap 6
    // st7 (f0)
    swap 7
    split
    swap 1
    push 0
    eq
    assert
    swap 7
    return

// ===================================================================
// PUBLIC API
// ===================================================================

// read_u256: -> [l0..l7]
// Reads eight words with read_io in two batches (5 + 3), then passes all
// eight through __split8 to turn them into U32 limbs.
std_crypto_ecdsa__read_u256:
    read_io 5
    read_io 3
    call __split8
    return

// divine_u256: -> [l0..l7]
// Same shape as read_u256, but the eight words come from `divine` (5 + 3)
// instead of read_io. They are then passed through __split8.
std_crypto_ecdsa__divine_u256:
    divine 5
    divine 3
    call __split8
    return

// read_signature: -> [r.l0..r.l7 s.l0..s.l7]
// Two consecutive read_u256 calls: the first yields r, the second yields s,
// which therefore ends up on top.
std_crypto_ecdsa__read_signature:
    call std_crypto_ecdsa__read_u256
    call std_crypto_ecdsa__read_u256
    return

// divine_signature: -> [r.l0..r.l7 s.l0..s.l7]
// Two consecutive divine_u256 calls: the first yields r, the second yields s,
// which therefore ends up on top.
std_crypto_ecdsa__divine_signature:
    call std_crypto_ecdsa__divine_u256
    call std_crypto_ecdsa__divine_u256
    return

// valid_range: [r.l0..r.l7 s.l0..s.l7 order.l0..order.l7] -> [flag]
// flag = !is_zero(r) AND !is_zero(s) AND r < order AND s < order
//
// All three operands are first spilled to RAM (order, then s, then r, i.e. in
// the order they come off the stack) so that each can be reloaded as often as
// the four checks need it. Scratch regions used: 100.., 110.., 120.. here, plus
// whatever __lt256 uses internally. The partial results are combined with
// `and` as they are produced, so only one flag stays on the stack.
std_crypto_ecdsa__valid_range:
    // Store order at mem[100..107]
    push 100
    call __store_u256
    // Store s at mem[110..117]
    push 110
    call __store_u256
    // Store r at mem[120..127]
    push 120
    call __store_u256

    // Check 1: r != 0
    push 120
    call __load_u256
    call __is_zero
    call __not

    // Check 2: s != 0, combined with check 1
    push 110
    call __load_u256
    call __is_zero
    call __not
    and

    // Check 3: r < order (r loaded first so it is operand a of __lt256)
    push 120
    call __load_u256
    push 100
    call __load_u256
    call __lt256
    and

    // Check 4: s < order
    push 110
    call __load_u256
    push 100
    call __load_u256
    call __lt256
    and
    return

// is_low_s: [r.l0..r.l7 s.l0..s.l7 order.l0..order.l7] -> [flag]
// flag = 1 if s < order, else 0. r is discarded.
//
// s and order already sit on top of the stack in exactly the
// [a.l0..a.l7 b.l0..b.l7] layout that __lt256 consumes, so they are compared
// directly instead of being spilled to RAM and reloaded. Only afterwards are
// the eight limbs of r, which lie untouched below, removed. This routine no
// longer writes to mem[100..117].
std_crypto_ecdsa__is_low_s:
    // Stack: r.l0..r.l7 s.l0..s.l7 order.l0..order.l7
    call __lt256
    // Stack: r.l0..r.l7 flag. Park the flag below r (st8), bringing r.l0 up.
    swap 8
    // Discard r (8 limbs); the flag is what remains
    pop 5
    pop 3
    return

// write_u256: [l0..l7] -> (writes to public output)
// Reverses the eight limbs so that l0 is on top, then emits all of them with
// two write_io batches (5 + 3). Consumes the whole U256.
// NOTE(review): presumably write_io emits the top element first, making the
// output order l0..l7 - confirm against the VM specification.
std_crypto_ecdsa__write_u256:
    call __rev8
    write_io 5
    write_io 3
    return

// write_signature: [r.l0..r.l7 s.l0..s.l7] -> (writes r then s)
// s lies on top of r, but r has to be written first. s is therefore parked in
// RAM at 130.., r is written, and s is reloaded and written afterwards.
std_crypto_ecdsa__write_signature:
    // Store s at mem[130..137]
    push 130
    call __store_u256
    // Write r (now on top)
    call std_crypto_ecdsa__write_u256
    // Reload and write s
    push 130
    call __load_u256
    call std_crypto_ecdsa__write_u256
    return

Neighbours