// F₂ tower field arithmetic — Trident implementation
//
// Tower construction: each level F_{2^{2k}} = F_{2^k}[x] / (x² + x + α_k)
// where α_k is the canonical generator (element 0b10) of level k.
//
// Representation: F₂ elements are bit-packed into U32. Up to F₂³² fits
// in a single U32. F₂⁶⁴ uses 2 × U32, F₂¹²⁸ uses 4 × U32.
// All arithmetic is via hardware XOR (^) and AND (&).
//
// Wiedemann tower α values:
//   F₂⁴:  α = 0x02       F₂⁸:  α = 0x08
//   F₂¹⁶: α = 0x0080     F₂³²: α = 0x00008000
//   F₂⁶⁴: α = 0x80000000 (in lo word)
//   F₂¹²⁸: α = 0x80000000 (in w1, i.e., bit 63 of 128)

module kuro.tower

// =========================================================================
// F₂ — base field. 1 bit in U32.
// =========================================================================

// Element of the base field F₂. Only the lowest bit of `val` is meant to be used.
pub struct F2 { val: U32 }

// Additive identity of F₂.
pub fn f2_zero() -> F2 {
    F2 { val: 0u32 }
}

// Multiplicative identity of F₂.
pub fn f2_one() -> F2 {
    F2 { val: 1u32 }
}

// Addition in F₂: XOR of the two stored words.
pub fn f2_add(a: F2, b: F2) -> F2 {
    let sum: U32 = a.val ^ b.val;
    F2 { val: sum }
}

// Multiplication in F₂: AND of the two stored words.
pub fn f2_mul(a: F2, b: F2) -> F2 {
    let prod: U32 = a.val & b.val;
    F2 { val: prod }
}

// Inverse in F₂. The only invertible element is 1, which is its own inverse,
// so the input is returned unchanged. No zero check is made: the caller must
// ensure a ≠ 0.
pub fn f2_inv(a: F2) -> F2 {
    a
}

// Squaring in F₂ is the identity map (0² = 0, 1² = 1).
pub fn f2_square(a: F2) -> F2 {
    a
}
// Zero test for F₂.
//
// Follows the same convention as f2_64_is_zero / f2_128_is_zero: the result is
// 0u32 exactly when `a` is the zero element and non-zero otherwise, so callers
// test `f2_is_zero(a) == 0u32`.
//
// The previous body (`a.val ^ a.val`) was constantly 0 and therefore reported
// every element, including 1, as zero.
pub fn f2_is_zero(a: F2) -> U32 {
    a.val
}

// =========================================================================
// F₂² — first extension. 2 bits in U32.
// Irreducible: x² + x + 1 over F₂.
// =========================================================================

// Element of F₂² = F₂[x] / (x² + x + 1).
// Bit 0 of `val` is the constant coefficient, bit 1 the coefficient of x.
pub struct F2_2 { val: U32 }

// Additive identity of F₂².
pub fn f2_2_zero() -> F2_2 {
    F2_2 { val: 0u32 }
}

// Multiplicative identity of F₂².
pub fn f2_2_one() -> F2_2 {
    F2_2 { val: 1u32 }
}

// Addition in F₂²: coefficient-wise XOR.
pub fn f2_2_add(a: F2_2, b: F2_2) -> F2_2 {
    let sum: U32 = a.val ^ b.val;
    F2_2 { val: sum }
}

// Multiplication in F₂² = F₂[x] / (x² + x + 1).
//
// With a = a_lo + a_hi·x and b = b_lo + b_hi·x the schoolbook product is
//   a_lo·b_lo + (a_lo·b_hi + a_hi·b_lo)·x + a_hi·b_hi·x²
// and x² reduces to x + 1, so the x² coefficient is folded into both the
// constant term and the x term.
pub fn f2_2_mul(a: F2_2, b: F2_2) -> F2_2 {
    let b_lo: U32 = b.val & 1u32;
    let b_hi: U32 = (b.val >> 1u32) & 1u32;
    let a_lo: U32 = a.val & 1u32;
    let a_hi: U32 = (a.val >> 1u32) & 1u32;

    // Coefficient of x² before reduction; it contributes to both output bits.
    let top: U32 = a_hi & b_hi;

    let const_bit: U32 = (a_lo & b_lo) ^ top;
    let x_bit: U32 = (a_lo & b_hi) ^ (a_hi & b_lo) ^ top;

    F2_2 { val: (x_bit << 1u32) | const_bit }
}

// Squaring in F₂².
//
// (a0 + a1·x)² = a0 + a1·x² = (a0 + a1) + a1·x, because the cross term
// vanishes in characteristic 2 and x² = x + 1.
pub fn f2_2_square(a: F2_2) -> F2_2 {
    let top: U32 = (a.val >> 1u32) & 1u32;
    let bottom: U32 = (a.val & 1u32) ^ top;
    F2_2 { val: (top << 1u32) | bottom }
}

// Inverse in F₂².
//
// The multiplicative group has order 3, so a³ = 1 and a⁻¹ = a² for a ≠ 0.
// No zero check is made; squaring maps 0 to 0.
pub fn f2_2_inv(a: F2_2) -> F2_2 {
    let squared: F2_2 = f2_2_square(a);
    squared
}

// Trace F₂² → F₂.
//
// Tr(a0 + a1·x) = a + a² = a1, i.e. bit 1 of the packed value.
pub fn f2_2_trace(a: F2_2) -> F2 {
    let shifted: U32 = a.val >> 1u32;
    F2 { val: shifted & 1u32 }
}

// =========================================================================
// F₂⁴ — 4 bits in U32. Extension of F₂² by x² + x + α, α = 0x02.
// =========================================================================

// Element of F₂⁴, stored in the low 4 bits of `val`:
// bits [0:2) are the F₂² constant term, bits [2:4) the F₂² coefficient of x.
pub struct F2_4 { val: U32 }

// Additive identity of F₂⁴.
pub fn f2_4_zero() -> F2_4 {
    F2_4 { val: 0u32 }
}

// Multiplicative identity of F₂⁴.
pub fn f2_4_one() -> F2_4 {
    F2_4 { val: 1u32 }
}

// Low F₂² half (constant term) of an F₂⁴ element: bits [0:2).
fn f2_4_lo(a: F2_4) -> F2_2 {
    let bits: U32 = a.val & 0x03u32;
    F2_2 { val: bits }
}

// High F₂² half (coefficient of x) of an F₂⁴ element: bits [2:4).
fn f2_4_hi(a: F2_4) -> F2_2 {
    let bits: U32 = (a.val >> 2u32) & 0x03u32;
    F2_2 { val: bits }
}

// Assemble an F₂⁴ element from its halves. Both inputs are masked to 2 bits,
// so stray upper bits never leak into the result.
fn f2_4_pack(lo: F2_2, hi: F2_2) -> F2_4 {
    let low_bits: U32 = lo.val & 0x03u32;
    let high_bits: U32 = (hi.val & 0x03u32) << 2u32;
    F2_4 { val: low_bits | high_bits }
}

// Addition in F₂⁴: bitwise XOR of the packed values.
pub fn f2_4_add(a: F2_4, b: F2_4) -> F2_4 {
    let sum: U32 = a.val ^ b.val;
    F2_4 { val: sum }
}

// Multiplication in F₂⁴ = F₂²[x] / (x² + x + α), α = 0x02.
//
// Karatsuba over the two F₂² halves (three sub-field products instead of four):
//   low  = a0·b0
//   high = a1·b1
//   mid  = (a0 + a1)(b0 + b1)
// After reducing x² = x + α:
//   constant term = low + high·α
//   x coefficient = (mid + low + high) + high = mid + low   (char 2)
pub fn f2_4_mul(a: F2_4, b: F2_4) -> F2_4 {
    let a0: F2_2 = f2_4_lo(a);
    let b0: F2_2 = f2_4_lo(b);
    let a1: F2_2 = f2_4_hi(a);
    let b1: F2_2 = f2_4_hi(b);

    let low: F2_2 = f2_2_mul(a0, b0);
    let high: F2_2 = f2_2_mul(a1, b1);
    let mid: F2_2 = f2_2_mul(f2_2_add(a0, a1), f2_2_add(b0, b1));

    // The two `high` contributions to the x coefficient cancel under XOR.
    let out_hi: F2_2 = f2_2_add(mid, low);

    let alpha_elem: F2_2 = F2_2 { val: 2u32 };
    let out_lo: F2_2 = f2_2_add(low, f2_2_mul(high, alpha_elem));

    f2_4_pack(out_lo, out_hi)
}

// Squaring in F₂⁴.
//
// (lo + hi·x)² = lo² + hi²·x² = (lo² + hi²·α) + hi²·x with α = 0x02;
// the cross term vanishes in characteristic 2.
pub fn f2_4_square(a: F2_4) -> F2_4 {
    let hi_squared: F2_2 = f2_2_square(f2_4_hi(a));
    let lo_squared: F2_2 = f2_2_square(f2_4_lo(a));
    let alpha_elem: F2_2 = F2_2 { val: 2u32 };
    let folded: F2_2 = f2_2_mul(hi_squared, alpha_elem);
    let out_lo: F2_2 = f2_2_add(lo_squared, folded);
    f2_4_pack(out_lo, hi_squared)
}

// Inverse in F₂⁴ via the norm down to F₂².
//
// For a = lo + hi·x the conjugate is (lo + hi) + hi·x, and
//   norm = a·conj(a) = lo·(lo + hi) + hi²·α        (an element of F₂²)
//   a⁻¹  = conj(a) / norm
// No zero check is made here; the caller must ensure a ≠ 0.
pub fn f2_4_inv(a: F2_4) -> F2_4 {
    let hi: F2_2 = f2_4_hi(a);
    let lo: F2_2 = f2_4_lo(a);
    let conj_lo: F2_2 = f2_2_add(lo, hi);

    let alpha_elem: F2_2 = F2_2 { val: 2u32 };
    let lo_term: F2_2 = f2_2_mul(lo, conj_lo);
    let hi_term: F2_2 = f2_2_mul(f2_2_square(hi), alpha_elem);
    let norm: F2_2 = f2_2_add(lo_term, hi_term);

    let norm_inv: F2_2 = f2_2_inv(norm);
    let out_lo: F2_2 = f2_2_mul(norm_inv, conj_lo);
    let out_hi: F2_2 = f2_2_mul(norm_inv, hi);
    f2_4_pack(out_lo, out_hi)
}

// Absolute trace F₂⁴ → F₂.
//
// For a = a_lo + a_hi·x with x² + x + α = 0 the conjugate of x is x + 1, so
// the relative trace a + conj(a) down to F₂² is a_hi (the a_lo terms cancel).
// The trace of F₂² over F₂ is bit 1 of its argument, hence the absolute trace
// is bit 1 of a_hi, i.e. bit 3 of the packed value.
//
// The previous form, trace(a_lo + a_hi), was wrong: for a sub-field element
// such as 0x2 (a_hi = 0) it returned 1, although every element of F₂² has
// trace 0 over F₂ when viewed inside F₂⁴ (extension degree 2).
pub fn f2_4_trace(a: F2_4) -> F2 {
    F2 { val: (a.val >> 3u32) & 1u32 }
}

// =========================================================================
// F₂⁸ — 8 bits in U32. Extension of F₂⁴ by x² + x + α, α = 0x08.
// =========================================================================

// Element of F₂⁸, stored in the low 8 bits of `val`:
// bits [0:4) are the F₂⁴ constant term, bits [4:8) the F₂⁴ coefficient of x.
pub struct F2_8 { val: U32 }

// Additive identity of F₂⁸.
pub fn f2_8_zero() -> F2_8 {
    F2_8 { val: 0u32 }
}

// Multiplicative identity of F₂⁸.
pub fn f2_8_one() -> F2_8 {
    F2_8 { val: 1u32 }
}

// Low F₂⁴ half (constant term) of an F₂⁸ element: bits [0:4).
fn f2_8_lo(a: F2_8) -> F2_4 {
    let bits: U32 = a.val & 0x0Fu32;
    F2_4 { val: bits }
}

// High F₂⁴ half (coefficient of x) of an F₂⁸ element: bits [4:8).
fn f2_8_hi(a: F2_8) -> F2_4 {
    let bits: U32 = (a.val >> 4u32) & 0x0Fu32;
    F2_4 { val: bits }
}

// Assemble an F₂⁸ element from its halves; both inputs are masked to 4 bits.
fn f2_8_pack(lo: F2_4, hi: F2_4) -> F2_8 {
    let low_bits: U32 = lo.val & 0x0Fu32;
    let high_bits: U32 = (hi.val & 0x0Fu32) << 4u32;
    F2_8 { val: low_bits | high_bits }
}

// Addition in F₂⁸: bitwise XOR of the packed values.
pub fn f2_8_add(a: F2_8, b: F2_8) -> F2_8 {
    let sum: U32 = a.val ^ b.val;
    F2_8 { val: sum }
}

// Multiplication in F₂⁸ = F₂⁴[x] / (x² + x + α), α = 0x08.
//
// Karatsuba over the two F₂⁴ halves:
//   low = a0·b0, high = a1·b1, mid = (a0 + a1)(b0 + b1)
//   constant term = low + high·α
//   x coefficient = (mid + low + high) + high = mid + low   (char 2)
pub fn f2_8_mul(a: F2_8, b: F2_8) -> F2_8 {
    let a0: F2_4 = f2_8_lo(a);
    let b0: F2_4 = f2_8_lo(b);
    let a1: F2_4 = f2_8_hi(a);
    let b1: F2_4 = f2_8_hi(b);

    let low: F2_4 = f2_4_mul(a0, b0);
    let high: F2_4 = f2_4_mul(a1, b1);
    let mid: F2_4 = f2_4_mul(f2_4_add(a0, a1), f2_4_add(b0, b1));

    // The two `high` contributions to the x coefficient cancel under XOR.
    let out_hi: F2_4 = f2_4_add(mid, low);

    let alpha_elem: F2_4 = F2_4 { val: 0x08u32 };
    let out_lo: F2_4 = f2_4_add(low, f2_4_mul(high, alpha_elem));

    f2_8_pack(out_lo, out_hi)
}

// Squaring in F₂⁸.
//
// (lo + hi·x)² = (lo² + hi²·α) + hi²·x with α = 0x08; the cross term
// vanishes in characteristic 2.
pub fn f2_8_square(a: F2_8) -> F2_8 {
    let hi_squared: F2_4 = f2_4_square(f2_8_hi(a));
    let lo_squared: F2_4 = f2_4_square(f2_8_lo(a));
    let alpha_elem: F2_4 = F2_4 { val: 0x08u32 };
    let folded: F2_4 = f2_4_mul(hi_squared, alpha_elem);
    let out_lo: F2_4 = f2_4_add(lo_squared, folded);
    f2_8_pack(out_lo, hi_squared)
}

// Inverse in F₂⁸ via the norm down to F₂⁴.
//
//   norm = lo·(lo + hi) + hi²·α        (an element of F₂⁴, α = 0x08)
//   a⁻¹  = norm⁻¹·(lo + hi) + norm⁻¹·hi·x
// No zero check is made here; the caller must ensure a ≠ 0.
pub fn f2_8_inv(a: F2_8) -> F2_8 {
    let hi: F2_4 = f2_8_hi(a);
    let lo: F2_4 = f2_8_lo(a);
    let conj_lo: F2_4 = f2_4_add(lo, hi);

    let alpha_elem: F2_4 = F2_4 { val: 0x08u32 };
    let lo_term: F2_4 = f2_4_mul(lo, conj_lo);
    let hi_term: F2_4 = f2_4_mul(f2_4_square(hi), alpha_elem);
    let norm: F2_4 = f2_4_add(lo_term, hi_term);

    let norm_inv: F2_4 = f2_4_inv(norm);
    let out_lo: F2_4 = f2_4_mul(norm_inv, conj_lo);
    let out_hi: F2_4 = f2_4_mul(norm_inv, hi);
    f2_8_pack(out_lo, out_hi)
}

// Absolute trace F₂⁸ → F₂.
//
// For a = a_lo + a_hi·x with x² + x + α = 0 the conjugate of x is x + 1, so
// the relative trace down to F₂⁴ is a_hi (the a_lo terms cancel). Applying the
// same argument at every lower level of the tower, the absolute trace is the
// most significant bit of the packed value: bit 7.
//
// The previous form, trace(a_lo + a_hi), was wrong: for a sub-field element
// (a_hi = 0) it returned the sub-field trace of a_lo, although every element
// of F₂⁴ has trace 0 over F₂ when viewed inside F₂⁸ (extension degree 2).
pub fn f2_8_trace(a: F2_8) -> F2 {
    F2 { val: (a.val >> 7u32) & 1u32 }
}

// =========================================================================
// F₂¹⁶ — 16 bits in U32. Extension of F₂⁸ by x² + x + α, α = 0x80.
// =========================================================================

// Element of F₂¹⁶, stored in the low 16 bits of `val`:
// bits [0:8) are the F₂⁸ constant term, bits [8:16) the F₂⁸ coefficient of x.
pub struct F2_16 { val: U32 }

// Additive identity of F₂¹⁶.
pub fn f2_16_zero() -> F2_16 {
    F2_16 { val: 0u32 }
}

// Multiplicative identity of F₂¹⁶.
pub fn f2_16_one() -> F2_16 {
    F2_16 { val: 1u32 }
}

// Low F₂⁸ half (constant term) of an F₂¹⁶ element: bits [0:8).
fn f2_16_lo(a: F2_16) -> F2_8 {
    let bits: U32 = a.val & 0xFFu32;
    F2_8 { val: bits }
}

// High F₂⁸ half (coefficient of x) of an F₂¹⁶ element: bits [8:16).
fn f2_16_hi(a: F2_16) -> F2_8 {
    let bits: U32 = (a.val >> 8u32) & 0xFFu32;
    F2_8 { val: bits }
}

// Assemble an F₂¹⁶ element from its halves; both inputs are masked to 8 bits.
fn f2_16_pack(lo: F2_8, hi: F2_8) -> F2_16 {
    let low_bits: U32 = lo.val & 0xFFu32;
    let high_bits: U32 = (hi.val & 0xFFu32) << 8u32;
    F2_16 { val: low_bits | high_bits }
}

// Addition in F₂¹⁶: bitwise XOR of the packed values.
pub fn f2_16_add(a: F2_16, b: F2_16) -> F2_16 {
    let sum: U32 = a.val ^ b.val;
    F2_16 { val: sum }
}

// Multiplication in F₂¹⁶ = F₂⁸[x] / (x² + x + α), α = 0x80.
//
// Karatsuba over the two F₂⁸ halves:
//   low = a0·b0, high = a1·b1, mid = (a0 + a1)(b0 + b1)
//   constant term = low + high·α
//   x coefficient = (mid + low + high) + high = mid + low   (char 2)
pub fn f2_16_mul(a: F2_16, b: F2_16) -> F2_16 {
    let a0: F2_8 = f2_16_lo(a);
    let b0: F2_8 = f2_16_lo(b);
    let a1: F2_8 = f2_16_hi(a);
    let b1: F2_8 = f2_16_hi(b);

    let low: F2_8 = f2_8_mul(a0, b0);
    let high: F2_8 = f2_8_mul(a1, b1);
    let mid: F2_8 = f2_8_mul(f2_8_add(a0, a1), f2_8_add(b0, b1));

    // The two `high` contributions to the x coefficient cancel under XOR.
    let out_hi: F2_8 = f2_8_add(mid, low);

    let alpha_elem: F2_8 = F2_8 { val: 0x80u32 };
    let out_lo: F2_8 = f2_8_add(low, f2_8_mul(high, alpha_elem));

    f2_16_pack(out_lo, out_hi)
}

// Squaring in F₂¹⁶.
//
// (lo + hi·x)² = (lo² + hi²·α) + hi²·x with α = 0x80; the cross term
// vanishes in characteristic 2.
pub fn f2_16_square(a: F2_16) -> F2_16 {
    let hi_squared: F2_8 = f2_8_square(f2_16_hi(a));
    let lo_squared: F2_8 = f2_8_square(f2_16_lo(a));
    let alpha_elem: F2_8 = F2_8 { val: 0x80u32 };
    let folded: F2_8 = f2_8_mul(hi_squared, alpha_elem);
    let out_lo: F2_8 = f2_8_add(lo_squared, folded);
    f2_16_pack(out_lo, hi_squared)
}

// Inverse in F₂¹⁶ via the norm down to F₂⁸.
//
//   norm = lo·(lo + hi) + hi²·α        (an element of F₂⁸, α = 0x80)
//   a⁻¹  = norm⁻¹·(lo + hi) + norm⁻¹·hi·x
// No zero check is made here; the caller must ensure a ≠ 0.
pub fn f2_16_inv(a: F2_16) -> F2_16 {
    let hi: F2_8 = f2_16_hi(a);
    let lo: F2_8 = f2_16_lo(a);
    let conj_lo: F2_8 = f2_8_add(lo, hi);

    let alpha_elem: F2_8 = F2_8 { val: 0x80u32 };
    let lo_term: F2_8 = f2_8_mul(lo, conj_lo);
    let hi_term: F2_8 = f2_8_mul(f2_8_square(hi), alpha_elem);
    let norm: F2_8 = f2_8_add(lo_term, hi_term);

    let norm_inv: F2_8 = f2_8_inv(norm);
    let out_lo: F2_8 = f2_8_mul(norm_inv, conj_lo);
    let out_hi: F2_8 = f2_8_mul(norm_inv, hi);
    f2_16_pack(out_lo, out_hi)
}

// Absolute trace F₂¹⁶ → F₂.
//
// For a = a_lo + a_hi·x with x² + x + α = 0 the conjugate of x is x + 1, so
// the relative trace down to F₂⁸ is a_hi (the a_lo terms cancel). Applying the
// same argument at every lower level of the tower, the absolute trace is the
// most significant bit of the packed value: bit 15.
//
// The previous form, trace(a_lo + a_hi), was wrong: for a sub-field element
// (a_hi = 0) it returned the sub-field trace of a_lo, although every element
// of F₂⁸ has trace 0 over F₂ when viewed inside F₂¹⁶ (extension degree 2).
pub fn f2_16_trace(a: F2_16) -> F2 {
    F2 { val: (a.val >> 15u32) & 1u32 }
}

// =========================================================================
// F₂³² — 32 bits in U32. Extension of F₂¹⁶ by x² + x + α, α = 0x8000.
// =========================================================================

// Element of F₂³², filling the whole U32:
// bits [0:16) are the F₂¹⁶ constant term, bits [16:32) the F₂¹⁶ coefficient of x.
pub struct F2_32 { val: U32 }

// Additive identity of F₂³².
pub fn f2_32_zero() -> F2_32 {
    F2_32 { val: 0u32 }
}

// Multiplicative identity of F₂³².
pub fn f2_32_one() -> F2_32 {
    F2_32 { val: 1u32 }
}

// Low F₂¹⁶ half (constant term) of an F₂³² element: bits [0:16).
fn f2_32_lo(a: F2_32) -> F2_16 {
    let bits: U32 = a.val & 0xFFFFu32;
    F2_16 { val: bits }
}

// High F₂¹⁶ half (coefficient of x) of an F₂³² element: bits [16:32).
fn f2_32_hi(a: F2_32) -> F2_16 {
    let bits: U32 = (a.val >> 16u32) & 0xFFFFu32;
    F2_16 { val: bits }
}

// Assemble an F₂³² element from its halves; both inputs are masked to 16 bits.
fn f2_32_pack(lo: F2_16, hi: F2_16) -> F2_32 {
    let low_bits: U32 = lo.val & 0xFFFFu32;
    let high_bits: U32 = (hi.val & 0xFFFFu32) << 16u32;
    F2_32 { val: low_bits | high_bits }
}

// Addition in F₂³²: bitwise XOR of the packed values.
pub fn f2_32_add(a: F2_32, b: F2_32) -> F2_32 {
    let sum: U32 = a.val ^ b.val;
    F2_32 { val: sum }
}

// Multiplication in F₂³² = F₂¹⁶[x] / (x² + x + α), α = 0x8000.
//
// Karatsuba over the two F₂¹⁶ halves:
//   low = a0·b0, high = a1·b1, mid = (a0 + a1)(b0 + b1)
//   constant term = low + high·α
//   x coefficient = (mid + low + high) + high = mid + low   (char 2)
pub fn f2_32_mul(a: F2_32, b: F2_32) -> F2_32 {
    let a0: F2_16 = f2_32_lo(a);
    let b0: F2_16 = f2_32_lo(b);
    let a1: F2_16 = f2_32_hi(a);
    let b1: F2_16 = f2_32_hi(b);

    let low: F2_16 = f2_16_mul(a0, b0);
    let high: F2_16 = f2_16_mul(a1, b1);
    let mid: F2_16 = f2_16_mul(f2_16_add(a0, a1), f2_16_add(b0, b1));

    // The two `high` contributions to the x coefficient cancel under XOR.
    let out_hi: F2_16 = f2_16_add(mid, low);

    let alpha_elem: F2_16 = F2_16 { val: 0x8000u32 };
    let out_lo: F2_16 = f2_16_add(low, f2_16_mul(high, alpha_elem));

    f2_32_pack(out_lo, out_hi)
}

// Squaring in F₂³².
//
// (lo + hi·x)² = (lo² + hi²·α) + hi²·x with α = 0x8000; the cross term
// vanishes in characteristic 2.
pub fn f2_32_square(a: F2_32) -> F2_32 {
    let hi_squared: F2_16 = f2_16_square(f2_32_hi(a));
    let lo_squared: F2_16 = f2_16_square(f2_32_lo(a));
    let alpha_elem: F2_16 = F2_16 { val: 0x8000u32 };
    let folded: F2_16 = f2_16_mul(hi_squared, alpha_elem);
    let out_lo: F2_16 = f2_16_add(lo_squared, folded);
    f2_32_pack(out_lo, hi_squared)
}

// Inverse in F₂³² via the norm down to F₂¹⁶.
//
//   norm = lo·(lo + hi) + hi²·α        (an element of F₂¹⁶, α = 0x8000)
//   a⁻¹  = norm⁻¹·(lo + hi) + norm⁻¹·hi·x
// No zero check is made here; the caller must ensure a ≠ 0.
pub fn f2_32_inv(a: F2_32) -> F2_32 {
    let hi: F2_16 = f2_32_hi(a);
    let lo: F2_16 = f2_32_lo(a);
    let conj_lo: F2_16 = f2_16_add(lo, hi);

    let alpha_elem: F2_16 = F2_16 { val: 0x8000u32 };
    let lo_term: F2_16 = f2_16_mul(lo, conj_lo);
    let hi_term: F2_16 = f2_16_mul(f2_16_square(hi), alpha_elem);
    let norm: F2_16 = f2_16_add(lo_term, hi_term);

    let norm_inv: F2_16 = f2_16_inv(norm);
    let out_lo: F2_16 = f2_16_mul(norm_inv, conj_lo);
    let out_hi: F2_16 = f2_16_mul(norm_inv, hi);
    f2_32_pack(out_lo, out_hi)
}

// Absolute trace F₂³² → F₂.
//
// For a = a_lo + a_hi·x with x² + x + α = 0 the conjugate of x is x + 1, so
// the relative trace down to F₂¹⁶ is a_hi (the a_lo terms cancel). Applying
// the same argument at every lower level of the tower, the absolute trace is
// the most significant bit of the packed value: bit 31.
//
// The previous form, trace(a_lo + a_hi), was wrong: for a sub-field element
// (a_hi = 0) it returned the sub-field trace of a_lo, although every element
// of F₂¹⁶ has trace 0 over F₂ when viewed inside F₂³² (extension degree 2).
pub fn f2_32_trace(a: F2_32) -> F2 {
    F2 { val: (a.val >> 31u32) & 1u32 }
}

// =========================================================================
// F₂⁶⁴ — 64 bits in 2 × U32. Extension of F₂³² by x² + x + α.
// α = 0x80000000 (bit 31 of the F₂³² sub-field).
// Layout: lo = bits [0:32), hi = bits [32:64).
// =========================================================================

// Element of F₂⁶⁴ as two words: `lo` holds bits [0:32) (the F₂³² constant
// term), `hi` holds bits [32:64) (the F₂³² coefficient of x).
pub struct F2_64 { lo: U32, hi: U32 }

// Additive identity of F₂⁶⁴.
pub fn f2_64_zero() -> F2_64 {
    F2_64 { lo: 0u32, hi: 0u32 }
}

// Multiplicative identity of F₂⁶⁴.
pub fn f2_64_one() -> F2_64 {
    F2_64 { lo: 1u32, hi: 0u32 }
}

// Low F₂³² half (constant term) of an F₂⁶⁴ element: the `lo` word.
fn f2_64_get_lo(a: F2_64) -> F2_32 {
    let word: U32 = a.lo;
    F2_32 { val: word }
}

// High F₂³² half (coefficient of x) of an F₂⁶⁴ element: the `hi` word.
fn f2_64_get_hi(a: F2_64) -> F2_32 {
    let word: U32 = a.hi;
    F2_32 { val: word }
}

// Assemble an F₂⁶⁴ element from its halves. Each half occupies a full word,
// so no masking or shifting is needed.
fn f2_64_pack(lo: F2_32, hi: F2_32) -> F2_64 {
    let low_word: U32 = lo.val;
    let high_word: U32 = hi.val;
    F2_64 { lo: low_word, hi: high_word }
}

// Addition in F₂⁶⁴: word-wise XOR.
pub fn f2_64_add(a: F2_64, b: F2_64) -> F2_64 {
    let low_word: U32 = a.lo ^ b.lo;
    let high_word: U32 = a.hi ^ b.hi;
    F2_64 { lo: low_word, hi: high_word }
}

// Multiplication in F₂⁶⁴ = F₂³²[x] / (x² + x + α), α = 0x80000000.
//
// Karatsuba over the two F₂³² halves:
//   low = a0·b0, high = a1·b1, mid = (a0 + a1)(b0 + b1)
//   constant term = low + high·α
//   x coefficient = (mid + low + high) + high = mid + low   (char 2)
pub fn f2_64_mul(a: F2_64, b: F2_64) -> F2_64 {
    let a0: F2_32 = f2_64_get_lo(a);
    let b0: F2_32 = f2_64_get_lo(b);
    let a1: F2_32 = f2_64_get_hi(a);
    let b1: F2_32 = f2_64_get_hi(b);

    let low: F2_32 = f2_32_mul(a0, b0);
    let high: F2_32 = f2_32_mul(a1, b1);
    let mid: F2_32 = f2_32_mul(f2_32_add(a0, a1), f2_32_add(b0, b1));

    // The two `high` contributions to the x coefficient cancel under XOR.
    let out_hi: F2_32 = f2_32_add(mid, low);

    let alpha_elem: F2_32 = F2_32 { val: 0x80000000u32 };
    let out_lo: F2_32 = f2_32_add(low, f2_32_mul(high, alpha_elem));

    f2_64_pack(out_lo, out_hi)
}

// Squaring in F₂⁶⁴.
//
// (lo + hi·x)² = (lo² + hi²·α) + hi²·x with α = 0x80000000; the cross term
// vanishes in characteristic 2.
pub fn f2_64_square(a: F2_64) -> F2_64 {
    let hi_squared: F2_32 = f2_32_square(f2_64_get_hi(a));
    let lo_squared: F2_32 = f2_32_square(f2_64_get_lo(a));
    let alpha_elem: F2_32 = F2_32 { val: 0x80000000u32 };
    let folded: F2_32 = f2_32_mul(hi_squared, alpha_elem);
    let out_lo: F2_32 = f2_32_add(lo_squared, folded);
    f2_64_pack(out_lo, hi_squared)
}

// Inverse in F₂⁶⁴ via the norm down to F₂³².
//
//   norm = lo·(lo + hi) + hi²·α        (an element of F₂³², α = 0x80000000)
//   a⁻¹  = norm⁻¹·(lo + hi) + norm⁻¹·hi·x
// No zero check is made here; the caller must ensure a ≠ 0.
pub fn f2_64_inv(a: F2_64) -> F2_64 {
    let hi: F2_32 = f2_64_get_hi(a);
    let lo: F2_32 = f2_64_get_lo(a);
    let conj_lo: F2_32 = f2_32_add(lo, hi);

    let alpha_elem: F2_32 = F2_32 { val: 0x80000000u32 };
    let lo_term: F2_32 = f2_32_mul(lo, conj_lo);
    let hi_term: F2_32 = f2_32_mul(f2_32_square(hi), alpha_elem);
    let norm: F2_32 = f2_32_add(lo_term, hi_term);

    let norm_inv: F2_32 = f2_32_inv(norm);
    let out_lo: F2_32 = f2_32_mul(norm_inv, conj_lo);
    let out_hi: F2_32 = f2_32_mul(norm_inv, hi);
    f2_64_pack(out_lo, out_hi)
}

// Absolute trace F₂⁶⁴ → F₂.
//
// For a = a_lo + a_hi·x with x² + x + α = 0 the conjugate of x is x + 1, so
// the relative trace down to F₂³² is a_hi (the a_lo terms cancel). Applying
// the same argument at every lower level of the tower, the absolute trace is
// the most significant bit of the element: bit 63, i.e. bit 31 of `hi`.
//
// The previous form, trace(a_lo + a_hi), was wrong: for a sub-field element
// (a_hi = 0) it returned the sub-field trace of a_lo, although every element
// of F₂³² has trace 0 over F₂ when viewed inside F₂⁶⁴ (extension degree 2).
pub fn f2_64_trace(a: F2_64) -> F2 {
    F2 { val: (a.hi >> 31u32) & 1u32 }
}

// Zero test for F₂⁶⁴.
//
// Returns the bitwise OR of both words: the result is 0u32 exactly when `a`
// is the zero element and some non-zero value otherwise. The result is NOT
// normalised to a 0/1 flag; callers compare it against 0u32.
pub fn f2_64_is_zero(a: F2_64) -> U32 {
    a.lo | a.hi
}

// =========================================================================
// F₂¹²⁸ — 128 bits in 4 × U32. Extension of F₂⁶⁴ by x² + x + α.
// α = 0x80000000_00000000 in F₂⁶⁴, i.e., F2_64 { lo: 0, hi: 0x80000000 }.
// Layout: w0 w1 = lo half (F₂⁶⁴), w2 w3 = hi half (F₂⁶⁴).
// =========================================================================

// Element of F₂¹²⁸ as four words: (w0, w1) form the low F₂⁶⁴ half (constant
// term) and (w2, w3) the high F₂⁶⁴ half (coefficient of x).
pub struct F2_128 { w0: U32, w1: U32, w2: U32, w3: U32 }

// Additive identity of F₂¹²⁸.
pub fn f2_128_zero() -> F2_128 {
    F2_128 {
        w0: 0u32,
        w1: 0u32,
        w2: 0u32,
        w3: 0u32,
    }
}

// Multiplicative identity of F₂¹²⁸.
pub fn f2_128_one() -> F2_128 {
    F2_128 {
        w0: 1u32,
        w1: 0u32,
        w2: 0u32,
        w3: 0u32,
    }
}

// Low F₂⁶⁴ half (constant term) of an F₂¹²⁸ element: words w0, w1.
fn f2_128_get_lo(a: F2_128) -> F2_64 {
    F2_64 {
        lo: a.w0,
        hi: a.w1,
    }
}

// High F₂⁶⁴ half (coefficient of x) of an F₂¹²⁸ element: words w2, w3.
fn f2_128_get_hi(a: F2_128) -> F2_64 {
    F2_64 {
        lo: a.w2,
        hi: a.w3,
    }
}

// Assemble an F₂¹²⁸ element from its two F₂⁶⁴ halves.
fn f2_128_pack(lo: F2_64, hi: F2_64) -> F2_128 {
    F2_128 {
        w0: lo.lo,
        w1: lo.hi,
        w2: hi.lo,
        w3: hi.hi,
    }
}

// Addition in F₂¹²⁸: word-wise XOR.
pub fn f2_128_add(a: F2_128, b: F2_128) -> F2_128 {
    let s0: U32 = a.w0 ^ b.w0;
    let s1: U32 = a.w1 ^ b.w1;
    let s2: U32 = a.w2 ^ b.w2;
    let s3: U32 = a.w3 ^ b.w3;
    F2_128 { w0: s0, w1: s1, w2: s2, w3: s3 }
}

// Multiplication in F₂¹²⁸ = F₂⁶⁴[x] / (x² + x + α),
// α = bit 63 of F₂⁶⁴ = { lo: 0, hi: 0x80000000 }.
//
// Karatsuba over the two F₂⁶⁴ halves:
//   low = a0·b0, high = a1·b1, mid = (a0 + a1)(b0 + b1)
//   constant term = low + high·α
//   x coefficient = (mid + low + high) + high = mid + low   (char 2)
pub fn f2_128_mul(a: F2_128, b: F2_128) -> F2_128 {
    let a0: F2_64 = f2_128_get_lo(a);
    let b0: F2_64 = f2_128_get_lo(b);
    let a1: F2_64 = f2_128_get_hi(a);
    let b1: F2_64 = f2_128_get_hi(b);

    let low: F2_64 = f2_64_mul(a0, b0);
    let high: F2_64 = f2_64_mul(a1, b1);
    let mid: F2_64 = f2_64_mul(f2_64_add(a0, a1), f2_64_add(b0, b1));

    // The two `high` contributions to the x coefficient cancel under XOR.
    let out_hi: F2_64 = f2_64_add(mid, low);

    let alpha_elem: F2_64 = F2_64 { lo: 0u32, hi: 0x80000000u32 };
    let out_lo: F2_64 = f2_64_add(low, f2_64_mul(high, alpha_elem));

    f2_128_pack(out_lo, out_hi)
}

// Squaring in F₂¹²⁸.
//
// (lo + hi·x)² = (lo² + hi²·α) + hi²·x with α = { lo: 0, hi: 0x80000000 };
// the cross term vanishes in characteristic 2.
pub fn f2_128_square(a: F2_128) -> F2_128 {
    let hi_squared: F2_64 = f2_64_square(f2_128_get_hi(a));
    let lo_squared: F2_64 = f2_64_square(f2_128_get_lo(a));
    let alpha_elem: F2_64 = F2_64 { lo: 0u32, hi: 0x80000000u32 };
    let folded: F2_64 = f2_64_mul(hi_squared, alpha_elem);
    let out_lo: F2_64 = f2_64_add(lo_squared, folded);
    f2_128_pack(out_lo, hi_squared)
}

// Inverse in F₂¹²⁸ via the norm down to F₂⁶⁴.
//
//   norm = lo·(lo + hi) + hi²·α     (in F₂⁶⁴, α = { lo: 0, hi: 0x80000000 })
//   a⁻¹  = norm⁻¹·(lo + hi) + norm⁻¹·hi·x
// No zero check is made here; the caller must ensure a ≠ 0.
pub fn f2_128_inv(a: F2_128) -> F2_128 {
    let hi: F2_64 = f2_128_get_hi(a);
    let lo: F2_64 = f2_128_get_lo(a);
    let conj_lo: F2_64 = f2_64_add(lo, hi);

    let alpha_elem: F2_64 = F2_64 { lo: 0u32, hi: 0x80000000u32 };
    let lo_term: F2_64 = f2_64_mul(lo, conj_lo);
    let hi_term: F2_64 = f2_64_mul(f2_64_square(hi), alpha_elem);
    let norm: F2_64 = f2_64_add(lo_term, hi_term);

    let norm_inv: F2_64 = f2_64_inv(norm);
    let out_lo: F2_64 = f2_64_mul(norm_inv, conj_lo);
    let out_hi: F2_64 = f2_64_mul(norm_inv, hi);
    f2_128_pack(out_lo, out_hi)
}

// Absolute trace F₂¹²⁸ → F₂.
//
// For a = a_lo + a_hi·x with x² + x + α = 0 the conjugate of x is x + 1, so
// the relative trace down to F₂⁶⁴ is a_hi (the a_lo terms cancel). Applying
// the same argument at every lower level of the tower, the absolute trace is
// the most significant bit of the element: bit 127, i.e. bit 31 of `w3`.
//
// The previous form, trace(a_lo + a_hi), was wrong: for a sub-field element
// (a_hi = 0) it returned the sub-field trace of a_lo, although every element
// of F₂⁶⁴ has trace 0 over F₂ when viewed inside F₂¹²⁸ (extension degree 2).
pub fn f2_128_trace(a: F2_128) -> F2 {
    F2 { val: (a.w3 >> 31u32) & 1u32 }
}

// Zero test for F₂¹²⁸.
//
// Returns the bitwise OR of all four words: the result is 0u32 exactly when
// `a` is the zero element and some non-zero value otherwise. The result is
// not normalised to a 0/1 flag; callers compare it against 0u32.
pub fn f2_128_is_zero(a: F2_128) -> U32 {
    let low_half: U32 = a.w0 | a.w1;
    let high_half: U32 = a.w2 | a.w3;
    low_half | high_half
}

// Local Graph