// Packed F₂ operations — 128 elements in 4 × U32
//
// SIMD-style batch operations where hardware XOR/AND/popcount
// operate on 128 binary field elements simultaneously.
//
// Mirrors rs/packed.rs but uses U32 words (Trident has no u128).

module kuro.packed

/// 128 binary field elements packed into four U32 words.
/// Element i lives in word i / 32 at bit i % 32 (least-significant bit first),
/// matching the indexing used by packed_get_bit.
pub struct Packed128 {
    w0: U32,
    w1: U32,
    w2: U32,
    w3: U32,
}

/// The all-zero vector: every one of the 128 elements is 0.
pub fn packed_zero() -> Packed128 {
    let none: U32 = 0u32;
    Packed128 {
        w0: none,
        w1: none,
        w2: none,
        w3: none,
    }
}

/// The all-one vector: every one of the 128 elements is 1.
pub fn packed_ones() -> Packed128 {
    let full: U32 = 0xFFFFFFFFu32;
    Packed128 {
        w0: full,
        w1: full,
        w2: full,
        w3: full,
    }
}

/// Element-wise addition of two packed vectors.
/// Addition of single-bit elements is XOR, so each word is XORed with its
/// counterpart, covering all 128 elements in four operations.
pub fn packed_add(a: Packed128, b: Packed128) -> Packed128 {
    let s0: U32 = a.w0 ^ b.w0;
    let s1: U32 = a.w1 ^ b.w1;
    let s2: U32 = a.w2 ^ b.w2;
    let s3: U32 = a.w3 ^ b.w3;
    Packed128 { w0: s0, w1: s1, w2: s2, w3: s3 }
}

/// Element-wise multiplication of two packed vectors.
/// Multiplication of single-bit elements is AND, so each word is ANDed with
/// its counterpart, covering all 128 elements in four operations.
pub fn packed_mul(a: Packed128, b: Packed128) -> Packed128 {
    let p0: U32 = a.w0 & b.w0;
    let p1: U32 = a.w1 & b.w1;
    let p2: U32 = a.w2 & b.w2;
    let p3: U32 = a.w3 & b.w3;
    Packed128 { w0: p0, w1: p1, w2: p2, w3: p3 }
}

/// Element-wise complement of a packed vector.
/// Each word is XORed with an all-ones mask, flipping all 128 elements.
pub fn packed_not(a: Packed128) -> Packed128 {
    let flip: U32 = 0xFFFFFFFFu32;
    let n0: U32 = a.w0 ^ flip;
    let n1: U32 = a.w1 ^ flip;
    let n2: U32 = a.w2 ^ flip;
    let n3: U32 = a.w3 ^ flip;
    Packed128 { w0: n0, w1: n1, w2: n2, w3: n3 }
}

/// Number of elements equal to 1 in the packed vector.
/// Computed as the sum of the per-word population counts.
pub fn packed_popcount(a: Packed128) -> U32 {
    let c0: U32 = u32.popcount(a.w0);
    let c1: U32 = u32.popcount(a.w1);
    let c2: U32 = u32.popcount(a.w2);
    let c3: U32 = u32.popcount(a.w3);
    c0 + c1 + c2 + c3
}

/// Inner product of two packed vectors, returned as popcount(a AND b).
/// This is the kernel of a binary matrix-vector multiply.
pub fn packed_inner_product(a: Packed128, b: Packed128) -> U32 {
    // Positions where both vectors hold a 1.
    let both: Packed128 = packed_mul(a, b);
    packed_popcount(both)
}

/// Hamming distance: the number of positions at which a and b differ.
pub fn packed_hamming_distance(a: Packed128, b: Packed128) -> U32 {
    // XOR leaves a 1 exactly where the two vectors disagree.
    let diff: Packed128 = packed_add(a, b);
    packed_popcount(diff)
}

/// Get a single bit at position i (0..127).
/// i < 32 => w0, 32..63 => w1, 64..95 => w2, 96..127 => w3.
///
/// Returns 0 or 1. The word is chosen without branching: a one-hot selector
/// is derived from the word index, and each word's candidate bit is ANDed
/// with its selector so that only the addressed word contributes.
///
/// Precondition: i < 128. Callers must not rely on the result for
/// out-of-range positions.
pub fn packed_get_bit(a: Packed128, i: U32) -> U32 {
    let word_idx: U32 = i >> 5u32;     // i / 32
    let bit_idx: U32 = i & 31u32;      // i % 32
    // One-hot selectors: for word_idx in 0..3, (2^k >> word_idx) & 1 is 1
    // exactly when word_idx == k and 0 otherwise.
    let sel0: U32 = (1u32 >> word_idx) & 1u32;
    let sel1: U32 = (2u32 >> word_idx) & 1u32;
    let sel2: U32 = (4u32 >> word_idx) & 1u32;
    let sel3: U32 = (8u32 >> word_idx) & 1u32;
    // Each selector is 0 or 1, so the AND both isolates the target bit and
    // zeroes the contribution of every word that is not selected.
    let b0: U32 = (a.w0 >> bit_idx) & sel0;
    let b1: U32 = (a.w1 >> bit_idx) & sel1;
    let b2: U32 = (a.w2 >> bit_idx) & sel2;
    let b3: U32 = (a.w3 >> bit_idx) & sel3;
    // At most one term is non-zero, so XOR merges them without carries.
    b0 ^ b1 ^ b2 ^ b3
}

Local Graph