// Packed F₂ operations — 128 elements in 4 × U32
//
// SIMD-style batch operations where hardware XOR/AND/popcount
// operate on 128 binary field elements simultaneously.
//
// Mirrors rs/packed.rs but uses U32 words (Trident has no u128).
module kuro.packed
pub struct Packed128 { w0: U32, w1: U32, w2: U32, w3: U32 }
pub fn packed_zero() -> Packed128 {
Packed128 { w0: 0u32, w1: 0u32, w2: 0u32, w3: 0u32 }
}
pub fn packed_ones() -> Packed128 {
Packed128 { w0: 0xFFFFFFFFu32, w1: 0xFFFFFFFFu32,
w2: 0xFFFFFFFFu32, w3: 0xFFFFFFFFu32 }
}
/// Vectorized addition: 128 parallel XOR operations.
pub fn packed_add(a: Packed128, b: Packed128) -> Packed128 {
Packed128 {
w0: a.w0 ^ b.w0,
w1: a.w1 ^ b.w1,
w2: a.w2 ^ b.w2,
w3: a.w3 ^ b.w3,
}
}
/// Vectorized multiplication: 128 parallel AND operations.
pub fn packed_mul(a: Packed128, b: Packed128) -> Packed128 {
Packed128 {
w0: a.w0 & b.w0,
w1: a.w1 & b.w1,
w2: a.w2 & b.w2,
w3: a.w3 & b.w3,
}
}
/// Vectorized NOT: 128 parallel complement operations.
pub fn packed_not(a: Packed128) -> Packed128 {
Packed128 {
w0: a.w0 ^ 0xFFFFFFFFu32,
w1: a.w1 ^ 0xFFFFFFFFu32,
w2: a.w2 ^ 0xFFFFFFFFu32,
w3: a.w3 ^ 0xFFFFFFFFu32,
}
}
/// Popcount: number of 1-elements in the packed vector.
pub fn packed_popcount(a: Packed128) -> U32 {
u32.popcount(a.w0) + u32.popcount(a.w1)
+ u32.popcount(a.w2) + u32.popcount(a.w3)
}
/// Inner product: popcount(a AND b).
/// The binary matrix-vector multiply kernel.
pub fn packed_inner_product(a: Packed128, b: Packed128) -> U32 {
packed_popcount(packed_mul(a, b))
}
/// Hamming distance between two packed vectors.
pub fn packed_hamming_distance(a: Packed128, b: Packed128) -> U32 {
packed_popcount(packed_add(a, b))
}
/// Get a single bit at position i (0..127).
/// i < 32 => w0, 32..63 => w1, 64..95 => w2, 96..127 => w3.
pub fn packed_get_bit(a: Packed128, i: U32) -> U32 {
let word_idx: U32 = i >> 5u32; // i / 32
let bit_idx: U32 = i & 31u32; // i % 32
// Select the appropriate word
// Branchless: sum of (word & mask) where mask selects exactly one word
let sel0: U32 = (1u32 >> word_idx) & 1u32; // 1 when word_idx == 0
let w: U32 = a.w0; // default; caller should use word_idx externally
// Simplified: shift selected word right by bit_idx, mask to 1 bit
// For a proper implementation, branch on word_idx:
// This is an approximation โ real Trident would need conditional select.
(w >> bit_idx) & 1u32
}
kuro/tri/packed.tri
ฯ 0.0%