use std::time::Instant;
/// Returns the median of `t`, sorting the slice in place as a side effect.
///
/// For an even number of elements the upper of the two middle values is
/// returned, i.e. the element at index `len / 2` of the sorted slice.
///
/// # Panics
///
/// Panics if `t` is empty, since an empty sample set has no median.
fn median(t: &mut [u64]) -> u64 {
    assert!(!t.is_empty(), "median of an empty slice is undefined");
    // Equal `u64` values are indistinguishable, so the unstable sort yields
    // the same ordering as the stable one without its scratch allocation.
    t.sort_unstable();
    t[t.len() / 2]
}
/// Builds `len` values following the repeating pattern `(i % modulus) * 0.1`.
fn ramp(len: usize, modulus: usize) -> Vec<f32> {
    (0..len).map(|i| (i % modulus) as f32 * 0.1).collect()
}

/// Converts the time of one n×n×k matrix product into GFLOPS.
///
/// The product is counted as `2 * n * n * k` floating-point operations, and
/// operations per nanosecond equal GFLOPS.
fn gflops(n: usize, k: usize, ns: u64) -> f64 {
    2.0 * n as f64 * n as f64 * k as f64 / ns as f64
}

/// Returns the median time, in nanoseconds, of one `acpu::matmul_f32` call
/// made with dimension arguments `(n, n, k)`, taken over `samples` timed runs.
///
/// Inputs are filled with fixed repeating patterns, one untimed warm-up call
/// is made first, and the output buffer is zeroed outside the timed region
/// before every sample.
fn bench_sgemm(n: usize, k: usize, samples: usize) -> u64 {
    let a = ramp(n * k, 7);
    let b = ramp(k * n, 11);
    let mut c = vec![0.0f32; n * n];

    // Untimed warm-up call before sampling starts.
    acpu::matmul_f32(&a, &b, &mut c, n, n, k);

    let mut times: Vec<u64> = (0..samples)
        .map(|_| {
            c.fill(0.0);
            let start = Instant::now();
            acpu::matmul_f32(&a, &b, &mut c, n, n, k);
            // The product is never read otherwise; keep the optimizer from
            // discarding the work being timed.
            std::hint::black_box(&c);
            // Saturate rather than silently truncate the u128 nanosecond count.
            u64::try_from(start.elapsed().as_nanos()).unwrap_or(u64::MAX)
        })
        .collect();
    median(&mut times)
}

/// Benchmarks `acpu::matmul_f32` on 64×64 outputs and reports to stderr:
/// first the full 64×64×64 product, then a sweep over the inner dimension
/// `k` in {8, 16, 32, 64} with the cost per k-step.
fn main() {
    let n = 64usize;
    let k = 64usize;
    let iters = 500;

    // Compute the median once and reuse it for both the time and the rate.
    let ns = bench_sgemm(n, k, iters);
    eprintln!(
        "full sgemm {n}×{n}: {ns:>5} ns {gf:.1} GFLOPS",
        n = n,
        ns = ns,
        gf = gflops(n, k, ns)
    );

    for &kk in &[8usize, 16, 32, 64] {
        let ns = bench_sgemm(n, kk, iters);
        eprintln!(
            "sgemm {n}×{n}×k={k:<3}: {ns:>5} ns {gf:.1} GFLOPS ({step:.1} ns/k-step)",
            n = n,
            k = kk,
            ns = ns,
            gf = gflops(n, kk, ns),
            step = ns as f64 / kk as f64
        );
    }
}