#[path = "common.rs"]
mod common;
use common::*;
#[link(name = "Accelerate", kind = "framework")]
extern "C" {}
/// Benchmark entry point: times `acpu` vector kernels against their Apple
/// Accelerate counterparts on 4096-element `f32` buffers.
///
/// Each case measures both implementations with `ns` and hands the two
/// timings to `Score::row`; `Score::summary` is called once at the end.
/// (`ns` and `Score` come from `common.rs`; their exact semantics are not
/// visible in this file.)
///
/// Every `acpu` call has its result or its output buffer passed through
/// `std::hint::black_box`, so the optimizer cannot treat the measured work as
/// dead. The Accelerate side is reached through foreign calls inside
/// `unsafe`, which are presumably opaque to the optimizer already.
fn main() {
    let n: usize = 4096;
    // The `i32` count is handed to the vv*/cblas_* calls and the `u64` count
    // to the vDSP_* calls; 4096 converts losslessly to both.
    let ni = n as i32;
    let nu = n as u64;

    // Strictly positive ramp in (0, 1]: input for `log` and the reductions.
    let src: Vec<f32> = (0..n).map(|i| (i as f32 + 1.0) / n as f32).collect();
    // Signed ramp from -20.0 to roughly +20.95: input for exp and activations.
    let pos: Vec<f32> = (0..n).map(|i| (i as f32) * 0.01 - 20.0).collect();
    // Two oscillating vectors used as the operands of the dot product.
    let b: Vec<f32> = (0..n).map(|i| ((i as f32) * 0.7 + 0.3).sin()).collect();
    let d: Vec<f32> = (0..n).map(|i| ((i as f32) * 1.3).cos()).collect();
    // Values in [0.1, 1.1]: used as the per-element weight in `normalize`.
    let d2: Vec<f32> = (0..n)
        .map(|i| ((i as f32) * 0.5 + 1.0).sin().abs() + 0.1)
        .collect();

    let mut score = Score::new();

    score.hdr("ELEMENTWISE (4096 f32)");
    {
        let mut dst = vec![0.0f32; n];
        let t_acpu = ns(|| {
            acpu::vector::math::exp_to(&pos, &mut dst);
            // Keep the otherwise-unread output observable.
            std::hint::black_box(&dst);
        });
        let mut out = vec![0.0f32; n];
        // SAFETY: `out` and `pos` are live buffers of `n` elements and the
        // count passed by pointer equals `n`.
        let t_apple = ns(|| unsafe { vvexpf(out.as_mut_ptr(), pos.as_ptr(), &ni) });
        score.row("exp", t_acpu, t_apple);
    }
    {
        let mut dst = vec![0.0f32; n];
        let t_acpu = ns(|| {
            acpu::vector::math::log_to(&src, &mut dst);
            // Keep the otherwise-unread output observable.
            std::hint::black_box(&dst);
        });
        let mut out = vec![0.0f32; n];
        // SAFETY: `out` and `src` are live buffers of `n` elements and the
        // count passed by pointer equals `n`.
        let t_apple = ns(|| unsafe { vvlogf(out.as_mut_ptr(), src.as_ptr(), &ni) });
        score.row("log", t_acpu, t_apple);
    }
    {
        let mut buf = pos.clone();
        let t_acpu = ns(|| {
            // The kernel works in place, so `buf` is refilled on every call.
            // Note that this copy is inside the timed region, whereas the
            // Accelerate side below performs no such copy.
            buf.copy_from_slice(&pos);
            acpu::vector::math::tanh(&mut buf);
            std::hint::black_box(&buf);
        });
        let mut out = vec![0.0f32; n];
        // SAFETY: `out` and `pos` are live buffers of `n` elements and the
        // count passed by pointer equals `n`.
        let t_apple = ns(|| unsafe { vvtanhf(out.as_mut_ptr(), pos.as_ptr(), &ni) });
        score.row("tanh", t_acpu, t_apple);
    }
    {
        let mut buf = pos.clone();
        let t_acpu = ns(|| {
            // Refill for the in-place kernel; the copy is part of the timing.
            buf.copy_from_slice(&pos);
            acpu::vector::math::sigmoid(&mut buf);
            std::hint::black_box(&buf);
        });
        let mut neg = vec![0.0f32; n];
        let mut expn = vec![0.0f32; n];
        let one = 1.0f32;
        let mut out = vec![0.0f32; n];
        // Accelerate has no single call used here for sigmoid, so it is
        // composed as 1 / (1 + exp(-x)) from four primitives.
        // SAFETY: every pointer refers to a live buffer of `n` elements (or to
        // the scalar `one`), all strides are 1 and both counts equal `n`.
        let t_apple = ns(|| unsafe {
            vDSP_vneg(pos.as_ptr(), 1, neg.as_mut_ptr(), 1, nu);
            vvexpf(expn.as_mut_ptr(), neg.as_ptr(), &ni);
            vDSP_vsadd(expn.as_ptr(), 1, &one, expn.as_mut_ptr(), 1, nu);
            vDSP_svdiv(&one, expn.as_ptr(), 1, out.as_mut_ptr(), 1, nu);
        });
        score.row("sigmoid", t_acpu, t_apple);
    }
    {
        let mut buf = pos.clone();
        let t_acpu = ns(|| {
            // Refill for the in-place kernel; the copy is part of the timing.
            buf.copy_from_slice(&pos);
            acpu::vector::math::gelu(&mut buf);
            std::hint::black_box(&buf);
        });
        let mut tmp = vec![0.0f32; n];
        let mut out = vec![0.0f32; n];
        // Constants of the composition below:
        // 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x^3))).
        let half = 0.5f32;
        let one = 1.0f32;
        let cubic_coeff = 0.044715f32;
        let tanh_scale = 0.7978845608f32;
        // SAFETY: every pointer refers to a live buffer of `n` elements (or to
        // a scalar local), all strides are 1 and both counts equal `n`.
        // NOTE(review): several calls use the same buffer as input and output;
        // this relies on vDSP permitting in-place operation - confirm.
        let t_apple = ns(|| unsafe {
            // tmp = x^3
            vDSP_vmul(pos.as_ptr(), 1, pos.as_ptr(), 1, tmp.as_mut_ptr(), 1, nu);
            vDSP_vmul(tmp.as_ptr(), 1, pos.as_ptr(), 1, tmp.as_mut_ptr(), 1, nu);
            // tmp = tanh_scale * (x + cubic_coeff * x^3)
            vDSP_vsmul(tmp.as_ptr(), 1, &cubic_coeff, tmp.as_mut_ptr(), 1, nu);
            vDSP_vadd(pos.as_ptr(), 1, tmp.as_ptr(), 1, tmp.as_mut_ptr(), 1, nu);
            vDSP_vsmul(tmp.as_ptr(), 1, &tanh_scale, tmp.as_mut_ptr(), 1, nu);
            // out = 1 + tanh(tmp)
            vvtanhf(out.as_mut_ptr(), tmp.as_ptr(), &ni);
            vDSP_vsadd(out.as_ptr(), 1, &one, out.as_mut_ptr(), 1, nu);
            // out = (0.5 * x) * out
            vDSP_vsmul(pos.as_ptr(), 1, &half, tmp.as_mut_ptr(), 1, nu);
            vDSP_vmul(tmp.as_ptr(), 1, out.as_ptr(), 1, out.as_mut_ptr(), 1, nu);
        });
        score.row("gelu", t_acpu, t_apple);
    }
    {
        let mut buf = pos.clone();
        let t_acpu = ns(|| {
            // Refill for the in-place kernel; the copy is part of the timing.
            buf.copy_from_slice(&pos);
            acpu::vector::math::silu(&mut buf);
            std::hint::black_box(&buf);
        });
        let mut neg = vec![0.0f32; n];
        let mut expn = vec![0.0f32; n];
        let one = 1.0f32;
        let mut sig = vec![0.0f32; n];
        let mut out = vec![0.0f32; n];
        // Composed as x * sigmoid(x): the sigmoid sequence plus one multiply.
        // SAFETY: every pointer refers to a live buffer of `n` elements (or to
        // the scalar `one`), all strides are 1 and both counts equal `n`.
        let t_apple = ns(|| unsafe {
            vDSP_vneg(pos.as_ptr(), 1, neg.as_mut_ptr(), 1, nu);
            vvexpf(expn.as_mut_ptr(), neg.as_ptr(), &ni);
            vDSP_vsadd(expn.as_ptr(), 1, &one, expn.as_mut_ptr(), 1, nu);
            vDSP_svdiv(&one, expn.as_ptr(), 1, sig.as_mut_ptr(), 1, nu);
            vDSP_vmul(pos.as_ptr(), 1, sig.as_ptr(), 1, out.as_mut_ptr(), 1, nu);
        });
        score.row("silu", t_acpu, t_apple);
    }

    score.hdr("REDUCTIONS (4096 f32)");
    {
        let t_acpu = ns(|| {
            std::hint::black_box(acpu::vector::reduce::sum(&src));
        });
        let mut result = 0.0f32;
        // SAFETY: `src` is a live buffer of `n` elements read with stride 1,
        // and `result` is a valid destination for the single output value.
        let t_apple = ns(|| unsafe {
            vDSP_sve(src.as_ptr(), 1, &mut result, nu);
            std::hint::black_box(result);
        });
        score.row("sum", t_acpu, t_apple);
    }
    {
        let t_acpu = ns(|| {
            std::hint::black_box(acpu::vector::reduce::dot(&b, &d));
        });
        // SAFETY: `b` and `d` are live buffers of `n` elements read with
        // stride 1, and the count equals `n`.
        let t_apple = ns(|| unsafe {
            std::hint::black_box(cblas_sdot(ni, b.as_ptr(), 1, d.as_ptr(), 1));
        });
        score.row("dot", t_acpu, t_apple);
    }
    {
        let t_acpu = ns(|| {
            std::hint::black_box(acpu::vector::reduce::length(&src));
        });
        // SAFETY: `src` is a live buffer of `n` elements read with stride 1,
        // and the count equals `n`.
        let t_apple = ns(|| unsafe {
            std::hint::black_box(cblas_snrm2(ni, src.as_ptr(), 1));
        });
        score.row("length", t_acpu, t_apple);
    }
    {
        let t_acpu = ns(|| {
            std::hint::black_box(acpu::vector::reduce::max(&src));
        });
        let mut result = 0.0f32;
        // SAFETY: `src` is a live buffer of `n` elements read with stride 1,
        // and `result` is a valid destination for the single output value.
        let t_apple = ns(|| unsafe {
            vDSP_maxv(src.as_ptr(), 1, &mut result, nu);
            std::hint::black_box(result);
        });
        score.row("max", t_acpu, t_apple);
    }
    {
        let t_acpu = ns(|| {
            std::hint::black_box(acpu::vector::reduce::min(&src));
        });
        let mut result = 0.0f32;
        // SAFETY: `src` is a live buffer of `n` elements read with stride 1,
        // and `result` is a valid destination for the single output value.
        let t_apple = ns(|| unsafe {
            vDSP_minv(src.as_ptr(), 1, &mut result, nu);
            std::hint::black_box(result);
        });
        score.row("min", t_acpu, t_apple);
    }

    score.hdr("COMPOUND (4096 f32)");
    {
        let mut buf = pos.clone();
        let t_acpu = ns(|| {
            // Refill for the in-place kernel; the copy is part of the timing.
            buf.copy_from_slice(&pos);
            acpu::vector::softmax::softmax(&mut buf);
            std::hint::black_box(&buf);
        });
        let mut out = vec![0.0f32; n];
        let mut mx = 0.0f32;
        let mut s = 0.0f32;
        // Max-subtracted softmax: exp(x - max(x)) / sum(exp(x - max(x))).
        // SAFETY: `pos` and `out` are live buffers of `n` elements, `mx` and
        // `s` are valid scalar destinations, strides are 1, counts equal `n`.
        // NOTE(review): `vvexpf` and `vDSP_vsdiv` are called with `out` as both
        // input and output; this relies on in-place operation being permitted
        // by Accelerate - confirm against its documentation.
        let t_apple = ns(|| unsafe {
            vDSP_maxv(pos.as_ptr(), 1, &mut mx, nu);
            let neg_mx = -mx;
            vDSP_vsadd(pos.as_ptr(), 1, &neg_mx, out.as_mut_ptr(), 1, nu);
            vvexpf(out.as_mut_ptr(), out.as_ptr(), &ni);
            vDSP_sve(out.as_ptr(), 1, &mut s, nu);
            vDSP_vsdiv(out.as_ptr(), 1, &s, out.as_mut_ptr(), 1, nu);
        });
        score.row("softmax", t_acpu, t_apple);
    }
    {
        // `d2` is only read, so it is borrowed directly as the weight vector
        // instead of being cloned.
        let mut out_acpu = vec![0.0f32; n];
        let t_acpu = ns(|| {
            acpu::vector::softmax::normalize(&mut out_acpu, &src, &d2, 1e-5);
            // Keep the otherwise-unread output observable.
            std::hint::black_box(&out_acpu);
        });
        let mut out_apple = vec![0.0f32; n];
        let mut ss = 0.0f32;
        // Composed as x * (1 / sqrt(mean(x^2) + 1e-5)) * weight.
        // SAFETY: `src`, `d2` and `out_apple` are live buffers of `n`
        // elements, `ss` is a valid scalar destination, strides are 1 and the
        // count equals `n`.
        let t_apple = ns(|| unsafe {
            vDSP_svesq(src.as_ptr(), 1, &mut ss, nu);
            let inv_rms = 1.0 / (ss / n as f32 + 1e-5f32).sqrt();
            vDSP_vsmul(src.as_ptr(), 1, &inv_rms, out_apple.as_mut_ptr(), 1, nu);
            vDSP_vmul(
                out_apple.as_ptr(),
                1,
                d2.as_ptr(),
                1,
                out_apple.as_mut_ptr(),
                1,
                nu,
            );
        });
        score.row("normalize", t_acpu, t_apple);
    }

    score.summary();
}