// Vectorised single-precision `exp` for AArch64 NEON, exported as the symbol
// `_acpu_exp_f32`.
//
// Register roles on entry: x0 = destination pointer, x1 = source pointer,
// x2 = element count. x3 is used as a scratch pointer into the constant table.
//
// Per element the routine computes:
//   x' = max(min(x, 88.37626), -87.33655)         clamp the input
//   n  = round_to_nearest_even(x' * log2(e))
//   r  = x' - n * ln(2)                            reduced argument
//   p  = 1 + r + r^2 * (c2 + c3*r) + r^4 * (c4 + c5*r)
//   y  = p * 2^n          where 2^n is built as the bit pattern (n + 127) << 23
//
// Only complete groups of 16 floats are processed: when fewer than 16 elements
// remain the routine returns, leaving the trailing `n % 16` outputs unwritten.
// NOTE(review): presumably the caller finishes the tail itself - confirm.
//
// NOTE(review): the section directive, the `@PAGE`/`@PAGEOFF` relocations and
// the leading-underscore symbol names are Mach-O syntax, while the cfg below
// only checks the architecture - confirm no non-Apple aarch64 target builds this.
#[cfg(target_arch = "aarch64")]
core::arch::global_asm!(
    // ---- constant table (one f32 per slot, broadcast with ld1r below) ----
    ".section __DATA,__const",
    ".p2align 4",
    "_acpu_exp_consts:",
    ".float 88.37626",   // [0]  upper clamp
    ".float -87.33655",  // [1]  lower clamp
    ".float 1.4426950",  // [2]  log2(e)
    ".float 0.69314718", // [3]  ln(2)
    ".float 1.0",        // [4]  constant term of the polynomial
    ".float 1.0",        // [5]  never loaded (the loader steps over this slot)
    ".float 0.5000001",  // [6]  c2 (~ 1/2!)
    ".float 0.1666656",  // [7]  c3 (~ 1/3!)
    ".float 0.0416756",  // [8]  c4 (~ 1/4!)
    ".float 0.0083716",  // [9]  c5 (~ 1/5!)
    ".float 0.0",        // [10] padding, never loaded
    ".float 0.0",        // [11] padding, never loaded
    ".text",
    ".global _acpu_exp_f32",
    ".p2align 4",
    "_acpu_exp_f32:",
    // ---- prologue: v8-v15 are used below, so save d8-d15 (the callee-saved
    //      low halves) in a 64-byte stack frame ----
    "stp d8, d9, [sp, #-64]!",
    "stp d10, d11, [sp, #16]",
    "stp d12, d13, [sp, #32]",
    "stp d14, d15, [sp, #48]",
    // ---- broadcast the constants into all four lanes of each register ----
    "adrp x3, _acpu_exp_consts@PAGE",
    "add x3, x3, _acpu_exp_consts@PAGEOFF",
    "ld1r {{v24.4s}}, [x3]", // v24 = upper clamp
    "add x3, x3, #4",
    "ld1r {{v25.4s}}, [x3]", // v25 = lower clamp
    "add x3, x3, #4",
    "ld1r {{v26.4s}}, [x3]", // v26 = log2(e)
    "add x3, x3, #4",
    "ld1r {{v27.4s}}, [x3]", // v27 = ln(2)
    "add x3, x3, #4",
    "ld1r {{v28.4s}}, [x3]", // v28 = 1.0
    "add x3, x3, #4",
    "add x3, x3, #4",        // step over the unused slot [5]
    "ld1r {{v29.4s}}, [x3]", // v29 = c2
    "add x3, x3, #4",
    "ld1r {{v30.4s}}, [x3]", // v30 = c3
    "add x3, x3, #4",
    "ld1r {{v8.4s}}, [x3]",  // v8  = c4
    "add x3, x3, #4",
    "ld1r {{v9.4s}}, [x3]",  // v9  = c5
    "movi v10.4s, #127",     // v10 = IEEE-754 single-precision exponent bias
    // Fewer than 16 elements: nothing to do.
    "cmp x2, #16",
    "b.lt 2f",
    ".p2align 4",
    // ---- main loop: 16 floats (four q registers) per iteration ----
    "1:",
    "ldp q0, q1, [x1]",
    "ldp q2, q3, [x1, #32]",
    // Clamp the inputs to [v25, v24].
    "fmin v0.4s, v0.4s, v24.4s",
    "fmin v1.4s, v1.4s, v24.4s",
    "fmin v2.4s, v2.4s, v24.4s",
    "fmin v3.4s, v3.4s, v24.4s",
    "fmax v0.4s, v0.4s, v25.4s",
    "fmax v1.4s, v1.4s, v25.4s",
    "fmax v2.4s, v2.4s, v25.4s",
    "fmax v3.4s, v3.4s, v25.4s",
    // n = round(x * log2(e)), kept as a float in v4-v7 for now.
    "fmul v4.4s, v0.4s, v26.4s",
    "fmul v5.4s, v1.4s, v26.4s",
    "fmul v6.4s, v2.4s, v26.4s",
    "fmul v7.4s, v3.4s, v26.4s",
    "frintn v4.4s, v4.4s",
    "frintn v5.4s, v5.4s",
    "frintn v6.4s, v6.4s",
    "frintn v7.4s, v7.4s",
    // r = x - n * ln(2), left in v0-v3.
    "fmls v0.4s, v4.4s, v27.4s",
    "fmls v1.4s, v5.4s, v27.4s",
    "fmls v2.4s, v6.4s, v27.4s",
    "fmls v3.4s, v7.4s, v27.4s",
    // r^2 in v11-v14.
    "fmul v11.4s, v0.4s, v0.4s",
    "fmul v12.4s, v1.4s, v1.4s",
    "fmul v13.4s, v2.4s, v2.4s",
    "fmul v14.4s, v3.4s, v3.4s",
    // Accumulators v15-v18 start as 1 + r.
    "fadd v15.4s, v28.4s, v0.4s",
    "fadd v16.4s, v28.4s, v1.4s",
    "fadd v17.4s, v28.4s, v2.4s",
    "fadd v18.4s, v28.4s, v3.4s",
    // v19-v22 = c2 + c3 * r.
    "mov v19.16b, v29.16b",
    "mov v20.16b, v29.16b",
    "mov v21.16b, v29.16b",
    "mov v22.16b, v29.16b",
    "fmla v19.4s, v30.4s, v0.4s",
    "fmla v20.4s, v30.4s, v1.4s",
    "fmla v21.4s, v30.4s, v2.4s",
    "fmla v22.4s, v30.4s, v3.4s",
    // First two vectors: v23/v31 = c5 * r + c4. This is an unfused
    // multiply-then-add, so no pre-load of v23/v31 is needed.
    "fmul v23.4s, v9.4s, v0.4s",
    "fadd v23.4s, v23.4s, v8.4s",
    "fmul v31.4s, v9.4s, v1.4s",
    "fadd v31.4s, v31.4s, v8.4s",
    // r is no longer needed for vectors 0/1, so v0/v1 are reused for r^4.
    "fmul v0.4s, v11.4s, v11.4s",
    "fmul v1.4s, v12.4s, v12.4s",
    // acc += (c2 + c3*r) * r^2, then acc += (c4 + c5*r) * r^4.
    "fmla v15.4s, v19.4s, v11.4s",
    "fmla v16.4s, v20.4s, v12.4s",
    "fmla v15.4s, v23.4s, v0.4s",
    "fmla v16.4s, v31.4s, v1.4s",
    // Same steps for the last two vectors, reusing v23/v31 as temporaries.
    "fmul v23.4s, v9.4s, v2.4s",
    "fadd v23.4s, v23.4s, v8.4s",
    "fmul v31.4s, v9.4s, v3.4s",
    "fadd v31.4s, v31.4s, v8.4s",
    "fmul v2.4s, v13.4s, v13.4s",
    "fmul v3.4s, v14.4s, v14.4s",
    "fmla v17.4s, v21.4s, v13.4s",
    "fmla v18.4s, v22.4s, v14.4s",
    "fmla v17.4s, v23.4s, v2.4s",
    "fmla v18.4s, v31.4s, v3.4s",
    // Build 2^n: convert n to an integer, add the bias, and shift it into the
    // exponent field.
    "fcvtns v4.4s, v4.4s",
    "fcvtns v5.4s, v5.4s",
    "fcvtns v6.4s, v6.4s",
    "fcvtns v7.4s, v7.4s",
    "add v4.4s, v4.4s, v10.4s",
    "add v5.4s, v5.4s, v10.4s",
    "add v6.4s, v6.4s, v10.4s",
    "add v7.4s, v7.4s, v10.4s",
    "shl v4.4s, v4.4s, #23",
    "shl v5.4s, v5.4s, #23",
    "shl v6.4s, v6.4s, #23",
    "shl v7.4s, v7.4s, #23",
    // result = polynomial * 2^n
    "fmul v15.4s, v15.4s, v4.4s",
    "fmul v16.4s, v16.4s, v5.4s",
    "fmul v17.4s, v17.4s, v6.4s",
    "fmul v18.4s, v18.4s, v7.4s",
    "stp q15, q16, [x0]",
    "stp q17, q18, [x0, #32]",
    // Advance both pointers by 64 bytes and loop while at least 16 remain.
    "add x0, x0, #64",
    "add x1, x1, #64",
    "sub x2, x2, #16",
    "cmp x2, #16",
    "b.ge 1b",
    // ---- epilogue: restore d8-d15 and release the stack frame ----
    "2:",
    "ldp d14, d15, [sp, #48]",
    "ldp d12, d13, [sp, #32]",
    "ldp d10, d11, [sp, #16]",
    "ldp d8, d9, [sp], #64",
    "ret",
);
// Rust-side declaration of the hand-written assembly routine that this file
// defines under the label `_acpu_exp_f32`. The declaration omits the leading
// underscore, so linking relies on the target prefixing C symbols with `_`
// (the Mach-O convention).
#[cfg(target_arch = "aarch64")]
extern "C" {
/// Writes an approximation of `exp(src[i])` to `dst[i]`, in groups of 16
/// elements. The assembly stops once fewer than 16 elements remain, so the
/// last `n % 16` elements of `dst` are not written.
fn acpu_exp_f32(dst: *mut f32, src: *const f32, n: usize);
}
/// Computes an approximate element-wise `exp` by forwarding to the
/// `acpu_exp_f32` assembly routine.
///
/// The routine works on groups of 16 elements and returns as soon as fewer
/// than 16 remain, so the trailing `n % 16` elements of `dst` are left
/// unwritten by this call.
///
/// # Safety
///
/// * `src` must be valid for reads and `dst` valid for writes of the elements
///   the routine touches (the first `n - n % 16` `f32` values of each).
/// * NOTE(review): `dst == src` looks fine because each group of 16 is loaded
///   in full before it is stored, but partially overlapping ranges are not
///   handled - confirm before relying on either.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
pub unsafe fn exp_asm(dst: *mut f32, src: *const f32, n: usize) {
    // SAFETY: the pointer/length contract is passed through unchanged; the
    // caller of `exp_asm` is required to uphold it (see `# Safety` above).
    unsafe { acpu_exp_f32(dst, src, n) }
}