// Two hand-assembled AArch64 routines, emitted only when compiling for
// `aarch64`. They are exported with a leading underscore, which matches how
// Apple targets mangle C symbols, so the Rust-side declarations
// `acpu_kern_32x32_first` / `acpu_kern_32x32_acc` presumably only resolve on
// Apple platforms (TODO confirm the supported targets).
//
// Register use shared by both routines (x0..x3 carry the four C arguments):
//   x0      = a_pair, advanced by 128 bytes per iteration
//   x1      = b_ptr, advanced by x3 (`bs`) bytes per iteration
//   x2      = k, the iteration count; zero returns immediately
//   x3      = bs
//   x12     = constant with only bit 62 set, OR-ed into both addresses
//   x13/x14 = x1 / x0 with bit 62 set; operands of the first two `.word`s
//   w8..w11 = four 32-bit operand words used by the last four `.word`s
//
// NOTE(review): the `.word` values are raw instruction encodings the assembler
// has no mnemonic for. They look like Apple AMX operations (0x0020100D ~ load X
// from x13, 0x0020102E ~ load Y from x14, 0x00201188..8B ~ fma32 with operands
// x8..x11) -- confirm against the AMX reference the project relies on. If so,
// AMX presumably has to be enabled by the caller before either routine runs.
#[cfg(target_arch = "aarch64")]
core::arch::global_asm!(
// ---- _acpu_kern_32x32_first ------------------------------------------------
".global _acpu_kern_32x32_first",
// Align the entry point to 16 bytes.
".p2align 4",
"_acpu_kern_32x32_first:",
// k == 0: nothing to do, return without touching either pointer.
"cbz x2, 2f",
"mov x12, #0x4000000000000000",
// Operand words for the FIRST iteration only: w8 = 0x08000000, w9 = 0x08110000,
// w10 = 0x08200040, w11 = 0x08310040. These equal the steady-state values
// loaded further down with bit 27 additionally set.
// NOTE(review): bit 27 presumably makes the operation overwrite its
// destination instead of accumulating into it, which would explain the
// `_first` name -- confirm.
"mov w8, #0x8000000", "mov w9, #0x8110000", "movz w10, #0x0040", "movk w10, #0x0820, lsl #16",
"movz w11, #0x0040", "movk w11, #0x0831, lsl #16",
// Tag both addresses with bit 62 before handing them to the raw instructions.
"orr x13, x1, x12",
"orr x14, x0, x12",
// Peeled first iteration: two instructions taking x13 / x14, then four taking
// w8..w11. The trailing `mov w8, #0` begins reloading the operand words with
// bit 27 cleared for all remaining iterations.
".word 0x0020100D", ".word 0x0020102E", ".word 0x00201188", ".word 0x00201189", ".word 0x0020118A", ".word 0x0020118B", "mov w8, #0",
// Steady-state operand words: w9 = 0x00110000, w10 = 0x00200040,
// w11 = 0x00310040.
"mov w9, #0x110000",
"movz w10, #0x0040",
"movk w10, #0x0020, lsl #16",
"movz w11, #0x0040",
"movk w11, #0x0031, lsl #16",
// Account for the iteration just executed and step both input pointers.
"sub x2, x2, #1",
"add x0, x0, #128",
"add x1, x1, x3",
// k was exactly 1: done.
"cbz x2, 2f",
".p2align 4",
// Main loop: runs the remaining k - 1 iterations.
"1:",
"orr x13, x1, x12",
"orr x14, x0, x12",
// Same six raw instructions as the peeled iteration; the trailing `add`
// advances a_pair by 128 bytes.
".word 0x0020100D", ".word 0x0020102E", ".word 0x00201188", ".word 0x00201189", ".word 0x0020118A", ".word 0x0020118B", "add x0, x0, #128",
"add x1, x1, x3",
// Decrement the counter and loop while it is non-zero (clobbers NZCV).
"subs x2, x2, #1",
"b.ne 1b",
"2:",
"ret",
// ---- _acpu_kern_32x32_acc --------------------------------------------------
// Same loop as above but without the peeled first iteration: every iteration
// uses the operand words that have bit 27 clear.
".global _acpu_kern_32x32_acc",
".p2align 4",
"_acpu_kern_32x32_acc:",
// k == 0: return immediately.
"cbz x2, 2f",
"mov x12, #0x4000000000000000",
// Operand words: w8 = 0, w9 = 0x00110000, w10 = 0x00200040, w11 = 0x00310040.
"mov w8, #0",
"mov w9, #0x110000",
"movz w10, #0x0040",
"movk w10, #0x0020, lsl #16",
"movz w11, #0x0040",
"movk w11, #0x0031, lsl #16",
".p2align 4",
// Main loop: runs k iterations.
"1:",
// Tag both addresses with bit 62.
"orr x13, x1, x12",
"orr x14, x0, x12",
// Raw instructions taking x13 and x14 as operands.
".word 0x0020100D",
".word 0x0020102E",
// Raw instructions taking w8, w9, w10 and w11 as operands, in that order.
".word 0x00201188",
".word 0x00201189",
".word 0x0020118A",
".word 0x0020118B",
// Step a_pair by 128 bytes and b_ptr by `bs` bytes.
"add x0, x0, #128",
"add x1, x1, x3",
// Decrement the counter and loop while it is non-zero (clobbers NZCV).
"subs x2, x2, #1",
"b.ne 1b",
"2:",
"ret",
);
// Foreign declarations for the two routines implemented in the `global_asm!`
// block of this file. Under the C calling convention the four arguments arrive
// in x0..x3, which is how the assembly reads them: `a_pair` is stepped by 128
// bytes and `b_ptr` by `bs` bytes on each of the `k` iterations.
// NOTE(review): unlike the assembly, these declarations are not gated on
// `target_arch = "aarch64"`, so on other targets the symbols have no
// definition -- confirm that callers are gated accordingly.
unsafe extern "C" {
// Variant whose first iteration uses a different set of operand words from
// the remaining `k - 1` iterations.
fn acpu_kern_32x32_first(a_pair: *const u8, b_ptr: *const u8, k: usize, bs: usize);
// Variant that uses the same operand words on every one of the `k` iterations.
fn acpu_kern_32x32_acc(a_pair: *const u8, b_ptr: *const u8, k: usize, bs: usize);
}
/// Invokes the hand-written `acpu_kern_32x32_first` assembly routine.
///
/// All four arguments are forwarded unchanged. The routine performs `k`
/// iterations; after each one it advances `a_pair` by 128 bytes and `b_ptr`
/// by `bs` bytes. Its first iteration uses a different set of operand words
/// from the remaining ones. When `k` is zero it returns immediately without
/// using either pointer.
///
/// # Safety
///
/// The caller must uphold the contract of the assembly routine:
///
/// * `a_pair` and `b_ptr` must be valid for every address the routine reads
///   while stepping as described above. Each step presumably reads 128 bytes
///   from each pointer; confirm against the kernel's packing code.
/// * The routine is only defined for `aarch64` builds.
/// * NOTE(review): the routine executes raw-encoded instructions that look
///   like Apple AMX operations. If so, the calling thread presumably must
///   have AMX enabled and owns the resulting accumulator state; confirm.
#[inline]
pub unsafe fn kern_32x32_first(a_pair: *const u8, b_ptr: *const u8, k: usize, bs: usize) {
    // SAFETY: the arguments are passed through untouched, and this function's
    // own safety contract is exactly the callee's, so the caller has already
    // promised everything the callee requires.
    unsafe {
        acpu_kern_32x32_first(a_pair, b_ptr, k, bs);
    }
}
/// Invokes the hand-written `acpu_kern_32x32_acc` assembly routine.
///
/// All four arguments are forwarded unchanged. The routine performs `k`
/// iterations, each using the same operand words; after each one it advances
/// `a_pair` by 128 bytes and `b_ptr` by `bs` bytes. When `k` is zero it
/// returns immediately without using either pointer.
///
/// # Safety
///
/// The caller must uphold the contract of the assembly routine:
///
/// * `a_pair` and `b_ptr` must be valid for every address the routine reads
///   while stepping as described above. Each step presumably reads 128 bytes
///   from each pointer; confirm against the kernel's packing code.
/// * The routine is only defined for `aarch64` builds.
/// * NOTE(review): the routine executes raw-encoded instructions that look
///   like Apple AMX operations. If so, the calling thread presumably must
///   have AMX enabled and the accumulator state must already hold the values
///   to be accumulated into; confirm.
#[inline]
pub unsafe fn kern_32x32_acc(a_pair: *const u8, b_ptr: *const u8, k: usize, bs: usize) {
    // SAFETY: the arguments are passed through untouched, and this function's
    // own safety contract is exactly the callee's, so the caller has already
    // promised everything the callee requires.
    unsafe {
        acpu_kern_32x32_acc(a_pair, b_ptr, k, bs);
    }
}