This commit is contained in:
Adrien Gallouët 2024-12-07 04:43:21 +08:00 committed by GitHub
commit 055154ad3b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1090,466 +1090,63 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
size_t res_stride = bs * sizeof(float);
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy;
for (int r = 0; r < nr; r += 4) {
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
const block_q8_0x4 * a_base = a_ptr;
for (int c = 0; c < nc; c += ncols_interleaved) {
a_ptr = a_base;
__asm__ __volatile__(
"mov x10, %x[nr]\n"
"mov x9, #0x88\n"
"cmp x10, #0x10\n"
"mul x9, %x[nb], x9\n"
"blt 4f\n"
"1:" // Row loop
"add x28, %x[b_ptr], #0x8\n"
"mov x27, %x[nc]\n"
"add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
"2:" // Column loop
"add x25, %x[a_ptr], #0x8\n"
"movi v15.16b, #0x0\n"
"movi v19.16b, #0x0\n"
"mov x24, %x[nb]\n"
"add x23, x25, x9\n"
"movi v18.16b, #0x0\n"
"movi v14.16b, #0x0\n"
"add x22, x23, x9\n"
"movi v11.16b, #0x0\n"
"movi v13.16b, #0x0\n"
"add x21, x22, x9\n"
"movi v23.16b, #0x0\n"
"movi v16.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v7.16b, #0x0\n"
"movi v0.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v5.16b, #0x0\n"
"movi v21.16b, #0x0\n"
"movi v8.16b, #0x0\n"
"movi v1.16b, #0x0\n"
"3:" // Block loop
"ldr q3, [x28, #0x0]\n"
"ldr q31, [x25, #0x0]\n"
"movi v28.16b, #0x4\n"
"movi v10.4s, #0x0\n"
"ldr q22, [x28, #0x10]\n"
"ldr q6, [x25, #0x10]\n"
"movi v29.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"ldr q27, [x28, #0x20]\n"
"ldr q30, [x28, #0x30]\n"
"movi v20.4s, #0x0\n"
"movi v24.16b, #0xf0\n"
"ldr d2, [x25, #-0x8]\n"
"ldr d26, [x23, #-0x8]\n"
"sshl v12.16b, v3.16b, v28.16b\n"
"sub x20, x28, #0x8\n"
"ldr d17, [x20, #0x0]\n"
"and v3.16b, v3.16b, v24.16b\n"
"subs x24, x24, #0x1\n"
"add x28, x28, #0x48\n"
".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
"sshl v31.16b, v22.16b, v28.16b\n"
"and v22.16b, v22.16b, v24.16b\n"
"fcvtl v17.4s, v17.4h\n"
"fcvtl v2.4s, v2.4h\n"
"fcvtl v26.4s, v26.4h\n"
".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
"sshl v6.16b, v27.16b, v28.16b\n"
"sshl v28.16b, v30.16b, v28.16b\n"
"and v27.16b, v27.16b, v24.16b\n"
"and v30.16b, v30.16b, v24.16b\n"
"ldr q24, [x25, #0x20]\n"
".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
"ldr q24, [x25, #0x30]\n"
".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
"ldr q24, [x25, #0x40]\n"
".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
"ldr q24, [x25, #0x50]\n"
".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
"ldr q24, [x25, #0x60]\n"
".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
"ldr q24, [x25, #0x70]\n"
"add x25, x25, #0x88\n"
".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
"fmul v24.4s, v17.4s, v2.s[0]\n"
"scvtf v10.4s, v10.4s, #0x4\n"
"scvtf v29.4s, v29.4s, #0x4\n"
"scvtf v9.4s, v9.4s, #0x4\n"
"scvtf v20.4s, v20.4s, #0x4\n"
"fmla v15.4s, v10.4s, v24.4s\n"
"ldr q24, [x23, #0x0]\n"
"fmul v10.4s, v17.4s, v2.s[1]\n"
"fmla v19.4s, v29.4s, v10.4s\n"
"ldr q10, [x23, #0x10]\n"
"fmul v29.4s, v17.4s, v2.s[2]\n"
"fmul v2.4s, v17.4s, v2.s[3]\n"
"fmla v18.4s, v9.4s, v29.4s\n"
"movi v9.4s, #0x0\n"
"movi v29.4s, #0x0\n"
".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
"fmla v14.4s, v20.4s, v2.4s\n"
"movi v20.4s, #0x0\n"
"movi v2.4s, #0x0\n"
".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
"ldr q24, [x23, #0x20]\n"
".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
"ldr q10, [x23, #0x30]\n"
".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
"ldr q24, [x23, #0x40]\n"
".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
"ldr q10, [x23, #0x50]\n"
".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
"ldr q24, [x23, #0x60]\n"
".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
"ldr q10, [x23, #0x70]\n"
"add x23, x23, #0x88\n"
".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
"ldr q24, [x22, #0x0]\n"
".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
"fmul v10.4s, v17.4s, v26.s[0]\n"
"scvtf v9.4s, v9.4s, #0x4\n"
"scvtf v29.4s, v29.4s, #0x4\n"
"scvtf v20.4s, v20.4s, #0x4\n"
"scvtf v2.4s, v2.4s, #0x4\n"
"fmla v11.4s, v9.4s, v10.4s\n"
"ldr q9, [x22, #0x10]\n"
"fmul v10.4s, v17.4s, v26.s[1]\n"
"fmla v13.4s, v29.4s, v10.4s\n"
"ldr d29, [x22, #-0x8]\n"
"fmul v10.4s, v17.4s, v26.s[2]\n"
"fmul v26.4s, v17.4s, v26.s[3]\n"
"fcvtl v29.4s, v29.4h\n"
"fmla v23.4s, v20.4s, v10.4s\n"
"movi v20.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"fmla v16.4s, v2.4s, v26.4s\n"
"movi v26.4s, #0x0\n"
"movi v2.4s, #0x0\n"
".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
"ldr q24, [x22, #0x20]\n"
".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
"ldr q9, [x22, #0x30]\n"
".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
"ldr q24, [x22, #0x40]\n"
".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
"ldr q9, [x22, #0x50]\n"
".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
"ldr q24, [x22, #0x60]\n"
".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
"ldr q9, [x22, #0x70]\n"
"add x22, x22, #0x88\n"
".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
"ldr q24, [x21, #0x0]\n"
".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
"fmul v9.4s, v17.4s, v29.s[0]\n"
"scvtf v20.4s, v20.4s, #0x4\n"
"scvtf v10.4s, v10.4s, #0x4\n"
"scvtf v26.4s, v26.4s, #0x4\n"
"scvtf v2.4s, v2.4s, #0x4\n"
"fmla v25.4s, v20.4s, v9.4s\n"
"ldr q9, [x21, #0x10]\n"
"fmul v20.4s, v17.4s, v29.s[1]\n"
"fmla v7.4s, v10.4s, v20.4s\n"
"ldr d20, [x21, #-0x8]\n"
"fmul v10.4s, v17.4s, v29.s[2]\n"
"fmul v29.4s, v17.4s, v29.s[3]\n"
"fcvtl v20.4s, v20.4h\n"
"fmla v0.4s, v26.4s, v10.4s\n"
"movi v26.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"fmla v4.4s, v2.4s, v29.4s\n"
"movi v2.4s, #0x0\n"
"movi v29.4s, #0x0\n"
".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
"ldr q12, [x21, #0x20]\n"
"fmul v24.4s, v17.4s, v20.s[0]\n"
".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
"ldr q9, [x21, #0x30]\n"
"fmul v31.4s, v17.4s, v20.s[1]\n"
".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
"ldr q12, [x21, #0x40]\n"
"fmul v6.4s, v17.4s, v20.s[2]\n"
"fmul v20.4s, v17.4s, v20.s[3]\n"
".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
"ldr q9, [x21, #0x50]\n"
".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
"ldr q12, [x21, #0x60]\n"
".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
"ldr q17, [x21, #0x70]\n"
"add x21, x21, #0x88\n"
".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
"scvtf v26.4s, v26.4s, #0x4\n"
"scvtf v10.4s, v10.4s, #0x4\n"
"fmla v5.4s, v26.4s, v24.4s\n"
"scvtf v2.4s, v2.4s, #0x4\n"
"scvtf v29.4s, v29.4s, #0x4\n"
"fmla v21.4s, v10.4s, v31.4s\n"
"fmla v8.4s, v2.4s, v6.4s\n"
"fmla v1.4s, v29.4s, v20.4s\n"
"bgt 3b\n"
"mov x20, %x[res_ptr]\n"
"subs x27, x27, #0x4\n"
"add %x[res_ptr], %x[res_ptr], #0x10\n"
"str q15, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q19, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q18, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q14, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q11, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q13, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q23, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q16, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q25, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q7, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q0, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q4, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q5, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q21, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q8, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"str q1, [x20, #0x0]\n"
"bne 2b\n"
"mov x20, #0x4\n"
"sub x10, x10, #0x10\n"
"cmp x10, #0x10\n"
"mov %x[res_ptr], x26\n"
"madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
"bge 1b\n"
"4:" // Row loop skip
"cbz x10, 9f\n"
"5:" // Row tail: Row loop
"add x24, %x[b_ptr], #0x8\n"
"mov x23, %x[nc]\n"
"add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
"6:" // Row tail: Column loop
"movi v15.16b, #0x0\n"
"movi v19.16b, #0x0\n"
"add x25, %x[a_ptr], #0x8\n"
"mov x21, %x[nb]\n"
"movi v18.16b, #0x0\n"
"movi v14.16b, #0x0\n"
"7:" // Row tail: Block loop
"ldr q7, [x24, #0x0]\n"
"ldr q5, [x25, #0x0]\n"
"movi v9.16b, #0x4\n"
"movi v4.4s, #0x0\n"
"ldr q3, [x24, #0x10]\n"
"ldr q2, [x25, #0x10]\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
"ldr q13, [x24, #0x20]\n"
"ldr q31, [x25, #0x20]\n"
"movi v30.4s, #0x0\n"
"movi v29.16b, #0xf0\n"
"ldr q28, [x24, #0x30]\n"
"ldr q27, [x25, #0x30]\n"
"sshl v20.16b, v7.16b, v9.16b\n"
"sub x20, x24, #0x8\n"
"ldr q26, [x25, #0x40]\n"
"ldr q25, [x25, #0x50]\n"
"sshl v17.16b, v3.16b, v9.16b\n"
"and v7.16b, v7.16b, v29.16b\n"
"ldr q24, [x25, #0x60]\n"
"ldr q16, [x25, #0x70]\n"
"sshl v22.16b, v13.16b, v9.16b\n"
"and v3.16b, v3.16b, v29.16b\n"
"ldr d21, [x20, #0x0]\n"
"ldr d12, [x25, #-0x8]\n"
".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
"sshl v9.16b, v28.16b, v9.16b\n"
"subs x21, x21, #0x1\n"
"and v13.16b, v13.16b, v29.16b\n"
"and v28.16b, v28.16b, v29.16b\n"
"add x25, x25, #0x88\n"
"add x24, x24, #0x48\n"
"fcvtl v21.4s, v21.4h\n"
"fcvtl v12.4s, v12.4h\n"
".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
"fmul v11.4s, v21.4s, v12.s[0]\n"
"fmul v23.4s, v21.4s, v12.s[1]\n"
"fmul v17.4s, v21.4s, v12.s[2]\n"
".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
"fmul v6.4s, v21.4s, v12.s[3]\n"
".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
"scvtf v4.4s, v4.4s, #0x4\n"
"scvtf v1.4s, v1.4s, #0x4\n"
"scvtf v0.4s, v0.4s, #0x4\n"
"fmla v15.4s, v4.4s, v11.4s\n"
"scvtf v30.4s, v30.4s, #0x4\n"
"fmla v19.4s, v1.4s, v23.4s\n"
"fmla v18.4s, v0.4s, v17.4s\n"
"fmla v14.4s, v30.4s, v6.4s\n"
"bgt 7b\n"
"mov x20, %x[res_ptr]\n"
"cmp x10, #0x1\n"
"str q15, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"ble 8f\n"
"cmp x10, #0x2\n"
"str q19, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"ble 8f\n"
"cmp x10, #0x3\n"
"str q18, [x20, #0x0]\n"
"add x20, x20, %x[res_stride]\n"
"ble 8f\n"
"str q14, [x20, #0x0]\n"
"8:" // Row tail: Accumulator store skip
"subs x23, x23, #0x4\n"
"add %x[res_ptr], %x[res_ptr], #0x10\n"
"bne 6b\n"
"subs x10, x10, #0x4\n"
"add %x[a_ptr], %x[a_ptr], x9\n"
"mov %x[res_ptr], x22\n"
"bgt 5b\n"
"9:" // Row tail: Row loop skip
: [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
: [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
float32x4_t acc0 = vdupq_n_f32(0);
float32x4_t acc1 = vdupq_n_f32(0);
float32x4_t acc2 = vdupq_n_f32(0);
float32x4_t acc3 = vdupq_n_f32(0);
for (int b = 0; b < nb; b++) {
float32x4_t ad = vcvt_f32_f16(vld1_f16((const __fp16 *)a_ptr->d));
float32x4_t bd = vcvt_f32_f16(vld1_f16((const __fp16 *)b_ptr->d));
int32x4_t ret0 = vdupq_n_s32(0);
int32x4_t ret1 = vdupq_n_s32(0);
int32x4_t ret2 = vdupq_n_s32(0);
int32x4_t ret3 = vdupq_n_s32(0);
for (int k = 0; k < 4; k++) {
int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs + 16 * k);
int8x16_t b1 = b0 & 0xF0U;
b0 <<= 4;
int8x16_t a0 = vld1q_s8(a_ptr->qs + 16 * k);
int8x16_t a1 = vld1q_s8(a_ptr->qs + 16 * k + 4 * qk/2);
ret0 = vdotq_laneq_s32(ret0, b0, a0, 0);
ret1 = vdotq_laneq_s32(ret1, b0, a0, 1);
ret2 = vdotq_laneq_s32(ret2, b0, a0, 2);
ret3 = vdotq_laneq_s32(ret3, b0, a0, 3);
ret0 = vdotq_laneq_s32(ret0, b1, a1, 0);
ret1 = vdotq_laneq_s32(ret1, b1, a1, 1);
ret2 = vdotq_laneq_s32(ret2, b1, a1, 2);
ret3 = vdotq_laneq_s32(ret3, b1, a1, 3);
}
acc0 = vfmaq_f32(acc0, vcvtq_n_f32_s32(ret0, 4), vmulq_laneq_f32(bd, ad, 0));
acc1 = vfmaq_f32(acc1, vcvtq_n_f32_s32(ret1, 4), vmulq_laneq_f32(bd, ad, 1));
acc2 = vfmaq_f32(acc2, vcvtq_n_f32_s32(ret2, 4), vmulq_laneq_f32(bd, ad, 2));
acc3 = vfmaq_f32(acc3, vcvtq_n_f32_s32(ret3, 4), vmulq_laneq_f32(bd, ad, 3));
a_ptr++;
b_ptr++;
}
vst1q_f32(s + (r + 0) * bs + c, acc0);
vst1q_f32(s + (r + 1) * bs + c, acc1);
vst1q_f32(s + (r + 2) * bs + c, acc2);
vst1q_f32(s + (r + 3) * bs + c, acc3);
}
}
return;
}
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
{
float sumf[4][4];
int sumi;