From 340ef07fca904bc77ac46aa3fec34436e60400e2 Mon Sep 17 00:00:00 2001 From: Dibakar Gope Date: Mon, 22 Apr 2024 08:08:17 +0000 Subject: [PATCH] Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions --- examples/quantize/quantize.cpp | 1 + ggml/include/ggml.h | 23 +- ggml/src/ggml-quants.c | 2171 +++++++++++++++++++++++++++----- ggml/src/ggml-quants.h | 46 +- ggml/src/ggml.c | 400 ++---- include/llama.h | 1 + src/llama.cpp | 39 +- 7 files changed, 1941 insertions(+), 740 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 76e2052d5..214edb03c 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -46,6 +46,7 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "Q4_0_AARCH64", LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2d3772673..bea898c32 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -384,6 +384,7 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, + GGML_TYPE_Q4_0_AARCH64 = 31, GGML_TYPE_COUNT, }; @@ -425,6 +426,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0_AARCH64 = 25, // except 1d tensors }; // available tensor operations: @@ -603,11 +605,6 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu // char padding[4]; - char padding[9]; - - void * rearranged_weight_gemv; - void * rearranged_weight_gemm; - bool weight_rearranged; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -2397,6 +2394,7 @@ extern "C" { GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); + GGML_API int ggml_cpu_has_sve (void); // // Internal types and functions exposed for tests and benchmarks @@ -2412,6 +2410,9 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); + typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k, int n, int b); + typedef void (*ggml_gemv_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); + typedef void (*ggml_gemm_t) (size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); typedef struct { const char * type_name; @@ -2424,19 +2425,13 @@ extern "C" { ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously; + ggml_from_float_to_mat_t from_float_to_mat; + ggml_gemv_t gemv; + ggml_gemm_t gemm; } ggml_type_traits_t; GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); - GGML_API void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur); - GGML_API void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur); - GGML_API void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur); - GGML_API void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur); - GGML_API void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur); - #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 2c0e89d4d..f77481037 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -700,6 +700,64 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q4_0_reference(x, y, k); } +void quantize_row_q4_0_aarch64(const float * src, void * dst, int n, int k) { + int nrows_interleaved, blocklen_per_row; + typedef block_q4_0x8 block_q4_0xn; + typedef block_q4_0xn (*make_block_q4_0xn_t)(const block_q4_0 *, unsigned int, unsigned int); + make_block_q4_0xn_t make_block_q4_0xn = make_block_q4_0x8; + + if (ggml_cpu_has_sve() && (svcntw() == 8)) { + nrows_interleaved = 8; + blocklen_per_row = 8; + typedef block_q4_0x8 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x8; + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + nrows_interleaved = 4; + blocklen_per_row = 8; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else if (ggml_cpu_has_neon()) { + nrows_interleaved = 4; + blocklen_per_row = 4; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else { + assert(false); + } + + assert(k % QK4_0 == 0); + const int nb = k / QK4_0; + + 
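    // The code below re-packs the just-quantized rows into the row-interleaved
    // layout consumed by the AArch64 GEMV/GEMM kernels: nrows_interleaved rows
    // (8 for 256-bit SVE, otherwise 4) are quantized with the reference q4_0
    // routine directly into dst, then, block index by block index, their q4_0
    // blocks are gathered into one block_q4_0x8 / block_q4_0x4 (the scales
    // first, then the nibble data interleaved blocklen_per_row bytes per row)
    // in a temporary buffer that finally overwrites the same region of dst.
    // The rearranged data occupies exactly the same space as plain q4_0.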
block_q4_0xn * out_ptr_B = (block_q4_0xn *) malloc(sizeof(block_q4_0xn) * nb); + block_q4_0xn * out_ptr_B_start = out_ptr_B; + + for (int b = 0; b < n; b += nrows_interleaved * k) { + const block_q4_0 * in_ptrs[nrows_interleaved]; + + for (int i = 0; i < nrows_interleaved; i++ ) { + in_ptrs[i] = (block_q4_0 *) dst + (b + i * k) / QK4_0; + quantize_row_q4_0_reference(src + b + i * k, in_ptrs[i], k); + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q4_0xn(in_ptrs, blocklen_per_row, 0x88); + out_ptr_B++; + + for (int i = 0; i < nrows_interleaved; i++) { + in_ptrs[i]++; + } + } + out_ptr_B = out_ptr_B_start; + memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0xn) * nb); + } + if (out_ptr_B_start) free(out_ptr_B_start); + + return (n / QK4_0 * sizeof(block_q4_0)); +} + void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) { const int qk = QK4_1; @@ -3307,6 +3365,76 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } +size_t quantize_q4_0_aarch64(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + //quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row); + //return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); + + int nrows_interleaved, blocklen_per_row; + typedef block_q4_0x8 block_q4_0xn; + typedef block_q4_0xn (*make_block_q4_0xn_t)(const block_q4_0 *, unsigned int, unsigned int); + make_block_q4_0xn_t make_block_q4_0xn = make_block_q4_0x8; + + if (ggml_cpu_has_sve() && (svcntw() == 8)) { + nrows_interleaved = 8; + blocklen_per_row = 8; + typedef block_q4_0x8 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x8; + } + else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + nrows_interleaved = 4; + blocklen_per_row = 8; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else if (ggml_cpu_has_neon()) { + nrows_interleaved = 4; + blocklen_per_row = 4; + typedef block_q4_0x4 block_q4_0xn; + make_block_q4_0xn = make_block_q4_0x4; + } + else { + assert(false); + } + + assert(n_per_row % QK4_0 == 0); + const int nb = n_per_row / QK4_0; + + block_q4_0xn * out_ptr_B = (block_q4_0xn *) malloc(sizeof(block_q4_0xn) * nb); + block_q4_0xn * out_ptr_B_start = out_ptr_B; + + for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { + const block_q4_0 * in_ptrs[nrows_interleaved]; + + for (int i = 0; i < nrows_interleaved; i++ ) { + in_ptrs[i] = (block_q4_0 *) dst + (b + i * n_per_row) / QK4_0; + quantize_row_q4_0_reference(src + b + i * n_per_row, in_ptrs[i], n_per_row); + } + + for (int64_t x = 0; x < nb; x++) { + *out_ptr_B = make_block_q4_0xn(in_ptrs, blocklen_per_row, 0x88); + out_ptr_B++; + + for (int i = 0; i < nrows_interleaved; i++) { + in_ptrs[i]++; + } + } + out_ptr_B = out_ptr_B_start; + memcpy ((block_q4_0 *) dst + b / QK4_0, out_ptr_B_start, sizeof(block_q4_0xn) * nb); + } + if (out_ptr_B_start) free(out_ptr_B_start); + return (nrow * n_per_row / QK4_0 * sizeof(block_q4_0)); + } + size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, 
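For reference, here is a hedged scalar outline of the rearrangement that quantize_q4_0_aarch64 performs, specialized to the NEON 4-row / blocklen 4 case. quantize_q4_0_x4_sketch is a hypothetical name, and the snippet assumes the block_q4_0x4 type and the helpers declared in ggml-quants.h, plus <stdlib.h> and <string.h>:

static size_t quantize_q4_0_x4_sketch(const float * src, void * dst, int64_t nrow, int64_t n_per_row) {
    // assumes nrow % 4 == 0 and n_per_row % QK4_0 == 0, as the kernels require
    const int nb = n_per_row / QK4_0;                         // q4_0 blocks per row
    block_q4_0x4 * tmp = malloc(sizeof(block_q4_0x4) * nb);   // one interleaved row group

    for (int64_t r = 0; r < nrow; r += 4) {                   // 4 rows form one group
        block_q4_0 * rows = (block_q4_0 *) dst + r * nb;
        const block_q4_0 * in[4];
        for (int i = 0; i < 4; i++) {                         // plain q4_0, quantized in place
            quantize_row_q4_0_reference(src + (r + i) * n_per_row, rows + i * nb, n_per_row);
            in[i] = rows + i * nb;
        }
        for (int b = 0; b < nb; b++) {                        // gather block b of all 4 rows
            tmp[b] = make_block_q4_0x4(in, 4, 0x88);
            for (int i = 0; i < 4; i++) in[i]++;
        }
        memcpy(rows, tmp, sizeof(block_q4_0x4) * nb);         // overwrite the group in place
    }
    free(tmp);
    return nrow * n_per_row / QK4_0 * sizeof(block_q4_0);     // same size as plain q4_0
}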
int64_t k) { @@ -14714,7 +14842,7 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) // and GEMV (using SDOT) cases. For GEMM, we interleave 8 pairs of values // at a time (with the two nibbles separated at runtime to give 2x2x8 // matrices). For GEMV, we need to interleave 4 pairs of values instead. -block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len) { +block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask) { block_q4_0x4 out; for (int i = 0; i < 4; i++) { @@ -14736,14 +14864,14 @@ block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int bloc int src_id = (i % (4 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] + 0x80; + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; } return out; } // 8-block version - see comments in code above -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len) { +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask) { block_q4_0x8 out; for (int i = 0; i < 8; i++) { @@ -14755,7 +14883,7 @@ block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int bloc int src_id = (i % (8 * block_len)) / block_len; src_offset += (i % block_len); - out.qs[i] = in[src_id]->qs[src_offset] + 0x80; + out.qs[i] = in[src_id]->qs[src_offset] ^ xor_mask; } return out; @@ -14798,68 +14926,7 @@ block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int bloc return out; } -void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0x2 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv[rows_interleaved][8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - float id[rows_interleaved]; - - for (int row_iter = 0; row_iter < rows_interleaved; row_iter++) { - for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); - - for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); - for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); - for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 
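Why the xor_mask: a q4_0 nibble stores a weight w in [-8, 7] as w + 8, so the old "+ 0x80" only flipped the top bit of the high nibble, and the reference NEON kernels still had to mask and subtract 8 for the low one (the vandq/vsubq with m4b and s8b). XOR-ing the whole byte with 0x88, as the call sites now do, flips the top bit of both nibbles and turns each of them into the two's-complement encoding of w; the new asm kernels can then read both halves as signed values times 16 via "lsl/sshl #4" and "and 0xf0", and divide the integer dot product by 16 once at the end (e.g. "scvtf v26.4s, v26.4s, #0x4"). A minimal standalone check of that identity (not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
    for (int lo = -8; lo <= 7; lo++) {
        for (int hi = -8; hi <= 7; hi++) {
            uint8_t packed  = (uint8_t)(((hi + 8) << 4) | (lo + 8)); // q4_0 byte
            uint8_t flipped = packed ^ 0x88;                         // as stored by make_block_q4_0xN
            // low nibble shifted into the top half and read as signed -> 16 * lo
            int8_t lo16 = (int8_t)(uint8_t)(flipped << 4);
            // high nibble masked in place and read as signed          -> 16 * hi
            int8_t hi16 = (int8_t)(uint8_t)(flipped & 0xf0);
            assert(lo16 == 16 * lo && hi16 == 16 * hi);
        }
    }
    return 0;
}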
1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); - } - } -#endif -} - -void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved) { +void quantize_row_q8_0_aarch64(const float * restrict x, void * restrict vy, int k, int nrows_interleaved, int blocklen_per_row) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -14868,12 +14935,12 @@ void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * re #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { - float32x4_t srcv[rows_interleaved][8]; + float32x4_t srcv[nrows_interleaved][8]; float32x4_t asrcv[8]; float32x4_t amaxv[8]; - float id[rows_interleaved]; + float id[nrows_interleaved]; - for (int row_iter = 0; row_iter < rows_interleaved; row_iter++) { + for (int row_iter = 0; row_iter < nrows_interleaved; row_iter++) { for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); @@ -14889,58 +14956,91 @@ void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * re y[i].d[row_iter] = GGML_FP32_TO_FP16(d); } - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + if (blocklen_per_row == 8) { + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - 
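On the activation side, quantize_row_q8_0_aarch64 (its five parameters match the new ggml_from_float_to_mat_t hook) quantizes four LHS rows at a time into the matching interleaved layout: four FP16 scales per 32-value block, followed by 4 * QK8_0 int8 values grouped blocklen_per_row values per row, 8 for the SMMLA/SVE kernels and 4 for the plain-SDOT NEON kernel (the branch added below). A hedged scalar model of the blocklen_per_row == 4 packing; pack_q8_0x4_block_sketch is a hypothetical name, and qs is assumed to hold QK8_0 * 4 bytes, as the indexing implies:

static void pack_q8_0x4_block_sketch(const int8_t q[4][QK8_0], int8_t qs[QK8_0 * 4]) {
    // emit 4 consecutive quantized values from each of the 4 rows in turn:
    // row0[0..3] row1[0..3] row2[0..3] row3[0..3] row0[4..7] ...
    // (for blocklen_per_row == 8 the same idea applies with groups of 8)
    for (int chunk = 0; chunk < QK8_0 / 4; chunk++) {
        for (int row = 0; row < 4; row++) {
            for (int k = 0; k < 4; k++) {
                qs[16 * chunk + 4 * row + k] = q[row][4 * chunk + k];
            }
        }
    }
}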
y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[2][2 * j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[3][2 * j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } + else if (blocklen_per_row == 4) { + for (int j = 0; j < 8; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 5] 
= vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); + } } } #endif @@ -15134,184 +15234,227 @@ void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int inpu #endif } -void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); +void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + size_t width = xend - x0; - int64_t nb = n / QK4_0; - int64_t a_nb = n / QK8_0; + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); + assert(depth % 32 == 0); + assert(width % 8 == 0); - const block_q4_0x4 * b_ptr_start = vx; - const block_q8_0x4 * a_ptr_start = vy; + size_t num_blocks = depth / 32; - for (int64_t y = 0; y < input_width / 4; y += rows / 4) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x4 * a_ptrs[rows / 4]; - - a_ptrs[0] = a_ptr_start + (y * a_nb); - for (int i = 0; i < (rows / 4) - 1; i++) { - a_ptrs[i + 1] = a_ptrs[i] + a_nb; - } - - const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulators - float32x4_t acc_rows[rows]; - for (int i = 0; i < rows; i++) { - acc_rows[i] = vdupq_n_f32(0.0f); - } - - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); - const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); - const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); - const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); - - // 4-bit -> 8-bit - const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); - const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); - const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); - const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); - const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); - const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); - const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); - const int8x16_t rhs_mat_23_3 = 
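All of the q4_0 x q8_0 GEMV/GEMM kernels in this patch, the intrinsics versions as well as the asm ones, split work across threads by output channel, with both ends of each thread's range rounded up to the kernel's interleave width (4 for the NEON kernels, 8 for the 256-bit SVE ones) so that an interleaved block_q4_0x4/x8 tile never straddles two threads. A hedged sketch of that split; round_up_sketch and thread_range_sketch are hypothetical, and the file's roundup() helper is assumed to round its first argument up to a multiple of the second, which is what the width asserts rely on:

static int64_t round_up_sketch(int64_t v, int64_t m) {
    return ((v + m - 1) / m) * m;
}

static void thread_range_sketch(int64_t output_channels, int ith, int nth,
                                int64_t tile, int64_t * x0, int64_t * xend) {
    *x0   = round_up_sketch((ith       * output_channels) / nth, tile);
    *xend = round_up_sketch(((ith + 1) * output_channels) / nth, tile);
    // e.g. output_channels = 4096, nth = 4, tile = 8 gives the ranges
    // [0,1024), [1024,2048), [2048,3072), [3072,4096)
}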
vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - for (int rp = 0; rp < rows / 4; rp++) { - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); - const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); - const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - const int32x4_t iacc_mat_10 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); - const int32x4_t iacc_mat_11 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); - - // Straighten out to make 4 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); - - const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); - const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); - - acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); - acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); - acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); - acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); - } - } - - for (int i = 0; i < rows; i++) { - vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); - } - } - } + __asm__ __volatile__( + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, 
[%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" + "and z30.b, z30.b, #0xf0\n" + "and z29.b, z29.b, #0xf0\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); #endif } -void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - int rows = 2; +void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + size_t width = xend - x0; - int64_t nb = n / QK4_0; - int64_t a_nb = n / QK8_0; + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); + assert(depth % 32 == 0); + assert(width % 4 == 0); - const block_q4_0x4 * b_ptr_start = vx; - const block_q8_0x2 * a_ptr_start = vy; + size_t num_blocks = depth / 32; - for (int64_t y = 0; y < input_width / 2; y += rows / 2) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x2 * a_ptrs[rows / 2]; + __asm__ __volatile__( + "movi v2.16b, #0x4\n" + "movi v1.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x23, %x[a_ptr], #0x2\n" + "movi v0.16b, #0x0\n" + "mov x22, %x[num_blocks]\n" + "2:" // Block loop + "ldr q31, [%x[b_ptr], #0x0]\n" + "ldr q30, [%x[b_ptr], #0x10]\n" + "mov x21, x23\n" + "movi v29.4s, #0x0\n" + "ldr q28, [%x[b_ptr], #0x20]\n" + "ldr q27, [%x[b_ptr], #0x30]\n" + "movi v26.4s, #0x0\n" + "sub x20, x23, #0x2\n" + "ld1r { v25.8h }, [x20]\n" + "ldr q24, [%x[b_ptr], #-0x8]\n" + "sub x22, x22, #0x1\n" + "add x23, x23, #0x22\n" + "ld1r { v23.2d }, [x21], #0x8\n" + "sshl v22.16b, v31.16b, v2.16b\n" + "sshl v16.16b, v30.16b, v2.16b\n" + "add 
%x[b_ptr], %x[b_ptr], #0x48\n" + "ld1r { v21.2d }, [x21], #0x8\n" + "sshl v20.16b, v28.16b, v2.16b\n" + "sshl v19.16b, v27.16b, v2.16b\n" + "ld1r { v18.2d }, [x21], #0x8\n" + "ld1r { v17.2d }, [x21], #0x8\n" + "and v31.16b, v31.16b, v1.16b\n" + "and v30.16b, v30.16b, v1.16b\n" + ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" + ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" + "and v28.16b, v28.16b, v1.16b\n" + "and v27.16b, v27.16b, v1.16b\n" + "fcvtl v25.4s, v25.4h\n" + "fcvtl v16.4s, v24.4h\n" + ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" + ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" + "fmul v16.4s, v16.4s, v25.4s\n" + ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" + ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" + ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" + ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" + "addp v29.4s, v29.4s, v26.4s\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v0.4s, v29.4s, v16.4s\n" + "cbnz x22, 2b\n" + "sub %x[width], %x[width], #0x4\n" + "str q0, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" + ); +#endif +} - a_ptrs[0] = a_ptr_start + (y * a_nb); +void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + size_t width = xend - x0; - const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; - // Master FP accumulators - float32x4_t acc_rows[rows]; - acc_rows[0] = vdupq_n_f32(0.0f); - acc_rows[1] = vdupq_n_f32(0.0f); + assert(depth % 32 == 0); + assert(width % 4 == 0); - for (int64_t b = 0; b < nb; b++) { - // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) - const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); - const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); - const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); - const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); + size_t num_blocks = depth / 32; - const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); - const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); - const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); - const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); - - const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); - const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); - const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); - const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const 
float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - int rp = 0; - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 16); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - - // Straighten out to make 2 row vectors - const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - - const float16x4_t row_scale_f16_0 = vld1_dup_f16(&(a_ptrs[rp][b].d[0])); - const float32x4_t row_scale_f32_0 = vcvt_f32_f16(row_scale_f16_0); - const float16x4_t row_scale_f16_1 = vld1_dup_f16(&(a_ptrs[rp][b].d[1])); - const float32x4_t row_scale_f32_1 = vcvt_f32_f16(row_scale_f16_1); - - acc_rows[rp * 2] = vfmaq_f32(acc_rows[rp * 2], vcvtq_f32_s32(iacc_row_0), vmulq_f32(col_scale_f32, row_scale_f32_0)); - acc_rows[rp * 2 + 1] = vfmaq_f32(acc_rows[rp * 2 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_f32(col_scale_f32, row_scale_f32_1)); - } - - vst1q_f32(s + ((y * 2) * output_channels + x * 4), acc_rows[0]); - vst1q_f32(s + ((y * 2 + 1) * output_channels + x * 4), acc_rows[1]); - } - } + __asm__ __volatile__( + "movi v31.16b, #0x4\n" + "movi v30.16b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x8\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "movi v29.16b, #0x0\n" + "mov x21, %x[num_blocks]\n" + "2:" // Block loop + "ldr q28, [%x[b_ptr], #0x0]\n" + "ldr q27, [x22, #0x0]\n" + "movi v26.4s, #0x0\n" + "sub x20, x22, #0x2\n" + "ldr q25, [x22, #0x10]\n" + "ldr q24, [%x[b_ptr], #0x10]\n" + "sub x21, x21, #0x1\n" + "add x22, x22, #0x22\n" + "ldr q23, [%x[b_ptr], #0x20]\n" + "ldr q22, [%x[b_ptr], #0x30]\n" + "ld1r { v21.8h }, [x20]\n" + "ldr q20, [%x[b_ptr], #-0x8]\n" + "sshl v16.16b, v28.16b, v31.16b\n" + "and v28.16b, v28.16b, v30.16b\n" + "sshl v19.16b, v24.16b, v31.16b\n" + "and v24.16b, v24.16b, v30.16b\n" + "add %x[b_ptr], %x[b_ptr], #0x48\n" + "sshl v18.16b, v23.16b, v31.16b\n" + "and v23.16b, v23.16b, v30.16b\n" + ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" + "sshl v17.16b, v22.16b, v31.16b\n" + "and v22.16b, v22.16b, v30.16b\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v16.4s, v20.4h\n" + ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" + "fmul v16.4s, v16.4s, v21.4s\n" + ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n" + ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n" + ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n" + ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n" + ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n" + ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v29.4s, v26.4s, v16.4s\n" + "cbnz x21, 2b\n" + "sub %x[width], %x[width], #0x4\n" + "str q29, [%x[res_ptr], #0x0]\n" + "add %x[res_ptr], 
%x[res_ptr], #0x10\n" + "cbnz %x[width], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [width] "+&r" (width) + : [a_ptr] "r" (a_ptr), [num_blocks] "r" (num_blocks) + : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22" + ); #endif } @@ -15471,6 +15614,1406 @@ void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int inpu #endif } +void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + + int64_t nb = n / QK4_0; + int64_t a_nb = n / QK8_0; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const block_q4_0x4 * b_ptr_start = vx; + const block_q8_0x4 * a_ptr_start = vy; + + for (int64_t y = 0; y < input_width / 4; y += rows / 4) { + for (int64_t x = x0 / 4; x < xend / 4; x++) { + const block_q8_0x4 * a_ptrs[rows / 4]; + + a_ptrs[0] = a_ptr_start + (y * a_nb); + for (int i = 0; i < (rows / 4) - 1; i++) { + a_ptrs[i + 1] = a_ptrs[i] + a_nb; + } + + const block_q4_0x4 * b_ptr = b_ptr_start + (x * nb); + + // Master FP accumulators + float32x4_t acc_rows[rows]; + for (int i = 0; i < rows; i++) { + acc_rows[i] = vdupq_n_f32(0.0f); + } + + for (int64_t b = 0; b < nb; b++) { + // Set up RHS - we need rhs_mat_* and col_scale_f32 (9 registers) + const uint8x16_t rhs_raw_mat_01_0 = vld1q_u8(b_ptr[b].qs); + const uint8x16_t rhs_raw_mat_23_0 = vld1q_u8(b_ptr[b].qs + 16); + const uint8x16_t rhs_raw_mat_01_1 = vld1q_u8(b_ptr[b].qs + 32); + const uint8x16_t rhs_raw_mat_23_1 = vld1q_u8(b_ptr[b].qs + 48); + + // 4-bit -> 8-bit + const int8x16_t rhs_mat_01_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_0, m4b)), s8b); + const int8x16_t rhs_mat_23_0 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_0, m4b)), s8b); + const int8x16_t rhs_mat_01_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_01_1, m4b)), s8b); + const int8x16_t rhs_mat_23_1 = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(rhs_raw_mat_23_1, m4b)), s8b); + const int8x16_t rhs_mat_01_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_0), 4); + const int8x16_t rhs_mat_23_2 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_0), 4); + const int8x16_t rhs_mat_01_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_01_1), 4); + const int8x16_t rhs_mat_23_3 = vshrq_n_s8(vreinterpretq_s8_u8(rhs_raw_mat_23_1), 4); + + // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 + const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); + const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); + + // Process LHS in pairs of rows + for (int rp = 0; rp < rows / 4; rp++) { + const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); + const int8x16_t lhs_mat_23_0 = vld1q_s8(a_ptrs[rp][b].qs + 16); + const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 32); + const int8x16_t lhs_mat_23_1 = vld1q_s8(a_ptrs[rp][b].qs + 48); + + const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 64); + const int8x16_t lhs_mat_23_2 = vld1q_s8(a_ptrs[rp][b].qs + 80); + const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 96); + const int8x16_t lhs_mat_23_3 = vld1q_s8(a_ptrs[rp][b].qs + 112); + + // Do the MMLAs into 2x2 matrices + const int32x4_t iacc_mat_00 = 
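                    // Each vmmlaq_s32 (SMMLA) multiplies a 2x8 int8 tile of LHS rows by a
                    // 2x8 int8 tile holding the matching eight values of two weight columns
                    // and accumulates a 2x2 int32 block of the output; chaining four of them
                    // walks the full 32-wide q4_0/q8_0 block. This is the
                    // __ARM_FEATURE_MATMUL_INT8 path; the asm GEMM kernels that follow encode
                    // the same smmla/sdot operations directly as .inst words.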
+ vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); + const int32x4_t iacc_mat_01 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); + const int32x4_t iacc_mat_10 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_01_0), lhs_mat_23_1, rhs_mat_01_1), lhs_mat_23_2, rhs_mat_01_2), lhs_mat_23_3, rhs_mat_01_3); + const int32x4_t iacc_mat_11 = + vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_23_0, rhs_mat_23_0), lhs_mat_23_1, rhs_mat_23_1), lhs_mat_23_2, rhs_mat_23_2), lhs_mat_23_3, rhs_mat_23_3); + + // Straighten out to make 4 row vectors + const int32x4_t iacc_row_0 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); + const int32x4_t iacc_row_2 = vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + const int32x4_t iacc_row_3 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_10), vreinterpretq_u64_s32(iacc_mat_11))); + + const float16x4_t row_scale_f16 = vld1_f16(a_ptrs[rp][b].d); + const float32x4_t row_scale_f32 = vcvt_f32_f16(row_scale_f16); + + acc_rows[rp * 4] = vfmaq_f32(acc_rows[rp * 4], vcvtq_f32_s32(iacc_row_0), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 0)); + acc_rows[rp * 4 + 1] = vfmaq_f32(acc_rows[rp * 4 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 1)); + acc_rows[rp * 4 + 2] = vfmaq_f32(acc_rows[rp * 4 + 2], vcvtq_f32_s32(iacc_row_2), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 2)); + acc_rows[rp * 4 + 3] = vfmaq_f32(acc_rows[rp * 4 + 3], vcvtq_f32_s32(iacc_row_3), vmulq_laneq_f32(col_scale_f32, row_scale_f32, 3)); + } + } + + for (int i = 0; i < rows; i++) { + vst1q_f32(s + ((y * 4 + i) * output_channels + x * 4), acc_rows[i]); + } + } + } +#endif +} + +void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)8); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)8); + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x8 *) vx + ((x0 / 8) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = output_channels * sizeof(float); + + assert(depth % 32 == 0); + assert(width % 8 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[height]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[num_blocks], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[width]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[num_blocks]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, 
#0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" + ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 
0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, 
[x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[width]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[num_blocks]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, 
#0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +#endif +} + +void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + const void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0 / 4) * nb)); + const void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride 
= output_channels * sizeof(float); + + assert(depth % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "mov x10, %x[height]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[num_blocks], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[width]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "mov x24, %x[num_blocks]\n" + "add x23, x25, x9\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v6.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "3:" // Block loop + "ldr q21, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "movi v1.16b, #0x4\n" + "movi v19.4s, #0x0\n" + "ldr q27, [x25, #0x0]\n" + "ldr q15, [x25, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "ldr q29, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" + "movi v17.4s, #0x0\n" + "movi v0.16b, #0xf0\n" + "ldr d20, [x25, #-0x8]\n" + "ldr d9, [x23, #-0x8]\n" + "sshl v8.16b, v21.16b, v1.16b\n" + "sshl v31.16b, v16.16b, v1.16b\n" + "and v21.16b, v21.16b, v0.16b\n" + "and v16.16b, v16.16b, v0.16b\n" + "sub x20, x28, #0x8\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" + ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" + "ldr q27, [x25, #0x20]\n" + ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" + ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" + "sshl v15.16b, v29.16b, v1.16b\n" + "sshl v1.16b, v3.16b, v1.16b\n" + "and v29.16b, v29.16b, v0.16b\n" + "and v3.16b, v3.16b, v0.16b\n" + "ldr q0, [x25, #0x30]\n" + "fcvtl v20.4s, v20.4h\n" + ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" + "fcvtl v9.4s, v9.4h\n" + ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" + "ldr q27, [x25, #0x40]\n" + ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + "ldr q0, [x25, #0x50]\n" + ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" + ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" + "ldr q27, [x25, #0x60]\n" + ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" + ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" + "ldr q0, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" + ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" + "ldr d27, [x20, #0x0]\n" + ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" + ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" + "fcvtl v27.4s, v27.4h\n" + "uzp1 v0.2d, v19.2d, v26.2d\n" + "uzp2 v26.2d, v19.2d, v26.2d\n" + "fmul v19.4s, v27.4s, v20.s[0]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v2.4s, v0.4s, v19.4s\n" + "ldr q19, [x23, #0x0]\n" + "uzp1 v0.2d, v18.2d, v17.2d\n" + "uzp2 v18.2d, v18.2d, v17.2d\n" + "fmul v17.4s, v27.4s, v20.s[1]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v10.4s, v26.4s, v17.4s\n" + "ldr q17, [x23, #0x10]\n" + "fmul v26.4s, v27.4s, v20.s[2]\n" + "fmul v20.4s, v27.4s, v20.s[3]\n" + "fmla v12.4s, v0.4s, v26.4s\n" + "ldr d0, [x22, #-0x8]\n" + "ldr d26, [x21, #-0x8]\n" + "fcvtl v0.4s, v0.4h\n" + "fmla v28.4s, v18.4s, v20.4s\n" + 
"movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x23, #0x20]\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x23, #0x40]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q19, [x23, #0x60]\n" + ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" + ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" + "uzp1 v19.2d, v20.2d, v18.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp2 v20.2d, v20.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v9.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v11.4s, v19.4s, v18.4s\n" + "ldr q18, [x22, #0x0]\n" + "fmul v19.4s, v27.4s, v9.s[1]\n" + "fmla v13.4s, v20.4s, v19.4s\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" + ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" + "ldr q17, [x23, #0x30]\n" + ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" + "ldr q17, [x23, #0x50]\n" + ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" + "ldr q17, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v9.s[2]\n" + "fmul v9.4s, v27.4s, v9.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v22.4s, v17.4s, v19.4s\n" + "ldr q17, [x22, #0x10]\n" + "movi v19.4s, #0x0\n" + ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" + "fmla v23.4s, v20.4s, v9.4s\n" + "movi v20.4s, #0x0\n" + "movi v9.4s, #0x0\n" + ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" + "ldr q18, [x22, #0x20]\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" + ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" + "ldr q18, [x22, #0x40]\n" + ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" + ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" + "ldr q18, [x22, #0x60]\n" + ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" + ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" + "ldr q17, [x22, #0x30]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" + "ldr q17, [x22, #0x50]\n" + ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" + "ldr q17, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v0.s[0]\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v25.4s, v17.4s, v19.4s\n" + "ldr q19, [x21, #0x0]\n" + "fmul v17.4s, v27.4s, v0.s[1]\n" + "fmla v5.4s, v20.4s, v17.4s\n" + "ldr q17, [x21, #0x10]\n" + "uzp1 v20.2d, v9.2d, v18.2d\n" + "uzp2 v9.2d, v9.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v0.s[2]\n" + "fmul v0.4s, v27.4s, v0.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "fmla v7.4s, 
v20.4s, v18.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x21, #0x20]\n" + "fmla v4.4s, v9.4s, v0.4s\n" + "movi v9.4s, #0x0\n" + "movi v0.4s, #0x0\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + "fmul v8.4s, v27.4s, v26.s[0]\n" + ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" + "ldr q17, [x21, #0x30]\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + "fmul v31.4s, v27.4s, v26.s[1]\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x21, #0x40]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + "fmul v15.4s, v27.4s, v26.s[2]\n" + "fmul v27.4s, v27.4s, v26.s[3]\n" + ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" + "ldr q1, [x21, #0x50]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q26, [x21, #0x60]\n" + ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" + ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" + "ldr q21, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" + ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" + ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" + ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" + "uzp1 v29.2d, v20.2d, v18.2d\n" + "uzp2 v21.2d, v20.2d, v18.2d\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "uzp1 v18.2d, v9.2d, v0.2d\n" + "uzp2 v16.2d, v9.2d, v0.2d\n" + "scvtf v21.4s, v21.4s, #0x4\n" + "fmla v6.4s, v29.4s, v8.4s\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v30.4s, v21.4s, v31.4s\n" + "fmla v24.4s, v18.4s, v15.4s\n" + "fmla v14.4s, v16.4s, v27.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q6, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[width]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[num_blocks]\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q6, [x24, #0x0]\n" + "ldr q5, [x24, #0x10]\n" + "movi v17.16b, #0x4\n" + "movi v8.4s, #0x0\n" + "ldr q4, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + 
"movi v27.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q31, [x24, #0x20]\n" + "ldr q14, [x24, #0x30]\n" + "movi v29.4s, #0x0\n" + "movi v22.16b, #0xf0\n" + "ldr q11, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "sshl v21.16b, v6.16b, v17.16b\n" + "sshl v16.16b, v5.16b, v17.16b\n" + "ldr q20, [x25, #0x40]\n" + "ldr q26, [x25, #0x50]\n" + "and v6.16b, v6.16b, v22.16b\n" + "and v5.16b, v5.16b, v22.16b\n" + "ldr q25, [x25, #0x60]\n" + "ldr q3, [x25, #0x70]\n" + "sshl v19.16b, v31.16b, v17.16b\n" + "sshl v18.16b, v14.16b, v17.16b\n" + "ldr d17, [x25, #-0x8]\n" + ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" + ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" + "and v31.16b, v31.16b, v22.16b\n" + ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" + ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" + "and v14.16b, v14.16b, v22.16b\n" + "sub x20, x24, #0x8\n" + "ldr d16, [x20, #0x0]\n" + "subs x21, x21, #0x1\n" + "add x25, x25, #0x88\n" + "fcvtl v17.4s, v17.4h\n" + "add x24, x24, #0x48\n" + ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" + ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" + ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" + ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" + "fcvtl v16.4s, v16.4h\n" + ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" + ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" + "fmul v23.4s, v16.4s, v17.s[0]\n" + "fmul v21.4s, v16.4s, v17.s[1]\n" + "fmul v1.4s, v16.4s, v17.s[2]\n" + "fmul v20.4s, v16.4s, v17.s[3]\n" + ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" + ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" + ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" + ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" + ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" + ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" + "uzp1 v19.2d, v8.2d, v27.2d\n" + "uzp2 v18.2d, v8.2d, v27.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp1 v17.2d, v0.2d, v29.2d\n" + "uzp2 v16.2d, v0.2d, v29.2d\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v2.4s, v19.4s, v23.4s\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v10.4s, v18.4s, v21.4s\n" + "fmla v12.4s, v17.4s, v1.4s\n" + "fmla v28.4s, v16.4s, v20.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q28, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#endif +} + +void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, 
int nth) { +#if defined(__ARM_NEON) + int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); + int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); + size_t width = xend - x0; + + int64_t nb = depth / QK4_0; + void * b_ptr = (void *)((block_q4_0x4 *) vx + ((x0/4) * nb)); + void * a_ptr = vy; + float * res_ptr = s + x0; + size_t res_stride = output_channels * sizeof(float); + + assert(depth % 32 == 0); + assert(width % 4 == 0); + + size_t num_blocks = depth / 32; + + __asm__ __volatile__( + "mov x10, %x[height]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[num_blocks], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[width]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "mov x24, %x[num_blocks]\n" + "add x23, x25, x9\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v23.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v0.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v8.16b, #0x0\n" + "movi v1.16b, #0x0\n" + "3:" // Block loop + "ldr q3, [x28, #0x0]\n" + "ldr q31, [x25, #0x0]\n" + "movi v28.16b, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q22, [x28, #0x10]\n" + "ldr q6, [x25, #0x10]\n" + "movi v29.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ldr q27, [x28, #0x20]\n" + "ldr q30, [x28, #0x30]\n" + "movi v20.4s, #0x0\n" + "movi v24.16b, #0xf0\n" + "ldr d2, [x25, #-0x8]\n" + "ldr d26, [x23, #-0x8]\n" + "sshl v12.16b, v3.16b, v28.16b\n" + "sub x20, x28, #0x8\n" + "ldr d17, [x20, #0x0]\n" + "and v3.16b, v3.16b, v24.16b\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" + ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" + ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" + ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" + "sshl v31.16b, v22.16b, v28.16b\n" + "and v22.16b, v22.16b, v24.16b\n" + "fcvtl v17.4s, v17.4h\n" + "fcvtl v2.4s, v2.4h\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" + ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" + ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" + ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" + "sshl v6.16b, v27.16b, v28.16b\n" + "sshl v28.16b, v30.16b, v28.16b\n" + "and v27.16b, v27.16b, v24.16b\n" + "and v30.16b, v30.16b, v24.16b\n" + "ldr q24, [x25, #0x20]\n" + ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x30]\n" + ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" + ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" + ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" + ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x40]\n" + ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x50]\n" + ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" + ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" + ".inst 
0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" + ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x60]\n" + ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" + ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" + ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" + ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" + "fmul v24.4s, v17.4s, v2.s[0]\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v15.4s, v10.4s, v24.4s\n" + "ldr q24, [x23, #0x0]\n" + "fmul v10.4s, v17.4s, v2.s[1]\n" + "fmla v19.4s, v29.4s, v10.4s\n" + "ldr q10, [x23, #0x10]\n" + "fmul v29.4s, v17.4s, v2.s[2]\n" + "fmul v2.4s, v17.4s, v2.s[3]\n" + "fmla v18.4s, v9.4s, v29.4s\n" + "movi v9.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" + "fmla v14.4s, v20.4s, v2.4s\n" + "movi v20.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x20]\n" + ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" + ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" + ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" + ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x30]\n" + ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x40]\n" + ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" + ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" + ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" + ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x50]\n" + ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x60]\n" + ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" + ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" + ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x0]\n" + ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" + ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" + ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" + ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" + "fmul v10.4s, v17.4s, v26.s[0]\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v11.4s, v9.4s, v10.4s\n" + "ldr q9, [x22, #0x10]\n" + "fmul v10.4s, v17.4s, v26.s[1]\n" + "fmla v13.4s, v29.4s, v10.4s\n" + 
"ldr d29, [x22, #-0x8]\n" + "fmul v10.4s, v17.4s, v26.s[2]\n" + "fmul v26.4s, v17.4s, v26.s[3]\n" + "fcvtl v29.4s, v29.4h\n" + "fmla v23.4s, v20.4s, v10.4s\n" + "movi v20.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "movi v26.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x20]\n" + ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x30]\n" + ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x40]\n" + ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x50]\n" + ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x60]\n" + ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x21, #0x0]\n" + ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" + ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" + ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" + "fmul v9.4s, v17.4s, v29.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v25.4s, v20.4s, v9.4s\n" + "ldr q9, [x21, #0x10]\n" + "fmul v20.4s, v17.4s, v29.s[1]\n" + "fmla v7.4s, v10.4s, v20.4s\n" + "ldr d20, [x21, #-0x8]\n" + "fmul v10.4s, v17.4s, v29.s[2]\n" + "fmul v29.4s, v17.4s, v29.s[3]\n" + "fcvtl v20.4s, v20.4h\n" + "fmla v0.4s, v26.4s, v10.4s\n" + "movi v26.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v4.4s, v2.4s, v29.4s\n" + "movi v2.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" + "ldr q12, [x21, #0x20]\n" + "fmul v24.4s, v17.4s, v20.s[0]\n" + ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x30]\n" + "fmul v31.4s, v17.4s, v20.s[1]\n" + ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" + 
".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" + ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" + ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x40]\n" + "fmul v6.4s, v17.4s, v20.s[2]\n" + "fmul v20.4s, v17.4s, v20.s[3]\n" + ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x50]\n" + ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" + ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" + ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" + ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x60]\n" + ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" + "ldr q17, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" + ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" + ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" + ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" + ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" + ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" + ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" + ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "fmla v5.4s, v26.4s, v24.4s\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v21.4s, v10.4s, v31.4s\n" + "fmla v8.4s, v2.4s, v6.4s\n" + "fmla v1.4s, v29.4s, v20.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q16, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q0, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q21, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q8, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q1, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[width]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[num_blocks]\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q7, [x24, #0x0]\n" + "ldr q5, [x25, #0x0]\n" + "movi v9.16b, #0x4\n" + "movi v4.4s, #0x0\n" + "ldr q3, [x24, #0x10]\n" + "ldr q2, [x25, #0x10]\n" 
+ "movi v1.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q13, [x24, #0x20]\n" + "ldr q31, [x25, #0x20]\n" + "movi v30.4s, #0x0\n" + "movi v29.16b, #0xf0\n" + "ldr q28, [x24, #0x30]\n" + "ldr q27, [x25, #0x30]\n" + "sshl v20.16b, v7.16b, v9.16b\n" + "sub x20, x24, #0x8\n" + "ldr q26, [x25, #0x40]\n" + "ldr q25, [x25, #0x50]\n" + "sshl v17.16b, v3.16b, v9.16b\n" + "and v7.16b, v7.16b, v29.16b\n" + "ldr q24, [x25, #0x60]\n" + "ldr q16, [x25, #0x70]\n" + "sshl v22.16b, v13.16b, v9.16b\n" + "and v3.16b, v3.16b, v29.16b\n" + "ldr d21, [x20, #0x0]\n" + "ldr d12, [x25, #-0x8]\n" + ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" + ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" + "sshl v9.16b, v28.16b, v9.16b\n" + "subs x21, x21, #0x1\n" + "and v13.16b, v13.16b, v29.16b\n" + "and v28.16b, v28.16b, v29.16b\n" + "add x25, x25, #0x88\n" + "add x24, x24, #0x48\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v12.4s, v12.4h\n" + ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" + ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" + ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" + "fmul v11.4s, v21.4s, v12.s[0]\n" + "fmul v23.4s, v21.4s, v12.s[1]\n" + "fmul v17.4s, v21.4s, v12.s[2]\n" + ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" + "fmul v6.4s, v21.4s, v12.s[3]\n" + ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" + ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" + ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" + ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" + ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" + ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" + ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" + ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" + ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" + ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" + ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" + ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" + ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" + ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" + ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" + ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" + ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" + ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" + ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" + ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" + ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" + ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" + ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" + "scvtf v4.4s, v4.4s, #0x4\n" + "scvtf v1.4s, v1.4s, #0x4\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "fmla v15.4s, v4.4s, v11.4s\n" + "scvtf v30.4s, v30.4s, #0x4\n" + "fmla v19.4s, v1.4s, v23.4s\n" + "fmla v18.4s, v0.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v6.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q14, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], 
%x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [height] "r" (height), [num_blocks] "r" (num_blocks), [res_stride] "r" (res_stride), [width] "r" (width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +#endif +} + void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { #if defined(__ARM_FEATURE_MATMUL_INT8) int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); @@ -15560,78 +17103,6 @@ void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_w #endif } -void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { -#if defined(__ARM_FEATURE_MATMUL_INT8) - int rows = 2; - int64_t x0 = roundup((ith * output_channels) / nth, (int64_t)4); - int64_t xend = roundup(((ith + 1) * output_channels) / nth, (int64_t)4); - - int64_t nb = n / QK8_0; - int64_t a_nb = n / QK8_0; - - const block_q8_0x4 * b_ptr_start = vx; - const block_q8_0x2 * a_ptr_start = vy; - - for (int64_t y = 0; y < input_width / 2; y += rows / 2) { - for (int64_t x = x0 / 4; x < xend / 4; x++) { - const block_q8_0x2 * a_ptrs[rows / 2]; - - a_ptrs[0] = a_ptr_start + (y * a_nb); - - const block_q8_0x4 * b_ptr = b_ptr_start + (x * nb); - - // Master FP accumulators - float32x4_t acc_rows[rows]; - acc_rows[0] = vdupq_n_f32(0.0f); - acc_rows[1] = vdupq_n_f32(0.0f); - - for (int64_t b = 0; b < nb; b++) { - const int8x16_t rhs_mat_01_0 = vld1q_s8(b_ptr[b].qs); - const int8x16_t rhs_mat_23_0 = vld1q_s8(b_ptr[b].qs + 16); - const int8x16_t rhs_mat_01_1 = vld1q_s8(b_ptr[b].qs + 32); - const int8x16_t rhs_mat_23_1 = vld1q_s8(b_ptr[b].qs + 48); - const int8x16_t rhs_mat_01_2 = vld1q_s8(b_ptr[b].qs + 64); - const int8x16_t rhs_mat_23_2 = vld1q_s8(b_ptr[b].qs + 80); - const int8x16_t rhs_mat_01_3 = vld1q_s8(b_ptr[b].qs + 96); - const int8x16_t rhs_mat_23_3 = vld1q_s8(b_ptr[b].qs + 112); - - // Scale values - assemble the four row/column scales into a (64-bit) vector, then expand to FP32 - const float16x4_t col_scale_f16 = vld1_f16(b_ptr[b].d); - const float32x4_t col_scale_f32 = vcvt_f32_f16(col_scale_f16); - - // Process LHS in pairs of rows - int rp = 0; - const int8x16_t lhs_mat_01_0 = vld1q_s8(a_ptrs[rp][b].qs); - const int8x16_t lhs_mat_01_1 = vld1q_s8(a_ptrs[rp][b].qs + 16); - - const int8x16_t lhs_mat_01_2 = vld1q_s8(a_ptrs[rp][b].qs + 32); - const int8x16_t lhs_mat_01_3 = vld1q_s8(a_ptrs[rp][b].qs + 48); - - // Do the MMLAs into 2x2 matrices - const int32x4_t iacc_mat_00 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_01_0), lhs_mat_01_1, rhs_mat_01_1), lhs_mat_01_2, rhs_mat_01_2), lhs_mat_01_3, rhs_mat_01_3); - const int32x4_t iacc_mat_01 = - vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vmmlaq_s32(vdupq_n_s32(0), lhs_mat_01_0, rhs_mat_23_0), lhs_mat_01_1, rhs_mat_23_1), lhs_mat_01_2, rhs_mat_23_2), lhs_mat_01_3, rhs_mat_23_3); - - // Straighten out to make 2 row vectors - const int32x4_t iacc_row_0 = 
vreinterpretq_s32_u64(vtrn1q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - const int32x4_t iacc_row_1 = vreinterpretq_s32_u64(vtrn2q_u64(vreinterpretq_u64_s32(iacc_mat_00), vreinterpretq_u64_s32(iacc_mat_01))); - - const float16x4_t row_scale_f16_0 = vld1_dup_f16(&(a_ptrs[rp][b].d[0])); - const float32x4_t row_scale_f32_0 = vcvt_f32_f16(row_scale_f16_0); - const float16x4_t row_scale_f16_1 = vld1_dup_f16(&(a_ptrs[rp][b].d[1])); - const float32x4_t row_scale_f32_1 = vcvt_f32_f16(row_scale_f16_1); - - acc_rows[rp * 2] = vfmaq_f32(acc_rows[rp * 2], vcvtq_f32_s32(iacc_row_0), vmulq_f32(col_scale_f32, row_scale_f32_0)); - acc_rows[rp * 2 + 1] = vfmaq_f32(acc_rows[rp * 2 + 1], vcvtq_f32_s32(iacc_row_1), vmulq_f32(col_scale_f32, row_scale_f32_1)); - } - vst1q_f32(s + ((y * 2) * output_channels + x * 4), acc_rows[0]); - vst1q_f32(s + ((y * 2 + 1) * output_channels + x * 4), acc_rows[1]); - } - } -#endif -} - static bool validate_float(float f, size_t i) { if (isinf(f)) { fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 852263da6..61b8ce421 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -70,24 +70,6 @@ typedef struct { } block_q4_0x8; static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); -typedef struct { - ggml_fp16_t d[16]; // deltas for 16 q4_0 blocks - uint8_t qs[QK4_0 * 8]; // nibbles / quants for 16 q4_0 blocks -} block_q4_0x16; -static_assert(sizeof(block_q4_0x16) == 16 * sizeof(ggml_fp16_t) + QK4_0 * 8, "wrong q4_0x16 block size/padding"); - -typedef struct { - ggml_fp16_t d[64]; // deltas for 64 q4_0 blocks - uint8_t qs[QK4_0 * 32];// nibbles / quants for 64 q4_0 blocks -} block_q4_0x64; -static_assert(sizeof(block_q4_0x64) == 64 * sizeof(ggml_fp16_t) + QK4_0 * 32, "wrong q4_0x64 block size/padding"); - -typedef struct { - ggml_fp16_t d[2]; // deltas for 2 q8_0 blocks - int8_t qs[QK8_0 * 2]; // quants for 2 q8_0 blocks -} block_q8_0x2; -static_assert(sizeof(block_q8_0x2) == 2 * sizeof(ggml_fp16_t) + QK8_0 * 2, "wrong q8_0x2 block size/padding"); - typedef struct { ggml_fp16_t d[4]; // deltas for 4 q8_0 blocks int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks @@ -366,30 +348,34 @@ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0_aarch64(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); void iq2xs_init_impl(enum ggml_type type); void iq2xs_free_impl(enum ggml_type type); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); -block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len); -block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len); +block_q4_0x4 make_block_q4_0x4(const block_q4_0 * const in[4], unsigned int block_len, unsigned int xor_mask); +block_q4_0x8 make_block_q4_0x8(const block_q4_0 * const in[8], unsigned int block_len, unsigned int xor_mask); block_q8_0x4 
make_block_q8_0x4(const block_q8_0 * const in[4], unsigned int block_len); block_q8_0x8 make_block_q8_0x8(const block_q8_0 * const in[8], unsigned int block_len); -void quantize_row_q8_0_and_make_block_q8_0x2(const float * restrict x, void * restrict vy, int k, int rows_interleaved); -void quantize_row_q8_0_and_make_block_q8_0x4(const float * restrict x, void * restrict vy, int k, int rows_interleaved); +void quantize_row_q8_0_aarch64(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k, int nrows_interleaved, int blocklen_per_row); // GEMV -void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q8_0_q8_0_blocked8_neon(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemv_q8_0_q8_0_blocked8_sve(const int n, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); // GEMM -void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemm_q4_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); -void ggml_gemm_q8_0_q8_0_2x4blocked_mmla(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const 
void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_sve256(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_neon(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm(size_t depth, size_t output_channels, size_t height, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); +void ggml_gemm_q8_0_q8_0(const int n, int rows, int output_channels, int input_width, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int ith, int nth); #ifdef __cplusplus } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8b613a6a0..ddeda4336 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -486,192 +486,6 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif -void rearrange_q4_0_weights_blocked8_neon(struct ggml_tensor * cur) { - block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q4_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK4_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q4_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -} - -void rearrange_q4_0_weights_blocked8_sve(struct ggml_tensor * cur) { -#if defined(__ARM_FEATURE_SVE) - if (svcntw() != 8) { - printf("ggml_gemv_q4_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. 
Use Arm Neon GEMV kernels\n"); - exit(1); - } - - block_q4_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q4_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK4_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q4_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q4_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -#endif -} - -#if defined(__ARM_FEATURE_SVE) -static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_rearrange_q4_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q4_0_weights_blocked8_neon; -#endif - -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) -void rearrange_q4_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q4_0_weights_for_gemv(cur); } -#endif - -void rearrange_q4_0_weights_for_gemm(struct ggml_tensor * cur) { - block_q4_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q4_0x4 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK4_0; - - for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) { - const block_q4_0 * in_ptrs[4]; - - in_ptrs[0] = (block_q4_0 *) cur->data + (y_out * 4 * nb); - for (int i = 0; i < 3; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = - make_block_q4_0x4(in_ptrs, 8); // block_len=8 for SMMLA - out_ptr_B++; - - for (int i = 0; i < 4; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start; -} - -void rearrange_q8_0_weights_blocked8_neon(struct ggml_tensor * cur) { - block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q8_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK8_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q8_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -} - -void rearrange_q8_0_weights_blocked8_sve(struct ggml_tensor * cur) { -#if defined(__ARM_FEATURE_SVE) - if (svcntw() != 8) { - printf("ggml_gemv_q8_0_q8_0_blocked8_sve: SVE VL != 256 - aborting. 
Use Arm Neon GEMV kernels\n"); - exit(1); - } - - block_q8_0x8 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q8_0x8 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK8_0; - - for (int y_out = 0; y_out < cur->ne[1] / 8; y_out++) { - const block_q8_0 * in_ptrs[8]; - - in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 8 * nb); - for (int i = 0; i < 7; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = make_block_q8_0x8(in_ptrs, 4); // block_len=4 for SDOT - out_ptr_B++; - - for (int i = 0; i < 8; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemv = (uint8_t *) out_ptr_B_start; -#endif -} - -#if defined(__ARM_FEATURE_SVE) -static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_rearrange_q8_0_weights_for_gemv)(struct ggml_tensor *) = &rearrange_q8_0_weights_blocked8_neon; -#endif - -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) -void rearrange_q8_0_weights_for_gemv(struct ggml_tensor * cur) { _rearrange_q8_0_weights_for_gemv(cur); } -#endif - -void rearrange_q8_0_weights_for_gemm(struct ggml_tensor * cur) { - block_q8_0x4 * out_ptr_B = malloc(ggml_nbytes(cur)); // B_blocked->data; - block_q8_0x4 * out_ptr_B_start = out_ptr_B; - int64_t nb = cur->ne[0] / QK8_0; - - for (int y_out = 0; y_out < cur->ne[1] / 4; y_out++) { - const block_q8_0 * in_ptrs[4]; - - in_ptrs[0] = (block_q8_0 *) cur->data + (y_out * 4 * nb); - for (int i = 0; i < 3; i++) { - in_ptrs[i + 1] = in_ptrs[i] + nb; - } - - for (int64_t x = 0; x < nb; x++) { - *out_ptr_B = - make_block_q8_0x4(in_ptrs, 8); // block_len=8 for SMMLA - out_ptr_B++; - - for (int i = 0; i < 4; i++) { - in_ptrs[i]++; - } - } - } - cur->rearranged_weight_gemm = (uint8_t *) out_ptr_B_start; -} - // // cross-platform UTF-8 file paths // @@ -891,6 +705,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .nrows = 1, #endif + .from_float_to_mat = quantize_row_q8_0_aarch64, }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -1088,6 +903,32 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, + }, + [GGML_TYPE_Q4_0_AARCH64] = { + .type_name = "q4_0_aarch64", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif +#if defined(__ARM_FEATURE_SVE) + .gemv = ggml_gemv_q4_0_q8_0_aarch64_sve256, + .gemm = ggml_gemm_q4_0_q8_0_aarch64_sve256, +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon, + .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon, +#elif defined(__ARM_NEON) + .gemv = ggml_gemv_q4_0_q8_0_aarch64_neon_noi8mm, + .gemm = ggml_gemm_q4_0_q8_0_aarch64_neon_noi8mm, +#endif } }; @@ -2804,10 +2645,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { *s = idx; } -static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); - -static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * 
restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth); - // // data types // @@ -3391,6 +3228,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; + case GGML_FTYPE_MOSTLY_Q4_0_AARCH64: wtype = GGML_TYPE_Q4_0_AARCH64; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -3850,9 +3688,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.name =*/ { 0 }, /*.extra =*/ NULL, ///*.padding =*/ { 0 }, - /*.rearranged_weight_gemv =*/ NULL, - /*.rearranged_weight_gemm =*/ NULL, - /*.weight_rearranged =*/ false, }; #ifdef __clang__ @@ -9638,6 +9473,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -10013,6 +9849,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -10138,6 +9975,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: default: { GGML_ASSERT(false); @@ -12340,6 +12178,9 @@ static void ggml_compute_forward_mul_mat( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; + ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; + ggml_gemv_t const gemv = type_traits[type].gemv; + ggml_gemm_t const gemm = type_traits[type].gemm; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -12405,10 +12246,9 @@ UseGgmlGemm1:; } } } -#if defined(__ARM_FEATURE_MATMUL_INT8) - if ((src0->weight_rearranged == true) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { + if ((type == GGML_TYPE_Q4_0_AARCH64) && (ne11 >= 4) && (ne12 == 1) && (ne13 == 1)) { for (int64_t i11 = 0; i11 < ne11 / 4; ++i11) { - quantize_row_q8_0_and_make_block_q8_0x4((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4); + from_float_to_mat((float *)((char *) src1->data + i11 * 4 * nb11), (void *) wdata, ne10, 4, ggml_cpu_has_matmul_int8() ? 8 : 4); wdata += row_size * 4; } for (int64_t i11 = (ne11 / 4) * 4; i11 < ne11; ++i11) { @@ -12416,10 +12256,7 @@ UseGgmlGemm1:; wdata += row_size; } } -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) else { -#endif for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -12428,9 +12265,7 @@ UseGgmlGemm1:; } } } -#if defined(__ARM_FEATURE_MATMUL_INT8) } -#endif if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. @@ -12509,114 +12344,50 @@ UseGgmlGemm2:; //if (ith == 0) // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. 
Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); -#if defined(__ARM_FEATURE_MATMUL_INT8) && (defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE)) - if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (src0->weight_rearranged == true)) { - if (src0->type == GGML_TYPE_Q4_0) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels - } else if (src0->type == GGML_TYPE_Q8_0) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->rearranged_weight_gemv, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels - } + if ((ggml_n_dims(src0) == 2) && (ne11 == 1) && (type == GGML_TYPE_Q4_0_AARCH64)) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data), (const char *) src0->data, (const char *) wdata, ith, nth); // use Arm Neon/SVE GEMV kernels } - else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (src0->weight_rearranged == true)) { + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 16) && (type == GGML_TYPE_Q4_0_AARCH64)) { // use batch-sized 16, 8, and 4 GEMM kernels - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 16) * 16; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; - for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 16, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 16) * 16; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; - for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } + for (int row_iter = 0; row_iter < ne11 / 16; row_iter++) { + gemm(ne00, ne01, 16, (float *)((char *) dst->data + (row_iter * 16 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 16) * row_size : (row_iter * 16 * nb11)), ith, nth); } - } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (src0->weight_rearranged == true)) { + int rows_processed = (ne11 / 16) * 16; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 8; row_iter++) { + gemm(ne00, ne01, 8, (float *)((char *) dst->data + ((rows_processed + row_iter * 8) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 8) * row_size : ((rows_processed + row_iter * 8) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + } + rows_processed = rows_processed + ((ne11 - rows_processed) / 4) * 4; + for (int row_iter = rows_processed; row_iter < ne11; row_iter++) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 8) && (type == GGML_TYPE_Q4_0_AARCH64)) { // use batch-sized 8, and 4 GEMM kernels - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 8, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); - } - int rows_processed = (ne11 / 8) * 8; - for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->rearranged_weight_gemm, - (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); - } - for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } + for (int row_iter = 0; row_iter < ne11 / 8; row_iter++) { + gemm(ne00, ne01, 8, (float *)((char *) dst->data + (row_iter * 8 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 8) * row_size : (row_iter * 8 * nb11)), ith, nth); } - } else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (src0->weight_rearranged == true)) { + int rows_processed = (ne11 / 8) * 8; + for (int row_iter = 0; row_iter < (ne11 - rows_processed) / 4; row_iter++) { + gemm(ne00, ne01, 4, (float *)((char *) dst->data + ((rows_processed + row_iter * 4) * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (rows_processed + row_iter * 4) * row_size : ((rows_processed + row_iter * 4) * nb11)), ith, nth); + } + for (int row_iter = ((ne11 / 8) * 8) + ((ne11 - rows_processed) / 4 * 4); row_iter < ne11; row_iter++) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? 
(row_iter)*row_size : (row_iter * nb11)), ith, nth); + } + } + else if ((ggml_n_dims(src0) == 2) && (ne11 >= 4) && (type == GGML_TYPE_Q4_0_AARCH64)) { // use batch-sized 4 GEMM kernel - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { - ggml_gemm_q4_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); - } - for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { - ggml_gemm_q8_0_q8_0(ne00, 4, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->rearranged_weight_gemm, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); - } - for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } + for (int row_iter = 0; row_iter < ne11 / 4; row_iter++) { + gemm(ne00, ne01, 4, (float *)((char *) dst->data + (row_iter * 4 * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter * 4) * row_size : (row_iter * 4 * nb11)), ith, nth); + } + for (int row_iter = (ne11 / 4) * 4; row_iter < ne11; row_iter++) { + gemv(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->data, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); } } -#elif defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) - if ((ggml_n_dims(src0) == 2) && (src0->weight_rearranged == true)) { - if (src0->type == GGML_TYPE_Q4_0) { - for (int row_iter = 0; row_iter < ne11; row_iter++) { - ggml_gemv_q4_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } else if (src0->type == GGML_TYPE_Q8_0) { - for (int row_iter = 0; row_iter < ne11; row_iter++) { - ggml_gemv_q8_0_q8_0(ne00, ne01, 1, (float *)((char *) dst->data + (row_iter * nb1)), (const char *) src0->rearranged_weight_gemv, (const char *) wdata + (src1_cont || src1->type != vec_dot_type ? (row_iter)*row_size : (row_iter * nb11)), ith, nth); - } - } - } -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) else { -#endif // The first chunk comes from our thread_id, the rest will get auto-assigned. 
int current_chunk = ith; @@ -12638,9 +12409,7 @@ UseGgmlGemm2:; current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1); } -#if defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) } -#endif } // ggml_compute_forward_mul_mat_id @@ -13051,6 +12820,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -13236,6 +13006,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: default: { GGML_ASSERT(false); @@ -13495,6 +13266,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_Q4_0_AARCH64: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -14081,6 +13853,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: + case GGML_TYPE_Q4_0_AARCH64: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -20804,6 +20577,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0_AARCH64: result = quantize_q4_0_aarch64(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); @@ -22238,26 +22012,12 @@ int ggml_cpu_has_matmul_int8(void) { #endif } +int ggml_cpu_has_sve(void) { #if defined(__ARM_FEATURE_SVE) -static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_ggml_gemv_q4_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q4_0_q8_0_blocked8_neon; + return 1; +#else + return 0; #endif - -#if defined(__ARM_FEATURE_SVE) -static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_sve; -#elif defined(__ARM_NEON) -static void (*_ggml_gemv_q8_0_q8_0)(const int, int, int, float * restrict, const void * restrict, const void * restrict, int, int) = &ggml_gemv_q8_0_q8_0_blocked8_neon; #endif - -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) -static void ggml_gemv_q4_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { - _ggml_gemv_q4_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth); } -static void ggml_gemv_q8_0_q8_0(const int n, int output_channels, int input_width, float * restrict s, const void * restrict vx, const void * restrict vy, int ith, int nth) { - _ggml_gemv_q8_0_q8_0(n, output_channels, input_width, s, vx, vy, ith, nth); -} -#endif - //////////////////////////////////////////////////////////////////////////////// diff --git a/include/llama.h b/include/llama.h index bb4b05ba6..bd108ec69 100644 --- a/include/llama.h +++ b/include/llama.h @@ -162,6 +162,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64 = 33, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 7aecda2f5..ff7631054 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3783,6 +3783,7 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; + case GGML_TYPE_Q4_0_AARCH64: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -4359,32 +4360,6 @@ struct llama_model_loader { } } -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) - if ((cur->type == GGML_TYPE_Q4_0) && (cur->ne[1] % 4 == 0)) { - cur->weight_rearranged = true; -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) - rearrange_q4_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) - rearrange_q4_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels -#endif - } - else if ((cur->type == GGML_TYPE_Q8_0) && (cur->ne[1] % 4 == 0)) { - cur->weight_rearranged = true; -#if defined(__ARM_NEON) || defined(__ARM_FEATURE_SVE) - rearrange_q8_0_weights_for_gemv(cur); // rearrange weights for Arm Neon/SVE GEMV kernels -#endif -#if defined(__ARM_FEATURE_MATMUL_INT8) - rearrange_q8_0_weights_for_gemm(cur); // rearrange weights for GEMM MMLA kernels -#endif - } - else { - cur->weight_rearranged = false; - } -#else - cur->weight_rearranged = false; -#endif - size_done += n_size; } @@ -4502,6 +4477,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: return "Q4_0_AARCH64"; default: return "unknown, may not work"; } @@ -17787,6 +17763,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } + else if (new_type == GGML_TYPE_Q4_0_AARCH64) { + new_type = GGML_TYPE_Q4_0; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -18099,6 +18078,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q4_0_AARCH64: default_type = GGML_TYPE_Q4_0_AARCH64; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -18409,6 +18389,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } + if (new_type == GGML_TYPE_Q4_0_AARCH64) { + if ((ggml_cpu_has_neon() == 0) && (ggml_cpu_has_sve() == 0)) new_type = GGML_TYPE_Q4_0; + if ((nelements / tensor->ne[0]) % 4 != 0) 
new_type = GGML_TYPE_Q4_0; + if (nthread > 1) nthread = 1; + } + LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); @@ -21702,6 +21688,7 @@ const char * llama_print_system_info(void) { #else s += "LLAMAFILE = 0 | "; #endif + s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; return s.c_str(); }
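
Note: the sketch below is illustrative only (not part of the patch). It shows the row-tiling policy used by the ggml_compute_forward_mul_mat changes above: activation rows are handed to the widest available GEMM kernel first (16, then 8, then 4 rows), and any leftover rows fall back to the GEMV kernel. tile_rows and the demo_* callbacks are stand-in names, not ggml symbols.

#include <stdio.h>

typedef void (*gemm_fn)(int first_row, int nrows);
typedef void (*gemv_fn)(int row);

static void demo_gemm(int first_row, int nrows) { printf("GEMM rows [%d..%d)\n", first_row, first_row + nrows); }
static void demo_gemv(int row)                  { printf("GEMV row  %d\n", row); }

/* largest-first tiling: mirrors the 16 -> 8 -> 4 cascade in the patch,
 * with a per-row GEMV tail for whatever does not fill a 4-row tile */
static void tile_rows(int ne11, gemm_fn gemm, gemv_fn gemv) {
    static const int tiles[] = { 16, 8, 4 };
    int done = 0;
    for (int t = 0; t < 3; t++) {
        while (ne11 - done >= tiles[t]) {
            gemm(done, tiles[t]);
            done += tiles[t];
        }
    }
    for (; done < ne11; done++) {
        gemv(done);
    }
}

int main(void) {
    tile_rows(27, demo_gemm, demo_gemv); /* 1x16 tile + 1x8 tile + 3 GEMV calls */
    return 0;
}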
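
Note: the ggml changes replace the #ifdef-selected _ggml_gemv_* function pointers (deleted above) with gemv/gemm entries in the per-type trait table. The sketch below uses stand-in names rather than the real ggml_type_traits_t layout; it only shows the dispatch pattern, where a NULL entry means "no optimized kernel" and callers branch on the pointer.

#include <stddef.h>
#include <stdio.h>

typedef void (*gemv_t)(size_t depth, size_t out_ch, size_t height);
typedef void (*gemm_t)(size_t depth, size_t out_ch, size_t height);

struct traits {
    const char *type_name;
    gemv_t      gemv;   /* NULL when the type has no optimized kernel */
    gemm_t      gemm;
};

static void q4_0_aarch64_gemv(size_t d, size_t oc, size_t h) { printf("gemv %zu x %zu x %zu\n", d, oc, h); }
static void q4_0_aarch64_gemm(size_t d, size_t oc, size_t h) { printf("gemm %zu x %zu x %zu\n", d, oc, h); }

static const struct traits trait_table[] = {
    { "q4_0",         NULL,              NULL              },
    { "q4_0_aarch64", q4_0_aarch64_gemv, q4_0_aarch64_gemm },
};

int main(void) {
    const struct traits *tt = &trait_table[1];
    if (tt->gemm) tt->gemm(4096, 4096, 16); /* batched prompt path */
    if (tt->gemv) tt->gemv(4096, 4096, 1);  /* single-token path   */
    return 0;
}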
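
Note: a minimal sketch of the quantize-time fallback added in llama_model_quantize_internal above, using stand-in enum values and CPU-feature helpers rather than the real ggml API. It captures the rule in the hunk: Q4_0_AARCH64 degrades to plain Q4_0 when neither NEON nor SVE is available or when the tensor's row count is not a multiple of 4, and the conversion runs single-threaded.

#include <stdint.h>

enum demo_type { DEMO_Q4_0, DEMO_Q4_0_AARCH64 };

/* stand-ins for ggml_cpu_has_neon() / ggml_cpu_has_sve() */
static int demo_cpu_has_neon(void) { return 1; }
static int demo_cpu_has_sve (void) { return 0; }

static enum demo_type pick_output_type(enum demo_type requested, int64_t nelements, int64_t ne0, int *nthread) {
    enum demo_type t = requested;
    if (t == DEMO_Q4_0_AARCH64) {
        /* no Arm SIMD available at all -> keep plain Q4_0 */
        if (demo_cpu_has_neon() == 0 && demo_cpu_has_sve() == 0) t = DEMO_Q4_0;
        /* the interleaved layout packs rows in groups of 4 (or 8),
         * so the number of rows must be a multiple of 4 */
        if ((nelements / ne0) % 4 != 0) t = DEMO_Q4_0;
        /* the rearranging quantizer runs single-threaded in this patch */
        if (*nthread > 1) *nthread = 1;
    }
    return t;
}

int main(void) {
    int nthread = 8;
    /* e.g. a 4096 x 4096 weight: stays Q4_0_AARCH64, nthread drops to 1 */
    (void) pick_output_type(DEMO_Q4_0_AARCH64, 4096LL * 4096LL, 4096, &nthread);
    return 0;
}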