From ba2040d1df9ca5fbcea3b4d8ad12a637b5ed1f99 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 3 Aug 2023 12:52:06 +0800
Subject: [PATCH] compile fix for ARM NEON

---
 koboldcpp.py        |  6 ++--
 otherarch/ggml_v1.c | 86 ---------------------------------------------
 2 files changed, 3 insertions(+), 89 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 1ba082d92..131ae2122 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -704,7 +704,7 @@ def show_new_gui():
         (lib_default, "Use No BLAS"),
         (lib_noavx2, "NoAVX2 Mode (Old CPU)"),
         (lib_failsafe, "Failsafe Mode (Old CPU)")]
-    openblas_option, clblast_option, cublas_option, default_option, openblas_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
+    openblas_option, clblast_option, cublas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
     # slider data
     blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024"]
     blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024"]
@@ -1151,8 +1151,8 @@ def show_new_gui():
             if failsafe_option is not None:
                 runopts_var.set(failsafe_option)
         elif "noavx2" in dict and dict["noavx2"]:
-            if openblas_noavx2_option is not None:
-                runopts_var.set(openblas_noavx2_option)
+            if noavx2_option is not None:
+                runopts_var.set(noavx2_option)
         elif "noblas" in dict and dict["noblas"]:
             if default_option is not None:
                 runopts_var.set(default_option)
diff --git a/otherarch/ggml_v1.c b/otherarch/ggml_v1.c
index 04e981a26..bfa0fe709 100644
--- a/otherarch/ggml_v1.c
+++ b/otherarch/ggml_v1.c
@@ -1577,63 +1577,6 @@ inline static void ggml_v1_vec_mad_q4_0(const int n, float * restrict y, void *
     const float   * restrict pd = (const float *)   (x);
     const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
 
-#if __ARM_NEON
-#if QK == 32
-    for (int i = 0; i < nb; ++i) {
-        const float d0 = pd[i]*v;
-
-        const uint8_t * restrict pp = pb + i*16;
-
-        const uint8x8_t m4b = vdup_n_u8(0xf);
-        const int8x8_t  s8b = vdup_n_s8(0x8);
-
-        const float32x4_t vd = vdupq_n_f32(d0);
-
-        for (int j = 0; j < 2; j++) {
-            const uint8x8_t vx = vld1_u8(pp + j*8);
-
-            const int8x8_t vxl = vreinterpret_s8_u8(vand_u8(vx, m4b));
-            const int8x8_t vxh = vreinterpret_s8_u8(vshr_n_u8(vx, 4));
-
-            // sub 8
-            const int8x8_t vxls = vsub_s8(vxl, s8b);
-            const int8x8_t vxhs = vsub_s8(vxh, s8b);
-
-            //const int8x8_t vxlt = vzip_s8(vxls, vxhs)[0];
-            //const int8x8_t vxht = vzip_s8(vxls, vxhs)[1];
-            const int8x8_t vxlt = vzip1_s8(vxls, vxhs);
-            const int8x8_t vxht = vzip2_s8(vxls, vxhs);
-
-            const int8x16_t vxq = vcombine_s8(vxlt, vxht);
-
-            // convert to 2x int16x8_t
-            const int16x8_t vxq0 = vmovl_s8(vget_low_s8 (vxq));
-            const int16x8_t vxq1 = vmovl_s8(vget_high_s8(vxq));
-
-            // convert to 4x float32x4_t
-            const float32x4_t vx0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq0)));
-            const float32x4_t vx1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq0)));
-            const float32x4_t vx2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq1)));
-            const float32x4_t vx3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq1)));
-
-            const float32x4_t vy0 = vld1q_f32(y + i*32 + j*16 + 0);
-            const float32x4_t vy1 = vld1q_f32(y + i*32 + j*16 + 4);
-            const float32x4_t vy2 = vld1q_f32(y + i*32 + j*16 + 8);
-            const float32x4_t vy3 = vld1q_f32(y + i*32 + j*16 + 12);
-
-            const float32x4_t vr0 = vfmaq_f32(vy0, vx0, vd);
-            const float32x4_t vr1 = vfmaq_f32(vy1, vx1, vd);
-            const float32x4_t vr2 = vfmaq_f32(vy2, vx2, vd);
-            const float32x4_t vr3 = vfmaq_f32(vy3, vx3, vd);
-
-            vst1q_f32(y + i*32 + j*16 + 0,  vr0);
-            vst1q_f32(y + i*32 + j*16 + 4,  vr1);
-            vst1q_f32(y + i*32 + j*16 + 8,  vr2);
-            vst1q_f32(y + i*32 + j*16 + 12, vr3);
-        }
-    }
-#endif
-#else
     // scalar
     for (int i = 0; i < nb; i++) {
         const float d = pd[i];
@@ -1658,7 +1601,6 @@ inline static void ggml_v1_vec_mad_q4_0(const int n, float * restrict y, void *
             assert(!isinf(y[i*QK + l + 1]));
         }
     }
-#endif
 }
 
 inline static void ggml_v1_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) {
@@ -10199,31 +10141,3 @@ int ggml_v1_cpu_has_vsx(void) {
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
-
-#if __ARM_NEON
-#if !defined(__aarch64__)
-int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
-    int8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
-    int8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-#endif
-#endif
\ No newline at end of file