Fix the AARCH64 build

2025-06-30 08:18:30 +00:00 · 2023-05-13 08:19:44 -07:00 · 2023-05-13 08:19:44 -07:00 · 410c8785c9
commit 410c8785c9
parent 5a4cf9560f
4 changed files with 22 additions and 19 deletions
--- a/third_party/ggml/fp16.c
+++ b/third_party/ggml/fp16.c
@@ -45,8 +45,10 @@ asm(".include \"libc/disclaimer.inc\"");
 #define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
 #define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
 #define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
-// precomputed tables for expanding 8bits to 8 bytes (shl 4)
-const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) };
+
+// precomputed tables for expanding 8bits to 8 bytes:
+const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
+const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif

 //
--- a/third_party/ggml/fp16.internal.h
+++ b/third_party/ggml/fp16.internal.h
@@ -13,7 +13,8 @@ extern ggml_fp16_t table_silu_f16[1 << 16];
 extern ggml_fp16_t table_exp_f16[1 << 16];
 extern float table_f32_f16[1 << 16];
 #if defined(__ARM_NEON) || defined(__wasm_simd128__)
-extern const uint64_t table_b2b_u[1 << 8];
+extern const uint64_t table_b2b_0[1 << 8];
+extern const uint64_t table_b2b_1[1 << 8];
 #endif

 inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t* y,
--- a/third_party/ggml/ggjt.v1.q5_0.c
+++ b/third_party/ggml/ggjt.v1.q5_0.c
@@ -179,10 +179,10 @@ void ggml_vec_dot_v1_q5_0_q8_0(const int n, float * restrict s, const void * res
        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];

        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
@@ -241,10 +241,10 @@ void ggml_vec_dot_v1_q5_0_q8_0(const int n, float * restrict s, const void * res
        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];

        const v128_t qhl = wasm_v128_load(tmp + 0);
        const v128_t qhh = wasm_v128_load(tmp + 2);
--- a/third_party/ggml/ggjt.v1.q5_1.c
+++ b/third_party/ggml/ggjt.v1.q5_1.c
@@ -172,10 +172,10 @@ void ggml_vec_dot_v1_q5_1_q8_1(const int n, float * restrict s, const void * res
        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];

        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
@@ -237,10 +237,10 @@ void ggml_vec_dot_v1_q5_1_q8_1(const int n, float * restrict s, const void * res
        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];

        const v128_t qhl = wasm_v128_load(tmp + 0);
        const v128_t qhh = wasm_v128_load(tmp + 2);