Add batch FP16<->FP32 conversion functions.
This commit is contained in:
parent
1c2fdc3412
commit
9fa06f4767
4 changed files with 41 additions and 1 deletions
|
@ -55,6 +55,14 @@ float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src)
|
|||
return f32[0];
|
||||
}
|
||||
|
||||
/* Convert many FP16s to FP32s. */
|
||||
/* Batch-convert n FP16 values starting at x into FP32 values starting at y. */
/* NOTE(review): the count is `int` here while ggml.c's row API uses int64_t — confirm no truncation for very large rows. */
void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n)
{
    const ggml_fp16_t *src = x;
    float *dst = y;

    /* Convert element by element via the scalar helper. */
    for (int remaining = n; remaining > 0; remaining--) {
        *dst++ = GGML_PHI_FP16_TO_FP32(*src++);
    }
}
|
||||
|
||||
/* Convert a FP32 to a FP16. */
|
||||
ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src)
|
||||
{
|
||||
|
@ -74,6 +82,13 @@ ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src)
|
|||
return f16[0];
|
||||
}
|
||||
|
||||
/* Convert many FP32s to FP16s. */
|
||||
/* Batch-convert n FP32 values starting at x into FP16 values starting at y. */
/* NOTE(review): the count is `int` here while ggml.c's row API uses int64_t — confirm no truncation for very large rows. */
void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n)
{
    const float *src = x;
    ggml_fp16_t *dst = y;

    /* Convert element by element via the scalar helper. */
    for (int remaining = n; remaining > 0; remaining--) {
        *dst++ = GGML_PHI_FP32_TO_FP16(*src++);
    }
}
|
||||
|
||||
// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16.
|
||||
// It loops 8 times. Well, actually four, with an unroll.
|
||||
|
|
|
@ -20,12 +20,17 @@ extern "C"
|
|||
typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16)));
|
||||
typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64)));
|
||||
|
||||
// Zero out a vector of Floats
|
||||
// Zero out a vector of 16 Floats.
|
||||
void GGML_F32x16_VEC_ZERO(float32x16_t *target);
|
||||
// Convert an FP16 value to FP32(Float).
|
||||
float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src);
|
||||
// Convert a set of FP16 values to FP32(Float).
|
||||
void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n);
|
||||
// Convert an FP32(Float) value to FP16.
|
||||
ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src);
|
||||
// Convert a set of FP32(Float) values to FP16.
|
||||
void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n);
|
||||
|
||||
// Create a 5 bit int vector from a 4 bit vector and a 1 bit vector, both in packed forms.
|
||||
void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst);
|
||||
// Multiply a Q5 and Q8 vector against each other, with some scaling.
|
||||
|
|
11
ggml.c
11
ggml.c
|
@ -45,6 +45,7 @@
|
|||
// hand assembled replacement functions are cool.
|
||||
#if defined(__k1om__)
|
||||
#include <ggml-phi-knc.h>
|
||||
#include <ggml-phi-knc-dot_q5_K_q8_K.h>
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
@ -335,6 +336,14 @@ const char * ggml_status_to_string(enum ggml_status status) {
|
|||
|
||||
// note: do not use these inside ggml.c
|
||||
// these are meant to be used via the ggml.h API
|
||||
#if defined(__k1om__)
|
||||
|
||||
#define ggml_fp16_to_fp32 GGML_PHI_FP16_TO_FP32
|
||||
#define ggml_fp32_to_fp16 GGML_PHI_FP32_TO_FP16
|
||||
#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW
|
||||
#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW
|
||||
|
||||
#else
|
||||
// Scalar FP16 -> FP32 conversion for the public ggml.h API (generic, non-k1om path);
// delegates to the GGML_FP16_TO_FP32 conversion macro.
float ggml_fp16_to_fp32(ggml_fp16_t x) {
    const float value = GGML_FP16_TO_FP32(x);
    return value;
}
|
||||
|
@ -368,6 +377,8 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
#endif /* defined(__k1om__) */
|
||||
|
||||
bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
|
||||
return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
|
||||
}
|
||||
|
|
|
@ -7,6 +7,15 @@
|
|||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
// hand assembled replacement functions are cool.
|
||||
#if defined(__k1om__)
|
||||
#include "ggml-phi-knc-dot_q5_K_q8_K.h"
|
||||
|
||||
#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW
|
||||
#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue