diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c
index 4999f6ca0..db82653b4 100644
--- a/ggml-phi-knc-dot_q5_K_q8_K.c
+++ b/ggml-phi-knc-dot_q5_K_q8_K.c
@@ -55,6 +55,14 @@ float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src)
     return f32[0];
 }
 
+/* Convert many FP16s to FP32s. */
+void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        y[i] = GGML_PHI_FP16_TO_FP32(x[i]);
+    }
+}
+
 /* Convert a FP32 to a FP16. */
 ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src)
 {
@@ -74,6 +82,13 @@ ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src)
     return f16[0];
 }
 
+/* Convert many FP32s to FP16s. */
+void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        y[i] = GGML_PHI_FP32_TO_FP16(x[i]);
+    }
+}
 
 // This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16.
 // It loops 8 times. Well, actually four, with an unroll.
diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h
index bd4d814ae..efc629a8a 100644
--- a/ggml-phi-knc-dot_q5_K_q8_K.h
+++ b/ggml-phi-knc-dot_q5_K_q8_K.h
@@ -20,12 +20,17 @@ extern "C"
   typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16)));
   typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64)));
 
-  // Zero out a vector of Floats
+  // Zero out a vector of 16 Floats.
   void GGML_F32x16_VEC_ZERO(float32x16_t *target);
   // Convert an FP16 value to FP32(Float).
   float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src);
+  // Convert a set of FP16 values to FP32(Float).
+  void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n);
   // Convert an FP32(Float) value to FP16.
   ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src);
+  // Convert a set of FP32(Float) values to FP16.
+  void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n);
+
   // Create a 5 bit int vector from a 4 bit vector and a 1 bit vector, both in packed forms.
   void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst);
   // Multiply a Q5 and Q8 vector against each other, with some scaling.
diff --git a/ggml.c b/ggml.c
index 3c9ce17cd..daab6bee9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -53,6 +53,7 @@
 // hand assembled replacement functions are cool.
 #if defined(__k1om__)
 #include <ggml-phi-knc.h>
+#include <ggml-phi-knc-dot_q5_K_q8_K.h>
 #endif
 
 #if defined(_WIN32)
@@ -341,6 +342,21 @@ GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
     return "GGML status: unknown";
 }
 
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+#if defined(__k1om__)
+
+#define ggml_fp16_to_fp32 GGML_PHI_FP16_TO_FP32
+#define ggml_fp32_to_fp16 GGML_PHI_FP32_TO_FP16
+#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW
+#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW
+
+#define ggml_fp16_to_fp32 GGML_PHI_FP16_TO_FP32
+#define ggml_fp32_to_fp16 GGML_PHI_FP32_TO_FP16
+#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW
+#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW
+
+#else
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
 #define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
     return GGML_FP16_TO_FP32(x);
@@ -386,6 +402,8 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
     }
 }
 
+#endif /* defined(__k1om__) */
+
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX512F__)
diff --git a/llama.cpp b/llama.cpp
index 407d9816e..ab099e409 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7,6 +7,15 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+// hand assembled replacement functions are cool.
+#if defined(__k1om__)
+#include "ggml-phi-knc-dot_q5_K_q8_K.h"
+
+#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW
+#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW
+
+#endif /* defined(__k1om__) */
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
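
As a usage sketch, not part of the patch: on a __k1om__ (Xeon Phi) build the new row helpers convert whole arrays element by element, so a round trip through FP16 looks like the following. The file name and test values are illustrative, and it is assumed that ggml.h supplies the ggml_fp16_t typedef used by the declarations in ggml-phi-knc-dot_q5_K_q8_K.h.

/* fp16_row_roundtrip.c -- illustrative sketch only, not part of the patch. */
#include <stdio.h>
#include "ggml.h"                        /* assumed to provide ggml_fp16_t  */
#include "ggml-phi-knc-dot_q5_K_q8_K.h"  /* GGML_PHI_*_ROW declarations     */

int main(void)
{
    float       src[4]  = { 0.5f, -1.25f, 3.0f, 100.0f };
    ggml_fp16_t half[4];
    float       back[4];

    GGML_PHI_FP32_TO_FP16_ROW(src, half, 4);  /* FP32 -> FP16, one element at a time */
    GGML_PHI_FP16_TO_FP32_ROW(half, back, 4); /* FP16 -> FP32 */

    for (int i = 0; i < 4; i++) {
        printf("%f -> %f\n", src[i], back[i]);
    }
    return 0;
}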