From 429d69fd2284594b0b7f2e7c53a9f3eac9987473 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 13 Mar 2024 19:18:10 +0000 Subject: [PATCH] try to implement one intrinsic --- ggml.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index aeb01e711..559030483 100644 --- a/ggml.c +++ b/ggml.c @@ -876,7 +876,38 @@ inline static float vaddvq_f32(float32x4_t v) { // number of elements to fit in a single register // -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) + +#if defined(__k1om__) /* Xeon PHI Knights Corner (IMCI) */ + +// No, we have an SIMD unit. +// #define GGML_SIMD + +// This SIMD unit can work with 32 float32s at once. +#define GGML_F32_STEP 32 +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +// because we are not defining GGML_SIMD, we have to do this ourself. +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) + +// our vector. 128*32=512 +typedef float32_t float32x16_t __attribute__((vector_size (128))); +#define GGML_F32x16 float32x16_t +#define GGML_F32x16_ZERO \ + { \ + __mmask16 mask=0xFFFF; \ + float32x16_t res; \ + asm ("vbroadcastf32x4 [RES] {[M]}, 0[%2]" \ + : [RES] "=x"(res) \ + : [M] "k" mask, \ + [V] "r" 0.0f) \ + return res; \ + } +//vdupq_n_f32(0.0f) + +#define GGML_F32_VEC GGML_F32x16 + +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD @@ -1498,6 +1529,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1530,6 +1562,17 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * for (int i = np; i < n; ++i) { sumf += x[i]*y[i]; } +#elif defined(__k1om__) + // our result, in the end. + float sumf = 0.0f; + // the number of vector-sized steps we will need to do. + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + for (int i = 0; i < 16; ++i) { + fprintf(stderr, "boo: %f\n",sum[0]); + } + #else // scalar ggml_float sumf = 0.0;