From dc1c5ae7ecc9bdceece7720ba894aa0868b1ff0e Mon Sep 17 00:00:00 2001
From: Sebastian Apel <13675545+SebastianApel@users.noreply.github.com>
Date: Mon, 3 Apr 2023 13:49:15 +0200
Subject: [PATCH] Experimental code that achieves 30k FLOPS

---
 ggml.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 59e84ab45..a2ed6bca0 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2188,6 +2188,162 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     *s = sumf;
 }
 
+static void seap_ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy, const int tilesize_x, const int tilesize_y, const int rowlength, const int dst_stridelength) {
+    const int nb = n / QK;
+
+    assert(n % QK == 0);
+    assert(nb % 2 == 0);
+
+    const block_q4_0 * restrict x = vx;
+    const block_q4_0 * restrict y = vy;
+
+    float sumf = 0.0;
+
+
+//#if defined(__AVX2__)
+#if 1
+
+#define SEAP_TILESIZE_X 1
+#define SEAP_TILESIZE_Y 8
+#define UNROLL_COUNT 8/SEAP_TILESIZE_Y
+#undef SEAP_DEBUG
+
+    // Initialize accumulator with zeros
+    __m256 acc[SEAP_TILESIZE_Y]; // = 0; // _mm256_setzero_ps();
+    for (int i=0;i<SEAP_TILESIZE_Y;i++) {
+        acc[i] = _mm256_setzero_ps();
+    }
+
+    [...]    /* the loops over the blocks (i), the unroll factor (u) and the tile
+                rows (t) open here; the x nibbles are expanded into x_high_q/x_low_q,
+                the per-tile scale[t] factors are set up, and a SEAP_DEBUG printf
+                reports the block scale (...->d) */
+#endif
+
+            /* get input from y
+               Input: 32 Nibbles (16 bytes) at *y[i+u]
+               Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+            __m256i y_high_q[SEAP_TILESIZE_Y];
+            __m256i y_low_q[SEAP_TILESIZE_Y];
+
+            EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(y_high_q, y_low_q, y[i+u+t*rowlength].qs,t)
+
+            /* Compute products of int16_t integers, add pairwise, store as int32_t */
+            __m256i xy_high_q[SEAP_TILESIZE_Y];
+            xy_high_q[t] = _mm256_madd_epi16( x_high_q[0], y_high_q[t] );
+            __m256i xy_low_q[SEAP_TILESIZE_Y];
+            xy_low_q[t] = _mm256_madd_epi16( x_low_q[0], y_low_q[t] );
+
+            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */
+            __m256i xy_q[SEAP_TILESIZE_Y];
+            xy_q[t] = _mm256_add_epi32( xy_high_q[t], xy_low_q[t] );
+
+            /* Convert the vector of 8 int32_t to 8 floats */
+            __m256 q[SEAP_TILESIZE_Y];
+            q[t] = _mm256_cvtepi32_ps( xy_q[t] );
+
+            /* Multiply q with scale and accumulate */
+            acc[t] = _mm256_fmadd_ps( scale[t], q[t], acc[t] );
+
+        }
+
+    }
+
+    }
+
+    for (int t=0;t<SEAP_TILESIZE_Y;t++) {
+        [...]    /* horizontal sum of acc[t] into sumf, written to s[t*dst_stridelength] */
+    }
+
+#else
+    // scalar
+    for (int i = 0; i < nb; i++) {
+        const float d0 = x[i].d;
+        const float d1 = y[i].d;
+
+        const uint8_t * restrict p0 = x[i].qs;
+        const uint8_t * restrict p1 = y[i].qs;
+
+        for (int j = 0; j < QK/2; j++) {
+            const uint8_t v0 = p0[j];
+            const uint8_t v1 = p1[j];
+
+            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
+            const float f1 = d0*((int8_t) (v0 >> 4) - 8);
+
+            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
+            const float f3 = d1*((int8_t) (v1 >> 4) - 8);
+
+            sumf += f0*f2 + f1*f3;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
 static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK;
 
@@ -6718,9 +6874,43 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
         assert(ne00 % 32 == 0);
 
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+        if (ne11 < SEAP_TILESIZE_Y) {
+            // existing, non-tiled implementation
+            for (int64_t ic = 0; ic < ne11; ++ic) {
+                vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+            }
+        } else {
+            // tiled implementation
+            if ((ne11 % SEAP_TILESIZE_Y) != 0) {
+                printf("ne11=%lld\n", (long long) ne11);
+            }
+            assert((ne11 % SEAP_TILESIZE_Y) == 0); // make sure we have a multiple of the tilesize
+
+            for (int64_t ic = 0; ic < ne11; ic+=SEAP_TILESIZE_Y) {
+                //vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+
+                #ifdef SEAP_DEBUG
+                for (int t=0;t<SEAP_TILESIZE_Y;t++) {
+                    [...]    /* printf of the per-tile y block scale (...->d) */
+                }
+                #endif
+
+                seap_ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size), SEAP_TILESIZE_X, SEAP_TILESIZE_Y,
+                    row_size/GGML_TYPE_SIZE[type], ne0);
+
+                #ifdef SEAP_DEBUG
+                for (int t=0;t<SEAP_TILESIZE_Y;t++) {
+                    [...]    /* printf of the per-tile dst value */
+                }
+                if ([...] >=3) exit(0);    /* debug early exit */
+                #endif
+            }
+
+        }
+    }
 
     //int64_t t1 = ggml_time_us();
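
Note on the approach (reviewer sketch, not part of the patch): seap_ggml_vec_dot_q4_0 computes SEAP_TILESIZE_Y dot products per call, so the expansion of the x nibbles into 16-bit lanes is done once and reused across 8 columns of src1 instead of being redone for every output element; the per-column partial sums live in the acc[] vectors and are written out with the dst stride. Below is a minimal scalar sketch of that 1x8 tiling idea under stated assumptions: the block layout mirrors ggml's QK=32 q4_0 blocks, but the names q4_block, unpack_block, tiled_vec_dot and the dummy data in main are illustrative only and appear neither in ggml.c nor in this patch.

/* Scalar sketch of 1 x SEAP_TILESIZE_Y tiling (hypothetical names, not ggml API). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define QK     32   /* values per quantization block, as in ggml's q4_0 */
#define TILE_Y  8   /* plays the role of SEAP_TILESIZE_Y */

typedef struct {
    float   d;          /* per-block scale */
    uint8_t qs[QK / 2]; /* 32 4-bit values, two per byte */
} q4_block;

/* Decode one block into 32 signed integers in [-8, 7]. */
static void unpack_block(const q4_block *b, int8_t out[QK]) {
    for (int j = 0; j < QK / 2; j++) {
        out[2*j + 0] = (int8_t)(b->qs[j] & 0xf) - 8;
        out[2*j + 1] = (int8_t)(b->qs[j] >> 4) - 8;
    }
}

/* Compute TILE_Y dot products that share the same x row: each x block is
 * unpacked once and reused for every y column of the tile, which is the
 * work the AVX2 kernel above amortizes with its acc[]/scale[] arrays. */
static void tiled_vec_dot(int n, float *s, int s_stride,
                          const q4_block *x,
                          const q4_block *y, int y_stride /* blocks per y row */) {
    assert(n % QK == 0);
    const int nb = n / QK;

    float sum[TILE_Y] = { 0 };
    for (int i = 0; i < nb; i++) {
        int8_t xq[QK];
        unpack_block(&x[i], xq);               /* shared across the whole tile */
        for (int t = 0; t < TILE_Y; t++) {
            const q4_block *yb = &y[i + t*y_stride];
            int8_t yq[QK];
            unpack_block(yb, yq);
            int32_t acc = 0;
            for (int k = 0; k < QK; k++) {
                acc += xq[k] * yq[k];          /* integer dot product of one block pair */
            }
            sum[t] += x[i].d * yb->d * (float)acc;
        }
    }
    for (int t = 0; t < TILE_Y; t++) {
        s[t*s_stride] = sum[t];                /* strided store, like dst_stridelength */
    }
}

int main(void) {
    enum { NB = 2, N = NB * QK };
    q4_block x[NB], y[TILE_Y * NB];

    /* Deterministic dummy data, just to make the sketch runnable. */
    for (int i = 0; i < NB; i++) {
        x[i].d = 0.5f;
        for (int j = 0; j < QK/2; j++) x[i].qs[j] = (uint8_t)(17*j + i);
    }
    for (int t = 0; t < TILE_Y; t++) {
        for (int i = 0; i < NB; i++) {
            y[i + t*NB].d = 0.25f;
            for (int j = 0; j < QK/2; j++) y[i + t*NB].qs[j] = (uint8_t)(7*j + t);
        }
    }

    float out[TILE_Y];
    tiled_vec_dot(N, out, 1, x, y, NB);
    for (int t = 0; t < TILE_Y; t++) {
        printf("dot[%d] = %f\n", t, out[t]);
    }
    return 0;
}

In tiled_vec_dot the call unpack_block(&x[i], xq) sits outside the t loop, so its cost is amortized over TILE_Y columns; that is the same trade the AVX2 kernel makes with its shared x_high_q/x_low_q vectors and the y[i+u+t*rowlength] indexing, and it is presumably where the throughput gain named in the subject line comes from.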