diff --git a/ggml.c b/ggml.c
index a2ed6bca0..a5f0054d1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2188,7 +2188,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     *s = sumf;
 }
 
-static void seap_ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy, const int tilesize_x, const int tilesize_y, const int rowlength, const int dst_stridelength) {
+static void seap_ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy,
+                                   const int rowlength_x, const int rowlength_y, const int dst_stridelength_x, const int dst_stridelength_y) {
     const int nb = n / QK;
 
     assert(n % QK == 0);
@@ -2203,15 +2204,28 @@ static void seap_ggml_vec_dot_q4_0(const int n, float * restrict s, const void *
 //#if defined(__AVX2__)
 #if 1
-#define SEAP_TILESIZE_X 1
-#define SEAP_TILESIZE_Y 8
-#define UNROLL_COUNT 8/SEAP_TILESIZE_Y
-#undef SEAP_DEBUG
+#define EXPERIMENT_TILESIZE_X 8
+#define EXPERIMENT_TILESIZE_Y 1
+//#define EXPERIMENT_TILESIZE_X 2
+//#define EXPERIMENT_TILESIZE_Y 2
+
+#define UNROLL_COUNT 1 // 8/EXPERIMENT_TILESIZE_Y
+//#define EXPERIMENT_DEBUG
+#undef EXPERIMENT_DEBUG
+#undef EXPERIMENT_DEBUG2
+
+#ifdef EXPERIMENT_DEBUG
+    printf("rowlength_x=%i,rowlength_y=%i,dst_stridelength_x=%i,dst_stridelength_y=%i\n",rowlength_x,rowlength_y,dst_stridelength_x,dst_stridelength_y);
+#endif
+
     // Initialize accumulator with zeros
-    __m256 acc[SEAP_TILESIZE_Y]; // = 0; // _mm256_setzero_ps();
-    for (int i=0;id);
+    }
+
+    #endif
-    for (int t=0;td);
-#endif
+    for (int ty=0;tyd);
+    }
+
+    #endif
-            EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(y_high_q, y_low_q, y[i+u+t*rowlength].qs,t)
-
-            /* Compute products of int16_t integers, add pairwise, store as int32_t */
-            __m256i xy_high_q[SEAP_TILESIZE_Y];
-            xy_high_q[t] = _mm256_madd_epi16( x_high_q[0], y_high_q[t] );
-            __m256i xy_low_q[SEAP_TILESIZE_Y];
-            xy_low_q[t]= _mm256_madd_epi16( x_low_q[0], y_low_q[t] );
+            /* get input from y
+               Input: 32 Nibbles (16 bytes) at *y[i+u]
+               Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
+            __m256i y_high_q[EXPERIMENT_TILESIZE_X][EXPERIMENT_TILESIZE_Y];
+            __m256i y_low_q[EXPERIMENT_TILESIZE_X][EXPERIMENT_TILESIZE_Y];
-            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
-            __m256i xy_q[SEAP_TILESIZE_Y];
-            xy_q[t] = _mm256_add_epi32( xy_high_q[t], xy_low_q[t] );
+            EXPAND_32_Q4_NIBBLES_INTO_TWO_M256_VECTORS(y[i+u+ty*rowlength_y].qs, y_high_q[tx], y_low_q[tx], ty)
+
+            /* Compute products of int16_t integers, add pairwise, store as int32_t */
+            __m256i xy_high_q[EXPERIMENT_TILESIZE_X][EXPERIMENT_TILESIZE_Y];
+            xy_high_q[tx][ty] = _mm256_madd_epi16( x_high_q[tx], y_high_q[tx][ty] );
-            /* Convert to vectore of 8 int32_t to 8 floats */
-            __m256 q[SEAP_TILESIZE_Y];
-            q[t] = _mm256_cvtepi32_ps( xy_q[t] );
+            __m256i xy_low_q[EXPERIMENT_TILESIZE_X][EXPERIMENT_TILESIZE_Y];
+            xy_low_q[tx][ty]= _mm256_madd_epi16( x_low_q[tx], y_low_q[tx][ty] );
-            /* Multiply q with scale and accumulate */
-            acc[t] = _mm256_fmadd_ps( scale[t], q[t], acc[t] );
+            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */
+            __m256i xy_q[EXPERIMENT_TILESIZE_X][EXPERIMENT_TILESIZE_Y];
+            xy_q[tx][ty] = _mm256_add_epi32( xy_high_q[tx][ty], xy_low_q[tx][ty] );
+            /* Convert the vector of 8 int32_t to 8 floats */
+            __m256 q[EXPERIMENT_TILESIZE_X][EXPERIMENT_TILESIZE_Y];
+            q[tx][ty] = _mm256_cvtepi32_ps( xy_q[tx][ty] );
+
+            /* Multiply q with scale and accumulate */
+            acc[tx][ty] = _mm256_fmadd_ps( scale[tx][ty], q[tx][ty], acc[tx][ty] );
+
+            }
         }
-    }
     }
-    for (int t=0;t dot_vec dst[%i,%i] @ %li = %f \n",sum,tx,ty, (long int)p, (float *)(p));
 #endif
-    }
+    } // for ty
+    } // for tx
 #else
     // scalar
@@ -6704,6 +6732,42 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     },
 };
 
+float tensor_sum_elements(struct ggml_tensor * tensor) {
+    float sum = 0;
+    if (tensor->type==6) {
+        for (int j = 0; j < tensor->ne[1]; j++) {
+            for (int k = 0; k < tensor->ne[0]; k++) {
+                void *p = &((float *) tensor->data)[j*tensor->ne[0]+k];
+                float val = ((float *) tensor->data)[j*tensor->ne[0]+k];
+#ifdef EXPERIMENT_DEBUG2
+
+                printf("val[%i,%i] @ %lli =%f\n",j,k,p,val);
+#endif
+                sum += val;
+            }
+        }
+        return sum;
+    } else if (tensor->type==0) {
+
+        for (int j = 0; j < tensor->ne[1] / QK; j++) {
+            for (int k = 0; k < tensor->ne[0] / QK; k++) {
+                block_q4_0 *blk = tensor->data;
+
+                float *p = (float *) &(blk[k+j*tensor->ne[0]].d);
+                sum += *p;
+                //printf("j,k,offset =%i,%i,%i @ %lli\n",j,k,k+j*tensor->ne[0],p);
+            }
+            //printf("j=%i\n",j);
+        }
+        return sum;
+    } else {
+        printf("cannot sum type %i", tensor->type);
+        return 0;
+    }
+
+}
+
+
 static void ggml_compute_forward_mul_mat_q_f32(
     const struct ggml_compute_params * params,
     const struct ggml_tensor * src0,
@@ -6854,7 +6918,33 @@ static void ggml_compute_forward_mul_mat_q_f32(
     void * wdata = params->wdata;
     const size_t row_size = ne00*GGML_TYPE_SIZE[type]/GGML_BLCK_SIZE[type];
 
-    for (int ir = ir0; ir < ir1; ++ir) {
+#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
+
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) @ %lli - ", #TENSOR, \
+        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
+        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2],(long long int)TENSOR->data); \
+    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+
+#ifdef EXPERIMENT_DEBUG2
+//#if 1
+    printf("\n");
+    TENSOR_DUMP(src0)
+    TENSOR_DUMP(src1)
+    //TENSOR_DUMP(src1)
+
+    //printf("rowlength_x=%i,rowlength_y=%i,dst_stridelength_x=%i,dst_stridelength_y=%i\n",rowlength_x,rowlength_y,dst_stridelength_x,dst_stridelength_y);
+#endif
+
+    //void *p = (void *) src0->data;
+    assert((ir1-ir0) % EXPERIMENT_TILESIZE_X == 0);
+
+    int x_stride = EXPERIMENT_TILESIZE_X;
+    if (ne11 < EXPERIMENT_TILESIZE_Y) {
+        x_stride = 1;
+    }
+
+
+    for (int ir = ir0; ir < ir1; ir+=x_stride) {
         // src0 indices
         const int i03 = ir/(ne02*ne01);
         const int i02 = (ir - i03*ne02*ne01)/ne01;
@@ -6868,43 +6958,74 @@ static void ggml_compute_forward_mul_mat_q_f32(
         const int i3 = i03;
 
         void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+
         char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size));
 
         float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
 
+#if 0
+        printf("src0->type=%i, src0->n_dims=%i, src0->nb=(%i,%i,%i), type_size=%lli \n",src0->type, src0->n_dims, nb01,nb02,nb03,GGML_TYPE_SIZE[src0->type]);
+
+        /*if (src0->n_dims == 3) {
+            rowlength *= nb03;
+        }*/
+        void *p = src0_row;
+        printf("src_row[%i] @ %li = %f, rowlength = %li \n",
+            ir, (long int)p, (float *)(p),
+            row_size/GGML_TYPE_SIZE[src0->type]);
+
+        if (ir > 5) exit(0);
+#endif
+#ifdef EXPERIMENT_DEBUG
+        printf("ir=%i, src0_row=%lli, src1_col=%lli, dst_col=%lli\n",ir, (long long int)src0_row,(long long int)src1_col,(long long int)dst_col );
+        if (ir > 5) exit(0);
+#endif
+
         assert(ne00 % 32 == 0);
 
-        if (ne11 < SEAP_TILESIZE_Y) {
+        if (ne11 < EXPERIMENT_TILESIZE_Y) {
+            //printf("using legacy tile size implementation\n");
             // existing implementation tiled implementation
             for (int64_t ic = 0; ic < ne11; ++ic) {
                 vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
             }
+        } else { // tiled implementation
-            if ((ne11 % SEAP_TILESIZE_Y) != 0) {
                 printf("ne11=%i\n",ne11);
             }
-            assert((ne11 % SEAP_TILESIZE_Y) == 0); // make sure we have a multiple of the tilesize
+            assert((ne11 % EXPERIMENT_TILESIZE_Y) == 0); // make sure we have a multiple of the tilesize
 
-            for (int64_t ic = 0; ic < ne11; ic+=SEAP_TILESIZE_Y) {
+            for (int64_t ic = 0; ic < ne11; ic+=EXPERIMENT_TILESIZE_Y) {
                 //vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
-                #ifdef SEAP_DEBUG
-                for (int t=0;td);
                 }
+                printf("calling seap_ggml_vec_dot_q4_0 for row, col=(%i,%i)\n",ir,ic);
 #endif
-                seap_ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size), SEAP_TILESIZE_X, SEAP_TILESIZE_Y, row_size/GGML_TYPE_SIZE[type], ne0);
+                seap_ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size),
+                    nb01/20,
+                    //row_size/GGML_TYPE_SIZE[type], // rowlength_x
+                    row_size/GGML_TYPE_SIZE[type], // rowlength_y
+                    sizeof(float), // dst_stridelength_x
+                    ne0 // dst_stridelength_y
+                    );
-                #ifdef SEAP_DEBUG
-                for (int t=0;tdata, *(float *)(p));
+                }
             }
-            if (ic>=3) exit(0);
+            //if (ic>=3) exit(0);
 #endif
             }
@@ -6924,6 +7045,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
     //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
     //}
 
+#ifdef EXPERIMENT_DEBUG2
+//#if 1
+    //printf("\n");
+    TENSOR_DUMP(dst)
+    //exit(0);
+#endif
 }
 
 static void ggml_compute_forward_mul_mat(
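
Illustrative sketch (not part of the patch): the kernel above computes an EXPERIMENT_TILESIZE_X x EXPERIMENT_TILESIZE_Y block of dot products per call, so each loaded run of src0 data is reused across several src1 columns and the partial results live in an acc[tx][ty] register tile before being stored to dst. The scalar C sketch below models the same accumulation pattern under assumptions: TILE_X, TILE_Y, dot_block, and the stride parameters are hypothetical names, and plain float rows stand in for the Q4_0 blocks, so it is not the patch's exact interface.

#include <stddef.h>

/* Hypothetical scalar model of the tiled kernel: one call produces a
 * TILE_X x TILE_Y block of dst, so every x row read from memory is
 * reused for TILE_Y different y rows (and vice versa). */
#define TILE_X 8
#define TILE_Y 1

static void dot_block(int n,
                      const float *x, size_t x_stride,  /* TILE_X rows of length n */
                      const float *y, size_t y_stride,  /* TILE_Y rows of length n */
                      float *dst, size_t dst_stride) {  /* result at dst[tx + ty*dst_stride] */
    float acc[TILE_X][TILE_Y] = {{0}};

    /* accumulate all TILE_X * TILE_Y dot products in one pass over the data */
    for (int i = 0; i < n; i++) {
        for (int tx = 0; tx < TILE_X; tx++) {
            for (int ty = 0; ty < TILE_Y; ty++) {
                acc[tx][ty] += x[tx*x_stride + i] * y[ty*y_stride + i];
            }
        }
    }

    /* store the finished tile into the destination matrix */
    for (int tx = 0; tx < TILE_X; tx++) {
        for (int ty = 0; ty < TILE_Y; ty++) {
            dst[tx + ty*dst_stride] = acc[tx][ty];
        }
    }
}

With TILE_X = 8 and TILE_Y = 1, the values selected in the patch, each call covers eight consecutive src0 rows against a single src1 column, which is why ggml_compute_forward_mul_mat_q_f32 now advances its outer loop by x_stride = EXPERIMENT_TILESIZE_X instead of one row at a time.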