From 554340f5654ab35066390e22037f15af6993b021 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Tue, 16 May 2023 15:45:05 +0800 Subject: [PATCH] revert library back first --- ggml.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index fbf75d706..f4ff09214 100644 --- a/ggml.c +++ b/ggml.c @@ -9327,7 +9327,7 @@ static void ggml_compute_forward_rms_norm_back( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -9368,7 +9368,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -9441,7 +9441,7 @@ static void ggml_compute_forward_mul_mat_f32( } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9461,11 +9461,33 @@ static void ggml_compute_forward_mul_mat_f32( const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); +#if defined(GGML_USE_CLBLAST) + // zT = y * xT + if(quants_unshuffled) + { + ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01, + GGML_TYPE_F32); + } + else + { + ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01, + GGML_TYPE_F32); + } +#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); @@ -9613,7 +9635,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -9643,6 +9665,32 @@ static void ggml_compute_forward_mul_mat_f16_f32( assert(id*sizeof(float) <= params->wsize); } +#if defined(GGML_USE_CLBLAST) + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + if(quants_unshuffled) + { + ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01, + GGML_TYPE_F32); + } + else + { + ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01, + GGML_TYPE_F32); + } +#else const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -9654,6 +9702,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } @@ -9825,7 +9874,7 @@ static void ggml_compute_forward_mul_mat_q_f32( } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9848,6 +9897,9 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); +#if defined(GGML_USE_CLBLAST) + const void* x = (char *) src0->data + i03*nb03 + i02*nb02; +#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -9859,12 +9911,35 @@ static void ggml_compute_forward_mul_mat_q_f32( } const float * x = wdata; +#endif +#if defined(GGML_USE_CLBLAST) + // zT = y * xT + if(quants_unshuffled) + { + ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01, + type); + } + else + { + ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01, + type); + } +#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); +#endif } } @@ -13979,7 +14054,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) else #endif if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning @@ -13993,13 +14068,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);