From 2bfdb7fe4e6c71d919bef19324f100589af6bd2f Mon Sep 17 00:00:00 2001 From: slaren <slarengh@gmail.com> Date: Thu, 6 Jun 2024 03:12:50 +0200 Subject: [PATCH] support multithreaded dequantization with std::async when openmp is not available --- CMakeLists.txt | 2 +- Makefile | 6 ++-- ggml-blas.c => ggml-blas.cpp | 66 ++++++++++++++++++++---------------- 3 files changed, 41 insertions(+), 33 deletions(-) rename ggml-blas.c => ggml-blas.cpp (84%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e5baa6a4..d8f7780f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -381,7 +381,7 @@ if (LLAMA_BLAS) endif() set(GGML_HEADERS_BLAS ggml-blas.h) - set(GGML_SOURCES_BLAS ggml-blas.c) + set(GGML_SOURCES_BLAS ggml-blas.cpp) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) diff --git a/Makefile b/Makefile index 59dd85336..e57fcd9ad 100644 --- a/Makefile +++ b/Makefile @@ -448,9 +448,6 @@ ifdef LLAMA_RPC OBJS += ggml-rpc.o endif # LLAMA_RPC -ggml-blas.o: ggml-blas.c ggml-blas.h - $(CC) $(CFLAGS) -c $< -o $@ - ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 @@ -752,6 +749,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ +ggml-blas.o: ggml-blas.cpp ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + unicode.o: unicode.cpp unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/ggml-blas.c b/ggml-blas.cpp similarity index 84% rename from ggml-blas.c rename to ggml-blas.cpp index edb5474dd..2537a4a0f 100644 --- a/ggml-blas.c +++ b/ggml-blas.cpp @@ -1,7 +1,8 @@ #include "ggml-blas.h" #include "ggml-backend-impl.h" -#include <stdlib.h> +#include <future> +#include <vector> #if defined(GGML_USE_ACCELERATE) # include <Accelerate/Accelerate.h> @@ -13,7 +14,7 @@ struct ggml_backend_blas_context { int n_threads; - void * work_data; + char * work_data; size_t work_size; }; @@ -41,7 +42,7 @@ static bool ggml_backend_blas_use_blas(const 
struct ggml_tensor * dst) { return false; } -static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { +static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -74,15 +75,15 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { - free(ctx->work_data); - ctx->work_data = malloc(desired_wsize); - GGML_ASSERT(ctx->work_data != NULL); + delete[] ctx->work_data; + ctx->work_data = new char[desired_wsize]; ctx->work_size = desired_wsize; } void * wdata = ctx->work_data; // convert src0 to float if (type != GGML_TYPE_F32) { + std::vector<std::future<void>> tasks; ggml_to_float_t const to_float = type_traits.to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -92,12 +93,26 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(ctx->n_threads) -#endif for (int64_t i01 = 0; i01 < ne01; i01++) { to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } +#else + for (int i = 0; i < ctx->n_threads; i++) { + tasks.push_back(std::async(std::launch::async, [=]() { + const int64_t start = i*ne01/ctx->n_threads; + const int64_t end = (i + 1)*ne01/ctx->n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } +#endif } } + // wait for all tasks to finish + for (auto & task : tasks) { + task.get(); + } } for (int64_t i13 = 0; i13 < ne13; i13++) { @@ -105,7 +120,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * x = 
(float *) ((char *) src0->data + i02*nb02 + i03*nb03); const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); @@ -122,7 +137,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st } } -static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -163,7 +178,7 @@ static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, s int k = src0->ne[1]; int m = src1->ne[0]; - int transposeA; + CBLAS_TRANSPOSE transposeA; int lda; if (!ggml_is_transposed(src1)) { @@ -192,10 +207,10 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { } GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; - free(ctx->work_data); - free(ctx); - free(backend); + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete[] ctx->work_data; + delete ctx; + delete backend; } GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { @@ -205,7 +220,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer } GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context; + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -287,20 +302,13 @@ static ggml_guid_t ggml_backend_blas_guid(void) { } ggml_backend_t 
ggml_backend_blas_init(void) { - ggml_backend_t backend = malloc(sizeof(struct ggml_backend)); - if (backend == NULL) { - return NULL; - } - struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context)); - if (ctx == NULL) { - return NULL; - } + ggml_backend_blas_context * ctx = new ggml_backend_blas_context{ + /* .n_threads = */ GGML_DEFAULT_N_THREADS, + /* .work_data = */ NULL, + /* .work_size = */ 0, + }; - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; - - *backend = (struct ggml_backend) { + ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), /* .interface = */ blas_backend_i, /* .context = */ ctx, @@ -316,6 +324,6 @@ GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { GGML_ASSERT(ggml_backend_is_blas(backend_blas)); - struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context; + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; ctx->n_threads = n_threads; }