support multithreaded dequantization with std::async when openmp is not available

Author: slaren
Date:   2024-06-06 03:12:50 +02:00
Parent: 845fa20f26
Commit: 2bfdb7fe4e

3 changed files with 41 additions and 33 deletions
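
The core change is in ggml-blas.cpp: when GGML_USE_OPENMP is defined the OpenMP path is kept, and otherwise the rows of each plane are split across std::async tasks. A minimal standalone sketch of that fallback pattern, with an illustrative process_row standing in for the real to_float conversion:

#include <cstdint>
#include <future>
#include <vector>

// split `rows` items across `n_threads` asynchronous tasks and wait for all
void parallel_rows(int64_t rows, int n_threads, void (*process_row)(int64_t)) {
    std::vector<std::future<void>> tasks;
    for (int i = 0; i < n_threads; i++) {
        // capture by value: `i` must be frozen per task
        tasks.push_back(std::async(std::launch::async, [=]() {
            const int64_t start =       i*rows/n_threads;
            const int64_t end   = (i + 1)*rows/n_threads;
            for (int64_t i01 = start; i01 < end; i01++) {
                process_row(i01);
            }
        }));
    }
    for (auto & task : tasks) {
        task.get(); // get() also rethrows any exception thrown inside a task
    }
}

std::launch::async forces each task onto its own thread instead of letting the runtime defer the work until get().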

CMakeLists.txt

@@ -381,7 +381,7 @@ if (LLAMA_BLAS)
         endif()
 
         set(GGML_HEADERS_BLAS ggml-blas.h)
-        set(GGML_SOURCES_BLAS ggml-blas.c)
+        set(GGML_SOURCES_BLAS ggml-blas.cpp)
 
         set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})

Makefile

@@ -448,9 +448,6 @@ ifdef LLAMA_RPC
 OBJS += ggml-rpc.o
 endif # LLAMA_RPC
 
-ggml-blas.o: ggml-blas.c ggml-blas.h
-	$(CC) $(CFLAGS) -c $< -o $@
-
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 LLAMA_CUDA := 1
@@ -752,6 +749,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
+ggml-blas.o: ggml-blas.cpp ggml-blas.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

ggml-blas.c → ggml-blas.cpp

@ -1,7 +1,8 @@
#include "ggml-blas.h" #include "ggml-blas.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include <stdlib.h> #include <future>
#include <vector>
#if defined(GGML_USE_ACCELERATE) #if defined(GGML_USE_ACCELERATE)
# include <Accelerate/Accelerate.h> # include <Accelerate/Accelerate.h>
@@ -13,7 +14,7 @@
 
 struct ggml_backend_blas_context {
     int n_threads;
-    void * work_data;
+    char * work_data;
     size_t work_size;
 };
 
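The work_data member changes from void * to char * because the allocation below moves from malloc/free to new[]/delete[], and delete[] is only well-formed on a pointer to the array's element type. A two-line illustration of the constraint:

void delete_example() {
    char * buf = new char[1024];
    delete[] buf;               // OK: the element type is known here

    // void * p = new char[1024];
    // delete[] p;              // ill-formed: delete[] cannot act through void *
}
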
@@ -41,7 +42,7 @@ static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
     return false;
 }
 
-static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
+static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@@ -74,15 +75,15 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st
     const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float);
 
     if (ctx->work_size < desired_wsize) {
-        free(ctx->work_data);
-        ctx->work_data = malloc(desired_wsize);
-        GGML_ASSERT(ctx->work_data != NULL);
+        delete[] ctx->work_data;
+        ctx->work_data = new char[desired_wsize];
         ctx->work_size = desired_wsize;
     }
 
     void * wdata = ctx->work_data;
 
     // convert src0 to float
     if (type != GGML_TYPE_F32) {
+        std::vector<std::future<void>> tasks;
         ggml_to_float_t const to_float = type_traits.to_float;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
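The GGML_ASSERT(ctx->work_data != NULL) from the malloc version is dropped because plain new[] reports failure by throwing std::bad_alloc and never returns NULL; only a std::nothrow allocation would still need the check. A sketch of the two variants:

#include <cassert>
#include <cstddef>
#include <new>

void alloc_variants(std::size_t n) {
    char * a = new char[n];                 // throws std::bad_alloc on failure,
                                            // so no NULL check is needed
    char * b = new (std::nothrow) char[n];  // returns nullptr on failure,
    assert(b != nullptr);                   // so this form would keep the assert
    delete[] a;
    delete[] b;
}
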
@@ -92,12 +93,26 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st
 #ifdef GGML_USE_OPENMP
                 #pragma omp parallel for num_threads(ctx->n_threads)
-#endif
                 for (int64_t i01 = 0; i01 < ne01; i01++) {
                     to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
                 }
+#else
+                for (int i = 0; i < ctx->n_threads; i++) {
+                    tasks.push_back(std::async(std::launch::async, [=]() {
+                        const int64_t start =       i*ne01/ctx->n_threads;
+                        const int64_t end   = (i + 1)*ne01/ctx->n_threads;
+                        for (int64_t i01 = start; i01 < end; i01++) {
+                            to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                        }
+                    }));
+                }
+#endif
             }
         }
+
+        // wait for all tasks to finish
+        for (auto & task : tasks) {
+            task.get();
+        }
     }
 
     for (int64_t i13 = 0; i13 < ne13; i13++) {
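The start/end arithmetic splits ne01 rows into contiguous slices that tile the range exactly, even when ne01 is not a multiple of n_threads. Note also that every plane queues its own tasks and the single wait loop drains them all after the i03/i02 loops finish. A small self-contained check of the slice property (an illustrative test, not part of the commit):

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
    for (int64_t rows : {1, 7, 64, 4097}) {
        for (int n_threads : {1, 2, 8, 13}) {
            int64_t covered = 0;
            for (int i = 0; i < n_threads; i++) {
                const int64_t start =       i*rows/n_threads;
                const int64_t end   = (i + 1)*rows/n_threads;
                assert(start == covered); // no gap and no overlap between slices
                covered = end;
            }
            assert(covered == rows);      // the whole range [0, rows) is covered
        }
    }
}
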
@@ -105,7 +120,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st
            const int64_t i03 = i13/r3;
            const int64_t i02 = i12/r2;
 
-            const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+            const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
             const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
                   float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
@@ -122,7 +137,7 @@ static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, st
     }
 }
 
-static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
+static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@@ -163,7 +178,7 @@ static void ggml_backend_blas_out_prod(struct ggml_backend_blas_context * ctx, s
     int k = src0->ne[1];
     int m = src1->ne[0];
 
-    int transposeA;
+    CBLAS_TRANSPOSE transposeA;
     int lda;
 
     if (!ggml_is_transposed(src1)) {
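transposeA becomes CBLAS_TRANSPOSE because the file is now compiled as C++: C lets the CblasNoTrans/CblasTrans enumerators pass through int implicitly, but C++ has no implicit int-to-enum conversion. A sketch with the standard CBLAS enumerator values inlined (the real definitions come from the BLAS implementation's cblas.h):

enum CBLAS_TRANSPOSE { CblasNoTrans = 111, CblasTrans = 112, CblasConjTrans = 113 };

void gemm_stub(CBLAS_TRANSPOSE trans_a) { (void) trans_a; }

void caller() {
    CBLAS_TRANSPOSE transposeA = CblasTrans;
    gemm_stub(transposeA);  // OK: correct enum type
    // int t = CblasTrans;
    // gemm_stub(t);        // error in C++: no implicit int -> enum conversion
}
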
@@ -192,10 +207,10 @@ GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
 }
 
 GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
-    struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context;
-    free(ctx->work_data);
-    free(ctx);
-    free(backend);
+    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
+    delete[] ctx->work_data;
+    delete ctx;
+    delete backend;
 }
 
 GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
@@ -205,7 +220,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer
 }
 
 GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context;
+    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
@@ -287,20 +302,13 @@ static ggml_guid_t ggml_backend_blas_guid(void) {
 }
 
 ggml_backend_t ggml_backend_blas_init(void) {
-    ggml_backend_t backend = malloc(sizeof(struct ggml_backend));
-    if (backend == NULL) {
-        return NULL;
-    }
-
-    struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context));
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
-
-    *backend = (struct ggml_backend) {
+    ggml_backend_blas_context * ctx = new ggml_backend_blas_context{
+        /* .n_threads = */ GGML_DEFAULT_N_THREADS,
+        /* .work_data = */ NULL,
+        /* .work_size = */ 0,
+    };
+
+    ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_blas_guid(),
         /* .interface = */ blas_backend_i,
         /* .context   = */ ctx,
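A usage sketch of the public entry points touched by this commit, assuming the declarations in ggml-blas.h and ggml-backend.h at this revision:

#include "ggml-blas.h"
#include "ggml-backend.h"

int main() {
    ggml_backend_t backend = ggml_backend_blas_init();
    if (ggml_backend_is_blas(backend)) {
        ggml_backend_blas_set_n_threads(backend, 8); // threads used for dequantization
    }
    // ... build a graph and run it with ggml_backend_graph_compute(backend, graph) ...
    ggml_backend_free(backend); // dispatches to ggml_backend_blas_free
}
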
@@ -316,6 +324,6 @@ GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
 void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
     GGML_ASSERT(ggml_backend_is_blas(backend_blas));
 
-    struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context;
+    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
     ctx->n_threads = n_threads;
 }