cuBLAS: use host pinned memory
This commit is contained in:
parent
d3fd04e92e
commit
2dd6deeb49
6 changed files with 52 additions and 9 deletions
5
Makefile
5
Makefile
|
@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS
|
|||
endif
|
||||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
NVCC = nvcc
|
||||
|
@ -164,10 +165,10 @@ $(info )
|
|||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp ggml.h llama.h llama_util.h
|
||||
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
|
|
10
ggml-cuda.cu
10
ggml-cuda.cu
|
@ -353,3 +353,13 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
|
|||
return cudaSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
void * ggml_cuda_host_malloc(size_t size) {
|
||||
void * ptr;
|
||||
CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void ggml_cuda_host_free(void * ptr) {
|
||||
CUDA_CHECK(cudaFreeHost(ptr));
|
||||
}
|
||||
|
|
|
@ -31,6 +31,9 @@ extern cudaStream_t g_cudaStream2;
|
|||
extern cudaEvent_t g_cudaEvent;
|
||||
|
||||
void ggml_init_cublas(void);
|
||||
void * ggml_cuda_host_malloc(size_t size);
|
||||
void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
|
||||
void ggml_cuda_pool_free(void * ptr, size_t size);
|
||||
|
||||
|
|
11
ggml.c
11
ggml.c
|
@ -8235,8 +8235,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
}
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
ggml_fp16_t * const wdata = params->wdata;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
|
@ -8254,6 +8252,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
{
|
||||
size_t id = 0;
|
||||
for (int64_t i01 = 0; i01 < ne11; ++i01) {
|
||||
|
@ -8540,7 +8539,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
const float * x = wdata;
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// copy data to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
|
||||
|
@ -11571,7 +11569,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
|
||||
//printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
|
||||
//printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
|
||||
//printf("cur = %zu\n", cur);
|
||||
|
@ -11583,6 +11581,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
#endif
|
||||
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
||||
cur = 0;
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
}
|
||||
#endif
|
||||
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
|
|
|
@ -167,7 +167,7 @@ struct llama_model {
|
|||
struct llama_kv_cache kv_self;
|
||||
|
||||
// the model memory buffer
|
||||
llama_buffer buf;
|
||||
llama_ctx_buffer buf;
|
||||
|
||||
// model memory mapped file
|
||||
std::unique_ptr<llama_mmap> mapping;
|
||||
|
@ -228,8 +228,8 @@ struct llama_context {
|
|||
|
||||
// memory buffers used to evaluate the model
|
||||
// TODO: move in llama_state
|
||||
llama_buffer buf_compute;
|
||||
llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
||||
llama_ctx_buffer buf_compute;
|
||||
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
||||
|
||||
int buf_last = 0;
|
||||
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
||||
|
|
26
llama_util.h
26
llama_util.h
|
@ -405,4 +405,30 @@ struct llama_buffer {
|
|||
delete[] addr;
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
struct llama_ctx_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
void resize(size_t size) {
|
||||
if (addr) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~llama_ctx_buffer() {
|
||||
if (addr) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
}
|
||||
};
|
||||
#else
|
||||
typedef llama_buffer llama_ctx_buffer;
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue