fix ggml_tensor_extra_gpu memory leak

Allocate the `extra` member of `ggml_tensor` statically (as storage embedded in the tensor struct) instead of on the heap, so it no longer needs a separate `delete`; see the sketch below.
eajechiloae 2023-07-08 15:08:05 +03:00 committed by GitHub
parent 061f5f8d21
commit 4749543a88
4 changed files with 32 additions and 31 deletions
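
In short: the per-tensor GPU bookkeeping struct (`ggml_tensor_extra_gpu`) used to be created with `new` in `ggml_cuda_transform_tensor` / `ggml_cuda_assign_buffers_impl` and only released in `ggml_cuda_free_data`, so any tensor that never reached the free path leaked its `extra`. After this commit the bytes for the struct live inside `ggml_tensor` itself (`char extra[sizeof(struct ggml_tensor_extra_gpu)]`), and the CUDA code reinterprets that embedded storage, leaving nothing to `delete`. A minimal, self-contained sketch of the before/after pattern follows; the types `extra_gpu`, `tensor_before`, and `tensor_after` are simplified stand-ins for illustration, not the actual ggml structs:

    // Minimal sketch of the allocation change; stand-in types, not the real ggml structs.
    #include <cstring>

    #define MAX_DEVICES 16

    struct extra_gpu {                      // stand-in for ggml_tensor_extra_gpu
        void * data_device[MAX_DEVICES];
    };

    // Before: the extra is heap-allocated and leaks unless a matching delete runs.
    struct tensor_before {
        void * extra = nullptr;             // points at a separate heap allocation
    };

    void assign_before(tensor_before & t) {
        extra_gpu * e = new extra_gpu;      // must be delete'd in a free path, or it leaks
        std::memset(e, 0, sizeof(*e));
        t.extra = e;
    }

    // After: the bytes are reserved inside the tensor, so the lifetime is tied to the tensor.
    struct tensor_after {
        char extra[sizeof(extra_gpu)];      // embedded storage, no separate allocation
    };

    void assign_after(tensor_after & t) {
        extra_gpu * e = (extra_gpu *) t.extra;  // reinterpret the embedded bytes
        std::memset(e, 0, sizeof(*e));          // nothing to free afterwards
    }

    int main() {
        tensor_before tb;
        assign_before(tb);
        delete (extra_gpu *) tb.extra;      // easy to forget; this is the leak the commit fixes
        tensor_after ta;
        assign_after(ta);                   // no cleanup needed
        return 0;
    }

The trade-off, visible in the ggml.h hunks below, is that `ggml.h` now includes `ggml-cuda.h` under `GGML_USE_CUBLAS` so that `sizeof(struct ggml_tensor_extra_gpu)` is known where the tensor struct is defined.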

ggml-cuda.cu

@@ -233,10 +233,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
-};
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -2193,7 +2189,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
         src_ptr = (char *) src->data;
     } else if (src->backend == GGML_BACKEND_GPU) {
         kind = cudaMemcpyDeviceToDevice;
-        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        const struct ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -2631,9 +2627,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (const ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -2964,13 +2960,13 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
@@ -2993,13 +2989,13 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
     const int row_stride_x = nb01 / sizeof(half);
@@ -3063,8 +3059,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
-    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -3107,7 +3103,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
     const size_t nb1 = tensor->nb[1];
     ggml_backend backend = tensor->backend;
-    struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     memset(extra, 0, sizeof(*extra));
     for (int id = 0; id < g_device_count; ++id) {
@@ -3148,8 +3144,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
        }
    }
-    tensor->extra = extra;
 }
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
@@ -3157,7 +3151,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
         return;
     }
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    const ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) tensor->extra;
     for (int id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
@@ -3170,8 +3164,6 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
        }
    }
-    delete extra;
 }
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
@@ -3191,7 +3183,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
    }
    tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
    memset(extra, 0, sizeof(*extra));
    const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
@@ -3234,8 +3226,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
        CUDA_CHECK(cudaMemset(data, 0, size));
        extra->data_device[g_main_device] = data;
    }
-    tensor->extra = extra;
 }
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {

ggml-cuda.h

@@ -1,12 +1,21 @@
 #pragma once
+#define GGML_CUDA_MAX_DEVICES 16
+#include <cuda_runtime.h>
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
 #include "ggml.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
-#define GGML_CUDA_MAX_DEVICES 16
 void ggml_init_cublas(void);
 void ggml_cuda_set_tensor_split(const float * tensor_split);

ggml.c

@@ -4588,8 +4588,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name =*/ { 0 },
-        /*.extra =*/ NULL,
-        /*.padding =*/ { 0 },
     };
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads

ggml.h

@@ -235,6 +235,10 @@
     const type prefix##3 = (pointer)->array[3]; \
     GGML_UNUSED(prefix##3);
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -427,9 +431,9 @@ extern "C" {
         char name[GGML_MAX_NAME];
-        void * extra; // extra things e.g. for ggml-cuda.cu
-        char padding[8];
+#ifdef GGML_USE_CUBLAS
+        char extra[sizeof(struct ggml_tensor_extra_gpu)]; // extra things e.g. for ggml-cuda.cu
+#endif
     };
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);