fix ggml_tensor_extra_gpu memory leak
Allocate the `extra` member of `ggml_tensor` statically
parent 061f5f8d21
commit 4749543a88
4 changed files with 32 additions and 31 deletions
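Note on the fix: the old code heap-allocated a `ggml_tensor_extra_gpu` with `new` in both `ggml_cuda_transform_tensor` and `ggml_cuda_assign_buffers_impl`, but only `ggml_cuda_free_data` ran a matching `delete`; tensors that received buffers during graph evaluation were never routed through it, so their extras leaked. The fix reserves the storage inside `ggml_tensor` itself, so the extra lives and dies with the tensor and there is no ownership to track. A minimal sketch of the before/after allocation pattern (stand-in names, not the ggml API):

```cpp
#include <cstring>

// stand-ins for the real types, for illustration only
struct extra_gpu {
    void * data_device[16];
};

struct tensor {
    char extra[sizeof(extra_gpu)]; // storage embedded in the tensor, as in the new ggml_tensor
};

// old pattern: one heap allocation per tensor, leaked unless a matching delete runs
extra_gpu * attach_extra_old() {
    return new extra_gpu();
}

// new pattern: reinterpret the tensor-owned bytes; there is nothing to free later
extra_gpu * attach_extra_new(tensor * t) {
    extra_gpu * e = (extra_gpu *) t->extra;
    std::memset(e, 0, sizeof(*e)); // same zeroing the commit keeps in ggml-cuda.cu
    return e;
}
```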
ggml-cuda.cu | 40
@@ -233,10 +233,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
-};
 
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -2193,7 +2189,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
         src_ptr = (char *) src->data;
     } else if (src->backend == GGML_BACKEND_GPU) {
         kind = cudaMemcpyDeviceToDevice;
-        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        const struct ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -2631,9 +2627,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (const ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -2964,13 +2960,13 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
@@ -2993,13 +2989,13 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const struct ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     const int row_stride_x = nb01 / sizeof(half);
@@ -3063,8 +3059,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
-    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    const struct ggml_tensor_extra_gpu * src0_extra = (const ggml_tensor_extra_gpu *) src0->extra;
+    const struct ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
 
     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -3107,7 +3103,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     int nrows = ggml_nrows(tensor);
     const size_t nb1 = tensor->nb[1];
     ggml_backend backend = tensor->backend;
-    struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     memset(extra, 0, sizeof(*extra));
 
     for (int id = 0; id < g_device_count; ++id) {
@@ -3148,8 +3144,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
         }
     }
-
-    tensor->extra = extra;
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
@@ -3157,7 +3151,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
         return;
     }
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    const ggml_tensor_extra_gpu * extra = (const ggml_tensor_extra_gpu *) tensor->extra;
 
     for (int id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
@@ -3170,8 +3164,6 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
             CUDA_CHECK(cudaEventDestroy(extra->events[id]));
         }
     }
-
-    delete extra;
 }
 
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
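Note: with the extra embedded in the tensor there is no heap object left to release here, so the trailing `delete extra;` disappears and teardown only touches device-side resources. A condensed sketch of the resulting free path (assuming the same per-device loop as in the surrounding hunks, with `CUDA_CHECK` as defined in ggml-cuda.cu):

```cpp
// condensed sketch: free device buffers and events, leave the extra's bytes alone
static void free_extra_resources(const ggml_tensor_extra_gpu * extra, int device_count) {
    for (int id = 0; id < device_count; ++id) {
        if (extra->data_device[id] != nullptr) {
            CUDA_CHECK(cudaSetDevice(id));
            CUDA_CHECK(cudaFree(extra->data_device[id]));
        }
        if (extra->events[id] != nullptr) {
            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
        }
    }
    // no delete: the bytes belong to the ggml_tensor and are reclaimed with it
}
```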
@@ -3191,7 +3183,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     memset(extra, 0, sizeof(*extra));
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
@@ -3234,8 +3226,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         CUDA_CHECK(cudaMemset(data, 0, size));
         extra->data_device[g_main_device] = data;
     }
-
-    tensor->extra = extra;
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
ggml-cuda.h | 11
@@ -1,12 +1,21 @@
 #pragma once
 
+#define GGML_CUDA_MAX_DEVICES 16
+
+#include <cuda_runtime.h>
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 #include "ggml.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define GGML_CUDA_MAX_DEVICES 16
-
 void ggml_init_cublas(void);
 void ggml_cuda_set_tensor_split(const float * tensor_split);
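Note: the struct definition and `GGML_CUDA_MAX_DEVICES` are placed above the `#include "ggml.h"` line on purpose. Under `GGML_USE_CUBLAS`, ggml.h includes ggml-cuda.h back; `#pragma once` suppresses the re-entry, so the definition must already be in scope by the time ggml.h declares `char extra[sizeof(struct ggml_tensor_extra_gpu)]`. A compile-time check one could add to verify the layout (hypothetical, not part of the commit; assumes `GGML_USE_CUBLAS` is defined):

```cpp
#include "ggml-cuda.h" // also pulls in ggml.h

// the inline buffer in ggml_tensor must hold exactly one ggml_tensor_extra_gpu
static_assert(sizeof(((struct ggml_tensor *) 0)->extra) == sizeof(struct ggml_tensor_extra_gpu),
              "ggml_tensor::extra must match ggml_tensor_extra_gpu");
```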
ggml.c | 2
@@ -4588,8 +4588,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
-        /*.extra        =*/ NULL,
-        /*.padding      =*/ { 0 },
     };
 
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
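Note: dropping the `/*.extra =*/` and `/*.padding =*/` entries is safe because aggregate initialization zero-initializes every member that is not explicitly named, so the inline `extra` buffer still starts out as zero bytes. A tiny standalone illustration of that rule:

```cpp
#include <cassert>

struct item {
    int  id;
    char buf[8]; // like ggml_tensor's inline extra: no explicit initializer below
};

int main() {
    item v = { 7 }; // members without an initializer are zero-initialized
    for (char c : v.buf) {
        assert(c == 0);
    }
    return 0;
}
```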
ggml.h | 10
@@ -235,6 +235,10 @@
     const type prefix##3 = (pointer)->array[3]; \
     GGML_UNUSED(prefix##3);
 
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -427,9 +431,9 @@ extern "C" {
 
         char name[GGML_MAX_NAME];
 
-        void * extra; // extra things e.g. for ggml-cuda.cu
-
-        char padding[8];
+#ifdef GGML_USE_CUBLAS
+        char extra[sizeof(struct ggml_tensor_extra_gpu)]; // extra things e.g. for ggml-cuda.cu
+#endif
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
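Note: since `extra` is now a byte array rather than a `void *`, CUDA-side code reads it through a cast, as the ggml-cuda.cu hunks above do at each use site. If one wanted a single point for that cast, a small helper along these lines would do (hypothetical, not part of the commit):

```cpp
// hypothetical accessor: view the tensor-owned bytes as the typed extra
static inline struct ggml_tensor_extra_gpu * ggml_cuda_get_extra(struct ggml_tensor * t) {
    return (struct ggml_tensor_extra_gpu *) t->extra;
}
```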