llama : ggml-backend integration
This commit is contained in:
parent
eec22a1c63
commit
33f0761e9b
8 changed files with 756 additions and 1514 deletions
11
ggml-alloc.c
11
ggml-alloc.c
|
@ -779,10 +779,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
|
||||
if (nbytes == 0) {
|
||||
// all the tensors in the context are already allocated
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
|
||||
if (buffer == NULL) {
|
||||
// failed to allocate buffer
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
|
||||
|
||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
|
|
|
@ -51,6 +51,7 @@ extern "C" {
|
|||
ggml_backend_buffer_type_t buft;
|
||||
ggml_backend_buffer_context_t context;
|
||||
size_t size;
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
};
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
|
|
|
@ -58,6 +58,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
|||
/* .buft = */ buft,
|
||||
/* .context = */ context,
|
||||
/* .size = */ size,
|
||||
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
|
||||
};
|
||||
|
||||
return buffer;
|
||||
|
@ -109,6 +110,10 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
|||
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||
buffer->usage = usage;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
||||
return buffer->buft;
|
||||
}
|
||||
|
@ -777,7 +782,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
|
|||
}
|
||||
|
||||
#if 0
|
||||
static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
|
||||
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
|
||||
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
||||
#define GET_CAUSE(node) causes[hash_id(node)]
|
||||
#else
|
||||
|
@ -812,17 +817,25 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
|
|||
if (src == NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
|
||||
if (src_backend != NULL) {
|
||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
// operations with weights are always on the same backend as the weights
|
||||
cur_backend = src_backend;
|
||||
SET_CAUSE(node, "1.wgt%d", i);
|
||||
break;
|
||||
}
|
||||
|
||||
//if (src_backend != NULL) {
|
||||
int src_prio = sched_backend_prio(sched, src_backend);
|
||||
size_t src_size = ggml_nbytes(src);
|
||||
if (src_prio < cur_prio && src_size >= cur_size) {
|
||||
if (/*src_prio < cur_prio &&*/ src_size >= cur_size) {
|
||||
cur_prio = src_prio;
|
||||
cur_size = src_size;
|
||||
cur_backend = src_backend;
|
||||
SET_CAUSE(node, "1.src%d", i);
|
||||
}
|
||||
}
|
||||
//}
|
||||
}
|
||||
return cur_backend;
|
||||
}
|
||||
|
@ -933,6 +946,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
}
|
||||
//printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
||||
|
||||
#if 0
|
||||
// pass 2: assign backends to ops from current assignments
|
||||
// TODO:
|
||||
// - reuse sched_backend_from_cur
|
||||
|
@ -964,6 +978,23 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
// pass 2: assign backends to ops from current assignments
|
||||
// start from the end and assign the same backend to previous ops
|
||||
{
|
||||
ggml_tallocr_t cur_allocr = NULL;
|
||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
if (node_allocr != NULL) {
|
||||
cur_allocr = node_allocr;
|
||||
} else {
|
||||
node_allocr(node) = cur_allocr;
|
||||
SET_CAUSE(node, "2.cur");
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
//printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
||||
|
||||
// pass 3: assign backends to remaining src from dst (should only be leafs)
|
||||
|
@ -1029,9 +1060,21 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
}
|
||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
||||
if (src_allocr != node_allocr) {
|
||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
|
||||
// check if the input is already in the split
|
||||
bool found = false;
|
||||
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
||||
if (sched->splits[cur_split].inputs[k] == src) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
|
||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
|
||||
}
|
||||
|
||||
// create copies
|
||||
size_t id = hash_id(src);
|
||||
|
@ -1235,6 +1278,10 @@ void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
|
|||
sched_reset(sched);
|
||||
}
|
||||
|
||||
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
||||
return sched->n_splits;
|
||||
}
|
||||
|
||||
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||
int backend_index = sched_backend_prio(sched, backend);
|
||||
return sched->tallocs[backend_index];
|
||||
|
@ -1320,6 +1367,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
|||
|
||||
struct ggml_tensor * dst = node_copies[id];
|
||||
if (dst->view_src != NULL) {
|
||||
graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
||||
ggml_backend_view_init(dst->view_src->buffer, dst);
|
||||
}
|
||||
else {
|
||||
|
|
|
@ -24,6 +24,11 @@ extern "C" {
|
|||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
|
||||
// buffer
|
||||
enum ggml_backend_buffer_usage {
|
||||
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
|
||||
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
|
||||
};
|
||||
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
|
@ -32,8 +37,10 @@ extern "C" {
|
|||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
||||
|
||||
|
||||
//
|
||||
// Backend
|
||||
//
|
||||
|
@ -146,6 +153,7 @@ extern "C" {
|
|||
|
||||
// Initialize backend buffers from a measure graph
|
||||
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
||||
|
||||
GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
|
324
ggml-cuda.cu
324
ggml-cuda.cu
|
@ -554,10 +554,6 @@ struct cuda_device_capabilities {
|
|||
|
||||
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
|
||||
|
||||
static void * g_scratch_buffer = nullptr;
|
||||
static size_t g_scratch_size = 0; // disabled by default
|
||||
static size_t g_scratch_offset = 0;
|
||||
|
||||
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
||||
|
||||
[[noreturn]]
|
||||
|
@ -9158,247 +9154,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
|||
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
||||
}
|
||||
|
||||
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
||||
const int64_t nrows = ggml_nrows(tensor);
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
|
||||
const size_t nb1 = tensor->nb[1];
|
||||
|
||||
ggml_backend_type backend = tensor->backend;
|
||||
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_cuda_set_device(id);
|
||||
|
||||
int64_t row_low, row_high;
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
row_low = 0;
|
||||
row_high = nrows;
|
||||
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
const int64_t rounding = get_row_rounding(tensor->type);
|
||||
|
||||
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
||||
row_low -= row_low % rounding;
|
||||
|
||||
if (id == g_device_count - 1) {
|
||||
row_high = nrows;
|
||||
} else {
|
||||
row_high = nrows*g_tensor_split[id + 1];
|
||||
row_high -= row_high % rounding;
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
if (row_low == row_high) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
|
||||
const size_t offset_split = row_low*nb1;
|
||||
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
const size_t original_size = size;
|
||||
|
||||
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
|
||||
char * buf;
|
||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||
char * buf_host = (char *)data + offset_split;
|
||||
|
||||
// set padding to 0 to avoid possible NaN values
|
||||
if (size > original_size) {
|
||||
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
||||
|
||||
extra->data_device[id] = buf;
|
||||
|
||||
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
||||
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
ggml_cuda_set_device(id);
|
||||
if (extra->data_device[id] != nullptr) {
|
||||
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
||||
}
|
||||
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
if (extra->events[id][is] != nullptr) {
|
||||
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete extra;
|
||||
}
|
||||
|
||||
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
||||
static size_t g_temp_tensor_extra_index = 0;
|
||||
|
||||
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||
if (g_temp_tensor_extras == nullptr) {
|
||||
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
||||
}
|
||||
|
||||
size_t alloc_index = g_temp_tensor_extra_index;
|
||||
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
|
||||
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
return extra;
|
||||
}
|
||||
|
||||
static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
|
||||
if (scratch && g_scratch_size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
|
||||
// recursively assign CUDA buffers until a compute tensor is found
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
||||
const ggml_op src0_op = tensor->src[0]->op;
|
||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
||||
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
||||
}
|
||||
}
|
||||
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
||||
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
||||
}
|
||||
|
||||
if (scratch && no_alloc) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_gpu * extra;
|
||||
|
||||
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
||||
tensor->op == GGML_OP_VIEW ||
|
||||
force_inplace;
|
||||
const size_t size = ggml_nbytes(tensor);
|
||||
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
||||
size_t offset = 0;
|
||||
if (tensor->op == GGML_OP_VIEW) {
|
||||
memcpy(&offset, tensor->op_params, sizeof(size_t));
|
||||
}
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src0_ddc + offset;
|
||||
} else if (tensor->op == GGML_OP_CPY) {
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
||||
void * src1_ddv = src1_extra->data_device[g_main_device];
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src1_ddv;
|
||||
} else if (scratch) {
|
||||
GGML_ASSERT(size <= g_scratch_size);
|
||||
if (g_scratch_offset + size > g_scratch_size) {
|
||||
g_scratch_offset = 0;
|
||||
}
|
||||
|
||||
char * data = (char *) g_scratch_buffer;
|
||||
if (data == nullptr) {
|
||||
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
||||
g_scratch_buffer = data;
|
||||
}
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = data + g_scratch_offset;
|
||||
|
||||
g_scratch_offset += size;
|
||||
|
||||
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
||||
} else { // allocate new buffers outside of scratch
|
||||
void * data;
|
||||
CUDA_CHECK(cudaMalloc(&data, size));
|
||||
CUDA_CHECK(cudaMemset(data, 0, size));
|
||||
extra = new ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
extra->data_device[g_main_device] = data;
|
||||
}
|
||||
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
|
||||
if (g_scratch_size == 0) {
|
||||
return;
|
||||
}
|
||||
if (g_scratch_buffer == nullptr) {
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
||||
}
|
||||
|
||||
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
|
||||
const bool inplace = tensor->view_src != nullptr;
|
||||
|
||||
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
||||
size_t view_offset = 0;
|
||||
if (tensor->op == GGML_OP_VIEW) {
|
||||
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
|
||||
}
|
||||
extra->data_device[g_main_device] = src0_ddc + view_offset;
|
||||
} else {
|
||||
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
|
||||
}
|
||||
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
||||
void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
||||
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
|
||||
}
|
||||
|
||||
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
|
||||
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
|
||||
}
|
||||
|
||||
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
||||
ggml_cuda_assign_buffers_impl(tensor, false, false, false);
|
||||
}
|
||||
|
||||
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
|
||||
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
|
||||
}
|
||||
|
||||
void ggml_cuda_set_main_device(const int main_device) {
|
||||
static void ggml_cuda_set_main_device(const int main_device) {
|
||||
if (main_device >= g_device_count) {
|
||||
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
||||
main_device, g_device_count, g_main_device);
|
||||
|
@ -9407,30 +9163,12 @@ void ggml_cuda_set_main_device(const int main_device) {
|
|||
|
||||
if (g_main_device != main_device && g_device_count > 1) {
|
||||
g_main_device = main_device;
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
||||
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
||||
//cudaDeviceProp prop;
|
||||
//CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
||||
//fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
||||
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
|
||||
// it still won't always work as expected, but it's better than nothing
|
||||
if (scratch_size > g_scratch_size) {
|
||||
ggml_cuda_free_scratch();
|
||||
}
|
||||
g_scratch_size = std::max(g_scratch_size, scratch_size);
|
||||
}
|
||||
|
||||
void ggml_cuda_free_scratch() {
|
||||
if (g_scratch_buffer == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaFree(g_scratch_buffer));
|
||||
g_scratch_buffer = nullptr;
|
||||
}
|
||||
|
||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
if (!g_cublas_loaded) return false;
|
||||
|
||||
|
@ -9609,6 +9347,12 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
struct ggml_backend_context_cuda {
|
||||
int device;
|
||||
char name[128];
|
||||
};
|
||||
|
||||
|
||||
// cuda buffer
|
||||
|
||||
struct ggml_backend_buffer_context_cuda {
|
||||
|
@ -9689,8 +9433,8 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
|
||||
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
|
@ -9700,7 +9444,6 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
|||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
|
||||
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
|
@ -9709,8 +9452,8 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
|
||||
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
||||
|
@ -9734,7 +9477,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
|||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
CUDA_CHECK(cudaMalloc(&dev_ptr, size));
|
||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, device, cudaGetErrorString(err));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
|
||||
|
||||
|
@ -9768,9 +9515,14 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
|||
}
|
||||
|
||||
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
return ggml_backend_is_cuda(backend);
|
||||
if (!ggml_backend_is_cuda(backend)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
UNUSED(buft);
|
||||
int device = (int) (intptr_t) buft->context;
|
||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||
|
||||
return device == cuda_ctx->device;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||
|
@ -9838,14 +9590,13 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|||
|
||||
// backend
|
||||
|
||||
struct ggml_backend_context_cuda {
|
||||
int device;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||
return GGML_CUDA_NAME;
|
||||
|
||||
UNUSED(backend);
|
||||
//return GGML_CUDA_NAME;
|
||||
//UNUSED(backend);
|
||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||
// TODO: on init
|
||||
sprintf(cuda_ctx->name, "%s%d", "CUD", cuda_ctx->device);
|
||||
return cuda_ctx->name;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
|
@ -10125,6 +9876,21 @@ bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
|||
return backend->iface.get_name == ggml_backend_cuda_name;
|
||||
}
|
||||
|
||||
int ggml_backend_cuda_get_device_count() {
|
||||
return ggml_cuda_get_device_count();
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
||||
ggml_cuda_get_device_description(device, description, description_size);
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
ggml_cuda_set_device(device);
|
||||
|
||||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
}
|
||||
|
||||
// backend registry
|
||||
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
||||
return cuda_backend;
|
||||
|
|
26
ggml-cuda.h
26
ggml-cuda.h
|
@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size);
|
|||
GGML_API void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
|
||||
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
|
||||
GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_cuda_set_main_device(int main_device);
|
||||
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
||||
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
|
||||
GGML_API void ggml_cuda_free_scratch(void);
|
||||
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API int ggml_cuda_get_device_count(void);
|
||||
|
@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description,
|
|||
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
|
||||
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
|
||||
// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
|
||||
GGML_API int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
4
ggml.c
4
ggml.c
|
@ -2324,6 +2324,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
}
|
||||
|
||||
void ggml_free(struct ggml_context * ctx) {
|
||||
if (ctx == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// make this function thread safe
|
||||
ggml_critical_section_start();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue