llama : ggml-backend integration

Author: slaren
Date:   2023-12-28 19:15:46 +01:00
Parent: eec22a1c63
Commit: 33f0761e9b
8 changed files with 756 additions and 1514 deletions

@@ -779,10 +779,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
if (nbytes == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
#endif
return NULL;
}
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
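
With the two early returns added above, callers of ggml_backend_alloc_ctx_tensors_from_buft can no longer assume a non-NULL result. A minimal, hypothetical caller sketch (the wrapper function and its error handling are illustrative, not part of this commit; assumes ggml.h, ggml-backend.h, stdio.h and stdbool.h):

static bool alloc_ctx_tensors_or_fail(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    if (buf == NULL) {
        // either every tensor in ctx already had a buffer (harmless),
        // or the backend could not allocate the required size (fatal for a loader)
        fprintf(stderr, "%s: failed to allocate backend buffer\n", __func__);
        return false;
    }
    return true;
}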

@@ -51,6 +51,7 @@ extern "C" {
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
enum ggml_backend_buffer_usage usage;
};
ggml_backend_buffer_t ggml_backend_buffer_init(

@@ -58,6 +58,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
};
return buffer;
@@ -109,6 +110,10 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
}
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
buffer->usage = usage;
}
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
return buffer->buft;
}
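
Together with the new usage field on ggml_backend_buffer, this setter lets a model loader tag the buffer that holds the weights; the scheduler change further down then pins any op that reads a weight tensor to that buffer's backend. A hedged usage sketch (the surrounding loader code is hypothetical):

ggml_backend_buffer_t weights_buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (weights_buf != NULL) {
    ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
}
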
@@ -777,7 +782,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
}
#if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
@@ -812,17 +817,25 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
if (src == NULL) {
break;
}
ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
if (src_backend != NULL) {
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
// operations with weights are always on the same backend as the weights
cur_backend = src_backend;
SET_CAUSE(node, "1.wgt%d", i);
break;
}
//if (src_backend != NULL) {
int src_prio = sched_backend_prio(sched, src_backend);
size_t src_size = ggml_nbytes(src);
if (src_prio < cur_prio && src_size >= cur_size) {
if (/*src_prio < cur_prio &&*/ src_size >= cur_size) {
cur_prio = src_prio;
cur_size = src_size;
cur_backend = src_backend;
SET_CAUSE(node, "1.src%d", i);
}
}
//}
}
return cur_backend;
}
@@ -933,6 +946,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
}
//printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
#if 0
// pass 2: assign backends to ops from current assignments
// TODO:
// - reuse sched_backend_from_cur
@@ -964,6 +978,23 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
}
}
}
#else
// pass 2: assign backends to ops from current assignments
// start from the end and assign the same backend to previous ops
{
ggml_tallocr_t cur_allocr = NULL;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
struct ggml_tensor * node = graph->nodes[i];
ggml_tallocr_t node_allocr = node_allocr(node);
if (node_allocr != NULL) {
cur_allocr = node_allocr;
} else {
node_allocr(node) = cur_allocr;
SET_CAUSE(node, "2.cur");
}
}
}
#endif
//printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
// pass 3: assign backends to remaining src from dst (should only be leafs)
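
The replacement pass 2 above walks the graph from the last node to the first and lets each unassigned op inherit the allocator of the nearest assigned op that follows it. A simplified, self-contained illustration of that policy over a plain array of backend indices (hypothetical helper; -1 means unassigned):

static void inherit_from_next(int assigned[], int n) {
    int cur = -1;
    for (int i = n - 1; i >= 0; i--) {
        if (assigned[i] != -1) {
            cur = assigned[i];   // remember the most recent explicit assignment
        } else {
            assigned[i] = cur;   // unassigned op inherits the backend of the next assigned op
        }
    }
}
// e.g. {0, -1, -1, 1, -1} becomes {0, 1, 1, 1, -1}; the trailing node stays
// unassigned and is handled by the later passes
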
@@ -1029,9 +1060,21 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
}
ggml_tallocr_t src_allocr = node_allocr(src);
if (src_allocr != node_allocr) {
int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
// check if the input is already in the split
bool found = false;
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
if (sched->splits[cur_split].inputs[k] == src) {
found = true;
break;
}
}
if (!found) {
int n_inputs = sched->splits[cur_split].n_inputs++;
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
}
// create copies
size_t id = hash_id(src);
@@ -1235,6 +1278,10 @@ void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg
sched_reset(sched);
}
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
return sched->n_splits;
}
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = sched_backend_prio(sched, backend);
return sched->tallocs[backend_index];
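
The new ggml_backend_sched_get_n_splits accessor is mostly useful for diagnostics after a compute call. A hypothetical flow, assuming the scheduler and both graphs were created elsewhere:

ggml_backend_sched_init_measure(sched, measure_graph);  // reserve backend buffers from a measure graph
ggml_backend_sched_graph_compute(sched, graph);         // assign backends, split the graph and run it
int n_splits = ggml_backend_sched_get_n_splits(sched);  // how many splits the graph was broken into
fprintf(stderr, "graph executed in %d splits\n", n_splits);
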
@@ -1320,6 +1367,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
ggml_backend_view_init(dst->view_src->buffer, dst);
}
else {

@@ -24,6 +24,11 @@ extern "C" {
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
@@ -32,8 +37,10 @@ extern "C" {
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
//
// Backend
//
@@ -146,6 +153,7 @@ extern "C" {
// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);

@@ -554,10 +554,6 @@ struct cuda_device_capabilities {
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
[[noreturn]]
@@ -9158,247 +9154,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
}
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
const int64_t nrows = ggml_nrows(tensor);
const int64_t ne0 = tensor->ne[0];
const size_t nb1 = tensor->nb[1];
ggml_backend_type backend = tensor->backend;
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
memset(extra, 0, sizeof(*extra));
for (int id = 0; id < g_device_count; ++id) {
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
continue;
}
ggml_cuda_set_device(id);
int64_t row_low, row_high;
if (backend == GGML_BACKEND_GPU) {
row_low = 0;
row_high = nrows;
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
const int64_t rounding = get_row_rounding(tensor->type);
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
row_low -= row_low % rounding;
if (id == g_device_count - 1) {
row_high = nrows;
} else {
row_high = nrows*g_tensor_split[id + 1];
row_high -= row_high % rounding;
}
} else {
GGML_ASSERT(false);
}
if (row_low == row_high) {
continue;
}
int64_t nrows_split = row_high - row_low;
const size_t offset_split = row_low*nb1;
size_t size = ggml_nbytes_split(tensor, nrows_split);
const size_t original_size = size;
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
if (ne0 % MATRIX_ROW_PADDING != 0) {
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
char * buf;
CUDA_CHECK(cudaMalloc(&buf, size));
char * buf_host = (char *)data + offset_split;
// set padding to 0 to avoid possible NaN values
if (size > original_size) {
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
}
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
extra->data_device[id] = buf;
if (backend == GGML_BACKEND_GPU_SPLIT) {
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
}
}
}
tensor->extra = extra;
}
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
return;
}
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
for (int id = 0; id < g_device_count; ++id) {
ggml_cuda_set_device(id);
if (extra->data_device[id] != nullptr) {
CUDA_CHECK(cudaFree(extra->data_device[id]));
}
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
if (extra->events[id][is] != nullptr) {
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
}
}
}
delete extra;
}
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
static size_t g_temp_tensor_extra_index = 0;
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (g_temp_tensor_extras == nullptr) {
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
}
size_t alloc_index = g_temp_tensor_extra_index;
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra));
return extra;
}
static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
if (scratch && g_scratch_size == 0) {
return;
}
tensor->backend = GGML_BACKEND_GPU;
// recursively assign CUDA buffers until a compute tensor is found
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
}
}
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
}
if (scratch && no_alloc) {
return;
}
ggml_tensor_extra_gpu * extra;
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
tensor->op == GGML_OP_VIEW ||
force_inplace;
const size_t size = ggml_nbytes(tensor);
ggml_cuda_set_device(g_main_device);
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t offset = 0;
if (tensor->op == GGML_OP_VIEW) {
memcpy(&offset, tensor->op_params, sizeof(size_t));
}
extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src0_ddc + offset;
} else if (tensor->op == GGML_OP_CPY) {
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
void * src1_ddv = src1_extra->data_device[g_main_device];
extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src1_ddv;
} else if (scratch) {
GGML_ASSERT(size <= g_scratch_size);
if (g_scratch_offset + size > g_scratch_size) {
g_scratch_offset = 0;
}
char * data = (char *) g_scratch_buffer;
if (data == nullptr) {
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
g_scratch_buffer = data;
}
extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = data + g_scratch_offset;
g_scratch_offset += size;
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
} else { // allocate new buffers outside of scratch
void * data;
CUDA_CHECK(cudaMalloc(&data, size));
CUDA_CHECK(cudaMemset(data, 0, size));
extra = new ggml_tensor_extra_gpu;
memset(extra, 0, sizeof(*extra));
extra->data_device[g_main_device] = data;
}
tensor->extra = extra;
}
void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
if (g_scratch_size == 0) {
return;
}
if (g_scratch_buffer == nullptr) {
ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
}
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
const bool inplace = tensor->view_src != nullptr;
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t view_offset = 0;
if (tensor->op == GGML_OP_VIEW) {
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
}
extra->data_device[g_main_device] = src0_ddc + view_offset;
} else {
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
}
tensor->extra = extra;
}
void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
GGML_ASSERT(ggml_is_contiguous(tensor));
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
}
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
}
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
}
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false, false, false);
}
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
}
void ggml_cuda_set_main_device(const int main_device) {
static void ggml_cuda_set_main_device(const int main_device) {
if (main_device >= g_device_count) {
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
main_device, g_device_count, g_main_device);
@@ -9407,30 +9163,12 @@ void ggml_cuda_set_main_device(const int main_device) {
if (g_main_device != main_device && g_device_count > 1) {
g_main_device = main_device;
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
//cudaDeviceProp prop;
//CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
//fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
}
}
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
// it still won't always work as expected, but it's better than nothing
if (scratch_size > g_scratch_size) {
ggml_cuda_free_scratch();
}
g_scratch_size = std::max(g_scratch_size, scratch_size);
}
void ggml_cuda_free_scratch() {
if (g_scratch_buffer == nullptr) {
return;
}
CUDA_CHECK(cudaFree(g_scratch_buffer));
g_scratch_buffer = nullptr;
}
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
if (!g_cublas_loaded) return false;
@@ -9609,6 +9347,12 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
#define UNUSED GGML_UNUSED
struct ggml_backend_context_cuda {
int device;
char name[128];
};
// cuda buffer
struct ggml_backend_buffer_context_cuda {
@@ -9689,8 +9433,8 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaDeviceSynchronize());
}
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -9700,7 +9444,6 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
}
@@ -9709,8 +9452,8 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
CUDA_CHECK(cudaDeviceSynchronize());
}
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@@ -9734,7 +9477,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
void * dev_ptr;
CUDA_CHECK(cudaMalloc(&dev_ptr, size));
cudaError_t err = cudaMalloc(&dev_ptr, size);
if (err != cudaSuccess) {
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, device, cudaGetErrorString(err));
return nullptr;
}
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
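
With this change an allocation failure no longer aborts through CUDA_CHECK; the buffer type returns a null buffer and the error surfaces to the caller, matching the NULL handling added to ggml_backend_alloc_ctx_tensors_from_buft earlier in this commit. A hedged caller sketch (variable names are illustrative):

ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(device);
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buf == NULL) {
    // out of device memory: fall back to a smaller allocation or another backend
}
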
@@ -9768,9 +9515,14 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
}
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_cuda(backend);
if (!ggml_backend_is_cuda(backend)) {
return false;
}
UNUSED(buft);
int device = (int) (intptr_t) buft->context;
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
return device == cuda_ctx->device;
}
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
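
Because the buffer type now carries the device in its context, a CUDA buffer type is only reported as compatible with the CUDA backend that was initialized for the same device. A small sketch (assuming ggml_backend_buft_supports_backend is the existing public wrapper around this callback):

ggml_backend_t backend0 = ggml_backend_cuda_init(0);
ggml_backend_buffer_type_t buft1 = ggml_backend_cuda_buffer_type(1);
bool compatible = ggml_backend_buft_supports_backend(buft1, backend0); // with this change, expected to be false
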
@@ -9838,14 +9590,13 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
// backend
struct ggml_backend_context_cuda {
int device;
};
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
return GGML_CUDA_NAME;
UNUSED(backend);
//return GGML_CUDA_NAME;
//UNUSED(backend);
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
// TODO: on init
sprintf(cuda_ctx->name, "%s%d", "CUD", cuda_ctx->device);
return cuda_ctx->name;
}
static void ggml_backend_cuda_free(ggml_backend_t backend) {
@@ -10125,6 +9876,21 @@ bool ggml_backend_is_cuda(ggml_backend_t backend) {
return backend->iface.get_name == ggml_backend_cuda_name;
}
int ggml_backend_cuda_get_device_count() {
return ggml_cuda_get_device_count();
}
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
ggml_cuda_get_device_description(device, description, description_size);
}
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
ggml_cuda_set_device(device);
CUDA_CHECK(cudaMemGetInfo(free, total));
}
// backend registry
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
return cuda_backend;
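
The new backend-level queries make it possible to enumerate CUDA devices without going through the legacy ggml_cuda_* API. A hypothetical enumeration sketch:

int n_dev = ggml_backend_cuda_get_device_count();
for (int i = 0; i < n_dev; i++) {
    char desc[128];
    size_t free, total;
    ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
    ggml_backend_cuda_get_device_memory(i, &free, &total);
    fprintf(stderr, "device %d: %s, %zu/%zu MiB free\n", i, desc, free/1024/1024, total/1024/1024);
}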

@@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
GGML_API void ggml_cuda_free_scratch(void);
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API int ggml_cuda_get_device_count(void);
@@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description,
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API int ggml_backend_cuda_get_device_count(void);
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
#ifdef __cplusplus
}
#endif

ggml.c

@@ -2324,6 +2324,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}
void ggml_free(struct ggml_context * ctx) {
if (ctx == NULL) {
return;
}
// make this function thread safe
ggml_critical_section_start();
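
The NULL check makes ggml_free safe to call on a context that was never successfully created, mirroring free(NULL). A trivial usage sketch:

struct ggml_context * ctx = NULL;
// ... initialization may fail and leave ctx as NULL ...
ggml_free(ctx); // with this change, a no-op rather than a dereference of a null context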

llama.cpp

File diff suppressed because it is too large.