llama : initial ggml-backend integration

This commit is contained in:
slaren 2023-12-17 21:21:07 +01:00
parent b1306c4394
commit 8e6735ec60
8 changed files with 386 additions and 546 deletions

View file

@ -65,7 +65,7 @@ test: $(TEST_TARGETS)
./$$test_target; \ ./$$test_target; \
fi; \ fi; \
if [ $$? -ne 0 ]; then \ if [ $$? -ne 0 ]; then \
printf 'Test $$test_target FAILED!\n\n' $$test_target; \ printf 'Test %s FAILED!\n\n' $$test_target; \
failures=$$(( failures + 1 )); \ failures=$$(( failures + 1 )); \
else \ else \
printf 'Test %s passed.\n\n' $$test_target; \ printf 'Test %s passed.\n\n' $$test_target; \

View file

@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
if (update_backend) { if (update_backend) {
view->backend = view->view_src->backend; view->backend = view->view_src->backend;
} }
view->buffer = view->view_src->buffer; // views are initialized in the alloc buffer rather than the view_src buffer
view->buffer = alloc->buffer;
view->data = (char *)view->view_src->data + view->view_offs; view->data = (char *)view->view_src->data + view->view_offs;
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
if (!alloc->measure) { if (!alloc->measure) {
@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
} }
void ggml_allocr_free(ggml_allocr_t alloc) { void ggml_allocr_free(ggml_allocr_t alloc) {
if (alloc == NULL) {
return;
}
ggml_gallocr_free(alloc->galloc); ggml_gallocr_free(alloc->galloc);
ggml_tallocr_free(alloc->talloc); ggml_tallocr_free(alloc->talloc);
free(alloc); free(alloc);
@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
} }
if (nbytes == 0) { if (nbytes == 0) {
fprintf(stderr, "%s: no tensors to allocate\n", __func__); //fprintf(stderr, "%s: no tensors to allocate\n", __func__);
return NULL; return NULL;
} }

View file

@ -378,7 +378,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context); free(buffer->context);
GGML_UNUSED(buffer);
} }
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@ -456,7 +455,7 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
} }
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
/* .iface = */ { /* .iface = */ {
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
@ -466,9 +465,51 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_buffer_type_cpu; return &ggml_backend_cpu_buffer_type;
} }
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
// HBM buffer type
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
//void * ptr = hbw_malloc(size);
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}
// FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = {
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
};
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ cpu_backend_hbm_buffer_type_interface,
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type_hbm;
}
#endif
struct ggml_backend_cpu_context { struct ggml_backend_cpu_context {
int n_threads; int n_threads;
void * work_data; void * work_data;
@ -505,7 +546,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cgraph = *cgraph; cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) { if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);

View file

@ -76,6 +76,10 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
// //
// Backend registry // Backend registry
// //

View file

@ -7057,6 +7057,7 @@ inline void ggml_cuda_op_upscale(
(void) src1; (void) src1;
(void) dst; (void) dst;
(void) src1_dd;
} }
inline void ggml_cuda_op_pad( inline void ggml_cuda_op_pad(
@ -7073,6 +7074,7 @@ inline void ggml_cuda_op_pad(
(void) src1; (void) src1;
(void) dst; (void) dst;
(void) src1_dd;
} }
inline void ggml_cuda_op_rms_norm( inline void ggml_cuda_op_rms_norm(
@ -8958,7 +8960,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
char * buf; char * buf;
CUDA_CHECK(cudaMalloc(&buf, size)); CUDA_CHECK(cudaMalloc(&buf, size));
char * buf_host = (char*)data + offset_split; char * buf_host = (char *)data + offset_split;
// set padding to 0 to avoid possible NaN values // set padding to 0 to avoid possible NaN values
if (size > original_size) { if (size > original_size) {
@ -9103,11 +9105,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || const bool inplace = tensor->view_src != nullptr;
tensor->op == GGML_OP_VIEW;
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t view_offset = 0; size_t view_offset = 0;
if (tensor->op == GGML_OP_VIEW) { if (tensor->op == GGML_OP_VIEW) {
@ -9431,9 +9432,12 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
UNUSED(buffer); ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
} }
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@ -9441,9 +9445,12 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
UNUSED(buffer); ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
} }
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
@ -9505,7 +9512,7 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
UNUSED(buft); UNUSED(buft);
} }
static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = { static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
@ -9513,27 +9520,27 @@ static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
}; };
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES]; static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
static bool ggml_backend_buffer_type_cuda_initialized = false;
if (!ggml_backend_buffer_type_cuda_initialized) { static bool ggml_backend_cuda_buffer_type_initialized = false;
if (!ggml_backend_cuda_buffer_type_initialized) {
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
ggml_backend_buffer_type_cuda[i] = { ggml_backend_cuda_buffer_types[i] = {
/* .iface = */ cuda_backend_buffer_type_interface, /* .iface = */ ggml_backend_cuda_buffer_type_interface,
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
}; };
} }
ggml_backend_buffer_type_cuda_initialized = true; ggml_backend_cuda_buffer_type_initialized = true;
} }
return &ggml_backend_buffer_type_cuda[device]; return &ggml_backend_cuda_buffer_types[device];
} }
// host buffer type // host buffer type
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; CUDA_CHECK(cudaFreeHost(buffer->context));
CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
delete ctx;
} }
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -9546,11 +9553,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
return buffer; return buffer;
UNUSED(buft);
} }
struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = { struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = {
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@ -9558,12 +9563,12 @@ struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
}; };
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
/* .iface = */ cuda_backend_host_buffer_type_interface, /* .iface = */ ggml_backend_cuda_host_buffer_type_interface,
/* .context = */ nullptr, /* .context = */ nullptr,
}; };
return &ggml_backend_buffer_type_cuda_host; return &ggml_backend_cuda_buffer_type_host;
} }
// backend // backend

24
ggml.c
View file

@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
size_t max_size = 0; size_t max_size = 0;
struct ggml_object * obj = ctx->objects_begin; for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
max_size = MAX(max_size, ggml_nbytes(tensor));
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
const size_t size = ggml_nbytes(tensor);
if (max_size < size) {
max_size = size;
}
}
obj = obj->next;
} }
return max_size; return max_size;
@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
return result; return result;
} }
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin; struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer; char * const mem_buffer = ctx->mem_buffer;
@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
return NULL; return NULL;
} }
struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) { struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next; obj = obj->next;
@ -19179,6 +19167,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
return ctx->infos[i].name.data; return ctx->infos[i].name.data;
} }
enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
return ctx->infos[i].type;
}
// returns the index // returns the index
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key); const int idx = gguf_find_key(ctx, key);

13
ggml.h
View file

@ -729,8 +729,8 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
// Context tensor enumeration and lookup // Context tensor enumeration and lookup
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@ -2123,10 +2123,11 @@ extern "C" {
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
// overrides existing values or adds a new one // overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);

772
llama.cpp

File diff suppressed because it is too large Load diff