metal : map the CPU buffers to Metal buffers (WIP)
This commit is contained in:
parent
f38433ef5d
commit
290cb700bf
5 changed files with 78 additions and 24 deletions
|
@ -94,6 +94,7 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size
|
||||||
*allocator = (struct ggml_backend_buffer){
|
*allocator = (struct ggml_backend_buffer){
|
||||||
/* .interface = */ ggml_allocator_simple_interface,
|
/* .interface = */ ggml_allocator_simple_interface,
|
||||||
/* .context = */ ctx,
|
/* .context = */ ctx,
|
||||||
|
/* .backend_size = */ 0,
|
||||||
/* .backend_data = */ NULL,
|
/* .backend_data = */ NULL,
|
||||||
};
|
};
|
||||||
return allocator;
|
return allocator;
|
||||||
|
@ -192,6 +193,7 @@ static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_ba
|
||||||
|
|
||||||
struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
|
struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT);
|
||||||
buffer->interface.free_data = ggml_backend_cpu_free_buffer;
|
buffer->interface.free_data = ggml_backend_cpu_free_buffer;
|
||||||
|
buffer->backend_size = size;
|
||||||
buffer->backend_data = data;
|
buffer->backend_data = data;
|
||||||
|
|
||||||
return buffer;
|
return buffer;
|
||||||
|
|
|
@ -27,6 +27,7 @@ extern "C" {
|
||||||
struct ggml_backend_buffer {
|
struct ggml_backend_buffer {
|
||||||
struct ggml_backend_buffer_interface interface;
|
struct ggml_backend_buffer_interface interface;
|
||||||
ggml_buffer_context_t context;
|
ggml_buffer_context_t context;
|
||||||
|
size_t backend_size;
|
||||||
void * backend_data;
|
void * backend_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
10
ggml-metal.h
10
ggml-metal.h
|
@ -34,9 +34,17 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// GG: maybe return ptr and avoid the "ggml.h" include
|
|
||||||
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu);
|
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu);
|
||||||
|
|
||||||
|
// TODO: temporary - move to backend interface
|
||||||
|
bool ggml_backend_metal_map_buffer(
|
||||||
|
struct ggml_backend * backend,
|
||||||
|
const char * name,
|
||||||
|
void * data,
|
||||||
|
size_t size,
|
||||||
|
size_t max_size);
|
||||||
|
|
||||||
|
|
||||||
//struct ggml_metal_context;
|
//struct ggml_metal_context;
|
||||||
//
|
//
|
||||||
//// number of command buffers to use
|
//// number of command buffers to use
|
||||||
|
|
43
ggml-metal.m
43
ggml-metal.m
|
@ -242,6 +242,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||||
return nil;
|
return nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: rename to ggml_metal_map_buffer
|
||||||
bool ggml_metal_add_buffer(
|
bool ggml_metal_add_buffer(
|
||||||
struct ggml_metal_context * ctx,
|
struct ggml_metal_context * ctx,
|
||||||
const char * name,
|
const char * name,
|
||||||
|
@ -993,38 +994,42 @@ void ggml_metal_graph_compute(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * ggml_backend_metal_name(ggml_backend_context_t ctx) {
|
bool ggml_backend_metal_map_buffer(
|
||||||
|
struct ggml_backend * backend,
|
||||||
|
const char * name,
|
||||||
|
void * data,
|
||||||
|
size_t size,
|
||||||
|
size_t max_size) {
|
||||||
|
return ggml_metal_add_buffer(backend->context, name, data, size, max_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char * ggml_backend_metal_name(struct ggml_backend * ctx) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
UNUSED(ctx);
|
UNUSED(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) {
|
static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) ctx;
|
ggml_metal_graph_compute(backend->context, cgraph);
|
||||||
|
|
||||||
ggml_metal_graph_compute(ctx_metal, cgraph);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_interface metal_backend_interface = {
|
static struct ggml_backend_interface metal_backend_interface = {
|
||||||
/* .get_name = */ ggml_backend_metal_name,
|
/* .get_name = */ ggml_backend_metal_name,
|
||||||
/* .free_context = */ NULL, //ggml_backend_metal_free_context,
|
/* .free = */ NULL, //ggml_backend_metal_alloc_buffer,
|
||||||
/* .alloc_buffer = */ NULL, //ggml_backend_metal_alloc_buffer,
|
/* .alloc_buffer = */ NULL, //ggml_backend_metal_free_buffer,
|
||||||
/* .free_buffer = */ NULL, //ggml_backend_metal_free_buffer,
|
/* .set_tensor_async = */ NULL, //ggml_backend_metal_reset_buffer,
|
||||||
/* .reset_buffer = */ NULL, //ggml_backend_metal_reset_buffer,
|
/* .get_tensor_async = */ NULL, //ggml_backend_metal_alloc_tensor,
|
||||||
/* .alloc_tensor = */ NULL, //ggml_backend_metal_alloc_tensor,
|
/* .synchronize = */ NULL, //ggml_backend_metal_set_tensor_async,
|
||||||
/* .set_tensor_async = */ NULL, //ggml_backend_metal_set_tensor_async,
|
/* .cpy_tensor_from = */ NULL, //ggml_backend_metal_get_tensor_async,
|
||||||
/* .get_tensor_async = */ NULL, //ggml_backend_metal_get_tensor_async,
|
/* .cpy_tensor_to = */ NULL, //ggml_backend_metal_synchronize,
|
||||||
/* .synchronize = */ NULL, //ggml_backend_metal_synchronize,
|
/* .graph_plan_create = */ NULL, //nullptr,
|
||||||
/* .cpy_tensor_from = */ NULL, //nullptr,
|
/* .graph_plan_free = */ NULL, //nullptr,
|
||||||
/* .cpy_tensor_to = */ NULL, //nullptr,
|
/* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_create,
|
||||||
/* .graph_plan_create = */ NULL, //ggml_backend_metal_graph_plan_create,
|
|
||||||
/* .graph_plan_free = */ NULL, //ggml_backend_metal_graph_plan_free,
|
|
||||||
/* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_compute,
|
|
||||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) {
|
struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) {
|
||||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
struct ggml_metal_context * ctx = ggml_metal_init(8);
|
||||||
|
|
||||||
struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend));
|
struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend));
|
||||||
*backend_metal = (struct ggml_backend){
|
*backend_metal = (struct ggml_backend){
|
||||||
|
|
38
llama.cpp
38
llama.cpp
|
@ -2817,6 +2817,44 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
if (params.n_gpu_layers > 0) {
|
||||||
|
void * data_ptr = NULL;
|
||||||
|
size_t data_size = 0;
|
||||||
|
|
||||||
|
if (params.use_mmap) {
|
||||||
|
data_ptr = ctx->model.mapping->addr;
|
||||||
|
data_size = ctx->model.mapping->size;
|
||||||
|
} else {
|
||||||
|
data_ptr = ggml_get_mem_buffer(ctx->model.ctx_metal);
|
||||||
|
data_size = ggml_get_mem_size (ctx->model.ctx_metal);
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx_metal);
|
||||||
|
|
||||||
|
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
||||||
|
|
||||||
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
||||||
|
if (!(result)) { \
|
||||||
|
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
|
||||||
|
llama_free(ctx); \
|
||||||
|
return NULL; \
|
||||||
|
}
|
||||||
|
|
||||||
|
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "data", data_ptr, data_size, max_size));
|
||||||
|
|
||||||
|
struct ggml_backend_buffer * buf_compute = ctx->buf_compute_metal->backend_buffer;
|
||||||
|
struct ggml_backend_buffer * buf_kv = ctx->kv_self.buf->backend_buffer;
|
||||||
|
|
||||||
|
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "eval", buf_compute->backend_data, buf_compute->backend_size, 0));
|
||||||
|
LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "kv", buf_kv->backend_data, buf_kv->backend_size, 0));
|
||||||
|
|
||||||
|
//LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
|
||||||
|
//LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
|
||||||
|
#undef LLAMA_METAL_CHECK_BUF
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
fprintf(stderr, "%s: layer backends: ", __func__);
|
fprintf(stderr, "%s: layer backends: ", __func__);
|
||||||
fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp));
|
fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp));
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue