metal : create backend, mostly reuse CPU backend interface
commit 70c55c17c7 (parent ed960fa1ab)
3 changed files with 61 additions and 29 deletions
ggml-metal.h
@@ -35,7 +35,7 @@ extern "C" {
 #endif
 
 // GG: maybe return ptr and avoid the "ggml.h" include
-struct ggml_backend ggml_backend_metal_init();
+struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu);
 
 //struct ggml_metal_context;
 //
ggml-metal.m (69 changed lines)
@@ -993,30 +993,59 @@ void ggml_metal_graph_compute(
     }
 }
 
+static const char * ggml_backend_metal_name(ggml_backend_context_t ctx) {
+    return "Metal";
+
+    UNUSED(ctx);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) {
+    struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) ctx;
+
+    ggml_metal_graph_compute(ctx_metal, cgraph);
+}
+
 static struct ggml_backend_interface metal_backend_interface = {
-    /* .get_name            = */ //ggml_backend_metal_name,
-    /* .free_context        = */ //ggml_backend_metal_free_context,
-    /* .alloc_buffer        = */ //ggml_backend_metal_alloc_buffer,
-    /* .free_buffer         = */ //ggml_backend_metal_free_buffer,
-    /* .reset_buffer        = */ //ggml_backend_metal_reset_buffer,
-    /* .alloc_tensor        = */ //ggml_backend_metal_alloc_tensor,
-    /* .set_tensor_async    = */ //ggml_backend_metal_set_tensor_async,
-    /* .get_tensor_async    = */ //ggml_backend_metal_get_tensor_async,
-    /* .synchronize         = */ //ggml_backend_metal_synchronize,
-    /* .cpy_tensor_from     = */ //nullptr,
-    /* .cpy_tensor_to       = */ //nullptr,
-    /* .graph_plan_create   = */ //ggml_backend_metal_graph_plan_create,
-    /* .graph_plan_free     = */ //ggml_backend_metal_graph_plan_free,
-    /* .graph_plan_compute  = */ //ggml_backend_metal_graph_plan_compute,
-    /* .graph_compute       = */ //ggml_backend_metal_graph_compute
+    /* .get_name            = */ ggml_backend_metal_name,
+    /* .free_context        = */ NULL, //ggml_backend_metal_free_context,
+    /* .alloc_buffer        = */ NULL, //ggml_backend_metal_alloc_buffer,
+    /* .free_buffer         = */ NULL, //ggml_backend_metal_free_buffer,
+    /* .reset_buffer        = */ NULL, //ggml_backend_metal_reset_buffer,
+    /* .alloc_tensor        = */ NULL, //ggml_backend_metal_alloc_tensor,
+    /* .set_tensor_async    = */ NULL, //ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async    = */ NULL, //ggml_backend_metal_get_tensor_async,
+    /* .synchronize         = */ NULL, //ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from     = */ NULL, //nullptr,
+    /* .cpy_tensor_to       = */ NULL, //nullptr,
+    /* .graph_plan_create   = */ NULL, //ggml_backend_metal_graph_plan_create,
+    /* .graph_plan_free     = */ NULL, //ggml_backend_metal_graph_plan_free,
+    /* .graph_plan_compute  = */ NULL, //ggml_backend_metal_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
 };
 
-struct ggml_backend ggml_backend_metal_init(void) {
+struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu) {
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
-    struct ggml_backend metal_backend = {
+    struct ggml_backend backend_metal = {
         /* .interface     = */ &metal_backend_interface,
-        /* .context       = */ ctx
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ true,
     };
-    return metal_backend;
+
+    // reuses CPU calls for now
+    backend_metal.interface->free_context       = backend_cpu->interface->free_context;
+    backend_metal.interface->alloc_buffer       = backend_cpu->interface->alloc_buffer;
+    backend_metal.interface->free_buffer        = backend_cpu->interface->free_buffer;
+    backend_metal.interface->reset_buffer       = backend_cpu->interface->reset_buffer;
+    backend_metal.interface->alloc_tensor       = backend_cpu->interface->alloc_tensor;
+    backend_metal.interface->set_tensor_async   = backend_cpu->interface->set_tensor_async;
+    backend_metal.interface->get_tensor_async   = backend_cpu->interface->get_tensor_async;
+    backend_metal.interface->synchronize        = backend_cpu->interface->synchronize;
+    backend_metal.interface->cpy_tensor_from    = backend_cpu->interface->cpy_tensor_from;
+    backend_metal.interface->cpy_tensor_to      = backend_cpu->interface->cpy_tensor_to;
+    backend_metal.interface->graph_plan_create  = backend_cpu->interface->graph_plan_create;
+    backend_metal.interface->graph_plan_free    = backend_cpu->interface->graph_plan_free;
+    backend_metal.interface->graph_plan_compute = backend_cpu->interface->graph_plan_compute;
+
+    return backend_metal;
 }
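The net effect: graph compute is the only Metal-specific entry point, while every memory-management call is delegated to the CPU backend's function pointers. A minimal usage sketch, assuming a ggml_backend_cpu_init() counterpart that also returns struct ggml_backend by value, and a graph gf built elsewhere (both are assumptions, not shown in this commit):

    // wire the Metal backend on top of the CPU one (ggml_backend_cpu_init is assumed)
    struct ggml_backend backend_cpu   = ggml_backend_cpu_init();
    struct ggml_backend backend_metal = ggml_backend_metal_init(&backend_cpu);

    // allocation/copy calls hit the CPU pointers copied into metal_backend_interface,
    // while graph_compute dispatches to ggml_metal_graph_compute
    backend_metal.interface->graph_compute(backend_metal.context, gf);

One caveat worth noting: metal_backend_interface is a single static struct, so patching it inside ggml_backend_metal_init() mutates state shared by every Metal backend instance.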
llama.cpp (19 changed lines)
@@ -968,7 +968,7 @@ static void llama_model_load_internal(
 #endif
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_metal_init();
+        model.backend_metal = ggml_backend_metal_init(backend_cpu);
         backend_gpu = &model.backend_metal;
     }
 #endif
@@ -1008,17 +1008,20 @@ static void llama_model_load_internal(
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[backend_cpu];
-        ctx_sizes[backend_cpu] = 0;
+        for (auto & it : ctx_sizes) {
+            if (it.first->is_ram_shared) {
+                mmap_size += it.second;
+                ctx_sizes[it.first] = 0;
+            }
+        }
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
-        fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first->is_ram_shared && ml->use_mmap) {
-            fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
-        }
-        fprintf(stderr, "\n");
+        fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
+    }
+    if (mmap_size > 0) {
+        fprintf(stderr, "%8s = %7.2f MB\n", "mmap", mmap_size / 1024.0 / 1024.0);
     }
 
     // create the buffers and contexts
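With the Metal backend now flagged is_ram_shared, the loader folds every RAM-shared backend's context size into a single mmap total instead of special-casing backend_cpu. A self-contained toy version of the accounting with made-up sizes (the struct, names, and numbers are all hypothetical; only the loop logic mirrors the diff):

    #include <cstdio>
    #include <map>

    // illustrative stand-in for the backend struct: only the field the loop reads
    struct backend { const char * name; bool is_ram_shared; };

    int main() {
        backend cpu   = { "CPU",   true };
        backend metal = { "Metal", true }; // set by ggml_backend_metal_init in this commit

        // hypothetical per-backend context sizes, in MB
        std::map<backend *, size_t> ctx_sizes = { { &cpu, 1536u }, { &metal, 4096u } };

        size_t mmap_size = 0;
        for (auto & it : ctx_sizes) {
            if (it.first->is_ram_shared) {
                mmap_size += it.second;   // fold into the single mapped total
                ctx_sizes[it.first] = 0;  // no separate buffer for this backend
            }
        }
        printf("mmap = %zu MB\n", mmap_size); // 5632: both backends share the mapping
    }

A backend reporting is_ram_shared == false (say, a discrete-GPU backend) would keep its entry in ctx_sizes and receive its own buffer later in the loader.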