diff --git a/ggml-metal.h b/ggml-metal.h
index 928f1705c..a726ddd1c 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -19,51 +19,56 @@
 
 #pragma once
 
+#include "ggml.h"
+
 #include <stddef.h>
 #include <stdbool.h>
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
 
-struct ggml_tensor;
-struct ggml_cgraph;
+//struct ggml_tensor;
+//struct ggml_cgraph;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct ggml_metal_context;
+// GG: maybe return ptr and avoid the "ggml.h" include
+struct ggml_backend ggml_backend_metal_init();
 
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
-
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
+//struct ggml_metal_context;
 //
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-        const char * name,
-        void   * data,
-        size_t   size,
-        size_t   max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+//// number of command buffers to use
+//struct ggml_metal_context * ggml_metal_init(int n_cb);
+//void ggml_metal_free(struct ggml_metal_context * ctx);
+//
+//// set the number of command buffers to use
+//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+//
+//// creates a mapping between a host memory buffer and a device memory buffer
+//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+//// - the mapping is used during computation to determine the arguments of the compute kernels
+//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+//// - max_size specifies the maximum size of a tensor and is used to create shared views such
+////   that it is guaranteed that the tensor will fit in at least one of the views
+////
+//bool ggml_metal_add_buffer(
+//        struct ggml_metal_context * ctx,
+//        const char * name,
+//        void   * data,
+//        size_t   size,
+//        size_t   max_size);
+//
+//// set data from host memory into the device
+//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// get data from the device into host memory
+//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// same as ggml_graph_compute but uses Metal
+//// creates gf->n_threads command buffers in parallel
+//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
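Note on the GG comment above ("maybe return ptr and avoid the ggml.h include"): one possible shape of that alternative is sketched below, with the header forward-declaring the backend type and the init function returning a pointer, so the new #include "ggml.h" could be dropped again. This is only an illustration of the suggestion, not something the patch implements, and the ownership/free story for the returned object would still have to be decided.

    // hypothetical pointer-returning variant suggested by the GG note (not part of this patch)
    struct ggml_backend;                                  // forward declaration, no "ggml.h" needed

    struct ggml_backend * ggml_backend_metal_init(void);  // caller would release it via a matching free function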
diff --git a/ggml-metal.m b/ggml-metal.m
index ee205bcdf..d7ff833a4 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -992,3 +992,31 @@ void ggml_metal_graph_compute(
         }
     }
 }
+
+static struct ggml_backend_interface metal_backend_interface = {
+    /* .get_name            = */ //ggml_backend_metal_name,
+    /* .free_context        = */ //ggml_backend_metal_free_context,
+    /* .alloc_buffer        = */ //ggml_backend_metal_alloc_buffer,
+    /* .free_buffer         = */ //ggml_backend_metal_free_buffer,
+    /* .reset_buffer        = */ //ggml_backend_metal_reset_buffer,
+    /* .alloc_tensor        = */ //ggml_backend_metal_alloc_tensor,
+    /* .set_tensor_async    = */ //ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async    = */ //ggml_backend_metal_get_tensor_async,
+    /* .synchronize         = */ //ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from     = */ //nullptr,
+    /* .cpy_tensor_to       = */ //nullptr,
+    /* .graph_plan_create   = */ //ggml_backend_metal_graph_plan_create,
+    /* .graph_plan_free     = */ //ggml_backend_metal_graph_plan_free,
+    /* .graph_plan_compute  = */ //ggml_backend_metal_graph_plan_compute,
+    /* .graph_compute       = */ //ggml_backend_metal_graph_compute
+};
+
+struct ggml_backend ggml_backend_metal_init(void) {
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+
+    struct ggml_backend metal_backend = {
+        /* .interface = */ &metal_backend_interface,
+        /* .context   = */ ctx
+    };
+    return metal_backend;
+}
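For orientation, a minimal hypothetical consumer of the new entry point is sketched below. It assumes the commented-out callbacks above eventually get hooked up (as written, ggml_backend_name would have nothing to dispatch to) and that ggml_backend_name takes a ggml_backend pointer and returns a printable name, which is how the llama.cpp hunks below use it.

    // hypothetical usage sketch, not part of the patch
    #include <cstdio>

    #include "ggml.h"
    #include "ggml-metal.h"

    int main() {
        struct ggml_backend backend = ggml_backend_metal_init();
        // same accessor that llama.cpp uses below to print per-backend context sizes
        std::fprintf(stderr, "initialized backend: %s\n", ggml_backend_name(&backend));
        return 0;
    }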
diff --git a/llama.cpp b/llama.cpp
index 61e31f45f..d4b563b84 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -233,6 +233,11 @@ struct llama_model {
     ggml_buffer buf_cuda;
     ggml_context * ctx_cuda = NULL;
 #endif
+#ifdef GGML_USE_METAL
+    ggml_backend backend_metal;
+    ggml_buffer buf_metal;
+    ggml_context * ctx_metal = NULL;
+#endif
 
     // backend assigned to each layer
     ggml_backend * backend_input = NULL;
@@ -249,6 +254,12 @@ struct llama_model {
             ggml_free(ctx_cuda);
            ggml_backend_free_buffer(&buf_cuda);
         }
+#endif
+#ifdef GGML_USE_METAL
+        if (ctx_metal) {
+            ggml_free(ctx_metal);
+            ggml_backend_free_buffer(&buf_metal);
+        }
 #endif
     }
 };
@@ -290,6 +301,9 @@ struct llama_context {
 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
+#ifdef GGML_USE_METAL
+    ggml_buffer buf_compute_metal = {};
+#endif
 
     // input tensors
     struct ggml_tensor * graph_tokens_in = nullptr;
@@ -940,6 +954,8 @@ static void llama_model_load_internal(
     const uint32_t n_layer = hparams.n_layer;
 
     model.backend_cpu = ggml_backend_cpu_init();
+
+    ggml_backend * backend_cpu = &model.backend_cpu;
     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
@@ -947,14 +963,21 @@ static void llama_model_load_internal(
         backend_gpu = &model.backend_cuda;
     }
 #endif
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        model.backend_metal = ggml_backend_metal_init();
+        backend_gpu = &model.backend_metal;
+    }
+#endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
-    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : &model.backend_cpu;
-    model.backend_output = n_gpu_layers > 0 ? backend_gpu : &model.backend_cpu;
+    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
+    model.backend_output = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
+
     model.backend_layers.resize(n_layer);
-    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, &model.backend_cpu);
-    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu);
+    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu);
+    std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu);
 
     // calculate the size of each context
     std::unordered_map<ggml_backend *, size_t> ctx_sizes;
@@ -977,17 +1000,18 @@ static void llama_model_load_internal(
             ctx_sizes[model.backend_layers[layer]] += lt.size;
         }
     }
+
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[&model.backend_cpu];
-        ctx_sizes[&model.backend_cpu] = 0;
+        mmap_size = ctx_sizes[backend_cpu];
+        ctx_sizes[backend_cpu] = 0;
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == &model.backend_cpu && ml->use_mmap) {
+        if (it.first == backend_cpu && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
@@ -996,8 +1020,8 @@ static void llama_model_load_internal(
     // create the buffers and contexts
     {
         size_t cpu_num_tensors = ml->tensors_map.tensors.size();
-        size_t ctx_size = ctx_sizes[&model.backend_cpu];
-        model.buf_cpu = ggml_backend_alloc_buffer(&model.backend_cpu, ctx_size, cpu_num_tensors);
+        size_t ctx_size = ctx_sizes[backend_cpu];
+        model.buf_cpu = ggml_backend_alloc_buffer(backend_cpu, ctx_size, cpu_num_tensors);
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &model.buf_cpu;
         params.no_alloc = ml->use_mmap;
@@ -1028,6 +1052,7 @@ static void llama_model_load_internal(
     if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
     ggml_context * ctx_output = model.ctx_cpu;
     if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
+
     std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
     for (uint32_t i = 0; i < n_layer; ++i) {
         if (model.backend_layers[i] == backend_gpu) {
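To make the split logic in llama_model_load_internal easier to follow, here is a small standalone illustration of the same rule. The layer counts are made up and "cpu"/"gpu" simply stand in for backend_cpu and backend_gpu; it is a sketch of the arithmetic, not code from the patch.

    // same split rule as above: the last n_gpu_layers layers go to the GPU backend,
    // the output goes to the GPU whenever anything is offloaded, and the inputs go
    // to the GPU only when every layer is offloaded
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_layer = 32, n_gpu_layers = 10;                   // example values only
        const int i_gpu_start = std::max(0, n_layer - n_gpu_layers); // first offloaded layer

        std::vector<const char *> backend_layers(n_layer);
        std::fill(backend_layers.begin(), backend_layers.begin() + i_gpu_start, "cpu");
        std::fill(backend_layers.begin() + i_gpu_start, backend_layers.end(), "gpu");

        std::printf("input : %s\n", n_gpu_layers > n_layer ? "gpu" : "cpu");
        std::printf("output: %s\n", n_gpu_layers > 0       ? "gpu" : "cpu");
        std::printf("layers: 0..%d on cpu, %d..%d on gpu\n", i_gpu_start - 1, i_gpu_start, n_layer - 1);
        return 0;
    }

With these example values i_gpu_start is 22, so the output and layers 22..31 end up on the GPU backend (Metal on this branch when n_gpu_layers > 0), while the inputs stay on the CPU because not all layers are offloaded.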