metal : adapting to ggml_backend (WIP)

commit 0a3861c47b
parent 1102ff56db

3 changed files with 99 additions and 41 deletions
ggml-metal.h (69 lines changed)

@@ -19,51 +19,56 @@
 #pragma once
 
+#include "ggml.h"
+
 #include <stddef.h>
 #include <stdbool.h>
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
 
-struct ggml_tensor;
-struct ggml_cgraph;
+//struct ggml_tensor;
+//struct ggml_cgraph;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct ggml_metal_context;
-
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
-
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
-//
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                       const char * name,
-                             void * data,
-                           size_t   size,
-                           size_t   max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+// GG: maybe return ptr and avoid the "ggml.h" include
+struct ggml_backend ggml_backend_metal_init();
+
+//struct ggml_metal_context;
+//
+//// number of command buffers to use
+//struct ggml_metal_context * ggml_metal_init(int n_cb);
+//void ggml_metal_free(struct ggml_metal_context * ctx);
+//
+//// set the number of command buffers to use
+//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+//
+//// creates a mapping between a host memory buffer and a device memory buffer
+//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+//// - the mapping is used during computation to determine the arguments of the compute kernels
+//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+//// - max_size specifies the maximum size of a tensor and is used to create shared views such
////   that it is guaranteed that the tensor will fit in at least one of the views
+////
+//bool ggml_metal_add_buffer(
+//        struct ggml_metal_context * ctx,
+//                       const char * name,
+//                             void * data,
+//                           size_t   size,
+//                           size_t   max_size);
+//
+//// set data from host memory into the device
+//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// get data from the device into host memory
+//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+//
+//// same as ggml_graph_compute but uses Metal
+//// creates gf->n_threads command buffers in parallel
+//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
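
For orientation, a minimal usage sketch of the new entry point. It assumes the ggml_backend helpers that the llama.cpp hunks below call (ggml_backend_alloc_buffer, ggml_init_params_default, the params.buffer field); their exact signatures and the struct ggml_buffer tag are inferred from those call sites, not confirmed by this diff, and example_metal_ctx is a hypothetical helper:

// hypothetical usage sketch -- helper signatures are assumed from the
// llama.cpp call sites in this same commit, not from this header
#include "ggml.h"
#include "ggml-metal.h"

static struct ggml_context * example_metal_ctx(void) {
    // create the Metal backend (replaces the old ggml_metal_init/ggml_metal_add_buffer flow)
    struct ggml_backend backend_metal = ggml_backend_metal_init();

    // allocate a device buffer and bind a ggml context to it
    struct ggml_buffer buf = ggml_backend_alloc_buffer(&backend_metal, 16*1024*1024, /*num_tensors*/ 128);

    struct ggml_init_params params = ggml_init_params_default();
    params.buffer   = &buf;   // tensors created in this context are placed in the Metal buffer
    params.no_alloc = false;

    return ggml_init(params); // assumed: ggml_init picks up the backend buffer from params
}
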
ggml-metal.m (28 lines changed)

@@ -992,3 +992,31 @@ void ggml_metal_graph_compute(
         }
     }
 }
+
+static struct ggml_backend_interface metal_backend_interface = {
+    /* .get_name            = */ //ggml_backend_metal_name,
+    /* .free_context        = */ //ggml_backend_metal_free_context,
+    /* .alloc_buffer        = */ //ggml_backend_metal_alloc_buffer,
+    /* .free_buffer         = */ //ggml_backend_metal_free_buffer,
+    /* .reset_buffer        = */ //ggml_backend_metal_reset_buffer,
+    /* .alloc_tensor        = */ //ggml_backend_metal_alloc_tensor,
+    /* .set_tensor_async    = */ //ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async    = */ //ggml_backend_metal_get_tensor_async,
+    /* .synchronize         = */ //ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from     = */ //nullptr,
+    /* .cpy_tensor_to       = */ //nullptr,
+    /* .graph_plan_create   = */ //ggml_backend_metal_graph_plan_create,
+    /* .graph_plan_free     = */ //ggml_backend_metal_graph_plan_free,
+    /* .graph_plan_compute  = */ //ggml_backend_metal_graph_plan_compute,
+    /* .graph_compute       = */ //ggml_backend_metal_graph_compute
+};
+
+struct ggml_backend ggml_backend_metal_init(void) {
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+
+    struct ggml_backend metal_backend = {
+        /* .interface = */ &metal_backend_interface,
+        /* .context   = */ ctx
+    };
+    return metal_backend;
+}
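
Every member of metal_backend_interface is still commented out in this WIP commit. Purely as an illustration of the direction, two of the simpler entries might later be filled in along these lines; the ggml_backend_interface member signatures are assumptions here, since they are not shown anywhere in this diff:

// speculative sketch only -- none of these functions exist yet in this commit,
// and the interface signatures (first argument, return types) are assumed
static const char * ggml_backend_metal_name(struct ggml_backend * backend) {
    (void) backend;   // the Metal backend has a single fixed name
    return "Metal";
}

static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) {
    // forward to the existing Metal graph evaluation path in ggml-metal.m
    struct ggml_metal_context * ctx = (struct ggml_metal_context *) backend->context;
    ggml_metal_graph_compute(ctx, cgraph);
}
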
llama.cpp (43 lines changed)

@@ -233,6 +233,11 @@ struct llama_model {
     ggml_buffer buf_cuda;
     ggml_context * ctx_cuda = NULL;
 #endif
+#ifdef GGML_USE_METAL
+    ggml_backend backend_metal;
+    ggml_buffer buf_metal;
+    ggml_context * ctx_metal = NULL;
+#endif
 
     // backend assigned to each layer
     ggml_backend * backend_input = NULL;
@@ -249,6 +254,12 @@ struct llama_model {
             ggml_free(ctx_cuda);
             ggml_backend_free_buffer(&buf_cuda);
         }
+#endif
+#ifdef GGML_USE_METAL
+        if (ctx_metal) {
+            ggml_free(ctx_metal);
+            ggml_backend_free_buffer(&buf_metal);
+        }
 #endif
     }
 };
@@ -290,6 +301,9 @@ struct llama_context {
 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
+#ifdef GGML_USE_METAL
+    ggml_buffer buf_compute_metal = {};
+#endif
 
     // input tensors
     struct ggml_tensor * graph_tokens_in = nullptr;
@@ -940,6 +954,8 @@ static void llama_model_load_internal(
     const uint32_t n_layer = hparams.n_layer;
 
     model.backend_cpu = ggml_backend_cpu_init();
+
+    ggml_backend * backend_cpu = &model.backend_cpu;
     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
@@ -947,14 +963,21 @@ static void llama_model_load_internal(
         backend_gpu = &model.backend_cuda;
     }
 #endif
+#ifdef GGML_USE_METAL
+    if (n_gpu_layers > 0) {
+        model.backend_metal = ggml_backend_metal_init();
+        backend_gpu = &model.backend_metal;
+    }
+#endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
-    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : &model.backend_cpu;
-    model.backend_output = n_gpu_layers > 0 ? backend_gpu : &model.backend_cpu;
+    model.backend_input = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
+    model.backend_output = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
 
     model.backend_layers.resize(n_layer);
-    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, &model.backend_cpu);
+    std::fill(model.backend_layers.begin(), model.backend_layers.begin() + i_gpu_start, backend_cpu);
     std::fill(model.backend_layers.begin() + i_gpu_start, model.backend_layers.end(), backend_gpu);
 
     // calculate the size of each context
     std::unordered_map<struct ggml_backend *, size_t> ctx_sizes;
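
To make the split in the hunk above concrete, a small standalone sketch with illustrative numbers (32 layers, 10 offloaded): layers below i_gpu_start stay on the CPU backend, the rest go to the GPU backend, and the input tensors are offloaded only when every layer is.

#include <stdio.h>

int main(void) {
    // illustrative numbers only
    const int n_layer      = 32;
    const int n_gpu_layers = 10;

    // same as std::max(0, (int) n_layer - n_gpu_layers) in the hunk above
    const int i_gpu_start = (n_layer - n_gpu_layers) > 0 ? (n_layer - n_gpu_layers) : 0;

    printf("layers [0, %d) -> CPU backend, layers [%d, %d) -> GPU backend\n",
           i_gpu_start, i_gpu_start, n_layer);                        // [0, 22) CPU, [22, 32) GPU
    printf("input  -> %s\n", n_gpu_layers > n_layer ? "GPU" : "CPU"); // offloaded only if all layers fit
    printf("output -> %s\n", n_gpu_layers > 0       ? "GPU" : "CPU");
    return 0;
}
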
@@ -977,17 +1000,18 @@ static void llama_model_load_internal(
             ctx_sizes[model.backend_layers[layer]] += lt.size;
         }
     }
 
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[&model.backend_cpu];
-        ctx_sizes[&model.backend_cpu] = 0;
+        mmap_size = ctx_sizes[backend_cpu];
+        ctx_sizes[backend_cpu] = 0;
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == &model.backend_cpu && ml->use_mmap) {
+        if (it.first == backend_cpu && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
@@ -996,8 +1020,8 @@ static void llama_model_load_internal(
     // create the buffers and contexts
     {
         size_t cpu_num_tensors = ml->tensors_map.tensors.size();
-        size_t ctx_size = ctx_sizes[&model.backend_cpu];
-        model.buf_cpu = ggml_backend_alloc_buffer(&model.backend_cpu, ctx_size, cpu_num_tensors);
+        size_t ctx_size = ctx_sizes[backend_cpu];
+        model.buf_cpu = ggml_backend_alloc_buffer(backend_cpu, ctx_size, cpu_num_tensors);
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &model.buf_cpu;
         params.no_alloc = ml->use_mmap;
@@ -1028,6 +1052,7 @@ static void llama_model_load_internal(
     if (model.backend_input == backend_gpu) ctx_input = ctx_gpu;
     ggml_context * ctx_output = model.ctx_cpu;
     if (model.backend_output == backend_gpu) ctx_output = ctx_gpu;
 
     std::vector<ggml_context *> ctx_layers(n_layer, model.ctx_cpu);
     for (uint32_t i = 0; i < n_layer; ++i) {
         if (model.backend_layers[i] == backend_gpu) {