ggml : add is_ram_shared to ggml_backend
Metal can share RAM with the CPU, so it can use mmap without a temporary staging buffer.
parent 90503f150d
commit 652c849643
4 changed files with 68 additions and 21 deletions
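In short: every backend now advertises whether it reads host RAM directly, and the loader branches on that flag instead of on "is this the CPU backend". A minimal sketch of the loading decision this enables (hypothetical names throughout; only is_ram_shared comes from this commit):

// Sketch only, not code from this commit; identifiers other than is_ram_shared are stand-ins.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct backend_t { bool is_ram_shared; };
struct tensor_t  { backend_t * backend; void * data; size_t nbytes; };

// src points into a read-only mmap of the model file.
void load_tensor(tensor_t & t, const uint8_t * src, std::vector<uint8_t> & staging) {
    if (t.backend->is_ram_shared) {
        // CPU, or Metal on unified memory: the backend reads host RAM directly,
        // so the tensor can point straight into the mapping and no copy is needed.
        t.data = const_cast<uint8_t *>(src);
    } else {
        // Discrete-memory backend (e.g. CUDA): stage the bytes in host RAM first,
        // then upload them with the backend's own copy routine.
        staging.resize(t.nbytes);
        std::memcpy(staging.data(), src, t.nbytes);
        // upload_to_device(t, staging.data(), t.nbytes);  // hypothetical upload call
    }
}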
@@ -256,7 +256,8 @@ struct ggml_backend ggml_backend_cpu_init(void) {
     struct ggml_backend cpu_backend = {
         /* .interface     = */ &cpu_backend_interface,
-        /* .context       = */ ctx
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ true,
     };
     return cpu_backend;
 }
@@ -61,7 +61,10 @@ extern "C" {
     struct ggml_backend {
         struct ggml_backend_interface * interface;

        ggml_backend_context_t context;

+        bool is_ram_shared;
     };

     // backend helper functions
@@ -78,7 +81,16 @@ extern "C" {
     static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); }

     // buffer and tensor allocation
-    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); // GG: probably return ptr
+    // TODO:
+    // - return "struct ggml_buffer *"
+    // - fix namings:
+    //   - ggml_backend_alloc_buffer -> ggml_backend_buffer_alloc
+    //   - ggml_backend_free_buffer  -> ggml_backend_buffer_free
+    //   - ggml_backend_reset_buffer -> ggml_backend_buffer_reset
+    //   - ggml_backend_alloc_tensor -> ggml_backend_tensor_alloc
+    //   - ggml_backend_tensor_cpy   -> ggml_backend_tensor_copy
+    //
+    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors);
     GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer);
     static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); }
     static inline void ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); }
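For reference, a usage sketch of the allocation API declared in this hunk (assuming these declarations live in ggml-backend.h; the buffer size and the tensor pointer are illustrative, not taken from this commit):

// Usage sketch only: exercises the declarations above, nothing more.
#include "ggml-backend.h"   // assumed header for the declarations in this hunk

void example_alloc(struct ggml_backend * backend, struct ggml_tensor * t, size_t nbytes) {
    // one buffer with room for a single tensor
    struct ggml_buffer buf = ggml_backend_alloc_buffer(backend, nbytes, /*max_tensors=*/1);

    ggml_backend_alloc_tensor(&buf, t);   // place the tensor's data inside the buffer
    // ... build a graph that uses the tensor, compute, etc. ...

    ggml_backend_reset_buffer(&buf);      // drop the allocations so the buffer can be reused
    ggml_backend_free_buffer(&buf);
}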
@@ -1834,8 +1834,9 @@ ggml_backend ggml_backend_cuda_init(void) {
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context;

     ggml_backend cuda_backend = {
         /* .interface     = */ &cuda_backend_interface,
-        /* .context       = */ ctx
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ false,
     };
     return cuda_backend;
 }
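The CPU backend reports is_ram_shared = true and the CUDA backend false. Per the commit message, a Metal backend on Apple's unified memory would presumably report true as well; a hypothetical, self-contained sketch (the Metal backend is not touched in this diff):

// Hypothetical sketch: stand-in declarations so the example compiles on its own.
struct ggml_backend_interface;            // opaque, as in the header hunk above
typedef void * ggml_backend_context_t;    // stand-in for the real typedef

struct ggml_backend {
    struct ggml_backend_interface * interface;
    ggml_backend_context_t          context;
    bool                            is_ram_shared;
};

// A Metal backend sharing RAM with the CPU would advertise the flag as true,
// so the loader can point tensors straight into the mmap'ed model file.
static struct ggml_backend metal_backend_init_sketch(struct ggml_backend_interface * iface, void * ctx) {
    struct ggml_backend metal_backend = {
        /* .interface     = */ iface,
        /* .context       = */ ctx,
        /* .is_ram_shared = */ true,
    };
    return metal_backend;
}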
llama.cpp (57 changed lines)
@@ -225,6 +225,7 @@ struct llama_model {
     llama_vocab vocab;

     // backends
+    // TODO: change to pointers
     ggml_backend backend_cpu;
     ggml_buffer buf_cpu;
     ggml_context * ctx_cpu = NULL;
@@ -298,6 +299,7 @@ struct llama_context {

     // memory buffers used to evaluate the model
     ggml_buffer buf_compute_cpu = {};

 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
@@ -634,11 +636,11 @@ struct llama_model_loader {
        }
        LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already

-        bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu;
+        const bool is_ram_shared = lt.ggml_tensor->backend->is_ram_shared;

        // select buffer to load data into
        if (!use_mmap) {
-            if (is_cpu) {
+            if (is_ram_shared) {
                lt.data = (uint8_t *) lt.ggml_tensor->data;
            } else {
                // read to temporary buffer
@@ -649,7 +651,7 @@ struct llama_model_loader {

        load_data_for(lt);

-        if (is_cpu) {
+        if (is_ram_shared) {
            if (use_mmap) {
                lt.ggml_tensor->data = lt.data;
                // TODO: this assumes that the data to lock is contiguous, which may not always be the case
@@ -671,7 +673,7 @@ struct llama_model_loader {
        }
    }

-    void load_data_for(llama_load_tensor & lt) {
+    void load_data_for(llama_load_tensor & lt) const {
        if (use_mmap) {
            lt.data = (uint8_t *) mapping->addr + lt.file_off;
        } else {
@@ -957,6 +959,7 @@ static void llama_model_load_internal(

    ggml_backend * backend_cpu = &model.backend_cpu;
    ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection

 #ifdef GGML_USE_CUDA
    if (n_gpu_layers > 0) {
        model.backend_cuda = ggml_backend_cuda_init();
@@ -965,13 +968,14 @@ static void llama_model_load_internal(
 #endif
 #ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_cpu_init();
+        model.backend_metal = ggml_backend_metal_init();
        backend_gpu = &model.backend_metal;
    }
 #endif

    // assign splits to the backends
    const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);

    model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
    model.backend_out = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
@@ -1011,7 +1015,7 @@ static void llama_model_load_internal(
    fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
    for (const auto & it : ctx_sizes) {
        fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == backend_cpu && ml->use_mmap) {
+        if (it.first->is_ram_shared && ml->use_mmap) {
            fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
        }
        fprintf(stderr, "\n");
@@ -1135,12 +1139,10 @@ static void llama_model_load_internal(
        ctx_sum += it.second;
    }

-    const size_t mem_required =
-        ctx_sum + MEM_REQ_EVAL().at(model.type);
+    const size_t mem_required = ctx_sum + MEM_REQ_EVAL().at(model.type);

    // this is the memory required by one llama_state
-    const size_t mem_required_state =
-        scale*MEM_REQ_KV_SELF().at(model.type);
+    const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type);

    fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
            mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1162,6 +1164,7 @@ static void llama_model_load_internal(
    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = ggml_time_us() - model.t_start_us;

 }

 static bool llama_model_load(
@@ -1226,6 +1229,7 @@ static ggml_graph_splits llama_build_graph(
    // initialize contexts for every backend

    struct ggml_context * ctx_cpu = nullptr;

    if (lctx.buf_compute_cpu.mem_size > 0) {
        struct ggml_init_params params = ggml_init_params_default();
        params.buffer = &lctx.buf_compute_cpu;
@@ -1235,6 +1239,7 @@ static ggml_graph_splits llama_build_graph(

 #ifdef GGML_USE_CUDA
    struct ggml_context * ctx_cuda = nullptr;

    if (lctx.buf_compute_cuda.mem_size > 0) {
        struct ggml_init_params params = ggml_init_params_default();
        params.buffer = &lctx.buf_compute_cuda;
@@ -1243,30 +1248,54 @@ static ggml_graph_splits llama_build_graph(
    }
 #endif

+#ifdef GGML_USE_METAL
+    struct ggml_context * ctx_metal = nullptr;
+
+    if (lctx.buf_compute_metal.mem_size > 0) {
+        struct ggml_init_params params = ggml_init_params_default();
+        params.buffer = &lctx.buf_compute_metal;
+        params.compute_type = compute_type;
+        ctx_metal = ggml_init(params);
+    }
+#endif
+
    // TODO: clean this
    struct ggml_context * ctx_i = nullptr;
-    struct ggml_context * ctx_ls[80] = {nullptr};
    struct ggml_context * ctx_o = nullptr;
    struct ggml_context * ctx_kv = nullptr;
+    struct ggml_context * ctx_ls[80] = {nullptr};

    if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
    if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu;

 #ifdef GGML_USE_CUDA
    if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
    if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
 #endif
+#ifdef GGML_USE_METAL
+    if (lctx.model.backend_inp == &lctx.model.backend_metal) ctx_i = ctx_metal;
+    if (lctx.model.backend_out == &lctx.model.backend_metal) ctx_o = ctx_metal;
+#endif

    for (int il = 0; il < n_layer; il++) {
        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;

 #ifdef GGML_USE_CUDA
        if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda;
+#endif
+#ifdef GGML_USE_METAL
+        if (lctx.model.backend_layers[il] == &lctx.model.backend_metal) ctx_ls[il] = ctx_metal;
 #endif
    }

    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;

 #ifdef GGML_USE_CUDA
    if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda;
 #endif
+#ifdef GGML_USE_METAL
+    if (lctx.backend_kv == &lctx.model.backend_metal) ctx_kv = ctx_metal;
+#endif

    struct ggml_tensor * inpL;
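The hunk keeps the existing "// TODO: clean this" pattern of repeating an #ifdef chain for every assignment. One possible cleanup, sketched here only and not part of this commit, is a small backend-to-context lookup built once:

// Sketch only: a backend-to-context table that would replace the repeated
// #ifdef chains above. Not part of this commit.
#include <utility>
#include <vector>

struct ggml_backend;   // opaque, as in ggml-backend.h
struct ggml_context;   // opaque, as in ggml.h

struct backend_ctx_map {
    std::vector<std::pair<const ggml_backend *, ggml_context *>> entries;

    void add(const ggml_backend * be, ggml_context * ctx) {
        if (ctx != nullptr) {
            entries.emplace_back(be, ctx);   // register only the contexts that exist in this build
        }
    }

    ggml_context * find(const ggml_backend * be) const {
        for (const auto & e : entries) {
            if (e.first == be) {
                return e.second;
            }
        }
        return nullptr;
    }
};

// usage, with the names from the diff:
//   backend_ctx_map m;
//   m.add(&lctx.model.backend_cpu, ctx_cpu);
//   #ifdef GGML_USE_CUDA
//   m.add(&lctx.model.backend_cuda, ctx_cuda);
//   #endif
//   #ifdef GGML_USE_METAL
//   m.add(&lctx.model.backend_metal, ctx_metal);
//   #endif
//   ctx_i  = m.find(lctx.model.backend_inp);
//   ctx_o  = m.find(lctx.model.backend_out);
//   ctx_kv = m.find(lctx.backend_kv);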
@@ -1547,6 +1576,11 @@ static ggml_graph_splits llama_build_graph(
        ggml_free(ctx_cuda);
    }
 #endif
+#ifdef GGML_USE_METAL
+    if (ctx_metal != nullptr) {
+        ggml_free(ctx_metal);
+    }
+#endif

    return splits;
 }
@@ -2651,7 +2685,6 @@ struct llama_context * llama_new_context_with_model(
    ctx->rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

-
    // TODO: choose backend depending on n_layers/low_vram
 #ifdef GGML_USE_CUDA
    if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2) {