llama : refactor model loader with backend registry (#10026)

This commit is contained in:
Diego Devesa 2024-10-30 02:01:23 +01:00 committed by GitHub
parent 8f275a7c45
commit c5b0f4b5d9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1903 additions and 2019 deletions

View file

@ -205,7 +205,7 @@ extern "C" {
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
};
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@ -274,10 +274,7 @@ extern "C" {
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode:
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_MODE_LAYER: ignored
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()