PR #4766
This commit is contained in:
parent
3773e1afe7
commit
1eb8804c18
18 changed files with 2183 additions and 2081 deletions
17
llama.h
17
llama.h
|
@ -116,6 +116,12 @@ extern "C" {
|
|||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
||||
};
|
||||
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
|
@ -178,8 +184,15 @@ extern "C" {
|
|||
|
||||
struct llama_model_params {
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
||||
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
|
||||
// main_gpu interpretation depends on split_mode:
|
||||
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model
|
||||
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
|
||||
// LLAMA_SPLIT_LAYER: ignored
|
||||
int32_t main_gpu;
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
||||
const float * tensor_split;
|
||||
|
||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
// If the provided progress_callback returns true, model loading continues.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue