speculative: add --n-gpu-layers-draft option (#3063)
This commit is contained in:
parent
b52b29ab9d
commit
84e723653c
3 changed files with 15 additions and 0 deletions
|
@@ -38,6 +38,7 @@ struct gpt_params {
|
|||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue