speculative: add --n-gpu-layers-draft option (#3063)
This commit is contained in:
		
							parent
							
								
									b52b29ab9d
								
							
						
					
					
						commit
						84e723653c
					
				
					 3 changed files with 15 additions and 0 deletions
				
			
		|  | @ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { | |||
| #else | ||||
|             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); | ||||
|             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); | ||||
| #endif | ||||
|         } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { | ||||
|             if (++i >= argc) { | ||||
|                 invalid_param = true; | ||||
|                 break; | ||||
|             } | ||||
| #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD | ||||
|             params.n_gpu_layers_draft = std::stoi(argv[i]); | ||||
| #else | ||||
|             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); | ||||
|             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); | ||||
| #endif | ||||
|         } else if (arg == "--main-gpu" || arg == "-mg") { | ||||
|             if (++i >= argc) { | ||||
|  | @ -664,6 +675,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { | |||
| #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD | ||||
|     printf("  -ngl N, --n-gpu-layers N\n"); | ||||
|     printf("                        number of layers to store in VRAM\n"); | ||||
|     printf("  -ngld N, --n-gpu-layers-draft N\n"); | ||||
|     printf("                        number of layers to store in VRAM for the draft model\n"); | ||||
|     printf("  -ts SPLIT --tensor-split SPLIT\n"); | ||||
|     printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); | ||||
|     printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n"); | ||||
|  |  | |||
|  | @ -38,6 +38,7 @@ struct gpt_params { | |||
|     int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding
 | ||||
|     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
 | ||||
|     int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
 | ||||
|     int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
 | ||||
|     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
 | ||||
|     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
 | ||||
|     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
 | ||||
|  |  | |||
|  | @ -42,6 +42,7 @@ int main(int argc, char ** argv) { | |||
| 
 | ||||
|     // load the draft model
 | ||||
|     params.model = params.model_draft; | ||||
|     params.n_gpu_layers = params.n_gpu_layers_draft; | ||||
|     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); | ||||
| 
 | ||||
|     // tokenize the prompt
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue