Support using mmap when applying LoRA (#2095)
* Support using mmap when applying LoRA * Fix Linux * Update comment to reflect the support lora with mmap
This commit is contained in:
		
							parent
							
								
									bbef28218f
								
							
						
					
					
						commit
						2347463201
					
				
					 5 changed files with 7 additions and 9 deletions
				
			
		|  | @ -267,7 +267,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { | ||||||
|                 break; |                 break; | ||||||
|             } |             } | ||||||
|             params.lora_adapter = argv[i]; |             params.lora_adapter = argv[i]; | ||||||
|             params.use_mmap = false; |  | ||||||
|         } else if (arg == "--lora-base") { |         } else if (arg == "--lora-base") { | ||||||
|             if (++i >= argc) { |             if (++i >= argc) { | ||||||
|                 invalid_param = true; |                 invalid_param = true; | ||||||
|  | @ -499,7 +498,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { | ||||||
|     fprintf(stderr, "  --mtest               compute maximum memory usage\n"); |     fprintf(stderr, "  --mtest               compute maximum memory usage\n"); | ||||||
|     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n"); |     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n"); | ||||||
|     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n"); |     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n"); | ||||||
|     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n"); |     fprintf(stderr, "  --lora FNAME          apply LoRA adapter\n"); | ||||||
|     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n"); |     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n"); | ||||||
|     fprintf(stderr, "  -m FNAME, --model FNAME\n"); |     fprintf(stderr, "  -m FNAME, --model FNAME\n"); | ||||||
|     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str()); |     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str()); | ||||||
|  |  | ||||||
|  | @ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa | ||||||
| -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. | -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. | ||||||
| -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. | -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. | ||||||
| -   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. | -   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. | ||||||
| -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. | -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains. | ||||||
| -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. | -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. | ||||||
|  |  | ||||||
|  | @ -16,7 +16,7 @@ Command line options: | ||||||
| -   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. | -   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. | ||||||
| -   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. | -   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. | ||||||
| -   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. | -   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. | ||||||
| -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. | -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains. | ||||||
| -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. | -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. | ||||||
| -   `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. | -   `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. | ||||||
| -   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. | -   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. | ||||||
|  |  | ||||||
|  | @ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params, | ||||||
|     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str()); |     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str()); | ||||||
|     fprintf(stderr, "  -a ALIAS, --alias ALIAS\n"); |     fprintf(stderr, "  -a ALIAS, --alias ALIAS\n"); | ||||||
|     fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n"); |     fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n"); | ||||||
|     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n"); |     fprintf(stderr, "  --lora FNAME          apply LoRA adapter\n"); | ||||||
|     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n"); |     fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n"); | ||||||
|     fprintf(stderr, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str()); |     fprintf(stderr, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str()); | ||||||
|     fprintf(stderr, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port); |     fprintf(stderr, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port); | ||||||
|  | @ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, | ||||||
|                 break; |                 break; | ||||||
|             } |             } | ||||||
|             params.lora_adapter = argv[i]; |             params.lora_adapter = argv[i]; | ||||||
|             params.use_mmap = false; |  | ||||||
|         } |         } | ||||||
|         else if (arg == "--lora-base") |         else if (arg == "--lora-base") | ||||||
|         { |         { | ||||||
|  |  | ||||||
|  | @ -175,13 +175,13 @@ struct llama_mmap { | ||||||
|     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { |     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { | ||||||
|         size = file->size; |         size = file->size; | ||||||
|         int fd = fileno(file->fp); |         int fd = fileno(file->fp); | ||||||
|         int flags = MAP_SHARED; |         int flags = MAP_PRIVATE; | ||||||
|         // prefetch/readahead impairs performance on NUMA systems
 |         // prefetch/readahead impairs performance on NUMA systems
 | ||||||
|         if (numa) { prefetch = 0; } |         if (numa) { prefetch = 0; } | ||||||
| #ifdef __linux__ | #ifdef __linux__ | ||||||
|         if (prefetch) { flags |= MAP_POPULATE; } |         if (prefetch) { flags |= MAP_POPULATE; } | ||||||
| #endif | #endif | ||||||
|         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); |         addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0); | ||||||
|         if (addr == MAP_FAILED) { |         if (addr == MAP_FAILED) { | ||||||
|             throw std::runtime_error(format("mmap failed: %s", strerror(errno))); |             throw std::runtime_error(format("mmap failed: %s", strerror(errno))); | ||||||
|         } |         } | ||||||
|  | @ -223,7 +223,7 @@ struct llama_mmap { | ||||||
|             throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); |             throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); |         addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0); | ||||||
|         error = GetLastError(); |         error = GetLastError(); | ||||||
|         CloseHandle(hMapping); |         CloseHandle(hMapping); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue