build : on Mac OS enable Metal by default (#2901)
* build : on Mac OS enable Metal by default
* make : try to fix build on Linux
* make : move targets back to the top
* make : fix target clean
* llama : enable GPU inference by default with Metal
* llama : fix vocab_only logic when GPU is enabled
* common : better `n_gpu_layers` assignment
* readme : update Metal instructions
* make : fix merge conflict remnants
* gitignore : metal
This commit is contained in:
		
							parent
							
								
									bd33e5ab92
								
							
						
					
					
						commit
						e36ecdccc8
					
				
					 9 changed files with 143 additions and 133 deletions
				
			
		
							
								
								
									
										54
									
								
								llama.cpp
									
										
									
									
									
								
							
							
						
						
									
										54
									
								
								llama.cpp
									
										
									
									
									
								
							|  | @ -5340,7 +5340,7 @@ struct llama_context_params llama_context_default_params() { | |||
|         /*.seed                        =*/ LLAMA_DEFAULT_SEED, | ||||
|         /*.n_ctx                       =*/ 512, | ||||
|         /*.n_batch                     =*/ 512, | ||||
|         /*.gpu_layers                  =*/ 0, | ||||
|         /*.n_gpu_layers                =*/ 0, | ||||
|         /*.main_gpu                    =*/ 0, | ||||
|         /*.tensor_split                =*/ nullptr, | ||||
|         /*.rope_freq_base              =*/ 10000.0f, | ||||
|  | @ -5357,6 +5357,10 @@ struct llama_context_params llama_context_default_params() { | |||
|         /*.embedding                   =*/ false, | ||||
|     }; | ||||
| 
 | ||||
| #ifdef GGML_USE_METAL | ||||
|     result.n_gpu_layers = 1; | ||||
| #endif | ||||
| 
 | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
|  | @ -5549,43 +5553,43 @@ struct llama_context * llama_new_context_with_model( | |||
|             } | ||||
| #endif | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
| #ifdef GGML_USE_METAL | ||||
|     if (params.n_gpu_layers > 0) { | ||||
|         // this allocates all Metal resources and memory buffers
 | ||||
|         if (params.n_gpu_layers > 0) { | ||||
|             // this allocates all Metal resources and memory buffers
 | ||||
| 
 | ||||
|         void * data_ptr  = NULL; | ||||
|         size_t data_size = 0; | ||||
|             void * data_ptr  = NULL; | ||||
|             size_t data_size = 0; | ||||
| 
 | ||||
|         if (params.use_mmap) { | ||||
|             data_ptr  = ctx->model.mapping->addr; | ||||
|             data_size = ctx->model.mapping->size; | ||||
|         } else { | ||||
|             data_ptr  = ggml_get_mem_buffer(ctx->model.ctx); | ||||
|             data_size = ggml_get_mem_size  (ctx->model.ctx); | ||||
|         } | ||||
|             if (params.use_mmap) { | ||||
|                 data_ptr  = ctx->model.mapping->addr; | ||||
|                 data_size = ctx->model.mapping->size; | ||||
|             } else { | ||||
|                 data_ptr  = ggml_get_mem_buffer(ctx->model.ctx); | ||||
|                 data_size = ggml_get_mem_size  (ctx->model.ctx); | ||||
|             } | ||||
| 
 | ||||
|         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); | ||||
|             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); | ||||
| 
 | ||||
|         LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); | ||||
|             LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); | ||||
| 
 | ||||
| #define LLAMA_METAL_CHECK_BUF(result)                            \ | ||||
|     if (!(result)) {                                             \ | ||||
|         LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ | ||||
|         llama_free(ctx);                                         \ | ||||
|         return NULL;                                             \ | ||||
|     } | ||||
|             if (!(result)) {                                             \ | ||||
|                 LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ | ||||
|                 llama_free(ctx);                                         \ | ||||
|                 return NULL;                                             \ | ||||
|             } | ||||
| 
 | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); | ||||
| 
 | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); | ||||
| 
 | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); | ||||
| #undef LLAMA_METAL_CHECK_BUF | ||||
|     } | ||||
|         } | ||||
| #endif | ||||
|     } | ||||
| 
 | ||||
| #ifdef GGML_USE_MPI | ||||
|     ctx->ctx_mpi = ggml_mpi_init(); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue