commit
						4fe0861a89
					
				
					 2 changed files with 37 additions and 49 deletions
				
			
		|  | @ -1875,7 +1875,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor | ||||||
| 
 | 
 | ||||||
|     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) |     bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) | ||||||
|         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 |         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 | ||||||
|         && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; |         && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2 | ||||||
|  |         && src1->ne[1] == 1; | ||||||
|     bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type) |     bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type) | ||||||
|         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 |         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 | ||||||
|         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; |         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; | ||||||
|  |  | ||||||
|  | @ -2831,7 +2831,7 @@ struct llama_context { | ||||||
| struct llama_lora_weight { | struct llama_lora_weight { | ||||||
|     struct ggml_tensor * a = nullptr; |     struct ggml_tensor * a = nullptr; | ||||||
|     struct ggml_tensor * b = nullptr; |     struct ggml_tensor * b = nullptr; | ||||||
|     llama_lora_weight() {} |     llama_lora_weight() = default; | ||||||
|     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} |     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | @ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { | static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { | ||||||
|     static const int n_inp_tensors = 5; // see llama_model
 |  | ||||||
|     static const int n_out_tensors = 5; // see llama_model
 |  | ||||||
|     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); |     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); | ||||||
| 
 | 
 | ||||||
|     ggml_context * ctx = nullptr; |     ggml_context * ctx = nullptr; | ||||||
|     struct gguf_init_params meta_gguf_params = { |     struct gguf_init_params meta_gguf_params = { | ||||||
|         /* .no_alloc = */ false, |         /* .no_alloc = */ true, | ||||||
|         /* .ctx      = */ &ctx, |         /* .ctx      = */ &ctx, | ||||||
|     }; |     }; | ||||||
|     struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); |     struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); | ||||||
|  | @ -18536,7 +18534,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c | ||||||
|     // check metadata
 |     // check metadata
 | ||||||
|     { |     { | ||||||
|         auto get_kv_str = [&](std::string key) -> std::string { |         auto get_kv_str = [&](std::string key) -> std::string { | ||||||
|             std::vector<char> str_buf(32, 0); // we only get the arch, so no need big buffer here
 |  | ||||||
|             int id = gguf_find_key(ctx_gguf, key.c_str()); |             int id = gguf_find_key(ctx_gguf, key.c_str()); | ||||||
|             return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); |             return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); | ||||||
|         }; |         }; | ||||||
|  | @ -18544,50 +18541,36 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c | ||||||
|         auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); |         auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); | ||||||
|         auto lora_arch = llm_arch_from_string(lora_arch_name); |         auto lora_arch = llm_arch_from_string(lora_arch_name); | ||||||
|         if (lora_arch != model->arch) { |         if (lora_arch != model->arch) { | ||||||
|  |             gguf_free(ctx_gguf); | ||||||
|             throw std::runtime_error("model arch and LoRA arch mismatch"); |             throw std::runtime_error("model arch and LoRA arch mismatch"); | ||||||
|         } |         } | ||||||
|  | 
 | ||||||
|         auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); |         auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE)); | ||||||
|         if (train_type != "finetune_lora") { |         if (train_type != "finetune_lora") { | ||||||
|  |             gguf_free(ctx_gguf); | ||||||
|             throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); |             throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     // calculate n_tensors_per_layer
 |     int n_tensors = gguf_get_n_tensors(ctx_gguf); | ||||||
|     int n_tensors_per_layer = 0; |  | ||||||
|     { |  | ||||||
|         int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); |  | ||||||
|         for (int i = 0; i < n_tensors; i++) { |  | ||||||
|             int il = -1; |  | ||||||
|             sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); |  | ||||||
|             if (il == 0) n_tensors_per_layer++; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 | 
 | ||||||
|     // count layer buffer types
 |     // contexts for each buffer type
 | ||||||
|     std::map<ggml_backend_buffer_type_t, int> buft_tensor_count; |  | ||||||
|     for (int64_t i = 0; i < model->hparams.n_layer; i++) { |  | ||||||
|         buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; |  | ||||||
|     } |  | ||||||
|     buft_tensor_count[model->buft_input.buft]  += n_inp_tensors; |  | ||||||
|     buft_tensor_count[model->buft_output.buft] += n_out_tensors; |  | ||||||
| 
 |  | ||||||
|     // allocate contexts
 |  | ||||||
|     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; |     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; | ||||||
|     { |     auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { | ||||||
|         auto new_ggml_ctx = [](size_t n_tensors) { |         auto it = ctx_map.find(buft); | ||||||
|  |         if (it == ctx_map.end()) { | ||||||
|  |             // add a new context
 | ||||||
|             struct ggml_init_params params = { |             struct ggml_init_params params = { | ||||||
|                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(), |                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(), | ||||||
|                 /*.mem_buffer =*/ NULL, |                 /*.mem_buffer =*/ NULL, | ||||||
|                 /*.no_alloc   =*/ true, |                 /*.no_alloc   =*/ true, | ||||||
|             }; |             }; | ||||||
|             return ggml_init(params); |             ggml_context * buft_ctx = ggml_init(params); | ||||||
|  |             ctx_map[buft] = buft_ctx; | ||||||
|  |             return buft_ctx; | ||||||
|  |         }; | ||||||
|  |         return it->second; | ||||||
|     }; |     }; | ||||||
|         for (auto & it : buft_tensor_count) { |  | ||||||
|             int n_tensors = it.second; |  | ||||||
|             // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second);
 |  | ||||||
|             ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors
 |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 | 
 | ||||||
|     // bundle lora_a and lora_b into pairs
 |     // bundle lora_a and lora_b into pairs
 | ||||||
|     std::map<std::string, llama_lora_weight> ab_map; |     std::map<std::string, llama_lora_weight> ab_map; | ||||||
|  | @ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c | ||||||
|                 ab_map[name].b = cur; |                 ab_map[name].b = cur; | ||||||
|             } |             } | ||||||
|         } else { |         } else { | ||||||
|             // maybe "optimizer.*"" tensors
 |             gguf_free(ctx_gguf); | ||||||
|             LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); |             ggml_free(ctx); | ||||||
|  |             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     // add tensors
 |     // add tensors
 | ||||||
|     for (auto & it : ab_map) { |     for (auto & it : ab_map) { | ||||||
|         std::string name = it.first; |         const std::string & name = it.first; | ||||||
|         const char * cname = name.c_str(); |  | ||||||
|         llama_lora_weight & w = it.second; |         llama_lora_weight & w = it.second; | ||||||
|         GGML_ASSERT(w.a != nullptr); | 
 | ||||||
|         GGML_ASSERT(w.b != nullptr); |         if (!w.a || !w.b) { | ||||||
|         int il = -1; |             gguf_free(ctx_gguf); | ||||||
|         sscanf(cname, "blk.%d.", &il); |             ggml_free(ctx); | ||||||
|  |             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|         // device buft and device ctx
 |         // device buft and device ctx
 | ||||||
|         auto model_tensor = llama_get_model_tensor(model, cname); |         auto * model_tensor = llama_get_model_tensor(model, name.c_str()); | ||||||
|         if (!model_tensor) { |         if (!model_tensor) { | ||||||
|             gguf_free(ctx_gguf); |             gguf_free(ctx_gguf); | ||||||
|             ggml_free(ctx); |             ggml_free(ctx); | ||||||
|             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); |             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); | ||||||
|         } |         } | ||||||
|         struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); |         struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); | ||||||
|         // validate tensor shape
 |         // validate tensor shape
 | ||||||
|         if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { |         if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { | ||||||
|  |             gguf_free(ctx_gguf); | ||||||
|  |             ggml_free(ctx); | ||||||
|             throw std::runtime_error("tensor '" + name + "' has incorrect shape"); |             throw std::runtime_error("tensor '" + name + "' has incorrect shape"); | ||||||
|         } |         } | ||||||
|         if (w.a->ne[1] != w.b->ne[0]) { |         if (w.a->ne[1] != w.b->ne[0]) { | ||||||
|  |             gguf_free(ctx_gguf); | ||||||
|  |             ggml_free(ctx); | ||||||
|             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); |             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); | ||||||
|         } |         } | ||||||
|         // save tensor to adapter
 |         // save tensor to adapter
 | ||||||
|  | @ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c | ||||||
|                 ggml_free(ctx); |                 ggml_free(ctx); | ||||||
|                 throw std::runtime_error("failed to allocate buffer for lora adapter\n"); |                 throw std::runtime_error("failed to allocate buffer for lora adapter\n"); | ||||||
|             } |             } | ||||||
|             ggml_backend_buffer_clear(buf, 0); |             LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); | ||||||
|             adapter.ctxs.push_back(ctx_dev); |             adapter.ctxs.push_back(ctx_dev); | ||||||
|             adapter.bufs.push_back(buf); |             adapter.bufs.push_back(buf); | ||||||
|         } |         } | ||||||
|  | @ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c | ||||||
|         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { |         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { | ||||||
|             size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); |             size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); | ||||||
|             size_t size = ggml_nbytes(orig); |             size_t size = ggml_nbytes(orig); | ||||||
|             if (read_buf.size() < size) { |  | ||||||
|             read_buf.resize(size); |             read_buf.resize(size); | ||||||
|             } |  | ||||||
|             gguf_file.seek(offs, SEEK_SET); |             gguf_file.seek(offs, SEEK_SET); | ||||||
|             gguf_file.read_raw(read_buf.data(), size); |             gguf_file.read_raw(read_buf.data(), size); | ||||||
|             // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
 |  | ||||||
|             ggml_backend_tensor_set(dev, read_buf.data(), 0, size); |             ggml_backend_tensor_set(dev, read_buf.data(), 0, size); | ||||||
|         }; |         }; | ||||||
|         for (auto & it : adapter.ab_map) { |         for (auto & it : adapter.ab_map) { | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue