Fix passing param

parent 12112bfa48
commit 26df64ad04

4 changed files with 40 additions and 30 deletions
````diff
@@ -30,19 +30,21 @@
 
 Run main with base model and lora adapter to hot-swap
 ```bash
-./main ./models/open-llama/ggml-model-f16.gguf \
---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
 -ngl 0 \
 -n 128
 ```
 
-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
+With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors.
 
 # Logic
 
 
 # Current status
 
-- Only ony Lora adapter can be passed.
+- Only one Lora adapter can be passed.
 - Applying only adapter to Q, K, V matrices to keep the code contained (fintuning trained lora tensors for all linear layers)
 - GPU not supported
````
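Regarding the `ngl > 0` note in the hunk above: with GPU offload the base weights live in a backend (device) buffer while the freshly loaded LoRA tensors stay in host memory, so a matmul mixing the two fails. Below is a minimal sketch of the kind of fix the note points at, using the generic ggml backend API; the helper name `move_lora_to_backend` and the two-context setup are illustrative assumptions, not code from this branch.

```cpp
// Sketch only: place copies of the LoRA tensors in the same backend as the
// base weights and upload their data, so they can take part in matmuls with
// offloaded tensors. `move_lora_to_backend` is an illustrative helper.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static ggml_backend_buffer_t move_lora_to_backend(
        struct ggml_context * src_ctx,   // context holding the CPU-side LoRA tensors
        struct ggml_context * dst_ctx,   // no_alloc context for the device copies
        ggml_backend_t        backend) { // backend that owns the base weights
    // create device-side duplicates of every LoRA tensor
    for (struct ggml_tensor * t = ggml_get_first_tensor(src_ctx); t != NULL;
         t = ggml_get_next_tensor(src_ctx, t)) {
        struct ggml_tensor * copy = ggml_dup_tensor(dst_ctx, t);
        ggml_set_name(copy, t->name);
    }
    // back the duplicates with a buffer owned by the target backend
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(dst_ctx, backend);
    // upload the data from the CPU-side tensors
    for (struct ggml_tensor * t = ggml_get_first_tensor(src_ctx); t != NULL;
         t = ggml_get_next_tensor(src_ctx, t)) {
        struct ggml_tensor * copy = ggml_get_tensor(dst_ctx, t->name);
        ggml_backend_tensor_set(copy, t->data, 0, ggml_nbytes(t));
    }
    return buf;
}
```

The caller would pass the backend that owns the offloaded base weights and a `no_alloc` destination context sized for the adapter's tensor metadata.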
```diff
@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    const char* c_string = params.hot_lora.c_str();
+    strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
+    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
+
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
```
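A note on the three lines added above: `strncpy` does not write a terminator when the source string is at least `n` bytes long, which is why the explicit `'\0'` store matters for long adapter paths. A small self-contained illustration of the same pattern (not code from the commit):

```cpp
// Demonstrates why the added copy writes the terminator explicitly:
// strncpy(dst, src, n) leaves dst unterminated when strlen(src) >= n.
#include <cstdio>
#include <cstring>

int main() {
    char hot_lora[16];
    const char * path = "models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";

    strncpy(hot_lora, path, sizeof(hot_lora) - 1);
    hot_lora[sizeof(hot_lora) - 1] = '\0'; // same pattern as in the diff

    printf("truncated copy: '%s' (%zu bytes kept)\n", hot_lora, strlen(hot_lora));
    return 0;
}
```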
llama.cpp
```diff
@@ -145,7 +145,7 @@ struct lora_info {
     std::string filename;
     float scale;
 };
-// TODO lora_data should maybe sub lora_weights in llama.cpp
+// TODO lora_data should maybe sub lora_weights
 struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;
```
```diff
@@ -2502,7 +2502,7 @@ struct llama_context {
 
     llama_cparams cparams;
     bool lora_loaded = false;
-    std::map<std::string, lora_weights> lora_weights_map;
+    std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
     lora_data llora_data;
     float lora_scale = 1.0f;
 
```
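The `lora_weights` value type is not visible in this hunk. Purely as an illustration of what each entry of `lora_weights_map` (keyed by base tensor name) might hold, a typical LoRA pairing looks like the sketch below; the struct name and fields are assumptions, not the branch's definition.

```cpp
#include "ggml.h"

// Illustrative only: not the branch's actual lora_weights definition.
// A LoRA entry normally pairs the low-rank matrices that patch one base
// weight, e.g. the map key "blk.0.attn_q.weight".
struct lora_weights_sketch {
    struct ggml_tensor * loraA; // down-projection, rank x n_embd
    struct ggml_tensor * loraB; // up-projection,   n_out x rank
};
```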
```diff
@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max =*/ 1,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
+        /*.hot_lora =*/ "",
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
```
```diff
@@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model(
     /// LORA
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
-    lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
-    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
-    lora_params->lora.push_back(lora);
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < lora_params->lora.size(); ++i) {
-        struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
-        if (llora_data != NULL) {
-            loras.push_back(llora_data);
+    // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
+    lora.filename = params.hot_lora;
+    if (strlen(params.hot_lora) > 0) {
+
+        lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
+        lora_params->lora.push_back(lora);
+        // load all loras
+        std::vector<struct lora_data *> loras;
+        for (size_t i = 0; i < lora_params->lora.size(); ++i) {
+            struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
+            if (llora_data != NULL) {
+                loras.push_back(llora_data);
+            }
+        }
+        if (loras.size() == 0) {
+            fprintf(stderr, "warning: no lora adapters will be applied.\n");
+        }
+        // Assign data
+        ctx->llora_data = *loras[0];
+
+        // build the map?
+        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
+        std::vector<std::string> keys;
+        for (const auto& pair : ctx->lora_weights_map) {
+            keys.push_back(pair.first);
+        }
+    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-    // Assign data
-    ctx->llora_data = *loras[0];
-
-    // build the map?
-    ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
-    std::vector<std::string> keys;
-    for (const auto& pair : ctx->lora_weights_map) {
-        keys.push_back(pair.first);
-    }
 
-    /// END LORA
+    /// LORA
 
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;
 
```
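One observation on the new block: the `loras.size() == 0` branch only prints a warning, so if `load_lora` fails for the single adapter, `ctx->llora_data = *loras[0];` dereferences an empty vector. A small guarded sketch of that step (not part of the commit; `lora_data` is only forward-declared here to keep the snippet self-contained):

```cpp
#include <cstdio>
#include <vector>

struct lora_data; // the branch's struct; forward declaration for this sketch only

// Return the single loaded adapter, or nullptr so the caller can skip the
// ctx->llora_data / lora_weights_map setup instead of indexing loras[0] on an
// empty vector.
static lora_data * first_loaded_adapter(const std::vector<lora_data *> & loras) {
    if (loras.empty()) {
        fprintf(stderr, "warning: no lora adapters will be applied.\n");
        return nullptr;
    }
    return loras.front();
}
```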
llama.h
```diff
@@ -292,6 +292,7 @@ extern "C" {
         uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads; // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        char hot_lora[256]; // path to the hot lora file
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
 
```
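Putting the `llama.h` change together with the loading code above, a caller that does not go through `llama_context_params_from_gpt_params` would fill the new fixed-size field itself. A hedged end-to-end sketch: the model and adapter paths are the illustrative ones from the README snippet, and the surrounding llama.cpp API calls are the standard ones of this period.

```cpp
#include <cstring>
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // per the note above, GPU offload is not supported yet

    llama_model * model = llama_load_model_from_file(
        "./models/open-llama/ggml-model-f16.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    // hot_lora is a fixed char[256]; copy with truncation and keep the terminator,
    // mirroring what llama_context_params_from_gpt_params does in this commit.
    strncpy(cparams.hot_lora,
            "models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin",
            sizeof(cparams.hot_lora) - 1);
    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0';

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... generate as usual; the adapter named in hot_lora is loaded at context creation ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```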