Fix passing param

parent 12112bfa48
commit 26df64ad04

4 changed files with 40 additions and 30 deletions

@@ -30,19 +30,21 @@
 Run main with base model and lora adapter to hot-swap
 ```bash
-./main ./models/open-llama/ggml-model-f16.gguf \
---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
 -ngl 0 \
 -n 128
 ```

-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
+With `ngl > 0` the code breaks, probably because the LoRA tensors try to interact with the base tensors (as in `lora_mul_mat`) but are not moved to the GPU buffer of the base tensors.

 # Logic




 # Current status

-- Only ony Lora adapter can be passed.
+- Only one LoRA adapter can be passed.
+- The adapter is applied only to the Q, K, V matrices to keep the code contained (finetuning produced LoRA tensors for all linear layers).
 - GPU not supported
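On the `ngl > 0` breakage described above: for the LoRA matrices to participate in `lora_mul_mat` on the GPU, they presumably need to live in the same backend buffer type as the base weights. Below is a minimal sketch of that idea using the public ggml allocation helpers; the helper name, the rank argument, and the tensor shapes are assumptions for illustration, not code from this branch.

```cpp
// Sketch only: create the LoRA tensors in a metadata-only (no_alloc) ggml context,
// then allocate them in the same buffer type that already holds the base weight,
// so a GPU mul_mat does not mix host and device tensors.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static ggml_backend_buffer_t place_lora_with_base(const struct ggml_tensor * base_w,
                                                  int64_t n_rank,
                                                  struct ggml_context ** out_ctx) {
    // Metadata-only context: tensor headers live here, data is allocated below.
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // Low-rank factors; exact shapes/orientation depend on how lora_mul_mat combines them.
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, base_w->ne[0], n_rank);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_rank, base_w->ne[1]);

    // Allocate every tensor of ctx in the buffer type of the base weight (CPU, CUDA,
    // Metal, ...); data can then be uploaded with ggml_backend_tensor_set.
    ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(base_w->buffer);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

    *out_ctx = ctx;
    return buf;
}
```
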
@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    const char* c_string = params.hot_lora.c_str();
+    strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
+    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
+
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;

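The three added lines above copy the `std::string` from `gpt_params` into the fixed-size `char hot_lora[256]` field of `llama_context_params`. A standalone sketch of the same pattern, with a stand-in struct instead of the real one, since `strncpy` does not null-terminate when it truncates:

```cpp
#include <cstring>
#include <string>

// Stand-in for the real llama_context_params; only the field used here.
struct params_stub {
    char hot_lora[256];
};

// Copy `path` into the fixed-size field, truncating if necessary and always
// leaving a terminating '\0' (strncpy alone does not guarantee one).
static void set_hot_lora(params_stub & p, const std::string & path) {
    std::strncpy(p.hot_lora, path.c_str(), sizeof(p.hot_lora) - 1);
    p.hot_lora[sizeof(p.hot_lora) - 1] = '\0';
}
```
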
llama.cpp (17)

@@ -145,7 +145,7 @@ struct lora_info {
     std::string filename;
     float scale;
 };
-// TODO lora_data should maybe sub lora_weights in llama.cpp
+// TODO lora_data should maybe sub lora_weights
 struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;

@@ -2502,7 +2502,7 @@ struct llama_context {

     llama_cparams cparams;
     bool lora_loaded = false;
-    std::map<std::string, lora_weights> lora_weights_map;
+    std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
     lora_data llora_data;
     float lora_scale = 1.0f;

@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max =*/ 1,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
+        /*.hot_lora =*/ "",
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,

@@ -16321,8 +16322,11 @@ struct llama_context * llama_new_context_with_model(
     /// LORA
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
-    lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
-    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
+    // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
+    lora.filename = params.hot_lora;
+    if (strlen(params.hot_lora) > 0) {
+
+    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
     lora_params->lora.push_back(lora);
     // load all loras
     std::vector<struct lora_data *> loras;

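The trailing question in the `lora.scale` comment and the "only one LoRA adapter" limitation suggest the single `lora_info` is a placeholder for a list. One possible direction, sketched with a hypothetical comma-separated `--hot-lora` value and a stand-in struct (neither is part of this commit); each parsed entry would then be pushed into `lora_params->lora`:

```cpp
#include <sstream>
#include <string>
#include <vector>

// Stand-in mirroring the fields of lora_info shown in the diff above.
struct lora_info_stub {
    std::string filename;
    float scale;
};

// Split a hypothetical "adapter1.bin,adapter2.bin" list into one entry per adapter,
// defaulting every scale to 1.0f; a per-adapter scale syntax could be parsed here too.
static std::vector<lora_info_stub> parse_hot_loras(const std::string & arg) {
    std::vector<lora_info_stub> out;
    std::stringstream ss(arg);
    std::string path;
    while (std::getline(ss, path, ',')) {
        if (!path.empty()) {
            out.push_back({path, 1.0f});
        }
    }
    return out;
}
```
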
@@ -16344,10 +16348,9 @@ struct llama_context * llama_new_context_with_model(
     for (const auto& pair : ctx->lora_weights_map) {
         keys.push_back(pair.first);
     }
+    }

+    /// LORA
-
-    /// END LORA
-
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;

llama.h (1)

@ -292,6 +292,7 @@ extern "C" {
|
||||||
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
||||||
uint32_t n_threads; // number of threads to use for generation
|
uint32_t n_threads; // number of threads to use for generation
|
||||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||||
|
char hot_lora[256]; // path to the hot lora file
|
||||||
|
|
||||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||||
|
|
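For reference, a minimal sketch of how a caller might use the new `hot_lora` field through the C API, assuming the function names available on this branch (`llama_backend_init`, `llama_load_model_from_file`, `llama_new_context_with_model`); the paths are the ones from the example at the top, and error handling is omitted:

```cpp
#include <cstring>
#include "llama.h"

int main() {
    llama_backend_init();

    // Load the base model as usual.
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(
        "./models/open-llama/ggml-model-f16.gguf", mparams);

    // Request a hot-swappable LoRA by filling the new fixed-size field.
    llama_context_params cparams = llama_context_default_params(); // hot_lora defaults to ""
    std::strncpy(cparams.hot_lora,
                 "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin",
                 sizeof(cparams.hot_lora) - 1);
    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0';

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... run generation as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```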