Fix passing param

ltoniazzi 2024-06-21 17:28:14 +01:00
parent 12112bfa48
commit 26df64ad04
4 changed files with 40 additions and 30 deletions

@@ -30,19 +30,21 @@
Run `main` with the base model and a LoRA adapter to hot-swap:
```bash
./main ./models/open-llama/ggml-model-f16.gguf \
--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
./main -m ./models/open-llama/ggml-model-f16.gguf \
--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
-ngl 0 \
-n 128
```
With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
With `ngl > 0` the code breaks, probably because the LoRA tensors try to interact with the base tensors (as in `lora_mul_mat`) but are not moved to the GPU buffer that holds the base tensors.
# Logic
# Current status
- Only ony Lora adapter can be passed.
- Only one Lora adapter can be passed.
- Applying the adapter only to the Q, K, V matrices, to keep the code contained (the fine-tuning trained LoRA tensors for all linear layers); see the sketch below
- GPU not supported
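
For context, here is a minimal sketch of the idea behind `lora_mul_mat`; its actual implementation is not part of this diff, so the name, signature, and shapes below are assumptions. The adapter contributes a low-rank update on top of the base projection, `y = W·x + scale·B·(A·x)`, which is why the adapter tensors must end up in the same backend buffer as the base weight for `-ngl > 0` to work.
```cpp
// Hypothetical sketch of what lora_mul_mat computes for a Q/K/V projection;
// the function name, signature, and tensor shapes are assumptions, not code
// from this commit.
#include "ggml.h"

static struct ggml_tensor * lora_mul_mat_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * w,       // base weight (e.g. wq/wk/wv)
        struct ggml_tensor  * lora_a,  // LoRA A factor (down-projection)
        struct ggml_tensor  * lora_b,  // LoRA B factor (up-projection)
        struct ggml_tensor  * x,       // layer input
        float                 scale) { // adapter scaling (lora_scale)
    struct ggml_tensor * wx  = ggml_mul_mat(ctx, w,      x);   // W*x
    struct ggml_tensor * ax  = ggml_mul_mat(ctx, lora_a, x);   // A*x (rank r)
    struct ggml_tensor * bax = ggml_mul_mat(ctx, lora_b, ax);  // B*(A*x)
    // y = W*x + scale*B*(A*x); this is where the adapter tensors interact with
    // the base tensors, hence the -ngl > 0 break when they live in different buffers.
    return ggml_add(ctx, wx, ggml_scale(ctx, bax, scale));
}
```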

@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
const char* c_string = params.hot_lora.c_str();
strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;

@@ -145,7 +145,7 @@ struct lora_info {
std::string filename;
float scale;
};
// TODO lora_data should maybe sub lora_weights in llama.cpp
// TODO lora_data should maybe sub lora_weights
struct lora_data {
struct lora_info info;
std::vector<uint8_t> data;
@@ -2502,7 +2502,7 @@ struct llama_context {
llama_cparams cparams;
bool lora_loaded = false;
std::map<std::string, lora_weights> lora_weights_map;
std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
lora_data llora_data;
float lora_scale = 1.0f;
@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
/*.n_seq_max =*/ 1,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
/*.hot_lora =*/ "",
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.rope_freq_base =*/ 0.0f,
@@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model(
/// LORA
struct export_lora_params * lora_params = new struct export_lora_params;
struct lora_info lora;
lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
lora_params->lora.push_back(lora);
// load all loras
std::vector<struct lora_data *> loras;
for (size_t i = 0; i < lora_params->lora.size(); ++i) {
struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
if (llora_data != NULL) {
loras.push_back(llora_data);
// lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
lora.filename = params.hot_lora;
if (strlen(params.hot_lora) > 0) {
lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
lora_params->lora.push_back(lora);
// load all loras
std::vector<struct lora_data *> loras;
for (size_t i = 0; i < lora_params->lora.size(); ++i) {
struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
if (llora_data != NULL) {
loras.push_back(llora_data);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
}
// Assign data
ctx->llora_data = *loras[0];
// build the map?
ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
std::vector<std::string> keys;
for (const auto& pair : ctx->lora_weights_map) {
keys.push_back(pair.first);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
}
// Assign data
ctx->llora_data = *loras[0];
// build the map?
ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
std::vector<std::string> keys;
for (const auto& pair : ctx->lora_weights_map) {
keys.push_back(pair.first);
}
/// END LORA
/// LORA
const auto & hparams = model->hparams;
auto & cparams = ctx->cparams;
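
For orientation, below is a hedged sketch of what the name-keyed `lora_weights_map` built here is presumably used for at graph-build time; only the map type is visible in this hunk, so the entry layout and the helper are assumptions.
```cpp
// Hypothetical sketch only: the real lora_weights definition and the lookup
// site are not shown in this diff. Assumes the map is keyed by the base tensor
// name (e.g. "blk.0.attn_q.weight") and stores the two low-rank factors.
#include <map>
#include <string>
#include "ggml.h"

struct lora_weights_sketch {
    struct ggml_tensor * loraA; // low-rank A factor
    struct ggml_tensor * loraB; // low-rank B factor
};

// Resolve the adapter pair for a base weight by name; returns NULL when the
// tensor has no hot-LoRA entry (only Q/K/V are covered in the current status).
static const lora_weights_sketch * find_hot_lora(
        const std::map<std::string, lora_weights_sketch> & lora_map,
        const struct ggml_tensor * w) {
    const auto it = lora_map.find(ggml_get_name(w));
    return it == lora_map.end() ? NULL : &it->second;
}
```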

@@ -292,6 +292,7 @@ extern "C" {
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
char hot_lora[256]; // path to the hot lora file
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
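
Finally, a hedged caller-side sketch of the new `hot_lora` field, assuming the standard llama.cpp C API of this period; the model and adapter paths are placeholders. Because `hot_lora` is a fixed `char[256]`, the path is copied in with bounds checking, mirroring the `strncpy` added to `llama_context_params_from_gpt_params` above.
```cpp
// Hypothetical usage sketch; paths are placeholders, not files from this repo.
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("ggml-model-f16.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    // hot_lora is a fixed-size buffer, so copy the adapter path with bounds checking
    snprintf(cparams.hot_lora, sizeof(cparams.hot_lora), "%s",
             "lora-ggml-model-q8_0-hot-lora-LATEST.bin");

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize, decode, and sample as usual; Q/K/V projections now go
    // through the hot adapter loaded from cparams.hot_lora ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```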