llama : add llama_vocab, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)
* llama : add struct llama_vocab to the API (#11156)
* hparams : move vocab params to llama_vocab (#11159)
* vocab : more pimpl (#11165)
* vocab : minor tokenization optimizations (#11160)
* lora : update API names (#11167)
* llama : update API names to use correct prefix (#11174)
* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174)
* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174)

Co-authored-by: Diego Devesa <slarengh@gmail.com>
commit afa8a9ec9b (parent c05e8c9934)
68 changed files with 5855 additions and 5400 deletions
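
Before the diff, a brief sketch of how the renamed public API is meant to be used. This is an illustrative example, not part of the commit itself: the vocab accessors (llama_model_get_vocab, llama_vocab_n_tokens, llama_vocab_get_add_bos) and the LoRA entry points (llama_adapter_lora_init, llama_adapter_lora_free) follow the new naming described above; check llama.h for the exact signatures.

#include "llama.h"

// Illustrative usage of the renamed API (assumes `model` was loaded elsewhere);
// names follow the new prefix scheme and should be verified against llama.h.
static void example_api_usage(llama_model * model, const char * lora_path) {
    // the vocab is now a first-class API object obtained from the model
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // llama_vocab_n_vocab -> llama_vocab_n_tokens
    const int32_t n_tokens = llama_vocab_n_tokens(vocab);

    // llama_vocab_add_bos -> llama_vocab_get_add_bos (likewise for eos)
    const bool add_bos = llama_vocab_get_add_bos(vocab);

    // llama_lora_adapter_init/_free -> llama_adapter_lora_init/_free
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, lora_path);
    if (adapter != nullptr) {
        // ... attach the adapter to a context and run inference ...
        llama_adapter_lora_free(adapter);
    }

    (void) n_tokens;
    (void) add_bos;
}
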
src/llama-adapter.cpp
@@ -1,5 +1,7 @@
 #include "llama-adapter.h"
 
+#include "llama-impl.h"
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -9,7 +11,7 @@
 
 // vec
 
-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -17,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
     return tensors[il];
 }
 
-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
     return cur;
 }
 
-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;
 
-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             }
 
             ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
+            ctxs.emplace_back(ctx);
 
             return ctx;
         }
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     };
 
     // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-       cvec.tensors.push_back(tensor);
+       tensors.push_back(tensor);
    }
 
     // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
     }
 
     return true;
 }
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(
 
     if (data == nullptr) {
         // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end = -1;
+        layer_start = -1;
+        layer_end = -1;
         return 0;
     }
 
@@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
         return 1;
     }
 
-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
             return 1;
         }
     }
 
-    cvec.layer_start = il_start;
-    cvec.layer_end = il_end;
+    layer_start = il_start;
+    layer_end = il_end;
 
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);
 
         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
         if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
         }
     }
 
@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(
 
 // lora
 
-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
     return nullptr;
 }
 
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     };
 
     // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -231,14 +228,14 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
@@ -254,7 +251,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
         bool is_token_embd = str_endswith(name, "token_embd.weight");
 
         if (!w.a || !w.b) {
@@ -262,7 +259,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         }
 
         // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
         if (!model_tensor) {
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
@@ -288,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
     }
 
     // allocate tensors / buffers and zero
@@ -330,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -344,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
 
     return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
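
For completeness, a hedged sketch of how the control-vector member function above is reached from the C API. The wrapper name llama_apply_adapter_cvec and its argument order are assumptions based on this commit's renaming scheme (llama_control_vector_apply becomes llama_adapter_cvec::apply); verify against llama.h before relying on it.

#include <vector>
#include "llama.h"

// Assumed wrapper: forwards to llama_adapter_cvec::apply on the context's model.
// Passing nullptr data disables the current control vector but keeps buffers allocated.
static bool set_control_vector(llama_context * ctx, const std::vector<float> & data,
                               int32_t n_embd, int32_t il_start, int32_t il_end) {
    const int32_t res = llama_apply_adapter_cvec(
            ctx, data.empty() ? nullptr : data.data(), data.size(),
            n_embd, il_start, il_end);
    return res == 0;
}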