llama : add llama_model methods

ggml-ci
Georgi Gerganov 2025-01-06 16:13:01 +02:00
parent c3f9d25706
commit 662dd05016
5 changed files with 342 additions and 347 deletions

--- changed file 1 of 5 ---

@@ -62,7 +62,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
cvec.tensors.reserve(hparams.n_layer);
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < hparams.n_layer; il++) {
- ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+ ggml_backend_buffer_type_t buft = model.select_buft(il);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
@@ -262,7 +262,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
}
// device buft and device ctx
- auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+ const auto * model_tensor = model.get_tensor(name.c_str());
if (!model_tensor) {
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}
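The two hunks above show the pattern this commit applies everywhere: a free function taking `const llama_model &` as its first argument becomes a const member function of `llama_model`, so the implicit `this` replaces the explicit model parameter. A minimal, self-contained sketch of the pattern (the `Model`/`describe` names are hypothetical stand-ins, not llama.cpp API):

#include <string>

struct Model {
    int n_layer = 32;

    // before: std::string model_describe(const Model & m);
    // after: a const member function; `this` replaces the explicit parameter
    std::string describe() const {
        return "model with " + std::to_string(n_layer) + " layers";
    }
};

int main() {
    Model m;
    // the call shape changes from model_describe(m) to:
    return m.describe().empty() ? 1 : 0;
}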

--- changed file 2 of 5 (diff suppressed because it is too large) ---

--- changed file 3 of 5 ---

@@ -10,6 +10,8 @@
#include <vector>
+
+ struct llama_model_loader;
// available models
// TODO: this enum does not follow the enum naming convention
enum llm_type {
@@ -370,27 +372,26 @@ struct llama_model {
// total size of all the tensors in the model in bytes
size_t n_bytes = 0;
+ std::string arch_name() const;
+ std::string type_name() const;
+ std::string ftype_name() const;
+ ggml_backend_buffer_type_t select_buft(int il) const;
+ const struct ggml_tensor * get_tensor(const char * name) const;
+ size_t max_nodes() const;
+ void load_stats (llama_model_loader & ml);
+ void load_arch (llama_model_loader & ml);
+ void load_hparams(llama_model_loader & ml);
+ void load_vocab (llama_model_loader & ml);
+ void print_meta(llama_model_loader & ml);
+ private:
+ std::string token_to_piece(llama_token token, bool special) const;
};
const char * llm_type_name(llm_type type);
- std::string llama_model_arch_name (const llama_model & model);
- std::string llama_model_type_name (const llama_model & model);
- std::string llama_model_ftype_name(const llama_model & model);
- // used by llama_adapter_cvec
- ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
- // used by llama_adapter_lora
- struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
- size_t llama_model_max_nodes(const llama_model & model);
- struct llama_model_loader;
- // TODO: become llama_model methods
- void llm_load_stats (llama_model_loader & ml, llama_model & model);
- void llm_load_arch (llama_model_loader & ml, llama_model & model);
- void llm_load_hparams (llama_model_loader & ml, llama_model & model);
- void llm_load_vocab (llama_model_loader & ml, llama_model & model);
- void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
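One detail in the header hunk above: `get_tensor` now returns a pointer to const, which is why the adapter call site in the first file changed `auto *` to `const auto *`. Both spellings deduce the same pointer-to-const type; the diff simply spells the constness out for the reader. A small compilable sketch (the `model_t`/`find` names are hypothetical):

#include <cassert>

struct tensor { int id; };

struct model_t {
    tensor t { 42 };
    // mirrors the new get_tensor: the returned pointer is const-qualified
    const tensor * find() const { return &t; }
};

int main() {
    model_t m;
    auto       * a = m.find(); // deduced as const tensor * (const is inferred)
    const auto * b = m.find(); // same type, constness made explicit
    assert(a == b);
    return 0;
}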

--- changed file 4 of 5 ---

@@ -529,9 +529,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
ml.init_mappings(false); // no prefetching
llama_model model;
- llm_load_arch (ml, model);
- llm_load_hparams(ml, model);
- llm_load_stats (ml, model);
+ model.load_arch (ml);
+ model.load_hparams(ml);
+ model.load_stats (ml);
struct quantize_state_impl qs(model, params);

--- changed file 5 of 5 ---

@@ -2556,23 +2556,23 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
model.hparams.vocab_only = params.vocab_only;
try {
- llm_load_arch(ml, model);
+ model.load_arch(ml);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
}
try {
- llm_load_hparams(ml, model);
+ model.load_hparams(ml);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
}
try {
- llm_load_vocab(ml, model);
+ model.load_vocab(ml);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
}
- llm_load_stats(ml, model);
- llm_load_print_meta(ml, model);
+ model.load_stats(ml);
+ model.print_meta(ml);
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
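After this hunk the load path is a fixed sequence of method calls on the model. A condensed, compilable sketch of the flow with stand-in types (the real function wraps each step in its own try/catch so it can attach a step-specific error message; this sketch collapses them into one):

#include <stdexcept>
#include <string>

// stand-ins for the llama.cpp internals, for illustration only
struct llama_model_loader {};
struct llama_model {
    void load_arch   (llama_model_loader &) {}
    void load_hparams(llama_model_loader &) {}
    void load_vocab  (llama_model_loader &) {}
    void load_stats  (llama_model_loader &) {}
    void print_meta  (llama_model_loader &) {}
};

// condensed shape of llama_model_load after this commit
static void load(llama_model & model, llama_model_loader & ml) {
    try {
        model.load_arch   (ml); // detect the architecture from metadata
        model.load_hparams(ml); // read hyperparameters
        model.load_vocab  (ml); // build the vocabulary
    } catch (const std::exception & e) {
        throw std::runtime_error("error loading model: " + std::string(e.what()));
    }
    model.load_stats(ml);       // tensor counts and total size
    model.print_meta(ml);       // log the model description
}

int main() {
    llama_model model;
    llama_model_loader ml;
    load(model, ml);
    return 0;
}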
@@ -3686,7 +3686,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_k_shift() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
GGML_ASSERT(kv_self.size == n_ctx);
@@ -3736,7 +3736,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
@@ -3995,7 +3995,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -4161,7 +4161,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_deci() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -4322,7 +4322,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4437,7 +4437,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_xverse() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4540,7 +4540,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -4660,7 +4660,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_grok() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -4819,7 +4819,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_dbrx() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -4947,7 +4947,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -5051,7 +5051,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5145,7 +5145,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bert() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -5339,7 +5339,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -5440,7 +5440,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -5730,7 +5730,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5842,7 +5842,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5954,7 +5954,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen2vl() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -6072,7 +6072,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen2moe() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -6220,7 +6220,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_phi2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6341,7 +6341,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_phi3() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6595,7 +6595,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gpt2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6700,7 +6700,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_codeshell() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -6811,7 +6811,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_orion() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6929,7 +6929,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_internlm2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7047,7 +7047,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_minicpm3() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
@@ -7256,7 +7256,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gemma() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -7364,7 +7364,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gemma2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -7500,7 +7500,7 @@ struct llm_build_context {
struct ggml_cgraph * build_starcoder2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7619,7 +7619,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_mamba() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -7674,7 +7674,7 @@ struct llm_build_context {
struct ggml_cgraph * build_command_r() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7822,7 +7822,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_cohere2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7959,7 +7959,7 @@ struct llm_build_context {
// * removed bias
// * removed MoE
struct ggml_cgraph * build_olmo() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -8083,7 +8083,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_olmo2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -8211,7 +8211,7 @@ struct llm_build_context {
// * removed bias
// * added q, k norm
struct ggml_cgraph * build_olmoe() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -8337,7 +8337,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_openelm() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8462,7 +8462,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gptneox() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -8604,7 +8604,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_arctic() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -8738,7 +8738,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_deepseek() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -8895,7 +8895,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_deepseek2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -9125,7 +9125,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bitnet() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9276,7 +9276,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_t5_enc() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -9408,7 +9408,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_t5_dec() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -9613,7 +9613,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_jais() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9705,7 +9705,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_chatglm() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -9819,7 +9819,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_nemotron() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9940,7 +9940,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_exaone() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -10067,7 +10067,7 @@ struct llm_build_context {
}
ggml_cgraph * build_rwkv6() {
- ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ ggml_cgraph *gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// Token shift state dimensions should be 2 * n_emb
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
@@ -10298,7 +10298,7 @@ struct llm_build_context {
// * removed bias
// * removed MoE
struct ggml_cgraph * build_chameleon() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@@ -10470,7 +10470,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_wavtokenizer_dec() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -11502,9 +11502,9 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
// - x2 for keys and values
- //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
+ //const uint32_t max_moves = model.max_nodes()/(6*n_layer);
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
- const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
+ const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer);
// determine which KV cells to move where
//
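For a sense of scale in the max_moves expression above: each move costs 6*n_layer graph nodes (source view, destination view, copy, times two for keys and values), and 2*n_layer nodes of headroom are subtracted by the temporary fix. Assuming, hypothetically, that max_nodes() returns 8192 for a 32-layer model:

max_moves = (8192 - 2*32) / (6*32) = 8128 / 192 = 42   (integer division)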
@@ -12280,7 +12280,7 @@ struct llama_context * llama_new_context_with_model(
backend_ptrs.push_back(backend.get());
}
- const size_t max_nodes = llama_model_max_nodes(*model);
+ const size_t max_nodes = model->max_nodes();
// buffer used to store the computation graph and the tensor meta data
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
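max_nodes() thus directly sizes the graph-metadata buffer: one tensor-overhead slot per potential node, plus the bookkeeping overhead of a graph of that size. A sketch of the arithmetic with assumed stand-in values (the real numbers come from ggml_tensor_overhead() and ggml_graph_overhead_custom(); the constants below are illustrative, not ggml's actual figures):

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t max_nodes       = 8192;            // hypothetical graph size
    const std::size_t tensor_overhead = 336;             // assumed per-tensor metadata size
    const std::size_t graph_overhead  = 16 * max_nodes;  // assumed node/leaf bookkeeping

    const std::size_t buf_size = tensor_overhead*max_nodes + graph_overhead;
    std::printf("compute meta buffer: %zu bytes\n", buf_size);
    return 0;
}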