commit 5ea96cc710
parent 2b6ff2ec54
Author: mike dupont
Date:   2023-12-05 11:06:00 -05:00

3 changed files with 225 additions and 352 deletions

View file

@@ -13,9 +13,11 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
+    LLM_ARCH_QWEN,
     LLM_ARCH_UNKNOWN,
 };
 
 enum llm_kv {
     LLM_KV_GENERAL_ARCHITECTURE,
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
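Adding LLM_ARCH_QWEN to the enum only names the architecture; for the loader to resolve it from GGUF metadata, the LLM_ARCH_NAMES table further down in llama.cpp presumably gains a matching entry (that hunk is not part of this excerpt). A minimal sketch, assuming the map layout shown later in this commit:

    static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
        { LLM_ARCH_LLAMA,    "llama"    },
        // ... existing entries ...
        { LLM_ARCH_STABLELM, "stablelm" },
        { LLM_ARCH_QWEN,     "qwen"     }, // assumed new entry for the Qwen architecture
    };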
@@ -141,41 +143,7 @@ struct llama_cparams {
     bool mul_mat_q;
 };
 
-struct llama_layer {
-    // normalization
-    struct ggml_tensor * attn_norm;
-    struct ggml_tensor * attn_norm_b;
-    struct ggml_tensor * attn_norm_2;
-    struct ggml_tensor * attn_norm_2_b;
-    struct ggml_tensor * attn_q_norm;
-    struct ggml_tensor * attn_q_norm_b;
-    struct ggml_tensor * attn_k_norm;
-    struct ggml_tensor * attn_k_norm_b;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-    struct ggml_tensor * wqkv;
-
-    // attention bias
-    struct ggml_tensor * bo;
-    struct ggml_tensor * bqkv;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-    struct ggml_tensor * ffn_norm_b;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-
-    // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b;   // b3
-};
+#include "llama-layer.hpp"
 
 struct llama_kv_cell {
     llama_pos pos = -1;
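The removed struct is replaced by a single include. Assuming llama-layer.hpp is a plain move of the definition deleted above, the new header would look roughly like this (abridged; the member list is exactly the one removed here):

    // llama-layer.hpp -- sketch of the extracted header (assumed contents)
    #pragma once

    struct ggml_tensor; // a forward declaration is enough for the raw pointers below

    struct llama_layer {
        // normalization
        struct ggml_tensor * attn_norm;
        struct ggml_tensor * attn_norm_b;
        // ... remaining attention / ffn tensors as in the block removed above ...
        struct ggml_tensor * ffn_down_b; // b2
        struct ggml_tensor * ffn_up_b;   // b3
    };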
@@ -211,7 +179,8 @@ struct llama_kv_cache {
     // for a free KV slot. llama_decode_internal also uses it, so it
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
     uint32_t n = 0;
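The new used counter tracks how many cells currently hold at least one sequence id. The maintenance code is not part of this excerpt; a sketch of the bookkeeping it implies (helper names are hypothetical):

    // hypothetical helpers illustrating how 'used' stays consistent with the cells
    static void kv_cell_add_seq(llama_kv_cache & cache, uint32_t i, llama_seq_id id) {
        if (cache.cells[i].seq_id.empty()) {
            cache.used++;                 // cell becomes used when it gets its first seq_id
        }
        cache.cells[i].seq_id.insert(id);
    }

    static void kv_cell_clear(llama_kv_cache & cache, uint32_t i) {
        if (!cache.cells[i].seq_id.empty()) {
            cache.used--;                 // cell is no longer referenced by any sequence
        }
        cache.cells[i].seq_id.clear();
        cache.cells[i].pos = -1;
    }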
@@ -225,18 +194,7 @@ struct llama_kv_cache {
     llama_buffer buf;
 
-    ~llama_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-
-#ifdef GGML_USE_CUBLAS
-        if (ggml_cublas_loaded()) {
-            ggml_cuda_free_data(k);
-            ggml_cuda_free_data(v);
-        }
-#endif
-    }
+    ~llama_kv_cache();
 };
 
 struct llama_vocab {
@@ -275,19 +233,7 @@ struct llama_vocab {
     id special_suffix_id = 32008;
     id special_eot_id    = 32010;
 
-    int find_bpe_rank(std::string token_left, std::string token_right) const {
-        GGML_ASSERT(token_left.find(" ") == std::string::npos);
-        GGML_ASSERT(token_left.find("\n") == std::string::npos);
-        GGML_ASSERT(token_right.find(" ") == std::string::npos);
-        GGML_ASSERT(token_right.find("\n") == std::string::npos);
-
-        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
-        if (it == bpe_ranks.end()) {
-            return -1;
-        }
-
-        return it->second;
-    }
+    int find_bpe_rank(std::string token_left, std::string token_right) const;
 };
 
 struct llama_mmap {
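find_bpe_rank is now only declared here; its definition (moved out of line later in this commit) looks up a token pair in bpe_ranks and returns -1 when the pair is not a known merge. A small usage sketch (the token strings are illustrative, not from a real vocabulary):

    int rank = vocab.find_bpe_rank("h", "e");
    if (rank < 0) {
        // "h" + "e" is not a merge in this model's BPE table
    }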
@@ -429,30 +375,12 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
-    ~llama_model() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-
-#ifdef GGML_USE_CUBLAS
-        if (ggml_cublas_loaded()) {
-            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-                ggml_cuda_free_data(tensors_by_name[i].second);
-            }
-            ggml_cuda_free_scratch();
-        }
-#endif
-
-#if defined(GGML_USE_CLBLAST)
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-            ggml_cl_free_data(tensors_by_name[i].second);
-        }
-#endif
-    }
+    ~llama_model();
 };
 
 struct llama_context {
-    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
+    llama_context(const llama_model & model);
     ~llama_context();
 
     llama_cparams cparams;
@@ -540,6 +468,8 @@ struct llama_state {
     // We save the log callback globally
     ggml_log_callback log_callback;
     void * log_callback_user_data = nullptr;
+
+    bool operator!=(const llama_hparams & other) const;
+    static llama_state g_state;
 };
@@ -578,7 +508,7 @@ struct llama_model_loader {
     struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend);
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend);
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true);
 
     void done_getting_tensors() const;
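The extra required parameter lets a caller treat a tensor as optional instead of failing the whole load. A hedged usage sketch (the loader instance, tensor name, and shape are illustrative):

    // returns the tensor, or presumably NULL when it is missing and required == false
    struct ggml_tensor * bias = ml.create_tensor(ctx, "blk.0.attn_qkv.bias", {3*n_embd}, backend, /*required=*/false);
    if (bias == NULL) {
        // the model file has no such tensor; continue without it
    }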
@@ -739,6 +669,7 @@ struct llm_build_context {
     struct ggml_cgraph * build_bloom();
     struct ggml_cgraph * build_mpt();
     struct ggml_cgraph * build_stablelm();
+    struct ggml_cgraph * build_qwen();
 };
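Declaring build_qwen() here implies that the per-architecture dispatch elsewhere in llama.cpp gains a matching case (that hunk is not visible in this excerpt). A sketch following the existing pattern, with the switch assumed to live in the graph-building entry point:

    switch (model.arch) {
        // ... existing architectures ...
        case LLM_ARCH_QWEN:
            {
                result = llm.build_qwen();
            } break;
        default:
            GGML_ASSERT(false);
    }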

llama.cpp
View file

@@ -119,13 +119,13 @@ static size_t utf8_len(char src) {
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
         if (new_pos == std::string::npos) {
             result += s.substr(pos, s.size() - pos);
             break;
         }
         result += s.substr(pos, new_pos - pos) + replace;
         pos = new_pos;
     }
     s = std::move(result);
 }
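A quick usage example for the helper above:

    std::string s = "1 2 2 3";
    replace_all(s, "2", "two"); // s becomes "1 two two 3"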
@@ -182,22 +182,6 @@ static std::string format(const char * fmt, ...) {
 // gguf constants (sync with gguf.py)
 //
 
-enum llm_arch {
-    LLM_ARCH_LLAMA,
-    LLM_ARCH_FALCON,
-    LLM_ARCH_BAICHUAN,
-    LLM_ARCH_GPT2,
-    LLM_ARCH_GPTJ,
-    LLM_ARCH_GPTNEOX,
-    LLM_ARCH_MPT,
-    LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
-    LLM_ARCH_REFACT,
-    LLM_ARCH_BLOOM,
-    LLM_ARCH_STABLELM,
-    LLM_ARCH_QWEN,
-    LLM_ARCH_UNKNOWN,
-};
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
@@ -641,6 +625,8 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
 
 //struct llama_buffer {
 void llama_buffer::resize(size_t n) {
@@ -1014,168 +1000,128 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token)
 // globals
 //
 
-struct llama_state {
-    llama_state() {
+//struct llama_state {
+llama_state::llama_state() {
 #ifdef GGML_USE_METAL
     ggml_metal_log_set_callback(log_callback, log_callback_user_data);
 #endif
 }
 
 // We save the log callback globally
-ggml_log_callback log_callback = llama_log_callback_default;
-void * log_callback_user_data = nullptr;
-};
+// ggml_log_callback log_callback = llama_log_callback_default;
+// void * log_callback_user_data = nullptr;
+//};
+// uint32_t n_gqa() const {
+//     return n_head/n_head_kv;
+// }
+// uint32_t n_embd_head() const {
+//     return n_embd/n_head;
+// }
+// uint32_t n_embd_gqa() const {
+//     return n_embd/n_gqa();
+// }
+// };
 
 static llama_state g_state;
 
-static const size_t kiB = 1024;
-static const size_t MiB = 1024*kiB;
-static const size_t GiB = 1024*MiB;
-
-    float f_norm_eps;
-    float f_norm_rms_eps;
-
-    float rope_freq_base_train;
-    float rope_freq_scale_train;
-    uint32_t n_yarn_orig_ctx;
-    int8_t rope_scaling_type_train : 3;
-    bool rope_finetuned : 1;
-
-    float f_clamp_kqv;
-    float f_max_alibi_bias;
-
-    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
-        if (this->n_vocab != other.n_vocab) return true;
-        if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
-        if (this->n_head != other.n_head) return true;
-        if (this->n_head_kv != other.n_head_kv) return true;
-        if (this->n_layer != other.n_layer) return true;
-        if (this->n_rot != other.n_rot) return true;
-        if (this->n_ff != other.n_ff) return true;
-        if (this->rope_finetuned != other.rope_finetuned) return true;
-        if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
-
-        const float EPSILON = 1e-9;
-
-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-
-        return false;
-    }
-
-    uint32_t n_gqa() const {
-        return n_head/n_head_kv;
-    }
-
-    uint32_t n_embd_head() const {
-        return n_embd/n_head;
-    }
-
-    uint32_t n_embd_gqa() const {
-        return n_embd/n_gqa();
-    }
-};
-
-struct llama_cparams {
-    uint32_t n_ctx;           // context size used during inference
-    uint32_t n_batch;
-    uint32_t n_threads;       // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
-
-    float rope_freq_base;
-    float rope_freq_scale;
-
-    uint32_t n_yarn_orig_ctx;
-    // These hyperparameters are not exposed in GGUF, because all
-    // existing YaRN models use the same values for them.
-    float yarn_ext_factor;
-    float yarn_attn_factor;
-    float yarn_beta_fast;
-    float yarn_beta_slow;
-
-    bool mul_mat_q;
-};
-
-struct llama_layer {
-    // normalization
-    struct ggml_tensor * attn_norm;
-    struct ggml_tensor * attn_norm_b;
-    struct ggml_tensor * attn_norm_2;
-    struct ggml_tensor * attn_norm_2_b;
-    struct ggml_tensor * attn_q_norm;
-    struct ggml_tensor * attn_q_norm_b;
-    struct ggml_tensor * attn_k_norm;
-    struct ggml_tensor * attn_k_norm_b;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-    struct ggml_tensor * wqkv;
-
-    // attention bias
-    struct ggml_tensor * bq;
-    struct ggml_tensor * bk;
-    struct ggml_tensor * bv;
-    struct ggml_tensor * bo;
-    struct ggml_tensor * bqkv;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-    struct ggml_tensor * ffn_norm_b;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-
-    // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b;   // b3
-};
-
-struct llama_kv_cell {
-    llama_pos pos = -1;
-    llama_pos delta = 0;
-
-    std::set<llama_seq_id> seq_id;
-
-    bool has_seq_id(const llama_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-};
+// struct llama_cparams {
+//     uint32_t n_ctx;           // context size used during inference
+//     uint32_t n_batch;
+//     uint32_t n_threads;       // number of threads to use for generation
+//     uint32_t n_threads_batch; // number of threads to use for batch processing
+//     float rope_freq_base;
+//     float rope_freq_scale;
+//     uint32_t n_yarn_orig_ctx;
+//     // These hyperparameters are not exposed in GGUF, because all
+//     // existing YaRN models use the same values for them.
+//     float yarn_ext_factor;
+//     float yarn_attn_factor;
+//     float yarn_beta_fast;
+//     float yarn_beta_slow;
+//     bool mul_mat_q;
+// };
+// struct llama_layer {
+//     // normalization
+//     struct ggml_tensor * attn_norm;
+//     struct ggml_tensor * attn_norm_b;
+//     struct ggml_tensor * attn_norm_2;
+//     struct ggml_tensor * attn_norm_2_b;
+//     struct ggml_tensor * attn_q_norm;
+//     struct ggml_tensor * attn_q_norm_b;
+//     struct ggml_tensor * attn_k_norm;
+//     struct ggml_tensor * attn_k_norm_b;
+//     // attention
+//     struct ggml_tensor * wq;
+//     struct ggml_tensor * wk;
+//     struct ggml_tensor * wv;
+//     struct ggml_tensor * wo;
+//     struct ggml_tensor * wqkv;
+//     // attention bias
+//     struct ggml_tensor * bq;
+//     struct ggml_tensor * bk;
+//     struct ggml_tensor * bv;
+//     struct ggml_tensor * bo;
+//     struct ggml_tensor * bqkv;
+//     // normalization
+//     struct ggml_tensor * ffn_norm;
+//     struct ggml_tensor * ffn_norm_b;
+//     // ff
+//     struct ggml_tensor * ffn_gate; // w1
+//     struct ggml_tensor * ffn_down; // w2
+//     struct ggml_tensor * ffn_up;   // w3
+//     // ff bias
+//     struct ggml_tensor * ffn_down_b; // b2
+//     struct ggml_tensor * ffn_up_b;   // b3
+// };
+// struct llama_kv_cell {
+//     llama_pos pos = -1;
+//     llama_pos delta = 0;
+//     std::set<llama_seq_id> seq_id;
+//     bool has_seq_id(const llama_seq_id & id) const {
+//         return seq_id.find(id) != seq_id.end();
+//     }
+// };
 
 // ring-buffer of cached KV data
-struct llama_kv_cache {
-    bool has_shift = false;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_internal also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    std::vector<llama_kv_cell> cells;
-
-    struct ggml_tensor * k = NULL;
-    struct ggml_tensor * v = NULL;
-
-    struct ggml_context * ctx = NULL;
-
-    llama_buffer buf;
-
-    ~llama_kv_cache() {
+// struct llama_kv_cache {
+//     bool has_shift = false;
+//     // Note: The value of head isn't only used to optimize searching
+//     // for a free KV slot. llama_decode_internal also uses it, so it
+//     // cannot be freely changed after a slot has been allocated.
+//     uint32_t head = 0;
+//     uint32_t size = 0;
+//     uint32_t used = 0; // used cells (i.e. at least one seq_id)
+//     // computed before each graph build
+//     uint32_t n = 0;
+//     std::vector<llama_kv_cell> cells;
+//     struct ggml_tensor * k = NULL;
+//     struct ggml_tensor * v = NULL;
+//     struct ggml_context * ctx = NULL;
+//     llama_buffer buf;
+llama_kv_cache::~llama_kv_cache() {
     if (ctx) {
         ggml_free(ctx);
     }
@@ -1187,45 +1133,45 @@ struct llama_kv_cache {
     }
 #endif
 }
-};
+//};
 
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-    using ttype = llama_token_type;
-
-    struct token_data {
-        token text;
-        float score;
-        ttype type;
-    };
-
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_data>       id_to_token;
-
-    std::unordered_map<token, id> special_tokens_cache;
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
-
-    // default LLaMA special tokens
-    id special_bos_id = 1;
-    id special_eos_id = 2;
-    id special_unk_id = 0;
-    id special_sep_id = -1;
-    id special_pad_id = -1;
-
-    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
-    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
-    id linefeed_id       = 13;
-    id special_prefix_id = 32007;
-    id special_middle_id = 32009;
-    id special_suffix_id = 32008;
-    id special_eot_id    = 32010;
-
-    int find_bpe_rank(std::string token_left, std::string token_right) const {
+// struct llama_vocab {
+//     using id    = int32_t;
+//     using token = std::string;
+//     using ttype = llama_token_type;
+//     struct token_data {
+//         token text;
+//         float score;
+//         ttype type;
+//     };
+//     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+//     std::unordered_map<token, id> token_to_id;
+//     std::vector<token_data>       id_to_token;
+//     std::unordered_map<token, id> special_tokens_cache;
+//     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+//     // default LLaMA special tokens
+//     id special_bos_id = 1;
+//     id special_eos_id = 2;
+//     id special_unk_id = 0;
+//     id special_sep_id = -1;
+//     id special_pad_id = -1;
+//     int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+//     int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+//     id linefeed_id       = 13;
+//     id special_prefix_id = 32007;
+//     id special_middle_id = 32009;
+//     id special_suffix_id = 32008;
+//     id special_eot_id    = 32010;
+int llama_vocab::find_bpe_rank(std::string token_left, std::string token_right) const {
     GGML_ASSERT(token_left.find(" ") == std::string::npos);
     GGML_ASSERT(token_left.find("\n") == std::string::npos);
     GGML_ASSERT(token_right.find(" ") == std::string::npos);
@@ -1238,54 +1184,54 @@ struct llama_vocab {
     return it->second;
 }
-};
+//};
 
-struct llama_model {
-    e_model     type  = MODEL_UNKNOWN;
-    llm_arch    arch  = LLM_ARCH_UNKNOWN;
-    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::string name = "n/a";
-
-    llama_hparams hparams = {};
-    llama_vocab   vocab;
-
-    struct ggml_tensor * tok_embd;
-    struct ggml_tensor * pos_embd;
-    struct ggml_tensor * tok_norm;
-    struct ggml_tensor * tok_norm_b;
-
-    struct ggml_tensor * output_norm;
-    struct ggml_tensor * output_norm_b;
-    struct ggml_tensor * output;
-
-    std::vector<llama_layer> layers;
-
-    int n_gpu_layers;
-
-    // gguf metadata
-    std::unordered_map<std::string, std::string> gguf_kv;
-
-    // context
-    struct ggml_context * ctx = NULL;
-
-    // the model memory buffer
-    llama_buffer buf;
-
-    // model memory mapped file
-    std::unique_ptr<llama_mmap> mapping;
-
-    // objects representing data potentially being locked in memory
-    llama_mlock mlock_buf;
-    llama_mlock mlock_mmap;
-
-    // for quantize-stats only
-    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
-
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
-
-    ~llama_model() {
+// struct llama_model {
+//     e_model     type  = MODEL_UNKNOWN;
+//     llm_arch    arch  = LLM_ARCH_UNKNOWN;
+//     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
+//     std::string name = "n/a";
+//     llama_hparams hparams = {};
+//     llama_vocab   vocab;
+//     struct ggml_tensor * tok_embd;
+//     struct ggml_tensor * pos_embd;
+//     struct ggml_tensor * tok_norm;
+//     struct ggml_tensor * tok_norm_b;
+//     struct ggml_tensor * output_norm;
+//     struct ggml_tensor * output_norm_b;
+//     struct ggml_tensor * output;
+//     std::vector<llama_layer> layers;
+//     int n_gpu_layers;
+//     // gguf metadata
+//     std::unordered_map<std::string, std::string> gguf_kv;
+//     // context
+//     struct ggml_context * ctx = NULL;
+//     // the model memory buffer
+//     llama_buffer buf;
+//     // model memory mapped file
+//     std::unique_ptr<llama_mmap> mapping;
+//     // objects representing data potentially being locked in memory
+//     llama_mlock mlock_buf;
+//     llama_mlock mlock_mmap;
+//     // for quantize-stats only
+//     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
+//     int64_t t_load_us = 0;
+//     int64_t t_start_us = 0;
+llama_model::~llama_model() {
     if (ctx) {
         ggml_free(ctx);
     }
@@ -1305,11 +1251,11 @@ struct llama_model {
     }
 #endif
 }
-};
+//};
 
-struct llama_context {
-    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
-    ~llama_context() {
+//struct llama_context {
+llama_context::llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
+llama_context::~llama_context() {
 #ifdef GGML_USE_METAL
     if (ctx_metal) {
         ggml_metal_free(ctx_metal);
@@ -1320,51 +1266,51 @@ struct llama_context {
     }
 }
 
-    llama_cparams cparams;
-
-    const llama_model & model;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
-
-    std::mt19937 rng;
-
-    bool has_evaluated_once = false;
-
-    int64_t t_start_us;
-    int64_t t_load_us;
-    int64_t t_sample_us = 0;
-    int64_t t_p_eval_us = 0;
-    int64_t t_eval_us   = 0;
-
-    int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-    int32_t n_eval   = 0; // number of eval calls
-
-    // decode output (2-dimensional array: [n_tokens][n_vocab])
-    std::vector<float> logits;
-    bool logits_all = false;
-
-    // input embedding (1-dimensional array: [n_embd])
-    std::vector<float> embedding;
-
-    // reusable buffer for `struct ggml_graph_plan.work_data`
-    std::vector<uint8_t> work_buffer;
-
-    // memory buffers used to evaluate the model
-    llama_buffer buf_compute;
-
-    llama_buffer buf_alloc;
-    ggml_allocr * alloc = NULL;
-
-#ifdef GGML_USE_METAL
-    ggml_metal_context * ctx_metal = NULL;
-#endif
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
-};
+// llama_cparams cparams;
+// const llama_model & model;
+// // key + value cache for the self attention
+// struct llama_kv_cache kv_self;
+// std::mt19937 rng;
+// bool has_evaluated_once = false;
+// int64_t t_start_us;
+// int64_t t_load_us;
+// int64_t t_sample_us = 0;
+// int64_t t_p_eval_us = 0;
+// int64_t t_eval_us   = 0;
+// int32_t n_sample = 0; // number of tokens sampled
+// int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+// int32_t n_eval   = 0; // number of eval calls
+// // decode output (2-dimensional array: [n_tokens][n_vocab])
+// std::vector<float> logits;
+// bool logits_all = false;
+// // input embedding (1-dimensional array: [n_embd])
+// std::vector<float> embedding;
+// // reusable buffer for `struct ggml_graph_plan.work_data`
+// std::vector<uint8_t> work_buffer;
+// // memory buffers used to evaluate the model
+// llama_buffer buf_compute;
+// llama_buffer buf_alloc;
+// ggml_allocr * alloc = NULL;
+// #ifdef GGML_USE_METAL
+// ggml_metal_context * ctx_metal = NULL;
+// #endif
+// #ifdef GGML_USE_MPI
+// ggml_mpi_context * ctx_mpi = NULL;
+// #endif
+// };
 
 //
 // kv cache helpers
@@ -1827,7 +1773,12 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap)
     return tensor;
 }
 
-struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
+struct ggml_tensor * llama_model_loader::create_tensor(
+    struct ggml_context * ctx,
+    const std::string & name,
+    const std::vector<int64_t> & ne,
+    ggml_backend_type backend,
+    bool required = true) {
     struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
     if (cur == NULL) {
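Only the reformatted signature and the first lines of the body are visible in this hunk; presumably the rest of the function uses the required flag to tolerate a missing tensor rather than aborting the load, along these lines (assumed, not shown in the diff):

    if (cur == NULL) {
        if (!required) {
            return NULL; // optional tensor absent from the file
        }
        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
    }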
@@ -4837,7 +4788,7 @@ struct ggml_cgraph * llm_build_context::build_stablelm() {
     return gf;
 }
 
-struct ggml_cgraph * build_qwen() {
+struct ggml_cgraph * llm_build_context::build_qwen() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
     struct ggml_tensor * cur;
@@ -4951,7 +4902,6 @@ struct ggml_cgraph * llm_build_context::build_stablelm() {
     return gf;
 }
-};
 
 //
 // tensor offloading helpers
@@ -4968,7 +4918,7 @@ llm_offload_trie::node::~node() {
             delete children[i];
         }
     }
 }
 
 // node * children[256] = { nullptr };
 // llm_offload_func_e func = OFFLOAD_FUNC_NOP;
@@ -9793,16 +9743,3 @@ std::string LLM_KV::operator()(llm_kv kv) const {
 }
 
-llama_context::~llama_context() {
-#ifdef GGML_USE_METAL
-    if (ctx_metal) {
-        ggml_metal_free(ctx_metal);
-    }
-#endif
-    if (alloc) {
-        ggml_allocr_free(alloc);
-    }
-}
-
-llama_state::llama_state() {
-    log_callback = llama_log_callback_default;
-}

View file

@@ -49,6 +49,11 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 #ifdef __cplusplus
 //extern "C" {
 #endif
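With the size constants now visible to consumers of this header, buffer sizes can be written in terms of them; a trivial usage sketch:

    const size_t scratch_size = 512*MiB; // 512 * 1024 * 1024 bytes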