removed the k-quants changes
parent d8f3f7089f
commit 6e7f15ddf8

1 changed file with 78 additions and 147 deletions

llama.cpp (225 changed lines: +78, -147)
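
For orientation, the caller-facing effect of the changes below: the split llama_load_model_from_file / llama_new_context_with_model path is removed, so a model and its context are created by a single call again. A minimal usage sketch against the post-revert API (the model path and error handling are illustrative only, and llama_context_default_params() is assumed from llama.h rather than shown in this diff):

    // Sketch only: assumes the llama.h declarations as they stand after this commit.
    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        llama_init_backend();  // the numa parameter is gone again after this revert

        struct llama_context_params params = llama_context_default_params();

        // hypothetical model path; llama_init_from_file loads the model and creates the context in one step
        struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // ... tokenize, llama_eval(), sampling as usual ...

        llama_free(ctx);  // the context now owns the model, so this frees both
        return 0;
    }
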
--- a/llama.cpp
+++ b/llama.cpp
@@ -21,13 +21,9 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
-#ifdef GGML_QKK_64
-#define QK_K 64
-#else
 #define QK_K 256
 #endif
-#endif
 #endif
 
 #include <array>
 #include <ctime>
@@ -186,19 +182,6 @@ struct llama_kv_cache {
     }
 };
 
-struct llama_vocab {
-    using id = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
 
@@ -215,6 +198,10 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;
 
+    // key + value cache for the self attention
+    // TODO: move to llama_state
+    struct llama_kv_cache kv_self;
+
     // the model memory buffer
     llama_ctx_buffer buf;
 
@@ -228,11 +215,6 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
-
-    llama_vocab vocab;
-
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -251,11 +233,24 @@ struct llama_model {
     }
 };
 
-struct llama_context {
-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+struct llama_vocab {
+    using id = int32_t;
+    using token = std::string;
 
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
+struct llama_context {
     std::mt19937 rng;
 
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
     bool has_evaluated_once = false;
 
     int64_t t_sample_us = 0;
@@ -266,16 +261,8 @@ struct llama_context {
     int32_t n_eval = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
-    const llama_model & model;
-    const llama_vocab & vocab;
-
-    bool model_owner = false;
-
-    int64_t t_load_us;
-    int64_t t_start_us;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
+    llama_model model;
+    llama_vocab vocab;
 
     size_t mem_per_token = 0;
 
@@ -774,7 +761,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
@@ -977,7 +964,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend(bool numa) {
+void llama_init_backend() {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -986,10 +973,6 @@ void llama_init_backend(bool numa) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-    if (numa) {
-        ggml_numa_init();
-    }
 }
 
 int64_t llama_time_us() {
@@ -1050,8 +1033,7 @@ static const char *llama_model_type_name(e_model type) {
 
 static void llama_model_load_internal(
         const std::string & fname,
-        llama_model & model,
-        llama_vocab & vocab,
+        llama_context & lctx,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1065,11 +1047,12 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
 
-    model.t_start_us = ggml_time_us();
+    lctx.t_start_us = ggml_time_us();
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
 
-    vocab = std::move(ml->file_loaders.at(0)->vocab);
+    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
+    auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1139,15 +1122,15 @@ static void llama_model_load_internal(
 
     // create the ggml context
     {
-        model.buf.resize(ctx_size);
+        lctx.model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
-            model.mlock_buf.grow_to(model.buf.size);
+            lctx.model.mlock_buf.init(lctx.model.buf.addr);
+            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
         }
 
         struct ggml_init_params params = {
-            /*.mem_size =*/ model.buf.size,
-            /*.mem_buffer =*/ model.buf.addr,
+            /*.mem_size =*/ lctx.model.buf.size,
+            /*.mem_buffer =*/ lctx.model.buf.addr,
             /*.no_alloc =*/ ml->use_mmap,
         };
 
@@ -1328,7 +1311,7 @@ static void llama_model_load_internal(
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1338,13 +1321,12 @@ static void llama_model_load_internal(
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
+    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
 }
 
 static bool llama_model_load(
         const std::string & fname,
-        llama_model & model,
-        llama_vocab & vocab,
+        llama_context & lctx,
        int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1358,7 +1340,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1396,7 +1378,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
 
-    const auto & kv_self = lctx.kv_self;
+    const auto & kv_self = model.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1744,7 +1726,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
-    lctx.kv_self.n = n_past + N;
+    lctx.model.kv_self.n = n_past + N;
 
     // extract logits
     {
@@ -2023,10 +2005,9 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;
 
-        // Check if the running sum is at least p or if we have kept at least min_keep tokens
-        // we set the last index to i+1 to indicate that the current iterate should be included in the set
-        if (cum_sum >= p && i + 1 >= min_keep) {
-            last_idx = i + 1;
+        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep) {
+            last_idx = i;
             break;
         }
     }
@@ -2478,10 +2459,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -2533,26 +2510,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     new_type = GGML_TYPE_Q6_K;
                 }
             } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = GGML_TYPE_Q4_K;
-                } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = GGML_TYPE_Q5_K;
-                } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    use_more_bits(i_attention_wv, n_attention_wv)) { new_type = GGML_TYPE_Q6_K;
-                } else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) { new_type = GGML_TYPE_Q6_K;
-                }
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                    (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { { new_type = GGML_TYPE_Q4_K;
-                } } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = GGML_TYPE_Q5_K;
-                } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) { new_type = GGML_TYPE_Q6_K;
-                }
-                //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                    (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                    (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = GGML_TYPE_Q4_K;
-                } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = GGML_TYPE_Q5_K;
-                }
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
 #endif
 
@@ -2661,39 +2634,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //
 
-struct llama_model * llama_load_model_from_file(
+struct llama_context * llama_init_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();
 
-    llama_model * model = new llama_model;
+    llama_context * ctx = new llama_context;
 
-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-            params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        delete model;
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        return nullptr;
-    }
-
-    return model;
-}
-
-void llama_free_model(struct llama_model * model) {
-    delete model;
-}
-
-struct llama_context * llama_new_context_with_model(
-        struct llama_model * model,
-        struct llama_context_params params) {
-
-    if (!model) {
-        return nullptr;
-    }
-
-    llama_context * ctx = new llama_context(*model, model->vocab);
-
     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -2721,16 +2667,24 @@ struct llama_context * llama_new_context_with_model(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+            params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+            params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        llama_free(ctx);
+        return nullptr;
+    }
+
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
 
         {
-            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
+            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
             fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
@@ -2782,8 +2736,8 @@ struct llama_context * llama_new_context_with_model(
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2794,23 +2748,7 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_init_from_file(
-        const char * path_model,
-        struct llama_context_params params) {
-
-    struct llama_model * model = llama_load_model_from_file(path_model, params);
-    if (!model) {
-        return nullptr;
-    }
-    struct llama_context * ctx = llama_new_context_with_model(model, params);
-    ctx->model_owner = true;
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }
 
@@ -2827,9 +2765,11 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
+    auto & model = ctx->model;
+
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2906,7 +2846,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }
 
@@ -3072,16 +3012,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
@@ -3089,7 +3020,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
 }
 
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->kv_self.n;
+    return ctx->model.kv_self.n;
 }
 
 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3114,7 +3045,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size = sizeof(size_t);
     const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = ctx->kv_self.buf.size;
+    const size_t s_kv = ctx->model.kv_self.buf.size;
 
     const size_t s_total = (
         + s_rng_size
@@ -3180,7 +3111,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
     // copy kv cache
     {
-        const auto & kv_self = ctx->kv_self;
+        const auto & kv_self = ctx->model.kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3284,7 +3215,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set kv cache
     {
-        const auto & kv_self = ctx->kv_self;
+        const auto & kv_self = ctx->model.kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd;
@@ -3328,7 +3259,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }
 
-    ctx->kv_self.n = kv_ntok;
+    ctx->model.kv_self.n = kv_ntok;
     }
 
     const size_t nread = inp - src;
@@ -3575,6 +3506,6 @@ const char * llama_print_system_info(void) {
 }
 
 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
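
For reference, the layer-selection rule used by the quantization hunks above: the revert drops the use_more_bits lambda and writes the same condition out at both call sites. Written as a standalone helper purely for illustration (not part of the commit):

    // Same predicate as the inlined (i_layer < n/8 || i_layer >= 7*n/8 || ...) conditions above.
    static bool use_more_bits(int i_layer, int num_layers) {
        // true for the first eighth of the layers, the last eighth,
        // and every third layer of the middle block
        return i_layer < num_layers/8
            || i_layer >= 7*num_layers/8
            || (i_layer - num_layers/8) % 3 == 2;
    }
    // With 32 layers this picks layers 0-3, 28-31 and 6, 9, 12, ..., 27; for Q4_K_M / Q5_K_M
    // quantization those attention.wv / feed_forward.w2 tensors get GGML_TYPE_Q6_K instead.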