llama.cpp : split llama_context_params into model and context params (#3301)
* llama.cpp : split llama_context_params into model and context params ggml-ci * fix metal build * fix freq_base/scale default to model value * llama-bench : keep the same model between tests when possible * move n_threads to llama_context_params, add n_threads_batch * fix mpi build * remove kv_size(), cuda scratch fixes * remove low-vram option * add n_threads_batch to system info, refactor to get_system_info() * add documentation about --threads-batch to the READMEs * llama-bench fix * main : fix rope freq/scale warning * llama.cpp : add llama_get_model common : add llama_tokenize from model * remove duplicated ctx/model functions ggml-ci * cuda : print total VRAM used
This commit is contained in:
parent
0512d66670
commit
16bc66d947
27 changed files with 713 additions and 633 deletions
|
@ -48,8 +48,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
|
|||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
struct MyModel * ret = new MyModel();
|
||||
ret->ctx = ctx;
|
||||
|
@ -71,7 +70,7 @@ bool eval_float(void * model, float * input, int N){
|
|||
MyModel * mymodel = (MyModel*)model;
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
int n_emb = llama_n_embd(ctx);
|
||||
int n_emb = llama_n_embd(llama_get_model(ctx));
|
||||
int n_past = mymodel->n_past;
|
||||
int n_batch = N; // params.n_batch;
|
||||
|
||||
|
@ -81,7 +80,7 @@ bool eval_float(void * model, float * input, int N){
|
|||
n_eval = n_batch;
|
||||
}
|
||||
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch, params.n_threads)) {
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
@ -102,7 +101,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
|
|||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
@ -133,7 +132,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
|
|||
|
||||
// out of user input, sample next token
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
|
@ -149,7 +148,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
|
|||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue