llama.cpp : split llama_context_params into model and context params (#3301)

* llama.cpp : split llama_context_params into model and context params ggml-ci * fix metal build * fix freq_base/scale default to model value * llama-bench : keep the same model between tests when possible * move n_threads to llama_context_params, add n_threads_batch * fix mpi build * remove kv_size(), cuda scratch fixes * remove low-vram option * add n_threads_batch to system info, refactor to get_system_info() * add documentation about --threads-batch to the READMEs * llama-bench fix * main : fix rope freq/scale warning * llama.cpp : add llama_get_model common : add llama_tokenize from model * remove duplicated ctx/model functions ggml-ci * cuda : print total VRAM used
2023-09-28 21:42:38 +02:00 · 2023-09-28 21:42:38 +02:00 · 16bc66d947
commit 16bc66d947
parent 0512d66670
27 changed files with 713 additions and 633 deletions
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -48,8 +48,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }
    struct MyModel * ret = new MyModel();
    ret->ctx = ctx;
@ -71,7 +70,7 @@ bool eval_float(void * model, float * input, int N){
    MyModel * mymodel = (MyModel*)model;
    llama_context * ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
-    int n_emb = llama_n_embd(ctx);
+    int n_emb = llama_n_embd(llama_get_model(ctx));
    int n_past = mymodel->n_past;
    int n_batch = N; // params.n_batch;

@ -81,7 +80,7 @@ bool eval_float(void * model, float * input, int N){
            n_eval = n_batch;
        }
        llama_batch batch = {  int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
-        if (llama_decode(ctx, batch, params.n_threads)) {
+        if (llama_decode(ctx, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
@ -102,7 +101,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
        if (n_eval > params.n_batch) {
            n_eval = params.n_batch;
        }
-        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) {
+        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
@ -133,7 +132,7 @@ llama_token sampling_id(struct MyModel* mymodel) {

    // out of user input, sample next token
    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
    const float   top_p           = params.top_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
@ -149,7 +148,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx));

        // Apply params.logit_bias map
        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {