Merge branch 'ggerganov:master' into iq2_s

commit 4b7aaae8f3
7 changed files with 22 additions and 15 deletions

.github/workflows/close-issue.yml (vendored): 1 change

@@ -12,6 +12,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"

common/common.cpp

@@ -1056,7 +1056,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
         return true;
     }
     if (arg == "-h" || arg == "--help") {
-        return false;
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(0);
     }
     if (arg == "--version") {
         fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
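
Note on the hunk above: "-h"/"--help" used to make the parser return false, which the caller handled like any other bad flag; now the parser prints usage itself and exits with status 0. A minimal sketch of the new flow (the names and caller shape here are illustrative stand-ins, not the actual llama.cpp code):

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    static void print_usage_sketch() {
        std::printf("usage: ./main [options]\n");
    }

    // stand-in for gpt_params_find_arg: true = flag consumed, false = unknown flag
    static bool find_arg_sketch(const std::string & arg) {
        if (arg == "-h" || arg == "--help") {
            print_usage_sketch();
            std::exit(0); // asking for help is a success, not a parse error
        }
        return false;     // unknown flag: the caller reports an error, as before
    }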

common/sampling.h

@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
     float dynatemp_range = 0.00f; // 0.0 = disabled
     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float penalty_repeat = 1.10f; // 1.0 = disabled
+    float penalty_repeat = 1.00f; // 1.0 = disabled
     float penalty_freq = 0.00f; // 0.0 = disabled
     float penalty_present = 0.00f; // 0.0 = disabled
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = true; // consider newlines as a repeatable token
+    bool penalize_nl = false; // consider newlines as a repeatable token

     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
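
Note on the hunk above: two sampling defaults are relaxed here: penalty_repeat drops from 1.10f to 1.00f, turning the repeat penalty off unless the user opts in, and penalize_nl flips to false, so newlines are no longer treated as repeatable tokens. The value 1.0 disables the penalty because of how the usual CTRL-style rule is applied; a short sketch of that rule (an illustration of the technique, not the library's exact sampler code):

    #include <vector>

    // CTRL-style repeat penalty: positive logits are divided by p and
    // negative logits are multiplied by p, so p == 1.0f leaves every
    // logit unchanged. That is why "1.0 = disabled" in the comments above.
    static void apply_repeat_penalty_sketch(std::vector<float> & logits,
                                            const std::vector<int> & last_tokens,
                                            float p) {
        for (int tok : last_tokens) {
            float & l = logits[tok];
            l = l > 0.0f ? l / p : l * p;
        }
    }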

examples/llava/clip.cpp

@@ -497,7 +497,6 @@ struct clip_ctx {

     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;

     ggml_backend_t backend = NULL;
     ggml_gallocr_t compute_alloc = NULL;

@@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);

+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
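
Note on the two clip.cpp hunks: the unused compute_buffer member is dropped (compute memory is owned by the ggml_gallocr_t allocator), and clip_free now releases the weight buffer, the backend handle and the graph allocator, which were previously leaked. A hypothetical driver loop showing what the fix buys (clip_model_load and clip_free are the llava example's real entry points; the loop itself is illustrative):

    #include "clip.h"

    // Before this change every iteration leaked params_buffer, backend
    // and compute_alloc; now clip_free() releases all three.
    static void reload_model_n_times(const char * path, int n) {
        for (int i = 0; i < n; ++i) {
            clip_ctx * ctx = clip_model_load(path, /*verbosity=*/0);
            if (ctx == nullptr) {
                break;
            }
            clip_free(ctx);
        }
    }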

flake.lock (generated): 6 changes

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1709703039,
-        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
+        "lastModified": 1710451336,
+        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
+        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
         "type": "github"
       },
       "original": {

ggml-backend.c

@@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 4
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
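
Note on the hunk above: a scheduler split may need to copy every source of a node in from other backends, so a hard-coded cap of 4 split inputs could overflow once an op has more sources than that; tying the cap to GGML_MAX_SRC keeps the two limits in lockstep. A compile-time sketch of the invariant (the value and struct shape are assumptions for illustration, not ggml's actual definitions):

    // Assumed per-tensor source limit; GGML_MAX_SRC in ggml proper.
    constexpr int kMaxSrc         = 10;
    constexpr int kMaxSplitInputs = kMaxSrc; // was a hard-coded 4 before this change

    struct sched_split_sketch {
        const void * inputs[kMaxSplitInputs]; // tensors copied in before the split runs
        int          n_inputs = 0;
    };

    static_assert(kMaxSplitInputs >= kMaxSrc,
                  "a split must accept one input per tensor source");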

llama.cpp: 13 changes

@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
         { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },

@@ -4300,9 +4301,9 @@ static bool llm_load_tensors(
             {
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-                if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                } else {
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                if (!model.output) {
                     model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                     ml.n_created--; // artificial tensor
                     ml.size_data += ggml_nbytes(model.output);

@@ -4507,11 +4508,13 @@ static bool llm_load_tensors(
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);

-                // same as tok_embd, duplicated to allow offloading
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                if (!model.output) {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                     ml.n_created--; // artificial tensor
                     ml.size_data += ggml_nbytes(model.output);
+                }
             }

             for (int i = 0; i < n_layer; ++i) {
                 ggml_context * ctx_layer = ctx_for_layer(i);
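
Note on the three llama.cpp hunks: registering LLM_TENSOR_OUTPUT for this architecture and loading it with the trailing required = false flag replaces both the explicit gguf_find_tensor probe and the unconditional tok_embd duplication. When output.weight is absent, create_tensor returns null and the loader falls back to reusing token_embd.weight as a tied output head, decrementing n_created so the reused tensor is not counted twice. A minimal sketch of the pattern with a hypothetical loader type (not the real llama_model_loader API):

    #include <stdexcept>
    #include <string>

    struct tensor_sketch { std::string name; };

    struct loader_sketch {
        int  n_created  = 0;
        bool has_output = false; // pretend the GGUF file lacks output.weight
        tensor_sketch * create(const std::string & name, bool required = true) {
            if (name == "output.weight" && !has_output) {
                if (required) {
                    throw std::runtime_error("missing tensor: " + name);
                }
                return nullptr; // tolerated: the caller handles the fallback
            }
            ++n_created;
            return new tensor_sketch{name};
        }
    };

    static tensor_sketch * load_output_head(loader_sketch & ml) {
        tensor_sketch * out = ml.create("output.weight", /*required=*/false);
        if (out == nullptr) {
            out = ml.create("token_embd.weight"); // tied embeddings fallback
            --ml.n_created;                       // artificial tensor: already counted once
        }
        return out;
    }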