From 5e1b7f94a03e0b3b8e4578625bbdadc7bbd2b93c Mon Sep 17 00:00:00 2001
From: slaren
Date: Mon, 18 Mar 2024 16:33:44 +0100
Subject: [PATCH 1/7] backend : set max split inputs to GGML_MAX_SRC (#6137)

---
 ggml-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 9f0084df7..6026570ae 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 4
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES

From 104f5e0fc156d48476258295457cafeec2a2af10 Mon Sep 17 00:00:00 2001
From: Felix
Date: Mon, 18 Mar 2024 16:40:22 +0100
Subject: [PATCH 2/7] clip : fix memory leak (#6138)

---
 examples/llava/clip.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index a0ed82d7e..690bca2eb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -497,7 +497,6 @@ struct clip_ctx {
 
     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;
 
     ggml_backend_t backend = NULL;
     ggml_gallocr_t compute_alloc = NULL;
@@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
 
+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
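
[Note on PATCH 2/7] Before this patch, clip_free() released only the ggml and
gguf contexts; the weight buffer, the backend handle, and the graph allocator
created during model load were never freed, so they leaked on every load/free
cycle (the unused compute_buffer member is dropped at the same time). For
reference, here is a sketch of clip_free() as it reads with the patch applied,
with clip_ctx reduced to just the members involved in teardown (the real
struct in clip.cpp has many more fields):

    #include "ggml.h"         // ggml_free; the gguf API also lived here at this time
    #include "ggml-alloc.h"   // ggml_gallocr_t, ggml_gallocr_free
    #include "ggml-backend.h" // ggml_backend_t, ggml_backend_buffer_t

    // Reduced clip_ctx: only the resources that must be released on free.
    struct clip_ctx {
        ggml_context *        ctx_data      = nullptr;
        gguf_context *        ctx_gguf      = nullptr;
        ggml_backend_buffer_t params_buffer = nullptr;
        ggml_backend_t        backend       = nullptr;
        ggml_gallocr_t        compute_alloc = nullptr;
    };

    void clip_free(clip_ctx * ctx) {
        ggml_free(ctx->ctx_data);                     // ggml tensor context
        gguf_free(ctx->ctx_gguf);                     // gguf metadata context
        ggml_backend_buffer_free(ctx->params_buffer); // was leaked before the patch
        ggml_backend_free(ctx->backend);              // was leaked before the patch
        ggml_gallocr_free(ctx->compute_alloc);        // was leaked before the patch
        delete ctx;                                   // the context object itself
    }

The point of the pattern: backend buffers, backends, and graph allocators are
not owned by the ggml context, so ggml_free() alone is never enough; every
acquire in clip_model_load() needs a matching release here.
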
From d199ca79f279e84ebe27caafe0aa59c461d88969 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Mon, 18 Mar 2024 12:49:02 -0400
Subject: [PATCH 3/7] mpt : implement backwards compatibility with duped
 output tensor (#6139)

---
 llama.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index b8bef6daf..1a9fe0c4d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         {
             { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
             { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
             { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
             { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
@@ -4300,9 +4301,9 @@ static bool llm_load_tensors(
         {
             model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-            if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-            } else {
+
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            if (!model.output) {
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                 ml.n_created--; // artificial tensor
                 ml.size_data += ggml_nbytes(model.output);
             }
@@ -4507,10 +4508,12 @@
             model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
 
-            // same as tok_embd, duplicated to allow offloading
-            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-            ml.n_created--; // artificial tensor
-            ml.size_data += ggml_nbytes(model.output);
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            if (!model.output) {
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                ml.n_created--; // artificial tensor
+                ml.size_data += ggml_nbytes(model.output);
+            }
         }
 
         for (int i = 0; i < n_layer; ++i) {

From 2d15886bb092c3b780c676b5cc57ff3337af9c83 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Sun, 17 Mar 2024 06:37:44 +0000
Subject: [PATCH 4/7] flake.lock: Update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/9df3e30ce24fd28c7b3e2de0d986769db5d6225d' (2024-03-06)
  → 'github:NixOS/nixpkgs/d691274a972b3165335d261cc4671335f5c67de9' (2024-03-14)
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index f9865d5e4..80de76dbf 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1709703039,
-        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
+        "lastModified": 1710451336,
+        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
+        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
         "type": "github"
       },
       "original": {

From 4c28b8252907561165827125d2d1a4bad6926ac6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Tue, 19 Mar 2024 01:59:36 -0400
Subject: [PATCH 5/7] common : print usage on '-h' and '--help' (#6145)

---
 common/common.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 919182862..5f10718ec 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1056,7 +1056,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
         return true;
     }
     if (arg == "-h" || arg == "--help") {
-        return false;
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(0);
     }
     if (arg == "--version") {
         fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
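
[Note on PATCH 5/7] Returning false from gpt_params_find_arg() is the signal
for an argument it did not consume, which the caller treats as an unknown
argument, so '-h' previously fell through the same path as a mistyped flag.
With the patch, help prints the usage text and exits with status 0. Passing a
default-constructed gpt_params() rather than the partially parsed params means
the usage text always reports the built-in defaults. A minimal standalone
sketch of the same idiom; `params` and `print_usage` here are illustrative
stand-ins, not the actual common.cpp definitions:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    struct params { int n_threads = 4; };       // illustrative option set

    static void print_usage(const params & p) { // stand-in for gpt_print_usage
        std::printf("  -t N   number of threads (default: %d)\n", p.n_threads);
    }

    int main(int argc, char ** argv) {
        for (int i = 1; i < argc; i++) {
            if (std::strcmp(argv[i], "-h") == 0 || std::strcmp(argv[i], "--help") == 0) {
                print_usage(params());          // fresh instance: report true defaults
                std::exit(0);                   // help is a success, not a parse error
            }
        }
        // ... parse the remaining arguments here ...
        return 0;
    }
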
From 970a48060ab9a6cc67aa063870323781c2a7bd7d Mon Sep 17 00:00:00 2001
From: slaren
Date: Tue, 19 Mar 2024 09:06:54 +0100
Subject: [PATCH 6/7] ci : exempt some labels from being tagged as stale (#6140)

---
 .github/workflows/close-issue.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
index eaffd074d..a151c6780 100644
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -12,6 +12,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"

From b80cf3b2d1dee0ad325f7a794fecc66befce7336 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 19 Mar 2024 10:21:54 +0200
Subject: [PATCH 7/7] common : disable repeat penalties by default (#6127)

---
 common/sampling.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/sampling.h b/common/sampling.h
index 48b2459d1..79a998be8 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
     float       dynatemp_range    = 0.00f; // 0.0 = disabled
     float       dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t     penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat    = 1.10f; // 1.0 = disabled
+    float       penalty_repeat    = 1.00f; // 1.0 = disabled
     float       penalty_freq      = 0.00f; // 0.0 = disabled
     float       penalty_present   = 0.00f; // 0.0 = disabled
     int32_t     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float       mirostat_tau      = 5.00f; // target entropy
     float       mirostat_eta      = 0.10f; // learning rate
-    bool        penalize_nl       = true;  // consider newlines as a repeatable token
+    bool        penalize_nl       = false; // consider newlines as a repeatable token
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
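
[Note on PATCH 7/7] The new defaults make sampling penalty-free out of the
box: penalty_repeat = 1.00f is the identity because the repetition penalty
divides a repeated token's positive logit by the factor (and multiplies a
negative one), and penalize_nl = false stops treating the newline token as
repeatable. A standalone sketch of that scaling rule; it mirrors the behavior
of the repetition-penalty sampler rather than quoting its source:

    // Applied to the logit of each token id seen in the last penalty_last_n tokens.
    static float apply_penalty_repeat(float logit, float penalty /* 1.0f = disabled */) {
        // penalty > 1.0f always pushes the token toward "less likely":
        // positive logits shrink, negative logits become more negative.
        // penalty == 1.0f leaves the logit unchanged, hence "1.0 = disabled".
        return logit > 0.0f ? logit / penalty : logit * penalty;
    }
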