From cd60b88bf7ad7785fb6ac9864e360cf10e42faad Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 9 Oct 2024 16:40:35 +0200 Subject: [PATCH 1/4] ggml-alloc : remove buffer_id from leaf_alloc (ggml/987) This commit removes the buffer_id field from the leaf_alloc struct. The motivation for this is that this field is only written to and never read/used as far as I can tell. Each tensor_alloc has a buffer_id field and this is what caused me to look into this more closely, to understand what the buffer_id in leaf_alloc was used for. --- ggml/src/ggml-alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 28548fbbb..041de9e3e 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -348,7 +348,6 @@ struct tensor_alloc { }; struct leaf_alloc { - int buffer_id; struct tensor_alloc leaf; }; @@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; From 0e41b300ed28f7fe185d938b2e3d56a0bf7411ed Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 16 Oct 2024 11:28:14 +0300 Subject: [PATCH 2/4] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 3cca9cc2f..6d31b21b9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -564f42082f858f9674b2a2e06e9e779d9ed2c754 +2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b From 1f66b699c48cb5ab3265ed72c48e8549b1674291 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Wed, 16 Oct 2024 08:35:53 +0000 Subject: [PATCH 3/4] server : fix the disappearance of the end of the text (#9867) * 
server: fix the disappearance of the end of the text when streaming with stop strings * simplify "send text" checks --- examples/server/server.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d53cca84c..b5e63384c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1090,22 +1090,21 @@ struct server_context { size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; + bool send_text = true; size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); if (stop_pos != std::string::npos) { - is_stop_full = true; slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); - } else { - is_stop_full = false; + } else if (slot.has_next_token) { stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); + send_text = stop_pos == std::string::npos; } // check if there is any token to predict - if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { + if (send_text) { // no send the stop word in the response result.text_to_send = slot.generated_text.substr(pos, std::string::npos); slot.n_sent_text += result.text_to_send.size(); From 10433e8b457c4cfd759cbb41fc55fc398db4a5da Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Wed, 16 Oct 2024 18:10:21 +0800 Subject: [PATCH 4/4] llama : add tensor name for "result_norm" (#9907) Signed-off-by: Molly Sophia --- src/llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 8d44c73c8..c51b49c56 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16095,9 +16095,11 @@ struct llm_build_context { cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = llm_build_norm(ctx0, cur, 
hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_norm", -1); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); return gf;