From 78675f35eed651f8a437d86d3c7c770c411e7e90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Tue, 2 Jul 2024 11:04:04 +0200 Subject: [PATCH] llama : add early return in Unigram tokenizer when normalized input is empty --- src/llama.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 7cdd27d86..3dc36d536 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -14888,6 +14888,9 @@ struct llm_tokenizer_ugm { std::string normalized; normalize(text, &normalized); size_t input_len = normalized.size(); + if (input_len == 0) { + return; + } // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});