diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 3843b4c3e..04ad8fb6a 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -189,7 +189,8 @@ print("\n") # generate tests for each tokenizer model tests = [ - "ied 4 ½ months" + "ied 4 ½ months", + "Führer", "", " ", " ", diff --git a/llama.cpp b/llama.cpp index 7ce81d6b9..0ef2e6277 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11952,7 +11952,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); - const auto& token_data = vocab.id_to_token.at(id); + const auto & token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { auto buf = token_data.text.substr(3, 2); @@ -17471,9 +17471,10 @@ int32_t llama_tokenize( static std::string llama_decode_text(const std::string & text) { std::string decoded_text; - auto unicode_sequences = unicode_cpts_from_utf8(text); - for (auto & unicode_sequence : unicode_sequences) { - decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence)); + + const auto cpts = unicode_cpts_from_utf8(text); + for (const auto cpt : cpts) { + decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt)); } return decoded_text; diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-bert-bge.gguf.inp +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out index 8afef45ef..e4a76cdb0 100644 --- a/models/ggml-vocab-bert-bge.gguf.out +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -1,4 +1,6 @@ 29464 2094 1018 1092 2706 + 11865 17875 + diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.inp +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out index 094c772cd..9ccc560d6 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.out +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -1,4 +1,6 @@ 1050 207 19 207 19192 4217 + 37 32009 71 6247 + 207 243 315 diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.inp +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out index 0bb8b5230..fd94b896d 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.out +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -1,4 +1,6 @@ 1052 207 19 207 19109 4223 + 37 100014 71 6245 + 207 243 300 diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-falcon.gguf.inp +++ b/models/ggml-vocab-falcon.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out index 8aff91553..209b04cda 100644 --- a/models/ggml-vocab-falcon.gguf.out +++ b/models/ggml-vocab-falcon.gguf.out @@ -1,4 +1,6 @@ 878 204 31 3068 133 2137 + 28611 132 30042 + 204 258 466 diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-gpt-2.gguf.inp +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out index 14cfb7b36..78430f0d3 100644 --- a/models/ggml-vocab-gpt-2.gguf.out +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -1,4 +1,6 @@ 798 604 25208 1933 + 37 9116 71 11751 + 220 220 220 220 220 220 diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-llama-bpe.gguf.inp +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out index 555ed323d..1f00e3812 100644 --- a/models/ggml-vocab-llama-bpe.gguf.out +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -1,4 +1,6 @@ 1142 220 19 220 27154 4038 + 37 51853 261 + 220 256 262 diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-llama-spm.gguf.inp +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out index 86a7eff91..9c3327cb5 100644 --- a/models/ggml-vocab-llama-spm.gguf.out +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -1,4 +1,6 @@ 474 287 29871 29946 29871 30226 7378 + 383 4000 261 + 259 1678 268 diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-mpt.gguf.inp +++ b/models/ggml-vocab-mpt.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out index e7e578022..d8d0fe909 100644 --- a/models/ggml-vocab-mpt.gguf.out +++ b/models/ggml-vocab-mpt.gguf.out @@ -1,4 +1,6 @@ 728 577 24142 2607 + 39 26288 6554 + 209 50276 50275 diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-phi-3.gguf.inp +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out index 86a7eff91..9c3327cb5 100644 --- a/models/ggml-vocab-phi-3.gguf.out +++ b/models/ggml-vocab-phi-3.gguf.out @@ -1,4 +1,6 @@ 474 287 29871 29946 29871 30226 7378 + 383 4000 261 + 259 1678 268 diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp index 5e3062bab..0a89107c6 100644 --- a/models/ggml-vocab-starcoder.gguf.inp +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -1,4 +1,8 @@ ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out index 551b3ce6e..ccb55c7fe 100644 --- a/models/ggml-vocab-starcoder.gguf.out +++ b/models/ggml-vocab-starcoder.gguf.out @@ -1,4 +1,6 @@ 4850 244 57 244 162 159 17722 + 75 2022 3943 284 + 244 280 283