From cf00fe1ea325f8fba3f99b953146827a07434c9e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 May 2024 11:00:15 +0300 Subject: [PATCH] starcoder : fix pre-tokenizer --- convert-hf-to-gguf-update.py | 1 + llama.cpp | 5 +++++ models/ggml-vocab-bert-bge.gguf.inp | 2 +- models/ggml-vocab-bert-bge.gguf.out | 2 +- models/ggml-vocab-deepseek-coder.gguf.inp | 2 +- models/ggml-vocab-deepseek-coder.gguf.out | 2 +- models/ggml-vocab-deepseek-llm.gguf.inp | 2 +- models/ggml-vocab-deepseek-llm.gguf.out | 2 +- models/ggml-vocab-falcon.gguf.inp | 2 +- models/ggml-vocab-falcon.gguf.out | 2 +- models/ggml-vocab-gpt-2.gguf.inp | 2 +- models/ggml-vocab-gpt-2.gguf.out | 2 +- models/ggml-vocab-llama-bpe.gguf.inp | 2 +- models/ggml-vocab-llama-bpe.gguf.out | 2 +- models/ggml-vocab-llama-spm.gguf.inp | 2 +- models/ggml-vocab-llama-spm.gguf.out | 2 +- models/ggml-vocab-mpt.gguf.inp | 2 +- models/ggml-vocab-mpt.gguf.out | 2 +- models/ggml-vocab-phi-3.gguf.inp | 2 +- models/ggml-vocab-phi-3.gguf.out | 2 +- models/ggml-vocab-starcoder.gguf.inp | 2 +- models/ggml-vocab-starcoder.gguf.out | 2 +- 22 files changed, 26 insertions(+), 20 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index b019c1e3d..3843b4c3e 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -189,6 +189,7 @@ print("\n") # generate tests for each tokenizer model tests = [ + "ied 4 ½ months" "", " ", " ", diff --git a/llama.cpp b/llama.cpp index 18d6297ce..7ce81d6b9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12235,6 +12235,11 @@ struct llm_tokenizer_bpe { }); break; case LLAMA_VOCAB_PRE_TYPE_STARCODER: + word_collection = unicode_regex_split(text, { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; case LLAMA_VOCAB_PRE_TYPE_GPT2: word_collection = unicode_regex_split(text, { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-bert-bge.gguf.inp +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out index 969552e1c..8afef45ef 100644 --- a/models/ggml-vocab-bert-bge.gguf.out +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -1,4 +1,4 @@ - + 29464 2094 1018 1092 2706 diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.inp +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out index 8ef585c78..094c772cd 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.out +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -1,4 +1,4 @@ - + 1050 207 19 207 19192 4217 207 243 315 diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.inp +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out index 0ea9d66e3..0bb8b5230 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.out +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -1,4 +1,4 @@ - + 1052 207 19 207 19109 4223 207 243 300 diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-falcon.gguf.inp +++ b/models/ggml-vocab-falcon.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out index cb8da7b1a..8aff91553 100644 --- a/models/ggml-vocab-falcon.gguf.out +++ b/models/ggml-vocab-falcon.gguf.out @@ -1,4 +1,4 @@ - + 878 204 31 3068 133 2137 204 258 466 diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-gpt-2.gguf.inp +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out index 9986f38e4..14cfb7b36 100644 --- a/models/ggml-vocab-gpt-2.gguf.out +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -1,4 +1,4 @@ - + 798 604 25208 1933 220 220 220 220 220 220 diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-llama-bpe.gguf.inp +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out index 4d903e1cd..555ed323d 100644 --- a/models/ggml-vocab-llama-bpe.gguf.out +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -1,4 +1,4 @@ - + 1142 220 19 220 27154 4038 220 256 262 diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-llama-spm.gguf.inp +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out index 15d00b106..86a7eff91 100644 --- a/models/ggml-vocab-llama-spm.gguf.out +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -1,4 +1,4 @@ - + 474 287 29871 29946 29871 30226 7378 259 1678 268 diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-mpt.gguf.inp +++ b/models/ggml-vocab-mpt.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out index 1f4b0eb3a..e7e578022 100644 --- a/models/ggml-vocab-mpt.gguf.out +++ b/models/ggml-vocab-mpt.gguf.out @@ -1,4 +1,4 @@ - + 728 577 24142 2607 209 50276 50275 diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-phi-3.gguf.inp +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out index 15d00b106..86a7eff91 100644 --- a/models/ggml-vocab-phi-3.gguf.out +++ b/models/ggml-vocab-phi-3.gguf.out @@ -1,4 +1,4 @@ - + 474 287 29871 29946 29871 30226 7378 259 1678 268 diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp index 0389f00c7..5e3062bab 100644 --- a/models/ggml-vocab-starcoder.gguf.inp +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -1,4 +1,4 @@ - +ied 4 ½ months __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out index cd04254af..551b3ce6e 100644 --- a/models/ggml-vocab-starcoder.gguf.out +++ b/models/ggml-vocab-starcoder.gguf.out @@ -1,4 +1,4 @@ - + 4850 244 57 244 162 159 17722 244 280 283