tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)

* tests : add test-tokenizer-0.sh

* unicode : add all unicode number ranges (see the sketch after this commit message)

* starcoder : fix pre-tokenizer

* tests : add test that fails with DeepSeek tokenizers

* falcon : fix regex

* unicode : regenerate unicode tables

* refact : add tokenizer model

* lint : fix

* tests : disable failing tests

ggml-ci

* refact : add tests files

ggml-ci

* convert : print -> logging

ggml-ci

* lint : fix

* unicode : digit -> number

* phi-3 : update
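
The unicode changes above ("add all unicode number ranges", "digit -> number") swap ASCII-digit handling for the full set of Unicode number codepoints, which several BPE pre-tokenizer regexes rely on. A rough illustration of the difference, using the third-party Python `regex` module for `\p{N}` support (an assumption for this sketch; llama.cpp itself implements this with its own generated unicode tables in C++):

# Sketch: matching all Unicode numbers vs. only ASCII digits.
# Requires the third-party `regex` module (pip install regex); Python's
# built-in `re` does not support the \p{N} property class.
import regex

text = "year 2024, Devanagari १२३, superscript ²"

ascii_digits = regex.findall(r"[0-9]+", text)
unicode_numbers = regex.findall(r"\p{N}+", text)

print(ascii_digits)     # ['2024'] - misses non-ASCII number codepoints
print(unicode_numbers)  # ['2024', '१२३', '²'] - covers all Unicode number ranges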
Georgi Gerganov, 2024-05-04 08:32:32 +03:00, committed by GitHub
parent a2ac89d6ef
commit 92139b90af
41 changed files with 903 additions and 719 deletions


@@ -308,6 +308,9 @@ class Model(ABC):
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"
 
         if res is None:
             logger.warning("\n")
@@ -324,7 +327,7 @@ class Model(ABC):
             logger.warning("\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
 
-        logger.debug(f"tokenizer.ggml.pre: {res}")
+        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
         logger.debug(f"chkhsh: {chkhsh}")
 
         return res
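
The hunks above extend the checksum-based pre-tokenizer detection in the convert script's get_vocab_base_pre(): a fixed reference text is tokenized with the upstream Hugging Face tokenizer, the resulting token IDs are hashed, and the hash is looked up against known pre-tokenizer names such as "gpt-2" and, now, "refact". Below is a minimal sketch of that flow, assuming a sha256 over the stringified token IDs; CHK_TXT, detect_pre_tokenizer() and the use of AutoTokenizer are illustrative, and the hashes shown only match the actual reference text used by the script, not the placeholder here.

# Sketch of chkhsh-based pre-tokenizer detection (illustrative, not the
# verbatim convert script). Hashes come from the diff above; they will only
# reproduce with the script's real reference text, not this placeholder.
from hashlib import sha256
from transformers import AutoTokenizer

CHK_TXT = "placeholder reference text with whitespace, 12345, and emoji 🚀"

# hash of the reference tokenization -> pre-tokenizer name
KNOWN_PRE_TOKENIZERS = {
    "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": "gpt-2",
    "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": "refact",
}

def detect_pre_tokenizer(model_id: str) -> str:
    # tokenize the reference text and hash the resulting token IDs
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    res = KNOWN_PRE_TOKENIZERS.get(chkhsh)
    if res is None:
        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
    return res

# e.g. detect_pre_tokenizer("smallcloudai/Refact-1_6-base") would return "refact"
# once CHK_TXT matches the reference text that produced the hash above.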