tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
* tests : add test-tokenizer-0.sh
* unicode : add all unicode number ranges
* starcoder : fix pre-tokenizer
* tests : add test that fails with DeepSeek tokenizers
* falcon : fix regex
* unicode : regenerate unicode tables
* refact : add tokenizer model
* lint : fix
* tests : disable failing tests ggml-ci
* refact : add tests files ggml-ci
* convert : print -> logging ggml-ci
* lint : fix
* unicode : digit -> number
* phi-3 : update
This commit is contained in:
parent
a2ac89d6ef
commit
92139b90af
41 changed files with 903 additions and 719 deletions
3
Makefile
3
Makefile
|
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
|
|||
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue