diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ab9fda261..a26156b39 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -171,7 +171,7 @@ class Model: if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return StableLMModel return Model - + @staticmethod def from_model_name(model_name: str): model_name_lower = model_name.lower() @@ -864,7 +864,7 @@ class DeepseekCoderModel(Model): def set_vocab(self): self._set_vocab_gpt2("deepseek_coder") - + class StableLMModel(Model): def set_gguf_parameters(self): diff --git a/tests/test-tokenizer-0-deepseek_coder.py b/tests/test-tokenizer-0-deepseek_coder.py index 439be3470..b99840e1b 100644 --- a/tests/test-tokenizer-0-deepseek_coder.py +++ b/tests/test-tokenizer-0-deepseek_coder.py @@ -1,7 +1,5 @@ # tests with BPE tokenizer -import os -import sys import argparse from transformers import AutoTokenizer @@ -16,35 +14,35 @@ dir_tokenizer = args.dir_tokenizer tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is πŸ¦™.cpp", - "w048 7tuijk dsdfhu", - "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", - "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", - "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - "\n =", - "' era", - "Hello, y'all! How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~", - ] + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + "\n =", + "' era", + "Hello, y'all! How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~", +] for text in tests: print('text: ', text)