convert : exercise contractions

ggml-ci
2024-04-28 22:18:20 +03:00 · 2024-04-28 22:18:20 +03:00 · 78081502e9
commit 78081502e9
parent 0f9058ceec
3 changed files with 7 additions and 9 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@ -36,7 +36,7 @@ class TOKENIZER_TYPE(IntEnum):
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български what\'s \'\'\'\'\'\'```````\"\"\"\"......!!!!!!??????'
+chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 if len(sys.argv) == 2:
    token = sys.argv[1]
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -229,8 +229,6 @@ class Model(ABC):
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
        raise NotImplementedError(f'Architecture "{arch}" not supported!')
    # used for GPT-2 BPE and WordPiece vocabs
    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
        tokens: list[str] = []
@ -271,7 +269,7 @@ class Model(ABC):
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer
-        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български what\'s \'\'\'\'\'\'```````""""......!!!!!!??????'
+        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -283,19 +281,19 @@ class Model(ABC):
        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
        #       don't do this manually - use the convert-hf-to-gguf-update.py script!
-        if chkhsh == "0fc850edd52197e357970116fbf58f6c2567f259cdc1bfc3df081d7e4bc658c1":
+        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-v3"
-        if chkhsh == "58c3d0e812ae7fa6a20931006d2398274732c105a9a964c148c43cf898c5fb7a":
+        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
            res = "deepseek-llm"
-        if chkhsh == "0438d2a948d7fb26c7a662705ac68374f3138ee29e44f133b1f059203500fb4d":
+        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
            res = "deepseek-coder"
-        if chkhsh == "822bdd323c3ef8667a9526b16b5bfe97974059838d992a170f965063f99c9b9e":
+        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
            # ref: https://huggingface.co/tiiuae/falcon-7b
            res = "falcon"
-        if chkhsh == "406f3f61e1c67d7b0456c5df2fce5cbb30c77dd3671a436b07a6c510303f721e":
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
--- a/models/ggml-vocab-llama-v3.gguf
+++ b/models/ggml-vocab-llama-v3.gguf