diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index fd0fd9bf3..688c907e0 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -36,7 +36,7 @@ class TOKENIZER_TYPE(IntEnum): # TODO: this string has to exercise as much pre-tokenizer functionality as possible # will be updated with time - contributions welcome -chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български what\'s \'\'\'\'\'\'```````\"\"\"\"......!!!!!!??????' +chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' if len(sys.argv) == 2: token = sys.argv[1] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6bf987767..9b421d669 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -229,8 +229,6 @@ class Model(ABC): return ("pytorch_model.bin",) return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - raise NotImplementedError(f'Architecture "{arch}" not supported!') - # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] @@ -271,7 +269,7 @@ class Model(ABC): # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български what\'s \'\'\'\'\'\'```````""""......!!!!!!??????' + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -283,19 +281,19 @@ class Model(ABC): # NOTE: if you get an error here, you need to add the model to the if-elif chain below # don't do this manually - use the convert-hf-to-gguf-update.py script! - if chkhsh == "0fc850edd52197e357970116fbf58f6c2567f259cdc1bfc3df081d7e4bc658c1": + if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-v3" - if chkhsh == "58c3d0e812ae7fa6a20931006d2398274732c105a9a964c148c43cf898c5fb7a": + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" - if chkhsh == "0438d2a948d7fb26c7a662705ac68374f3138ee29e44f133b1f059203500fb4d": + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base res = "deepseek-coder" - if chkhsh == "822bdd323c3ef8667a9526b16b5bfe97974059838d992a170f965063f99c9b9e": + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" - if chkhsh == "406f3f61e1c67d7b0456c5df2fce5cbb30c77dd3671a436b07a6c510303f721e": + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" diff --git a/models/ggml-vocab-llama-v3.gguf b/models/ggml-vocab-llama-v3.gguf index bf7354c26..fedd738a0 100644 Binary files a/models/ggml-vocab-llama-v3.gguf and b/models/ggml-vocab-llama-v3.gguf differ