diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 7fc89fa3d..3d6298118 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -77,7 +77,6 @@ models = [
     {"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
     {"name": "refact",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
     {"name": "command-r",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-7B", },
     {"name": "qwen2",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -126,28 +125,11 @@ for model in models:
     logger.info(f"Downloading {name} to {model_name_or_path}")
 
     # model and repo urls are not the same
-    # url = "https://huggingface.co/Qwen/Qwen-tokenizer/raw/main/tokenizer.json"
-    if name == "qwen":  # qwen is an outlier and will raise a FileNotFoundError
-        # override the tokenizer path
-        model_tokenizer_path = f"{model_name_or_path}/qwen.tiktoken"
-        # fetch the qwens BPE tokenizer
-        download_file_with_auth(
-            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/qwen.tiktoken",
-            token=token,
-            save_path=model_tokenizer_path
-        )
-        # fetch qwens tokenizer script; this is required.
-        download_file_with_auth(
-            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/tokenization_qwen.py",
-            token=token,
-            save_path=f"{model_name_or_path}/tokenization_qwen.py"
-        )
-    else:  # Get the models tokenizer
-        download_file_with_auth(
-            url=f"{url_resolve}/tokenizer.json",
-            token=token,
-            save_path=model_tokenizer_path
-        )
+    download_file_with_auth(
+        url=f"{url_resolve}/tokenizer.json",
+        token=token,
+        save_path=model_tokenizer_path
+    )
 
     # Get the models hyper params
     download_file_with_auth(
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5a420d5ad..3432ca335 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -11,6 +11,7 @@ import re
 import sys
 from enum import IntEnum
 from hashlib import sha256
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -22,16 +23,6 @@ from typing import (
     TypeVar,
     cast,
 )
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ContextManager,
-    Iterable,
-    Iterator,
-    Sequence,
-    TypeVar,
-    cast,
-)
 
 import numpy as np
 import torch
@@ -495,9 +486,6 @@ class Model:
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
-        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
-            # ref: https://huggingface.co/Qwen/Qwen-7B
-            res = "qwen"
         if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"
diff --git a/generate-vocab.sh b/generate-vocab.sh
index 6f1a8fc2b..60bc39d3e 100755
--- a/generate-vocab.sh
+++ b/generate-vocab.sh
@@ -18,6 +18,9 @@ python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe --outfile models/ggm
 python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/refact --outfile models/ggml-vocab-refact.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/command-r --outfile models/ggml-vocab-command-r.gguf --vocab-only
-python3 convert-hf-to-gguf.py models/tokenizers/qwen --outfile models/ggml-vocab-qwen.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/qwen2 --outfile models/ggml-vocab-qwen2.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/olmo --outfile models/ggml-vocab-olmo.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/dbrx --outfile models/ggml-vocab-dbrx.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-en --outfile models/ggml-vocab-jina-en.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-es --outfile models/ggml-vocab-jina-es.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-de --outfile models/ggml-vocab-jina-de.gguf --vocab-only
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6252bf5ff..766a01752 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -84,7 +84,6 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests