Remove qwen and fix mauled imports

teleprint-me 2024-05-12 21:44:31 -04:00
parent fc0007eca5
commit 932ab05d69
No known key found for this signature in database
GPG key ID: B0D11345E65C4D48
4 changed files with 10 additions and 38 deletions

View file

@@ -77,7 +77,6 @@ models = [
     {"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
     {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
     {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-7B", },
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -126,23 +125,6 @@ for model in models:
     logger.info(f"Downloading {name} to {model_name_or_path}")
-    # model and repo urls are not the same
-    # url = "https://huggingface.co/Qwen/Qwen-tokenizer/raw/main/tokenizer.json"
-    if name == "qwen": # qwen is an outlier and will raise a FileNotFoundError
-        # override the tokenizer path
-        model_tokenizer_path = f"{model_name_or_path}/qwen.tiktoken"
-        # fetch the qwens BPE tokenizer
-        download_file_with_auth(
-            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/qwen.tiktoken",
-            token=token,
-            save_path=model_tokenizer_path
-        )
-        # fetch qwens tokenizer script; this is required.
-        download_file_with_auth(
-            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/tokenization_qwen.py",
-            token=token,
-            save_path=f"{model_name_or_path}/tokenization_qwen.py"
-        )
-    else: # Get the models tokenizer
-        download_file_with_auth(
-            url=f"{url_resolve}/tokenizer.json",
-            token=token,
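
The hunk is cut off before the replacement lines, but with the if/else removed every model presumably flows through the single resolve-URL call. A self-contained sketch of that unified path (the save_path and the stand-in body of download_file_with_auth are assumptions; only the keyword signature is visible in the deleted lines above):

    import os
    import urllib.request

    # Stand-in for the script's download_file_with_auth, so the sketch runs;
    # the real implementation is not shown in this diff.
    def download_file_with_auth(url: str, token: str, save_path: str) -> None:
        req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with urllib.request.urlopen(req) as resp, open(save_path, "wb") as f:
            f.write(resp.read())

    url_resolve = "https://huggingface.co/Qwen/Qwen1.5-7B/resolve/main"
    model_name_or_path = "models/tokenizers/qwen2"
    token = "hf_..."  # hypothetical token

    # The unified call after the qwen branch is removed; save_path is
    # assumed, since the diff is truncated before the new lines.
    download_file_with_auth(
        url=f"{url_resolve}/tokenizer.json",
        token=token,
        save_path=f"{model_name_or_path}/tokenizer.json",
    )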

View file

@@ -11,6 +11,7 @@ import re
 import sys
 from enum import IntEnum
 from hashlib import sha256
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -22,16 +23,6 @@ from typing import (
     TypeVar,
     cast,
 )
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ContextManager,
-    Iterable,
-    Iterator,
-    Sequence,
-    TypeVar,
-    cast,
-)

 import numpy as np
 import torch
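
After the fix, the file's header should read as one typing block plus the new pathlib import. Reassembled from the context lines above (the names between Any and TypeVar are taken from the deleted duplicate block, which preserved the full list):

    import re
    import sys
    from enum import IntEnum
    from hashlib import sha256
    from pathlib import Path
    from typing import (
        TYPE_CHECKING,
        Any,
        Callable,
        ContextManager,
        Iterable,
        Iterator,
        Sequence,
        TypeVar,
        cast,
    )

    import numpy as np
    import torch
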
@@ -495,9 +486,6 @@ class Model:
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
-        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
-            # ref: https://huggingface.co/Qwen/Qwen-7B
-            res = "qwen"
         if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"

View file

@@ -18,6 +18,9 @@ python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe --outfile models/ggm
 python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/refact --outfile models/ggml-vocab-refact.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/command-r --outfile models/ggml-vocab-command-r.gguf --vocab-only
-python3 convert-hf-to-gguf.py models/tokenizers/qwen --outfile models/ggml-vocab-qwen.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/qwen2 --outfile models/ggml-vocab-qwen2.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/olmo --outfile models/ggml-vocab-olmo.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/dbrx --outfile models/ggml-vocab-dbrx.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-en --outfile models/ggml-vocab-jina-en.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-es --outfile models/ggml-vocab-jina-es.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-de --outfile models/ggml-vocab-jina-de.gguf --vocab-only

View file

@@ -84,7 +84,6 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)

 # build test-tokenizer-1-bpe target once and add many tests
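
With the qwen vocab dropped, test-tokenizer-0-qwen2 is the only remaining Qwen-family registration. One way to exercise it in isolation, assuming a CTest-driven build directory at ./build (a hypothetical invocation, not a command from this repo's docs):

    # Run only the surviving qwen2 tokenizer test.
    ctest --test-dir build -R test-tokenizer-0-qwen2 --output-on-failure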