Remove qwen and fix mauled imports

teleprint-me 2024-05-12 21:44:31 -04:00
parent fc0007eca5
commit 932ab05d69
No known key found for this signature in database
GPG key ID: B0D11345E65C4D48
4 changed files with 10 additions and 38 deletions

View file

@@ -77,7 +77,6 @@ models = [
     {"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
     {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
     {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-7B", },
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -126,23 +125,6 @@ for model in models:
     logger.info(f"Downloading {name} to {model_name_or_path}")
-    # model and repo urls are not the same
-    # url = "https://huggingface.co/Qwen/Qwen-tokenizer/raw/main/tokenizer.json"
-    if name == "qwen": # qwen is an outlier and will raise a FileNotFoundError
-        # override the tokenizer path
-        model_tokenizer_path = f"{model_name_or_path}/qwen.tiktoken"
-        # fetch the qwens BPE tokenizer
-        download_file_with_auth(
-            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/qwen.tiktoken",
-            token=token,
-            save_path=model_tokenizer_path
-        )
-        # fetch qwens tokenizer script; this is required.
-        download_file_with_auth(
-            url="https://huggingface.co/Qwen/Qwen-7B/raw/main/tokenization_qwen.py",
-            token=token,
-            save_path=f"{model_name_or_path}/tokenization_qwen.py"
-        )
-    else: # Get the models tokenizer
-        download_file_with_auth(
-            url=f"{url_resolve}/tokenizer.json",
-            token=token,
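
The hunk is cut off before the replacement lines, but with the if/else removed every model presumably flows through the single resolve-URL call. A self-contained sketch of that unified path (the save_path and the stand-in body of download_file_with_auth are assumptions; only the keyword signature is visible in the deleted lines above):

    import os
    import urllib.request

    # Stand-in for the script's download_file_with_auth, so the sketch runs;
    # the real implementation is not shown in this diff.
    def download_file_with_auth(url: str, token: str, save_path: str) -> None:
        req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with urllib.request.urlopen(req) as resp, open(save_path, "wb") as f:
            f.write(resp.read())

    url_resolve = "https://huggingface.co/Qwen/Qwen1.5-7B/resolve/main"
    model_name_or_path = "models/tokenizers/qwen2"
    token = "hf_..."  # hypothetical token

    # The unified call after the qwen branch is removed; save_path is
    # assumed, since the diff is truncated before the new lines.
    download_file_with_auth(
        url=f"{url_resolve}/tokenizer.json",
        token=token,
        save_path=f"{model_name_or_path}/tokenizer.json",
    )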

View file

@@ -11,6 +11,7 @@ import re
 import sys
 from enum import IntEnum
 from hashlib import sha256
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -22,16 +23,6 @@ from typing import (
     TypeVar,
     cast,
 )
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ContextManager,
-    Iterable,
-    Iterator,
-    Sequence,
-    TypeVar,
-    cast,
-)

 import numpy as np
 import torch
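
After the fix, the file's header should read as one typing block plus the new pathlib import. Reassembled from the context lines above (the names between Any and TypeVar are taken from the deleted duplicate block, which preserved the full list):

    import re
    import sys
    from enum import IntEnum
    from hashlib import sha256
    from pathlib import Path
    from typing import (
        TYPE_CHECKING,
        Any,
        Callable,
        ContextManager,
        Iterable,
        Iterator,
        Sequence,
        TypeVar,
        cast,
    )

    import numpy as np
    import torch
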
@@ -495,9 +486,6 @@ class Model:
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
-        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
-            # ref: https://huggingface.co/Qwen/Qwen-7B
-            res = "qwen"
         if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"

View file

@@ -18,6 +18,9 @@ python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe --outfile models/ggm
 python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/refact --outfile models/ggml-vocab-refact.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/command-r --outfile models/ggml-vocab-command-r.gguf --vocab-only
-python3 convert-hf-to-gguf.py models/tokenizers/qwen --outfile models/ggml-vocab-qwen.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/qwen2 --outfile models/ggml-vocab-qwen2.gguf --vocab-only
 python3 convert-hf-to-gguf.py models/tokenizers/olmo --outfile models/ggml-vocab-olmo.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/dbrx --outfile models/ggml-vocab-dbrx.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-en --outfile models/ggml-vocab-jina-en.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-es --outfile models/ggml-vocab-jina-es.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-de --outfile models/ggml-vocab-jina-de.gguf --vocab-only

View file

@@ -84,7 +84,6 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)

 # build test-tokenizer-1-bpe target once and add many tests
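
With the qwen vocab dropped, test-tokenizer-0-qwen2 is the only remaining Qwen-family registration. One way to exercise it in isolation, assuming a CTest-driven build directory at ./build (a hypothetical invocation, not a command from this repo's docs):

    # Run only the surviving qwen2 tokenizer test.
    ctest --test-dir build -R test-tokenizer-0-qwen2 --output-on-failure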