From 5840b6f0b05b0147ea92cbdd9ec00700d9c183ed Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Sat, 18 May 2024 23:59:52 -0400
Subject: [PATCH] refactor: Simplify the get_vocab_base_pre method

---
 convert-hf-to-gguf.py | 121 ++++++++++++++----------------------------
 1 file changed, 40 insertions(+), 81 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index cd1750aa3..fc8214831 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2,17 +2,27 @@
 
 from __future__ import annotations
 
-import logging
 import argparse
 import contextlib
 import json
+import logging
 import os
 import re
 import sys
 from enum import IntEnum
-from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ContextManager,
+    Iterable,
+    Iterator,
+    Sequence,
+    TypeVar,
+    cast,
+)
 
 import numpy as np
 import torch
@@ -409,87 +419,36 @@ class Model:
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
         # use in llama.cpp to implement the same pre-tokenizer
 
-        chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+        checksum = sha256(str(tokenizer.vocab).encode()).hexdigest()
+        logger.debug(f"checksum: {checksum}")
 
-        chktok = tokenizer.encode(chktxt)
-        chkhsh = sha256(str(chktok).encode()).hexdigest()
+        # NOTE: If you get an error here:
+        #   Update the huggingface_hub.py module and add the vocab, model, and repo.
+        #   Run the `gguf-py/scripts/gguf-gen-pre.py` script to generate the checksums.
+        #   This script should ideally pull in the latest version of the model from HuggingFace.
+        # DO NOT MANUALLY EDIT THIS METHOD!
+        models = json.loads(Path("models/checksums.json").read_text())
+        for model in models:
+            if checksum == model["checksum"]:
+                logger.debug(f"tokenizer.ggml.pre: {repr(model['tokt'])}")
+                logger.debug(f"tokenizer checksum: {checksum}")
+                return model["tokt"]  # NOTE: Use the enum to id the vocab
 
-        logger.debug(f"chktok: {chktok}")
-        logger.debug(f"chkhsh: {chkhsh}")
+        logger.warning("\n")
+        logger.warning("**************************************************************************************")
+        logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+        logger.warning("** There are 2 possible reasons for this:")
+        logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+        logger.warning("** - the pre-tokenization config has changed upstream")
+        logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+        logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+        logger.warning("**")
+        logger.warning(f"** tokenizer checksum: {checksum}")
+        logger.warning("**************************************************************************************")
+        logger.warning("\n")
+        raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
 
-        res = None
-
-        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
-        # or pull the latest version of the model from Huggingface
-        # don't edit the hashes manually!
-        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
-            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
-            res = "llama-bpe"
-        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
-            res = "deepseek-llm"
-        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
-            res = "deepseek-coder"
-        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
-            # ref: https://huggingface.co/tiiuae/falcon-7b
-            res = "falcon"
-        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
-            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
-            res = "bert-bge"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/mosaicml/mpt-7b
-            res = "mpt"
-        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
-            # ref: https://huggingface.co/bigcode/starcoder2-3b
-            res = "starcoder"
-        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
-            # ref: https://huggingface.co/openai-community/gpt2
-            res = "gpt-2"
-        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
-            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
-            res = "refact"
-        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
-            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
-            res = "command-r"
-        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
-            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
-            res = "qwen2"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
-            res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
"0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en - res = "jina-v2-en" - if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es - res = "jina-v2-es" - if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de - res = "jina-v2-de" - - if res is None: - logger.warning("\n") - logger.warning("**************************************************************************************") - logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") - logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") - logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") - logger.warning("**") - logger.warning(f"** chkhsh: {chkhsh}") - logger.warning("**************************************************************************************") - logger.warning("\n") - raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - - logger.debug(f"tokenizer.ggml.pre: {repr(res)}") - logger.debug(f"chkhsh: {chkhsh}") - - return res # Marker: End get_vocab_base_pre def _set_vocab_gpt2(self) -> None: