From 1a9cf9291cf5957061b69f6641af64990b6c01d5 Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Tue, 7 May 2024 01:43:56 -0400
Subject: [PATCH] feat: Add stablelm vocab to gguf update

---
 convert-hf-to-gguf-update.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 46a225462..3c5eb5b93 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -23,14 +23,14 @@
 # TODO: automate the update of convert-hf-to-gguf.py
 #
 
+import json
 import logging
 import os
-import requests
 import sys
-import json
-
-from hashlib import sha256
 from enum import IntEnum, auto
+from hashlib import sha256
+
+import requests
 from transformers import AutoTokenizer
 
 logging.basicConfig(level=logging.DEBUG)
@@ -65,6 +65,13 @@ models = [
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-1", },
+    {"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-tokenizer", },
+    {"name": "mistral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
+    {"name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
+    {"name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
+    {"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
     {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
     {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
 ]
@@ -290,12 +297,18 @@ for model in models:
     logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
 
 # generate a shell script that creates the vocab files
-
-logger.info("\nRun the following commands to generate the vocab files for testing:\n")
+shscript = "#!/usr/bin/env bash\n\n"
 
 for model in models:
     name = model["name"]
+    tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
+    shscript += tmpline
+    logger.info(tmpline.strip())
 
-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
+with open("generate-vocab.sh", "w", encoding="utf-8") as f:
+    f.write(shscript)
+    logger.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")
 
-logger.info("\n")
+logger.info("Run the following commands to generate the vocab files for testing:")
+logger.info("Enable execution: chmod +x generate-vocab.sh")
+logger.info("Execute with ./generate-vocab.sh")
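
Note: for reference, a sketch of the generate-vocab.sh this change is expected to emit. The contents follow directly from the shscript loop above: a shebang, then one conversion command per entry in the models list (only the stablelm line is shown here as an example; the rest are elided):

    #!/usr/bin/env bash

    python3 convert-hf-to-gguf.py models/tokenizers/stablelm/ --outfile models/ggml-vocab-stablelm.gguf --vocab-only
    # ... one such line per remaining model in the list ...

Usage, as logged by the updated script:

    chmod +x generate-vocab.sh
    ./generate-vocab.sh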