From d5d67316e60c908265e3685a990c5b8af9d27532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Fri, 3 May 2024 07:49:43 -0400 Subject: [PATCH] Add BPE pre-tokenization for Command-R/R+. --- convert-hf-to-gguf-update.py | 9 +++++++++ convert-hf-to-gguf.py | 3 +++ llama.cpp | 8 ++++++++ llama.h | 1 + tests/CMakeLists.txt | 1 + 5 files changed, 22 insertions(+) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 917a4469d..e40a07a49 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -64,6 +64,7 @@ models = [ {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, + {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, ] # make directory "models/tokenizers" if it doesn't exist @@ -104,6 +105,14 @@ for model in models: save_path = f"models/tokenizers/{name}/tokenizer.json" download_file_with_auth(url, token, save_path) + # if downloaded file is less than 1KB, we likely need to download an LFS instead + if os.path.getsize(save_path) < 1024: + # remove the file + os.remove(save_path) + url = f"{repo}/resolve/main/tokenizer.json" + save_path = f"models/tokenizers/{name}/tokenizer.json" + download_file_with_auth(url, token, save_path) + if tokt == TOKENIZER_TYPE.SPM: url = f"{repo}/resolve/main/tokenizer.model" save_path = f"models/tokenizers/{name}/tokenizer.model" diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 88c16676b..f7441e6b8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -311,6 +311,9 @@ class Model(ABC): if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": # ref: https://huggingface.co/smallcloudai/Refact-1_6-base res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" if res is None: logger.warning("\n") diff --git a/llama.cpp b/llama.cpp index 7c7a06df3..34e3750e5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4386,6 +4386,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "refact") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT; + } else if ( + tokenizer_pre == "command-r") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -12248,6 +12251,11 @@ struct llm_tokenizer_bpe { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }); break; + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + word_collection = unicode_regex_split(text, { + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; default: // default regex for BPE tokenization pre-processing word_collection = unicode_regex_split(text, { diff --git a/llama.h b/llama.h index 5ff04c1d4..e2fd53ab7 100644 --- a/llama.h +++ b/llama.h @@ -80,6 +80,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, LLAMA_VOCAB_PRE_TYPE_REFACT = 8, + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, }; // note: these values should be synchronized with ggml_rope diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cad703fce..208f0cf7b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -83,6 +83,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) # build test-tokenizer-1-bpe target once and add many tests add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)