Merge 201559d177
into fc4ca27b25
This commit is contained in:
commit
f98191b01e
7 changed files with 57 additions and 8 deletions
|
@ -590,6 +590,9 @@ class Model:
|
|||
if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
|
||||
# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
|
||||
res = "smollm"
|
||||
if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
|
||||
res = "jina-v2-zh"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
|
|
@ -94,6 +94,7 @@ models = [
|
|||
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
|
||||
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
|
||||
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
|
||||
{"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -93,6 +93,7 @@ extern "C" {
|
|||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH = 23,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <forward_list>
|
||||
#include <queue>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
|
||||
//
|
||||
// helpers
|
||||
|
@ -432,6 +433,9 @@ struct llm_tokenizer_bpe {
|
|||
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
||||
regex_exprs = {"\\w+|[^\\w\\s]+"};
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
|
@ -484,7 +488,20 @@ struct llm_tokenizer_bpe {
|
|||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||
int final_prev_index = -1;
|
||||
|
||||
const auto word_collection = unicode_regex_split(text, regex_exprs);
|
||||
std::vector<std::string> word_collection;
|
||||
if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH) {
|
||||
|
||||
std::string lowercase_text = lowercase(text);
|
||||
std::regex regexPattern(regex_exprs[0]);
|
||||
std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
|
||||
std::sregex_token_iterator end;
|
||||
|
||||
while (it != end) {
|
||||
word_collection.push_back(*it++);
|
||||
}
|
||||
} else {
|
||||
word_collection = unicode_regex_split(text, regex_exprs);
|
||||
}
|
||||
|
||||
symbols_final.clear();
|
||||
|
||||
|
|
|
@ -5416,8 +5416,8 @@ static void llm_load_vocab(
|
|||
tokenizer_pre == "jina-v2-de" ||
|
||||
tokenizer_pre == "jina-v2-code") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
|
||||
} else if (tokenizer_pre == "refact") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
||||
} else if (
|
||||
tokenizer_pre == "command-r") {
|
||||
|
@ -5467,6 +5467,9 @@ static void llm_load_vocab(
|
|||
} else if (
|
||||
tokenizer_pre == "codeshell") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
||||
} else if (
|
||||
tokenizer_pre == "jina-v2-zh") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
|
@ -5517,8 +5520,7 @@ static void llm_load_vocab(
|
|||
|
||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
||||
std::string word = gguf_get_arr_str(ctx, token_idx, i);
|
||||
GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
|
||||
|
||||
//GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); // check removed: some vocabs mistakenly contain an empty (NULL) token, e.g. jinaai-embeddings-v2-base-zh (not ideal if it happens more than once)
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
|
||||
|
||||
|
@ -5591,9 +5593,18 @@ static void llm_load_vocab(
|
|||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||||
vocab.linefeed_id = vocab.special_pad_id;
|
||||
} else {
|
||||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
|
||||
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
||||
vocab.linefeed_id = ids[0];
|
||||
try {
|
||||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
|
||||
if (ids.empty()) {
|
||||
LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), "\xC4\x8A");
|
||||
vocab.linefeed_id = -1;
|
||||
} else {
|
||||
vocab.linefeed_id = ids[0];
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), e.what());
|
||||
vocab.linefeed_id = vocab.special_pad_id;
|
||||
}
|
||||
}
|
||||
|
||||
// special tokens
|
||||
|
|
|
@ -816,3 +816,17 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|||
|
||||
return unicode_byte_encoding_process(bpe_words);
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::string lowercase(const std::string & text) {
|
||||
std::string lowercase("");
|
||||
const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
for (const char32_t cpt : cpts) {
|
||||
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
||||
lowercase += unicode_cpt_to_utf8(unicode_tolower(cpt)); // append char to word
|
||||
}
|
||||
|
||||
return lowercase;
|
||||
}
|
||||
|
|
|
@ -65,3 +65,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
|||
uint32_t unicode_tolower(uint32_t cp);
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||
|
||||
std::string lowercase(const std::string & text);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue