models : add phi-3, mpt, gpt-2, starcoder

This commit is contained in:
Georgi Gerganov 2024-04-29 13:40:30 +03:00
parent c21ab1833e
commit 120cf37d54
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
20 changed files with 645 additions and 10 deletions

View file

@ -424,9 +424,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
// Entry point for hand-written (non-std::regex) splitters selected by `regex_expr`.
//
// text       - input text to split
// regex_expr - pre-tokenizer regex that a custom splitter would be matched against
// offsets    - offsets that a custom splitter would refine
//
// Returns the offsets produced by a matching custom splitter, or an empty vector when
// none handles `regex_expr`. Currently the result is always empty: the only custom
// splitter (GPT-2) is known to be wrong and is kept disabled below.
// NOTE(review): presumably callers treat an empty result as "fall back to the generic
// splitter" - confirm against the call site.
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

    // the parameters are unused while the GPT-2 branch is disabled - silence the warnings
    (void)(text);
    (void)(regex_expr);
    (void)(offsets);

    // TODO: this implementation is actually wrong, uncomment and run:
    //       make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
    //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
    //    bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
    //}

    return bpe_offsets;
}