models : add phi-3, mpt, gpt-2, starcoder

2024-04-29 13:40:30 +03:00 · 2024-04-29 13:40:30 +03:00 · 120cf37d54
commit 120cf37d54
parent c21ab1833e
20 changed files with 645 additions and 10 deletions
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -424,9 +424,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

-    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
-        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
-    }
+    (void)(text);
+    (void)(regex_expr);
+    (void)(offsets); // all three parameters are otherwise unused while the block below stays commented out
+    // TODO: the unicode_regex_split_custom_gpt2 path below is wrong, so it is disabled and this function returns an empty vector; to reproduce, uncomment it and run:
+    //       make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
+    //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+    //    bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+    //}

    return bpe_offsets;
 }