fix falcon preprocess and add deepseek coder

This commit is contained in:
Bingxuan Wang 2023-11-14 17:34:46 +08:00
parent 5600bd8cbc
commit c31263e0cb
8 changed files with 397 additions and 27 deletions

Makefile

@@ -8,7 +8,8 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
     tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
     tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-    tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+    tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \
+    tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -69,6 +70,8 @@ test: $(TEST_TARGETS)
         ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
     elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
         ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
+    elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
+        ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
     elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
         continue; \
     elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@@ -712,6 +715,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

llama.cpp

@@ -76,6 +76,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <iostream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -2266,7 +2267,12 @@ static void llm_load_vocab(
         vocab.special_sep_id = -1;
         vocab.special_pad_id = -1;
     } else if (tokenizer_name == "gpt2" || tokenizer_name == "deepseek_coder") {
-        vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        if(tokenizer_name == "gpt2"){
+            vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        }
+        else if (tokenizer_name == "deepseek_coder"){
+            vocab.type = LLAMA_VOCAB_TYPE_DEEPSEEKCODER;
+        }

         // read bpe merges and populate bpe ranks
         const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
@@ -2463,7 +2469,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
-    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : (vocab.type == LLAMA_VOCAB_TYPE_BPE ? "BPE" : "DEEPSEEKCODER")); // TODO: fix
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
     LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
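The nested ternary added above simply maps the three vocab types to the names printed by llm_load_print_meta. The helper below is an illustrative sketch only (it is not part of this commit); it assumes the llama_vocab_type values declared in llama.h further down, and the name llama_vocab_type_name is made up for the example.

#include "llama.h"

// Illustrative sketch, not part of the diff: the enum-to-name mapping that the
// updated LLAMA_LOG_INFO line encodes inline.
static const char * llama_vocab_type_name(enum llama_vocab_type type) {
    switch (type) {
        case LLAMA_VOCAB_TYPE_SPM:           return "SPM";
        case LLAMA_VOCAB_TYPE_BPE:           return "BPE";
        case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: return "DEEPSEEKCODER";
        default:                             return "unknown";
    }
}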
@@ -5342,6 +5348,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         auto buf = token_data.text.substr(3, 2);
         return strtol(buf.c_str(), NULL, 16);
     }
+    case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
     case LLAMA_VOCAB_TYPE_BPE: {
         GGML_ASSERT(false);
         return unicode_to_bytes_bpe(token_data.text);
@@ -5358,6 +5365,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
         const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
         return vocab.token_to_id.at(buf);
     }
+    case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
     case LLAMA_VOCAB_TYPE_BPE: {
         return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
     }
@@ -5554,7 +5562,11 @@ struct llm_tokenizer_bpe {
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;

-        auto word_collection = bpe_gpt2_preprocess(text);
+        std::vector<std::string> word_collection;
+        if(vocab.type == LLAMA_VOCAB_TYPE_BPE)
+            word_collection = bpe_gpt2_preprocess(text);
+        else if(vocab.type == LLAMA_VOCAB_TYPE_DEEPSEEKCODER)
+            word_collection = bpe_deepseek_coder_preprocess(text);

         symbols_final.clear();
@@ -5681,26 +5693,9 @@ private:
         work_queue.push(bigram);
     }

-    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
-        std::vector<std::string> bpe_words;
-        std::vector<std::string> bpe_encoded_words;
-
-        // convert input string to wstring
-        std::wstring input = from_utf8(text);
-        std::wstring regex = from_utf8(gpt2_regex);
-        std::wregex expr(regex);
-        // std::wsmatch m;
-        // // use regex match to get where to split the test string
-        int array[] = {-1,0};
-        std::wsregex_token_iterator iter(input.begin(), input.end(), expr, array);
-        std::wsregex_token_iterator end;
-        for ( ; iter != end; ++iter){
-            if ((*iter).length()>0){
-                bpe_words.push_back(to_utf8(*iter));
-            }
-        }
-
-        // convert each word to utf8
-        for (std::string & word : bpe_words) {
+    std::vector<std::string> byte_encoding_process(const std::vector<std::string> & bpe_words) {
+        std::vector<std::string> bpe_encoded_words;
+        for (auto word : bpe_words) {
             std::string text_utf = "";
             auto utf_word = codepoints_from_utf8(word);
             for (size_t i = 0; i < utf_word.size(); ++i)
@@ -5712,6 +5707,80 @@ private:
             }
             bpe_encoded_words.emplace_back(encoded_token);
         }
+        return bpe_encoded_words;
+    }
+
+    std::vector<std::string> regex_preprocess(const std::vector<std::string> & input, const std::string & regex_expr) {
+        std::regex expr(regex_expr);
+        std::vector<std::string> bpe_words;
+        // std::wsmatch m;
+        // // use regex match to get where to split the test string
+        for (auto & text : input) {
+            std::cregex_iterator it(text.data(), text.data() + text.size(), expr);
+            std::cregex_iterator end;
+
+            // Print the matches
+            unsigned int start_idx = 0;
+            while (it != end) {
+                std::cmatch match = *it;
+                std::string match_str = match.str();
+                if (match.position() > start_idx) {
+                    bpe_words.emplace_back(text.substr(start_idx, match.position() - start_idx));
+                }
+                bpe_words.emplace_back(match_str);
+                start_idx = match.position() + match.length();
+                ++it;
+            }
+            if (start_idx < text.size()) {
+                bpe_words.emplace_back(text.substr(start_idx, text.size() - start_idx));
+            }
+        }
+        return bpe_words;
+    }
+
+    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words = {text};
+        for (auto & regex_expr : gpt2_regex) {
+            bpe_words = regex_preprocess(bpe_words, regex_expr);
+        }
+        std::vector<std::string> bpe_encoded_words = byte_encoding_process(bpe_words);
+        return bpe_encoded_words;
+    }
+
+    std::vector<std::string> bpe_deepseek_coder_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words;
+        std::wstring wtext = from_utf8(text);
+
+        // extract all cjk characters
+        std::wregex expr(L"[\u4e00-\u9fa5\u0800-\u4e00\uac00-\ud7ff]+");
+        std::wcregex_iterator it(wtext.data(), wtext.data() + wtext.size(), expr);
+        std::wcregex_iterator end;
+
+        unsigned int start_idx = 0;
+        while (it != end) {
+            std::wcmatch match = *it;
+            std::wstring match_str = match.str();
+            if (match.position() > start_idx) {
+                bpe_words.emplace_back(to_utf8(wtext.substr(start_idx, match.position() - start_idx)));
+            }
+            bpe_words.emplace_back(to_utf8(match_str));
+            start_idx = match.position() + match.length();
+            ++it;
+        }
+        if (start_idx < wtext.size()) {
+            bpe_words.emplace_back(to_utf8(wtext.substr(start_idx, wtext.size() - start_idx)));
+        }
+
+        for (auto & regex_expr : deepseek_coder_regex) {
+            bpe_words = regex_preprocess(bpe_words, regex_expr);
+        }
+        std::vector<std::string> bpe_encoded_words = byte_encoding_process(bpe_words);
         return bpe_encoded_words;
     }
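The reworked preprocessing above replaces the old single std::wsregex_token_iterator pass with regex_preprocess, which walks each input string and keeps both the regex matches and the unmatched gaps between them, so no input text is silently dropped; bpe_gpt2_preprocess then chains the gpt2_regex patterns, and bpe_deepseek_coder_preprocess first splits out CJK runs before applying deepseek_coder_regex. The standalone program below is an illustrative sketch of that keep-the-gaps splitting step only; the helper name split_keep_gaps and the digit pattern are invented for the example and are not part of the commit.

// Sketch, not part of the diff: split text on a regex, keeping matched spans
// and the unmatched gaps between them as separate "words".
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

static std::vector<std::string> split_keep_gaps(const std::string & text, const std::string & regex_expr) {
    std::regex expr(regex_expr);
    std::vector<std::string> words;
    std::cregex_iterator it(text.data(), text.data() + text.size(), expr);
    std::cregex_iterator end;
    size_t start_idx = 0;
    while (it != end) {
        const std::cmatch match = *it;
        if ((size_t) match.position() > start_idx) {
            words.emplace_back(text.substr(start_idx, match.position() - start_idx)); // gap before the match
        }
        words.emplace_back(match.str());                                              // the match itself
        start_idx = match.position() + match.length();
        ++it;
    }
    if (start_idx < text.size()) {
        words.emplace_back(text.substr(start_idx));                                   // trailing gap
    }
    return words;
}

int main() {
    // split runs of digits into their own words, keeping the surrounding text
    for (const auto & w : split_keep_gaps("abc1314151def", "[0-9]+")) {
        printf("'%s'\n", w.c_str()); // prints 'abc', '1314151', 'def'
    }
    return 0;
}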
@@ -5903,6 +5972,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
             } break;
+        case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
         case LLAMA_VOCAB_TYPE_BPE:
             {
                 for (const auto & fragment: fragment_buffer)
@@ -8972,6 +9042,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
             }
             break;
         }
+        case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
         case LLAMA_VOCAB_TYPE_BPE: {
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;

llama.h

@@ -69,6 +69,7 @@ extern "C" {
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+       LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // deepseek coder
    };

    enum llama_token_type {
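With the new enum value exposed in the public header, callers can branch on it through the existing llama_vocab_type() accessor, exactly as the new tokenizer test below does. A minimal illustrative sketch (not part of the commit) that loads a vocab-only model and reports whether it uses the deepseek coder vocab:

// Sketch only: check a GGUF vocab's type via the public API touched by this commit.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <gguf-vocab-file>\n", argv[0]);
        return 1;
    }
    llama_backend_init(false);
    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true;
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load '%s'\n", argv[1]);
        return 1;
    }
    printf("deepseek coder vocab: %s\n",
           llama_vocab_type(model) == LLAMA_VOCAB_TYPE_DEEPSEEKCODER ? "yes" : "no");
    llama_free_model(model);
    llama_backend_free();
    return 0;
}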

tests/test-tokenizer-0-deepseek_coder.cpp

@@ -0,0 +1,188 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
#include <fstream>
// generate using test-tokenizer-0-falcon.py
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
static std::map<std::string, std::vector<llama_token>> _k_tests = {
{ "" , { }, },
{ " " , { 207, }, },
{ " " , { 243, }, },
{ " " , { 315, }, },
{ "\t" , { 184, }, },
{ "\n" , { 185, }, },
{ "\t\n" , { 184, 185, }, },
{ "Hello world" , { 17535, 1835, }, },
{ " Hello world" , { 414, 9489, 1835, }, },
{ "Hello World" , { 17535, 5414, }, },
{ " Hello World" , { 414, 9489, 5414, }, },
{ " Hello World!" , { 414, 9489, 5414, 0, }, },
{ "Hello, world!" , { 17535, 11, 1835, 0, }, },
{ " Hello, world!" , { 414, 9489, 11, 1835, 0, }, },
{ " this is 🦙.cpp" , { 437, 317, 12394, 99, 234, 13, 14789, }, },
{ "w048 7tuijk dsdfhu" , { 86, 15, 19, 23, 207, 22, 83, 3963, 27659, 26078, 3934, 14072, }, },
{ "нещо на Български" , { 1593, 6478, 616, 2251, 14994, }, },
{ "កាន់តែពិសេសអាចខលចេញ" , { 155, 239, 209, 155, 239, 114, 155, 239, 228, 155, 240, 220, 155, 239, 224, 155, 240, 211, 155, 239, 231, 155, 239, 115, 155, 239, 240, 155, 240, 210, 155, 239, 240, 155, 239, 95, 155, 239, 114, 155, 239, 214, 155, 239, 210, 155, 239, 236, 155, 239, 214, 155, 240, 210, 155, 239, 218, }, },
{ "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 10047, 235, 209, 334, 8760, 8, 12394, 233, 114, 350, 222, 10047, 221, 104, 169, 116, 224, 334, 4684, 3909, 992, 24330, 262, 29651, 612, 8, 207, 156, 237, 214, 334, 5950, 992, 78, 12896, 344, 638, 891, 1372, 10736, 8, }, },
{ "Hello" , { 17535, }, },
{ " Hello" , { 414, 9489, }, },
{ " Hello" , { 207, 414, 9489, }, },
{ " Hello" , { 243, 414, 9489, }, },
{ " Hello" , { 315, 414, 9489, }, },
{ " Hello\n Hello" , { 315, 414, 9489, 185, 315, 414, 9489, }, },
{ "\n =" , { 185, 405, }, },
{ "' era" , { 6, 2895, }, },
{ "Hello, y'all! How are you 😁 ?我想在apple工作1314151天", { 17535, 11, 320, 6, 435, 0, 1717, 417, 340, 12394, 233, 210, 3015, 19100, 608, 9413, 2668, 16, 18, 16, 19, 16, 20, 16, 1393, 169, 121, 239, }, },
};
return _k_tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
std::string fname_text;
if (argc > 2) {
fname_text = argv[2];
}
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_DEEPSEEKCODER) {
fprintf(stderr, "%s : error: vocab type is not DEEPSEEKCODER\n", __func__);
llama_free_model(model);
llama_free(ctx);
return 2;
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
for (const auto & test_kv : k_tests()) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
}
printf("\n");
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (test_kv.second[i] != res[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize_bpe(ctx, res).c_str(),
llama_detokenize_bpe(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
success = false;
}
}
if (!fname_text.empty()) {
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
std::string text;
{
std::ifstream ifs(fname_text);
if (!ifs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
return 1;
}
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
}
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
{
const std::string fname_out = fname_text + ".tokcpp";
std::ofstream ofs(fname_out);
if (!ofs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return 1;
}
for (const auto & tok : res) {
ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
}
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return success ? 0 : 3;
}


@@ -0,0 +1,85 @@
# tests with BPE tokenizer
import os
import sys
import argparse
from transformers import AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
]
for text in tests:
    print('text: ', text)
    print(tokenizer.encode(text))
    print(tokenizer.decode(tokenizer.encode(text)))
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text)
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s)
        # write to file
        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
    print('results written to: ', fname_out)

tests/test-tokenizer-0-falcon.cpp

@@ -38,6 +38,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
         { "\n =" , { 1212, 40, }, },
         { "' era" , { 18, 4932, }, },
+        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天", { 9856, 23, 291, 18, 436, 12, 1265, 362, 299, 8196, 207, 204, 42, 50087, 123, 2727, 20300, 32022, 133, 234, 17419, 30137, 28, 7858, 181, 133, 236, }, },
     };

     return _k_tests;
@@ -115,7 +116,6 @@ int main(int argc, char **argv) {
         printf("\n");

         bool correct = res.size() == test_kv.second.size();
-
         for (int i = 0; i < (int) res.size() && correct; ++i) {
             if (test_kv.second[i] != res[i]) {
                 correct = false;

tests/test-tokenizer-0-falcon.py

@@ -43,6 +43,7 @@ tests = [
         " Hello\n Hello",
         "\n =",
         "' era",
+        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
     ]

 for text in tests:

File diff suppressed because one or more lines are too long