ChatON: tokenize keeping in mind the taggedMessage subparts

Initial go
2024-05-08 18:29:48 +05:30 · 2024-05-08 18:29:48 +05:30 · b6da7d9c9d
commit b6da7d9c9d
parent 8dfa31bb91
1 changed files with 45 additions and 0 deletions
--- a/common/chaton.hpp
+++ b/common/chaton.hpp
@ -628,6 +628,51 @@ inline int32_t chaton_tmpl_apply_ex_capi(
    return taggedLength;
 }
 // Copied from common.cpp
 //
 // Tokenize text using the given model and return the resulting tokens.
 //   model         - model handle forwarded to llama_tokenize.
 //   text          - the text to tokenize; its data()/length() are passed as is.
 //   add_special   - forwarded to llama_tokenize; also adds two extra slots
 //                   to the initial buffer size guess when set.
 //   parse_special - forwarded to llama_tokenize.
 //
 // Marked inline, like chaton_tmpl_apply_ex_capi in this header, so that the
 // definition can be included from more than one translation unit without
 // causing duplicate symbol errors at link time.
 inline std::vector<llama_token> chaton_llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
                        bool   add_special,
                        bool   parse_special) {
    LOGLN("DBUG:%s:%s:special[add:%d, parse:%d]", __func__, text.c_str(), add_special, parse_special);
    // Initial guess at the upper limit for the number of tokens.
    int n_tokens = static_cast<int>(text.length()) + (add_special ? 2 : 0);
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    if (n_tokens < 0) {
        // A negative return is treated as the negated required size:
        // grow the buffer to that size and tokenize again.
        result.resize(-n_tokens);
        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        // Shrink the buffer down to the number of tokens actually produced.
        result.resize(n_tokens);
    }
    return result;
 }
 // Tokenize the passed taggedText, keeping in mind the subparts within and
 // in turn whether to parse special tokens in them or not (partsTypes).
 //
 //   ctx          - context whose model (llama_get_model) is used to tokenize.
 //   taggedText   - the full text; it is consumed left to right in pieces.
 //   partsTypes   - one character per part; a part whose character equals
 //                  ChatParts::S is tokenized with special tokens parsed.
 //   partsLengths - length of each consecutive part within taggedText.
 //   addSpecial   - forwarded unchanged to chaton_llama_tokenize for each part.
 //
 // Returns the tokens of all parts concatenated in order.
 //
 // A part that has no matching entry in partsTypes is tokenized without
 // parsing special tokens, instead of indexing past the end of partsTypes.
 // If the lengths run past the end of taggedText, std::string::substr throws
 // std::out_of_range.
 //
 // NOTE(review): addSpecial is applied to every part, not only the first one;
 // confirm with the callers that this is what is wanted when it is true.
 //
 // Marked inline, like chaton_tmpl_apply_ex_capi in this header, so that the
 // definition can be included from more than one translation unit without
 // causing duplicate symbol errors at link time.
 inline std::vector<llama_token> chaton_llama_tokenize_ex(
        const llama_context *ctx,
        const std::string &taggedText,
        const std::string &partsTypes,
        const std::vector<int32_t> &partsLengths,
        bool addSpecial
        ) {
    std::vector<llama_token> tokens;
    size_t iPart = 0;
    size_t iStart = 0;
    for (const auto partLen : partsLengths) {
        // Only parts explicitly tagged as ChatParts::S get special token parsing.
        const bool parseSpecial = (iPart < partsTypes.size()) && (partsTypes[iPart] == ChatParts::S);
        iPart += 1;
        const auto msgPart = taggedText.substr(iStart, partLen);
        iStart += partLen;
        const auto curTokens = chaton_llama_tokenize(llama_get_model(ctx), msgPart, addSpecial, parseSpecial);
        tokens.insert(tokens.end(), curTokens.begin(), curTokens.end());
    }
    return tokens;
 }
 /**
 * if tmpl is