ChatON:SubPartsAwareTokenizePath: Allow testing of subparts extraction

HanishKVC 2024-05-08 19:36:39 +05:30
parent a49697b488
commit 8fe8231313
2 changed files with 8 additions and 3 deletions


@@ -635,6 +635,10 @@ inline std::vector<llama_token> chaton_llama_tokenize(
         bool add_special,
         bool parse_special) {
     LOGLN("DBUG:%s:%s:special[add:%d, parse:%d]", __func__, text.c_str(), add_special, parse_special);
+    if (model == nullptr) {
+        LOG_TEELN("ERRR:%s:Model NOT Provided:%s:special[add:%d, parse:%d]", __func__, text.c_str(), add_special, parse_special);
+        return std::vector<llama_token>{};
+    }
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
@@ -654,7 +658,7 @@ inline std::vector<llama_token> chaton_llama_tokenize_ex(
 // If you want to parse special tokens in the taggedText, independent of what
 // partsTypes specifies, then set forceParseSpecial to true.
 inline std::vector<llama_token> chaton_llama_tokenize_ex(
-        const llama_context *ctx,
+        const struct llama_model *model,
         const std::string &taggedText,
         const std::string &partsTypes,
         const std::vector<int32_t> &partsLengths,
@@ -671,7 +675,7 @@ inline std::vector<llama_token> chaton_llama_tokenize_ex(
             iStart += partLen;
             auto parseSpecial = partType == ChatParts::S ? true : false;
             parseSpecial |= forceParseSpecial;
-            auto curTokens = chaton_llama_tokenize(llama_get_model(ctx), msgPart, addSpecial, parseSpecial);
+            auto curTokens = chaton_llama_tokenize(model, msgPart, addSpecial, parseSpecial);
             tokens.insert(tokens.end(), curTokens.begin(), curTokens.end());
         }
         return tokens;
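
For illustration, a minimal sketch of how a caller might exercise the new null-model guard. The helper name tokenize_or_empty and the includes are assumptions (the chaton header that declares chaton_llama_tokenize is not named in this diff):

#include <string>
#include <vector>

#include "llama.h"

// Hypothetical caller: with the guard added above, passing a null model no
// longer reaches the tokenizer; chaton_llama_tokenize logs an error via
// LOG_TEELN and returns an empty token vector instead.
static std::vector<llama_token> tokenize_or_empty(const struct llama_model *model, const std::string &text) {
    return chaton_llama_tokenize(model, text, /*add_special=*/true, /*parse_special=*/false);
}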


@@ -173,6 +173,7 @@ int main(int argc, char **argv) {
         exit(1);
     }
     std::string metaJson(argv[1]);
-    check_chaton(metaJson);
+    //check_chaton(metaJson);
+    check_chaton_ex(metaJson);
     return 0;
 }
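
A rough sketch of what the subparts-aware path now exercised by check_chaton_ex looks like from the caller's side. The parameters of chaton_llama_tokenize_ex after partsLengths are not visible in this diff, so addSpecial and forceParseSpecial below are assumptions taken from the hunk bodies, as is the per-part character encoding of partsTypes:

#include <string>
#include <vector>

#include "llama.h"

// Hypothetical driver for the subparts-aware tokenize path. The tagged text
// is split into parts; each part's type decides whether special tokens are
// parsed for that part ('S' marking a special part is inferred from the
// ChatParts::S comparison in the hunk above; 'N' is a placeholder for the
// non-special type char, whose actual value is not shown in this diff).
static void tokenize_tagged_sketch(const struct llama_model *model) {
    std::string taggedText = "<|user|>hello";
    std::string partsTypes = "SN";                 // one type char per part (assumed encoding)
    std::vector<int32_t> partsLengths = { 8, 5 };  // "<|user|>" is 8 chars, "hello" is 5
    auto tokens = chaton_llama_tokenize_ex(model, taggedText, partsTypes, partsLengths,
                                           /*addSpecial=*/false, /*forceParseSpecial=*/false);
    (void)tokens;
}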