ChatON:SubPartsAwareTokenizePath: Allow extract subparts testing
parent a49697b488
commit 8fe8231313
2 changed files with 8 additions and 3 deletions

@@ -635,6 +635,10 @@ inline std::vector<llama_token> chaton_llama_tokenize(
         bool add_special,
         bool parse_special) {
     LOGLN("DBUG:%s:%s:special[add:%d, parse:%d]", __func__, text.c_str(), add_special, parse_special);
+    if (model == nullptr) {
+        LOG_TEELN("ERRR:%s:Model NOT Provided:%s:special[add:%d, parse:%d]", __func__, text.c_str(), add_special, parse_special);
+        return std::vector<llama_token>{};
+    }
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
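The hunk above adds a defensive check to chaton_llama_tokenize: if no model is supplied, it logs an error and returns an empty token vector instead of proceeding. Below is a minimal, self-contained sketch of that guard pattern using stand-in types; it is not the repository's code and does not depend on llama.cpp headers.

#include <cstdio>
#include <string>
#include <vector>

struct model_sketch {};   // stand-in for llama_model
using token_sketch = int; // stand-in for llama_token

// Sketch of the guarded tokenize path: bail out early on a null model.
std::vector<token_sketch> tokenize_sketch(const model_sketch *model, const std::string &text) {
    if (model == nullptr) {
        std::fprintf(stderr, "ERRR:%s:Model NOT Provided:%s\n", __func__, text.c_str());
        return std::vector<token_sketch>{};
    }
    // A real implementation would call the model's tokenizer here; the sketch just
    // sizes a buffer the way the function above does (text length as an upper bound).
    return std::vector<token_sketch>(text.length());
}

int main() {
    auto none = tokenize_sketch(nullptr, "hello"); // logs the error, returns an empty vector
    std::printf("tokens without a model: %zu\n", none.size());
    return 0;
}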
@@ -654,7 +658,7 @@ inline std::vector<llama_token> chaton_llama_tokenize_ex(
 // If you want to parse special tokens in the taggedText, independent of what
 // partsTypes specifies, then set forceParseSpecial to true.
 inline std::vector<llama_token> chaton_llama_tokenize_ex(
-        const llama_context *ctx,
+        const struct llama_model *model,
         const std::string &taggedText,
         const std::string &partsTypes,
         const std::vector<int32_t> &partsLengths,
@@ -671,7 +675,7 @@ inline std::vector<llama_token> chaton_llama_tokenize_ex(
         iStart += partLen;
         auto parseSpecial = partType == ChatParts::S ? true : false;
         parseSpecial |= forceParseSpecial;
-        auto curTokens = chaton_llama_tokenize(llama_get_model(ctx), msgPart, addSpecial, parseSpecial);
+        auto curTokens = chaton_llama_tokenize(model, msgPart, addSpecial, parseSpecial);
         tokens.insert(tokens.end(), curTokens.begin(), curTokens.end());
     }
     return tokens;
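Together, the two hunks above change chaton_llama_tokenize_ex to take the llama_model pointer directly rather than a llama_context, dropping the internal llama_get_model(ctx) lookup; a caller that only holds a context now resolves the model itself before calling in. The sketch below illustrates that caller-side adaptation with stand-in types; the real function's trailing parameters are not visible in this diff, so the signature here is a simplified assumption.

#include <cstdint>
#include <string>
#include <vector>

struct model_sketch {};                        // stand-in for llama_model
struct context_sketch { model_sketch model; }; // stand-in for llama_context

// Stand-in for llama_get_model(ctx): resolve the model owned by a context.
const model_sketch *get_model_sketch(const context_sketch *ctx) { return &ctx->model; }

// Simplified stand-in for the updated chaton_llama_tokenize_ex signature shown above.
std::vector<int> tokenize_ex_sketch(const model_sketch *model,
                                    const std::string &taggedText,
                                    const std::string &partsTypes,
                                    const std::vector<int32_t> &partsLengths) {
    (void)model; (void)partsTypes; (void)partsLengths;
    return std::vector<int>(taggedText.size()); // placeholder result
}

int main() {
    context_sketch ctx;
    // Caller-side adaptation: resolve the model once and pass it down,
    // instead of handing the context to the tokenize helper.
    const model_sketch *model = get_model_sketch(&ctx);
    auto tokens = tokenize_ex_sketch(model, "user: hello", "NN", {6, 5});
    return tokens.empty() ? 1 : 0;
}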
@@ -173,6 +173,7 @@ int main(int argc, char **argv) {
         exit(1);
     }
     std::string metaJson(argv[1]);
-    check_chaton(metaJson);
+    //check_chaton(metaJson);
+    check_chaton_ex(metaJson);
     return 0;
 }
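With this change the test driver, which takes a meta JSON path as its first argument (argv[1]), comments out the original check_chaton() pass and instead exercises the subparts-aware tokenize path through check_chaton_ex().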