ChatON: Add forceParseSpecial flag to subparts aware tokenizing
This commit is contained in:
parent
b6da7d9c9d
commit
868ab608f0
1 changed file with 5 additions and 1 deletion
|
@ -651,12 +651,15 @@ std::vector<llama_token> chaton_llama_tokenize(
|
||||||
|
|
||||||
// Tokenize the passed taggedText, keeping in mind the subparts within and
|
// Tokenize the passed taggedText, keeping in mind the subparts within and
|
||||||
// in turn whether to parse special tokens in them or not (partsTypes).
|
// in turn whether to parse special tokens in them or not (partsTypes).
|
||||||
|
// If you want to parse special tokens in the taggedText, independent of what
|
||||||
|
// partsTypes specifies, then set forceParseSpecial to true.
|
||||||
std::vector<llama_token> chaton_llama_tokenize_ex(
|
std::vector<llama_token> chaton_llama_tokenize_ex(
|
||||||
const llama_context *ctx,
|
const llama_context *ctx,
|
||||||
const std::string &taggedText,
|
const std::string &taggedText,
|
||||||
const std::string &partsTypes,
|
const std::string &partsTypes,
|
||||||
const std::vector<int32_t> &partsLengths,
|
const std::vector<int32_t> &partsLengths,
|
||||||
bool addSpecial
|
bool addSpecial,
|
||||||
|
bool forceParseSpecial
|
||||||
) {
|
) {
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
int iPart = 0;
|
int iPart = 0;
|
||||||
|
@ -667,6 +670,7 @@ std::vector<llama_token> chaton_llama_tokenize_ex(
|
||||||
auto msgPart = taggedText.substr(iStart, partLen);
|
auto msgPart = taggedText.substr(iStart, partLen);
|
||||||
iStart += partLen;
|
iStart += partLen;
|
||||||
auto parseSpecial = partType == ChatParts::S ? true : false;
|
auto parseSpecial = partType == ChatParts::S ? true : false;
|
||||||
|
parseSpecial |= forceParseSpecial;
|
||||||
auto curTokens = chaton_llama_tokenize(llama_get_model(ctx), msgPart, addSpecial, parseSpecial);
|
auto curTokens = chaton_llama_tokenize(llama_get_model(ctx), msgPart, addSpecial, parseSpecial);
|
||||||
tokens.insert(tokens.end(), curTokens.begin(), curTokens.end());
|
tokens.insert(tokens.end(), curTokens.begin(), curTokens.end());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue