From 868ab608f0e55abb6ce23ffd09a20b4c0fadfee1 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Wed, 8 May 2024 18:42:22 +0530
Subject: [PATCH] ChatON: Add forceParseSpecial flag to subparts aware tokenizing

---
 common/chaton.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/common/chaton.hpp b/common/chaton.hpp
index c7cf5de2b..c4ffd6e58 100644
--- a/common/chaton.hpp
+++ b/common/chaton.hpp
@@ -651,12 +651,15 @@ std::vector chaton_llama_tokenize(
 
 // Tokenize the passed taggedText, keeping in mind the subparts within and
 // inturn whether to parse special tokens in them or not (partsTypes).
+// If you want to parse special tokens in the taggedText, independent of what
+// partsTypes specifies, then set forceParseSpecial to true.
 std::vector chaton_llama_tokenize_ex(
         const llama_context *ctx,
         const std::string &taggedText,
         const std::string &partsTypes,
         const std::vector &partsLengths,
-        bool addSpecial
+        bool addSpecial,
+        bool forceParseSpecial
     ) {
     std::vector tokens;
     int iPart = 0;
@@ -667,6 +670,7 @@ std::vector chaton_llama_tokenize_ex(
         auto msgPart = taggedText.substr(iStart, partLen);
         iStart += partLen;
         auto parseSpecial = partType == ChatParts::S ? true : false;
+        parseSpecial |= forceParseSpecial;
         auto curTokens = chaton_llama_tokenize(llama_get_model(ctx), msgPart, addSpecial, parseSpecial);
         tokens.insert(tokens.end(), curTokens.begin(), curTokens.end());
     }