ChatON:ChatParts: Allow flexibility for more refined tokenization

This commit is contained in:
HanishKVC 2024-04-24 20:23:44 +05:30
parent 6b23f15ffe
commit 92e780fb1a

View file

@ -85,7 +85,9 @@ json conMeta;
/** /**
* Helps keep user prompt and chat-hs-template tag parts separate, but in sequence * Helps keep user prompt and chat-hs-template tag parts separate, but in sequence.
In turn, this gives the flexibility to tokenize with or without the parse_special flag for the different parts of the chat msg(s).
One could use the triplet of str, get_types and get_partslens to achieve this flexibility.
*/ */
class ChatParts { class ChatParts {
@ -100,7 +102,7 @@ public:
// Identify no string condition and or ignore string. // Identify no string condition and or ignore string.
static const auto X = '?'; static const auto X = '?';
ChatParts() :parts{}, types{""} {} ChatParts() : parts{}, types{""} {}
char last_type() { char last_type() {
if (types.length() == 0) { if (types.length() == 0) {
@ -126,6 +128,18 @@ public:
return allin; return allin;
} }
/**
 * Return a copy of the `types` string held by this object.
 * NOTE(review): presumably one type character per entry in `parts` - confirm against the add logic.
 */
std::string get_types() {
    std::string typesCopy = types;
    return typesCopy;
}
std::vector<int> get_partslens() {
std::vector<int> lens = {};
for(auto part: parts) {
lens.push_back(part.length());
}
return lens;
}
std::string name() { std::string name() {
return typeid(*this).name(); return typeid(*this).name();
} }