ChatON:ChatParts: Allow flexibility for more refined tokenization

2024-04-24 20:23:44 +05:30 · 2024-04-24 20:23:44 +05:30 · 92e780fb1a
commit 92e780fb1a
parent 6b23f15ffe
1 changed files with 16 additions and 2 deletions
--- a/common/chaton.hpp
+++ b/common/chaton.hpp
@ -85,7 +85,9 @@ json conMeta;


 /**
- * Helps keep user prompt and chat-hs-template tag parts seperate, but in sequence
+ * Helps keep user prompt and chat-hs-template tag parts separate, but in sequence.
+ * In turn, this gives the flexibility to tokenize with or without the parse_special flag, with respect to the different parts of the chat msg(s).
+ * One could use the triplet of str, get_types and get_partslens to achieve the above mentioned flexibility.
 */
 class ChatParts {

@ -126,6 +128,18 @@ public:
        return allin;
    }

+    std::string get_types() { // hands the caller its own copy of the types member
+        return this->types;
+    }
+
+    std::vector<int> get_partslens() { // length() of every entry in parts, in iteration order
+        std::vector<int> lens = {};
+        for (const auto &part : parts) { // bind by reference: a by-value loop variable copies each entry
+            lens.push_back(static_cast<int>(part.length())); // narrowing to int made explicit (length() is presumably size_t)
+        }
+        return lens;
+    }
+
    std::string name() {
        return typeid(*this).name();
    }