From 0a8797b28eacb3916f0b9be137d62c9725b66918 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Sat, 20 Apr 2024 18:40:55 +0530
Subject: [PATCH] Main:Update to support chaton mode

Glanced through the existing interactive and chatml flows to
incorporate this flow. Need to look deeper later.

NOTE: Up to this point this is a reapplication of my initial go at
chaton, simplifying the amount of change done to existing code a bit
more.
---
 examples/main/main.cpp | 48 ++++++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 249fc2bb6..a073a7bfd 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "chaton.hpp"
 #include "console.h"
 #include "llama.h"
 
@@ -251,11 +252,14 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
-        LOG("tokenize the prompt\n");
+    if (params.interactive_first || params.instruct || params.chatml || params.chaton || !params.prompt.empty() || session_tokens.empty()) {
+        LOG("tokenize the prompt: %s\n", params.prompt.c_str());
         if (params.chatml) {
             params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
         }
+        if (params.chaton) {
+            params.prompt = llama_chat_apply_template_simple(params.chaton_template_id, "system", params.prompt, false);
+        }
         embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
     } else {
         LOG("use session tokens\n");
@@ -333,7 +337,7 @@ int main(int argc, char ** argv) {
     }
 
     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml || params.chaton) {
         params.n_keep = (int)embd_inp.size();
     } else {
         params.n_keep += add_bos; // always keep the BOS token
@@ -363,6 +367,19 @@ int main(int argc, char ** argv) {
         params.interactive_first = true;
         params.antiprompt.emplace_back("<|im_start|>user\n");
     }
+    // handle chaton mode; it adds on to any reverse prompt specified explicitly by the user
+    if (params.chaton) {
+        params.interactive_first = true;
+        std::vector<std::string> resp_ends = llama_chat_reverse_prompt(params.chaton_template_id);
+        if (resp_ends.size() == 0) {
+            LOG_TEELN("ERRR:%s:ChatOn:Unsupported ChatType:%s", __func__, params.chaton_template_id.c_str());
+            exit(1);
+        }
+        for (size_t i = 0; i < resp_ends.size(); i++)
+        {
+            params.antiprompt.emplace_back(resp_ends[i]);
+        }
+    }
 
     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -817,7 +834,7 @@ int main(int argc, char ** argv) {
 
             if (n_past > 0 && is_interacting) {
                 LOG("waiting for user input\n");
 
-                if (params.instruct || params.chatml) {
+                if (params.instruct || params.chatml || params.chaton) {
                     printf("\n> ");
                 }
@@ -876,15 +893,23 @@ int main(int argc, char ** argv) {
                     process_escapes(buffer);
                 }
 
-                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
-                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                std::vector<llama_token> line_inp;
+                if (params.chaton) {
+                    std::string f_chat = llama_chat_apply_template_simple(params.chaton_template_id, "user", buffer.c_str(), true);
+                    line_inp = ::llama_tokenize(ctx, f_chat, false, true);
+                    LOG("formatted input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                } else {
+                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
+                    line_inp = ::llama_tokenize(ctx, buffer, false, false);
+                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
-                LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
 
-                embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
-                embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-                embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
+                    embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                    embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
+                }
 
                 // instruct mode: insert response suffix
                 if (params.instruct) {
@@ -921,6 +946,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
+        // chaton is expected to be used along with the interactive argument, so not checking for chaton separately
        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
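
For reference, the two helpers this patch calls, llama_chat_apply_template_simple() and llama_chat_reverse_prompt(), live in the new chaton.hpp, which is not part of this diff. The sketch below is a hypothetical reconstruction inferred purely from the call sites above, shown for a chatml-style template; the actual header in this PR may differ in names, behaviour, and supported templates.

// Hypothetical sketch of the chaton.hpp helpers -- inferred from their
// call sites in main.cpp above, NOT the actual header from this PR.
#include <string>
#include <vector>

// Wrap one message in the template's role markers. For user turns the
// caller requests a trailing assistant prefix so that generation
// continues as the assistant.
inline std::string llama_chat_apply_template_simple(
        const std::string & tmpl_id,
        const std::string & role,
        const std::string & content,
        bool add_assistant_prefix) {
    if (tmpl_id == "chatml") {
        std::string out = "<|im_start|>" + role + "\n" + content + "<|im_end|>\n";
        if (add_assistant_prefix) {
            out += "<|im_start|>assistant\n";
        }
        return out;
    }
    return content; // unknown template id: pass the message through unchanged
}

// Marker strings for the start of the next user turn; main.cpp registers
// these as antiprompts so generation stops at the end of a response.
inline std::vector<std::string> llama_chat_reverse_prompt(const std::string & tmpl_id) {
    if (tmpl_id == "chatml") {
        return { "<|im_start|>user\n" };
    }
    return {}; // empty => unsupported template id, main.cpp exits
}

Usage-wise, the patch implies that main gains a chaton switch plus a template id string (surfaced here as params.chaton and params.chaton_template_id); the exact CLI flag names would be defined in the arg-parsing side of the PR, which is also not shown in this diff.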