From 7972929a3b89d95f4563ca820043a2e6f8288426 Mon Sep 17 00:00:00 2001
From: mike dupont
Date: Wed, 6 Dec 2023 09:37:04 -0500
Subject: [PATCH] now getting response from python

---
 embedding.py           |  3 ++-
 examples/main/main.cpp | 27 +++++++++++++++++++--------
 plugin_python.cpp      | 12 +++++++++---
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/embedding.py b/embedding.py
index 32977492f..d6ff4fc2b 100644
--- a/embedding.py
+++ b/embedding.py
@@ -1 +1,2 @@
-print("hello llama.cpp")
+print("hello llama.cpp" + llm_input)
+llm_output = "Is it because of your mother that " + llm_input + "?";
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7bc7f012b..6e6965817 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -32,6 +32,7 @@
 #endif
 
 #include "print.hpp"
+#include "plugin_python.hpp"
 
 static llama_context ** g_ctx;
 static llama_model ** g_model;
@@ -130,7 +131,7 @@ int main(int argc, char ** argv) {
     // TODO: Dump params ?
     //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
 
-    print_fields(params);
+    //print_fields(params);
 
     // save choice to use color for later
     // (note for later: this is a slightly awkward choice)
@@ -248,7 +249,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    print_fields(*model);
+    //print_fields(*model);
 
     if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
         LOG("tokenize the prompt\n");
@@ -293,7 +294,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
-    print_fields(*ctx);
+    //print_fields(*ctx);
     //print_fields(session_tokens);
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
@@ -383,7 +384,7 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
         }
 
-        print_fields(*ctx_guidance);
+        //print_fields(*ctx_guidance);
     }
 
 
@@ -495,7 +496,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_guidance;
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-    print_fields(*ctx_sampling);
+    //print_fields(*ctx_sampling);
 
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
@@ -532,7 +533,7 @@ int main(int argc, char ** argv) {
                 LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                     n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-                print_fields(*ctx);
+                //print_fields(*ctx);
                 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
                 llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
 
@@ -649,7 +650,7 @@ int main(int argc, char ** argv) {
             }
 
             const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
-            print_fields(id);
+            //print_fields(id);
             llama_sampling_accept(ctx_sampling, ctx, id, true);
 
             LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@@ -686,6 +687,8 @@ int main(int argc, char ** argv) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
                 printf("TOKEN:%s\n", token_str.c_str());
 
+                //print_fields(id);
+
                 if (embd.size() > 1) {
                     input_tokens.push_back(id);
                 } else {
@@ -700,12 +703,20 @@ int main(int argc, char ** argv) {
             console::set_display(console::reset);
         }
 
+        // just print the whole thing
+        const std::string last_output1 = output_ss.str();
+        printf("%s",last_output1.c_str());
+        const std::string last_output = process_output_plugin(last_output1);
+        printf("%s",last_output.c_str());
+
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // check for reverse prompt in the last n_prev tokens
             if (!params.antiprompt.empty()) {
                 const int n_prev = 32;
-                const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+                const std::string last_output1 = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+                // now plugin the python :
+                const std::string last_output = process_output_plugin(last_output1);
                 is_antiprompt = false;
 
                 // Check if each of the reverse prompts appears at the end of the output.
diff --git a/plugin_python.cpp b/plugin_python.cpp
index 324b18d56..6900a95b0 100644
--- a/plugin_python.cpp
+++ b/plugin_python.cpp
@@ -26,7 +26,7 @@ using namespace boost::python;
 #endif
 
-int call_python()
+std::string process_output_plugin(const std::string input)
 {
   try
     {
       PyImport_AppendInittab((char*)"mymodule", INIT_MODULE);
@@ -36,12 +36,18 @@ int call_python()
       object mymodule = import("mymodule");
 
       main_namespace["precreated_object"] = Base("created on C++ side");
+      main_namespace["llm_input"] = input;
       exec_file("embedding.py", main_namespace, main_namespace);
+
+      boost::python::object llm_output = main_namespace["llm_output"];
+      std::string message = boost::python::extract<std::string>(llm_output);
+
+      return message;
+
     }
   catch (error_already_set& e)
     {
       PyErr_PrintEx(0);
-      return 1;
+      return "";
     }
-  return 0;
 }
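Note on plugin_python.hpp: main.cpp now includes "plugin_python.hpp", but that
header is not part of this patch. For the new calls to compile it presumably
only needs to declare the plugin entry point defined in plugin_python.cpp; a
minimal sketch (header contents assumed, not taken from the repository):

    // plugin_python.hpp -- assumed declaration matching plugin_python.cpp
    #pragma once
    #include <string>

    // Runs embedding.py with `input` bound to llm_input in the interpreter
    // and returns whatever the script left in llm_output ("" on Python error).
    std::string process_output_plugin(const std::string input);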
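The plugin follows the usual Boost.Python embedding pattern: push a C++ string
into the __main__ namespace, exec_file() the script, then extract<> a string
back out. Below is a minimal, self-contained sketch of that round trip; the
helper name run_embedding_script is hypothetical, and it omits the
PyImport_AppendInittab/mymodule registration and the precreated Base object
that plugin_python.cpp also sets up.

    // Sketch only: C++ -> Python -> C++ string round trip via Boost.Python.
    #include <boost/python.hpp>
    #include <iostream>
    #include <string>

    static std::string run_embedding_script(const std::string & input) {
        namespace py = boost::python;
        try {
            Py_Initialize();                                    // start the interpreter
            py::object main_module    = py::import("__main__");
            py::object main_namespace = main_module.attr("__dict__");

            main_namespace["llm_input"] = input;                // C++ -> Python
            py::exec_file("embedding.py", main_namespace, main_namespace);

            py::object llm_output = main_namespace["llm_output"];
            return py::extract<std::string>(llm_output);        // Python -> C++
        } catch (py::error_already_set &) {
            PyErr_PrintEx(0);                                   // print the Python traceback
            return "";
        }
    }

    int main() {
        std::cout << run_embedding_script("the model said hello") << std::endl;
        return 0;
    }

One consequence of this pattern as used in the patch: process_output_plugin()
re-runs embedding.py on every call from the generation loop, so edits to the
script take effect without recompiling, at the cost of re-executing it for
each processed chunk of output.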