now getting response from python

mike dupont 2023-12-06 09:37:04 -05:00
parent 1c861466dc
commit 7972929a3b
3 changed files with 30 additions and 12 deletions

View file

@@ -1 +1,2 @@
-print("hello llama.cpp")
+print("hello llama.cpp" + llm_input)
+llm_output = "Is it because of your mother that " + llm_input + "?";

View file

@@ -32,6 +32,7 @@
 #endif
 #include "print.hpp"
+#include "plugin_python.hpp"
 static llama_context ** g_ctx;
 static llama_model ** g_model;
@@ -130,7 +131,7 @@ int main(int argc, char ** argv) {
 // TODO: Dump params ?
 //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
-print_fields(params);
+//print_fields(params);
 // save choice to use color for later
 // (note for later: this is a slightly awkward choice)
@@ -248,7 +249,7 @@ int main(int argc, char ** argv) {
 std::vector<llama_token> embd_inp;
-print_fields(*model);
+//print_fields(*model);
 if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
 LOG("tokenize the prompt\n");
@@ -293,7 +294,7 @@ int main(int argc, char ** argv) {
 LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
 return 1;
 }
-print_fields(*ctx);
+//print_fields(*ctx);
 //print_fields(session_tokens);
 // debug message about similarity of saved session, if applicable
 size_t n_matching_session_tokens = 0;
@@ -383,7 +384,7 @@ int main(int argc, char ** argv) {
 LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
 }
-print_fields(*ctx_guidance);
+//print_fields(*ctx_guidance);
 }
@@ -495,7 +496,7 @@ int main(int argc, char ** argv) {
 std::vector<llama_token> embd_guidance;
 struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-print_fields(*ctx_sampling);
+//print_fields(*ctx_sampling);
 while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
 // predict
@@ -532,7 +533,7 @@ int main(int argc, char ** argv) {
 LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
 n_past, n_left, n_ctx, params.n_keep, n_discard);
-print_fields(*ctx);
+//print_fields(*ctx);
 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
 llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
@@ -649,7 +650,7 @@ int main(int argc, char ** argv) {
 }
 const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
-print_fields(id);
+//print_fields(id);
 llama_sampling_accept(ctx_sampling, ctx, id, true);
 LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@@ -686,6 +687,8 @@ int main(int argc, char ** argv) {
 const std::string token_str = llama_token_to_piece(ctx, id);
 printf("TOKEN:%s\n", token_str.c_str());
+//print_fields(id);
 if (embd.size() > 1) {
 input_tokens.push_back(id);
 } else {
@@ -700,12 +703,20 @@ int main(int argc, char ** argv) {
 console::set_display(console::reset);
 }
+// just print the whole thing
+const std::string last_output1 = output_ss.str();
+printf("%s",last_output1.c_str());
+const std::string last_output = process_output_plugin(last_output1);
+printf("%s",last_output.c_str());
 // if not currently processing queued inputs;
 if ((int) embd_inp.size() <= n_consumed) {
 // check for reverse prompt in the last n_prev tokens
 if (!params.antiprompt.empty()) {
 const int n_prev = 32;
-const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+const std::string last_output1 = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+// now plugin the python :
+const std::string last_output = process_output_plugin(last_output1);
 is_antiprompt = false;
 // Check if each of the reverse prompts appears at the end of the output.

View file

@@ -26,7 +26,7 @@ using namespace boost::python;
 #endif
-int call_python()
+std::string process_output_plugin(const std::string input)
 {
 try {
 PyImport_AppendInittab((char*)"mymodule", INIT_MODULE);
@@ -36,12 +36,18 @@ int call_python()
 object mymodule = import("mymodule");
 main_namespace["precreated_object"] = Base("created on C++ side");
+main_namespace["llm_input"] = input;
 exec_file("embedding.py", main_namespace, main_namespace);
+boost::python::object llm_output = main_namespace["llm_output"];
+std::string message = boost::python::extract<std::string>(llm_output);
+return message;
 } catch (error_already_set& e) {
 PyErr_PrintEx(0);
-return 1;
+return "";
 }
-return 0;
 }
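
For context, here is the same llm_input/llm_output handshake as a self-contained Boost.Python sketch: seed the script's namespace, run embedding.py, read the result back. It is illustrative only: it assumes Boost.Python is available and an embedding.py like the one in the first file sits in the working directory, it initializes the interpreter directly instead of using the commit's INIT_MODULE/mymodule setup, and run_embedding_script is a made-up name rather than part of the commit.

#include <boost/python.hpp>
#include <iostream>
#include <string>

namespace py = boost::python;

// Hypothetical helper: seed the script's globals with llm_input, run
// embedding.py, then read llm_output back out of the same namespace.
std::string run_embedding_script(const std::string & input) {
    py::object main_module    = py::import("__main__");
    py::object main_namespace = main_module.attr("__dict__");
    main_namespace["llm_input"] = input;                           // visible to the script as a global
    py::exec_file("embedding.py", main_namespace, main_namespace); // run the script in that namespace
    return py::extract<std::string>(main_namespace["llm_output"]); // whatever the script assigned
}

int main() {
    Py_Initialize();                          // start the embedded interpreter
    try {
        std::cout << run_embedding_script("hello from C++") << std::endl;
    } catch (py::error_already_set &) {
        PyErr_Print();                        // surface the Python traceback
        return 1;
    }
    return 0;
}

Building it requires linking against Boost.Python and the Python runtime.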