From 523fc3be523ecacc179c8ef5b87c38eb9f0cc47a Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 10 Jul 2023 20:05:53 +0800
Subject: [PATCH] fixed rwkv, standardized new ctx usage

---
 .gitignore            |  4 +++-
 otherarch/gpt2_v3.cpp |  2 +-
 otherarch/gptj_v3.cpp |  2 +-
 otherarch/mpt_v3.cpp  |  2 +-
 otherarch/neox_v3.cpp |  2 +-
 otherarch/rwkv_v3.cpp |  7 +++++--
 otherarch/utils.cpp   | 12 ++++++++++++
 otherarch/utils.h     |  4 +++-
 8 files changed, 27 insertions(+), 8 deletions(-)
diff --git a/.gitignore b/.gitignore
index c79b78b57..622f5dd67 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,4 +67,6 @@ koboldcpp_failsafe.dll
 koboldcpp_openblas.dll
 koboldcpp_openblas_noavx2.dll
 koboldcpp_clblast.dll
-koboldcpp_cublas.dll
\ No newline at end of file
+koboldcpp_cublas.dll
+cublas64_11.dll
+cublasLt64_11.dll
\ No newline at end of file
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index 2e7806d3d..608a61ac2 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -707,7 +707,7 @@ bool gpt2_eval(
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    kcpp_graph_compute_helper(&gf, n_threads);
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index aeaa7bbd5..46f1ad064 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -619,7 +619,7 @@ bool gptj_eval(
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    kcpp_graph_compute_helper(&gf, n_threads);
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index 35006688d..211464f89 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -542,7 +542,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    kcpp_graph_compute_helper(&gf, n_threads);
 
     // std::cout << "Qcur" << std::endl;
     // print_tensor(Qcur);
diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp
index 9c1ab2545..d8ccaa9b6 100644
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@@ -638,7 +638,7 @@ bool gpt_neox_eval(
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    kcpp_graph_compute_helper(&gf, n_threads);
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp
index 2ef44dd1b..3bdf221fd 100644
--- a/otherarch/rwkv_v3.cpp
+++ b/otherarch/rwkv_v3.cpp
@@ -13,6 +13,8 @@
 #include "ggml-opencl.h"
 #endif
 
+#include "utils.h"
+
 #include <string>
 #include <vector>
 #include <cstring>
@@ -729,6 +731,7 @@ struct rwkv_context {
     float * logits_out = 0; //stores address of output logit buffer
 
     size_t gpu_layers;
+    std::vector<uint8_t> work_buffer;
 };
 
 // https://stackoverflow.com/a/6458689
@@ -1627,7 +1630,7 @@ bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t to
         ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs;
     }
 
-    ggml_graph_compute_with_ctx(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get(),n_threads);
+    kcpp_graph_compute_helper(ctx->serial_graph.cgraph.get(),n_threads);
     rwkv_get_outputs(ctx, state_out, logits_out);
 
     return true;
@@ -1715,7 +1718,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui
             ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs;
         }
 
-        ggml_graph_compute_with_ctx(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get(),n_threads);
+        kcpp_graph_compute_helper(ctx->sequence_graph.cgraph.get(),n_threads);
         rwkv_get_outputs(ctx, state_out, logits_out);
     }
 
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 02637069a..16e015c84 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -221,4 +221,16 @@ bool should_transpose_layer(std::string name)
         return true;
     }
     return false;
+}
+
+static std::vector<uint8_t> kcpp_compute_buf; // scratch memory for graph evaluation, kept at file scope so it is reused across calls. NOTE(review): shared by every caller of the helper - confirm evals never run concurrently
+void kcpp_graph_compute_helper(ggml_cgraph *graph, int n_threads) // plans and runs `graph` with n_threads, supplying the work buffer ourselves
+{
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); // build the plan; plan.work_size is the scratch size it asks for
+    if (plan.work_size > 0) // skip the allocation when the plan asks for no scratch memory
+    {
+        kcpp_compute_buf.resize(plan.work_size); // resize (not reserve) so data() spans work_size bytes
+        plan.work_data = kcpp_compute_buf.data(); // points into the vector; valid until the vector is next reallocated
+    }
+    ggml_graph_compute(graph, &plan); // execute the graph with the prepared plan
 }
\ No newline at end of file
diff --git a/otherarch/utils.h b/otherarch/utils.h
index f9857823f..cbd7bfb51 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -54,4 +54,6 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 
 
-bool should_transpose_layer(std::string name);
\ No newline at end of file
+bool should_transpose_layer(std::string name);
+
+void kcpp_graph_compute_helper(ggml_cgraph * graph, int n_threads); // plans `graph` for n_threads and computes it (defined in utils.cpp)
\ No newline at end of file