From 523fc3be523ecacc179c8ef5b87c38eb9f0cc47a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 10 Jul 2023 20:05:53 +0800 Subject: [PATCH] fixed rwkv, standardized new ctx usage --- .gitignore | 4 +++- otherarch/gpt2_v3.cpp | 2 +- otherarch/gptj_v3.cpp | 2 +- otherarch/mpt_v3.cpp | 2 +- otherarch/neox_v3.cpp | 2 +- otherarch/rwkv_v3.cpp | 7 +++++-- otherarch/utils.cpp | 12 ++++++++++++ otherarch/utils.h | 4 +++- 8 files changed, 27 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index c79b78b57..622f5dd67 100644 --- a/.gitignore +++ b/.gitignore @@ -67,4 +67,6 @@ koboldcpp_failsafe.dll koboldcpp_openblas.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll -koboldcpp_cublas.dll \ No newline at end of file +koboldcpp_cublas.dll +cublas64_11.dll +cublasLt64_11.dll \ No newline at end of file diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index 2e7806d3d..608a61ac2 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -707,7 +707,7 @@ bool gpt2_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index aeaa7bbd5..46f1ad064 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -619,7 +619,7 @@ bool gptj_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 35006688d..211464f89 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -542,7 +542,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // run the computation ggml_build_forward_expand(&gf, inpL); - 
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 9c1ab2545..d8ccaa9b6 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -638,7 +638,7 @@ bool gpt_neox_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + kcpp_graph_compute_helper(&gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index 2ef44dd1b..3bdf221fd 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -13,6 +13,8 @@ #include "ggml-opencl.h" #endif +#include "utils.h" + #include #include #include @@ -729,6 +731,7 @@ struct rwkv_context { float * logits_out = 0; //stores address of output logit buffer size_t gpu_layers; + std::vector work_buffer; }; // https://stackoverflow.com/a/6458689 @@ -1627,7 +1630,7 @@ bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t to ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs; } - ggml_graph_compute_with_ctx(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get(),n_threads); + kcpp_graph_compute_helper(ctx->serial_graph.cgraph.get(),n_threads); rwkv_get_outputs(ctx, state_out, logits_out); return true; @@ -1715,7 +1718,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs; } - ggml_graph_compute_with_ctx(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get(),n_threads); + kcpp_graph_compute_helper(ctx->sequence_graph.cgraph.get(),n_threads); rwkv_get_outputs(ctx, state_out, logits_out); } diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 02637069a..16e015c84 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -221,4 +221,16 @@ bool 
should_transpose_layer(std::string name) return true; } return false; +} + +static std::vector<uint8_t> kcpp_compute_buf; +void kcpp_graph_compute_helper(ggml_cgraph *graph, int n_threads) +{ + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + if (plan.work_size > 0) + { + kcpp_compute_buf.resize(plan.work_size); + plan.work_data = kcpp_compute_buf.data(); + } + ggml_graph_compute(graph, &plan); } \ No newline at end of file diff --git a/otherarch/utils.h b/otherarch/utils.h index f9857823f..cbd7bfb51 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -54,4 +54,6 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri -bool should_transpose_layer(std::string name); \ No newline at end of file +bool should_transpose_layer(std::string name); + +void kcpp_graph_compute_helper(ggml_cgraph * graph, int n_threads); \ No newline at end of file