diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 314b54b38..7ca3cb95d 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -251,9 +251,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    if(file_format!=FileFormat::RWKV_1)
+    if (file_format == FileFormat::RWKV_1)
     {
-        ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
+        ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true);
+    }
+    else
+    {
+        ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext, false);
     }
 
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
@@ -303,7 +307,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output)
     else if(file_format == FileFormat::RWKV_1)
     {
         n_vocab = vocab.id_to_token.size(); //handled separately
-        rwkv_context_v1->state_in = nullptr;
+        if(n_past==0)
+        {
+            rwkv_context_v1->state_in = nullptr;
+        }
+        else
+        {
+            rwkv_context_v1->state_in = rwkv_context_v1->state_out;
+            //if embd_inp is empty, push in the final previous token so there is something to evaluate
+            if(embd_inp.size()==0 && current_context_tokens.size()>0)
+            {
+                embd_inp.push_back(current_context_tokens[current_context_tokens.size()-1]);
+            }
+        }
    }
    else
    {
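Note on the RWKV branch above: RWKV is a recurrent model, so its entire past lives in a fixed-size state tensor rather than a KV cache. Resuming therefore means feeding state_out back in as state_in, and when fast-forwarding consumed the whole prompt, re-feeding the final old token guarantees there are fresh logits to sample from. Below is a minimal standalone sketch of that hand-off using a toy cell; none of these names are the real rwkv.cpp API.

    #include <cstdio>
    #include <vector>

    struct ToyRecurrentCtx
    {
        float *state_in = nullptr;  //null means "start from a blank state"
        float *state_out = nullptr; //updated after every eval step
        float state_buf[4] = {0};

        void eval(int token)
        {
            //fold the token into the running state (stand-in for a real RWKV step)
            state_buf[token % 4] += 1.0f;
            state_out = state_buf;
        }
    };

    int main()
    {
        ToyRecurrentCtx ctx;
        std::vector<int> current_context_tokens = {1, 2, 3};
        for (int tok : current_context_tokens) ctx.eval(tok); //previous generation pass

        //a new request arrives whose prompt fast-forwarded completely: n_past == 3
        int n_past = 3;
        std::vector<int> embd_inp; //nothing left to evaluate

        if (n_past == 0)
        {
            ctx.state_in = nullptr; //fresh generation: blank state
        }
        else
        {
            ctx.state_in = ctx.state_out; //resume from the previous state
            if (embd_inp.empty() && !current_context_tokens.empty())
            {
                //re-feed the final old token so the model produces fresh logits
                embd_inp.push_back(current_context_tokens.back());
            }
        }
        for (int tok : embd_inp) ctx.eval(tok);
        printf("evaluated %zu token(s), resuming=%s\n", embd_inp.size(), ctx.state_in ? "yes" : "no");
        return 0;
    }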
diff --git a/koboldcpp.py b/koboldcpp.py
index 1bd559fa9..fa240468c 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1,7 +1,6 @@
 # A hacky little script from Concedo that exposes llama.cpp function bindings
 # allowing it to be used via a simulated kobold api endpoint
-# it's not very usable as there is a fundamental flaw with llama.cpp
-# which causes generation delay to scale linearly with original prompt length.
+# note: generation delay scales linearly with original prompt length.
 
 import ctypes
 import os
@@ -399,17 +398,17 @@ def main(args):
             root.destroy()
             if not ggml_selected_file:
                 print("\nNo ggml model file was selected. Exiting.")
-                time.sleep(1)
+                time.sleep(2)
                 sys.exit(2)
         except Exception as ex:
             print("File selection GUI unsupported. Please check command line: script.py --help")
-            time.sleep(1)
+            time.sleep(2)
             sys.exit(2)
 
     if not os.path.exists(ggml_selected_file):
         print(f"Cannot find model file: {ggml_selected_file}")
-        time.sleep(1)
+        time.sleep(2)
         sys.exit(2)
 
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
@@ -420,7 +419,7 @@ def main(args):
 
     if not loadok:
         print("Could not load model: " + modelname)
-        time.sleep(1)
+        time.sleep(2)
         sys.exit(3)
     try:
         basepath = os.path.abspath(os.path.dirname(__file__))
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index 4cdb09476..b872c1bf0 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -145,7 +145,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_outputs &output)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
+    ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext, false);
 
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
     bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas());
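Both adapters above now thread the new requireFullSubset flag into ContextFastForward; the model_adapter.cpp hunk that consumes it follows. As a rough standalone sketch of the rule the flag selects (a hypothetical helper, not the project's function): a transformer can reuse any matching prefix of the cached context, while a recurrent model must reset unless the new prompt contains the entire cached context.

    #include <cstdio>
    #include <vector>

    //counts how many leading tokens of the new prompt match the cached context;
    //in "full subset" mode (RWKV) any divergence, or a cached context longer
    //than the prompt, voids the cached state entirely, because a recurrent
    //state cannot be rewound to a partial prefix
    static int FastForwardLength(const std::vector<int> &cached,
                                 const std::vector<int> &prompt,
                                 bool requireFullSubset)
    {
        size_t matched = 0;
        while (matched < cached.size() && matched < prompt.size()
               && cached[matched] == prompt[matched])
        {
            ++matched;
        }
        if (requireFullSubset && matched < cached.size())
        {
            return 0; //cached state unusable: prompt lacks part of the old context
        }
        if (matched == prompt.size() && matched > 0)
        {
            --matched; //leave at least one token to evaluate (the real loop breaks early for the same reason)
        }
        return (int)matched;
    }

    int main()
    {
        std::vector<int> cached  = {1, 2, 3, 4};
        std::vector<int> contin  = {1, 2, 3, 4, 5, 6}; //clean continuation
        std::vector<int> diverge = {1, 2, 9, 9};       //diverges at index 2

        printf("transformer, continuation: %d\n", FastForwardLength(cached, contin, false));  //4
        printf("rwkv, continuation:        %d\n", FastForwardLength(cached, contin, true));   //4
        printf("transformer, divergence:   %d\n", FastForwardLength(cached, diverge, false)); //2
        printf("rwkv, divergence:          %d\n", FastForwardLength(cached, diverge, true));  //0
        return 0;
    }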
diff --git a/model_adapter.cpp b/model_adapter.cpp
index 6d7094d10..a8893f06c 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -236,7 +236,8 @@ void print_tok_vec(std::vector<int> &embd)
 }
 
 void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
-                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, bool useSmartContext)
+                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
+                        bool useSmartContext, const bool requireFullSubset)
 {
     const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reached to trigger smartcontext
     const int SCInpLenThreshold = nctx * 0.6; //how big the input array must be to trigger smartcontext
@@ -244,13 +245,11 @@ void print_tok_vec(std::vector<int> &embd)
     const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
     const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
 
-    // printf("\nORIGINAL CTX:\n");
-    // print_tok_vec(current_context_tokens);
-    // printf("\nORIGINAL EMBD:\n");
-    // print_tok_vec(embd_inp);
 
     //fast forward the past based on identical tokens, stop once a divergence is noted
     int embd_inp_len = embd_inp.size();
+    bool fastforwardok = true;
+
     for (int i = 0; i < current_context_tokens.size(); ++i)
     {
         if (current_context_tokens[i] == embd_inp[i])
@@ -260,37 +259,48 @@ void print_tok_vec(std::vector<int> &embd)
         }
         else
         {
+            if(requireFullSubset) //RWKV can only fast forward if embd_inp contains everything in the current context
+            {
+                last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
+                n_past = 0;
+                fastforwardok = false;
+            }
             break;
         }
-        if ((i + 2) >= embd_inp_len)
+
+        if (requireFullSubset) //RWKV can only fast forward if embd_inp contains everything in the current context
         {
-            break;
+            if (i >= embd_inp_len)
+            {
+                last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
+                n_past = 0;
+                fastforwardok = false;
+                break;
+            }
+        }
+        else
+        {
+            if ((i + 2) >= embd_inp_len)
+            {
+                break;
+            }
         }
     }
-
-    last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
-    embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
-    embd_inp_len = embd_inp.size();
+    if(fastforwardok)
+    {
+        last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
+        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
+        embd_inp_len = embd_inp.size();
+    }
 
     //smart context mode, detect if we have a shifted context at max length
     //requirement: previous context was at least nctx/2 longer than current,
     //mode is on, and current context already maxed.
-    // printf("\nconds: %d %d %d\n",current_context_tokens.size() >= nctx*0.8
-    // ,embd_inp_len >= nctx*0.6 ,current_context_tokens.size() - n_past > nctx*0.5);
-    // printf("csiz:%d par:%d eilen:%d np:%d",current_context_tokens.size(), (int)(nctx*0.8),embd_inp_len,n_past);
-
-    if (useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
+    if (fastforwardok && useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
     {
-        // printf("curfullcontext:\n");
-        // print_tok_vec(current_context_tokens);
-
-        //see if smartcontext is still usable
-        // printf("smartctx:\n");
-        // print_tok_vec(smartcontext);
-        // printf("embinp:\n");
-        // print_tok_vec(embd_inp);
+        //see if smartcontext is still usable
         auto shared = LongestCommonSubseq(smartcontext, embd_inp);
         if (shared.size() > SCTokThreshold && ArrStartWith(smartcontext, shared)) //at least SCTokThreshold tokens in common
         {
@@ -300,8 +310,6 @@ void print_tok_vec(std::vector<int> &embd)
             auto trimmed = std::vector<int>(embd_inp.begin() + found, embd_inp.end());
             embd_inp = trimmed;
             embd_inp_len = embd_inp.size();
-            // printf("trimmed:\n");
-            // print_tok_vec(embd_inp,&vocab.id_to_token);
             printf("\n[Reusing Smart Context: %d allowance remaining]", found);
 
             int old_n_past = n_past;
@@ -313,7 +321,6 @@ void print_tok_vec(std::vector<int> &embd)
 
             for (int i = n_past; i < current_context_tokens.size(); ++i)
             {
-                //printf("\n%s and %s\n",vocab.id_to_token[current_context_tokens[i]].c_str(), vocab.id_to_token[embd_inp[i-offset_fix]].c_str());
                 if (current_context_tokens[i] == embd_inp[i-offset_fix])
                 {
                     n_past += 1;
@@ -331,8 +338,7 @@ void print_tok_vec(std::vector<int> &embd)
 
             last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + (n_past-old_n_past));
             embd_inp.erase(embd_inp.begin(), embd_inp.begin() + (n_past-old_n_past));
-            // printf("np:%d newembinp: \n",n_past);
-            // print_tok_vec(embd_inp);
+
         }else{
             smartcontext.clear();
         }
@@ -347,17 +353,16 @@ void print_tok_vec(std::vector<int> &embd)
             smartcontext.clear();
         }
 
-        if(useSmartContext
+        if(fastforwardok && useSmartContext
         && smartcontext.size()==0
        && current_context_tokens.size() >= SCCtxLenThreshold
        && embd_inp_len >= SCInpLenThreshold
        && current_context_tokens.size() - n_past > SCPastLenThreshold)
-        {
+        {
            //determine the longest common subsequence after removing the start part
            int shiftamt = embd_inp.size() * SCTruncationRatio;
            smartcontext = std::vector<int>(embd_inp.begin() + shiftamt, embd_inp.end());
            printf("\n[New Smart Context Triggered! Buffered Token Allowance: %d]",shiftamt);
-            // printf("smartctx:\n");
-            // print_tok_vec(smartcontext,&vocab.id_to_token);
+
            embd_inp = smartcontext;
            //if max ctx length is exceeded, chop the prompt in half after the start part, and memorize it. The memorized part becomes the LCS marker.
            //when a future prompt comes in, find the LCS again. If the LCS is longer than the threshold and starts with the memorized LCS
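For concreteness, here is the smartcontext trigger arithmetic from the hunk above, worked through with an assumed nctx of 2048; the driver values below are invented for illustration.

    #include <cstdio>

    int main()
    {
        const int nctx = 2048;                       //example context window
        const int SCCtxLenThreshold  = nctx * 0.8;   //cached-context trigger length (~1638)
        const int SCInpLenThreshold  = nctx * 0.6;   //incoming-prompt trigger length (~1228)
        const int SCPastLenThreshold = nctx * 0.5;   //required gap beyond the fast-forwarded past (~1024)
        const float SCTruncationRatio = 0.5f;        //fraction of the prompt memorized as the LCS marker

        int cached_len = 2000, prompt_len = 1400, n_past = 300; //hypothetical request

        bool trigger = cached_len >= SCCtxLenThreshold
                    && prompt_len >= SCInpLenThreshold
                    && cached_len - n_past > SCPastLenThreshold;

        if (trigger)
        {
            int shiftamt = prompt_len * SCTruncationRatio;
            //memorize the back half of the prompt; the front half becomes the reusable allowance
            printf("smartcontext on: memorize tokens [%d..%d), allowance %d\n",
                   shiftamt, prompt_len, shiftamt);
        }
        else
        {
            printf("smartcontext not triggered\n");
        }
        return 0;
    }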
diff --git a/model_adapter.h b/model_adapter.h
index 3d6376aac..040196063 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -54,4 +54,5 @@ int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchArray);
 FileFormat check_file_format(const std::string & fname);
 
 void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
-                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext);
\ No newline at end of file
+                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
+                        const bool useSmartContext, const bool requireFullSubset);
\ No newline at end of file
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index 859774c99..2621219d3 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -371,7 +371,7 @@ bool gpt2_eval(
     const int n_vocab = hparams.n_vocab;
 
     //todo: there is a bug that causes the buffer to OOM and I cannot figure it out, hack to increase size for now
-    static size_t buf_size = 1024u*1024*1024;
+    static size_t buf_size = 1280u*1024*1024;
     static void * buf = malloc(buf_size);
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 62cf224b2..741fe6d3f 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -382,7 +382,7 @@ bool gptj_eval(
     const int d_key = n_embd/n_head;
 
     //todo: there is a bug that causes the buffer to OOM and I cannot figure it out, hack to increase size for now
-    static size_t buf_size = 1024u*1024*1024;
+    static size_t buf_size = 1280u*1024*1024;
     static void * buf = malloc(buf_size);
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
diff --git a/otherarch/rwkv.cpp b/otherarch/rwkv.cpp
index 686d2452a..cc4aa2b6e 100644
--- a/otherarch/rwkv.cpp
+++ b/otherarch/rwkv.cpp
@@ -1,3 +1,6 @@
+//adapted from the RWKV.cpp repo under the MIT license
+// https://github.com/saharNooby/rwkv.cpp
+
 #include "rwkv.h"
 #include "ggml_rwkv.h"
 #include "otherarch.h"
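The two buf_size bumps above only widen the headroom of a fixed scratch allocation while the overestimate noted in the todo is tracked down. Sketched below, under hypothetical names (this is not the project's code; the body of the size check is not shown in the hunks), is a growable variant that derives the buffer size from the measured mem_per_token instead of a hard-coded constant.

    #include <cstdio>
    #include <cstdlib>

    static size_t buf_size = 1280u * 1024 * 1024; //initial scratch size in bytes
    static void *buf = nullptr;

    //ensure the scratch buffer can hold a batch of N tokens
    static bool ensure_scratch(size_t mem_per_token, size_t N)
    {
        if (buf == nullptr)
        {
            buf = malloc(buf_size);
        }
        if (mem_per_token > 0 && mem_per_token * N > buf_size)
        {
            buf_size = mem_per_token * N * 1.1; //grow with ~10% headroom
            void *grown = realloc(buf, buf_size);
            if (!grown)
            {
                return false; //keep the old buffer valid on failure
            }
            buf = grown;
        }
        return buf != nullptr;
    }

    int main()
    {
        //e.g. a measured 3 MiB per token and a 512-token batch forces one grow
        if (ensure_scratch(3u * 1024 * 1024, 512))
        {
            printf("scratch ready: %zu MiB\n", buf_size / (1024 * 1024));
        }
        free(buf);
        return 0;
    }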