diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 314b54b38..7ca3cb95d 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -251,9 +251,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    if(file_format!=FileFormat::RWKV_1)
+    if (file_format == FileFormat::RWKV_1)
     {
-        ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
+        ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true);
+    }
+    else
+    {
+        ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext, false);
     }
 
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
@@ -303,7 +307,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output)
     else if(file_format == FileFormat::RWKV_1)
     {
         n_vocab = vocab.id_to_token.size(); //handled separately
-        rwkv_context_v1->state_in = nullptr;
+        if(n_past==0)
+        {
+            rwkv_context_v1->state_in = nullptr;
+        }
+        else
+        {
+            rwkv_context_v1->state_in = rwkv_context_v1->state_out;
+            //if embd_inp is empty, push in the final previous token so there is something to evaluate
+            if(embd_inp.size()==0 && current_context_tokens.size()>0)
+            {
+                embd_inp.push_back(current_context_tokens[current_context_tokens.size()-1]);
+            }
+        }
    }
    else
    {
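Note on the RWKV branch above: RWKV is a recurrent model, so its entire past lives in a fixed-size state tensor rather than a KV cache. Resuming therefore means feeding state_out back in as state_in, and when fast-forwarding consumed the whole prompt, re-feeding the final old token guarantees there are fresh logits to sample from. Below is a minimal standalone sketch of that hand-off using a toy cell; none of these names are the real rwkv.cpp API.

    #include <cstdio>
    #include <vector>

    struct ToyRecurrentCtx
    {
        float *state_in = nullptr;  //null means "start from a blank state"
        float *state_out = nullptr; //updated after every eval step
        float state_buf[4] = {0};

        void eval(int token)
        {
            //fold the token into the running state (stand-in for a real RWKV step)
            state_buf[token % 4] += 1.0f;
            state_out = state_buf;
        }
    };

    int main()
    {
        ToyRecurrentCtx ctx;
        std::vector<int> current_context_tokens = {1, 2, 3};
        for (int tok : current_context_tokens) ctx.eval(tok); //previous generation pass

        //a new request arrives whose prompt fast-forwarded completely: n_past == 3
        int n_past = 3;
        std::vector<int> embd_inp; //nothing left to evaluate

        if (n_past == 0)
        {
            ctx.state_in = nullptr; //fresh generation: blank state
        }
        else
        {
            ctx.state_in = ctx.state_out; //resume from the previous state
            if (embd_inp.empty() && !current_context_tokens.empty())
            {
                //re-feed the final old token so the model produces fresh logits
                embd_inp.push_back(current_context_tokens.back());
            }
        }
        for (int tok : embd_inp) ctx.eval(tok);
        printf("evaluated %zu token(s), resuming=%s\n", embd_inp.size(), ctx.state_in ? "yes" : "no");
        return 0;
    }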
diff --git a/koboldcpp.py b/koboldcpp.py
index 1bd559fa9..fa240468c 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1,7 +1,6 @@
 # A hacky little script from Concedo that exposes llama.cpp function bindings
 # allowing it to be used via a simulated kobold api endpoint
-# it's not very usable as there is a fundamental flaw with llama.cpp
-# which causes generation delay to scale linearly with original prompt length.
+# note: generation delay scales linearly with original prompt length.
 
 import ctypes
 import os
@@ -399,17 +398,17 @@ def main(args):
             root.destroy()
             if not ggml_selected_file:
                 print("\nNo ggml model file was selected. Exiting.")
-                time.sleep(1)
+                time.sleep(2)
                 sys.exit(2)
         except Exception as ex:
             print("File selection GUI unsupported. Please check command line: script.py --help")
-            time.sleep(1)
+            time.sleep(2)
             sys.exit(2)
 
     if not os.path.exists(ggml_selected_file):
         print(f"Cannot find model file: {ggml_selected_file}")
-        time.sleep(1)
+        time.sleep(2)
         sys.exit(2)
 
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
@@ -420,7 +419,7 @@ def main(args):
 
     if not loadok:
         print("Could not load model: " + modelname)
-        time.sleep(1)
+        time.sleep(2)
         sys.exit(3)
     try:
         basepath = os.path.abspath(os.path.dirname(__file__))
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index 4cdb09476..b872c1bf0 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -145,7 +145,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_outputs &output)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
+    ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext, false);
 
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
     bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas());
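Both adapters above now thread the new requireFullSubset flag into ContextFastForward; the model_adapter.cpp hunk that consumes it follows. As a rough standalone sketch of the rule the flag selects (a hypothetical helper, not the project's function): a transformer can reuse any matching prefix of the cached context, while a recurrent model must reset unless the new prompt contains the entire cached context.

    #include <cstdio>
    #include <vector>

    //counts how many leading tokens of the new prompt match the cached context;
    //in "full subset" mode (RWKV) any divergence, or a cached context longer
    //than the prompt, voids the cached state entirely, because a recurrent
    //state cannot be rewound to a partial prefix
    static int FastForwardLength(const std::vector<int> &cached,
                                 const std::vector<int> &prompt,
                                 bool requireFullSubset)
    {
        size_t matched = 0;
        while (matched < cached.size() && matched < prompt.size()
               && cached[matched] == prompt[matched])
        {
            ++matched;
        }
        if (requireFullSubset && matched < cached.size())
        {
            return 0; //cached state unusable: prompt lacks part of the old context
        }
        if (matched == prompt.size() && matched > 0)
        {
            --matched; //leave at least one token to evaluate (the real loop breaks early for the same reason)
        }
        return (int)matched;
    }

    int main()
    {
        std::vector<int> cached  = {1, 2, 3, 4};
        std::vector<int> contin  = {1, 2, 3, 4, 5, 6}; //clean continuation
        std::vector<int> diverge = {1, 2, 9, 9};       //diverges at index 2

        printf("transformer, continuation: %d\n", FastForwardLength(cached, contin, false));  //4
        printf("rwkv, continuation:        %d\n", FastForwardLength(cached, contin, true));   //4
        printf("transformer, divergence:   %d\n", FastForwardLength(cached, diverge, false)); //2
        printf("rwkv, divergence:          %d\n", FastForwardLength(cached, diverge, true));  //0
        return 0;
    }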
diff --git a/model_adapter.cpp b/model_adapter.cpp
index 6d7094d10..a8893f06c 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -236,7 +236,8 @@ void print_tok_vec(std::vector<int> &embd)
 }
 
 void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
-                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, bool useSmartContext)
+                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
+                        bool useSmartContext, const bool requireFullSubset)
 {
     const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reached to trigger smartcontext
     const int SCInpLenThreshold = nctx * 0.6; //how big the input array must be to trigger smartcontext
@@ -244,13 +245,11 @@ void print_tok_vec(std::vector<int> &embd)
     const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
     const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
 
-    // printf("\nORIGINAL CTX:\n");
-    // print_tok_vec(current_context_tokens);
-    // printf("\nORIGINAL EMBD:\n");
-    // print_tok_vec(embd_inp);
 
     //fast forward the past based on identical tokens, stop once a divergence is noted
     int embd_inp_len = embd_inp.size();
+    bool fastforwardok = true;
+
     for (int i = 0; i < current_context_tokens.size(); ++i)
     {
         if (current_context_tokens[i] == embd_inp[i])
@@ -260,37 +259,48 @@ void print_tok_vec(std::vector<int> &embd)
         }
         else
         {
+            if(requireFullSubset) //RWKV can only fast forward if embd_inp contains everything in the current context
+            {
+                last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
+                n_past = 0;
+                fastforwardok = false;
+            }
             break;
         }
-        if ((i + 2) >= embd_inp_len)
+
+        if (requireFullSubset) //RWKV can only fast forward if embd_inp contains everything in the current context
         {
-            break;
+            if (i >= embd_inp_len)
+            {
+                last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
+                n_past = 0;
+                fastforwardok = false;
+                break;
+            }
+        }
+        else
+        {
+            if ((i + 2) >= embd_inp_len)
+            {
+                break;
+            }
         }
     }
-
-    last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
-    embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
-    embd_inp_len = embd_inp.size();
+    if(fastforwardok)
+    {
+        last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
+        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
+        embd_inp_len = embd_inp.size();
+    }
 
     //smart context mode, detect if we have a shifted context at max length
     //requirement: previous context was at least nctx/2 longer than current,
     //mode is on, and current context already maxed.
-    // printf("\nconds: %d %d %d\n",current_context_tokens.size() >= nctx*0.8
-    // ,embd_inp_len >= nctx*0.6 ,current_context_tokens.size() - n_past > nctx*0.5);
-    // printf("csiz:%d par:%d eilen:%d np:%d",current_context_tokens.size(), (int)(nctx*0.8),embd_inp_len,n_past);
-
-    if (useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
+    if (fastforwardok && useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
     {
-        // printf("curfullcontext:\n");
-        // print_tok_vec(current_context_tokens);
-
-        //see if smartcontext is still usable
-        // printf("smartctx:\n");
-        // print_tok_vec(smartcontext);
-        // printf("embinp:\n");
-        // print_tok_vec(embd_inp);
+        //see if smartcontext is still usable
         auto shared = LongestCommonSubseq(smartcontext, embd_inp);
         if (shared.size() > SCTokThreshold && ArrStartWith(smartcontext, shared)) //at least SCTokThreshold tokens in common
         {
@@ -300,8 +310,6 @@ void print_tok_vec(std::vector<int> &embd)
             auto trimmed = std::vector<int>(embd_inp.begin() + found, embd_inp.end());
             embd_inp = trimmed;
             embd_inp_len = embd_inp.size();
-            // printf("trimmed:\n");
-            // print_tok_vec(embd_inp,&vocab.id_to_token);
             printf("\n[Reusing Smart Context: %d allowance remaining]", found);
 
             int old_n_past = n_past;
@@ -313,7 +321,6 @@ void print_tok_vec(std::vector<int> &embd)
 
             for (int i = n_past; i < current_context_tokens.size(); ++i)
             {
-                //printf("\n%s and %s\n",vocab.id_to_token[current_context_tokens[i]].c_str(), vocab.id_to_token[embd_inp[i-offset_fix]].c_str());
                 if (current_context_tokens[i] == embd_inp[i-offset_fix])
                 {
                     n_past += 1;
@@ -331,8 +338,7 @@ void print_tok_vec(std::vector<int> &embd)
 
             last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + (n_past-old_n_past));
             embd_inp.erase(embd_inp.begin(), embd_inp.begin() + (n_past-old_n_past));
-            // printf("np:%d newembinp: \n",n_past);
-            // print_tok_vec(embd_inp);
+
         }else{
             smartcontext.clear();
         }
@@ -347,17 +353,16 @@ void print_tok_vec(std::vector<int> &embd)
             smartcontext.clear();
         }
 
-        if(useSmartContext
+        if(fastforwardok && useSmartContext
         && smartcontext.size()==0
        && current_context_tokens.size() >= SCCtxLenThreshold
        && embd_inp_len >= SCInpLenThreshold
        && current_context_tokens.size() - n_past > SCPastLenThreshold)
-        {
+        {
            //determine the longest common subsequence after removing the start part
            int shiftamt = embd_inp.size() * SCTruncationRatio;
            smartcontext = std::vector<int>(embd_inp.begin() + shiftamt, embd_inp.end());
            printf("\n[New Smart Context Triggered! Buffered Token Allowance: %d]",shiftamt);
-            // printf("smartctx:\n");
-            // print_tok_vec(smartcontext,&vocab.id_to_token);
+
            embd_inp = smartcontext;
            //if max ctx length is exceeded, chop the prompt in half after the start part, and memorize it. The memorized part becomes the LCS marker.
            //when a future prompt comes in, find the LCS again. If the LCS is longer than the threshold and starts with the memorized LCS
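For concreteness, here is the smartcontext trigger arithmetic from the hunk above, worked through with an assumed nctx of 2048; the driver values below are invented for illustration.

    #include <cstdio>

    int main()
    {
        const int nctx = 2048;                       //example context window
        const int SCCtxLenThreshold  = nctx * 0.8;   //cached-context trigger length (~1638)
        const int SCInpLenThreshold  = nctx * 0.6;   //incoming-prompt trigger length (~1228)
        const int SCPastLenThreshold = nctx * 0.5;   //required gap beyond the fast-forwarded past (~1024)
        const float SCTruncationRatio = 0.5f;        //fraction of the prompt memorized as the LCS marker

        int cached_len = 2000, prompt_len = 1400, n_past = 300; //hypothetical request

        bool trigger = cached_len >= SCCtxLenThreshold
                    && prompt_len >= SCInpLenThreshold
                    && cached_len - n_past > SCPastLenThreshold;

        if (trigger)
        {
            int shiftamt = prompt_len * SCTruncationRatio;
            //memorize the back half of the prompt; the front half becomes the reusable allowance
            printf("smartcontext on: memorize tokens [%d..%d), allowance %d\n",
                   shiftamt, prompt_len, shiftamt);
        }
        else
        {
            printf("smartcontext not triggered\n");
        }
        return 0;
    }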
diff --git a/model_adapter.h b/model_adapter.h
index 3d6376aac..040196063 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -54,4 +54,5 @@ int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchArray);
 FileFormat check_file_format(const std::string & fname);
 
 void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
-                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext);
\ No newline at end of file
+                        int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
+                        const bool useSmartContext, const bool requireFullSubset);
\ No newline at end of file
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index 859774c99..2621219d3 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -371,7 +371,7 @@ bool gpt2_eval(
     const int n_vocab = hparams.n_vocab;
 
     //todo: there is a bug that causes the buffer to OOM and I cannot figure it out, hack to increase size for now
-    static size_t buf_size = 1024u*1024*1024;
+    static size_t buf_size = 1280u*1024*1024;
     static void * buf = malloc(buf_size);
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 62cf224b2..741fe6d3f 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -382,7 +382,7 @@ bool gptj_eval(
     const int d_key = n_embd/n_head;
 
     //todo: there is a bug that causes the buffer to OOM and I cannot figure it out, hack to increase size for now
-    static size_t buf_size = 1024u*1024*1024;
+    static size_t buf_size = 1280u*1024*1024;
     static void * buf = malloc(buf_size);
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
diff --git a/otherarch/rwkv.cpp b/otherarch/rwkv.cpp
index 686d2452a..cc4aa2b6e 100644
--- a/otherarch/rwkv.cpp
+++ b/otherarch/rwkv.cpp
@@ -1,3 +1,6 @@
+//adapted from the RWKV.cpp repo under the MIT license
+// https://github.com/saharNooby/rwkv.cpp
+
 #include "rwkv.h"
 #include "ggml_rwkv.h"
 #include "otherarch.h"
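The two buf_size bumps above only widen the headroom of a fixed scratch allocation while the overestimate noted in the todo is tracked down. Sketched below, under hypothetical names (this is not the project's code; the body of the size check is not shown in the hunks), is a growable variant that derives the buffer size from the measured mem_per_token instead of a hard-coded constant.

    #include <cstdio>
    #include <cstdlib>

    static size_t buf_size = 1280u * 1024 * 1024; //initial scratch size in bytes
    static void *buf = nullptr;

    //ensure the scratch buffer can hold a batch of N tokens
    static bool ensure_scratch(size_t mem_per_token, size_t N)
    {
        if (buf == nullptr)
        {
            buf = malloc(buf_size);
        }
        if (mem_per_token > 0 && mem_per_token * N > buf_size)
        {
            buf_size = mem_per_token * N * 1.1; //grow with ~10% headroom
            void *grown = realloc(buf, buf_size);
            if (!grown)
            {
                return false; //keep the old buffer valid on failure
            }
            buf = grown;
        }
        return buf != nullptr;
    }

    int main()
    {
        //e.g. a measured 3 MiB per token and a 512-token batch forces one grow
        if (ensure_scratch(3u * 1024 * 1024, 512))
        {
            printf("scratch ready: %zu MiB\n", buf_size / (1024 * 1024));
        }
        free(buf);
        return 0;
    }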