Merge branch 'concedo_experimental' into concedo
This commit is contained in:
commit
65bfcdb1cc
8 changed files with 70 additions and 46 deletions
|
@ -251,9 +251,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
n_past = 0;
|
||||
|
||||
if(file_format!=FileFormat::RWKV_1)
|
||||
if (file_format == FileFormat::RWKV_1)
|
||||
{
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext, false);
|
||||
}
|
||||
|
||||
//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
|
||||
|
@ -303,7 +307,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
else if(file_format == FileFormat::RWKV_1)
|
||||
{
|
||||
n_vocab = vocab.id_to_token.size(); //handled seperately
|
||||
rwkv_context_v1->state_in = nullptr;
|
||||
if(n_past==0)
|
||||
{
|
||||
rwkv_context_v1->state_in = nullptr;
|
||||
}
|
||||
else
|
||||
{
|
||||
rwkv_context_v1->state_in = rwkv_context_v1->state_out;
|
||||
//if it's empty, push in the final previous token
|
||||
if(embd_inp.size()==0 && current_context_tokens.size()>0)
|
||||
{
|
||||
embd_inp.push_back(current_context_tokens[current_context_tokens.size()-1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
11
koboldcpp.py
11
koboldcpp.py
|
@ -1,7 +1,6 @@
|
|||
# A hacky little script from Concedo that exposes llama.cpp function bindings
|
||||
# allowing it to be used via a simulated kobold api endpoint
|
||||
# it's not very usable as there is a fundamental flaw with llama.cpp
|
||||
# which causes generation delay to scale linearly with original prompt length.
|
||||
# generation delay scales linearly with original prompt length.
|
||||
|
||||
import ctypes
|
||||
import os
|
||||
|
@ -399,17 +398,17 @@ def main(args):
|
|||
root.destroy()
|
||||
if not ggml_selected_file:
|
||||
print("\nNo ggml model file was selected. Exiting.")
|
||||
time.sleep(1)
|
||||
time.sleep(2)
|
||||
sys.exit(2)
|
||||
except Exception as ex:
|
||||
print("File selection GUI unsupported. Please check command line: script.py --help")
|
||||
time.sleep(1)
|
||||
time.sleep(2)
|
||||
sys.exit(2)
|
||||
|
||||
|
||||
if not os.path.exists(ggml_selected_file):
|
||||
print(f"Cannot find model file: {ggml_selected_file}")
|
||||
time.sleep(1)
|
||||
time.sleep(2)
|
||||
sys.exit(2)
|
||||
|
||||
mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
|
||||
|
@ -420,7 +419,7 @@ def main(args):
|
|||
|
||||
if not loadok:
|
||||
print("Could not load model: " + modelname)
|
||||
time.sleep(1)
|
||||
time.sleep(2)
|
||||
sys.exit(3)
|
||||
try:
|
||||
basepath = os.path.abspath(os.path.dirname(__file__))
|
||||
|
|
|
@ -145,7 +145,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
|
|||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
n_past = 0;
|
||||
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
|
||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext,false);
|
||||
|
||||
//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
|
||||
bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas());
|
||||
|
|
|
@ -236,7 +236,8 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
}
|
||||
|
||||
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
|
||||
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, bool useSmartContext)
|
||||
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
|
||||
bool useSmartContext, const bool requireFullSubset)
|
||||
{
|
||||
const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
|
||||
const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
|
||||
|
@ -244,13 +245,11 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
|
||||
const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
|
||||
|
||||
// printf("\nORIGINAL CTX:\n");
|
||||
// print_tok_vec(current_context_tokens);
|
||||
// printf("\nORIGINAL EMBD:\n");
|
||||
// print_tok_vec(embd_inp);
|
||||
|
||||
//fast forward the past based on identical tokens, stop once a divergence is noted
|
||||
int embd_inp_len = embd_inp.size();
|
||||
bool fastforwardok = true;
|
||||
|
||||
for (int i = 0; i < current_context_tokens.size(); ++i)
|
||||
{
|
||||
if (current_context_tokens[i] == embd_inp[i])
|
||||
|
@ -260,37 +259,48 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
}
|
||||
else
|
||||
{
|
||||
if(requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
|
||||
{
|
||||
last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
|
||||
n_past = 0;
|
||||
fastforwardok = false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if ((i + 2) >= embd_inp_len)
|
||||
|
||||
if (requireFullSubset) //RWKV can only do this if embd_inp contains everything in current context
|
||||
{
|
||||
break;
|
||||
if (i >= embd_inp_len)
|
||||
{
|
||||
last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
|
||||
n_past = 0;
|
||||
fastforwardok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((i + 2) >= embd_inp_len)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
|
||||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
|
||||
embd_inp_len = embd_inp.size();
|
||||
if(fastforwardok)
|
||||
{
|
||||
last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
|
||||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_past);
|
||||
embd_inp_len = embd_inp.size();
|
||||
}
|
||||
|
||||
//smart context mode, detect if we have a shifted context at max length
|
||||
//requirement: previous context was at least nctx/2 longer than current,
|
||||
//mode is on, and current context already maxed.
|
||||
|
||||
// printf("\nconds: %d %d %d\n",current_context_tokens.size() >= nctx*0.8
|
||||
// ,embd_inp_len >= nctx*0.6 ,current_context_tokens.size() - n_past > nctx*0.5);
|
||||
// printf("csiz:%d par:%d eilen:%d np:%d",current_context_tokens.size(), (int)(nctx*0.8),embd_inp_len,n_past);
|
||||
|
||||
if (useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
|
||||
if (fastforwardok && useSmartContext && smartcontext.size() > 0 && embd_inp_len >= SCInpLenThreshold)
|
||||
{
|
||||
// printf("curfullcontext:\n");
|
||||
// print_tok_vec(current_context_tokens);
|
||||
|
||||
//see if smartcontext is still usable
|
||||
// printf("smartctx:\n");
|
||||
// print_tok_vec(smartcontext);
|
||||
// printf("embinp:\n");
|
||||
// print_tok_vec(embd_inp);
|
||||
//see if smartcontext is still usable
|
||||
auto shared = LongestCommonSubseq(smartcontext, embd_inp);
|
||||
if (shared.size() > SCTokThreshold && ArrStartWith(smartcontext, shared)) //at least 32 tokens in common
|
||||
{
|
||||
|
@ -300,8 +310,6 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
auto trimmed = std::vector<int>(embd_inp.begin() + found, embd_inp.end());
|
||||
embd_inp = trimmed;
|
||||
embd_inp_len = embd_inp.size();
|
||||
// printf("trimmed:\n");
|
||||
// print_tok_vec(embd_inp,&vocab.id_to_token);
|
||||
printf("\n[Reusing Smart Context: %d allowance remaining]", found);
|
||||
|
||||
int old_n_past = n_past;
|
||||
|
@ -313,7 +321,6 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
|
||||
for (int i = n_past; i < current_context_tokens.size(); ++i)
|
||||
{
|
||||
//printf("\n%s and %s\n",vocab.id_to_token[current_context_tokens[i]].c_str(), vocab.id_to_token[embd_inp[i-offset_fix]].c_str());
|
||||
if (current_context_tokens[i] == embd_inp[i-offset_fix])
|
||||
{
|
||||
n_past += 1;
|
||||
|
@ -331,8 +338,7 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
|
||||
last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + (n_past-old_n_past));
|
||||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + (n_past-old_n_past));
|
||||
// printf("np:%d newembinp: \n",n_past);
|
||||
// print_tok_vec(embd_inp);
|
||||
|
||||
}else{
|
||||
smartcontext.clear();
|
||||
}
|
||||
|
@ -347,17 +353,16 @@ void print_tok_vec(std::vector<float> &embd)
|
|||
smartcontext.clear();
|
||||
}
|
||||
|
||||
if(useSmartContext
|
||||
if(fastforwardok && useSmartContext
|
||||
&& smartcontext.size()==0 && current_context_tokens.size() >= SCCtxLenThreshold
|
||||
&& embd_inp_len >= SCInpLenThreshold
|
||||
&& current_context_tokens.size() - n_past > SCPastLenThreshold)
|
||||
{
|
||||
{
|
||||
//determine longest common substring after removing start part
|
||||
int shiftamt = embd_inp.size() * SCTruncationRatio;
|
||||
smartcontext = std::vector<int>(embd_inp.begin() + shiftamt, embd_inp.end());
|
||||
printf("\n[New Smart Context Triggered! Buffered Token Allowance: %d]",shiftamt);
|
||||
// printf("smartctx:\n");
|
||||
// print_tok_vec(smartcontext,&vocab.id_to_token);
|
||||
|
||||
embd_inp = smartcontext;
|
||||
//if max ctx length is exceeded, chop the prompt in half after the start part, and memorize it. The memorized part becomes LCS marker.
|
||||
//when a future prompt comes in, find the LCS again. If LCS > a length and LCS starts with memorized LCS
|
||||
|
|
|
@ -54,4 +54,5 @@ int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> se
|
|||
|
||||
FileFormat check_file_format(const std::string & fname);
|
||||
void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
|
||||
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext);
|
||||
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
|
||||
const bool useSmartContext, const bool requireFullSubset);
|
|
@ -371,7 +371,7 @@ bool gpt2_eval(
|
|||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
|
||||
static size_t buf_size = 1024u*1024*1024;
|
||||
static size_t buf_size = 1280u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||
|
|
|
@ -382,7 +382,7 @@ bool gptj_eval(
|
|||
const int d_key = n_embd/n_head;
|
||||
|
||||
//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
|
||||
static size_t buf_size = 1024u*1024*1024;
|
||||
static size_t buf_size = 1280u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
//adapted from RWKV.cpp repo under MIT license
|
||||
// https://github.com/saharNooby/rwkv.cpp
|
||||
|
||||
#include "rwkv.h"
|
||||
#include "ggml_rwkv.h"
|
||||
#include "otherarch.h"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue