Fixed stopper tokens, fixed BLAS mode for GPT2 and GPTJ, updated Kobold Lite
This commit is contained in:
parent
6548d3b3fb
commit
c757fbee1d
6 changed files with 17 additions and 14 deletions
|
@ -157,9 +157,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
stop_sequence.clear();
|
stop_sequence.clear();
|
||||||
for(int x=0;x<stop_token_max;++x)
|
for(int x=0;x<stop_token_max;++x)
|
||||||
{
|
{
|
||||||
if(inputs.stop_sequence[x]!="")
|
std::string stopper = inputs.stop_sequence[x];
|
||||||
|
if(stopper!="")
|
||||||
{
|
{
|
||||||
stop_sequence.push_back(inputs.stop_sequence[x]);
|
stop_sequence.push_back(stopper);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
params.prompt = inputs.prompt;
|
params.prompt = inputs.prompt;
|
||||||
|
@ -211,14 +212,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
|
ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
|
||||||
|
|
||||||
//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
|
//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
|
||||||
// bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
|
bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
|
||||||
// bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
|
bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
|
||||||
bool blasmode = false;
|
// bool blasmode = false;
|
||||||
int original_batch = params.n_batch;
|
int original_batch = params.n_batch;
|
||||||
int original_threads = params.n_threads;
|
int original_threads = params.n_threads;
|
||||||
if (blasmode)
|
if (blasmode)
|
||||||
{
|
{
|
||||||
params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
|
//for gpttype, GPT2 crashes above 256.
|
||||||
|
int bbs = (blasbatchsize>256?256:blasbatchsize);
|
||||||
|
params.n_batch = bbs; //received reports of 1024 and above crashing on some models
|
||||||
params.n_threads = 1;
|
params.n_threads = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -350,7 +353,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
if (concat_output.find(matched) != std::string::npos)
|
if (concat_output.find(matched) != std::string::npos)
|
||||||
{
|
{
|
||||||
remaining_tokens = 0;
|
remaining_tokens = 0;
|
||||||
printf("\n(Stop sequence triggered)");
|
printf("\n(Stop sequence triggered: %s)",matched.c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -453,7 +453,7 @@ if __name__ == '__main__':
|
||||||
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
|
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
|
||||||
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
|
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
|
||||||
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
|
||||||
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
|
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[64,128,256,512,1024], default=512)
|
||||||
parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
|
parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
|
||||||
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
|
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
|
||||||
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
|
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
|
||||||
|
|
|
@ -246,7 +246,7 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
|
||||||
if (concat_output.find(matched) != std::string::npos)
|
if (concat_output.find(matched) != std::string::npos)
|
||||||
{
|
{
|
||||||
remaining_tokens = 0;
|
remaining_tokens = 0;
|
||||||
printf("\n(Stop sequence triggered)");
|
printf("\n(Stop sequence triggered: %s)",matched.c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -375,7 +375,7 @@ bool gpt2_eval(
|
||||||
static void * buf = malloc(buf_size);
|
static void * buf = malloc(buf_size);
|
||||||
|
|
||||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||||
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
const size_t buf_size_new = 2*(mem_per_token*N); // allocate 2x the estimated requirement (mem_per_token*N) as headroom for ggml object overhead
|
||||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||||
|
|
||||||
// reallocate
|
// reallocate
|
||||||
|
|
|
@ -386,7 +386,7 @@ bool gptj_eval(
|
||||||
static void * buf = malloc(buf_size);
|
static void * buf = malloc(buf_size);
|
||||||
|
|
||||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||||
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
const size_t buf_size_new = 1.5*(mem_per_token*N); // allocate 1.5x the estimated requirement (mem_per_token*N), i.e. 50% extra, to account for ggml object overhead
|
||||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||||
|
|
||||||
// reallocate
|
// reallocate
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue