fixes to stopper tokens, fixed BLAS mode for GPT2 and GPTJ, updated kobold lite

Concedo 2023-04-16 21:54:18 +08:00
parent 6548d3b3fb
commit c757fbee1d
6 changed files with 17 additions and 14 deletions


@@ -157,9 +157,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     stop_sequence.clear();
     for(int x=0;x<stop_token_max;++x)
     {
-        if(inputs.stop_sequence[x]!="")
+        std::string stopper = inputs.stop_sequence[x];
+        if(stopper!="")
         {
-            stop_sequence.push_back(inputs.stop_sequence[x]);
+            stop_sequence.push_back(stopper);
         }
     }
     params.prompt = inputs.prompt;
@@ -211,14 +212,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
-    // bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
-    // bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
-    bool blasmode = false;
+    bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
+    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
+    // bool blasmode = false;
     int original_batch = params.n_batch;
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
+        //for gpttype, GPT2 crashes above 256.
+        int bbs = (blasbatchsize>256?256:blasbatchsize);
+        params.n_batch = bbs; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }
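
The net effect of this hunk is easier to see in isolation. Below is a minimal sketch of the re-enabled BLAS gating plus the new batch clamp; apply_blas_params is a hypothetical helper written for illustration, not a function in the repository, and it omits the ggml_cpu_has_blas() check shown in the diff.

#include <algorithm>

// Hypothetical condensation of the logic above: BLAS mode is allowed for
// every format except the oldest GPT-2/GPT-J ones, and when it is active
// the batch is capped at 256 (larger batches reportedly crash GPT-2)
// while computation drops to a single thread.
static void apply_blas_params(bool approved_format, size_t prompt_tokens,
                              int blasbatchsize, int &n_batch, int &n_threads)
{
    bool blasmode = approved_format && prompt_tokens >= 32; // real code also requires BLAS support
    if (blasmode)
    {
        n_batch = std::min(blasbatchsize, 256);
        n_threads = 1;
    }
}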
@@ -350,7 +353,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         if (concat_output.find(matched) != std::string::npos)
         {
             remaining_tokens = 0;
-            printf("\n(Stop sequence triggered)");
+            printf("\n(Stop sequence triggered: %s)",matched.c_str());
             break;
         }
     }
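
For context, the loop this hunk sits in (the llama path further down gets the same change) works roughly like the sketch below; check_stop_sequences is a hypothetical standalone rendering of the inline logic.

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical standalone version of the inline check: as soon as any stop
// string occurs anywhere in the accumulated output, the sampling budget is
// zeroed and the matched sequence is echoed, which is what the hunk above
// adds to the log line.
static bool check_stop_sequences(const std::string &concat_output,
                                 const std::vector<std::string> &stop_sequence,
                                 int &remaining_tokens)
{
    for (const auto &matched : stop_sequence)
    {
        if (concat_output.find(matched) != std::string::npos)
        {
            remaining_tokens = 0;
            printf("\n(Stop sequence triggered: %s)", matched.c_str());
            return true;
        }
    }
    return false;
}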

File diff suppressed because one or more lines are too long


@@ -453,7 +453,7 @@ if __name__ == '__main__':
     default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutil to determine thread count based on physical cores.", action='store_true')
-    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
+    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[64,128,256,512,1024], default=512)
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')


@@ -240,13 +240,13 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
         // decrement remaining sampling budget
         --remaining_tokens;
         //printf("\nid:%d word:%s\n",id,llama_token_to_str(ctx, id));
-        concat_output += llama_token_to_str(ctx, id);
+        concat_output += llama_token_to_str(ctx, id);
         for (const auto &matched : stop_sequence)
         {
             if (concat_output.find(matched) != std::string::npos)
             {
                 remaining_tokens = 0;
-                printf("\n(Stop sequence triggered)");
+                printf("\n(Stop sequence triggered: %s)",matched.c_str());
                 break;
             }
         }


@@ -375,7 +375,7 @@ bool gpt2_eval(
     static void * buf = malloc(buf_size);
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 2*(mem_per_token*N); // double it to account for ggml object overhead
         //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
         // reallocate


@@ -386,7 +386,7 @@ bool gptj_eval(
     static void * buf = malloc(buf_size);
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 1.5*(mem_per_token*N); // add 50% to account for ggml object overhead
         //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
         // reallocate
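
Both eval hunks tweak the same scratch-buffer pattern; the sketch below generalizes it with the growth factor as a parameter (2x for GPT-2, 1.5x for GPT-J in this commit). grow_scratch_buffer is a hypothetical helper, not code from the repository.

#include <cstdlib>

// Hypothetical generalization of the reallocation logic above: once the
// measured per-token memory use implies the current buffer is too small,
// regrow it with some headroom so ggml's object overhead still fits.
static void *grow_scratch_buffer(void *buf, size_t &buf_size,
                                 size_t mem_per_token, size_t n_tokens,
                                 double headroom)
{
    if (mem_per_token > 0 && mem_per_token * n_tokens > buf_size)
    {
        buf_size = (size_t)(headroom * (double)(mem_per_token * n_tokens));
        buf = realloc(buf, buf_size); // the real code also checks the result for NULL
    }
    return buf;
}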