Fixes to stopper tokens, fixed BLAS mode for GPT2 and GPTJ, updated Kobold Lite
parent 6548d3b3fb
commit c757fbee1d
6 changed files with 17 additions and 14 deletions
@@ -157,9 +157,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     stop_sequence.clear();
     for(int x=0;x<stop_token_max;++x)
     {
-        if(inputs.stop_sequence[x]!="")
+        std::string stopper = inputs.stop_sequence[x];
+        if(stopper!="")
         {
-            stop_sequence.push_back(inputs.stop_sequence[x]);
+            stop_sequence.push_back(stopper);
         }
     }
     params.prompt = inputs.prompt;
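For context, the old code indexed inputs.stop_sequence[x] twice, while the new code copies it into a named local first. Below is a minimal standalone sketch of the new collection logic, with the surrounding types stubbed out as assumptions (stop_token_max and generation_inputs here are stand-ins for the real definitions, not taken from this commit):

#include <string>
#include <vector>

// Hypothetical stand-ins for the real definitions in the adapter.
static const int stop_token_max = 10;
struct generation_inputs {
    std::string stop_sequence[stop_token_max];
};

// Collect each non-empty stop sequence into a vector, as the patch now does.
std::vector<std::string> collect_stop_sequences(const generation_inputs &inputs)
{
    std::vector<std::string> stop_sequence;
    for (int x = 0; x < stop_token_max; ++x)
    {
        std::string stopper = inputs.stop_sequence[x];
        if (stopper != "")
        {
            stop_sequence.push_back(stopper);
        }
    }
    return stop_sequence;
}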
@@ -211,14 +212,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);

     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
-    // bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
-    // bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
-    bool blasmode = false;
+    bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
+    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
+    // bool blasmode = false;
     int original_batch = params.n_batch;
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
+        //for gpttype, GPT2 crashes above 256.
+        int bbs = (blasbatchsize>256?256:blasbatchsize);
+        params.n_batch = bbs; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }

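This hunk both re-enables BLAS mode for approved formats and clamps the batch size, since GPT-2 reportedly crashes above 256. A sketch of the clamp in isolation (clamp_blas_batch is an illustrative name, not a function in the source):

#include <algorithm>

// Cap the user-supplied BLAS batch size at 256, matching the ternary
// (blasbatchsize>256?256:blasbatchsize) introduced in the hunk above.
int clamp_blas_batch(int blasbatchsize)
{
    return std::min(blasbatchsize, 256);
}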
@@ -350,7 +353,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             if (concat_output.find(matched) != std::string::npos)
             {
                 remaining_tokens = 0;
-                printf("\n(Stop sequence triggered)");
+                printf("\n(Stop sequence triggered: %s)",matched.c_str());
                 break;
             }
         }
File diff suppressed because one or more lines are too long
@@ -453,7 +453,7 @@ if __name__ == '__main__':
    default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
    parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
    parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
-    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
+    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[64,128,256,512,1024], default=512)
    parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
    parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
    parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
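The only change here is adding 64 as an accepted --blasbatchsize value, which pairs with the GPT-2 clamp above; at launch this would presumably be passed as --blasbatchsize 64 alongside the script's other flags.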
@@ -240,13 +240,13 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
         // decrement remaining sampling budget
         --remaining_tokens;
         //printf("\nid:%d word:%s\n",id,llama_token_to_str(ctx, id));
-        concat_output += llama_token_to_str(ctx, id);
+        concat_output += llama_token_to_str(ctx, id);
         for (const auto &matched : stop_sequence)
         {
             if (concat_output.find(matched) != std::string::npos)
             {
                 remaining_tokens = 0;
-                printf("\n(Stop sequence triggered)");
+                printf("\n(Stop sequence triggered: %s)",matched.c_str());
                 break;
             }
         }
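Both the gpttype and llama adapters now report which stop string fired. A self-contained sketch of the shared check (check_stop is an illustrative helper, not a function in the source):

#include <cstdio>
#include <string>
#include <vector>

// Scan the accumulated output for any stop string; print the one that
// matched and signal the caller to halt generation.
bool check_stop(const std::string &concat_output,
                const std::vector<std::string> &stop_sequence)
{
    for (const auto &matched : stop_sequence)
    {
        if (concat_output.find(matched) != std::string::npos)
        {
            printf("\n(Stop sequence triggered: %s)", matched.c_str());
            return true;
        }
    }
    return false;
}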
@@ -375,7 +375,7 @@ bool gpt2_eval(
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 2*(mem_per_token*N); // grow 2x to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
@@ -386,7 +386,7 @@ bool gptj_eval(
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 1.5*(mem_per_token*N); // grow 1.5x to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
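These last two hunks raise the reallocation headroom from 1.1x to 2x (GPT-2) and 1.5x (GPT-J). A sketch of the pattern with the factor made an explicit parameter (grow_eval_buffer is an illustrative name; buf and buf_size are function-local statics in the real code):

#include <cstddef>
#include <cstdlib>

// Grow the scratch buffer when the projected memory for N tokens exceeds
// its current size, applying a safety factor on top (2.0 for GPT-2 and
// 1.5 for GPT-J in this commit) before reallocating.
void *grow_eval_buffer(void *buf, size_t &buf_size,
                       size_t mem_per_token, size_t N, double headroom)
{
    if (mem_per_token > 0 && mem_per_token * N > buf_size) {
        buf_size = (size_t)(headroom * (double)(mem_per_token * N));
        void *grown = realloc(buf, buf_size);
        if (grown != NULL) {
            buf = grown;
        }
    }
    return buf;
}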