diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 1f4537e8c..e9ad7114d 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -157,18 +157,21 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     stop_sequence.clear();
     for(int x=0;x<stop_token_max;++x)
     {
         std::string stopper = inputs.stop_sequence[x];
         if(stopper!="")
         {
             stop_sequence.push_back(stopper);
         }
     }
-    //bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas());
-    bool blasmode = false;
+    bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
+    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
+    // bool blasmode = false;
     int original_batch = params.n_batch;
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
+        //for gpttype, GPT2 crashes above 256.
+        int bbs = (blasbatchsize>256?256:blasbatchsize);
+        params.n_batch = bbs; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }
@@ -350,7 +353,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             if (concat_output.find(matched) != std::string::npos)
             {
                 remaining_tokens = 0;
-                printf("\n(Stop sequence triggered)");
+                printf("\n(Stop sequence triggered: %s)",matched.c_str());
                 break;
             }
         }
diff --git a/klite.embd b/klite.embd
index 76ca58248..8a06baa6e 100644
--- a/klite.embd
+++ b/klite.embd
@@ -27,8 +27,8 @@
 Kobold Lite is under the AGPL v3.0 License for the purposes of koboldcpp. Please
-
+
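
The gpttype_adapter.cpp hunks above do two things: gate BLAS mode on an approved file format, and clamp the BLAS batch size to 256 for these model types. A minimal standalone sketch of that gating and clamping logic, using illustrative stand-ins for the patch's FileFormat enum and for ggml_cpu_has_blas() (effective_batch is a hypothetical helper, not part of the patch):

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Illustrative stand-in for the FileFormat enum referenced in the patch.
enum class FileFormat { GPT2_1, GPT2_2, GPTJ_1, GPTJ_2, GPTJ_3 };

// Returns the batch size the adapter would actually use: BLAS mode needs an
// approved format, a prompt of at least 32 tokens, and BLAS support, and the
// batch is then clamped to 256 because GPT-2 reportedly crashes above that.
static int effective_batch(FileFormat fmt, size_t prompt_tokens, bool has_blas,
                           int blasbatchsize, int default_batch)
{
    bool approved_format = (fmt != FileFormat::GPT2_1 &&
                            fmt != FileFormat::GPTJ_1 &&
                            fmt != FileFormat::GPTJ_2);
    bool blasmode = (approved_format && prompt_tokens >= 32 && has_blas);
    if (!blasmode)
    {
        return default_batch; // BLAS path not taken; keep the normal batch
    }
    return std::min(blasbatchsize, 256); // same clamp as the patch
}

int main()
{
    // Approved format, long prompt, BLAS available: 512 is clamped to 256.
    printf("%d\n", effective_batch(FileFormat::GPTJ_3, 100, true, 512, 8));
    // Legacy (unapproved) format: falls back to the default batch of 8.
    printf("%d\n", effective_batch(FileFormat::GPTJ_1, 100, true, 512, 8));
    return 0;
}
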
diff --git a/koboldcpp.py b/koboldcpp.py
index 0fbdd2e71..e98b650bb 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -453,7 +453,7 @@ if __name__ == '__main__':
     default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
-    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
+    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[64,128,256,512,1024], default=512)
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index 861d87422..f1d9a1893 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -240,13 +240,13 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
             // decrement remaining sampling budget
             --remaining_tokens;
             //printf("\nid:%d word:%s\n",id,llama_token_to_str(ctx, id));
-            concat_output += llama_token_to_str(ctx, id);
+            concat_output += llama_token_to_str(ctx, id);
             for (const auto &matched : stop_sequence)
             {
                 if (concat_output.find(matched) != std::string::npos)
                 {
                     remaining_tokens = 0;
-                    printf("\n(Stop sequence triggered)");
+                    printf("\n(Stop sequence triggered: %s)",matched.c_str());
                     break;
                 }
             }
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index 5e3d625c4..859774c99 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -375,7 +375,7 @@ bool gpt2_eval(
     static void * buf = malloc(buf_size);

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 2*(mem_per_token*N); // grow to 2x the requirement to account for ggml object overhead
         //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

         // reallocate
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index f02d9cfcd..62cf224b2 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -386,7 +386,7 @@ bool gptj_eval(
     static void * buf = malloc(buf_size);

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 1.5*(mem_per_token*N); // grow to 1.5x the requirement to account for ggml object overhead
         //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

         // reallocate
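
Both otherarch hunks above replace the 10% headroom factor with a larger one (2x in gpt2_v2.cpp, 1.5x in gptj_v2.cpp) when the evaluation scratch buffer must grow, presumably because ggml object overhead could outgrow the old margin. A minimal sketch of that reallocation pattern with the growth factor as a parameter; grow_buffer is a hypothetical helper, not part of the patch:

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Grows the scratch buffer when the projected requirement
// (mem_per_token * n_tokens) exceeds the current capacity, multiplying by
// `factor` for headroom -- 2.0 on the GPT-2 path, 1.5 on the GPT-J path.
static bool grow_buffer(void **buf, size_t *buf_size,
                        size_t mem_per_token, size_t n_tokens, double factor)
{
    if (mem_per_token > 0 && mem_per_token * n_tokens > *buf_size)
    {
        size_t buf_size_new = (size_t)(factor * (double)(mem_per_token * n_tokens));
        void *tmp = realloc(*buf, buf_size_new);
        if (tmp == nullptr)
        {
            fprintf(stderr, "failed to allocate %zu bytes\n", buf_size_new);
            return false; // the old buffer is still valid and still owned
        }
        *buf = tmp;
        *buf_size = buf_size_new;
    }
    return true;
}

int main()
{
    size_t buf_size = 1024;
    void *buf = malloc(buf_size);
    // 100 tokens at 64 bytes each need 6400 bytes; with factor 2.0 the
    // buffer grows to 12800 bytes, leaving slack for later, larger calls.
    if (grow_buffer(&buf, &buf_size, 64, 100, 2.0))
    {
        printf("buffer is now %zu bytes\n", buf_size);
    }
    free(buf);
    return 0;
}

A larger factor trades some memory for fewer reallocations: with only 10% slack, a modest increase in token count on the next eval call would force another realloc, while 1.5x-2x leaves room for ggml's per-object overhead and for somewhat larger batches.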