Fixes to stopper tokens, fixed BLAS mode for GPT2 and GPTJ, updated Kobold Lite
parent 6548d3b3fb
commit c757fbee1d
6 changed files with 17 additions and 14 deletions
@@ -157,9 +157,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     stop_sequence.clear();
     for(int x=0;x<stop_token_max;++x)
     {
-        if(inputs.stop_sequence[x]!="")
+        std::string stopper = inputs.stop_sequence[x];
+        if(stopper!="")
         {
-            stop_sequence.push_back(inputs.stop_sequence[x]);
+            stop_sequence.push_back(stopper);
         }
     }
     params.prompt = inputs.prompt;
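For context, the old code indexed inputs.stop_sequence[x] twice, while the new code copies it into a named local first. Below is a minimal standalone sketch of the new collection logic, with the surrounding types stubbed out as assumptions (stop_token_max and generation_inputs here are stand-ins for the real definitions, not taken from this commit):

#include <string>
#include <vector>

// Hypothetical stand-ins for the real definitions in the adapter.
static const int stop_token_max = 10;
struct generation_inputs {
    std::string stop_sequence[stop_token_max];
};

// Collect each non-empty stop sequence into a vector, as the patch now does.
std::vector<std::string> collect_stop_sequences(const generation_inputs &inputs)
{
    std::vector<std::string> stop_sequence;
    for (int x = 0; x < stop_token_max; ++x)
    {
        std::string stopper = inputs.stop_sequence[x];
        if (stopper != "")
        {
            stop_sequence.push_back(stopper);
        }
    }
    return stop_sequence;
}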
@@ -211,14 +212,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);

     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
-    // bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
-    // bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
-    bool blasmode = false;
+    bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
+    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
+    // bool blasmode = false;
     int original_batch = params.n_batch;
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        params.n_batch = blasbatchsize; //received reports of 1024 and above crashing on some models
+        //for gpttype, GPT2 crashes above 256.
+        int bbs = (blasbatchsize>256?256:blasbatchsize);
+        params.n_batch = bbs; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }

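This hunk both re-enables BLAS mode for approved formats and clamps the batch size, since GPT-2 reportedly crashes above 256. A sketch of the clamp in isolation (clamp_blas_batch is an illustrative name, not a function in the source):

#include <algorithm>

// Cap the user-supplied BLAS batch size at 256, matching the ternary
// (blasbatchsize>256?256:blasbatchsize) introduced in the hunk above.
int clamp_blas_batch(int blasbatchsize)
{
    return std::min(blasbatchsize, 256);
}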
@@ -350,7 +353,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             if (concat_output.find(matched) != std::string::npos)
             {
                 remaining_tokens = 0;
-                printf("\n(Stop sequence triggered)");
+                printf("\n(Stop sequence triggered: %s)",matched.c_str());
                 break;
             }
         }
File diff suppressed because one or more lines are too long
@@ -453,7 +453,7 @@ if __name__ == '__main__':
    default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
    parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
    parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
-    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
+    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[64,128,256,512,1024], default=512)
    parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
    parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
    parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
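The only change here is adding 64 as an accepted --blasbatchsize value, which pairs with the GPT-2 clamp above; at launch this would presumably be passed as --blasbatchsize 64 alongside the script's other flags.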
@@ -240,13 +240,13 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
         // decrement remaining sampling budget
         --remaining_tokens;
         //printf("\nid:%d word:%s\n",id,llama_token_to_str(ctx, id));
-        concat_output += llama_token_to_str(ctx, id);
+        concat_output += llama_token_to_str(ctx, id);
         for (const auto &matched : stop_sequence)
         {
             if (concat_output.find(matched) != std::string::npos)
             {
                 remaining_tokens = 0;
-                printf("\n(Stop sequence triggered)");
+                printf("\n(Stop sequence triggered: %s)",matched.c_str());
                 break;
             }
         }
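Both the gpttype and llama adapters now report which stop string fired. A self-contained sketch of the shared check (check_stop is an illustrative helper, not a function in the source):

#include <cstdio>
#include <string>
#include <vector>

// Scan the accumulated output for any stop string; print the one that
// matched and signal the caller to halt generation.
bool check_stop(const std::string &concat_output,
                const std::vector<std::string> &stop_sequence)
{
    for (const auto &matched : stop_sequence)
    {
        if (concat_output.find(matched) != std::string::npos)
        {
            printf("\n(Stop sequence triggered: %s)", matched.c_str());
            return true;
        }
    }
    return false;
}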
@@ -375,7 +375,7 @@ bool gpt2_eval(
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 2*(mem_per_token*N); // grow 2x to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
@@ -386,7 +386,7 @@ bool gptj_eval(
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 1.5*(mem_per_token*N); // grow 1.5x to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
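These last two hunks raise the reallocation headroom from 1.1x to 2x (GPT-2) and 1.5x (GPT-J). A sketch of the pattern with the factor made an explicit parameter (grow_eval_buffer is an illustrative name; buf and buf_size are function-local statics in the real code):

#include <cstddef>
#include <cstdlib>

// Grow the scratch buffer when the projected memory for N tokens exceeds
// its current size, applying a safety factor on top (2.0 for GPT-2 and
// 1.5 for GPT-J in this commit) before reallocating.
void *grow_eval_buffer(void *buf, size_t &buf_size,
                       size_t mem_per_token, size_t N, double headroom)
{
    if (mem_per_token > 0 && mem_per_token * N > buf_size) {
        buf_size = (size_t)(headroom * (double)(mem_per_token * N));
        void *grown = realloc(buf, buf_size);
        if (grown != NULL) {
            buf = grown;
        }
    }
    return buf;
}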