fixed a few OOM errors with larger contexts - I cannot figure out why they happen, so I am forced to increase the buffer size.

Concedo 2023-04-11 00:14:57 +08:00
parent f53238f570
commit 69b85f5b61
5 changed files with 25 additions and 27 deletions


@@ -49,6 +49,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     n_threads = params.n_threads = inputs.threads;
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
+    params.memory_f16 = inputs.f16_kv;
+    params.n_ctx = inputs.max_context_length;
+    model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
     if (file_format == FileFormat::GPT2_1)
     {
@@ -153,6 +156,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     params.temp = inputs.temperature;
     params.repeat_last_n = inputs.rep_pen_range;
     params.repeat_penalty = inputs.rep_pen;
+    params.n_ctx = inputs.max_context_length;
     params.n_batch = n_batch;
     params.n_threads = n_threads;
@@ -173,23 +177,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
     //truncate to front of the prompt if its too long
-    int32_t nctx = 512;
-    if(file_format == FileFormat::GPTJ_1||file_format == FileFormat::GPTJ_2)
-    {
-        nctx = model_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPTJ_3)
-    {
-        nctx = model_v2.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_1)
-    {
-        nctx = model_gpt2_v1.hparams.n_ctx;
-    }
-    else if(file_format==FileFormat::GPT2_2)
-    {
-        nctx = model_gpt2_v2.hparams.n_ctx;
-    }
+    int32_t nctx = params.n_ctx;
     if (embd_inp.size() + params.n_predict > nctx)
     {
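
With params.n_ctx now carrying the (possibly overridden) context length for every file format, the per-format lookup above is no longer needed. The truncation that follows keeps the tail of the prompt, so the most recent tokens survive when the prompt plus the requested prediction budget exceeds nctx. A minimal sketch of that behavior, assuming plain int32_t tokens in place of gpt_vocab::id and a hypothetical helper name:

    #include <cstdint>
    #include <vector>

    // Keep at most (nctx - n_predict) tokens, dropping from the FRONT of the
    // prompt so the newest context is what the model actually sees.
    static std::vector<int32_t> truncate_prompt(std::vector<int32_t> embd_inp,
                                                int32_t nctx, int32_t n_predict)
    {
        const int32_t budget = nctx - n_predict;
        if (budget > 0 && (int32_t)embd_inp.size() > budget) {
            embd_inp.erase(embd_inp.begin(), embd_inp.end() - budget);
        }
        return embd_inp;
    }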


@@ -349,7 +349,7 @@ def main(args):
     mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
     modelname = os.path.abspath(ggml_selected_file)
     print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}]")
-    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,args.usemmap)
+    loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap))
     print("Load Model OK: " + str(loadok))
     if not loadok:
@@ -378,7 +378,7 @@ def main(args):
     RunServerMultiThreaded(args.host, args.port, embedded_kailite)
 if __name__ == '__main__':
-    print("Welcome to KoboldCpp - Version 1.3") # just update version manually
+    print("Welcome to KoboldCpp - Version 1.4") # just update version manually
     parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
     parser.add_argument("model_file", help="Model file to load", nargs="?")
     portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
@@ -396,6 +396,6 @@ if __name__ == '__main__':
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
-    parser.add_argument("--usemmap", help="Use mmap to load newer models (default false)", action='store_true')
+    parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
     args = parser.parse_args()
     main(args)


@@ -36,10 +36,13 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         }
     }
+    auto desiredMaxCtx = model.hparams.n_ctx;
     // load hparams
     {
         auto & hparams = model.hparams;
         fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
         fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
@@ -47,6 +50,9 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        //used to expand KV size if needed
+        desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
@@ -94,7 +100,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         const int n_embd = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
+        const int n_ctx = desiredMaxCtx;
         const int n_vocab = hparams.n_vocab;
         ctx_size += n_embd*ggml_v1_type_size(GGML_V1_TYPE_F32); // ln_f_g
@@ -215,7 +221,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         const int n_embd = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
+        const int n_ctx = desiredMaxCtx;
         const int n_mem = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
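
desiredMaxCtx connects this loader to the gpttype_load_model change above: the caller stashes its requested context length in model.hparams.n_ctx before the file is opened, the header read then overwrites hparams.n_ctx with the value stored in the GGML file, and the larger of the two is what actually sizes the KV cache (n_mem = n_layer*n_ctx). A rough illustration of what that buys, using made-up GPT-2-medium-like dimensions and plain arithmetic instead of ggml's allocators:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // KV cache footprint for a given context length: one K and one V value per
    // layer, per context slot, per embedding dimension, stored as 32-bit floats
    // in this legacy GPT-2 path.
    static size_t kv_cache_bytes(int n_layer, int n_ctx, int n_embd)
    {
        const size_t n_mem      = (size_t)n_layer * n_ctx;  // slots
        const size_t n_elements = (size_t)n_embd * n_mem;   // per K or V tensor
        return 2 * n_elements * sizeof(float);              // K and V
    }

    int main()
    {
        const int n_layer = 24, n_embd = 1024;        // illustrative model sizes
        const int file_ctx = 1024, requested = 2048;  // file default vs. caller
        const int desiredMaxCtx = std::max(file_ctx, requested);
        printf("KV cache at file n_ctx:    %zu MiB\n",
               kv_cache_bytes(n_layer, file_ctx, n_embd) >> 20);
        printf("KV cache at desiredMaxCtx: %zu MiB\n",
               kv_cache_bytes(n_layer, desiredMaxCtx, n_embd) >> 20);
        return 0;
    }

Doubling n_ctx doubles the cache; sizing it from the file's n_ctx alone would leave no room for contexts longer than what the model was converted with.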


@@ -81,6 +81,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         }
     }
+    auto memory_type = GGML_TYPE_F16;
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_type wtype = GGML_TYPE_COUNT;
@@ -243,8 +245,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         const int n_mem = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+        model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+        model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
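
memory_type switches the newer GPT-2 KV cache from GGML_TYPE_F32 to GGML_TYPE_F16, halving its footprint for the same context length (the exact byte count still comes from ggml_nbytes, as above). A back-of-the-envelope comparison with illustrative GPT-2-XL-like dimensions:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // n_embd * n_layer * n_ctx values for K, and the same again for V.
        const size_t n_values = 2ull * 1600 * 48 * 2048;
        printf("KV cache as F32: %zu MiB\n", n_values * 4 >> 20);  // 4 bytes per value
        printf("KV cache as F16: %zu MiB\n", n_values * 2 >> 20);  // 2 bytes per value
        return 0;
    }
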
@@ -370,7 +372,8 @@ bool gpt2_eval(
     const int n_head = hparams.n_head;
     const int n_vocab = hparams.n_vocab;
-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {


@@ -378,7 +378,8 @@ bool gptj_eval(
     const int d_key = n_embd/n_head;
-    static size_t buf_size = 256u*1024*1024;
+    //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+    static size_t buf_size = 1024u*1024*1024;
     static void * buf = malloc(buf_size);
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
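
The last two hunks are the "increase the buffer size" part of the commit message: the static scratch buffer used by gpt2_eval and gptj_eval grows from 256 MiB to 1 GiB, because something about larger contexts still overruns it even though the mem_per_token check at the end of each hunk is meant to grow it on demand. A sketch of how that grow-on-demand pattern generally looks; ensure_scratch is a hypothetical helper, and only the if-condition itself comes from the code above:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    // If the per-token estimate times the number of tokens in this evaluation
    // exceeds the current capacity, grow the buffer (with a little headroom)
    // before building the compute graph.
    static void * ensure_scratch(void * buf, size_t & buf_size,
                                 size_t mem_per_token, size_t n_tokens)
    {
        if (mem_per_token > 0 && mem_per_token * n_tokens > buf_size) {
            const size_t needed = (size_t)(1.1 * mem_per_token * n_tokens); // ~10% headroom
            void * grown = realloc(buf, needed);
            if (grown == nullptr) {
                fprintf(stderr, "failed to grow scratch buffer to %zu bytes\n", needed);
                exit(1);
            }
            buf = grown;
            buf_size = needed;
        }
        return buf;
    }

    int main()
    {
        size_t buf_size = 256u * 1024 * 1024;  // the old default
        void * buf = malloc(buf_size);
        // mem_per_token is measured on the first evaluation; a large context can
        // push the estimate well past the initial allocation on later calls.
        buf = ensure_scratch(buf, buf_size, /*mem_per_token=*/600 * 1024, /*n_tokens=*/2048);
        printf("scratch buffer is now %zu MiB\n", buf_size >> 20);
        free(buf);
        return 0;
    }

Bumping the static default to 1 GiB sidesteps whatever case that check misses, at the cost of a larger idle allocation.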