Hide the GPU layers input box unless a CLBlast GPU option is selected in the dropdown; minor memory fix for NeoX and GPT-J

2023-06-04 21:47:17 +08:00 · 2023-06-04 21:47:17 +08:00 · 9aa2d8535b
commit 9aa2d8535b
parent 1ddbb9acd9
3 changed files with 22 additions and 10 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -459,7 +459,14 @@ def show_gui():
        opts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"]
        runchoice = tk.StringVar()
        runchoice.set("Use OpenBLAS")
-        tk.OptionMenu( root , runchoice , *opts ).grid(row=2,column=0)
+        def onDropdownChange(event):
            sel = runchoice.get()
            if sel==opts[1] or sel==opts[2] or sel==opts[3]:
                frm1.grid(row=4,column=0,pady=4)
            else:
                frm1.grid_forget()
            pass
        tk.OptionMenu( root , runchoice , command = onDropdownChange ,*opts ).grid(row=2,column=0)
        frm2 = tk.Frame(root)
@ -479,6 +486,7 @@ def show_gui():
        gpu_lbl.grid(row=0,column=0)
        gpu_layers_input.grid(row=0,column=1)
        frm1.grid(row=4,column=0,pady=4)
        onDropdownChange(None)
        stream = tk.IntVar()
        smartcontext = tk.IntVar()
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@ -368,7 +368,7 @@ bool gptj_eval(
    static void * buf = malloc(buf_size);
    if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
-        const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 320u*1024*1024 + 1.7*(mem_per_token*N); // add 10% to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
        // reallocate
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@ -138,8 +138,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
        ctx_size += (6 + 16*n_layer)*512; // object overhead
@ -410,15 +410,19 @@ bool gpt_neox_eval(
    static void * buf = malloc(buf_size);
    if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
-        const size_t buf_size_new = 360u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 360u*1024*1024 + 1.7*(mem_per_token*N); // add 10% to account for ggml object overhead
        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
        // reallocate
-        buf_size = buf_size_new;
+        if (buf_size_new > buf_size)
-        buf = realloc(buf, buf_size);
+        {
-        if (buf == nullptr) {
+            buf_size = buf_size_new;
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            buf = realloc(buf, buf_size);
-            return false;
+            if (buf == nullptr)
            {
                fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
                return false;
            }
        }
    }