diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 20e4b4eed..47e325fe7 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5827,8 +5827,8 @@ void ggml_init_cublas() {
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: %s\n", __func__,(g_mul_mat_q?"yes":"maybe"));
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: %s\n", __func__,(g_mul_mat_q?"no":"maybe"));
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: %s\n", __func__,"maybe");
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: %s\n", __func__,"maybe");
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 2cc8bd693..d7ee627cd 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -1902,7 +1902,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int realnpredict = params.n_predict-stopper_unused_tokens;
     float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
     float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
-    printf("\nContextLimit: %d/%d, Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
+    printf("\nContextLimit: %d/%d, Processing:%.2fs (%.1fms/T), Generation:%.2fs (%.1fms/T), Total:%.2fs (%.2fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
     fflush(stdout);
     output.status = 1;
     generation_finished = true;
diff --git a/koboldcpp.py b/koboldcpp.py
index de7aa1296..4569f51cc 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -392,6 +392,7 @@ rewardcounter = 0 #reduces error counts for successful jobs
 totalgens = 0
 currentusergenkey = "" #store a special key so polled streaming works even in multiuser
 args = None #global args
+gui_layers_untouched = True
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -1129,18 +1130,30 @@ def show_new_gui():
                     MaxMemory[0] = max(int(FetchedCUdeviceMem[idx])*1024*1024,MaxMemory[0])
                     pass
 
+        #autopick cublas if suitable
+        global exitcounter
+        if exitcounter < 100 and MaxMemory[0]>3500000000 and CUDevicesNames[0]!="" and "Use CuBLAS" in runopts and runopts_var.get()=="Use OpenBLAS":
+            runopts_var.set("Use CuBLAS")
+            pass
+        changed_gpu_choice_var()
         return
 
 
     def autoset_gpu_layers(filepath): #shitty algo to determine how many layers to use
         try:
+            global gui_layers_untouched
             fsize = os.path.getsize(filepath)
             if fsize>10000000: #dont bother with models < 10mb
                 mem = MaxMemory[0]
                 sizeperlayer = fsize*0.05714
                 layerlimit = int(min(200,mem/sizeperlayer))
-                if (gpulayers_var.get()=="" or gpulayers_var.get()=="0") and layerlimit>0:
+                old_gui_layers_untouched = gui_layers_untouched
+                gui_layers_zeroed = gpulayers_var.get()=="" or gpulayers_var.get()=="0"
+                if (gui_layers_untouched or gui_layers_zeroed) and layerlimit>0:
                     gpulayers_var.set(str(layerlimit))
+                    gui_layers_untouched = old_gui_layers_untouched
+                    if gui_layers_zeroed:
+                        gui_layers_untouched = True
         except Exception as ex:
             pass
 
@@ -1173,6 +1186,11 @@ def show_new_gui():
     num_backends_built.bind("<Enter>", lambda event: show_tooltip(event, f"Number of backends you have built and available." + (f"\n\nMissing Backends: \n\n{nl.join(antirunopts)}" if len(runopts) != 6 else "")))
     num_backends_built.bind("<Leave>", hide_tooltip)
 
+    def changed_gpulayers(*args):
+        global gui_layers_untouched
+        gui_layers_untouched = False
+        pass
+
     def changed_gpu_choice_var(*args):
         global exitcounter
         if exitcounter > 100:
@@ -1194,6 +1212,7 @@ def show_new_gui():
             gpuname_label.configure(text="")
 
     gpu_choice_var.trace("w", changed_gpu_choice_var)
+    gpulayers_var.trace("w", changed_gpulayers)
 
     def changerunmode(a,b,c):
         index = runopts_var.get()
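Note: the layer-count heuristic added in autoset_gpu_layers treats roughly 5.7% of the model file size as the VRAM cost of one layer, divides the detected free GPU memory by that, and caps the result at 200; the gui_layers_untouched flag (cleared by the changed_gpulayers trace callback) keeps the autoset from overwriting a value the user typed in by hand. A minimal standalone sketch of that heuristic, with a hypothetical helper name and the same constants as the patch:

# Illustrative sketch only -- estimate_gpu_layers is a hypothetical name,
# not part of koboldcpp.py; the constants mirror autoset_gpu_layers above.
def estimate_gpu_layers(model_bytes: int, free_vram_bytes: int) -> int:
    if model_bytes <= 10_000_000:            # skip files under ~10 MB, as the patch does
        return 0
    size_per_layer = model_bytes * 0.05714   # rough per-layer VRAM cost
    return int(min(200, free_vram_bytes / size_per_layer))

# Example: a 7 GB model with 8 GB of free VRAM -> 20 layers
print(estimate_gpu_layers(7_000_000_000, 8_000_000_000))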