autopick cublas in gui if possible, better layer picking logic

Concedo 2023-11-05 01:35:27 +08:00
parent 7a8c0df2e5
commit 1e7088a80b
3 changed files with 23 additions and 4 deletions

ggml-cuda.cu

@@ -5827,8 +5827,8 @@ void ggml_init_cublas() {
     CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: %s\n", __func__,(g_mul_mat_q?"yes":"maybe"));
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: %s\n", __func__,(g_mul_mat_q?"no":"maybe"));
+    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: %s\n", __func__,"maybe");
+    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: %s\n", __func__,"maybe");
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;

gpttype_adapter.cpp

@@ -1902,7 +1902,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int realnpredict = params.n_predict-stopper_unused_tokens;
     float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
     float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
-    printf("\nContextLimit: %d/%d, Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
+    printf("\nContextLimit: %d/%d, Processing:%.2fs (%.1fms/T), Generation:%.2fs (%.1fms/T), Total:%.2fs (%.2fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
     fflush(stdout);
     output.status = 1;
     generation_finished = true;
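
The hunk above only changes display precision of the timing summary. A quick worked check of the old versus new format strings, using illustrative numbers (100 generated tokens, 1.5 s of prompt processing, 8.5 s of generation — not values from the commit):

    realnpredict, time1, time2 = 100, 1.5, 8.5
    pt2 = time2 * 1000.0 / realnpredict   # 85.0 ms per generated token
    tps = realnpredict / (time1 + time2)  # 10.0 tokens per second

    # Old format: one decimal for seconds and T/s, none for ms/T
    print("old: Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)" % (time2, pt2, time1 + time2, tps))
    # -> old: Generation:8.5s (85ms/T), Total:10.0s (10.0T/s)

    # New format: one extra decimal everywhere
    print("new: Generation:%.2fs (%.1fms/T), Total:%.2fs (%.2fT/s)" % (time2, pt2, time1 + time2, tps))
    # -> new: Generation:8.50s (85.0ms/T), Total:10.00s (10.00T/s)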

koboldcpp.py

@@ -392,6 +392,7 @@ rewardcounter = 0 #reduces error counts for successful jobs
 totalgens = 0
 currentusergenkey = "" #store a special key so polled streaming works even in multiuser
 args = None #global args
+gui_layers_untouched = True

 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -1129,18 +1130,30 @@ def show_new_gui():
                 MaxMemory[0] = max(int(FetchedCUdeviceMem[idx])*1024*1024,MaxMemory[0])
         pass

+        #autopick cublas if suitable
+        global exitcounter
+        if exitcounter < 100 and MaxMemory[0]>3500000000 and CUDevicesNames[0]!="" and "Use CuBLAS" in runopts and runopts_var.get()=="Use OpenBLAS":
+            runopts_var.set("Use CuBLAS")
+            pass
         changed_gpu_choice_var()
         return

     def autoset_gpu_layers(filepath): #shitty algo to determine how many layers to use
         try:
+            global gui_layers_untouched
             fsize = os.path.getsize(filepath)
             if fsize>10000000: #dont bother with models < 10mb
                 mem = MaxMemory[0]
                 sizeperlayer = fsize*0.05714
                 layerlimit = int(min(200,mem/sizeperlayer))
-                if (gpulayers_var.get()=="" or gpulayers_var.get()=="0") and layerlimit>0:
+                old_gui_layers_untouched = gui_layers_untouched
+                gui_layers_zeroed = gpulayers_var.get()=="" or gpulayers_var.get()=="0"
+                if (gui_layers_untouched or gui_layers_zeroed) and layerlimit>0:
                     gpulayers_var.set(str(layerlimit))
+                    gui_layers_untouched = old_gui_layers_untouched
+                    if gui_layers_zeroed:
+                        gui_layers_untouched = True
         except Exception as ex:
             pass
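
Pulled out of the GUI plumbing, the two new policies in this hunk amount to the following standalone sketch. The function names and parameters are hypothetical helpers for illustration; only the constants and conditions come from the diff:

    import os

    def should_autopick_cublas(exitcounter, max_vram_bytes, first_gpu_name, runopts, current_backend):
        # Hypothetical helper mirroring the autopick condition above: switch the
        # GUI from the OpenBLAS default to CuBLAS only when the app is not shutting
        # down, a named CUDA device with more than ~3.5 GB of VRAM was detected,
        # and the CuBLAS backend was actually built.
        return (exitcounter < 100
                and max_vram_bytes > 3500000000
                and first_gpu_name != ""
                and "Use CuBLAS" in runopts
                and current_backend == "Use OpenBLAS")

    def estimate_gpu_layers(filepath, max_vram_bytes):
        # Hypothetical helper mirroring autoset_gpu_layers: assume each layer
        # costs roughly 5.714% of the model file size, then suggest as many
        # layers as fit in VRAM, capped at 200.
        fsize = os.path.getsize(filepath)
        if fsize <= 10000000:  # don't bother with models under ~10 MB
            return 0
        sizeperlayer = fsize * 0.05714
        return int(min(200, max_vram_bytes / sizeperlayer))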
@@ -1173,6 +1186,11 @@ def show_new_gui():
     num_backends_built.bind("<Enter>", lambda event: show_tooltip(event, f"Number of backends you have built and available." + (f"\n\nMissing Backends: \n\n{nl.join(antirunopts)}" if len(runopts) != 6 else "")))
     num_backends_built.bind("<Leave>", hide_tooltip)

+    def changed_gpulayers(*args):
+        global gui_layers_untouched
+        gui_layers_untouched = False
+        pass
+
     def changed_gpu_choice_var(*args):
         global exitcounter
         if exitcounter > 100:
@@ -1194,6 +1212,7 @@
             gpuname_label.configure(text="")

     gpu_choice_var.trace("w", changed_gpu_choice_var)
+    gpulayers_var.trace("w", changed_gpulayers)

     def changerunmode(a,b,c):
         index = runopts_var.get()
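
Taken together, the gui_layers_untouched changes implement a common Tk pattern: auto-fill a field only until the user edits it. The trace callback marks every write as a user edit, so the auto-filler has to save and restore the flag around its own programmatic set() to avoid counting itself. A minimal standalone sketch of the same pattern, with hypothetical widget and variable names:

    import tkinter as tk

    root = tk.Tk()
    layers_var = tk.StringVar(value="0")
    layers_untouched = True   # True until the user edits the field

    def on_layers_changed(*args):
        # Fires on every write to layers_var, including programmatic ones.
        global layers_untouched
        layers_untouched = False

    layers_var.trace("w", on_layers_changed)

    def autofill_layers(suggested):
        global layers_untouched
        zeroed = layers_var.get() in ("", "0")
        if layers_untouched or zeroed:
            was_untouched = layers_untouched
            layers_var.set(str(suggested))    # triggers the trace, clearing the flag
            layers_untouched = was_untouched  # restore: this set was not a user edit
            if zeroed:
                layers_untouched = True       # a blank/zero field still counts as untouched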