diff --git a/Makefile b/Makefile
index 5eb22f024..4eb27e369 100644
--- a/Makefile
+++ b/Makefile
@@ -199,9 +199,9 @@ ifdef LLAMA_HIPBLAS
 	CC := $(ROCM_PATH)/llvm/bin/clang
 	CXX := $(ROCM_PATH)/llvm/bin/clang++
 	GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	LLAMA_CUDA_DMMV_X ?= 128
+	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 2
-	LLAMA_CUDA_KQUANTS_ITER ?= 1
+	LLAMA_CUDA_KQUANTS_ITER ?= 2
 	HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
diff --git a/koboldcpp.py b/koboldcpp.py
index fbd911458..73275b1e0 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -223,21 +223,13 @@ def load_model(model_filename):
             else:
                 inputs.tensor_split[n] = 0
 
-    # we must force an explicit tensor split
-    # otherwise the default will divide equally and multigpu crap will slow it down badly
     inputs.cublas_info = 0
     if (args.usecublas and "0" in args.usecublas):
-        inputs.cublas_info = 0
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     elif (args.usecublas and "1" in args.usecublas):
-        inputs.cublas_info = 1
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     elif (args.usecublas and "2" in args.usecublas):
-        inputs.cublas_info = 2
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 
     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
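
For context on the koboldcpp.py hunk: instead of forcing a 100% tensor split onto the selected device, the patch now pins the whole process to one physical GPU via the standard CUDA_VISIBLE_DEVICES environment variable. That variable is read when the CUDA runtime first initializes, so it has to be exported before the backend makes its first CUDA call; setting it before the CUDA-backed library is even loaded is the safe ordering, which is why the diff sets it early in load_model. A minimal sketch of the pattern (the select_gpu helper and the library filename are illustrative, not part of the patch):

```python
import os
import ctypes

def select_gpu(device_index: str) -> None:
    # CUDA_VISIBLE_DEVICES is consulted once, when the CUDA runtime
    # initializes, so it must be set before the first CUDA call.
    # The chosen physical GPU is then remapped to device 0 inside this
    # process, so no explicit tensor split is needed for single-GPU use.
    os.environ["CUDA_VISIBLE_DEVICES"] = device_index

select_gpu("1")                             # expose only the second physical GPU
lib = ctypes.CDLL("./koboldcpp_cublas.so")  # hypothetical backend library; load it
                                            # only after the variable is set
```

One consequence of this approach: hiding the other GPUs at the process level means the backend never enumerates them, which avoids the slow default behavior of dividing the model equally across all visible devices.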