diff --git a/Makefile b/Makefile
index 5eb22f024..4eb27e369 100644
--- a/Makefile
+++ b/Makefile
@@ -199,9 +199,9 @@ ifdef LLAMA_HIPBLAS
 	CC := $(ROCM_PATH)/llvm/bin/clang
 	CXX := $(ROCM_PATH)/llvm/bin/clang++
 	GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	LLAMA_CUDA_DMMV_X ?= 128
+	LLAMA_CUDA_DMMV_X ?= 32
 	LLAMA_CUDA_MMV_Y ?= 2
-	LLAMA_CUDA_KQUANTS_ITER ?= 1
+	LLAMA_CUDA_KQUANTS_ITER ?= 2
 	HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
diff --git a/koboldcpp.py b/koboldcpp.py
index fbd911458..73275b1e0 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -223,21 +223,13 @@ def load_model(model_filename):
             else:
                 inputs.tensor_split[n] = 0
 
-    # we must force an explicit tensor split
-    # otherwise the default will divide equally and multigpu crap will slow it down badly
     inputs.cublas_info = 0
     if (args.usecublas and "0" in args.usecublas):
-        inputs.cublas_info = 0
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     elif (args.usecublas and "1" in args.usecublas):
-        inputs.cublas_info = 1
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     elif (args.usecublas and "2" in args.usecublas):
-        inputs.cublas_info = 2
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 
     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
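
For context on the koboldcpp.py hunk: instead of forcing a 100% tensor split onto the selected device, the patch now pins the whole process to one physical GPU via the standard CUDA_VISIBLE_DEVICES environment variable. That variable is read when the CUDA runtime first initializes, so it has to be exported before the backend makes its first CUDA call; setting it before the CUDA-backed library is even loaded is the safe ordering, which is why the diff sets it early in load_model. A minimal sketch of the pattern (the select_gpu helper and the library filename are illustrative, not part of the patch):

```python
import os
import ctypes

def select_gpu(device_index: str) -> None:
    # CUDA_VISIBLE_DEVICES is consulted once, when the CUDA runtime
    # initializes, so it must be set before the first CUDA call.
    # The chosen physical GPU is then remapped to device 0 inside this
    # process, so no explicit tensor split is needed for single-GPU use.
    os.environ["CUDA_VISIBLE_DEVICES"] = device_index

select_gpu("1")                             # expose only the second physical GPU
lib = ctypes.CDLL("./koboldcpp_cublas.so")  # hypothetical backend library; load it
                                            # only after the variable is set
```

One consequence of this approach: hiding the other GPUs at the process level means the backend never enumerates them, which avoids the slow default behavior of dividing the model equally across all visible devices.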