tweaks for rocm blas
parent 6d06695c7e
commit 8df03ed026
2 changed files with 5 additions and 13 deletions
Makefile | 4 ++--

@@ -199,9 +199,9 @@ ifdef LLAMA_HIPBLAS
 CC := $(ROCM_PATH)/llvm/bin/clang
 CXX := $(ROCM_PATH)/llvm/bin/clang++
 GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-LLAMA_CUDA_DMMV_X ?= 128
+LLAMA_CUDA_DMMV_X ?= 32
 LLAMA_CUDA_MMV_Y ?= 2
-LLAMA_CUDA_KQUANTS_ITER ?= 1
+LLAMA_CUDA_KQUANTS_ITER ?= 2
 HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
 ifdef LLAMA_CUDA_FORCE_DMMV
 HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
koboldcpp.py | 14 +++-----------

@@ -223,21 +223,13 @@ def load_model(model_filename):
         else:
             inputs.tensor_split[n] = 0

-    # we must force an explicit tensor split
-    # otherwise the default will divide equally and multigpu crap will slow it down badly
     inputs.cublas_info = 0
     if (args.usecublas and "0" in args.usecublas):
-        inputs.cublas_info = 0
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     elif (args.usecublas and "1" in args.usecublas):
-        inputs.cublas_info = 1
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     elif (args.usecublas and "2" in args.usecublas):
-        inputs.cublas_info = 2
-        if not args.tensor_split:
-            inputs.tensor_split[inputs.cublas_info] = 100
+        os.environ["CUDA_VISIBLE_DEVICES"] = "2"

     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
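For readers skimming the diff, here is a minimal, self-contained sketch of what the new device-selection behaviour amounts to. It is not the project's actual code: select_gpu and FakeInputs are made-up stand-ins for koboldcpp's load_model() logic and its ctypes inputs struct. The point it illustrates is that the forced per-GPU tensor_split (100% on the chosen device) is gone; the chosen index is instead exported via CUDA_VISIBLE_DEVICES so the runtime only enumerates that one GPU.

import os

class FakeInputs:
    # stand-in for koboldcpp's ctypes struct; only the field used here
    cublas_info = 0

def select_gpu(usecublas_args, inputs):
    # usecublas_args mirrors args.usecublas, e.g. ["normal", "1"].
    # The per-device inputs.cublas_info / tensor_split assignments from the
    # old code are dropped; the selected index is exported instead.
    inputs.cublas_info = 0
    for dev in ("0", "1", "2"):
        if usecublas_args and dev in usecublas_args:
            os.environ["CUDA_VISIBLE_DEVICES"] = dev
            break

select_gpu(["normal", "1"], FakeInputs())
print(os.environ.get("CUDA_VISIBLE_DEVICES"))  # prints "1"

With ["normal", "1"] the sketch ends up with CUDA_VISIBLE_DEVICES=1, matching the second elif branch in the diff above.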