From c1cb70d64d307d3fd9b7b9f61bb574e36520499a Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Mon, 31 Jul 2023 19:56:44 +0300 Subject: [PATCH] new build arg LLAMA_CUDA_MMQ_Y --- CMakeLists.txt | 1 + Makefile | 2 ++ README.md | 7 ++++--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969a3de69..14eefe005 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -375,6 +375,7 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) diff --git a/Makefile b/Makefile index 21fed1a46..cdb12a872 100644 --- a/Makefile +++ b/Makefile @@ -270,6 +270,7 @@ ifdef LLAMA_HIPBLAS GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 + LLAMA_CUDA_MMQ_Y ?= 64 LLAMA_CUDA_KQUANTS_ITER ?= 2 CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) @@ -278,6 +279,7 @@ ifdef LLAMA_HIPBLAS ggml-cuda.o: CXXFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) +ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y) ggml-cuda.o: CXXFLAGS += -DGGML_CUDA_FORCE_DMMV ggml-cuda.o: CXXFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h diff --git a/README.md b/README.md index a56b64a88..947c2b694 100644 --- a/README.md +++ b/README.md @@ -437,9 +437,10 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom HIP kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. | + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast