revert FORCE_DMMV both in cuda and sycl
This commit is contained in:
parent
4bf6133b0e
commit
c7ed1d8ddc
4 changed files with 30 additions and 19 deletions
|
@ -96,8 +96,8 @@ option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM"
|
||||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||||
option(LLAMA_CUDA "llama: use CUDA" OFF)
|
option(LLAMA_CUDA "llama: use CUDA" OFF)
|
||||||
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
|
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
|
||||||
option(LLAMA_FORCE_DMMV "llama: use dmmv instead of mmvq kernels on GPU" OFF)
|
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||||
option(LLAMA_FORCE_MMQ "llama: use mmq kernels instead of Math Lib" OFF)
|
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
|
||||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||||
|
@ -405,10 +405,10 @@ if (LLAMA_CUDA)
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_CUDA)
|
add_compile_definitions(GGML_USE_CUDA)
|
||||||
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
||||||
if (LLAMA_FORCE_DMMV)
|
if (LLAMA_CUDA_FORCE_DMMV)
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_FORCE_MMQ)
|
if (LLAMA_CUDA_FORCE_MMQ)
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_CUDA_NO_VMM)
|
if (LLAMA_CUDA_NO_VMM)
|
||||||
|
@ -578,11 +578,11 @@ if (LLAMA_HIPBLAS)
|
||||||
add_compile_definitions(GGML_HIP_UMA)
|
add_compile_definitions(GGML_HIP_UMA)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_FORCE_DMMV)
|
if (LLAMA_CUDA_FORCE_DMMV)
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_FORCE_MMQ)
|
if (LLAMA_CUDA_FORCE_MMQ)
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -628,10 +628,7 @@ if (LLAMA_SYCL)
|
||||||
add_compile_definitions(GGML_SYCL_F16)
|
add_compile_definitions(GGML_SYCL_F16)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_FORCE_DMMV)
|
if (LLAMA_CUDA_FORCE_MMQ)
|
||||||
add_compile_definitions(GGML_SYCL_FORCE_DMMV)
|
|
||||||
endif()
|
|
||||||
if (LLAMA_FORCE_MMQ)
|
|
||||||
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
|
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
2
Makefile
2
Makefile
|
@ -457,7 +457,7 @@ endif # CUDA_DOCKER_ARCH
|
||||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||||
endif # LLAMA_CUDA_FORCE_DMMV
|
endif # LLAMA_CUDA_FORCE_DMMV
|
||||||
ifdef LLAMA_FORCE_MMQ
|
ifdef LLAMA_CUDA_FORCE_MMQ
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
|
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||||
endif # LLAMA_CUDA_FORCE_MMQ
|
endif # LLAMA_CUDA_FORCE_MMQ
|
||||||
ifdef LLAMA_CUDA_DMMV_X
|
ifdef LLAMA_CUDA_DMMV_X
|
||||||
|
|
|
@ -475,10 +475,10 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| LLAMA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.
|
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.
|
||||||
| LLAMA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
|
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
|
||||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||||
|
|
|
@ -15212,6 +15212,25 @@ bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
||||||
|
switch (type) {
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
case GGML_TYPE_Q5_0:
|
||||||
|
case GGML_TYPE_Q5_1:
|
||||||
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
case GGML_TYPE_Q3_K:
|
||||||
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_F16:
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool all_on_device =
|
const bool all_on_device =
|
||||||
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
||||||
|
@ -15228,7 +15247,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
}
|
}
|
||||||
|
|
||||||
// check data types and tensor shapes for custom matrix multiplication kernels:
|
// check data types and tensor shapes for custom matrix multiplication kernels:
|
||||||
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
&& src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
|
&& src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
|
||||||
|
|
||||||
|
@ -15243,17 +15262,12 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
const bool fp16_performance_good = true;
|
const bool fp16_performance_good = true;
|
||||||
|
|
||||||
// mmvq and mmq need the __dp4a instruction which is available for gen12+
|
// mmvq and mmq need the __dp4a instruction which is available for gen12+
|
||||||
use_mul_mat_vec_q = use_mul_mat_vec_q; // Check dp4a
|
|
||||||
// Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
|
// Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
|
||||||
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
|
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
|
||||||
#ifdef SYCL_USE_XMX
|
#ifdef SYCL_USE_XMX
|
||||||
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
||||||
#endif // SYCL_USE_XMX
|
#endif // SYCL_USE_XMX
|
||||||
|
|
||||||
#ifndef GGML_SYCL_FORCE_DMMV
|
|
||||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
|
||||||
#endif // GGML_SYCL_FORCE_DMMV
|
|
||||||
|
|
||||||
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||||
// KQ single-batch
|
// KQ single-batch
|
||||||
ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
|
ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue