diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 44e6445ec..afab6704f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -85,7 +85,11 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define CUDA_MUL_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#define GGML_CUDA_DMMV_BLOCK_X 32 // dmmv = dequantize_mul_mat_vec +// dmmv = dequantize_mul_mat_vec +#define GGML_CUDA_DMMV_BLOCK_X 32 +#ifndef GGML_CUDA_DMMV_BLOCK_Y +#define GGML_CUDA_DMMV_BLOCK_Y 1 // can be set by compiler option LLAMA_CUDA_BY +#endif static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x;