Revert "warp size fixes"

It seems like 32 is faster, at least for me, and it won't cause so many conflicts. This reverts commit 5d6eb72164.
2023-06-06 18:41:08 +03:00 · 2023-06-06 18:41:08 +03:00 · 1ba4ce4ad7
commit 1ba4ce4ad7
parent 5d6eb72164
1 changed files with 2 additions and 6 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -182,11 +182,7 @@ typedef struct {
 } block_q6_k;
 static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding");
-#if defined(GGML_USE_HIPBLAS)
-#define WARP_SIZE warpSize
-#else
 #define WARP_SIZE 32
-#endif
 #define CUDA_MUL_BLOCK_SIZE 256
@@ -683,8 +679,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
    // sum up partial sums and write back result
    __syncthreads();
 #pragma unroll
-    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
+    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE);
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }
    if (tid == 0) {