Revert "warp size fixes"
It seems like 32 is faster, at least for me, and it won't cause so many conflicts.
This reverts commit 5d6eb72164.
This commit is contained in:
parent
5d6eb72164
commit
1ba4ce4ad7
1 changed file with 2 additions and 6 deletions
|
@@ -182,11 +182,7 @@ typedef struct {
|
||||||
} block_q6_k;
|
} block_q6_k;
|
||||||
static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding");
|
static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding");
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
|
||||||
#define WARP_SIZE warpSize
|
|
||||||
#else
|
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 32
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CUDA_MUL_BLOCK_SIZE 256
|
#define CUDA_MUL_BLOCK_SIZE 256
|
||||||
|
|
||||||
|
@@ -683,8 +679,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue