llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/simpler_mul_mat_q4_1.comp
Sergio Lopez 8b47fce07a vulkan: enable the use of simpler matmul shaders
Import simpler matmul shaders from the kompute backend and use them on
GPUs know to not be able to use the regular ones.

Signed-off-by: Sergio Lopez <slp@redhat.com>
2025-02-10 11:52:52 +01:00

35 lines
1.1 KiB
Text

#version 450
#include "simpler_common.comp"
#define BLOCKS_IN_QUANT QK4_1
#define SIZE_OF_BLOCK sizeof_block_q4_1
#define N_ROWS 4
#include "simpler_mul_mv_q_n_pre.comp"
// The q4_1 version of this function
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
vec2 acc = vec2(0.0, 0.0);
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
float d = float(u8BufToFloat16(inA, index));
float m = float(u8BufToFloat16(inA, index+2));
float sumy = 0.0f;
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
const float yl0 = inB[yb + i];
const float yl1 = inB[yb + i + 1];
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
sumy += yl0 + yl1 + yl8 + yl9;
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
}
return d * (acc[0] + acc[1]) + sumy * m;
}
#include "simpler_mul_mv_q_n.comp"