Vulkan Shader Refactor, Memory Debugging Option (#7947)
* Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory
* Improve debug log code
* Add memory debug output option
* Fix flake8
* Fix unnecessarily high llama-3 VRAM use
This commit is contained in:
parent
0c7b3595b9
commit
7c7836d9d4
54 changed files with 25266 additions and 21885 deletions
32
vulkan-shaders/dequant_q4_1.comp
Normal file
32
vulkan-shaders/dequant_q4_1.comp
Normal file
|
@ -0,0 +1,32 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
// Workgroup shape: 256 invocations along x, 1 along y and z.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Binding 0: read-only input, an unsized array of block_q4_1 elements.
layout(binding = 0) readonly buffer A { block_q4_1 data_a[]; };

// Binding 1: write-only output, an unsized array of D_TYPE elements.
layout(binding = 1) writeonly buffer D { D_TYPE data_b[]; };
|
||||
|
||||
// Expands q4_1 blocks from data_a into data_b.
//
// Work split, as established by the index math below:
//   - the 256 invocations of a workgroup form 4 groups of 64;
//   - inside a group, two invocations share one input block (32 blocks per group);
//   - each invocation unpacks 8 entries of qs and writes 16 output values.
// Every output value is computed as d * q + m, where q is one 4-bit field of qs.
void main() {
    const uint local_id = gl_LocalInvocationID.x;

    // Global index of this 64-invocation group, and the position inside it.
    const uint group = 4 * gl_WorkGroupID.x + local_id / 64;
    const uint lane  = local_id % 64;

    // part selects which 8 entries of qs are handled (0 -> qs[0..7], 1 -> qs[8..15]);
    // sub selects one of the 32 blocks owned by this group.
    const uint part = lane / 32;
    const uint sub  = lane % 32;

    const uint block = group * 32 + sub;

    // Skip invocations past the last block. p.nel is declared outside this file
    // (presumably the total element count, 32 per block -- TODO confirm).
    if (block >= p.nel / 32) {
        return;
    }

    // A block owns 32 consecutive outputs starting at 32 * block
    // (= 1024 * group + 32 * sub); part shifts the start by 8.
    const uint out_base = group * 1024 + sub * 32 + part * 8;
    const uint q_base   = part * 8;

    const float scale = float(data_a[block].d);
    const float shift = float(data_a[block].m);

    [[unroll]] for (uint k = 0; k < 8; ++k) {
        // Read the packed entry once (presumably an 8-bit value holding two
        // 4-bit quants -- verify against the block_q4_1 definition).
        const uint qbyte = uint(data_a[block].qs[q_base + k]);

        // Low 4 bits land at out_base + k, the remaining upper bits 16 slots later.
        data_b[out_base + k]      = D_TYPE(scale * (qbyte & 0xFu) + shift);
        data_b[out_base + k + 16] = D_TYPE(scale * (qbyte >> 4) + shift);
    }
}
|
Loading…
Add table
Add a link
Reference in a new issue