Allow parallel execution of kernels, parallelize third and fourth dimension calls

This commit is contained in:
0cc4m 2023-07-24 22:51:19 +02:00
parent 53809c9c26
commit 4e580284c0
3 changed files with 388 additions and 366 deletions

View file

@ -239,6 +239,7 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/matmul_split_k_reduce.glsl -o vk_shaders/matmul_split_k_reduce.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/f16_to_f32.glsl -o vk_shaders/f16_to_f32.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_q4_0.glsl -o vk_shaders/dequant_q4_0.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_f16.glsl -o vk_shaders/dequant_mul_mat_vec_f16.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_q4_0.glsl -o vk_shaders/dequant_mul_mat_vec_q4_0.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/mul_f32.glsl -o vk_shaders/mul_f32.spv & \
wait

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,59 @@
#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require

// Matrix-vector product for an unquantized f16 matrix:
//     dst[row] = sum over col of float(x[row*ncols + col]) * y[col]
//
// Dispatch layout: one workgroup per matrix row (gl_WorkGroupID.x is the row
// index), BLOCK_SIZE invocations per workgroup. Each invocation accumulates a
// partial dot product over a strided subset of the columns; the partial sums
// are then combined with a tree reduction in shared memory.
//
// x is indexed as row*ncols + col, i.e. rows of ncols contiguous f16 values.
// Unlike the quantized variants of this shader there is no block structure:
// every element of x is a single value, so element col of a row pairs with
// y[col] directly.

// Invocations per workgroup. Must be a power of two for the tree reduction.
#define BLOCK_SIZE 32

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A { float16_t x[]; };  // f16 matrix
layout (binding = 1) readonly buffer B { float y[]; };      // f32 input vector
layout (binding = 2) writeonly buffer D { float dst[]; };   // f32 result, one value per row

layout (push_constant) uniform parameter
{
    int ncols;  // number of columns of x, which is also the length of y
} p;

// One partial sum per invocation of the workgroup.
shared float tmp[BLOCK_SIZE];

void main() {
    const int block_size = int(gl_WorkGroupSize.x);
    const int row = int(gl_WorkGroupID.x);
    const int tid = int(gl_LocalInvocationID.x);

    // Index of the first element of this workgroup's row in x.
    const int row_start = row*p.ncols;

    // Accumulate locally and publish to shared memory once, after the loop.
    float acc = 0.0;

    // Each invocation handles two adjacent columns per step, and the
    // workgroup as a whole advances by 2*block_size columns per step.
    // Bounding the loop by ncols keeps every access inside the row even when
    // ncols is not a multiple of 2*block_size, and ensures tail columns are
    // not dropped.
    for (int col = 2*tid; col < p.ncols; col += 2*block_size) {
        acc += float(x[row_start + col]) * y[col];

        // The second column of the pair does not exist when ncols is odd
        // and col is the last column.
        if (col + 1 < p.ncols) {
            acc += float(x[row_start + col + 1]) * y[col + 1];
        }
    }

    tmp[tid] = acc;

    // sum up partial sums and write back result
    // All invocations reach this barrier: the loop above contains no barrier
    // and every invocation leaves it regardless of its trip count.
    barrier();
    [[unroll]] for (int s = block_size/2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        // Kept outside the branch so that control flow at the barrier is uniform.
        barrier();
    }

    if (tid == 0) {
        dst[row] = tmp[0];
    }
}