iq4_nl: look up kvalues_iq4nl via subgroup shuffle instead of shared memory (3% slower than original)
This commit is contained in:
parent
ed038a26e3
commit
1d949a62c6
4 changed files with 15 additions and 8 deletions
|
@ -1,6 +1,8 @@
|
||||||
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
|
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||||
#endif
|
#endif
|
||||||
|
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||||
|
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
|
||||||
|
|
||||||
#include "types.comp"
|
#include "types.comp"
|
||||||
|
|
||||||
|
@ -91,11 +93,11 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
#if defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ4_NL)
|
||||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]);
|
return vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4));
|
||||||
}
|
}
|
||||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||||
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]);
|
return vec4(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 4) & 0xF), subgroupShuffle(kvalues_iq4nl, (vui >> 8) & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 12));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
#include "dequant_head.comp"
|
#include "dequant_head.comp"
|
||||||
|
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||||
|
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
|
||||||
|
|
||||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
@ -26,7 +28,7 @@ void main() {
|
||||||
const float d = float(data_a[ib].d);
|
const float d = float(data_a[ib].d);
|
||||||
|
|
||||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||||
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
|
data_b[b_idx + l + 0] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] & 0xF));
|
||||||
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
|
data_b[b_idx + l + 16] = D_TYPE(d * subgroupShuffle(kvalues_iq4nl, data_a[ib].qs[q_idx + l] >> 4));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
|
|
||||||
#extension GL_EXT_control_flow_attributes : enable
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
#extension GL_EXT_shader_16bit_storage : require
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||||
|
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
|
||||||
|
|
||||||
#ifdef FLOAT16
|
#ifdef FLOAT16
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||||
|
@ -448,7 +450,7 @@ void main() {
|
||||||
|
|
||||||
const float d = float(data_a[ib].d);
|
const float d = float(data_a[ib].d);
|
||||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||||
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
const vec2 v = vec2(subgroupShuffle(kvalues_iq4nl, vui & 0xF), subgroupShuffle(kvalues_iq4nl, vui >> 4)) * d;
|
||||||
|
|
||||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
#define GGML_TYPES_COMP
|
#define GGML_TYPES_COMP
|
||||||
|
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
#extension GL_KHR_shader_subgroup_basic : require
|
||||||
|
|
||||||
#if defined(DATA_A_F32)
|
#if defined(DATA_A_F32)
|
||||||
#define QUANT_K 1
|
#define QUANT_K 1
|
||||||
|
@ -305,13 +306,13 @@ const int8_t kvalues_iq4nl_const[16] = {
|
||||||
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
||||||
};
|
};
|
||||||
|
|
||||||
shared FLOAT_TYPE kvalues_iq4nl[16];
|
FLOAT_TYPE kvalues_iq4nl = FLOAT_TYPE(0);
|
||||||
|
|
||||||
void init_iq4nl_shmem()
|
void init_iq4nl_shmem()
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// give each of the first 16 subgroup invocations one table entry (read back via subgroupShuffle) and sync
|
||||||
if (gl_LocalInvocationIndex.x < 16) {
|
if (gl_SubgroupInvocationID < 16) {
|
||||||
kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]);
|
kvalues_iq4nl = FLOAT_TYPE(kvalues_iq4nl_const[gl_SubgroupInvocationID]);
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue