vulkan: avoid using workgroup size before it is referenced
This commit is contained in:
parent
118b4f08a8
commit
aa17d321b3
14 changed files with 26 additions and 26 deletions
|
@ -13,7 +13,7 @@ layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
if (gl_LocalInvocationIndex.x != 0) {
|
if (gl_LocalInvocationIndex.x != 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -218,7 +218,7 @@ void quantize(uint dst_idx, uint src_idx)
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
if (gl_LocalInvocationIndex.x != 0) {
|
if (gl_LocalInvocationIndex.x != 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@ void main() {
|
||||||
// Each thread handles 1 subblock (32 values with 2 scales)
|
// Each thread handles 1 subblock (32 values with 2 scales)
|
||||||
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
||||||
|
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
|
|
||||||
if (ib >= p.nel / 256) {
|
if (ib >= p.nel / 256) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -11,7 +11,7 @@ void main() {
|
||||||
// Each thread handles 1 subblock (32 values with 2 scales)
|
// Each thread handles 1 subblock (32 values with 2 scales)
|
||||||
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
||||||
|
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
|
|
||||||
if (ib >= p.nel / 256) {
|
if (ib >= p.nel / 256) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -12,7 +12,7 @@ void main() {
|
||||||
// Each block is described by 4 lattice indices, 4x7 sign bits and 4 scale bits
|
// Each block is described by 4 lattice indices, 4x7 sign bits and 4 scale bits
|
||||||
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
||||||
|
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
|
|
||||||
if (ib >= p.nel / 256) {
|
if (ib >= p.nel / 256) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -12,7 +12,7 @@ void main() {
|
||||||
// Each block contains 4 scale bytes (8 scales) for 256 output values.
|
// Each block contains 4 scale bytes (8 scales) for 256 output values.
|
||||||
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
||||||
|
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
|
|
||||||
if (ib >= p.nel / 256) {
|
if (ib >= p.nel / 256) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -12,7 +12,7 @@ void main() {
|
||||||
// 8 threads handle 1 superblock
|
// 8 threads handle 1 superblock
|
||||||
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8;
|
||||||
|
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
|
|
||||||
if (ib >= p.nel / 256) {
|
if (ib >= p.nel / 256) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||||
void main() {
|
void main() {
|
||||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||||
|
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
|
|
||||||
const uint tid = gl_LocalInvocationID.x % 64;
|
const uint tid = gl_LocalInvocationID.x % 64;
|
||||||
const uint il = tid/32;
|
const uint il = tid/32;
|
||||||
|
|
|
@ -105,7 +105,7 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const uint32_t N = p.N;
|
const uint32_t N = p.N;
|
||||||
|
|
|
@ -13,7 +13,7 @@ void main() {
|
||||||
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
||||||
|
|
||||||
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (i00 >= p.ne00) {
|
if (i00 >= p.ne00) {
|
||||||
|
|
|
@ -134,7 +134,7 @@ void main() {
|
||||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||||
|
|
||||||
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||||
|
|
|
@ -96,7 +96,7 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef MUL_MAT_ID
|
#ifdef MUL_MAT_ID
|
||||||
|
|
|
@ -107,7 +107,7 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
|
||||||
init_iq_shmem();
|
init_iq_shmem(gl_WorkGroupSize);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef MUL_MAT_ID
|
#ifdef MUL_MAT_ID
|
||||||
|
|
|
@ -380,10 +380,10 @@ const uvec2[256] iq2xxs_grid_const = {
|
||||||
|
|
||||||
shared uvec2 iq2xxs_grid[256];
|
shared uvec2 iq2xxs_grid[256];
|
||||||
|
|
||||||
void init_iq_shmem()
|
void init_iq_shmem(uvec3 wgsize)
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += gl_WorkGroupSize.x) {
|
for (uint i = gl_LocalInvocationIndex.x; i < iq2xxs_grid.length(); i += wgsize.x) {
|
||||||
iq2xxs_grid[i] = iq2xxs_grid_const[i];
|
iq2xxs_grid[i] = iq2xxs_grid_const[i];
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
@ -547,10 +547,10 @@ const uvec2 iq2xs_grid_const[512] = {
|
||||||
|
|
||||||
shared uvec2 iq2xs_grid[512];
|
shared uvec2 iq2xs_grid[512];
|
||||||
|
|
||||||
void init_iq_shmem()
|
void init_iq_shmem(uvec3 wgsize)
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += gl_WorkGroupSize.x) {
|
for (uint i = gl_LocalInvocationIndex.x; i < iq2xs_grid.length(); i += wgsize.x) {
|
||||||
iq2xs_grid[i] = iq2xs_grid_const[i];
|
iq2xs_grid[i] = iq2xs_grid_const[i];
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
@ -836,10 +836,10 @@ const uvec2 iq2s_grid_const[1024] = {
|
||||||
|
|
||||||
shared uvec2 iq2s_grid[1024];
|
shared uvec2 iq2s_grid[1024];
|
||||||
|
|
||||||
void init_iq_shmem()
|
void init_iq_shmem(uvec3 wgsize)
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += gl_WorkGroupSize.x) {
|
for (uint i = gl_LocalInvocationIndex.x; i < iq2s_grid.length(); i += wgsize.x) {
|
||||||
iq2s_grid[i] = iq2s_grid_const[i];
|
iq2s_grid[i] = iq2s_grid_const[i];
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
@ -904,10 +904,10 @@ const uint32_t iq3xxs_grid_const[256] = {
|
||||||
|
|
||||||
shared uint32_t iq3xxs_grid[256];
|
shared uint32_t iq3xxs_grid[256];
|
||||||
|
|
||||||
void init_iq_shmem()
|
void init_iq_shmem(uvec3 wgsize)
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += gl_WorkGroupSize.x) {
|
for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += wgsize.x) {
|
||||||
iq3xxs_grid[i] = iq3xxs_grid_const[i];
|
iq3xxs_grid[i] = iq3xxs_grid_const[i];
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
@ -1011,10 +1011,10 @@ const uint32_t iq3s_grid_const[512] = {
|
||||||
|
|
||||||
shared uint32_t iq3s_grid[512];
|
shared uint32_t iq3s_grid[512];
|
||||||
|
|
||||||
void init_iq_shmem()
|
void init_iq_shmem(uvec3 wgsize)
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += gl_WorkGroupSize.x) {
|
for (uint i = gl_LocalInvocationIndex.x; i < iq3s_grid.length(); i += wgsize.x) {
|
||||||
iq3s_grid[i] = iq3s_grid_const[i];
|
iq3s_grid[i] = iq3s_grid_const[i];
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
@ -1050,11 +1050,11 @@ const int8_t kvalues_iq4nl_const[16] = {
|
||||||
|
|
||||||
shared FLOAT_TYPE kvalues_iq4nl[16];
|
shared FLOAT_TYPE kvalues_iq4nl[16];
|
||||||
|
|
||||||
void init_iq_shmem()
|
void init_iq_shmem(uvec3 wgsize)
|
||||||
{
|
{
|
||||||
// copy the table into shared memory and sync
|
// copy the table into shared memory and sync
|
||||||
if (gl_LocalInvocationIndex.x < 16) {
|
for (uint i = gl_LocalInvocationIndex.x; i < kvalues_iq4nl.length(); i += wgsize.x) {
|
||||||
kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]);
|
kvalues_iq4nl[i] = FLOAT_TYPE(kvalues_iq4nl_const[i]);
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue