CUDA: refactor and optimize IQ MMVQ (#8215)
* CUDA: refactor and optimize IQ MMVQ
* uint -> uint32_t
* __dp4a -> ggml_cuda_dp4a
* remove MIN_CC_DP4A checks
* change default
* try CI fix
This commit is contained in:
parent
dae57a1ebc
commit
cb5fad4c6c
8 changed files with 406 additions and 487 deletions
|
@@ -820,7 +820,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|||
#if QK_K == 256
|
||||
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
||||
|
||||
#if QR2_XXS == 8
|
||||
const int ib32 = iqs;
|
||||
const uint16_t * q2 = bq2->qs + 4*ib32;
|
||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||
|
@@ -838,26 +837,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
|||
}
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
|
||||
return d * sumi;
|
||||
#else
|
||||
// iqs is 0...15
|
||||
const int ib32 = iqs/2;
|
||||
const int il = iqs%2;
|
||||
const uint16_t * q2 = bq2->qs + 4*ib32;
|
||||
const uint8_t * aux8 = (const uint8_t *)q2;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f;
|
||||
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
||||
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
|
||||
sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
return d * (sumi1 + sumi2);
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue