copy right block.

This commit is contained in:
Julia Longtin 2024-03-23 19:47:21 +00:00
parent e99f3a9bf4
commit 656bf28c91

View file

@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target)
{ {
uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0};
uint32_t mask=0x000000FF; uint32_t mask=0x0000FF00;
__asm__ __volatile__ ( __asm__ __volatile__ (
"vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value.
@ -55,43 +55,63 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303; static const uint32_t kmask3 = 0x03030303;
uint32_t utmp[4]; const uint8_t * scales = (const uint8_t*)&utmp[0];
int8_t aux8[QK_K]; const uint8_t * mins = (const uint8_t*)&utmp[2];
// int16_t aux16[16];
int16x16_t aux16 __attribute__((aligned(64)));
float32x8_t sums __attribute__((aligned(64)));
/* use a vector operation to clear these floats. */ int8_t aux8[QK_K];
GGML_F32x8_VEC_ZERO(&sums); int16_t aux16[8];
float sums [8];
int32_t aux32[8];
memset(sums, 0, 8*sizeof(float));
float sumf = 0; float sumf = 0;
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
// quants, 4 low bits.
const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict q4 = x[i].qs;
// quants, 1 high bit.
const uint8_t * restrict hm = x[i].qh; const uint8_t * restrict hm = x[i].qh;
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
memset(aux32, 0, 8*sizeof(int32_t));
int8_t * restrict a = aux8; int8_t * restrict a = aux8;
for (int l = 0; l < 32; ++l) { uint8_t m = 1;
a[l+ 0] = q4[l] & 0xF; for (int j = 0; j < QK_K/64; ++j) {
a[l+32] = q4[l] >> 4; for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
a += 32; m <<= 1;
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
a += 32; m <<= 1;
q4 += 32;
} }
for (int is = 0; is < 8; ++is) { memcpy(utmp, x[i].scales, 12);
uint8_t m = 1 << is; utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); const uint32_t uaux = utmp[1] & kmask1;
} utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); utmp[0] &= kmask1;
const int8_t * restrict sc = x[i].scales;
int sumi = 0;
for (int j = 0; j < QK_K/16; ++j) { for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
const float dl = d * sc[j]; a = aux8;
for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; int is = 0;
for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16_t *)&aux16)[l] + ((int16_t *)&aux16)[8+l]); for (int j = 0; j < QK_K/32; ++j) {
q8 += 16; a += 16; int32_t scale = scales[is++];
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
} }
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
} }
for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; for (int l = 0; l < 8; ++l) sumf += sums[l];
*s = sumf; *s = sumf;
} }