attempt to speed up float clearing.
This commit is contained in:
parent
a015d8485e
commit
7f5adf3b5c
1 changed files with 59 additions and 32 deletions
|
@ -15,51 +15,78 @@
|
||||||
// For block_q5_K and block_q8_K; only given the second time.
|
// For block_q5_K and block_q8_K; only given the second time.
|
||||||
#include "ggml-common.h"
|
#include "ggml-common.h"
|
||||||
|
|
||||||
|
|
||||||
|
// This SIMD unit can work with 32 float32s at once.
|
||||||
|
#define GGML_F32_STEP 32
|
||||||
|
// We can fit 16 of these float32s in a single vector register.
|
||||||
|
#define GGML_F32_EPR 16
|
||||||
|
|
||||||
|
/* A GCC vector-extension type whose elements are floats.
 * NOTE(review): vector_size is expressed in bytes, so with 4-byte floats this
 * type is 64 bytes wide (16 lanes) even though the name says "x8". The code in
 * this file only clears and accumulates into the first 8 lanes -- confirm that
 * the extra width is intentional (e.g. to match a full 512-bit register). */
typedef float float32x8_t __attribute__((vector_size (64)));
|
||||||
|
|
||||||
/* A forward declaration, to keep GCC happy. */
|
/* A forward declaration, to keep GCC happy. */
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc);
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc);
|
||||||
|
|
||||||
|
/*
 * Zero the first eight float lanes of *target using one masked vector store.
 *
 * target: the vector to clear. It is written with vmovaps, so its address must
 *         meet that instruction's alignment requirement; the caller in this
 *         file declares the object with aligned(64).
 *
 * Lanes 8 and above are left unchanged, because the write mask only selects
 * the low eight lanes. That is why the operand is "+m" (read-modify-write)
 * rather than "=m".
 */
inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target)
{
    /* Four zero bytes: the source for the broadcast below. */
    uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0};
    /* One bit per lane; 0xFF selects the low eight float32 lanes. */
    uint32_t mask=0x000000FF;

    /*
     * Extended asm takes exactly three operand sections after the template:
     * outputs, inputs, clobbers. Both inputs therefore live in a single
     * comma-separated list. Only zmm8 and k1 are written by the instructions,
     * so they are the only clobbers.
     */
    __asm__ __volatile__ (
        "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value.
        "kmov\t%[M],\t%%k1\n\t"                       // load the lane mask into k1.
        "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t"      // store zeros to the lanes selected by k1.
        : [RES] "+m" (*target)
        : [Z]   "m"  (zero),
          [M]   "r"  (mask)
        : "zmm8", "k1");
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
|
||||||
|
/* interpret X and Y as vectors. */
|
||||||
const block_q5_K * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
|
/* the number of blocks we will process this in. */
|
||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
static const uint32_t kmask1 = 0x3f3f3f3f;
|
static const uint32_t kmask1 = 0x3f3f3f3f;
|
||||||
static const uint32_t kmask2 = 0x0f0f0f0f;
|
static const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
static const uint32_t kmask3 = 0x03030303;
|
static const uint32_t kmask3 = 0x03030303;
|
||||||
|
|
||||||
uint32_t utmp[4];
|
uint32_t utmp[4];
|
||||||
int8_t aux8[QK_K];
|
int8_t aux8[QK_K];
|
||||||
int16_t aux16[16];
|
int16_t aux16[16];
|
||||||
float sums [8];
|
float32x8_t sums __attribute__((aligned(64)));
|
||||||
memset(sums, 0, 8*sizeof(float));
|
|
||||||
|
|
||||||
float sumf = 0;
|
/* use a vector operation to clear these floats. */
|
||||||
for (int i = 0; i < nb; ++i) {
|
GGML_F32x8_VEC_ZERO(&sums);
|
||||||
const uint8_t * restrict q4 = x[i].qs;
|
|
||||||
const uint8_t * restrict hm = x[i].qh;
|
|
||||||
const int8_t * restrict q8 = y[i].qs;
|
|
||||||
int8_t * restrict a = aux8;
|
|
||||||
for (int l = 0; l < 32; ++l) {
|
|
||||||
a[l+ 0] = q4[l] & 0xF;
|
|
||||||
a[l+32] = q4[l] >> 4;
|
|
||||||
}
|
|
||||||
for (int is = 0; is < 8; ++is) {
|
|
||||||
uint8_t m = 1 << is;
|
|
||||||
for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
|
|
||||||
}
|
|
||||||
|
|
||||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
float sumf = 0;
|
||||||
const int8_t * restrict sc = x[i].scales;
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const uint8_t * restrict q4 = x[i].qs;
|
||||||
for (int j = 0; j < QK_K/16; ++j) {
|
const uint8_t * restrict hm = x[i].qh;
|
||||||
const float dl = d * sc[j];
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
|
int8_t * restrict a = aux8;
|
||||||
for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]);
|
for (int l = 0; l < 32; ++l) {
|
||||||
q8 += 16; a += 16;
|
a[l+ 0] = q4[l] & 0xF;
|
||||||
}
|
a[l+32] = q4[l] >> 4;
|
||||||
}
|
}
|
||||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
for (int is = 0; is < 8; ++is) {
|
||||||
*s = sumf;
|
uint8_t m = 1 << is;
|
||||||
|
for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
const int8_t * restrict sc = x[i].scales;
|
||||||
|
|
||||||
|
for (int j = 0; j < QK_K/16; ++j) {
|
||||||
|
const float dl = d * sc[j];
|
||||||
|
for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
|
||||||
|
for (int l = 0; l < 8; ++l) ((float *)sums)[l] += dl * (aux16[l] + aux16[8+l]);
|
||||||
|
q8 += 16; a += 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int l = 0; l < 8; ++l) sumf += ((float *)sums)[l];
|
||||||
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue