reformat, and label what these files are.
This commit is contained in:
parent
b8abefbec6
commit
fb83cd987d
2 changed files with 260 additions and 261 deletions
|
@ -1,3 +1,6 @@
|
|||
/* Xeon PHI IMCI support. */
|
||||
/* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */
|
||||
|
||||
// For uint32_t
|
||||
#include <stdint.h>
|
||||
|
||||
|
@ -37,6 +40,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target)
|
|||
: [RES] "+m" (*target)
|
||||
: [Z] "m" (zero)
|
||||
: "zmm8", "memory");
|
||||
|
||||
}
|
||||
|
||||
// This function performs two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors, then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16.
|
||||
|
@ -54,7 +58,6 @@ inline static void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16 (int8x16_t
|
|||
"mov\t%[SRC21],\t%%r8\n\t"
|
||||
"mov\t%[SCALE],\t%%r9\n\t"
|
||||
"vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result.
|
||||
|
||||
"1:\n\t"
|
||||
"inc\t%%ecx\n\t" // we are in our loop, increment our counter.
|
||||
"cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through.
|
||||
|
@ -92,7 +95,7 @@ inline static void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16 (int8x16_t
|
|||
[SRC21] "r" (src21),
|
||||
[SCALE] "r" (scale),
|
||||
[Z] "m" (zero)
|
||||
: "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "ecx", "r8", "r9", "r12", "memory");
|
||||
: "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "r8", "r9", "r12", "memory");
|
||||
}
|
||||
|
||||
// Unpack 256 unsigned 5 bit values into an 8 bit vector.
|
||||
|
@ -104,7 +107,7 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
|
|||
uint8_t bit5 = 0x10;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"vprefetche0\t(%[SRC1])\n\t"
|
||||
"vprefetche0\t(%[SRC1])\n\t" // Issue our memory requests first thing.
|
||||
"vprefetche0\t(%[SRC4])\n\t"
|
||||
"vprefetche1\t64(%[SRC4])\n\t"
|
||||
"mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list.
|
||||
|
@ -115,10 +118,8 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
|
|||
"vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask.
|
||||
"vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally).
|
||||
"vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for.
|
||||
|
||||
"1:\n\t"
|
||||
"inc\t%%ecx\n\t" // we are in the loop. increment the counter.
|
||||
|
||||
"vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test.
|
||||
"vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test.
|
||||
"vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value.
|
||||
|
@ -129,11 +130,9 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
|
|||
"vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5.
|
||||
"vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test.
|
||||
"vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result.
|
||||
|
||||
"add\t$32,\t%%r8\n\t"
|
||||
"cmp\t$4,\t%%ecx\n\t"
|
||||
"vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for.
|
||||
|
||||
"vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test.
|
||||
"vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test.
|
||||
"vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4.
|
||||
|
@ -142,9 +141,7 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
|
|||
"vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5.
|
||||
"vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test.
|
||||
"vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result.
|
||||
|
||||
"je\t2f\n\t"
|
||||
|
||||
"vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for.
|
||||
"add\t$32,\t%%r12\n\t"
|
||||
"add\t$32,\t%%r8\n\t"
|
||||
|
@ -157,7 +154,7 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
|
|||
[M] "m" (m),
|
||||
[ALL] "m" (allmask),
|
||||
[BIT5] "m" (bit5)
|
||||
: "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "ecx", "k1", "k2", "r12", "r8", "memory"
|
||||
: "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "cc", "ecx", "k1", "k2", "r12", "r8", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
/* Xeon PHI IMCI support. */
|
||||
/* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// For size_t
|
||||
|
@ -25,6 +28,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target)
|
|||
: [RES] "+m" (*target)
|
||||
: [Z] "m" (zero)
|
||||
: "zmm8");
|
||||
|
||||
}
|
||||
|
||||
// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. Optionally clear the sum before starting.
|
||||
|
@ -113,10 +117,9 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri
|
|||
|
||||
GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1);
|
||||
|
||||
// FIXME: replace this with a final round using masked vectors.
|
||||
// add the leftovers that could not be handled by the vector loop.
|
||||
if ( n - np != 0 )
|
||||
{
|
||||
// add the leftovers that could not be handled by the vector loop.
|
||||
// our extended last part of x.
|
||||
float32x16_t v1;
|
||||
GGML_F32x16_VEC_ZERO(&v1);
|
||||
|
@ -130,7 +133,6 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri
|
|||
GGML_F32x16_VEC_FMA(&v1,
|
||||
&v2,
|
||||
&sum, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
// reduce sum, and store it in s.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue