Q4 cleanup
This commit is contained in:
parent
8944a13296
commit
21ee6d97cc
3 changed files with 9 additions and 350 deletions
|
@ -174,7 +174,6 @@ if (LLAMA_ALL_WARNINGS)
|
||||||
-Wshadow
|
-Wshadow
|
||||||
-Wstrict-prototypes
|
-Wstrict-prototypes
|
||||||
-Wpointer-arith
|
-Wpointer-arith
|
||||||
-Wno-unused-function
|
|
||||||
)
|
)
|
||||||
set(cxx_flags
|
set(cxx_flags
|
||||||
-Wall
|
-Wall
|
||||||
|
|
2
Makefile
2
Makefile
|
@ -36,7 +36,7 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
|
|
||||||
# warnings
|
# warnings
|
||||||
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
|
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
|
||||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
||||||
|
|
||||||
# OS specific
|
# OS specific
|
||||||
|
|
356
ggml.c
356
ggml.c
|
@ -1562,7 +1562,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
||||||
.quantize_row_q_dot = quantize_row_q8_0,
|
.quantize_row_q_dot = quantize_row_q8_0,
|
||||||
.vec_dot_q = ggml_vec_dot_q4_2_q8_0,
|
.vec_dot_q = ggml_vec_dot_q4_2_q8_0,
|
||||||
},
|
},
|
||||||
// TODO: GGML_TYPE_Q8_0
|
[GGML_TYPE_Q8_0] = {
|
||||||
|
.dequantize_row_q = NULL, // TODO
|
||||||
|
.quantize_row_q = quantize_row_q8_0,
|
||||||
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference,
|
||||||
|
.quantize_row_q_dot = quantize_row_q8_0,
|
||||||
|
.vec_dot_q = NULL, // TODO
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
// For internal test use
|
// For internal test use
|
||||||
|
@ -2349,352 +2355,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
|
||||||
const int nb = n / QK4_0;
|
|
||||||
|
|
||||||
assert(n % QK4_0 == 0);
|
|
||||||
assert(nb % 2 == 0);
|
|
||||||
|
|
||||||
const block_q4_0 * restrict x = vx;
|
|
||||||
const block_q4_0 * restrict y = vy;
|
|
||||||
|
|
||||||
float sumf = 0.0;
|
|
||||||
|
|
||||||
#if defined(__ARM_NEON)
|
|
||||||
float sum0 = 0.0f;
|
|
||||||
float sum1 = 0.0f;
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i += 2) {
|
|
||||||
const block_q4_0 * restrict x0 = &x[i + 0];
|
|
||||||
const block_q4_0 * restrict y0 = &y[i + 0];
|
|
||||||
const block_q4_0 * restrict x1 = &x[i + 1];
|
|
||||||
const block_q4_0 * restrict y1 = &y[i + 1];
|
|
||||||
|
|
||||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
|
||||||
const int8x16_t s8b = vdupq_n_s8(0x8);
|
|
||||||
|
|
||||||
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
|
||||||
const uint8x16_t v1_0 = vld1q_u8(y0->qs);
|
|
||||||
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
|
||||||
const uint8x16_t v1_1 = vld1q_u8(y1->qs);
|
|
||||||
|
|
||||||
// 4-bit -> 8-bit
|
|
||||||
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
|
|
||||||
const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
|
|
||||||
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
|
||||||
const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
|
|
||||||
|
|
||||||
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
|
|
||||||
const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
|
|
||||||
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
|
||||||
const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
|
|
||||||
|
|
||||||
// sub 8
|
|
||||||
const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
|
|
||||||
const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
|
|
||||||
const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
|
|
||||||
const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
|
|
||||||
|
|
||||||
const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
|
|
||||||
const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
|
|
||||||
const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
|
|
||||||
const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
|
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_DOTPROD)
|
|
||||||
// dot product into int32x4_t
|
|
||||||
int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
|
|
||||||
int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
|
|
||||||
|
|
||||||
p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
|
|
||||||
p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
|
|
||||||
|
|
||||||
sum0 += x0->d*y0->d*vaddvq_s32(p_0);
|
|
||||||
sum1 += x1->d*y1->d*vaddvq_s32(p_1);
|
|
||||||
#else
|
|
||||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
|
|
||||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
|
|
||||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
|
|
||||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
|
|
||||||
|
|
||||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
|
|
||||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
|
|
||||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
|
|
||||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
|
|
||||||
|
|
||||||
const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
|
|
||||||
const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
|
|
||||||
|
|
||||||
const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
|
|
||||||
const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
|
|
||||||
|
|
||||||
const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
|
|
||||||
const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
|
|
||||||
|
|
||||||
sum0 += x0->d*y0->d*vaddvq_s16(p_0);
|
|
||||||
sum1 += x1->d*y1->d*vaddvq_s16(p_1);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
sumf = sum0 + sum1;
|
|
||||||
#elif defined(__AVX512F__)
|
|
||||||
// Initialize accumulator with zeros
|
|
||||||
__m512 acc0 = _mm512_setzero_ps();
|
|
||||||
__m512 acc1 = _mm512_setzero_ps();
|
|
||||||
|
|
||||||
const int superblock_size = 16;
|
|
||||||
|
|
||||||
const int superblock_count = nb / superblock_size;
|
|
||||||
|
|
||||||
for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
|
|
||||||
int i = superblock_ix * superblock_size;
|
|
||||||
|
|
||||||
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+0 );
|
|
||||||
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+2 );
|
|
||||||
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+4 );
|
|
||||||
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+6 );
|
|
||||||
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+8 );
|
|
||||||
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+10 );
|
|
||||||
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+12 );
|
|
||||||
acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+14 );
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remainders
|
|
||||||
for (int i = superblock_count * superblock_size; i < nb; i += 2) {
|
|
||||||
acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i );
|
|
||||||
}
|
|
||||||
|
|
||||||
// Horizontal sum of all lanes of the accumulator
|
|
||||||
sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
|
|
||||||
#elif defined(__AVX2__)
|
|
||||||
// Initialize accumulator with zeros
|
|
||||||
__m256 acc = _mm256_setzero_ps();
|
|
||||||
|
|
||||||
/* Prepare the constants we will need during execution */
|
|
||||||
const __m256i lowMask = _mm256_set1_epi8( 0xF );
|
|
||||||
const __m256i offset_8 = _mm256_set1_epi16( 8 );
|
|
||||||
|
|
||||||
#define UNROLL_COUNT 8
|
|
||||||
// make sure we only unroll multiples of the block count
|
|
||||||
assert(nb % UNROLL_COUNT == 0);
|
|
||||||
|
|
||||||
// Main loop
|
|
||||||
for (int i = 0; i < nb; i+=UNROLL_COUNT) {
|
|
||||||
// This loop will be unrolled by the compiler
|
|
||||||
for (int u=0;u<UNROLL_COUNT;u++) {
|
|
||||||
/* Compute combined scale for the block */
|
|
||||||
const __m256 scale = _mm256_mul_ps(
|
|
||||||
_mm256_broadcast_ss( &x[i+u].d ),
|
|
||||||
_mm256_broadcast_ss( &y[i+u].d ) );
|
|
||||||
|
|
||||||
/* get input from x
|
|
||||||
Input: 32 Nibbles (16 bytes) at *x[i+u]
|
|
||||||
Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
|
|
||||||
|
|
||||||
/* Load 16 bytes from memory */
|
|
||||||
const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
|
|
||||||
/* Expand bytes into uint16_t values */
|
|
||||||
const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
|
|
||||||
/* Unpack values into individual bytes */
|
|
||||||
__m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
|
|
||||||
const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
|
|
||||||
__m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
|
|
||||||
/* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
|
|
||||||
x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
|
|
||||||
x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
|
|
||||||
|
|
||||||
/* get input from y
|
|
||||||
Input: 32 Nibbles (16 bytes) at *y[i+u]
|
|
||||||
Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
|
|
||||||
|
|
||||||
/* Load 16 bytes from memory */
|
|
||||||
const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
|
|
||||||
/* Expand bytes into uint16_t values */
|
|
||||||
const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
|
|
||||||
/* Unpack values into individual bytes */
|
|
||||||
const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
|
|
||||||
__m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
|
|
||||||
__m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
|
|
||||||
/* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
|
|
||||||
y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
|
|
||||||
y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
|
|
||||||
|
|
||||||
/* Compute products of int16_t integers, add pairwise, store as int32_t */
|
|
||||||
__m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
|
|
||||||
__m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
|
|
||||||
|
|
||||||
/* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
|
|
||||||
__m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
|
|
||||||
|
|
||||||
/* Convert to vectore of 8 int32_t to 8 floats */
|
|
||||||
__m256 q = _mm256_cvtepi32_ps( xy_q );
|
|
||||||
|
|
||||||
/* Multiply q with scale and accumulate */
|
|
||||||
acc = _mm256_fmadd_ps( scale, q, acc );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return horizontal sum of the acc vector
|
|
||||||
__m128 res = _mm256_extractf128_ps( acc, 1 );
|
|
||||||
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
|
|
||||||
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
|
||||||
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
|
||||||
|
|
||||||
sumf = _mm_cvtss_f32( res );
|
|
||||||
#elif defined(__AVX__)
|
|
||||||
// Initialize accumulator with zeros
|
|
||||||
__m256 acc = _mm256_setzero_ps();
|
|
||||||
|
|
||||||
// Main loop
|
|
||||||
for (int i = 0; i < nb; ++i) {
|
|
||||||
// Compute combined scale for the block
|
|
||||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
|
||||||
|
|
||||||
__m128i i32[2];
|
|
||||||
for (int j = 0; j < 2; ++j) {
|
|
||||||
// Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
|
|
||||||
__m128i bx = bytesFromNibbles( x[i].qs + 8*j );
|
|
||||||
__m128i by = bytesFromNibbles( y[i].qs + 8*j );
|
|
||||||
|
|
||||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
|
||||||
const __m128i off = _mm_set1_epi8( 8 );
|
|
||||||
bx = _mm_sub_epi8( bx, off );
|
|
||||||
by = _mm_sub_epi8( by, off );
|
|
||||||
|
|
||||||
// Get absolute values of x vectors
|
|
||||||
const __m128i ax = _mm_sign_epi8(bx, bx);
|
|
||||||
|
|
||||||
// Sign the values of the y vectors
|
|
||||||
const __m128i sy = _mm_sign_epi8(by, bx);
|
|
||||||
|
|
||||||
// Perform multiplication and create 16-bit values
|
|
||||||
const __m128i dot = _mm_maddubs_epi16(ax, sy);
|
|
||||||
|
|
||||||
const __m128i ones = _mm_set1_epi16(1);
|
|
||||||
i32[j] = _mm_madd_epi16(ones, dot);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert int32_t to float
|
|
||||||
__m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
|
|
||||||
// Apply the scale, and accumulate
|
|
||||||
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return horizontal sum of the acc vector
|
|
||||||
__m128 res = _mm256_extractf128_ps( acc, 1 );
|
|
||||||
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
|
|
||||||
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
|
|
||||||
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
|
|
||||||
|
|
||||||
sumf = _mm_cvtss_f32( res );
|
|
||||||
#elif defined(__wasm_simd128__)
|
|
||||||
// wasm simd
|
|
||||||
float sum0 = 0.0f;
|
|
||||||
float sum1 = 0.0f;
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i += 2) {
|
|
||||||
const block_q4_0 * restrict x0 = &x[i + 0];
|
|
||||||
const block_q4_0 * restrict y0 = &y[i + 0];
|
|
||||||
const block_q4_0 * restrict x1 = &x[i + 1];
|
|
||||||
const block_q4_0 * restrict y1 = &y[i + 1];
|
|
||||||
|
|
||||||
const v128_t m4b = wasm_u8x16_splat(0xf);
|
|
||||||
const v128_t s8b = wasm_i8x16_splat(0x8);
|
|
||||||
|
|
||||||
const v128_t v0_0 = wasm_v128_load(x0->qs);
|
|
||||||
const v128_t v0_1 = wasm_v128_load(y0->qs);
|
|
||||||
const v128_t v1_0 = wasm_v128_load(x1->qs);
|
|
||||||
const v128_t v1_1 = wasm_v128_load(y1->qs);
|
|
||||||
|
|
||||||
// 4-bit -> 8-bit
|
|
||||||
const v128_t v0_0l = wasm_v128_and(v0_0, m4b);
|
|
||||||
const v128_t v1_0l = wasm_v128_and(v1_0, m4b);
|
|
||||||
|
|
||||||
const v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);
|
|
||||||
const v128_t v1_0h = wasm_u8x16_shr(v1_0, 4);
|
|
||||||
|
|
||||||
const v128_t v0_1l = wasm_v128_and(v0_1, m4b);
|
|
||||||
const v128_t v1_1l = wasm_v128_and(v1_1, m4b);
|
|
||||||
|
|
||||||
const v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);
|
|
||||||
const v128_t v1_1h = wasm_u8x16_shr(v1_1, 4);
|
|
||||||
|
|
||||||
// sub 8
|
|
||||||
const v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);
|
|
||||||
const v128_t v1_0ls = wasm_i8x16_sub(v1_0l, s8b);
|
|
||||||
|
|
||||||
const v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);
|
|
||||||
const v128_t v1_0hs = wasm_i8x16_sub(v1_0h, s8b);
|
|
||||||
|
|
||||||
const v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);
|
|
||||||
const v128_t v1_1ls = wasm_i8x16_sub(v1_1l, s8b);
|
|
||||||
|
|
||||||
const v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);
|
|
||||||
const v128_t v1_1hs = wasm_i8x16_sub(v1_1h, s8b);
|
|
||||||
|
|
||||||
// dot product into int16x8_t
|
|
||||||
const v128_t pl0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0ls), wasm_i16x8_extend_low_i8x16(v1_0ls));
|
|
||||||
const v128_t pl0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0ls), wasm_i16x8_extend_high_i8x16(v1_0ls));
|
|
||||||
|
|
||||||
const v128_t ph0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0hs), wasm_i16x8_extend_low_i8x16(v1_0hs));
|
|
||||||
const v128_t ph0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0hs), wasm_i16x8_extend_high_i8x16(v1_0hs));
|
|
||||||
|
|
||||||
const v128_t pl1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1ls), wasm_i16x8_extend_low_i8x16(v1_1ls));
|
|
||||||
const v128_t pl1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1ls), wasm_i16x8_extend_high_i8x16(v1_1ls));
|
|
||||||
|
|
||||||
const v128_t ph1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1hs), wasm_i16x8_extend_low_i8x16(v1_1hs));
|
|
||||||
const v128_t ph1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1hs), wasm_i16x8_extend_high_i8x16(v1_1hs));
|
|
||||||
|
|
||||||
const v128_t pl_0 = wasm_i16x8_add(pl0l, pl0h);
|
|
||||||
const v128_t ph_0 = wasm_i16x8_add(ph0l, ph0h);
|
|
||||||
|
|
||||||
const v128_t pl_1 = wasm_i16x8_add(pl1l, pl1h);
|
|
||||||
const v128_t ph_1 = wasm_i16x8_add(ph1l, ph1h);
|
|
||||||
|
|
||||||
const v128_t p_0 = wasm_i16x8_add(pl_0, ph_0);
|
|
||||||
const v128_t p_1 = wasm_i16x8_add(pl_1, ph_1);
|
|
||||||
|
|
||||||
sum0 += x0->d * y0->d * (
|
|
||||||
wasm_i16x8_extract_lane(p_0, 0) + wasm_i16x8_extract_lane(p_0, 1) +
|
|
||||||
wasm_i16x8_extract_lane(p_0, 2) + wasm_i16x8_extract_lane(p_0, 3) +
|
|
||||||
wasm_i16x8_extract_lane(p_0, 4) + wasm_i16x8_extract_lane(p_0, 5) +
|
|
||||||
wasm_i16x8_extract_lane(p_0, 6) + wasm_i16x8_extract_lane(p_0, 7));
|
|
||||||
sum1 += x1->d * y1->d * (
|
|
||||||
wasm_i16x8_extract_lane(p_1, 0) + wasm_i16x8_extract_lane(p_1, 1) +
|
|
||||||
wasm_i16x8_extract_lane(p_1, 2) + wasm_i16x8_extract_lane(p_1, 3) +
|
|
||||||
wasm_i16x8_extract_lane(p_1, 4) + wasm_i16x8_extract_lane(p_1, 5) +
|
|
||||||
wasm_i16x8_extract_lane(p_1, 6) + wasm_i16x8_extract_lane(p_1, 7));
|
|
||||||
}
|
|
||||||
|
|
||||||
sumf = sum0 + sum1;
|
|
||||||
#else
|
|
||||||
// scalar
|
|
||||||
for (int i = 0; i < nb; i++) {
|
|
||||||
const float d0 = x[i].d;
|
|
||||||
const float d1 = y[i].d;
|
|
||||||
|
|
||||||
const uint8_t * restrict p0 = x[i].qs;
|
|
||||||
const uint8_t * restrict p1 = y[i].qs;
|
|
||||||
|
|
||||||
int sumi = 0;
|
|
||||||
for (int j = 0; j < QK4_0/2; j++) {
|
|
||||||
const uint8_t v0 = p0[j];
|
|
||||||
const uint8_t v1 = p1[j];
|
|
||||||
|
|
||||||
const int i0 = (v0 & 0xf) - 8;
|
|
||||||
const int i1 = (v0 >> 4) - 8;
|
|
||||||
|
|
||||||
const int i2 = (v1 & 0xf) - 8;
|
|
||||||
const int i3 = (v1 >> 4) - 8;
|
|
||||||
|
|
||||||
sumi += i0*i2 + i1*i3;
|
|
||||||
}
|
|
||||||
sumf += d0 * d1 * sumi;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
*s = sumf;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
const int nb = n / QK4_1;
|
const int nb = n / QK4_1;
|
||||||
|
|
||||||
|
@ -11064,7 +10724,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||||
#endif
|
#endif
|
||||||
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
||||||
cur = 0;
|
cur = 0;
|
||||||
} else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) {
|
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||||
node->n_tasks = 1;
|
node->n_tasks = 1;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue