diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8eadea4fd..d7aa051da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -174,7 +174,6 @@ if (LLAMA_ALL_WARNINGS)
             -Wshadow
             -Wstrict-prototypes
             -Wpointer-arith
-            -Wno-unused-function
         )
         set(cxx_flags
             -Wall
diff --git a/Makefile b/Makefile
index deb0d0009..d9a2d836b 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =
 
 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
 
 # OS specific
diff --git a/ggml.c b/ggml.c
index 13c1548fe..dd31fc9b2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1562,7 +1562,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .quantize_row_q_dot       = quantize_row_q8_0,
         .vec_dot_q                = ggml_vec_dot_q4_2_q8_0,
     },
-    // TODO: GGML_TYPE_Q8_0
+    [GGML_TYPE_Q8_0] = {
+        .dequantize_row_q         = NULL,   // TODO
+        .quantize_row_q           = quantize_row_q8_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference,
+        .quantize_row_q_dot       = quantize_row_q8_0,
+        .vec_dot_q                = NULL,   // TODO
+    },
 };
 
 // For internal test use
@@ -2349,352 +2355,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     *s = sumf;
 }
 
-static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK4_0;
-
-    assert(n % QK4_0 == 0);
-    assert(nb % 2 == 0);
-
-    const block_q4_0 * restrict x = vx;
-    const block_q4_0 * restrict y = vy;
-
-    float sumf = 0.0;
-
-#if defined(__ARM_NEON)
-    float sum0 = 0.0f;
-    float sum1 = 0.0f;
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &x[i + 0];
-        const block_q4_0 * restrict y0 = &y[i + 0];
-        const block_q4_0 * restrict x1 = &x[i + 1];
-        const block_q4_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0xf);
-        const int8x16_t  s8b = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-        const uint8x16_t v1_1 = vld1q_u8(y1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
-        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
-
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
-        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
-
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int32x4_t
-        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
-        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
-
-        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
-        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
-
-        sum0 += x0->d*y0->d*vaddvq_s32(p_0);
-        sum1 += x1->d*y1->d*vaddvq_s32(p_1);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
-
-        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
-        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
-
-        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
-        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
-
-        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
-        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
-
-        sum0 += x0->d*y0->d*vaddvq_s16(p_0);
-        sum1 += x1->d*y1->d*vaddvq_s16(p_1);
-#endif
-    }
-
-    sumf = sum0 + sum1;
-#elif defined(__AVX512F__)
-    // Initialize accumulator with zeros
-    __m512 acc0 = _mm512_setzero_ps();
-    __m512 acc1 = _mm512_setzero_ps();
-
-    const int superblock_size = 16;
-
-    const int superblock_count = nb / superblock_size;
-
-    for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
-        int i = superblock_ix * superblock_size;
-
-        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+0 );
-        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+2 );
-        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+4 );
-        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+6 );
-        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+8 );
-        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+10 );
-        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i+12 );
-        acc1 = dot_q4_0_twoblocks_avx512( acc1, x, y, i+14 );
-    }
-
-    // Remainders
-    for (int i = superblock_count * superblock_size; i < nb; i += 2) {
-        acc0 = dot_q4_0_twoblocks_avx512( acc0, x, y, i );
-    }
-
-    // Horizontal sum of all lanes of the accumulator
-    sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    /* Prepare the constants we will need during execution */
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    const __m256i offset_8 = _mm256_set1_epi16( 8 );
-
-#define UNROLL_COUNT 8
-    // make sure we only unroll multiples of the block count
-    assert(nb % UNROLL_COUNT == 0);
-
-    // Main loop
-    for (int i = 0; i < nb; i+=UNROLL_COUNT) {
-        // This loop will be unrolled by the compiler
-        for (int u=0;u<UNROLL_COUNT;u++)  {
-            /* Compute combined scale for the block */
-            const __m256 scale = _mm256_mul_ps(
-                    _mm256_broadcast_ss( &x[i+u].d ),
-                    _mm256_broadcast_ss( &y[i+u].d ) );
-
-            /* get input from x
-               Input: 32 Nibbles (16 bytes) at *x[i+u]
-               Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */
-
-            /* Load 16 bytes from memory */
-            const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
-            /* Expand bytes into uint16_t values */
-            const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
-            /* Unpack values into individual bytes */
-            __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
-            const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
-            __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
-            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval.  Offset them into [ -8 .. +7 ] interval.  */
-            x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
-            x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );
-
-            /* get input from y
-               Input: 32 Nibbles (16 bytes) at *y[i+u]
-               Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */
-
-            /* Load 16 bytes from memory */
-            const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
-            /* Expand bytes into uint16_t values */
-            const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
-            /* Unpack values into individual bytes */
-            const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
-            __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
-            __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
-            /* Now we have two vectors with bytes in [ 0 .. 15 ] interval.  Offset them into [ -8 .. +7 ] interval.  */
-            y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
-            y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );
-
-            /* Compute products of int16_t integers, add pairwise, store as int32_t */
-            __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
-            __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );
-
-            /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
-            __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );
-
-            /* Convert to vectore of 8 int32_t to 8 floats */
-            __m256 q = _mm256_cvtepi32_ps( xy_q );
-
-            /* Multiply q with scale and accumulate */
-            acc = _mm256_fmadd_ps( scale, q, acc );
-        }
-    }
-
-    // Return horizontal sum of the acc vector
-    __m128 res = _mm256_extractf128_ps( acc, 1 );
-    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
-    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
-    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
-
-    sumf = _mm_cvtss_f32( res );
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        __m128i i32[2];
-        for (int j = 0; j < 2; ++j) {
-            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
-            __m128i bx = bytesFromNibbles( x[i].qs + 8*j );
-            __m128i by = bytesFromNibbles( y[i].qs + 8*j );
-
-            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-            const __m128i off = _mm_set1_epi8( 8 );
-            bx = _mm_sub_epi8( bx, off );
-            by = _mm_sub_epi8( by, off );
-
-            // Get absolute values of x vectors
-            const __m128i ax = _mm_sign_epi8(bx, bx);
-
-            // Sign the values of the y vectors
-            const __m128i sy = _mm_sign_epi8(by, bx);
-
-            // Perform multiplication and create 16-bit values
-            const __m128i dot = _mm_maddubs_epi16(ax, sy);
-
-            const __m128i ones = _mm_set1_epi16(1);
-            i32[j] = _mm_madd_epi16(ones, dot);
-        }
-
-        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
-        // Apply the scale, and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
-    }
-
-    // Return horizontal sum of the acc vector
-    __m128 res = _mm256_extractf128_ps( acc, 1 );
-    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
-    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
-    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
-
-    sumf = _mm_cvtss_f32( res );
-#elif defined(__wasm_simd128__)
-    // wasm simd
-    float sum0 = 0.0f;
-    float sum1 = 0.0f;
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &x[i + 0];
-        const block_q4_0 * restrict y0 = &y[i + 0];
-        const block_q4_0 * restrict x1 = &x[i + 1];
-        const block_q4_0 * restrict y1 = &y[i + 1];
-
-        const v128_t m4b = wasm_u8x16_splat(0xf);
-        const v128_t s8b = wasm_i8x16_splat(0x8);
-
-        const v128_t v0_0 = wasm_v128_load(x0->qs);
-        const v128_t v0_1 = wasm_v128_load(y0->qs);
-        const v128_t v1_0 = wasm_v128_load(x1->qs);
-        const v128_t v1_1 = wasm_v128_load(y1->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0_0l = wasm_v128_and(v0_0, m4b);
-        const v128_t v1_0l = wasm_v128_and(v1_0, m4b);
-
-        const v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);
-        const v128_t v1_0h = wasm_u8x16_shr(v1_0, 4);
-
-        const v128_t v0_1l = wasm_v128_and(v0_1, m4b);
-        const v128_t v1_1l = wasm_v128_and(v1_1, m4b);
-
-        const v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);
-        const v128_t v1_1h = wasm_u8x16_shr(v1_1, 4);
-
-        // sub 8
-        const v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);
-        const v128_t v1_0ls = wasm_i8x16_sub(v1_0l, s8b);
-
-        const v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);
-        const v128_t v1_0hs = wasm_i8x16_sub(v1_0h, s8b);
-
-        const v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);
-        const v128_t v1_1ls = wasm_i8x16_sub(v1_1l, s8b);
-
-        const v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);
-        const v128_t v1_1hs = wasm_i8x16_sub(v1_1h, s8b);
-
-        // dot product into int16x8_t
-        const v128_t pl0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0ls), wasm_i16x8_extend_low_i8x16(v1_0ls));
-        const v128_t pl0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0ls), wasm_i16x8_extend_high_i8x16(v1_0ls));
-
-        const v128_t ph0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0hs), wasm_i16x8_extend_low_i8x16(v1_0hs));
-        const v128_t ph0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0hs), wasm_i16x8_extend_high_i8x16(v1_0hs));
-
-        const v128_t pl1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1ls), wasm_i16x8_extend_low_i8x16(v1_1ls));
-        const v128_t pl1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1ls), wasm_i16x8_extend_high_i8x16(v1_1ls));
-
-        const v128_t ph1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1hs), wasm_i16x8_extend_low_i8x16(v1_1hs));
-        const v128_t ph1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1hs), wasm_i16x8_extend_high_i8x16(v1_1hs));
-
-        const v128_t pl_0 = wasm_i16x8_add(pl0l, pl0h);
-        const v128_t ph_0 = wasm_i16x8_add(ph0l, ph0h);
-
-        const v128_t pl_1 = wasm_i16x8_add(pl1l, pl1h);
-        const v128_t ph_1 = wasm_i16x8_add(ph1l, ph1h);
-
-        const v128_t p_0 = wasm_i16x8_add(pl_0, ph_0);
-        const v128_t p_1 = wasm_i16x8_add(pl_1, ph_1);
-
-        sum0 += x0->d * y0->d * (
-                wasm_i16x8_extract_lane(p_0, 0) + wasm_i16x8_extract_lane(p_0, 1) +
-                wasm_i16x8_extract_lane(p_0, 2) + wasm_i16x8_extract_lane(p_0, 3) +
-                wasm_i16x8_extract_lane(p_0, 4) + wasm_i16x8_extract_lane(p_0, 5) +
-                wasm_i16x8_extract_lane(p_0, 6) + wasm_i16x8_extract_lane(p_0, 7));
-        sum1 += x1->d * y1->d * (
-                wasm_i16x8_extract_lane(p_1, 0) + wasm_i16x8_extract_lane(p_1, 1) +
-                wasm_i16x8_extract_lane(p_1, 2) + wasm_i16x8_extract_lane(p_1, 3) +
-                wasm_i16x8_extract_lane(p_1, 4) + wasm_i16x8_extract_lane(p_1, 5) +
-                wasm_i16x8_extract_lane(p_1, 6) + wasm_i16x8_extract_lane(p_1, 7));
-    }
-
-    sumf = sum0 + sum1;
-#else
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float d0 = x[i].d;
-        const float d1 = y[i].d;
-
-        const uint8_t * restrict p0 = x[i].qs;
-        const uint8_t * restrict p1 = y[i].qs;
-
-        int sumi = 0;
-        for (int j = 0; j < QK4_0/2; j++) {
-            const uint8_t v0 = p0[j];
-            const uint8_t v1 = p1[j];
-
-            const int i0 = (v0 & 0xf) - 8;
-            const int i1 = (v0 >> 4) - 8;
-
-            const int i2 = (v1 & 0xf) - 8;
-            const int i3 = (v1 >> 4) - 8;
-
-            sumi += i0*i2 + i1*i3;
-        }
-        sumf += d0 * d1 * sumi;
-    }
-#endif
-
-    *s = sumf;
-}
-
 static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK4_1;
 
@@ -11064,7 +10724,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
-                        } else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) {
+                        } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
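
Editor's note, for readers unfamiliar with the deleted kernel: the scalar fallback of ggml_vec_dot_q4_0() unpacks two 4-bit quants per byte, re-centers them from [0, 15] to [-8, 7], accumulates an integer dot product per 32-element block, and scales by the product of the two block scales. Below is a minimal standalone sketch of that logic; the block_q4_0 layout mirrors ggml.c, while the helper name vec_dot_q4_0_scalar and the main() driver are illustrative additions, not part of this patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define QK4_0 32

/* assumed layout, mirroring block_q4_0 in ggml.c: one fp32 scale plus
 * 32 quants packed two per byte (low nibble first) */
typedef struct {
    float   d;
    uint8_t qs[QK4_0 / 2];
} block_q4_0;

/* hypothetical helper: scalar Q4_0 x Q4_0 dot product, equivalent to the
 * #else branch of the removed ggml_vec_dot_q4_0() */
static void vec_dot_q4_0_scalar(int n, float * s, const block_q4_0 * x, const block_q4_0 * y) {
    assert(n % QK4_0 == 0);
    const int nb = n / QK4_0;

    float sumf = 0.0f;
    for (int i = 0; i < nb; i++) {
        int sumi = 0;
        for (int j = 0; j < QK4_0/2; j++) {
            /* unpack nibbles and re-center from [0, 15] to [-8, 7] */
            const int i0 = (x[i].qs[j] & 0xf) - 8;
            const int i1 = (x[i].qs[j] >> 4)  - 8;
            const int i2 = (y[i].qs[j] & 0xf) - 8;
            const int i3 = (y[i].qs[j] >> 4)  - 8;

            sumi += i0*i2 + i1*i3;
        }
        /* scale the integer block result by both block scales */
        sumf += x[i].d * y[i].d * sumi;
    }
    *s = sumf;
}

int main(void) {
    /* every byte 0x99: both nibbles decode to 9 - 8 = 1 */
    block_q4_0 a = { .d = 0.5f };
    block_q4_0 b = { .d = 2.0f };
    for (int j = 0; j < QK4_0/2; j++) { a.qs[j] = 0x99; b.qs[j] = 0x99; }

    float s = 0.0f;
    vec_dot_q4_0_scalar(QK4_0, &s, &a, &b);
    printf("dot = %f\n", s); /* 0.5 * 2.0 * 32 = 32.0 */
    return 0;
}

With this patch, the Q4_0 x Q4_0 path is no longer reachable: an F32 src1 row is quantized on the fly via quantize_row_q_dot (now Q8_0) and multiplied against the quantized src0 with the mixed kernels (e.g. ggml_vec_dot_q4_2_q8_0 above). That also appears to be why the dispatch check in ggml_graph_compute() switches from probing vec_dot_q, which is deliberately left NULL in the new GGML_TYPE_Q8_0 table entry, to ggml_is_quantized().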