/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ │ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi │ ╚──────────────────────────────────────────────────────────────────────────────╝ │ │ │ GGML │ │ Copyright (c) 2023 Georgi Gerganov │ │ │ │ Permission is hereby granted, free of charge, to any person obtaining │ │ a copy of this software and associated documentation files (the │ │ "Software"), to deal in the Software without restriction, including │ │ without limitation the rights to use, copy, modify, merge, publish, │ │ distribute, sublicense, and/or sell copies of the Software, and to │ │ permit persons to whom the Software is furnished to do so, subject to │ │ the following conditions: │ │ │ │ The above copyright notice and this permission notice shall be │ │ included in all copies or substantial portions of the Software. │ │ │ │ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ │ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ │ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ │ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ │ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ │ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ │ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ │ │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "third_party/ggml/ggjt.v2.q4_1.h" #include "libc/assert.h" #include "libc/macros.internal.h" #include "libc/math.h" #include "third_party/ggml/ggjt.v2.internal.h" #include "third_party/ggml/ggjt.v2.q8_1.h" static_assert(sizeof(block_v2_q4_1) == 2 * sizeof(float) + V2_QK4_1 / 2, "wrong q4_1 block size/padding"); void dequantize_row_v2_q4_1(const void * restrict x_, float * restrict y, int k) { const block_v2_q4_1 * restrict x = x_; static const int qk = V2_QK4_1; assert(k % qk == 0); const int nb = k / qk; for (int i = 0; i < nb; i++) { const float d = x[i].d; const float m = x[i].m; for (int j = 0; j < qk/2; ++j) { const int x0 = (x[i].qs[j] & 0x0F); const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; } } } size_t ggml_quantize_v2_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % V2_QK4_1 == 0); const int nb = k / V2_QK4_1; for (int b = 0; b < n; b += k) { block_v2_q4_1 * restrict y = (block_v2_q4_1 *) dst + b/V2_QK4_1; quantize_row_v2_q4_1_reference(src + b, y, k); for (int i = 0; i < nb; i++) { for (int j = 0; j < V2_QK4_1; j += 2) { const uint8_t vi0 = y[i].qs[j/2] & 0x0F; const uint8_t vi1 = y[i].qs[j/2] >> 4; hist[vi0]++; hist[vi1]++; } } } return (n/V2_QK4_1*sizeof(block_v2_q4_1)); } void quantize_row_v2_q4_1(const float * restrict x, void * restrict y, int k) { quantize_row_v2_q4_1_reference(x, y, k); } void ggml_vec_dot_v2_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = V2_QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(nb % 2 == 0); const block_v2_q4_1 * restrict x = vx; const block_v2_q8_1 * restrict y = vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); float summs = 0; for (int i = 0; i < nb; i += 2) { const block_v2_q4_1 * restrict x0 = &x[i + 0]; const block_v2_q4_1 * restrict x1 = &x[i + 1]; const block_v2_q8_1 * restrict y0 = &y[i + 0]; const block_v2_q8_1 * restrict y1 = &y[i + 1]; summs += x0->m * y0->s + x1->m * y1->s; const uint8x16_t m4b = vdupq_n_u8(0x0F); const uint8x16_t v0_0 = vld1q_u8(x0->qs); const uint8x16_t v0_1 = vld1q_u8(x1->qs); // 4-bit -> 8-bit const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); #endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; #elif defined(__AVX2__) || defined(__AVX__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); float summs = 0; // Main loop for (int i = 0; i < nb; ++i) { const float * d0 = &x[i].d; const float * d1 = &y[i].d; summs += x[i].m * y[i].s; const __m256 d0v = _mm256_broadcast_ss( d0 ); const __m256 d1v = _mm256_broadcast_ss( d1 ); // Compute combined scales const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes const __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); const __m256 xy = mul_sum_us8_pairs_float(bx, by); // Accumulate d0*d1*x*y #if defined(__AVX2__) acc = _mm256_fmadd_ps( d0d1, xy, acc ); #else acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); #endif } *s = hsum_float_8(acc) + summs; #else // scalar float sumf = 0.0; for (int i = 0; i < nb; i++) { int sumi = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[i].qs[j] & 0x0F); const int v1 = (x[i].qs[j] >> 4); sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s; } *s = sumf; #endif } void quantize_row_v2_q4_1_reference(const float * restrict x, void * restrict y_, int k) { block_v2_q4_1 * restrict y = y_; const int qk = V2_QK4_1; assert(k % qk == 0); const int nb = k / qk; for (int i = 0; i < nb; i++) { float min = FLT_MAX; float max = -FLT_MAX; for (int j = 0; j < qk; j++) { const float v = x[i*qk + j]; if (v < min) min = v; if (v > max) max = v; } const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; y[i].d = d; y[i].m = min; for (int j = 0; j < qk/2; ++j) { const float x0 = (x[i*qk + 0 + j] - min)*id; const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); y[i].qs[j] = xi0; y[i].qs[j] |= xi1 << 4; } } }