From ecbe466a364876927994e2f1ec14f4d82301d201 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 19:47:21 +0200 Subject: [PATCH 1/8] Retire the ggml_mul_mat() branch for transposed src0 (#500) * Retire the ggml_mul_mat() for transposed src0 - It can always be made contiguous with ggml_cpy() - The code is now simplified - The results are deterministic in respect to num threads * SIMD-ify dequantize_row_q4_0() for ARM_NEON (#502) * Attempt to SIMD-ify dequantize_row_q4_0() for ARM_NEON * Fix dequantization - forgot to interleave the quants --- ggml.c | 1033 +++++++++++++++----------------------------------------- 1 file changed, 277 insertions(+), 756 deletions(-) diff --git a/ggml.c b/ggml.c index d8e1fbd4e..291e12a0a 100644 --- a/ggml.c +++ b/ggml.c @@ -496,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); -#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) +#if defined(__ARM_NEON) || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; @@ -507,7 +507,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { #endif #if defined(__POWER9_VECTOR__) -#if QK == 32 const vector float v85 = vec_splats(8.5f); for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -548,11 +547,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { //memcpy(pb, pp, sizeof(pp)); pb += bs; } -#else -#error "not implemented for QK" -#endif #elif __ARM_NEON -#if QK == 32 for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -589,11 +584,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { memcpy(pb, pp, sizeof(pp)); pb += bs; } -#else -#error "not implemented for QK" -#endif #elif defined(__AVX2__) -#if QK == 32 for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -660,11 +651,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { _mm_storeu_si128( ( __m128i* )pb, res ); pb += bs; } -#else -#error "not implemented for QK" -#endif #elif defined(__wasm_simd128__) -#if QK == 32 for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -701,9 +688,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { memcpy(pb, pp, sizeof(pp)); pb += bs; } -#else -#error "not implemented for QK" -#endif #else // scalar quantize_row_q4_0_reference(x, y, k); @@ -771,7 +755,7 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float)); -#if defined(__AVX2__) && QK % 32 == 0 +#if defined(__AVX2__) for (int i = 0; i < nb; i++) { // scale factor const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs)); @@ -804,6 +788,59 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { } } } +#elif defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + const float d = *(const float *) (pd + i*bs); + + const uint8_t * restrict pp = pb + i*bs; + + const float32x4_t vd = vdupq_n_f32(d); + + for (int l = 0; l < QK; l += 16) { + // Load 16x4-bit integers into 8x8-bit integers + const uint8x8_t v8 = vld1_u8(pp + l/2); + + // Expand 4-bit nibbles to 8-bit bytes + const 
uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f)); + const uint8x8_t v1 = vshr_n_u8(v8, 4); + + // Convert to signed 8-bit integers + const int8x8_t vs_0 = vreinterpret_s8_u8(v0); + const int8x8_t vs_1 = vreinterpret_s8_u8(v1); + + // Subtract 8 from each byte + const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8)); + const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8)); + + // Interleave and combine + const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1); + const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1); + + const int8x16_t vq = vcombine_s8(vx_0, vx_1); + + // convert to 2x int16x8_t + const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq)); + const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq)); + + // convert to 4x float32x4_t + const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0))); + const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0))); + const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1))); + const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1))); + + // Multiply by d + const float32x4_t r0 = vmulq_f32(vf_0, vd); + const float32x4_t r1 = vmulq_f32(vf_1, vd); + const float32x4_t r2 = vmulq_f32(vf_2, vd); + const float32x4_t r3 = vmulq_f32(vf_3, vd); + + // Store + vst1q_f32(y + i*QK + l + 0, r0); + vst1q_f32(y + i*QK + l + 4, r1); + vst1q_f32(y + i*QK + l + 8, r2); + vst1q_f32(y + i*QK + l + 12, r3); + } + } #else // scalar for (int i = 0; i < nb; i++) { @@ -1500,8 +1537,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void float sumf = 0.0; -#ifdef __ARM_NEON -#if QK == 32 +#if defined(__ARM_NEON) float sum0 = 0.0f; float sum1 = 0.0f; @@ -1600,12 +1636,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void } sumf = sum0 + sum1; -#else -#error "not implemented for QK" -#endif #elif defined(__AVX512F__) - -#if QK == 32 // Initialize accumulator with zeros __m512 acc0 = _mm512_setzero_ps(); __m512 acc1 = _mm512_setzero_ps(); @@ -1634,11 +1665,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void // Horizontal sum of all lanes of the accumulator sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); -#else -#error "not implemented for QK" -#endif #elif defined(__AVX2__) -#if QK == 32 const size_t countBlocks = nb; // Initialize accumulator with zeros @@ -1689,11 +1716,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); sumf = _mm_cvtss_f32( res ); -#else -#error "not implemented for QK" -#endif #elif defined(__wasm_simd128__) -#if QK == 32 // wasm simd float sum0 = 0.0f; float sum1 = 0.0f; @@ -1776,9 +1799,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void } sumf = sum0 + sum1; -#else -#error "not implemented for QK" -#endif #else // scalar for (int i = 0; i < nb; i++) { @@ -1823,7 +1843,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void float sumf = 0.0; #if defined(__AVX2__) -#if QK == 32 // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); // Accumulator for constant offsets @@ -1898,9 +1917,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); sumf = _mm_cvtss_f32( res ) + acc_offset * QK; -#else -#error "not implemented for QK" -#endif #else // scalar for (int i = 0; i < nb; i++) { @@ -2017,167 +2033,6 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } 
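// For reference, the NEON dequantize_row_q4_0() path added above mirrors the
// scalar fallback: each Q4_0 block is one float scale d followed by QK/2
// bytes, and every byte packs two 4-bit quants (low nibble = even element,
// high nibble = odd element). That pairing is why the vzip1_s8/vzip2_s8
// interleave above is needed: it is exactly what the "forgot to interleave
// the quants" fix restores. A minimal standalone C sketch of the same
// unpacking, assuming QK == 32 as in this patch (the helper name below is
// illustrative, not part of ggml):

#include <stdint.h>
#include <string.h>

#define QK 32

static void dequantize_block_q4_0_sketch(const uint8_t * restrict block, float * restrict y) {
    float d;
    memcpy(&d, block, sizeof(float));           // per-block scale factor
    const uint8_t * pp = block + sizeof(float); // QK/2 bytes of packed quants

    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi = pp[l/2];
        const int8_t vi0 = vi & 0x0f;           // low nibble
        const int8_t vi1 = vi >> 4;             // high nibble
        y[l + 0] = (vi0 - 8)*d;                 // recenter to [-8, 7], then scale
        y[l + 1] = (vi1 - 8)*d;                 // adjacent outputs share one byte
    }
}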
-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); - - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - GGML_ASSERT(false); - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); - } -#else - for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); - } -#endif -} - -inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * restrict x, const float v) { - assert(n % QK == 0); - - const int nb = n / QK; - const size_t bs = sizeof(float) + QK/2; - - const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float)); - -#if __ARM_NEON -#if QK == 32 - for (int i = 0; i < nb; ++i) { - const float d0 = v*(*(const float *) (pd + i*bs)); - - const uint8_t * restrict pp = pb + i*bs; - - const uint8x8_t m4b = vdup_n_u8(0xf); - const int8x8_t s8b = vdup_n_s8(0x8); - - const float32x4_t vd = vdupq_n_f32(d0); - - for (int j = 0; j < 2; j++) { - const uint8x8_t vx = vld1_u8(pp + j*8); - - const int8x8_t vxl = vreinterpret_s8_u8(vand_u8(vx, m4b)); - const int8x8_t vxh = vreinterpret_s8_u8(vshr_n_u8(vx, 4)); - - // sub 8 - const int8x8_t vxls = vsub_s8(vxl, s8b); - const int8x8_t vxhs = vsub_s8(vxh, s8b); - - //const int8x8_t vxlt = vzip_s8(vxls, vxhs)[0]; - //const int8x8_t vxht = vzip_s8(vxls, vxhs)[1]; - const int8x8_t vxlt = vzip1_s8(vxls, vxhs); - const int8x8_t vxht = vzip2_s8(vxls, vxhs); - - const int8x16_t vxq = vcombine_s8(vxlt, vxht); - - // convert to 2x int16x8_t - const int16x8_t vxq0 = vmovl_s8(vget_low_s8 (vxq)); - const int16x8_t vxq1 = vmovl_s8(vget_high_s8(vxq)); - - // convert to 4x float32x4_t - const float32x4_t vx0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq0))); - const float32x4_t vx1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq0))); - const float32x4_t vx2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq1))); - const float32x4_t vx3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq1))); - - const float32x4_t vy0 = vld1q_f32(y + i*32 + j*16 + 0); - const float32x4_t vy1 = vld1q_f32(y + i*32 + j*16 + 4); - const float32x4_t vy2 = vld1q_f32(y + i*32 + j*16 + 8); - const float32x4_t vy3 = vld1q_f32(y + i*32 + j*16 + 12); - - const float32x4_t vr0 = vfmaq_f32(vy0, vx0, vd); - const float32x4_t vr1 = vfmaq_f32(vy1, vx1, vd); - const float32x4_t vr2 = vfmaq_f32(vy2, vx2, vd); - const float32x4_t vr3 = vfmaq_f32(vy3, vx3, vd); - - vst1q_f32(y + i*32 + j*16 + 0, vr0); - vst1q_f32(y + i*32 + j*16 + 4, vr1); - vst1q_f32(y + i*32 + j*16 + 8, vr2); - vst1q_f32(y + i*32 + j*16 + 12, vr3); - } - } -#endif -#else - // scalar - for (int i = 0; i < nb; i++) { - const float d = *(const float *) (pd + i*bs); - - const uint8_t * restrict pp = pb + i*bs; - - for (int l = 0; l < QK; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - y[i*QK + l + 0] += v0*v; - y[i*QK + l + 1] += 
v1*v; - - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); - assert(!isinf(y[i*QK + l + 0])); - assert(!isinf(y[i*QK + l + 1])); - } - } -#endif -} - -inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) { - assert(n % QK == 0); - - const int nb = n / QK; - const size_t bs = 2*sizeof(float) + QK/2; - - const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); - const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); - - for (int i = 0; i < nb; i++) { - const float d = *(const float *) (pd + i*bs); - const float m = *(const float *) (pm + i*bs); - - const uint8_t * restrict pp = pb + i*bs; - - for (int l = 0; l < QK; l += 2) { - const uint8_t vi = pp[l/2]; - - const uint8_t vi0 = vi & 0xf; - const uint8_t vi1 = vi >> 4; - - const float v0 = d*vi0 + m; - const float v1 = d*vi1 + m; - - y[i*QK + l + 0] += v0*v; - y[i*QK + l + 1] += v1*v; - - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); - assert(!isinf(y[i*QK + l + 0])); - assert(!isinf(y[i*QK + l + 1])); - //printf("mad: v0 %f v1 %f, i = %d, l = %d, d = %f, vi = %d, vi0 = %d, vi1 = %d\n", v0, v1, i, l, d, vi, vi0, vi1); - } - } -} - //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(GGML_SIMD) @@ -2612,9 +2467,13 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - (t0->ne[0] == t1->ne[0]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + (t0->ne[0] == t1->ne[0]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + +static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { + return tensor->nb[0] > tensor->nb[1]; } static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { @@ -4010,6 +3869,7 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_tensor * a, struct ggml_tensor * b) { GGML_ASSERT(ggml_can_mul_mat(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); bool is_node = false; @@ -5949,7 +5809,7 @@ static void ggml_compute_forward_mul_mat_f32( assert(ne3 == ne13); // TODO: we don't support permuted src0 - assert(nb00 == sizeof(float) || nb01 == sizeof(float)); + assert(nb00 == sizeof(float)); // dst cannot be transposed or permuted assert(nb0 == sizeof(float)); @@ -5964,9 +5824,6 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6007,126 +5864,50 @@ static void ggml_compute_forward_mul_mat_f32( #endif if (params->type == GGML_TASK_INIT) { - if (nb01 >= nb00) { - return; - } - - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); return; } if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - // TODO: fix this memset (wsize is overestimated) - //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth); - - float * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 
+ dc, ne); - - ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); - - for (int k = 1; k < nth; k++) { - ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); - } - return; } - if (nb01 >= nb00) { - // TODO: do not support transposed src1 - assert(nb10 == sizeof(float)); + // TODO: do not support transposed src1 + assert(nb10 == sizeof(float)); - // parallelize by src0 rows using ggml_vec_dot_f32 + // parallelize by src0 rows using ggml_vec_dot_f32 - // total rows in src0 - const int nr = ne01*ne02*ne03; + // total rows in src0 + const int nr = ne01*ne02*ne03; - // rows per thread - const int dr = (nr + nth - 1)/nth; + // rows per thread + const int dr = (nr + nth - 1)/nth; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - for (int ic = 0; ic < ne11; ++ic) { - // src1 indices - const int i13 = i03; - const int i12 = i02; - const int i11 = ic; + for (int ic = 0; ic < ne11; ++ic) { + // src1 indices + const int i13 = i03; + const int i12 = i02; + const int i11 = ic; - // dst indices - const int i0 = i01; - const int i1 = i11; - const int i2 = i02; - const int i3 = i03; + // dst indices + const int i0 = i01; + const int i1 = i11; + const int i2 = i02; + const int i3 = i03; - ggml_vec_dot_f32(ne00, - (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), - (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); - } - } - } else { - // parallelize by src1 columns using ggml_vec_mad_f32 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - float * const wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const int i00 = ic; - - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - ggml_vec_mad_f32(ne01, - (float *) (wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0), - (float *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)), - *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13))); - } - } - } + ggml_vec_dot_f32(ne00, + (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), + (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); } } @@ -6192,7 +5973,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( 
GGML_ASSERT(ne3 == ne13); // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6207,9 +5988,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6261,148 +6039,66 @@ static void ggml_compute_forward_mul_mat_f16_f32( #endif if (params->type == GGML_TASK_INIT) { - if (nb01 >= nb00) { - ggml_fp16_t * const wdata = params->wdata; + ggml_fp16_t * const wdata = params->wdata; - size_t id = 0; - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - for (int i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - } + size_t id = 0; + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + for (int i11 = 0; i11 < ne11; ++i11) { + for (int i10 = 0; i10 < ne10; ++i10) { + wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); } } } - - GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); - - return; } - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); + GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); + return; } if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - // TODO: fix this memset (wsize is overestimated) - //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth); - - ggml_fp16_t * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, ne); - - for (int i = ic0; i < ic1; ++i) { - ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]); - } - - for (int k = 1; k < nth; k++) { - for (int i = ic0; i < ic1; ++i) { - ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]); - } - } - return; } - if (nb01 >= nb00) { - // fp16 -> half the size, so divide by 2 - // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(ggml_fp16_t)); + // fp16 -> half the size, so divide by 2 + // TODO: do not support transposed src1 + assert(nb10/2 == sizeof(ggml_fp16_t)); - // parallelize by src0 rows using ggml_vec_dot_f16 + // parallelize by src0 rows using ggml_vec_dot_f16 - // total rows in src0 - const int nr = ne01*ne02*ne03; + // total rows in src0 + const int nr = ne01*ne02*ne03; - // rows per thread - const int dr = (nr + nth - 1)/nth; + // rows per thread + const int dr = (nr + nth - 1)/nth; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - ggml_fp16_t * wdata = params->wdata; + ggml_fp16_t * wdata = params->wdata; - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const 
int i01 = (ir - i03*ne02*ne01 - i02*ne01); - const int i13 = i03; - const int i12 = i02; + const int i13 = i03; + const int i12 = i02; - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; + const int i0 = i01; + const int i2 = i02; + const int i3 = i03; - ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; + ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - for (int ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); - } - } - } else { - // parallelize by src1 columns using ggml_vec_mad_f16 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - ggml_fp16_t * const wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - ggml_fp16_t * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; - - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const int i00 = ic; - - assert(sizeof(ggml_fp16_t)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - ggml_fp16_t * src0_col = (ggml_fp16_t *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); - float src1_val = * (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - - ggml_vec_mad_f16(ne01, dst_row, src0_col, src1_val); - } - } - } + for (int ic = 0; ic < ne11; ++ic) { + ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); } } @@ -6467,7 +6163,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( GGML_ASSERT(ne3 == ne13); // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]); + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6482,9 +6178,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6509,9 +6202,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( { size_t id = 0; for (int i01 = 0; i01 < ne01; ++i01) { - //for (int i00 = 0; i00 < ne00; ++i00) { - // wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - //} dequantize_row_q4_0((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); id += ne00; } @@ -6538,143 +6228,63 @@ static void 
ggml_compute_forward_mul_mat_q4_0_f32( #endif if (params->type == GGML_TASK_INIT) { - //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth); - if (nb01 >= nb00) { - char * wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - //for (int i10 = 0; i10 < ne10; ++i10) { - // wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - //} - quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; - } - } - } - - return; - } - - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - float * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, ne); - - ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); - - for (int k = 1; k < nth; k++) { - ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); - } - - return; - } - - if (nb01 >= nb00) { - // TODO: do not support transposed src1 - - // parallelize by src0 rows using ggml_vec_dot_q4_0 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - void * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]); - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - assert(ne00 % 32 == 0); - - for (int ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]))); - } - } - } else { - //printf("AAAAA ith = %d, nth = %d\n", ith, nth); - // parallelize by src1 columns using ggml_vec_mad_q4_0 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - float * const wdata = params->wdata; + char * wdata = params->wdata; for (int i13 = 0; i13 < ne13; ++i13) { for (int i12 = 0; i12 < ne12; ++i12) { for (int i11 = 0; i11 < ne11; ++i11) { - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; - - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const 
int i00 = ic; - - assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - void * src0_col = (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); - float src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - - ggml_vec_mad_q4_0(ne01, dst_row, src0_col, src1_val); - } + quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; } } } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: do not support transposed src1 + + // parallelize by src0 rows using ggml_vec_dot_q4_0 + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + void * wdata = params->wdata; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int i13 = i03; + const int i12 = i02; + + const int i0 = i01; + const int i2 = i02; + const int i3 = i03; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]); + + float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + for (int ic = 0; ic < ne11; ++ic) { + ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]))); + } } //int64_t t1 = ggml_time_us(); @@ -6738,7 +6348,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( GGML_ASSERT(ne3 == ne13); // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]); + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6753,9 +6363,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6780,9 +6387,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( { size_t id = 0; for (int i01 = 0; i01 < ne01; ++i01) { - //for (int i00 = 0; i00 < ne00; ++i00) { - // wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - //} dequantize_row_q4_1((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); id += ne00; } @@ -6809,143 +6413,66 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( #endif if (params->type == GGML_TASK_INIT) { - //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth); - if (nb01 >= nb00) { - char * wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - //for (int i10 = 0; i10 < ne10; ++i10) { - // wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - //} - quantize_row_q4_1((float *)((char *) 
src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; - } - } - } - - return; - } - - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - float * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, ne); - - ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); - - for (int k = 1; k < nth; k++) { - ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); - } - - return; - } - - if (nb01 >= nb00) { - // TODO: do not support transposed src1 - - // parallelize by src0 rows using ggml_vec_dot_q4_1 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - void * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]); - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - assert(ne00 % 32 == 0); - - for (int ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]))); - } - } - } else { - //printf("AAAAA ith = %d, nth = %d\n", ith, nth); - // parallelize by src1 columns using ggml_vec_mad_q4_1 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - float * const wdata = params->wdata; + char * wdata = params->wdata; for (int i13 = 0; i13 < ne13; ++i13) { for (int i12 = 0; i12 < ne12; ++i12) { for (int i11 = 0; i11 < ne11; ++i11) { - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; - - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const int i00 = ic; - - assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - void * src0_col = (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); - float src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - - ggml_vec_mad_q4_1(ne01, dst_row, src0_col, src1_val); - } + //for (int i10 = 0; i10 < ne10; ++i10) { + // wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); + //} 
+ quantize_row_q4_1((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; } } } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: do not support transposed src1 + + // parallelize by src0 rows using ggml_vec_dot_q4_1 + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + void * wdata = params->wdata; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int i13 = i03; + const int i12 = i02; + + const int i0 = i01; + const int i2 = i02; + const int i3 = i03; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]); + + float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + for (int ic = 0; ic < ne11; ++ic) { + ggml_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]))); + } } //int64_t t1 = ggml_time_us(); @@ -9588,57 +9115,51 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; - // TODO: better way to determine if the matrix is transposed - if (node->src0->nb[1] < node->src0->nb[0]) { - cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1) - // TODO: overestimated by factor of x2 for FP16 - } else { - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]); - //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]); - //printf("cur = %zu\n", cur); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_Q4_0 && - node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; - } -#else - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; -#endif - } else if (node->src0->type == GGML_TYPE_Q4_1 && - node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || 
defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; - } -#else - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; -#endif + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]); + //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]); + //printf("cur = %zu\n", cur); } else { - GGML_ASSERT(false); + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); } +#else + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); +#endif + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_Q4_0 && + node->src1->type == GGML_TYPE_F32) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + node->n_tasks = 1; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } else { + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; + } +#else + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; +#endif + } else if (node->src0->type == GGML_TYPE_Q4_1 && + node->src1->type == GGML_TYPE_F32) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + node->n_tasks = 1; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } else { + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; + } +#else + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; +#endif + } else { + GGML_ASSERT(false); } work_size = MAX(work_size, cur); From a316a425d04027453dc0fd45f003b647c12f66f9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 20:26:40 +0200 Subject: [PATCH 2/8] Overhaul the examples structure - main -> examples - utils -> examples (renamed to "common") - quantize -> examples - separate tools for "perplexity" and "embedding" Hope I didn't break something ! 
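Each tool now builds on its own, e.g. `make perplexity` (or the new per-example CMake targets), then `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`; the `-m`/`-f` flags come from the shared common code and keep their old meaning. (Illustrative invocation: the model path is just the conventional one used in the sources.)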
--- .gitignore | 1 + CMakeLists.txt | 29 +--- Makefile | 19 ++- examples/CMakeLists.txt | 36 +++++ utils.cpp => examples/common.cpp | 4 +- utils.h => examples/common.h | 0 examples/embedding/CMakeLists.txt | 4 + examples/embedding/README.md | 3 + examples/embedding/embedding.cpp | 106 +++++++++++++ examples/main/CMakeLists.txt | 4 + examples/main/README.md | 3 + main.cpp => examples/main/main.cpp | 119 ++------------ examples/perplexity/CMakeLists.txt | 4 + examples/perplexity/README.md | 3 + examples/perplexity/perplexity.cpp | 146 ++++++++++++++++++ examples/quantize/CMakeLists.txt | 4 + examples/quantize/README.md | 3 + .../quantize/quantize.cpp | 0 ggml.c | 26 ++-- tests/CMakeLists.txt | 2 +- tests/test-tokenizer-0.cpp | 6 +- 21 files changed, 361 insertions(+), 161 deletions(-) create mode 100644 examples/CMakeLists.txt rename utils.cpp => examples/common.cpp (99%) rename utils.h => examples/common.h (100%) create mode 100644 examples/embedding/CMakeLists.txt create mode 100644 examples/embedding/README.md create mode 100644 examples/embedding/embedding.cpp create mode 100644 examples/main/CMakeLists.txt create mode 100644 examples/main/README.md rename main.cpp => examples/main/main.cpp (76%) create mode 100644 examples/perplexity/CMakeLists.txt create mode 100644 examples/perplexity/README.md create mode 100644 examples/perplexity/perplexity.cpp create mode 100644 examples/quantize/CMakeLists.txt create mode 100644 examples/quantize/README.md rename quantize.cpp => examples/quantize/quantize.cpp (100%) diff --git a/.gitignore b/.gitignore index 3087b0ea5..ce01fd541 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ models/* /main /quantize /result +/perplexity arm_neon.h compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 51af97c4d..a1ff5a44e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,17 +211,6 @@ endif() # Build libraries # -add_library(utils OBJECT - utils.cpp - utils.h) - -target_include_directories(utils PUBLIC .) -target_compile_features(utils PUBLIC cxx_std_11) # don't bump -target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) -if (BUILD_SHARED_LIBS) - set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON) -endif() - add_library(ggml OBJECT ggml.c ggml.h) @@ -239,22 +228,12 @@ add_library(llama target_include_directories(llama PUBLIC .) 
target_compile_features(llama PUBLIC cxx_std_11) # don't bump -target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) +target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS}) if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) endif() -# -# Executables -# - -add_executable(main main.cpp) -target_link_libraries(main PRIVATE llama ggml utils) - -add_executable(quantize quantize.cpp) -target_link_libraries(quantize PRIVATE llama ggml utils) - # # programs, examples and tests # @@ -264,6 +243,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) add_subdirectory(tests) endif () -#if (LLAMA_BUILD_EXAMPLES) -# add_subdirectory(examples) -#endif() +if (LLAMA_BUILD_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/Makefile b/Makefile index e8b128cb8..98a2d85f3 100644 --- a/Makefile +++ b/Makefile @@ -212,7 +212,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize +default: main quantize perplexity # # Build library @@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h llama.o: llama.cpp llama.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o -utils.o: utils.cpp utils.h - $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o +common.o: examples/common.cpp examples/common.h + $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -f *.o main quantize + rm -vf *.o main quantize perplexity -main: main.cpp ggml.o llama.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS) +main: examples/main/main.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: quantize.cpp ggml.o llama.o utils.o - $(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o + $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) + +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) # # Tests diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..ce3a34710 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,36 @@ +# dependencies + +find_package(Threads REQUIRED) + +# third-party + +# ... + +# common + +set(TARGET common) + +add_library(${TARGET} OBJECT + common.h + common.cpp + ) + +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${TARGET} PUBLIC .) 
+target_compile_features(${TARGET} PUBLIC cxx_std_11) +target_link_libraries(${TARGET} PRIVATE llama) + +# examples + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +if (EMSCRIPTEN) +else() + add_subdirectory(main) + add_subdirectory(quantize) + add_subdirectory(perplexity) + add_subdirectory(embedding) +endif() diff --git a/utils.cpp b/examples/common.cpp similarity index 99% rename from utils.cpp rename to examples/common.cpp index cea309628..afa7d4026 100644 --- a/utils.cpp +++ b/examples/common.cpp @@ -1,6 +1,6 @@ -#include "ggml.h" +#include "common.h" -#include "utils.h" +#include "ggml.h" #include #include diff --git a/utils.h b/examples/common.h similarity index 100% rename from utils.h rename to examples/common.h diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt new file mode 100644 index 000000000..88c425d4a --- /dev/null +++ b/examples/embedding/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET embedding) +add_executable(${TARGET} embedding.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md new file mode 100644 index 000000000..21d8be65f --- /dev/null +++ b/examples/embedding/README.md @@ -0,0 +1,3 @@ +# embedding + +TODO diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp new file mode 100644 index 000000000..3015293f7 --- /dev/null +++ b/examples/embedding/embedding.cpp @@ -0,0 +1,106 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.embedding = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + int n_past = 0; + + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + // tokenize the prompt + auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + 
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "\n"); + } + + if (params.embedding){ + if (embd_inp.size() > 0) { + if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + const auto embeddings = llama_get_embeddings(ctx); + + // TODO: print / use the embeddings + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt new file mode 100644 index 000000000..b2dcc2910 --- /dev/null +++ b/examples/main/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET main) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/README.md b/examples/main/README.md new file mode 100644 index 000000000..4701aa558 --- /dev/null +++ b/examples/main/README.md @@ -0,0 +1,3 @@ +# main + +TODO diff --git a/main.cpp b/examples/main/main.cpp similarity index 76% rename from main.cpp rename to examples/main/main.cpp index 77260bb71..b5f1a7b5c 100644 --- a/main.cpp +++ b/examples/main/main.cpp @@ -1,5 +1,4 @@ -#include "utils.h" -#include "ggml.h" +#include "common.h" #include "llama.h" #include @@ -65,79 +64,6 @@ void set_console_state(console_state new_st) } } -std::vector softmax(const std::vector& logits) { - std::vector probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) max_logit = std::max(max_logit, v); - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - float logit = logits[i] - max_logit; - double exp_logit = std::exp(logit); - sum_exp += exp_logit; - probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; - return probs; -} - -void perplexity(llama_context * ctx, const gpt_params & params) { - // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research - // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` - // Output: `perplexity: 13.5106 [114/114]` - auto tokens = ::llama_tokenize(ctx, params.prompt, true); - - int count = 0; - double nll = 0.0; - int seq_count = tokens.size() / params.n_ctx; - - fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); - - for (int i = 0; i < seq_count; ++i) { - int start = i * params.n_ctx; - int end = start + params.n_ctx - 1; - std::vector embd(tokens.begin() + start, tokens.begin() + end); - auto start_t = std::chrono::high_resolution_clock::now(); - if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return; - } - auto end_t = std::chrono::high_resolution_clock::now(); - if (i == 0) { - double seconds = std::chrono::duration(end_t - start_t).count(); - printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); - } - // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. 
Now, based on https://huggingface.co/docs/transformers/perplexity, - // calculate the perplexity over the last half the window (so the model always has - // some context to predict the token). - // - // We rely on the fact that attention in the forward pass only looks at previous - // tokens here, so the logits returned for each token are an accurate representation - // of what the model would have predicted at that point. - // - // Example, we have a context window of 512, we will compute perplexity for each of the - // last 256 tokens. Then, we split the input up into context window size chunks to - // process the entire prompt. - - auto logits = llama_get_logits(ctx); - for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. - int n_vocab = llama_n_vocab(ctx); - std::vector tok_logits( - logits + j * n_vocab, - logits + (j + 1) * n_vocab); - double prob = softmax(tok_logits)[tokens[start + j + 1]]; - nll += -std::log(prob); - ++count; - } - // perplexity is e^(average negative log-likelihood) - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); - } - printf("\n"); -} - static bool is_interacting = false; #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) @@ -155,9 +81,6 @@ void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - // has to be called once at the start of the program to init ggml stuff - ggml_time_init(); - gpt_params params; params.model = "models/llama-7B/ggml-model.bin"; @@ -165,6 +88,14 @@ int main(int argc, char ** argv) { return 1; } + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); @@ -198,9 +129,7 @@ int main(int argc, char ** argv) { lparams.n_parts = params.n_parts; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; - lparams.logits_all = params.perplexity; lparams.use_mlock = params.use_mlock; - lparams.embedding = params.embedding; ctx = llama_init_from_file(params.model.c_str(), lparams); @@ -236,11 +165,6 @@ int main(int argc, char ** argv) { return 0; } - if (params.perplexity) { - perplexity(ctx, params); - exit(0); - } - int n_past = 0; // Add a space in front of the first character to match OG llama tokenizer behavior @@ -346,27 +270,6 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly set_console_state(CONSOLE_STATE_PROMPT); - if (params.embedding){ - embd = embd_inp; - - if (embd.size() > 0) { - if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - } - - const auto embeddings = llama_get_embeddings(ctx); - - // TODO: print / use the embeddings - - if (params.use_color) { - printf(ANSI_COLOR_RESET); - } - - return 0; - } - while (remaining_tokens > 0 || params.interactive) { // predict if (embd.size() > 0) { @@ -392,10 +295,6 @@ int main(int argc, char ** argv) { auto logits = llama_get_logits(ctx); if (params.ignore_eos) { - // set the logit of the eos token to zero to avoid sampling it - //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0; - // TODO: this does not work of params.logits_all == true - 
assert(params.perplexity == false); logits[llama_token_eos()] = 0; } diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt new file mode 100644 index 000000000..5836df8b2 --- /dev/null +++ b/examples/perplexity/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET perplexity) +add_executable(${TARGET} perplexity.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md new file mode 100644 index 000000000..a932275c2 --- /dev/null +++ b/examples/perplexity/README.md @@ -0,0 +1,3 @@ +# perplexity + +TODO diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp new file mode 100644 index 000000000..f0266a01f --- /dev/null +++ b/examples/perplexity/perplexity.cpp @@ -0,0 +1,146 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include + +std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) max_logit = std::max(max_logit, v); + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + float logit = logits[i] - max_logit; + double exp_logit = std::exp(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + return probs; +} + +void perplexity(llama_context * ctx, const gpt_params & params) { + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Output: `perplexity: 13.5106 [114/114]` + auto tokens = ::llama_tokenize(ctx, params.prompt, true); + + int count = 0; + double nll = 0.0; + int seq_count = tokens.size() / params.n_ctx; + + fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); + + for (int i = 0; i < seq_count; ++i) { + int start = i * params.n_ctx; + int end = start + params.n_ctx - 1; + std::vector embd(tokens.begin() + start, tokens.begin() + end); + auto start_t = std::chrono::high_resolution_clock::now(); + if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + auto end_t = std::chrono::high_resolution_clock::now(); + if (i == 0) { + double seconds = std::chrono::duration(end_t - start_t).count(); + printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); + } + // We get the logits for all the tokens in the context window (params.n_ctx) + // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // calculate the perplexity over the last half the window (so the model always has + // some context to predict the token). + // + // We rely on the fact that attention in the forward pass only looks at previous + // tokens here, so the logits returned for each token are an accurate representation + // of what the model would have predicted at that point. + // + // Example, we have a context window of 512, we will compute perplexity for each of the + // last 256 tokens. Then, we split the input up into context window size chunks to + // process the entire prompt. 
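The arithmetic in the loop that follows reduces to exp of the mean negative log-likelihood over the scored tokens. A minimal standalone sketch of just that reduction (the function and variable names here are illustrative, not part of the patch):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // perplexity = exp( -(1/N) * sum_i log p(token_i | previous tokens) )
    double perplexity_from_probs(const std::vector<double> & probs) {
        double nll = 0.0;
        for (const double p : probs) {
            nll += -std::log(p);
        }
        return std::exp(nll / probs.size());
    }

    int main() {
        // model-assigned probabilities of three scored tokens
        const std::vector<double> probs = { 0.25, 0.50, 0.10 };
        printf("perplexity: %.4f\n", perplexity_from_probs(probs));
        return 0;
    }

This is equivalently the reciprocal geometric mean of the token probabilities (about 4.31 for the sample values); a uniform model over a vocabulary of size V would score exactly V, which is why lower perplexity indicates a tighter predictive distribution.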
+
+        auto logits = llama_get_logits(ctx);
+        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+            // Calculate probability of next token, given the previous ones.
+            int n_vocab = llama_n_vocab(ctx);
+            std::vector<float> tok_logits(
+                logits + j * n_vocab,
+                logits + (j + 1) * n_vocab);
+            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    params.perplexity = true;
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_context * ctx;
+
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx = params.n_ctx;
+        lparams.n_parts = params.n_parts;
+        lparams.seed = params.seed;
+        lparams.f16_kv = params.memory_f16;
+        lparams.logits_all = params.perplexity;
+        lparams.use_mlock = params.use_mlock;
+        lparams.embedding = params.embedding;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    perplexity(ctx, params);
+
+    llama_print_timings(ctx);
+    llama_free(ctx);
+
+    return 0;
+}
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
new file mode 100644
index 000000000..fb27d4517
--- /dev/null
+++ b/examples/quantize/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
new file mode 100644
index 000000000..f349e913e
--- /dev/null
+++ b/examples/quantize/README.md
@@ -0,0 +1,3 @@
+# quantize
+
+TODO
diff --git a/quantize.cpp b/examples/quantize/quantize.cpp
similarity index 100%
rename from quantize.cpp
rename to examples/quantize/quantize.cpp
diff --git a/ggml.c b/ggml.c
index 291e12a0a..b566b5684 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5741,8 +5741,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
+    //const int ne00 = src0->ne[0];
+    //const int ne01 = src0->ne[1];
 
     const int ne10 = src1->ne[0];
 
@@ -5776,16 +5776,16 @@ static void ggml_compute_forward_mul_mat_f32(
     const int ne10 = src1->ne[0];
     const int ne11 = src1->ne[1];
-    const int ne12 = src1->ne[2];
-    const int ne13 = src1->ne[3];
+    //const int ne12 = src1->ne[2];
+    //const int ne13 = src1->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne0 = dst->ne[0];
+    //const int ne1 = dst->ne[1];
+    //const int ne2 = dst->ne[2];
+    //const int ne3 = dst->ne[3];
+    //const int ne = ne0*ne1*ne2*ne3;
 
-    const int nb00 = src0->nb[0];
+    //const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
     const int nb02 = src0->nb[2];
     const int nb03 = src0->nb[3];
@@ -5947,7 +5947,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -6137,7 +6137,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
    const int nb01 = src0->nb[1];
@@ -6322,7 +6322,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6a4170f80..b44d7fe7e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,7 +1,7 @@
 function(llama_add_test source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
-    target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 49bc232b6..382055324 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -1,9 +1,9 @@
-#include "utils.h"
 #include "llama.h"
 
 #include <cstdio>
 #include <string>
 #include <map>
+#include <vector>
 
 static const std::map<std::string, std::vector<llama_token>> k_tests = {
     { "Hello World", { 1, 10994, 2787, }, },
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests) {
-        const auto res = ::llama_tokenize(ctx, test_kv.first, true);
+        std::vector<llama_token> res(test_kv.first.size());
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+        res.resize(n);
 
         bool correct = res.size() == test_kv.second.size();
 
From 459e93cce07cab9052c06b5bf360819893442e1e Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Sat, 25 Mar 2023 19:31:48 +0100
Subject: [PATCH 3/8] Add AVX2 implementation of dequantize_row_q4_1 (#505)

---
 ggml.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index b566b5684..c9a4e8675 100644
--- a/ggml.c
+++ b/ggml.c
@@ -783,7 +783,7 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
 
             // Scale and store
             for (int j = 0; j < 4; j++) {
-                __m256 result = _mm256_mul_ps(vf[j], d_v);
+                const __m256 result = _mm256_mul_ps(vf[j], d_v);
                 _mm256_storeu_ps(y + i * QK + l + j*8, result);
             }
         }
@@ -879,6 +879,37 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
     const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
     const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
 
+#if defined(__AVX2__)
+    for (int i = 0; i < nb; i++) {
+        const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs));
+        const __m256 d_m =
_mm256_broadcast_ss((const float *) (pm + i*bs)); + + const uint8_t * restrict pp = pb + i*bs; + + for (int l = 0; l < QK; l += 32) { + // Load 32x4-bit integers into 32x8-bit integers + __m256i vx8 = bytesFromNibbles(pp+l/2); + + // Convert to 16-bit int + const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); + const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); + + // Convert to 32-bit int -> float 32 + const __m256 vf[4] = { + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) + }; + + // Scale, add m and store + for (int j = 0; j < 4; j++) { + const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m); + _mm256_storeu_ps(y + i * QK + l + j*8, result); + } + } + } +#else for (int i = 0; i < nb; i++) { const float d = *(const float *) (pd + i*bs); const float m = *(const float *) (pm + i*bs); @@ -901,6 +932,7 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { assert(!isnan(y[i*QK + l + 1])); } } +#endif } // From 55ad42af845127bd0eb0c1f36f327ecec83f4bca Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 20:36:52 +0200 Subject: [PATCH 4/8] Move chat scripts into "./examples" --- README.md | 7 +++++-- alpaca.sh => examples/alpaca.sh | 4 ++++ examples/{chatLLaMa => chat-13B.sh} | 0 chat.sh => examples/chat.sh | 4 ++++ 4 files changed, 13 insertions(+), 2 deletions(-) rename alpaca.sh => examples/alpaca.sh (89%) rename examples/{chatLLaMa => chat-13B.sh} (100%) rename chat.sh => examples/chat.sh (89%) diff --git a/README.md b/README.md index 8a84324b1..9ba6241da 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,10 @@ Here is an example few-shot interaction, invoked with the command ```bash # default arguments using 7B model -./chat.sh +./examples/chat.sh + +# advanced chat with 13B model +./examples/chat-13B.sh # custom arguments using 13B model ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt @@ -195,7 +198,7 @@ Note the use of `--color` to distinguish between user input and generated text. 2. Run the `main` tool like this: ``` -./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins +./examples/alpaca.sh ``` Sample run: diff --git a/alpaca.sh b/examples/alpaca.sh similarity index 89% rename from alpaca.sh rename to examples/alpaca.sh index d8a9f456a..4c9aa5077 100755 --- a/alpaca.sh +++ b/examples/alpaca.sh @@ -1,6 +1,10 @@ #!/bin/bash + # # Temporary script - will be removed in the future # +cd `dirname $0` +cd .. + ./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 diff --git a/examples/chatLLaMa b/examples/chat-13B.sh similarity index 100% rename from examples/chatLLaMa rename to examples/chat-13B.sh diff --git a/chat.sh b/examples/chat.sh similarity index 89% rename from chat.sh rename to examples/chat.sh index 5531315b3..2b6a63e3f 100755 --- a/chat.sh +++ b/examples/chat.sh @@ -1,6 +1,10 @@ #!/bin/bash + # # Temporary script - will be removed in the future # +cd `dirname $0` +cd .. 
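Stepping back to patch 3 for a moment: the AVX2 path above widens 32 packed 4-bit quants to floats in one pass, but the per-element math is exactly the scalar fallback's y = q*d + m. A hedged, self-contained sketch of that scalar math for one simplified q4_1-style block (the real ggml block layout also carries the scale and minimum in-line, which is omitted here):

    #include <cstdint>

    // Dequantize one simplified q4_1-style block: 32 quants packed two per
    // byte, plus a scale d and a minimum m. Low nibble -> even element,
    // high nibble -> odd element, matching the scalar loop in ggml.c.
    void dequantize_block_q4_1(const uint8_t qs[16], float d, float m, float y[32]) {
        for (int l = 0; l < 16; l++) {
            const uint8_t lo = qs[l] & 0x0F;
            const uint8_t hi = qs[l] >> 4;
            y[2*l + 0] = lo*d + m;
            y[2*l + 1] = hi*d + m;
        }
    }

bytesFromNibbles() performs the same low/high-nibble split for all 32 values at once, and the four _mm256_cvtepi32_ps conversions then yield eight dequantized floats each.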
+
 ./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt

From 03f7e335604b3d68f74995aa2ccb4955833ee423 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 25 Mar 2023 20:51:14 +0200
Subject: [PATCH 5/8] Cleanup STL headers + fix embedding examples + minor
 stuff

---
 examples/embedding/embedding.cpp   | 15 +++++----------
 examples/perplexity/perplexity.cpp |  8 --------
 llama.cpp                          | 22 ++++++++++++++--------
 llama.h                            |  1 +
 4 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 3015293f7..d397f35fd 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,15 +1,6 @@
 #include "common.h"
 #include "llama.h"
 
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <string>
-#include <vector>
-
 int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
@@ -94,9 +85,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+        const int n_embd = llama_n_embd(ctx);
         const auto embeddings = llama_get_embeddings(ctx);
 
-        // TODO: print / use the embeddings
+        for (int i = 0; i < n_embd; i++) {
+            printf("%f ", embeddings[i]);
+        }
+        printf("\n");
     }
 
     llama_print_timings(ctx);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index f0266a01f..f617ba365 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,14 +1,6 @@
 #include "common.h"
 #include "llama.h"
 
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <vector>
-
 std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
diff --git a/llama.cpp b/llama.cpp
index 0015edec1..2bd520353 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1261,10 +1261,10 @@ static llama_vocab::id llama_sample_top_p_top_k(
         double repeat_penalty) {
     auto & rng = lctx.rng;
 
-    const auto & vocab = lctx.vocab;
-    const auto & logits = lctx.logits;
+    const int n_logits = lctx.model.hparams.n_vocab;
 
-    int n_logits = vocab.id_to_token.size();
+    const auto & logits = lctx.logits;
+    const auto * plogits = logits.data() + logits.size() - n_logits;
 
     std::vector<std::pair<double, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);
 
@@ -1276,13 +1276,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
             // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
             if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
                 // if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
-                if (logits[i] < 0.0) {
-                    logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
-                } else {
-                    logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
+                if (plogits[i] < 0.0) {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
+                } else {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                 }
             } else {
-                logits_id.push_back(std::make_pair(logits[i]*scale, i));
+                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
             }
         }
     }
@@ -1677,6 +1677,8 @@ struct llama_context * llama_init_from_file(
     }
 
     const auto & hparams = ctx->model.hparams;
+
+    // resized during inference
     if (params.logits_all) {
         ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
     } else {
         ctx->logits.reserve(hparams.n_ctx);
     }
 
     if (params.embedding){
-        ctx->embedding.reserve(hparams.n_embd);
+        ctx->embedding.resize(hparams.n_embd);
     }
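The plogits change in patch 5 is subtle but important: once logits_all is enabled (as the perplexity tool does), lctx.logits holds one row of n_vocab values per evaluated token, and sampling must read only the final row. A short illustration of the indexing, under that same layout assumption (the helper name is ours, not the library's):

    #include <vector>

    // logits laid out as n_tokens consecutive rows of n_vocab floats each;
    // the row for the most recent token starts n_vocab floats before the end
    const float * last_token_logits(const std::vector<float> & logits, int n_vocab) {
        return logits.data() + logits.size() - n_vocab;
    }

With logits_all disabled the vector contains a single row, so the same expression degenerates to logits.data() and one code path covers both configurations.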
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type)); @@ -1761,6 +1763,10 @@ int llama_n_ctx(struct llama_context * ctx) { return ctx->model.hparams.n_ctx; } +int llama_n_embd(struct llama_context * ctx) { + return ctx->model.hparams.n_embd; +} + float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } diff --git a/llama.h b/llama.h index 827abc1f2..ebf55f41c 100644 --- a/llama.h +++ b/llama.h @@ -109,6 +109,7 @@ extern "C" { LLAMA_API int llama_n_vocab(struct llama_context * ctx); LLAMA_API int llama_n_ctx (struct llama_context * ctx); + LLAMA_API int llama_n_embd (struct llama_context * ctx); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row From e2d490dafd860eaaaf9aa8008ab790527d556daf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 21:36:22 +0200 Subject: [PATCH 6/8] Inifinite generation via context swapping (#71) --- examples/chat.sh | 8 ++- examples/common.cpp | 9 ++- examples/common.h | 1 + examples/main/main.cpp | 122 ++++++++++++++++++++++++++++------------- 4 files changed, 100 insertions(+), 40 deletions(-) diff --git a/examples/chat.sh b/examples/chat.sh index 2b6a63e3f..97973d056 100755 --- a/examples/chat.sh +++ b/examples/chat.sh @@ -7,4 +7,10 @@ cd `dirname $0` cd .. -./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +# Important: +# +# "--keep 48" is based on the contents of prompts/chat-with-bob.txt +# +./main -m ./models/7B/ggml-model-q4_0.bin -c 2048 -b 1024 -n 256 --keep 48 \ + --repeat_penalty 1.0 --color -i \ + -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/common.cpp b/examples/common.cpp index afa7d4026..866a6b063 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -112,6 +112,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } params.n_batch = std::stoi(argv[i]); params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_keep = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -134,7 +140,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_mlock = true; } else if (arg == "--mtest") { params.mem_test = true; - } else if (arg == "--verbose_prompt") { + } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { @@ -210,6 +216,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + fprintf(stderr, " --keep number of tokens to keep from the initial prompt\n"); if (ggml_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } diff --git a/examples/common.h b/examples/common.h index dede80385..8caefd859 100644 --- a/examples/common.h +++ b/examples/common.h @@ -21,6 +21,7 @@ struct gpt_params { int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size int32_t n_batch = 8; // batch size for prompt processing + int32_t n_keep = 0; // 
number of tokens to keep from initial prompt // sampling parameters int32_t top_k = 40; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b5f1a7b5c..f78936d45 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -44,8 +44,20 @@ enum console_state { static console_state con_st = CONSOLE_STATE_DEFAULT; static bool con_use_color = false; -void set_console_state(console_state new_st) -{ +void enable_console_colors() { +#if defined (_WIN32) + if (params.use_color) { + // Enable ANSI colors on Windows 10+ + unsigned long dwMode = 0; + void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) + if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) { + SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) + } + } +#endif +} + +void set_console_state(console_state new_st) { if (!con_use_color) return; // only emit color code if state changed if (new_st != con_st) { @@ -96,6 +108,14 @@ int main(int argc, char ** argv) { return 0; } + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); @@ -165,8 +185,6 @@ int main(int argc, char ** argv) { return 0; } - int n_past = 0; - // Add a space in front of the first character to match OG llama tokenizer behavior params.prompt.insert(0, 1, ' '); @@ -175,7 +193,13 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); - params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size()); + if ((int) embd_inp.size() > n_ctx - 4) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + params.n_keep = std::min(params.n_keep, (int) embd_inp.size()); + //params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size()); // prefix & suffix for instruct mode const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); @@ -206,6 +230,13 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) embd_inp.size(); i++) { fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); } + if (params.n_keep > 0) { + fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "'\n"); + } fprintf(stderr, "\n"); } @@ -222,7 +253,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: interactive mode on.\n", __func__); - if(params.antiprompt.size()) { + if (params.antiprompt.size()) { for (auto antiprompt : params.antiprompt) { fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); } @@ -232,14 +263,12 @@ int main(int argc, char ** argv) { fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); } } - fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, 
"generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); fprintf(stderr, "\n\n"); - std::vector embd; - - - int last_n_size = params.repeat_last_n; - std::vector last_n_tokens(last_n_size); + // TODO: replace with ring-buffer + std::vector last_n_tokens(n_ctx); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); if (params.interactive) { @@ -252,27 +281,42 @@ int main(int argc, char ** argv) { is_interacting = params.interactive_start || params.instruct; } - int input_consumed = 0; bool input_noecho = false; - int remaining_tokens = params.n_predict; + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; -#if defined (_WIN32) - if (params.use_color) { - // Enable ANSI colors on Windows 10+ - unsigned long dwMode = 0; - void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) - if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) { - SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) - } - } -#endif // the first thing we will do is to output the prompt, so set color accordingly + enable_console_colors(); set_console_state(CONSOLE_STATE_PROMPT); - while (remaining_tokens > 0 || params.interactive) { + std::vector embd; + + while (n_remain > 0 || params.interactive) { // predict if (embd.size() > 0) { + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + if (n_past + (int) embd.size() > n_ctx) { + const int n_left = n_past - params.n_keep; + + n_past = params.n_keep; + + // insert n_left/2 tokens at the start of embd from last_n_tokens + embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + + //printf("\n---\n"); + //printf("resetting: '"); + //for (int i = 0; i < (int) embd.size(); i++) { + // printf("%s", llama_token_to_str(ctx, embd[i])); + //} + //printf("'\n"); + //printf("\n---\n"); + } + if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; @@ -282,7 +326,7 @@ int main(int argc, char ** argv) { n_past += embd.size(); embd.clear(); - if ((int) embd_inp.size() <= input_consumed && !is_interacting) { + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token const float top_k = params.top_k; const float top_p = params.top_p; @@ -298,7 +342,9 @@ int main(int argc, char ** argv) { logits[llama_token_eos()] = 0; } - id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty); + id = llama_sample_top_p_top_k(ctx, + last_n_tokens.data() + n_ctx - params.repeat_last_n, + params.repeat_last_n, top_k, top_p, temp, repeat_penalty); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); @@ -321,14 +367,14 @@ int main(int argc, char ** argv) { input_noecho = false; // decrement remaining sampling budget - --remaining_tokens; + --n_remain; } else { // some user input remains from prompt or interaction, forward it to processing - while ((int) embd_inp.size() > input_consumed) { - embd.push_back(embd_inp[input_consumed]); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); last_n_tokens.erase(last_n_tokens.begin()); - 
last_n_tokens.push_back(embd_inp[input_consumed]); - ++input_consumed; + last_n_tokens.push_back(embd_inp[n_consumed]); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; } @@ -343,13 +389,13 @@ int main(int argc, char ** argv) { fflush(stdout); } // reset color to default if we there is no pending user input - if (!input_noecho && (int)embd_inp.size() == input_consumed) { + if (!input_noecho && (int)embd_inp.size() == n_consumed) { set_console_state(CONSOLE_STATE_DEFAULT); } // in interactive mode, and not currently processing queued inputs; // check if we should prompt the user for more - if (params.interactive && (int) embd_inp.size() <= input_consumed) { + if (params.interactive && (int) embd_inp.size() <= n_consumed) { // check for reverse prompt std::string last_output; for (auto id : last_n_tokens) { @@ -371,7 +417,7 @@ int main(int argc, char ** argv) { set_console_state(CONSOLE_STATE_USER_INPUT); if (params.instruct) { - input_consumed = embd_inp.size(); + n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); printf("\n> "); @@ -405,7 +451,7 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } - remaining_tokens -= line_inp.size(); + n_remain -= line_inp.size(); input_noecho = true; // do not echo this again } @@ -426,8 +472,8 @@ int main(int argc, char ** argv) { } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - if (params.interactive && remaining_tokens <= 0) { - remaining_tokens = params.n_predict; + if (params.interactive && n_remain <= 0) { + n_remain = params.n_predict; is_interacting = true; } } From 79b2b266db6b198b5af450982c3cd61120fac951 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 21:51:41 +0200 Subject: [PATCH 7/8] If n_predict == -1, generate forever --- examples/chat.sh | 2 +- examples/common.cpp | 2 +- examples/main/main.cpp | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/chat.sh b/examples/chat.sh index 97973d056..9a928ef05 100755 --- a/examples/chat.sh +++ b/examples/chat.sh @@ -11,6 +11,6 @@ cd .. 
 #
 # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./main -m ./models/7B/ggml-model-q4_0.bin -c 2048 -b 1024 -n 256 --keep 48 \
+./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
     --repeat_penalty 1.0 --color -i \
     -r "User:" -f prompts/chat-with-bob.txt
diff --git a/examples/common.cpp b/examples/common.cpp
index 866a6b063..2ab000f4f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -204,7 +204,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
     fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f78936d45..a453743a5 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -199,7 +199,6 @@ int main(int argc, char ** argv) {
     }
 
     params.n_keep = std::min(params.n_keep, (int) embd_inp.size());
-    //params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
 
     // prefix & suffix for instruct mode
     const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
@@ -293,7 +292,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    while (n_remain > 0 || params.interactive) {
+    while (n_remain != 0 || params.interactive) {
         // predict
         if (embd.size() > 0) {

From c2b25b6912662d2637d9c6e6df3a5de931e0d7ce Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 25 Mar 2023 21:53:39 +0200
Subject: [PATCH 8/8] Fix colors enabling on WIN32

---
 examples/main/main.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a453743a5..7bb2b6bc4 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -46,13 +46,11 @@ static bool con_use_color = false;
 
 void enable_console_colors() {
 #if defined (_WIN32)
-    if (params.use_color) {
-        // Enable ANSI colors on Windows 10+
-        unsigned long dwMode = 0;
-        void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
-        if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
-            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
-        }
+    // Enable ANSI colors on Windows 10+
+    unsigned long dwMode = 0;
+    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+    if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
+        SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
     }
 #endif
 }
@@ -287,7 +285,9 @@ int main(int argc, char ** argv) {
     int n_consumed = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
-    enable_console_colors();
+    if (params.use_color) {
+        enable_console_colors();
+    }
     set_console_state(CONSOLE_STATE_PROMPT);
 
     std::vector<llama_token> embd;
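Taken together, patches 6 and 7 turn the example into an open-ended generator: when the window fills up, the first n_keep prompt tokens stay pinned and half of the remaining context is re-fed as a single batch. A condensed sketch of just that bookkeeping (int stands in for llama_token; this mirrors, but is not, the main.cpp code):

    #include <vector>

    // keep the first n_keep tokens, then re-feed the most recent n_left/2
    // tokens (excluding those already pending in embd) at the old positions
    void swap_context(int & n_past, int n_ctx, int n_keep,
                      const std::vector<int> & last_n_tokens,
                      std::vector<int> & embd) {
        if (n_past + (int) embd.size() <= n_ctx) {
            return; // still fits in the window, nothing to do
        }
        const int n_left = n_past - n_keep;
        n_past = n_keep;
        embd.insert(embd.begin(),
                    last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(),
                    last_n_tokens.end() - embd.size());
    }

Because last_n_tokens is kept at exactly n_ctx entries, the iterator arithmetic stays in range whenever n_keep <= n_past, and with n_predict == -1 the "n_remain != 0" loop condition from patch 7 never terminates on its own.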