From f56d6077d0c37e6606ac0a4fa3169de70593acfe Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 27 Dec 2023 17:37:25 +0900 Subject: [PATCH 1/5] Add byte token type when tokenizer.model is not exists (#4641) * Add byte token type to hf format * remove unused variable --- convert.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index 7a3cd615e..1f0c4f2f4 100755 --- a/convert.py +++ b/convert.py @@ -357,6 +357,7 @@ class VocabLoader: for tok in self.tokenizer.all_special_tokens } self.special_ids: set[int] = set(self.tokenizer.all_special_ids) + self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} self.vocab_size_base: int = self.tokenizer.vocab_size self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict) self.fname_tokenizer: Path = fname_tokenizer @@ -370,15 +371,13 @@ class VocabLoader: self.spm = None def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.tokenizer - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()} added_tokens_ids = set(self.added_tokens_dict.values()) for i in range(self.vocab_size_base): if i in added_tokens_ids: continue - text = reverse_vocab[i].encode("utf-8") + text = self.reverse_vocab[i].encode("utf-8") yield text, self.get_token_score(i), self.get_token_type(i) def get_token_type(self, token_id: int) -> gguf.TokenType: @@ -394,10 +393,13 @@ class VocabLoader: if self.spm.is_byte(token_id): toktype = gguf.TokenType.BYTE else: + token = self.reverse_vocab[token_id] if token_id == self.unk_token_id: toktype = gguf.TokenType.UNKNOWN - if token_id in self.special_ids: + elif token_id in self.special_ids: toktype = gguf.TokenType.CONTROL + elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"): + toktype = gguf.TokenType.BYTE return toktype From 951010fa53a0ffe81b7d2e87c4349e0d3cb3d19d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 27 Dec 2023 11:02:13 +0200 Subject: [PATCH 2/5] ggml : fix dot product for ARM (#4630) ggml-ci --- ggml-quants.c | 363 +++----------------------------------------------- 1 file changed, 22 insertions(+), 341 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index a15a24048..05ef8f9b7 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -407,6 +407,18 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { #define ggml_vld1q_s8_x4 vld1q_s8_x4 #endif + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#endif + #endif #if defined(__ARM_NEON) || defined(__wasm_simd128__) @@ -2468,32 +2480,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -2776,32 +2768,12 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; @@ -2963,32 +2935,12 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -3275,32 +3227,12 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; @@ -3550,7 +3482,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri const int8x16_t y1_0 = vld1q_s8(y1->qs); const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); @@ -3558,26 +3489,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - -#else - const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); - const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); - const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); - const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); - - const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); - const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); - const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); - const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); - - const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); - const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); - const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); - const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); -#endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -3650,12 +3561,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); const uint8x16_t m4 = vdupq_n_u8(0xF); -#if defined(__ARM_FEATURE_DOTPROD) - const int32x4_t vzero = vdupq_n_s32(0); -#endif + + const int32x4_t vzero = vdupq_n_s32(0); ggml_int8x16x2_t q2bytes; uint8_t aux[16]; @@ -3663,7 +3572,6 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri float sum = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); @@ -3689,20 +3597,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri // We use this macro instead of a function call because for some reason // the code runs 2-3% slower, even if the function is declared inline -#if defined(__ARM_FEATURE_DOTPROD) #define MULTIPLY_ACCUM_WITH_SCALE(index)\ isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; -#else -#define MULTIPLY_ACCUM_WITH_SCALE(index)\ - {\ - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),\ - vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\ - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),\ - vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\ - isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\ - } -#endif #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ @@ -3710,26 +3607,23 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ MULTIPLY_ACCUM_WITH_SCALE((index)); - for (int j = 0; j < QK_K/128; ++j) { - const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32; ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); + MULTIPLY_ACCUM_WITH_SCALE(0); SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); is += 8; } - sum += d * isum; + sum += d * isum; } *s = sum; @@ -4043,11 +3937,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); -#if defined(__ARM_FEATURE_DOTPROD) - const int32x4_t vzero = vdupq_n_s32(0); -#endif + + const int32x4_t vzero = vdupq_n_s32(0); ggml_int8x16x4_t q2bytes; @@ -4081,28 +3973,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); -#if defined(__ARM_FEATURE_DOTPROD) isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; -#else - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum1 += vaddvq_s16(p1) * scales[0]; - isum2 += vaddvq_s16(p2) * scales[1]; - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum1 += vaddvq_s16(p3) * scales[2]; - isum2 += vaddvq_s16(p4) * scales[3]; -#endif sum += d * (isum1 + isum2); - } *s = sum; @@ -4328,9 +4204,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri uint32_t utmp[4]; const uint8x16_t m3b = vdupq_n_u8(0x3); -#ifdef __ARM_FEATURE_DOTPROD const int32x4_t vzero = vdupq_n_s32(0); -#endif const uint8x16_t m0 = vdupq_n_u8(1); const uint8x16_t m1 = vshlq_n_u8(m0, 1); @@ -4382,22 +4256,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; -#else - int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])), - vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0]))); - int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])), - vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1]))); - int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])), - vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2]))); - int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])), - vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3]; -#endif + scale += 4; q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); @@ -4410,22 +4273,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; -#else - p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])), - vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0]))); - p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])), - vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1]))); - p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])), - vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2]))); - p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])), - vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3]; -#endif + scale += 4; if (j == 0) { @@ -4864,10 +4716,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const int nb = n / QK_K; #ifdef __ARM_NEON - -#ifdef __ARM_FEATURE_DOTPROD - const int32x4_t vzero = vdupq_n_s32(0); -#endif + const int32x4_t vzero = vdupq_n_s32(0); const uint8x16_t m3b = vdupq_n_u8(0x3); const uint8x16_t mh = vdupq_n_u8(4); @@ -4908,22 +4757,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; -#else - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3]; -#endif sum += d * isum; @@ -5228,11 +5065,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri uint32_t utmp[4]; #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); -#ifdef __ARM_FEATURE_DOTPROD const int32x4_t mzero = vdupq_n_s32(0); -#endif ggml_int8x16x2_t q4bytes; ggml_int8x16x2_t q8bytes; @@ -5269,10 +5103,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri int32_t sumi2 = 0; for (int j = 0; j < QK_K/64; ++j) { - const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32; -#ifdef __ARM_FEATURE_DOTPROD q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); @@ -5287,26 +5119,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); sumi2 += vaddvq_s32(p2) * scales[2*j+1]; -#else - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0]; - - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1]; - -#endif } sumf += d * (sumi1 + sumi2); @@ -5603,12 +5415,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); -#ifdef __ARM_FEATURE_DOTPROD const int32x4_t mzero = vdupq_n_s32(0); -#endif float sumf = 0; @@ -5636,7 +5445,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); -#ifdef __ARM_FEATURE_DOTPROD q8bytes = ggml_vld1q_s8_x4(q8); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); @@ -5650,27 +5458,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; -#else - q8bytes = ggml_vld1q_s8_x4(q8); - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0]; - - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3]))); - int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1]; - -#endif sumf += d * (sumi1 + sumi2); - } *s = sumf - sum_mins; @@ -5875,15 +5663,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri uint32_t utmp[4]; - #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t mone = vdupq_n_u8(1); const uint8x16_t mtwo = vdupq_n_u8(2); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t mzero = vdupq_n_s32(0); -#endif ggml_int8x16x4_t q5bytes; @@ -5938,28 +5722,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2])); q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; -#else - - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++; - - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++; -#endif } sumf += d * sumi - dmin * sumi_mins; - } *s = sumf; @@ -6311,12 +6078,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t mh = vdupq_n_u8(16); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t mzero = vdupq_n_s32(0); -#endif ggml_int8x16x4_t q5bytes; ggml_uint8x16x4_t q5h; @@ -6348,32 +6112,12 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); - -#else - - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1); - - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3); - - sumf += d*sumi; -#endif - } *s = sumf; @@ -6600,13 +6344,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const int nb = n / QK_K; #ifdef __ARM_NEON - float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); -#endif //const int8x16_t m32s = vdupq_n_s8(32); const uint8x16_t mone = vdupq_n_u8(3); @@ -6658,31 +6399,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + scale += 4; -#else - - int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1]; - scale += 2; - - int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1]; - scale += 2; -#endif - q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; shifted = vshrq_n_u8(qhbits.val[0], 4); @@ -6703,34 +6426,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; scale += 4; - - //for (int l = 0; l < 4; ++l) { - // const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]); - // isum += vaddvq_s32(p) * *scale++; - //} -#else - p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1]; - scale += 2; - - p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1]; - scale += 2; -#endif - } //sum += isum * d_all * y[i].d; sum += d_all * y[i].d * (isum - 32 * isum_mins); @@ -7076,14 +6776,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const int nb = n / QK_K; #ifdef __ARM_NEON - float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); const int8x16_t m32s = vdupq_n_s8(32); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); -#endif const uint8x16_t mone = vdupq_n_u8(3); @@ -7119,26 +6816,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); -#if defined(__ARM_FEATURE_DOTPROD) - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; -#else - - int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1]; - - int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3]; -#endif sum += isum * d_all * y[i].d; From b47879b0dda43f2d26415e88b6840295817e552a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 27 Dec 2023 11:15:31 +0200 Subject: [PATCH 3/5] scripts : add sync-ggml-am.sh --- scripts/sync-ggml-am.sh | 131 ++++++++++++++++++++++++++++++++++++++++ scripts/sync-ggml.last | 1 + 2 files changed, 132 insertions(+) create mode 100755 scripts/sync-ggml-am.sh create mode 100644 scripts/sync-ggml.last diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh new file mode 100755 index 000000000..83abe3681 --- /dev/null +++ b/scripts/sync-ggml-am.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# +# Synchronize ggml changes to llama.cpp +# +# Usage: +# +# $ cd /path/to/llama.cpp +# $ ./scripts/sync-ggml-am.sh +# + +set -e + +sd=$(dirname $0) +cd $sd/../ + +SRC_LLAMA=$(pwd) +SRC_GGML=$(cd ../ggml; pwd) + +if [ ! -d $SRC_GGML ]; then + echo "ggml not found at $SRC_GGML" + exit 1 +fi + +lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last) +echo "Syncing ggml changes since commit $lc" + +cd $SRC_GGML + +git log --oneline $lc..HEAD + +git format-patch $lc --stdout -- \ + include/ggml/ggml*.h \ + src/ggml*.h \ + src/ggml*.c \ + src/ggml*.cpp \ + src/ggml*.m \ + src/ggml*.metal \ + src/ggml*.cu \ + tests/test-opt.cpp \ + tests/test-grad0.cpp \ + tests/test-quantize-fns.cpp \ + tests/test-quantize-perf.cpp \ + tests/test-backend-ops.cpp \ + > $SRC_LLAMA/ggml-src.patch + +# delete files if empty +if [ ! -s $SRC_LLAMA/ggml-src.patch ]; then + rm -v $SRC_LLAMA/ggml-src.patch +fi + +cd $SRC_LLAMA + +if [ -f $SRC_LLAMA/ggml-src.patch ]; then + # replace PR numbers + # + # Subject: some text (#1234) + # Subject: some text (ggml/1234) + cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp + mv ggml-src.patch.tmp ggml-src.patch + + cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp + mv ggml-src.patch.tmp ggml-src.patch + + # replace filenames: + # + # src/ggml.c -> ggml.c + # src/ggml-alloc.c -> ggml-alloc.c + # src/ggml-backend-impl.h -> ggml-backend-impl.h + # src/ggml-backend.c -> ggml-backend.c + # src/ggml-cuda.cu -> ggml-cuda.cu + # src/ggml-cuda.h -> ggml-cuda.h + # src/ggml-impl.h -> ggml-impl.h + # src/ggml-metal.h -> ggml-metal.h + # src/ggml-metal.m -> ggml-metal.m + # src/ggml-metal.metal -> ggml-metal.metal + # src/ggml-mpi.h -> ggml-mpi.h + # src/ggml-mpi.c -> ggml-mpi.c + # src/ggml-opencl.cpp -> ggml-opencl.cpp + # src/ggml-opencl.h -> ggml-opencl.h + # src/ggml-quants.c -> ggml-quants.c + # src/ggml-quants.h -> ggml-quants.h + # include/ggml/ggml.h -> ggml.h + # include/ggml/ggml-alloc.h -> ggml-alloc.h + # include/ggml/ggml-backend.h -> ggml-backend.h + # + # tests/test-opt.cpp -> tests/test-opt.cpp + # tests/test-grad0.cpp -> tests/test-grad0.cpp + # tests/test-quantize-fns.cpp -> tests/test-quantize-fns.cpp + # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp + # tests/test-backend-ops.cpp -> tests/test-backend-ops.cpp + + cat ggml-src.patch | sed \ + -e 's/src\/ggml\.c/ggml.c/g' \ + -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \ + -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \ + -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \ + -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ + -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ + -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ + -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \ + -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \ + -e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \ + -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \ + -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \ + -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \ + -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \ + -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \ + -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \ + -e 's/include\/ggml\/ggml\.h/ggml.h/g' \ + -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \ + -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \ + -e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \ + -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \ + -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \ + -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \ + -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \ + > ggml-src.patch.tmp + mv ggml-src.patch.tmp ggml-src.patch + + git am ggml-src.patch + + rm -v $SRC_LLAMA/ggml-src.patch +fi + +# update last commit +cd $SRC_GGML +git log -1 --format=%H > $SRC_LLAMA/scripts/sync-ggml.last + +echo "Done" + +exit 0 diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last new file mode 100644 index 000000000..1ec144116 --- /dev/null +++ b/scripts/sync-ggml.last @@ -0,0 +1 @@ +76e7f47b69e8334384dc718480c496dafbd47999 From 879b690a9e1eb1ab0a29b58236fc76978fb4d902 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 27 Dec 2023 15:16:55 +0100 Subject: [PATCH 4/5] finetune : fix output formatting in print_params (#4653) This commit fixes the output formatting in the print_params function which currently looks like this: ```console print_params: n_vocab: 32000 print_params: n_ctx: 128 print_params: n_embd: 4096 print_params: n_ff: 11008 print_params: n_head: 32 print_params: n_head_kv: 32 print_params: n_layer: 32 print_params: norm_rms_eps : 0.000010 print_params: rope_freq_base : 10000.000000 print_params: rope_freq_scale : 1.000000 ``` With this comit the output will look like this: ```console print_params: n_vocab : 32000 print_params: n_ctx : 128 print_params: n_embd : 4096 print_params: n_ff : 11008 print_params: n_head : 32 print_params: n_head_kv : 32 print_params: n_layer : 32 print_params: norm_rms_eps : 0.000010 print_params: rope_freq_base : 10000.000000 print_params: rope_freq_scale : 1.000000 ``` Signed-off-by: Daniel Bevenius --- examples/finetune/finetune.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 7b1333a9d..e0520f64c 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -196,13 +196,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); - printf("%s: n_embd: %u\n", __func__, params->n_embd); - printf("%s: n_ff: %u\n", __func__, params->n_ff); - printf("%s: n_head: %u\n", __func__, params->n_head); - printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_vocab : %u\n", __func__, params->n_vocab); + printf("%s: n_ctx : %u\n", __func__, params->n_ctx); + printf("%s: n_embd : %u\n", __func__, params->n_embd); + printf("%s: n_ff : %u\n", __func__, params->n_ff); + printf("%s: n_head : %u\n", __func__, params->n_head); + printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv); + printf("%s: n_layer : %u\n", __func__, params->n_layer); printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); From f6793491b5af6da75edad34d6f503ef86d31b09f Mon Sep 17 00:00:00 2001 From: "Nam D. Tran" <42194884+namtranase@users.noreply.github.com> Date: Wed, 27 Dec 2023 22:39:45 +0700 Subject: [PATCH 5/5] llama : add AWQ for llama, llama2, mpt, and mistral models (#4593) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update: awq support llama-7b model * update: change order * update: benchmark results for llama2-7b * update: mistral 7b v1 benchmark * update: support 4 models * fix: Readme * update: ready for PR * update: readme * fix: readme * update: change order import * black * format code * update: work for bot mpt and awqmpt * update: readme * Rename to llm_build_ffn_mpt_awq * Formatted other files * Fixed params count * fix: remove code * update: more detail for mpt * fix: readme * fix: readme * update: change folder architecture * fix: common.cpp * fix: readme * fix: remove ggml_repeat * update: cicd * update: cicd * uppdate: remove use_awq arg * update: readme * llama : adapt plamo to new ffn ggml-ci --------- Co-authored-by: Trần Đức Nam Co-authored-by: Le Hoang Anh Co-authored-by: Georgi Gerganov --- awq-py/README.md | 116 +++++++++++++++ awq-py/awq/apply_awq.py | 254 +++++++++++++++++++++++++++++++++ awq-py/requirements.txt | 2 + convert-hf-to-gguf.py | 27 +++- convert.py | 14 ++ gguf-py/gguf/constants.py | 3 + gguf-py/gguf/tensor_mapping.py | 5 + llama.cpp | 27 +++- 8 files changed, 443 insertions(+), 5 deletions(-) create mode 100644 awq-py/README.md create mode 100644 awq-py/awq/apply_awq.py create mode 100644 awq-py/requirements.txt diff --git a/awq-py/README.md b/awq-py/README.md new file mode 100644 index 000000000..59354f4e3 --- /dev/null +++ b/awq-py/README.md @@ -0,0 +1,116 @@ +# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp +[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)] + +**Supported models:** + +- [X] LLaMA +- [x] LLaMA 2 +- [X] MPT +- [X] Mistral AI v0.1 +- [ ] Bloom +- [ ] Mixtral MoE + +**TODO:** +- [x] Update version work with both MPT and MPT-AWQ model +- [ ] Add OPT model +- [ ] Add Bloom model +- [ ] Add Mixtral MoE +- [ ] Support w3, w2 + + +## Contents + +- [Install](##Install) +- [Convert](##Convert) +- [Quantize](##Quantize) +- [Test](##Test) +- [Benchmark](##Benchmark) +- [Results](##Results) + +## Install +Install requirements +```bash +pip install -r requirements.txt +``` +Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT +```bash +git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache +``` + +## Convert +Example for llama model +```bash +# For llama7b and llama2 models +python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf +# For mistral and mpt models +python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf +``` + +## Quantize +```bash +# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types. +./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0 +``` + +## Test +```bash +# For all models. +./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time" +``` + +## Benchmark +The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512. +```bash +# For llama and llama2, and mistral models. +./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw +``` + +## Results +Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison +We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k + +### Llama 7B (Build with OpenBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|-----------:|--------------|-------:|-------:|-------:|-------:| +|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 | +|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | +|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 | +|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | +|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + + +### Llama2 7B (Build with CuBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|------------:|--------------|-------:|-------:|-------:|-------:| +|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 | +|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | +|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 | +|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | +|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + + +### Mistral 7B v0.1 (Build with CuBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|-------------:|--------------|-------:|-------:|-------:|-------:| +|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 | +|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G | +|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 | +|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G | +|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | + +### MPT 7B (Build with OpenBLAS) + +| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | +|---------:|--------------|-------:|-------:|-------:|--------:| +|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 | +|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G | +|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | +|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873| +|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G | +|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | diff --git a/awq-py/awq/apply_awq.py b/awq-py/awq/apply_awq.py new file mode 100644 index 000000000..11132c5d2 --- /dev/null +++ b/awq-py/awq/apply_awq.py @@ -0,0 +1,254 @@ +""" +Implements the AWQ for llama.cpp use cases. +Original paper: https://arxiv.org/abs/2306.00978 + +This code is based on versions of the AWQ implementation found in the following repositories: +* https://github.com/mit-han-lab/llm-awq +* https://github.com/casper-hansen/AutoAWQ +""" + +import os +import torch +import torch.nn as nn + +from transformers import AutoModelForCausalLM, AutoConfig +from transformers.models.bloom.modeling_bloom import BloomGelu +from transformers.models.llama.modeling_llama import LlamaRMSNorm +from transformers.activations import GELUActivation + + +class ScaledActivation(nn.Module): + """ + ScaledActivation module wraps an existing activation function and applies a + scale factor to its output. + + Args: + module (nn.Module): The activation function to be scaled. + scales (torch.Tensor): A tensor of size (num_features,) containing the initial + scale factors for each feature. + + Returns: + torch.Tensor: The scaled output of the activation function. + """ + + def __init__(self, module, scales): + super().__init__() + self.act = module + self.scales = nn.Parameter(scales.data) + + def forward(self, x): + return self.act(x) / self.scales.view(1, 1, -1).to(x.device) + + +def set_op_by_name(layer, name, new_module): + """ + Set the new module for given module's name. + + Args: + layer (nn.Module): The layer in which to replace the submodule. + name (str): The path to the submodule to be replaced, using dot notation + to access nested modules. + new_module (nn.Module): The new module to replace the existing one. + """ + levels = name.split(".") + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_op_by_name(module, op_name): + """ + Retrieves a submodule within a given layer based on its name. + + Args: + module (nn.Module): The layer containing the submodule to find. + op_name (str): The name of the submodule. + + Returns: + nn.Module: The requested submodule found within the given layer. + + Raises: + ValueError: If the specified submodule cannot be found within the layer. + """ + for name, m in module.named_modules(): + if name == op_name: + return m + raise ValueError(f"Cannot find op {op_name} in module {module}") + + +@torch.no_grad() +def scale_ln_fcs(ln, fcs, scales): + """ + Scales the weights of a LayerNorm and a list of fully-connected layers proportionally. + + Args: + ln (nn.LayerNorm): The LayerNorm module to be scaled. + fcs (List[nn.Linear]): A list of fully-connected layers to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + """ + + if not isinstance(fcs, list): + fcs = [fcs] + + scales = scales.to(ln.weight.device) + + ln.weight.div_(scales) + if hasattr(ln, "bias") and ln.bias is not None: + ln.bias.div_(scales) + + for fc in fcs: + fc.weight.mul_(scales.view(1, -1)) + + for p in ln.parameters(): + assert torch.isnan(p).sum() == 0 + for fc in fcs: + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + +@torch.no_grad() +def scale_fc_fc(fc1, fc2, scales): + """ + Scales the weights of two fully-connected layers in a specific pattern. + + Args: + fc1 (nn.Linear): The first fully-connected layer to be scaled. + fc2 (nn.Linear): The second fully-connected layer to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + """ + assert isinstance(fc1, nn.Linear) + assert isinstance(fc2, nn.Linear) + + scales = scales.to(fc1.weight.device) + + fc1.weight[-scales.size(0):].div_(scales.view(-1, 1)) + if fc1.bias is not None: + fc1.bias.div_(scales.view(-1)) + + fc2.weight.mul_(scales.view(1, -1)) + + for p in fc1.parameters(): + assert torch.isnan(p).sum() == 0 + for p in fc2.parameters(): + assert torch.isnan(p).sum() == 0 + + +@torch.no_grad() +def scale_gelu_fc(gelu, fc, scales): + """ + Scales the weight of a GELU activation and a fully-connected layer proportionally. + + Args: + gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled. + fc (nn.Linear): The fully-connected layer to be scaled. + scales (torch.Tensor): A 1D tensor of size (num_features,). + + Raises: + TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`. + TypeError: If the `fc` module is not of type `nn.Linear`. + """ + assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation)) + assert isinstance(fc, nn.Linear) + + fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) + + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + +def apply_scale(module, scales_list, input_feat_dict=None): + """ + Applies different scaling strategies to layers based on their type and hierarchy within a given module. + + Args: + module (nn.Module): The module containing the layers to be scaled. + scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing: + * prev_op_name (str): The name of the preceding operation or module, + relative to which the layers to be scaled are located. + * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation. + * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature. + input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding + input features (optional). + """ + for prev_op_name, layer_names, scales in scales_list: + prev_op = get_op_by_name(module, prev_op_name) + layers = [get_op_by_name(module, name) for name in layer_names] + + prev_op.cuda() + for layer in layers: + layer.cuda() + scales.cuda() + + if isinstance(prev_op, nn.Linear): + assert len(layers) == 1 + scale_fc_fc(prev_op, layers[0], scales) + elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower(): + scale_ln_fcs(prev_op, layers, scales) + elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)): + new_module = ScaledActivation(prev_op, scales) + set_op_by_name(module, prev_op_name, new_module) + scale_gelu_fc(prev_op, layers[0], scales) + else: + raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!") + + # apply the scaling to input feat if given; prepare it for clipping + if input_feat_dict is not None: + for layer_name in layer_names: + inp = input_feat_dict[layer_name] + inp.div_(scales.view(1, -1).to(inp.device)) + + prev_op.cpu() + for layer in layers: + layer.cpu() + scales.cpu() + + +@torch.no_grad() +def apply_clip(module, clip_list): + """ + Applies element-wise clipping to the weight of a specific layer within a given module. + + Args: + module (nn.Module): The module containing the layer to be clipped. + clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing: + * name (str): The name of the layer to be clipped, relative to the root of the module. + * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight. + """ + for name, max_val in clip_list: + layer = get_op_by_name(module, name) + layer.cuda() + max_val = max_val.to(layer.weight.device) + org_shape = layer.weight.shape + layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) + layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) + layer.weight.data = layer.weight.data.reshape(org_shape) + layer.cpu() + + +def add_scale_weights(model_path, scale_path, tmp_path): + """ + Adds pre-computed Activation Weight Quantization (AWQ) results to a model, + including scaling factors and clipping bounds. + + Args: + model_path (str): Path to the pre-trained model to be equipped with AWQ. + scale_path (str): Path to the AWQ scale factors (.pt file). + tmp_path (str): Path to the temporary directory where the equipped model will be saved. + """ + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_path, config=config, trust_remote_code=True + ) + model.eval() + awq_results = torch.load(str(scale_path), map_location="cpu") + apply_scale(model, awq_results["scale"]) + apply_clip(model, awq_results["clip"]) + model.save_pretrained(str(tmp_path)) + os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}") diff --git a/awq-py/requirements.txt b/awq-py/requirements.txt new file mode 100644 index 000000000..5fe604329 --- /dev/null +++ b/awq-py/requirements.txt @@ -0,0 +1,2 @@ +torch>=2.0.0 +transformers>=4.32.0 diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 303d08170..7dbc28147 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -46,7 +46,7 @@ class Model: self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess) + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) def set_vocab(self): self._set_vocab_gpt2() @@ -59,7 +59,7 @@ class Model: from safetensors import safe_open ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True)) with ctx as model_part: for name in model_part.keys(): @@ -464,7 +464,11 @@ class MPTModel(Model): data = data_torch.squeeze().numpy() # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if "scales" in name: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print(f"Can not map tensor {name!r}") sys.exit() @@ -1095,6 +1099,9 @@ def parse_args() -> argparse.Namespace: "--vocab-only", action="store_true", help="extract only the vocab", ) + parser.add_argument( + "--awq-path", type=Path, default=None, + help="Path to scale awq cache file") parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input", @@ -1115,6 +1122,20 @@ def parse_args() -> argparse.Namespace: args = parse_args() dir_model = args.model + +if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights + tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at {tmp_model_path}.") + if not dir_model.is_dir(): print(f'Error: {args.model} is not a directory', file=sys.stderr) sys.exit(1) diff --git a/convert.py b/convert.py index 1f0c4f2f4..c3f3fc0a1 100755 --- a/convert.py +++ b/convert.py @@ -1187,6 +1187,7 @@ def main(args_in: list[str] | None = None) -> None: # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") @@ -1200,6 +1201,19 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") args = parser.parse_args(args_in) + if args.awq_path: + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights + tmp_model_path = args.model / "weighted_model" + if tmp_model_path.is_dir(): + print(f"{tmp_model_path} exists as a weighted model.") + else: + tmp_model_path.mkdir(parents=True, exist_ok=True) + print("Saving new weighted model ...") + add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) + print(f"Saved weighted model at {tmp_model_path}.") + args.model = tmp_model_path + if args.dump_single: model_plus = lazy_load_file(args.model) do_dump_model(model_plus) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4cd87cdda..c9be21119 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -120,6 +120,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE = auto() FFN_DOWN = auto() FFN_UP = auto() + FFN_ACT = auto() FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() @@ -169,6 +170,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", @@ -269,6 +271,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_ACT, ], MODEL_ARCH.GPTJ: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 446c6b688..0b8f70417 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -188,6 +188,11 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral ), + # AWQ-activation gate + MODEL_TENSOR.FFN_ACT: ( + "transformer.blocks.{bid}.ffn.act", # mpt + ), + # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( "model.layers.{bid}.mlp.gate_proj", # llama-hf refact diff --git a/llama.cpp b/llama.cpp index 4aa59c4c0..bf1b01a90 100644 --- a/llama.cpp +++ b/llama.cpp @@ -354,6 +354,7 @@ enum llm_tensor { LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_ACT, LLM_TENSOR_FFN_DOWN_EXP, LLM_TENSOR_FFN_GATE_EXP, LLM_TENSOR_FFN_UP_EXP, @@ -473,6 +474,7 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, }, }, { @@ -1285,6 +1287,7 @@ struct llama_hparams { float f_clamp_kqv; float f_max_alibi_bias; + bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; if (this->n_vocab != other.n_vocab) return true; @@ -1388,6 +1391,7 @@ struct llama_layer { // ff bias struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 + struct ggml_tensor * ffn_act; }; struct llama_kv_cell { @@ -3471,7 +3475,6 @@ static bool llm_load_tensors( case LLM_ARCH_MPT: { model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - // output { ggml_backend_type backend_norm; @@ -3509,6 +3512,9 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + // AWQ ScaleActivation layer + layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false); } } break; case LLM_ARCH_STABLELM: @@ -4039,6 +4045,7 @@ static struct ggml_tensor * llm_build_ffn( struct ggml_tensor * gate_b, struct ggml_tensor * down, struct ggml_tensor * down_b, + struct ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, const llm_build_cb & cb, @@ -4083,6 +4090,10 @@ static struct ggml_tensor * llm_build_ffn( { cur = ggml_gelu(ctx, cur); cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx, cur, act_scales); + cb(cur, "ffn_act", il); + } } break; case LLM_FFN_RELU: { @@ -4401,6 +4412,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } else { @@ -4580,6 +4592,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -4694,6 +4707,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -4798,6 +4812,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5002,6 +5017,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5088,6 +5104,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5183,6 +5200,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5268,11 +5286,11 @@ struct llm_build_context { NULL, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, + model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5381,6 +5399,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5493,6 +5512,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5600,6 +5620,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(ffn_output, "ffn_out", il); } @@ -5703,6 +5724,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5887,6 +5909,7 @@ static const std::unordered_map k_offload_map { "ffn_gate", OFFLOAD_FUNC }, { "ffn_gate_b", OFFLOAD_FUNC }, { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_act", OFFLOAD_FUNC }, { "ffn_down", OFFLOAD_FUNC }, { "ffn_down_b", OFFLOAD_FUNC }, { "ffn_out", OFFLOAD_FUNC },