Merge branch 'master' into concedo_experimental

# Conflicts:
#	README.md
This commit is contained in:
Concedo 2023-12-27 21:43:46 +08:00
commit 69ab1bf2f8
7 changed files with 406 additions and 593 deletions

View file

@ -357,6 +357,7 @@ class VocabLoader:
for tok in self.tokenizer.all_special_tokens
}
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
self.vocab_size_base: int = self.tokenizer.vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
self.fname_tokenizer: Path = fname_tokenizer
@ -370,15 +371,13 @@ class VocabLoader:
self.spm = None
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.tokenizer
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
added_tokens_ids = set(self.added_tokens_dict.values())
for i in range(self.vocab_size_base):
if i in added_tokens_ids:
continue
text = reverse_vocab[i].encode("utf-8")
text = self.reverse_vocab[i].encode("utf-8")
yield text, self.get_token_score(i), self.get_token_type(i)
def get_token_type(self, token_id: int) -> gguf.TokenType:
@ -394,10 +393,13 @@ class VocabLoader:
if self.spm.is_byte(token_id):
toktype = gguf.TokenType.BYTE
else:
token = self.reverse_vocab[token_id]
if token_id == self.unk_token_id:
toktype = gguf.TokenType.UNKNOWN
if token_id in self.special_ids:
elif token_id in self.special_ids:
toktype = gguf.TokenType.CONTROL
elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
toktype = gguf.TokenType.BYTE
return toktype

File diff suppressed because it is too large Load diff

View file

@ -409,6 +409,18 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#endif
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#endif
#endif
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
@ -2470,32 +2482,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
// dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -2778,32 +2770,12 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
// dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@ -2965,32 +2937,12 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -3277,32 +3229,12 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@ -3552,7 +3484,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
const int8x16_t y1_0 = vld1q_s8(y1->qs);
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
#if defined(__ARM_FEATURE_DOTPROD)
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
@ -3560,26 +3491,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
#else
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
#endif
}
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -3652,12 +3563,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const int nb = n / QK_K;
#ifdef __ARM_NEON
const uint8x16_t m3 = vdupq_n_u8(0x3);
const uint8x16_t m4 = vdupq_n_u8(0xF);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
const int32x4_t vzero = vdupq_n_s32(0);
ggml_int8x16x2_t q2bytes;
uint8_t aux[16];
@ -3665,7 +3574,6 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
float sum = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
@ -3691,20 +3599,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
// We use this macro instead of a function call because for some reason
// the code runs 2-3% slower, even if the function is declared inline
#if defined(__ARM_FEATURE_DOTPROD)
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
#else
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
{\
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),\
vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),\
vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\
isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\
}
#endif
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
@ -3712,26 +3609,23 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
MULTIPLY_ACCUM_WITH_SCALE((index));
for (int j = 0; j < QK_K/128; ++j) {
const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
MULTIPLY_ACCUM_WITH_SCALE(0);
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
is += 8;
}
sum += d * isum;
sum += d * isum;
}
*s = sum;
@ -4045,11 +3939,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
const int nb = n / QK_K;
#ifdef __ARM_NEON
const uint8x16_t m3 = vdupq_n_u8(0x3);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
const int32x4_t vzero = vdupq_n_s32(0);
ggml_int8x16x4_t q2bytes;
@ -4083,28 +3975,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
#if defined(__ARM_FEATURE_DOTPROD)
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
#else
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));
isum1 += vaddvq_s16(p1) * scales[0];
isum2 += vaddvq_s16(p2) * scales[1];
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2])));
const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3])));
isum1 += vaddvq_s16(p3) * scales[2];
isum2 += vaddvq_s16(p4) * scales[3];
#endif
sum += d * (isum1 + isum2);
}
*s = sum;
@ -4330,9 +4206,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
uint32_t utmp[4];
const uint8x16_t m3b = vdupq_n_u8(0x3);
#ifdef __ARM_FEATURE_DOTPROD
const int32x4_t vzero = vdupq_n_s32(0);
#endif
const uint8x16_t m0 = vdupq_n_u8(1);
const uint8x16_t m1 = vshlq_n_u8(m0, 1);
@ -4384,22 +4258,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
#if defined(__ARM_FEATURE_DOTPROD)
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
#else
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])),
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0])));
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])),
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1])));
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])),
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2])));
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])),
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3])));
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
#endif
scale += 4;
q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
@ -4412,22 +4275,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
#if defined(__ARM_FEATURE_DOTPROD)
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
#else
p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])),
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0])));
p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])),
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1])));
p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])),
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2])));
p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])),
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3])));
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
#endif
scale += 4;
if (j == 0) {
@ -4866,10 +4718,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const int nb = n / QK_K;
#ifdef __ARM_NEON
#ifdef __ARM_FEATURE_DOTPROD
const int32x4_t vzero = vdupq_n_s32(0);
#endif
const int32x4_t vzero = vdupq_n_s32(0);
const uint8x16_t m3b = vdupq_n_u8(0x3);
const uint8x16_t mh = vdupq_n_u8(4);
@ -4910,22 +4759,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
#if defined(__ARM_FEATURE_DOTPROD)
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
#else
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0])));
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1])));
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2])));
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3])));
isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3];
#endif
sum += d * isum;
@ -5230,11 +5067,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
uint32_t utmp[4];
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
#ifdef __ARM_FEATURE_DOTPROD
const int32x4_t mzero = vdupq_n_s32(0);
#endif
ggml_int8x16x2_t q4bytes;
ggml_int8x16x2_t q8bytes;
@ -5271,10 +5105,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
int32_t sumi2 = 0;
for (int j = 0; j < QK_K/64; ++j) {
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
#ifdef __ARM_FEATURE_DOTPROD
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
@ -5289,26 +5121,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
#else
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1];
#endif
}
sumf += d * (sumi1 + sumi2);
@ -5605,12 +5417,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
const int nb = n / QK_K;
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
#ifdef __ARM_FEATURE_DOTPROD
const int32x4_t mzero = vdupq_n_s32(0);
#endif
float sumf = 0;
@ -5638,7 +5447,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
#ifdef __ARM_FEATURE_DOTPROD
q8bytes = ggml_vld1q_s8_x4(q8);
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
@ -5652,27 +5460,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
#else
q8bytes = ggml_vld1q_s8_x4(q8);
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0];
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2])));
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3])));
int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1];
#endif
sumf += d * (sumi1 + sumi2);
}
*s = sumf - sum_mins;
@ -5877,15 +5665,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
uint32_t utmp[4];
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
const uint8x16_t mone = vdupq_n_u8(1);
const uint8x16_t mtwo = vdupq_n_u8(2);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t mzero = vdupq_n_s32(0);
#endif
ggml_int8x16x4_t q5bytes;
@ -5940,28 +5724,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
#if defined(__ARM_FEATURE_DOTPROD)
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
#else
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++;
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++;
#endif
}
sumf += d * sumi - dmin * sumi_mins;
}
*s = sumf;
@ -6313,12 +6080,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
const int nb = n / QK_K;
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
const uint8x16_t mh = vdupq_n_u8(16);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t mzero = vdupq_n_s32(0);
#endif
ggml_int8x16x4_t q5bytes;
ggml_uint8x16x4_t q5h;
@ -6350,32 +6114,12 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
#if defined(__ARM_FEATURE_DOTPROD)
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
#else
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1);
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3);
sumf += d*sumi;
#endif
}
*s = sumf;
@ -6602,13 +6346,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
const int nb = n / QK_K;
#ifdef __ARM_NEON
float sum = 0;
const uint8x16_t m4b = vdupq_n_u8(0xF);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
//const int8x16_t m32s = vdupq_n_s8(32);
const uint8x16_t mone = vdupq_n_u8(3);
@ -6660,31 +6401,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
#if defined(__ARM_FEATURE_DOTPROD)
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4;
#else
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
scale += 2;
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
scale += 2;
#endif
q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
shifted = vshrq_n_u8(qhbits.val[0], 4);
@ -6705,34 +6428,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
#if defined(__ARM_FEATURE_DOTPROD)
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4;
//for (int l = 0; l < 4; ++l) {
// const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]);
// isum += vaddvq_s32(p) * *scale++;
//}
#else
p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
scale += 2;
p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
scale += 2;
#endif
}
//sum += isum * d_all * y[i].d;
sum += d_all * y[i].d * (isum - 32 * isum_mins);
@ -7078,14 +6778,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
const int nb = n / QK_K;
#ifdef __ARM_NEON
float sum = 0;
const uint8x16_t m4b = vdupq_n_u8(0xF);
const int8x16_t m32s = vdupq_n_s8(32);
#if defined(__ARM_FEATURE_DOTPROD)
const int32x4_t vzero = vdupq_n_s32(0);
#endif
const uint8x16_t mone = vdupq_n_u8(3);
@ -7121,26 +6818,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
#if defined(__ARM_FEATURE_DOTPROD)
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
#else
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
#endif
sum += isum * d_all * y[i].d;

7
ggml.c
View file

@ -4041,7 +4041,6 @@ static struct ggml_tensor * ggml_group_norm_impl(
result->op = GGML_OP_GROUP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = NULL; // TODO: maybe store epsilon here?
return result;
}
@ -5541,7 +5540,6 @@ static struct ggml_tensor * ggml_upscale_impl(
result->op_params[0] = scale_factor;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = NULL;
return result;
}
@ -5846,7 +5844,6 @@ struct ggml_tensor * ggml_get_rel_pos(
result->op = GGML_OP_GET_REL_POS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = NULL;
return result;
}
@ -17452,9 +17449,9 @@ static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g
}
//
// ADAM
// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
//
// ref: https://arxiv.org/pdf/1412.6980.pdf
// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
//
static enum ggml_opt_result ggml_opt_adam(

View file

@ -9784,7 +9784,8 @@ struct llama_context * llama_new_context_with_model(
ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
if (model->n_gpu_layers > 0) {
ggml_cuda_set_scratch_size(alloc_size);
// the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
ggml_cuda_set_scratch_size(alloc_size + 64);
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
// calculate total VRAM usage

131
scripts/sync-ggml-am.sh Executable file
View file

@ -0,0 +1,131 @@
#!/bin/bash
#
# Synchronize ggml changes to llama.cpp
#
# Usage:
#
# $ cd /path/to/llama.cpp
# $ ./scripts/sync-ggml-am.sh
#
set -e
sd=$(dirname $0)
cd $sd/../
SRC_LLAMA=$(pwd)
SRC_GGML=$(cd ../ggml; pwd)
if [ ! -d $SRC_GGML ]; then
echo "ggml not found at $SRC_GGML"
exit 1
fi
lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
echo "Syncing ggml changes since commit $lc"
cd $SRC_GGML
git log --oneline $lc..HEAD
git format-patch $lc --stdout -- \
include/ggml/ggml*.h \
src/ggml*.h \
src/ggml*.c \
src/ggml*.cpp \
src/ggml*.m \
src/ggml*.metal \
src/ggml*.cu \
tests/test-opt.cpp \
tests/test-grad0.cpp \
tests/test-quantize-fns.cpp \
tests/test-quantize-perf.cpp \
tests/test-backend-ops.cpp \
> $SRC_LLAMA/ggml-src.patch
# delete files if empty
if [ ! -s $SRC_LLAMA/ggml-src.patch ]; then
rm -v $SRC_LLAMA/ggml-src.patch
fi
cd $SRC_LLAMA
if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# replace PR numbers
#
# Subject: some text (#1234)
# Subject: some text (ggml/1234)
cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
mv ggml-src.patch.tmp ggml-src.patch
cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
mv ggml-src.patch.tmp ggml-src.patch
# replace filenames:
#
# src/ggml.c -> ggml.c
# src/ggml-alloc.c -> ggml-alloc.c
# src/ggml-backend-impl.h -> ggml-backend-impl.h
# src/ggml-backend.c -> ggml-backend.c
# src/ggml-cuda.cu -> ggml-cuda.cu
# src/ggml-cuda.h -> ggml-cuda.h
# src/ggml-impl.h -> ggml-impl.h
# src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m
# src/ggml-metal.metal -> ggml-metal.metal
# src/ggml-mpi.h -> ggml-mpi.h
# src/ggml-mpi.c -> ggml-mpi.c
# src/ggml-opencl.cpp -> ggml-opencl.cpp
# src/ggml-opencl.h -> ggml-opencl.h
# src/ggml-quants.c -> ggml-quants.c
# src/ggml-quants.h -> ggml-quants.h
# include/ggml/ggml.h -> ggml.h
# include/ggml/ggml-alloc.h -> ggml-alloc.h
# include/ggml/ggml-backend.h -> ggml-backend.h
#
# tests/test-opt.cpp -> tests/test-opt.cpp
# tests/test-grad0.cpp -> tests/test-grad0.cpp
# tests/test-quantize-fns.cpp -> tests/test-quantize-fns.cpp
# tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
# tests/test-backend-ops.cpp -> tests/test-backend-ops.cpp
cat ggml-src.patch | sed \
-e 's/src\/ggml\.c/ggml.c/g' \
-e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
-e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
-e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
-e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
-e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-e 's/include\/ggml\/ggml\.h/ggml.h/g' \
-e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
-e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
-e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \
-e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \
-e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
-e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
-e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
> ggml-src.patch.tmp
mv ggml-src.patch.tmp ggml-src.patch
git am ggml-src.patch
rm -v $SRC_LLAMA/ggml-src.patch
fi
# update last commit
cd $SRC_GGML
git log -1 --format=%H > $SRC_LLAMA/scripts/sync-ggml.last
echo "Done"
exit 0

1
scripts/sync-ggml.last Normal file
View file

@ -0,0 +1 @@
76e7f47b69e8334384dc718480c496dafbd47999