Merge branch 'master' into concedo_experimental
# Conflicts: # README.md
This commit is contained in:
commit
69ab1bf2f8
7 changed files with 406 additions and 593 deletions
10
convert.py
10
convert.py
|
@ -357,6 +357,7 @@ class VocabLoader:
|
|||
for tok in self.tokenizer.all_special_tokens
|
||||
}
|
||||
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
|
||||
self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
|
||||
self.vocab_size_base: int = self.tokenizer.vocab_size
|
||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
|
||||
self.fname_tokenizer: Path = fname_tokenizer
|
||||
|
@ -370,15 +371,13 @@ class VocabLoader:
|
|||
self.spm = None
|
||||
|
||||
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.tokenizer
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
|
||||
added_tokens_ids = set(self.added_tokens_dict.values())
|
||||
|
||||
for i in range(self.vocab_size_base):
|
||||
if i in added_tokens_ids:
|
||||
continue
|
||||
|
||||
text = reverse_vocab[i].encode("utf-8")
|
||||
text = self.reverse_vocab[i].encode("utf-8")
|
||||
yield text, self.get_token_score(i), self.get_token_type(i)
|
||||
|
||||
def get_token_type(self, token_id: int) -> gguf.TokenType:
|
||||
|
@ -394,10 +393,13 @@ class VocabLoader:
|
|||
if self.spm.is_byte(token_id):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
token = self.reverse_vocab[token_id]
|
||||
if token_id == self.unk_token_id:
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
if token_id in self.special_ids:
|
||||
elif token_id in self.special_ids:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
|
||||
return toktype
|
||||
|
||||
|
|
484
ggml-cuda.cu
484
ggml-cuda.cu
File diff suppressed because it is too large
Load diff
363
ggml-quants.c
363
ggml-quants.c
|
@ -409,6 +409,18 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
|||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
||||
|
@ -2470,32 +2482,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
|||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
|
@ -2778,32 +2770,12 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
||||
|
@ -2965,32 +2937,12 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
|
@ -3277,32 +3229,12 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
||||
|
@ -3552,7 +3484,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t y1_0 = vld1q_s8(y1->qs);
|
||||
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
|
@ -3560,26 +3491,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
|||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
|
||||
#else
|
||||
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
|
||||
const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
|
||||
const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
|
||||
const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
|
||||
|
||||
const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
|
||||
const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
|
||||
const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
|
||||
const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
|
||||
|
||||
const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
|
||||
const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
|
||||
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
|
||||
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
|
@ -3652,12 +3563,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int nb = n / QK_K;
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
|
||||
ggml_int8x16x2_t q2bytes;
|
||||
uint8_t aux[16];
|
||||
|
@ -3665,7 +3574,6 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
float sum = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
|
@ -3691,20 +3599,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
|
||||
// We use this macro instead of a function call because for some reason
|
||||
// the code runs 2-3% slower, even if the function is declared inline
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||
#else
|
||||
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
||||
{\
|
||||
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),\
|
||||
vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\
|
||||
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),\
|
||||
vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\
|
||||
isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\
|
||||
}
|
||||
#endif
|
||||
|
||||
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
||||
|
@ -3712,26 +3609,23 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
|
||||
MULTIPLY_ACCUM_WITH_SCALE((index));
|
||||
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
|
||||
const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
|
||||
|
||||
ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
||||
q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
|
||||
q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
|
||||
|
||||
MULTIPLY_ACCUM_WITH_SCALE(0);
|
||||
|
||||
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
|
||||
|
||||
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
|
||||
|
||||
SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
|
||||
|
||||
is += 8;
|
||||
}
|
||||
sum += d * isum;
|
||||
|
||||
sum += d * isum;
|
||||
}
|
||||
|
||||
*s = sum;
|
||||
|
@ -4045,11 +3939,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int nb = n / QK_K;
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
|
||||
ggml_int8x16x4_t q2bytes;
|
||||
|
||||
|
@ -4083,28 +3975,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
|
||||
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
#else
|
||||
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
isum1 += vaddvq_s16(p1) * scales[0];
|
||||
isum2 += vaddvq_s16(p2) * scales[1];
|
||||
|
||||
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
||||
const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
||||
isum1 += vaddvq_s16(p3) * scales[2];
|
||||
isum2 += vaddvq_s16(p4) * scales[3];
|
||||
#endif
|
||||
sum += d * (isum1 + isum2);
|
||||
|
||||
}
|
||||
|
||||
*s = sum;
|
||||
|
@ -4330,9 +4206,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
uint32_t utmp[4];
|
||||
|
||||
const uint8x16_t m3b = vdupq_n_u8(0x3);
|
||||
#ifdef __ARM_FEATURE_DOTPROD
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
const uint8x16_t m0 = vdupq_n_u8(1);
|
||||
const uint8x16_t m1 = vshlq_n_u8(m0, 1);
|
||||
|
@ -4384,22 +4258,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
||||
#else
|
||||
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0])));
|
||||
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1])));
|
||||
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2])));
|
||||
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3])));
|
||||
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
|
||||
#endif
|
||||
|
||||
scale += 4;
|
||||
|
||||
q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
|
||||
|
@ -4412,22 +4275,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
||||
#else
|
||||
p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0])));
|
||||
p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1])));
|
||||
p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2])));
|
||||
p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3])));
|
||||
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
|
||||
#endif
|
||||
|
||||
scale += 4;
|
||||
|
||||
if (j == 0) {
|
||||
|
@ -4866,10 +4718,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int nb = n / QK_K;
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
#ifdef __ARM_FEATURE_DOTPROD
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
|
||||
const uint8x16_t m3b = vdupq_n_u8(0x3);
|
||||
const uint8x16_t mh = vdupq_n_u8(4);
|
||||
|
@ -4910,22 +4759,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
|
||||
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
#else
|
||||
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
||||
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
||||
isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3];
|
||||
#endif
|
||||
|
||||
sum += d * isum;
|
||||
|
||||
|
@ -5230,11 +5067,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
uint32_t utmp[4];
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
#ifdef __ARM_FEATURE_DOTPROD
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
ggml_int8x16x2_t q4bytes;
|
||||
ggml_int8x16x2_t q8bytes;
|
||||
|
@ -5271,10 +5105,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
int32_t sumi2 = 0;
|
||||
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
|
||||
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
|
||||
|
||||
#ifdef __ARM_FEATURE_DOTPROD
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
@ -5289,26 +5121,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
|
||||
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
|
||||
#else
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
|
||||
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1];
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
sumf += d * (sumi1 + sumi2);
|
||||
|
@ -5605,12 +5417,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int nb = n / QK_K;
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
|
||||
#ifdef __ARM_FEATURE_DOTPROD
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
|
@ -5638,7 +5447,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
|
||||
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
|
||||
|
||||
#ifdef __ARM_FEATURE_DOTPROD
|
||||
q8bytes = ggml_vld1q_s8_x4(q8);
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
@ -5652,27 +5460,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
||||
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
|
||||
|
||||
#else
|
||||
q8bytes = ggml_vld1q_s8_x4(q8);
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0];
|
||||
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2])));
|
||||
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3])));
|
||||
int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1];
|
||||
|
||||
#endif
|
||||
sumf += d * (sumi1 + sumi2);
|
||||
|
||||
}
|
||||
|
||||
*s = sumf - sum_mins;
|
||||
|
@ -5877,15 +5665,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
|
||||
uint32_t utmp[4];
|
||||
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
const uint8x16_t mone = vdupq_n_u8(1);
|
||||
const uint8x16_t mtwo = vdupq_n_u8(2);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
ggml_int8x16x4_t q5bytes;
|
||||
|
||||
|
@ -5940,28 +5724,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
|
||||
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
||||
#else
|
||||
|
||||
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++;
|
||||
|
||||
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
||||
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
||||
sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++;
|
||||
#endif
|
||||
}
|
||||
|
||||
sumf += d * sumi - dmin * sumi_mins;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
@ -6313,12 +6080,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int nb = n / QK_K;
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
const uint8x16_t mh = vdupq_n_u8(16);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
ggml_int8x16x4_t q5bytes;
|
||||
ggml_uint8x16x4_t q5h;
|
||||
|
@ -6350,32 +6114,12 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
|
||||
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
||||
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
||||
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
||||
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
||||
|
||||
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
||||
|
||||
#else
|
||||
|
||||
const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1);
|
||||
|
||||
const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
||||
const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
||||
sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3);
|
||||
|
||||
sumf += d*sumi;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
@ -6602,13 +6346,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int nb = n / QK_K;
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
float sum = 0;
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
//const int8x16_t m32s = vdupq_n_s8(32);
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(3);
|
||||
|
@ -6660,31 +6401,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
#else
|
||||
|
||||
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
||||
scale += 2;
|
||||
|
||||
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
||||
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
||||
isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
|
||||
scale += 2;
|
||||
#endif
|
||||
|
||||
q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||
|
||||
shifted = vshrq_n_u8(qhbits.val[0], 4);
|
||||
|
@ -6705,34 +6428,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
scale += 4;
|
||||
|
||||
//for (int l = 0; l < 4; ++l) {
|
||||
// const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]);
|
||||
// isum += vaddvq_s32(p) * *scale++;
|
||||
//}
|
||||
#else
|
||||
p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
||||
scale += 2;
|
||||
|
||||
p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
||||
p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
||||
isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
|
||||
scale += 2;
|
||||
#endif
|
||||
|
||||
}
|
||||
//sum += isum * d_all * y[i].d;
|
||||
sum += d_all * y[i].d * (isum - 32 * isum_mins);
|
||||
|
@ -7078,14 +6778,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
const int nb = n / QK_K;
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
float sum = 0;
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||
const int8x16_t m32s = vdupq_n_s8(32);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(3);
|
||||
|
||||
|
@ -7121,26 +6818,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
||||
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
#else
|
||||
|
||||
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
||||
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
||||
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
||||
|
||||
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
||||
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
||||
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
||||
isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
|
||||
#endif
|
||||
|
||||
sum += isum * d_all * y[i].d;
|
||||
|
||||
|
|
7
ggml.c
7
ggml.c
|
@ -4041,7 +4041,6 @@ static struct ggml_tensor * ggml_group_norm_impl(
|
|||
result->op = GGML_OP_GROUP_NORM;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = NULL; // TODO: maybe store epsilon here?
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -5541,7 +5540,6 @@ static struct ggml_tensor * ggml_upscale_impl(
|
|||
result->op_params[0] = scale_factor;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = NULL;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -5846,7 +5844,6 @@ struct ggml_tensor * ggml_get_rel_pos(
|
|||
result->op = GGML_OP_GET_REL_POS;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = NULL;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -17452,9 +17449,9 @@ static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g
|
|||
}
|
||||
|
||||
//
|
||||
// ADAM
|
||||
// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
|
||||
//
|
||||
// ref: https://arxiv.org/pdf/1412.6980.pdf
|
||||
// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
|
||||
//
|
||||
|
||||
static enum ggml_opt_result ggml_opt_adam(
|
||||
|
|
|
@ -9784,7 +9784,8 @@ struct llama_context * llama_new_context_with_model(
|
|||
ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
|
||||
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
ggml_cuda_set_scratch_size(alloc_size);
|
||||
// the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
|
||||
ggml_cuda_set_scratch_size(alloc_size + 64);
|
||||
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
||||
|
||||
// calculate total VRAM usage
|
||||
|
|
131
scripts/sync-ggml-am.sh
Executable file
131
scripts/sync-ggml-am.sh
Executable file
|
@ -0,0 +1,131 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# Synchronize ggml changes to llama.cpp
|
||||
#
|
||||
# Usage:
|
||||
#
|
||||
# $ cd /path/to/llama.cpp
|
||||
# $ ./scripts/sync-ggml-am.sh
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
sd=$(dirname $0)
|
||||
cd $sd/../
|
||||
|
||||
SRC_LLAMA=$(pwd)
|
||||
SRC_GGML=$(cd ../ggml; pwd)
|
||||
|
||||
if [ ! -d $SRC_GGML ]; then
|
||||
echo "ggml not found at $SRC_GGML"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
|
||||
echo "Syncing ggml changes since commit $lc"
|
||||
|
||||
cd $SRC_GGML
|
||||
|
||||
git log --oneline $lc..HEAD
|
||||
|
||||
git format-patch $lc --stdout -- \
|
||||
include/ggml/ggml*.h \
|
||||
src/ggml*.h \
|
||||
src/ggml*.c \
|
||||
src/ggml*.cpp \
|
||||
src/ggml*.m \
|
||||
src/ggml*.metal \
|
||||
src/ggml*.cu \
|
||||
tests/test-opt.cpp \
|
||||
tests/test-grad0.cpp \
|
||||
tests/test-quantize-fns.cpp \
|
||||
tests/test-quantize-perf.cpp \
|
||||
tests/test-backend-ops.cpp \
|
||||
> $SRC_LLAMA/ggml-src.patch
|
||||
|
||||
# delete files if empty
|
||||
if [ ! -s $SRC_LLAMA/ggml-src.patch ]; then
|
||||
rm -v $SRC_LLAMA/ggml-src.patch
|
||||
fi
|
||||
|
||||
cd $SRC_LLAMA
|
||||
|
||||
if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
# replace PR numbers
|
||||
#
|
||||
# Subject: some text (#1234)
|
||||
# Subject: some text (ggml/1234)
|
||||
cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
|
||||
mv ggml-src.patch.tmp ggml-src.patch
|
||||
|
||||
cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
|
||||
mv ggml-src.patch.tmp ggml-src.patch
|
||||
|
||||
# replace filenames:
|
||||
#
|
||||
# src/ggml.c -> ggml.c
|
||||
# src/ggml-alloc.c -> ggml-alloc.c
|
||||
# src/ggml-backend-impl.h -> ggml-backend-impl.h
|
||||
# src/ggml-backend.c -> ggml-backend.c
|
||||
# src/ggml-cuda.cu -> ggml-cuda.cu
|
||||
# src/ggml-cuda.h -> ggml-cuda.h
|
||||
# src/ggml-impl.h -> ggml-impl.h
|
||||
# src/ggml-metal.h -> ggml-metal.h
|
||||
# src/ggml-metal.m -> ggml-metal.m
|
||||
# src/ggml-metal.metal -> ggml-metal.metal
|
||||
# src/ggml-mpi.h -> ggml-mpi.h
|
||||
# src/ggml-mpi.c -> ggml-mpi.c
|
||||
# src/ggml-opencl.cpp -> ggml-opencl.cpp
|
||||
# src/ggml-opencl.h -> ggml-opencl.h
|
||||
# src/ggml-quants.c -> ggml-quants.c
|
||||
# src/ggml-quants.h -> ggml-quants.h
|
||||
# include/ggml/ggml.h -> ggml.h
|
||||
# include/ggml/ggml-alloc.h -> ggml-alloc.h
|
||||
# include/ggml/ggml-backend.h -> ggml-backend.h
|
||||
#
|
||||
# tests/test-opt.cpp -> tests/test-opt.cpp
|
||||
# tests/test-grad0.cpp -> tests/test-grad0.cpp
|
||||
# tests/test-quantize-fns.cpp -> tests/test-quantize-fns.cpp
|
||||
# tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
|
||||
# tests/test-backend-ops.cpp -> tests/test-backend-ops.cpp
|
||||
|
||||
cat ggml-src.patch | sed \
|
||||
-e 's/src\/ggml\.c/ggml.c/g' \
|
||||
-e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
|
||||
-e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
|
||||
-e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
|
||||
-e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
|
||||
-e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
|
||||
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
|
||||
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
|
||||
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
|
||||
-e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
|
||||
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
|
||||
-e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
|
||||
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
|
||||
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
|
||||
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
|
||||
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
|
||||
-e 's/include\/ggml\/ggml\.h/ggml.h/g' \
|
||||
-e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
|
||||
-e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
|
||||
-e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \
|
||||
-e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \
|
||||
-e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
|
||||
-e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
|
||||
-e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
|
||||
> ggml-src.patch.tmp
|
||||
mv ggml-src.patch.tmp ggml-src.patch
|
||||
|
||||
git am ggml-src.patch
|
||||
|
||||
rm -v $SRC_LLAMA/ggml-src.patch
|
||||
fi
|
||||
|
||||
# update last commit
|
||||
cd $SRC_GGML
|
||||
git log -1 --format=%H > $SRC_LLAMA/scripts/sync-ggml.last
|
||||
|
||||
echo "Done"
|
||||
|
||||
exit 0
|
1
scripts/sync-ggml.last
Normal file
1
scripts/sync-ggml.last
Normal file
|
@ -0,0 +1 @@
|
|||
76e7f47b69e8334384dc718480c496dafbd47999
|
Loading…
Add table
Add a link
Reference in a new issue