Backwards-compatibility quantization shim is ready, but the upstream model converter is still broken.
This commit is contained in:
parent
08810d5fee
commit
b335f73a60
5 changed files with 95 additions and 69 deletions
31
ggml.c
31
ggml.c
|
@ -393,9 +393,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
//legacy functions
|
||||
#include "ggml_v2.c"
|
||||
|
||||
//
|
||||
// timing
|
||||
//
|
||||
|
@ -724,6 +721,31 @@ typedef struct {
|
|||
} block_q8_1;
|
||||
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
#define QK4_2 16
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qs[QK4_2 / 2]; // nibbles / quants
|
||||
} block_q4_2;
|
||||
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
|
||||
|
||||
#define QK4_3 16
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
ggml_fp16_t m; // min
|
||||
uint8_t qs[QK4_3 / 2]; // nibbles / quants
|
||||
} block_q4_3;
|
||||
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
|
||||
|
||||
#define QK8_1 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float s0; // d * sum(qs[i]) low
|
||||
float s1; // d * sum(qs[i]) high
|
||||
int8_t qs[QK8_1]; // quants
|
||||
} block_q8_1_v2;
|
||||
static_assert(sizeof(block_q8_1_v2) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
|
||||
static const int qk = QK4_0;
|
||||
|
@ -12398,3 +12420,6 @@ int ggml_cpu_has_vsx(void) {
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//legacy functions
|
||||
#include "ggml_v2.c"
|
14
ggml.h
14
ggml.h
|
@ -876,15 +876,21 @@ extern "C" {
|
|||
//
|
||||
|
||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_0_v2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1_v2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_chunk_v2(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
//
|
||||
// system info
|
||||
//
|
||||
|
|
93
ggml_v2.c
93
ggml_v2.c
|
@ -43,9 +43,12 @@ static inline __m256i bytes_from_nibbles_32_v2(const uint8_t * rsi)
|
|||
bytes = _mm256_or_si256( low, high );
|
||||
return bytes;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#if __ARM_NEON
|
||||
#if !defined(__aarch64__)
|
||||
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
|
||||
int8x8_t res;
|
||||
|
||||
|
@ -133,38 +136,10 @@ uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
|
|||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#define QK4_2 16
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qs[QK4_2 / 2]; // nibbles / quants
|
||||
} block_q4_2_v2;
|
||||
static_assert(sizeof(block_q4_2_v2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
|
||||
|
||||
#define QK4_3 16
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
ggml_fp16_t m; // min
|
||||
uint8_t qs[QK4_3 / 2]; // nibbles / quants
|
||||
} block_q4_3_v2;
|
||||
static_assert(sizeof(block_q4_3_v2) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
|
||||
|
||||
|
||||
#define QK8_1 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float s0; // d * sum(qs[i]) low
|
||||
float s1; // d * sum(qs[i]) high
|
||||
int8_t qs[QK8_1]; // quants
|
||||
} block_q8_1_v2;
|
||||
static_assert(sizeof(block_q8_1_v2) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
static void quantize_row_q4_0_reference_v2(const float * restrict x, block_q4_0 * restrict y, int k) {
|
||||
assert(k % QK4_0 == 0);
|
||||
|
@ -676,7 +651,7 @@ static void quantize_row_q4_1_v2(const float * restrict x, void * restrict vy, i
|
|||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
static void quantize_row_q4_2_reference_v2(const float * restrict x, block_q4_2_v2 * restrict y, int k) {
|
||||
static void quantize_row_q4_2_reference_v2(const float * restrict x, block_q4_2 * restrict y, int k) {
|
||||
assert(k % QK4_2 == 0);
|
||||
|
||||
const int nb = k / QK4_2;
|
||||
|
@ -717,12 +692,12 @@ static void quantize_row_q4_2_reference_v2(const float * restrict x, block_q4_2_
|
|||
static void quantize_row_q4_2_v2(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK4_2 == 0);
|
||||
|
||||
block_q4_2_v2 * restrict y = vy;
|
||||
block_q4_2 * restrict y = vy;
|
||||
|
||||
quantize_row_q4_2_reference_v2(x, y, k);
|
||||
}
|
||||
|
||||
static void quantize_row_q4_3_reference_v2(const float * restrict x, block_q4_3_v2 * restrict y, int k) {
|
||||
static void quantize_row_q4_3_reference_v2(const float * restrict x, block_q4_3 * restrict y, int k) {
|
||||
assert(k % QK4_3 == 0);
|
||||
const int nb = k / QK4_3;
|
||||
|
||||
|
@ -760,7 +735,7 @@ static void quantize_row_q4_3_reference_v2(const float * restrict x, block_q4_3_
|
|||
static void quantize_row_q4_3_v2(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK4_3 == 0);
|
||||
|
||||
block_q4_3_v2 * restrict y = vy;
|
||||
block_q4_3 * restrict y = vy;
|
||||
|
||||
quantize_row_q4_3_reference_v2(x, y, k);
|
||||
}
|
||||
|
@ -1443,7 +1418,7 @@ static void dequantize_row_q4_2_v2(const void * restrict vx, float * restrict y,
|
|||
assert(k % QK4_2 == 0);
|
||||
const int nb = k / QK4_2;
|
||||
|
||||
const block_q4_2_v2 * restrict x = vx;
|
||||
const block_q4_2 * restrict x = vx;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
@ -1472,7 +1447,7 @@ static void dequantize_row_q4_3_v2(const void * restrict vx, float * restrict y,
|
|||
assert(k % QK4_3 == 0);
|
||||
const int nb = k / QK4_3;
|
||||
|
||||
const block_q4_3_v2 * restrict x = vx;
|
||||
const block_q4_3 * restrict x = vx;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
@ -1975,7 +1950,7 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi
|
|||
assert(nb % 2 == 0);
|
||||
assert(QK8_0 == 2*QK4_2);
|
||||
|
||||
const block_q4_2_v2 * restrict x = vx;
|
||||
const block_q4_2 * restrict x = vx;
|
||||
const block_q8_0 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
@ -1983,10 +1958,10 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi
|
|||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q4_2_v2 * restrict x0_0 = &x[2*(i + 0) + 0];
|
||||
const block_q4_2_v2 * restrict x0_1 = &x[2*(i + 0) + 1];
|
||||
const block_q4_2_v2 * restrict x1_0 = &x[2*(i + 1) + 0];
|
||||
const block_q4_2_v2 * restrict x1_1 = &x[2*(i + 1) + 1];
|
||||
const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0];
|
||||
const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1];
|
||||
const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0];
|
||||
const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1];
|
||||
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
@ -2132,7 +2107,7 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi
|
|||
assert(nb % 2 == 0);
|
||||
assert(QK8_1 == 2*QK4_3);
|
||||
|
||||
const block_q4_3_v2 * restrict x = vx;
|
||||
const block_q4_3 * restrict x = vx;
|
||||
const block_q8_1_v2 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
@ -2143,8 +2118,8 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi
|
|||
float summs1 = 0.0f;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const block_q4_3_v2 * restrict x0_0 = &x[2*(i + 0) + 0];
|
||||
const block_q4_3_v2 * restrict x0_1 = &x[2*(i + 0) + 1];
|
||||
const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
|
||||
const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
|
||||
|
||||
const block_q8_1_v2 * restrict y0 = &y[i + 0];
|
||||
|
||||
|
@ -2816,7 +2791,7 @@ size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_
|
|||
const int nb = k / QK4_2;
|
||||
|
||||
for (int j = 0; j < n; j += k) {
|
||||
block_q4_2_v2 * restrict y = (block_q4_2_v2 *)dst + j/QK4_2;
|
||||
block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
|
||||
|
||||
quantize_row_q4_2_reference_v2(src + j, y, k);
|
||||
|
||||
|
@ -2831,7 +2806,7 @@ size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_
|
|||
}
|
||||
}
|
||||
|
||||
return (n/QK4_2*sizeof(block_q4_2_v2));
|
||||
return (n/QK4_2*sizeof(block_q4_2));
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
|
@ -2839,7 +2814,7 @@ size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_
|
|||
const int nb = k / QK4_3;
|
||||
|
||||
for (int j = 0; j < n; j += k) {
|
||||
block_q4_3_v2 * restrict y = (block_q4_3_v2 *)dst + j/QK4_3;
|
||||
block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
|
||||
|
||||
quantize_row_q4_3_reference_v2(src + j, y, k);
|
||||
|
||||
|
@ -2854,7 +2829,7 @@ size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_
|
|||
}
|
||||
}
|
||||
|
||||
return (n/QK4_3*sizeof(block_q4_3_v2));
|
||||
return (n/QK4_3*sizeof(block_q4_3));
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
|
@ -2917,6 +2892,26 @@ size_t ggml_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_
|
|||
return (n/QK5_1*sizeof(block_q5_1));
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % QK8_0 == 0);
|
||||
const int nb = k / QK8_0;
|
||||
|
||||
for (int j = 0; j < n; j += k) {
|
||||
block_q8_0 * restrict y = (block_q8_0 *)dst + j/QK8_0;
|
||||
|
||||
quantize_row_q8_0_reference_v2(src + j, y, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int l = 0; l < QK8_0; ++l) {
|
||||
const int8_t vi = y[i].qs[l];
|
||||
|
||||
hist[vi/16 + 8]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/QK8_0*sizeof(block_q8_0));
|
||||
}
|
||||
|
||||
//TODO: integrate
|
||||
size_t ggml_quantize_chunk_v2(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
|
||||
|
@ -2937,13 +2932,13 @@ size_t ggml_quantize_chunk_v2(enum ggml_type type, const float * src, void * dst
|
|||
case GGML_TYPE_Q4_2:
|
||||
{
|
||||
GGML_ASSERT(start % QK4_2 == 0);
|
||||
block_q4_2_v2 * block = (block_q4_2_v2*)dst + start / QK4_2;
|
||||
block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
|
||||
result = ggml_quantize_q4_2_v2(src + start, block, n, n, hist);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_3:
|
||||
{
|
||||
GGML_ASSERT(start % QK4_3 == 0);
|
||||
block_q4_3_v2 * block = (block_q4_3_v2*)dst + start / QK4_3;
|
||||
block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
|
||||
result = ggml_quantize_q4_3_v2(src + start, block, n, n, hist);
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
|
|
|
@ -773,22 +773,22 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
|
|||
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
break;
|
||||
case GGML_TYPE_Q4_2:
|
||||
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
break;
|
||||
default: {
|
||||
fprintf(stderr, "unsupported quantization type %d\n", type);
|
||||
|
|
|
@ -166,31 +166,31 @@ bool ggml_common_quantize_0(
|
|||
switch ((ggml_type) ttype) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_2:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_3:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_3_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
{
|
||||
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue