No second least squares pass
This commit is contained in:
parent
8386034e08
commit
5a02328d1f
1 changed files with 14 additions and 25 deletions
|
@ -5,6 +5,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <float.h>
|
#include <float.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
|
|
||||||
|
@ -457,20 +458,6 @@ static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static float lstsq_q_0(const float * restrict q, const float * restrict x, int qk) {
|
|
||||||
// Least squares fits `d * q = x` for d.
|
|
||||||
float qs2 = 0.0f;
|
|
||||||
float xq = 0.0f;
|
|
||||||
for (int i = 0; i < qk; i++) {
|
|
||||||
qs2 += q[i]*q[i];
|
|
||||||
xq += x[i]*q[i];
|
|
||||||
}
|
|
||||||
if (qs2 == 0.0f) {
|
|
||||||
return 0.0f;
|
|
||||||
}
|
|
||||||
return xq / qs2;
|
|
||||||
}
|
|
||||||
|
|
||||||
static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) {
|
static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) {
|
||||||
// Least squares fits `d * q = x` for d.
|
// Least squares fits `d * q = x` for d.
|
||||||
float qs2 = 0.0f;
|
float qs2 = 0.0f;
|
||||||
|
@ -1445,18 +1432,20 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Least squares fit min and scale.
|
if (pass == 0) {
|
||||||
float min, scale;
|
// Least squares fit min and scale.
|
||||||
lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale);
|
float min, scale;
|
||||||
// Check for nans.
|
lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale);
|
||||||
assert(min == min);
|
// Check for nans.
|
||||||
assert(scale == scale);
|
assert(min == min);
|
||||||
// Quantize to fp16 for the next pass.
|
assert(scale == scale);
|
||||||
max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group;
|
// Quantize to fp16 for the next pass.
|
||||||
max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group;
|
max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group;
|
||||||
|
max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group;
|
||||||
|
|
||||||
*block_scale = GGML_FP32_TO_FP16(scale);
|
*block_scale = GGML_FP32_TO_FP16(scale);
|
||||||
*block_min = GGML_FP32_TO_FP16(min);
|
*block_min = GGML_FP32_TO_FP16(min);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue