Adding Q5_K - scalar, AVX2, CUDA
Performance is ~20% lower compared to Q4_K on the CPU. This is to be expected, considering that we are memory bound on the CPU and the 5-bit model is ~22% larger than the 4-bit. On the GPU, single token prediction is about the same as Q4_0 for both, single token and batch prediction.
This commit is contained in:
parent
cf221afb55
commit
b835d0f49f
8 changed files with 440 additions and 12 deletions
6
Makefile
6
Makefile
|
@ -40,8 +40,10 @@ endif
|
||||||
#
|
#
|
||||||
|
|
||||||
# keep standard at C11 and C++11
|
# keep standard at C11 and C++11
|
||||||
CFLAGS = -I. -O3 -std=c11 -fPIC
|
#OPT = -Ofast
|
||||||
CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
|
OPT = -O3
|
||||||
|
CFLAGS = -I. $(OPT) -std=c11 -fPIC
|
||||||
|
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
|
|
||||||
ifdef LLAMA_DEBUG
|
ifdef LLAMA_DEBUG
|
||||||
|
|
|
@ -14,6 +14,7 @@ static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
|
||||||
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
||||||
{"q3_K", LLAMA_FTYPE_MOSTLY_Q3_K},
|
{"q3_K", LLAMA_FTYPE_MOSTLY_Q3_K},
|
||||||
{"q4_K", LLAMA_FTYPE_MOSTLY_Q4_K},
|
{"q4_K", LLAMA_FTYPE_MOSTLY_Q4_K},
|
||||||
|
{"q5_K", LLAMA_FTYPE_MOSTLY_Q5_K},
|
||||||
{"q6_K", LLAMA_FTYPE_MOSTLY_Q6_K},
|
{"q6_K", LLAMA_FTYPE_MOSTLY_Q6_K},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
94
ggml-cuda.cu
94
ggml-cuda.cu
|
@ -105,6 +105,15 @@ typedef struct {
|
||||||
} block_q4_K;
|
} block_q4_K;
|
||||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
half d; // super-block scale for quantized scales
|
||||||
|
half dmin; // super-block scale for quantized mins
|
||||||
|
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
||||||
|
uint8_t qh[QK_K/8]; // quants, high bit
|
||||||
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||||
|
} block_q5_K;
|
||||||
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||||
|
@ -355,6 +364,76 @@ static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
||||||
|
const block_q5_K * x = (const block_q5_K *) vx;
|
||||||
|
|
||||||
|
const int i = blockIdx.x;
|
||||||
|
|
||||||
|
// assume 64 threads - this is very slightly better than the one below
|
||||||
|
const int tid = threadIdx.x;
|
||||||
|
const int il = tid/16; // il is in 0...3
|
||||||
|
const int ir = tid%16; // ir is in 0...15
|
||||||
|
const int is = 2*il; // is is in 0...6
|
||||||
|
|
||||||
|
float * y = yy + i*QK_K + 64*il + 2*ir;
|
||||||
|
|
||||||
|
const float dall = x[i].d;
|
||||||
|
const float dmin = x[i].dmin;
|
||||||
|
|
||||||
|
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
||||||
|
const uint8_t * qh = x[i].qh + 2*ir;
|
||||||
|
|
||||||
|
uint8_t sc, m;
|
||||||
|
get_scale_min_k4(is + 0, x[i].scales, sc, m);
|
||||||
|
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||||
|
get_scale_min_k4(is + 1, x[i].scales, sc, m);
|
||||||
|
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||||
|
|
||||||
|
uint8_t hm = 1 << (2*il);
|
||||||
|
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
|
||||||
|
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
|
||||||
|
hm <<= 1;
|
||||||
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||||
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
|
||||||
|
|
||||||
|
const block_q5_K * x = (const block_q5_K *) vx;
|
||||||
|
|
||||||
|
// iqs is in 0...248 in steps of 8 =>
|
||||||
|
const int j = iqs / 64; // j is in 0...3
|
||||||
|
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
|
||||||
|
const int is = 2*j; // is is in 0...6 in steps of 2
|
||||||
|
|
||||||
|
const float * y = yy + 64*j + ir;
|
||||||
|
const uint8_t * ql = x[ib].qs + 32*j + ir;
|
||||||
|
const uint8_t * qh = x[ib].qh + ir;
|
||||||
|
|
||||||
|
const float dall = x[ib].d;
|
||||||
|
const float dmin = x[ib].dmin;
|
||||||
|
|
||||||
|
uint8_t sc, m;
|
||||||
|
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
|
||||||
|
const float d1 = dall * sc;
|
||||||
|
const float m1 = dmin * m;
|
||||||
|
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
|
||||||
|
const float d2 = dall * sc;
|
||||||
|
const float m2 = dmin * m;
|
||||||
|
|
||||||
|
uint8_t hm = 1 << is;
|
||||||
|
float sum = 0;
|
||||||
|
for (int k = 0; k < 4; ++k) {
|
||||||
|
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
|
||||||
|
}
|
||||||
|
hm <<= 1;
|
||||||
|
for (int k = 0; k < 4; ++k) {
|
||||||
|
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
|
||||||
|
}
|
||||||
|
result = sum;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
||||||
const block_q6_K * x = (const block_q6_K *) vx;
|
const block_q6_K * x = (const block_q6_K *) vx;
|
||||||
|
|
||||||
|
@ -556,6 +635,11 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu
|
||||||
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
|
}
|
||||||
|
|
||||||
static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
|
@ -613,6 +697,12 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
|
||||||
dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
|
const dim3 block_dims(32, 2, 1);
|
||||||
|
dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const dim3 block_dims(32, 2, 1);
|
const dim3 block_dims(32, 2, 1);
|
||||||
|
@ -648,6 +738,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||||
return dequantize_row_q3_K_cuda;
|
return dequantize_row_q3_K_cuda;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
return dequantize_row_q4_K_cuda;
|
return dequantize_row_q4_K_cuda;
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
return dequantize_row_q5_K_cuda;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
return dequantize_row_q6_K_cuda;
|
return dequantize_row_q6_K_cuda;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
@ -673,6 +765,8 @@ static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_t
|
||||||
return dequantize_mul_mat_vec_q3_K_cuda;
|
return dequantize_mul_mat_vec_q3_K_cuda;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
return dequantize_mul_mat_vec_q4_K_cuda;
|
return dequantize_mul_mat_vec_q4_K_cuda;
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
return dequantize_mul_mat_vec_q5_K_cuda;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
return dequantize_mul_mat_vec_q6_K_cuda;
|
return dequantize_mul_mat_vec_q6_K_cuda;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
|
35
ggml.c
35
ggml.c
|
@ -1582,6 +1582,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
||||||
.vec_dot_q = ggml_vec_dot_q4_K_q8_K,
|
.vec_dot_q = ggml_vec_dot_q4_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
},
|
},
|
||||||
|
[GGML_TYPE_Q5_K] = {
|
||||||
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K,
|
||||||
|
.quantize_row_q = quantize_row_q5_K,
|
||||||
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference,
|
||||||
|
.quantize_row_q_dot = quantize_row_q8_K,
|
||||||
|
.vec_dot_q = ggml_vec_dot_q5_K_q8_K,
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
},
|
||||||
[GGML_TYPE_Q6_K] = {
|
[GGML_TYPE_Q6_K] = {
|
||||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K,
|
||||||
.quantize_row_q = quantize_row_q6_K,
|
.quantize_row_q = quantize_row_q6_K,
|
||||||
|
@ -3471,13 +3479,14 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q8_1] = QK8_1,
|
[GGML_TYPE_Q8_1] = QK8_1,
|
||||||
[GGML_TYPE_Q3_K] = QK_K,
|
[GGML_TYPE_Q3_K] = QK_K,
|
||||||
[GGML_TYPE_Q4_K] = QK_K,
|
[GGML_TYPE_Q4_K] = QK_K,
|
||||||
|
[GGML_TYPE_Q5_K] = QK_K,
|
||||||
[GGML_TYPE_Q6_K] = QK_K,
|
[GGML_TYPE_Q6_K] = QK_K,
|
||||||
[GGML_TYPE_Q8_K] = QK_K,
|
[GGML_TYPE_Q8_K] = QK_K,
|
||||||
[GGML_TYPE_I8] = 1,
|
[GGML_TYPE_I8] = 1,
|
||||||
[GGML_TYPE_I16] = 1,
|
[GGML_TYPE_I16] = 1,
|
||||||
[GGML_TYPE_I32] = 1,
|
[GGML_TYPE_I32] = 1,
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 17, "GGML_BLCK_SIZE is outdated");
|
static_assert(GGML_TYPE_COUNT == 18, "GGML_BLCK_SIZE is outdated");
|
||||||
|
|
||||||
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_F32] = sizeof(float),
|
[GGML_TYPE_F32] = sizeof(float),
|
||||||
|
@ -3489,13 +3498,14 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
||||||
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
||||||
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
|
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
|
||||||
|
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
|
||||||
[GGML_TYPE_Q6_K] = sizeof(block_q6_K),
|
[GGML_TYPE_Q6_K] = sizeof(block_q6_K),
|
||||||
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
|
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
|
||||||
[GGML_TYPE_I8] = sizeof(int8_t),
|
[GGML_TYPE_I8] = sizeof(int8_t),
|
||||||
[GGML_TYPE_I16] = sizeof(int16_t),
|
[GGML_TYPE_I16] = sizeof(int16_t),
|
||||||
[GGML_TYPE_I32] = sizeof(int32_t),
|
[GGML_TYPE_I32] = sizeof(int32_t),
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 17, "GGML_TYPE_SIZE is outdated");
|
static_assert(GGML_TYPE_COUNT == 18, "GGML_TYPE_SIZE is outdated");
|
||||||
|
|
||||||
|
|
||||||
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||||
|
@ -3509,13 +3519,14 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q8_1] = "q8_1",
|
[GGML_TYPE_Q8_1] = "q8_1",
|
||||||
[GGML_TYPE_Q3_K] = "q3_K",
|
[GGML_TYPE_Q3_K] = "q3_K",
|
||||||
[GGML_TYPE_Q4_K] = "q4_K",
|
[GGML_TYPE_Q4_K] = "q4_K",
|
||||||
|
[GGML_TYPE_Q5_K] = "q5_K",
|
||||||
[GGML_TYPE_Q6_K] = "q6_K",
|
[GGML_TYPE_Q6_K] = "q6_K",
|
||||||
[GGML_TYPE_Q8_K] = "q8_K",
|
[GGML_TYPE_Q8_K] = "q8_K",
|
||||||
[GGML_TYPE_I8] = "i8",
|
[GGML_TYPE_I8] = "i8",
|
||||||
[GGML_TYPE_I16] = "i16",
|
[GGML_TYPE_I16] = "i16",
|
||||||
[GGML_TYPE_I32] = "i32",
|
[GGML_TYPE_I32] = "i32",
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 17, "GGML_TYPE_NAME is outdated");
|
static_assert(GGML_TYPE_COUNT == 18, "GGML_TYPE_NAME is outdated");
|
||||||
|
|
||||||
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_F32] = false,
|
[GGML_TYPE_F32] = false,
|
||||||
|
@ -3527,13 +3538,14 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q8_0] = true,
|
[GGML_TYPE_Q8_0] = true,
|
||||||
[GGML_TYPE_Q8_1] = true,
|
[GGML_TYPE_Q8_1] = true,
|
||||||
[GGML_TYPE_Q4_K] = true,
|
[GGML_TYPE_Q4_K] = true,
|
||||||
|
[GGML_TYPE_Q5_K] = true,
|
||||||
[GGML_TYPE_Q6_K] = true,
|
[GGML_TYPE_Q6_K] = true,
|
||||||
[GGML_TYPE_Q8_K] = true,
|
[GGML_TYPE_Q8_K] = true,
|
||||||
[GGML_TYPE_I8] = false,
|
[GGML_TYPE_I8] = false,
|
||||||
[GGML_TYPE_I16] = false,
|
[GGML_TYPE_I16] = false,
|
||||||
[GGML_TYPE_I32] = false,
|
[GGML_TYPE_I32] = false,
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 17, "GGML_IS_QUANTIZED is outdated");
|
static_assert(GGML_TYPE_COUNT == 18, "GGML_IS_QUANTIZED is outdated");
|
||||||
|
|
||||||
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"NONE",
|
"NONE",
|
||||||
|
@ -3842,6 +3854,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||||
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
||||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||||
|
@ -7628,6 +7641,7 @@ static void ggml_compute_forward_add(
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
||||||
|
@ -7934,6 +7948,7 @@ static void ggml_compute_forward_add1(
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
||||||
|
@ -8059,6 +8074,7 @@ static void ggml_compute_forward_acc(
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
|
@ -10155,6 +10171,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
|
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
|
||||||
|
@ -10341,6 +10358,7 @@ static void ggml_compute_forward_set(
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
|
@ -10509,6 +10527,7 @@ static void ggml_compute_forward_get_rows(
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
||||||
|
@ -11058,6 +11077,7 @@ static void ggml_compute_forward_alibi(
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_Q8_K:
|
case GGML_TYPE_Q8_K:
|
||||||
case GGML_TYPE_I8:
|
case GGML_TYPE_I8:
|
||||||
|
@ -11133,6 +11153,7 @@ static void ggml_compute_forward_clamp(
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_Q8_K:
|
case GGML_TYPE_Q8_K:
|
||||||
case GGML_TYPE_I8:
|
case GGML_TYPE_I8:
|
||||||
|
@ -16150,6 +16171,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
block_q4_K * block = (block_q4_K*)dst + start / QK_K;
|
block_q4_K * block = (block_q4_K*)dst + start / QK_K;
|
||||||
result = ggml_quantize_q4_K(src + start, block, n, n, hist);
|
result = ggml_quantize_q4_K(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
|
block_q5_K * block = (block_q5_K*)dst + start / QK_K;
|
||||||
|
result = ggml_quantize_q5_K(src + start, block, n, n, hist);
|
||||||
|
} break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
|
|
9
ggml.h
9
ggml.h
|
@ -244,9 +244,9 @@ extern "C" {
|
||||||
// k-quantizations
|
// k-quantizations
|
||||||
GGML_TYPE_Q3_K = 10,
|
GGML_TYPE_Q3_K = 10,
|
||||||
GGML_TYPE_Q4_K = 11,
|
GGML_TYPE_Q4_K = 11,
|
||||||
//GGML_TYPE_Q5_K = 12,
|
GGML_TYPE_Q5_K = 12,
|
||||||
GGML_TYPE_Q6_K = 12,
|
GGML_TYPE_Q6_K = 13,
|
||||||
GGML_TYPE_Q8_K = 13,
|
GGML_TYPE_Q8_K = 14,
|
||||||
GGML_TYPE_I8,
|
GGML_TYPE_I8,
|
||||||
GGML_TYPE_I16,
|
GGML_TYPE_I16,
|
||||||
GGML_TYPE_I32,
|
GGML_TYPE_I32,
|
||||||
|
@ -272,7 +272,8 @@ extern "C" {
|
||||||
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q3_K = 10, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q3_K = 10, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q4_K = 11, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q4_K = 11, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q6_K = 12, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_K = 12, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q6_K = 13, // except 1d tensors
|
||||||
};
|
};
|
||||||
|
|
||||||
// available tensor operations:
|
// available tensor operations:
|
||||||
|
|
299
k_quants.c
299
k_quants.c
|
@ -499,6 +499,135 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n
|
||||||
return (n/QK_K*sizeof(block_q4_K));
|
return (n/QK_K*sizeof(block_q4_K));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ====================== 5-bit (de)-quantization
|
||||||
|
|
||||||
|
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
|
||||||
|
assert(k % QK_K == 0);
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
uint8_t L[QK_K];
|
||||||
|
float mins[QK_K/32];
|
||||||
|
float scales[QK_K/32];
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
|
||||||
|
float max_scale = 0; // as we are deducting the min, scales are always positive
|
||||||
|
float max_min = 0;
|
||||||
|
for (int j = 0; j < QK_K/32; ++j) {
|
||||||
|
scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
|
||||||
|
float scale = scales[j];
|
||||||
|
if (scale > max_scale) {
|
||||||
|
max_scale = scale;
|
||||||
|
}
|
||||||
|
float min = mins[j];
|
||||||
|
if (min > max_min) {
|
||||||
|
max_min = min;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
|
||||||
|
float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
|
||||||
|
for (int j = 0; j < QK_K/32; ++j) {
|
||||||
|
uint8_t ls = nearest_int(inv_scale*scales[j]);
|
||||||
|
uint8_t lm = nearest_int(inv_min*mins[j]);
|
||||||
|
ls = MIN(63, ls);
|
||||||
|
lm = MIN(63, lm);
|
||||||
|
if (j < 4) {
|
||||||
|
y[i].scales[j] = ls;
|
||||||
|
y[i].scales[j+4] = lm;
|
||||||
|
} else {
|
||||||
|
y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
|
||||||
|
y[i].scales[j-4] |= ((ls >> 4) << 6);
|
||||||
|
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
y[i].d = ggml_fp32_to_fp16(max_scale/63.f);
|
||||||
|
y[i].dmin = ggml_fp32_to_fp16(max_min/63.f);
|
||||||
|
|
||||||
|
uint8_t sc, m;
|
||||||
|
for (int j = 0; j < QK_K/32; ++j) {
|
||||||
|
get_scale_min_k4(j, y[i].scales, &sc, &m);
|
||||||
|
const float d = ggml_fp16_to_fp32(y[i].d) * sc;
|
||||||
|
if (!d) continue;
|
||||||
|
const float dm = ggml_fp16_to_fp32(y[i].dmin) * m;
|
||||||
|
for (int ii = 0; ii < 32; ++ii) {
|
||||||
|
int l = nearest_int((x[32*j + ii] + dm)/d);
|
||||||
|
l = MAX(0, MIN(31, l));
|
||||||
|
L[32*j + ii] = l;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t * restrict qh = y[i].qh;
|
||||||
|
uint8_t * restrict ql = y[i].qs;
|
||||||
|
memset(qh, 0, QK_K/8);
|
||||||
|
|
||||||
|
uint8_t m1 = 1, m2 = 2;
|
||||||
|
for (int n = 0; n < QK_K; n += 64) {
|
||||||
|
for (int j = 0; j < 32; ++j) {
|
||||||
|
int l1 = L[n + j];
|
||||||
|
if (l1 > 15) {
|
||||||
|
l1 -= 16; qh[j] |= m1;
|
||||||
|
}
|
||||||
|
int l2 = L[n + j + 32];
|
||||||
|
if (l2 > 15) {
|
||||||
|
l2 -= 16; qh[j] |= m2;
|
||||||
|
}
|
||||||
|
ql[j] = l1 | (l2 << 4);
|
||||||
|
}
|
||||||
|
m1 <<= 2; m2 <<= 2;
|
||||||
|
ql += 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
x += QK_K;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
|
||||||
|
assert(k % QK_K == 0);
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
|
||||||
|
const float d = ggml_fp16_to_fp32(x[i].d);
|
||||||
|
const float min = ggml_fp16_to_fp32(x[i].dmin);
|
||||||
|
|
||||||
|
const uint8_t * ql = x[i].qs;
|
||||||
|
const uint8_t * qh = x[i].qh;
|
||||||
|
|
||||||
|
int is = 0;
|
||||||
|
uint8_t sc, m;
|
||||||
|
uint8_t u1 = 1, u2 = 2;
|
||||||
|
for (int j = 0; j < QK_K; j += 64) {
|
||||||
|
get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
|
||||||
|
const float d1 = d * sc; const float m1 = min * m;
|
||||||
|
get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
|
||||||
|
const float d2 = d * sc; const float m2 = min * m;
|
||||||
|
for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
|
||||||
|
for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
|
||||||
|
ql += 32; is += 2;
|
||||||
|
u1 <<= 2; u2 <<= 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
assert(k % QK_K == 0);
|
||||||
|
block_q5_K * restrict y = vy;
|
||||||
|
quantize_row_q5_K_reference(x, y, k);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||||
|
assert(k % QK_K == 0);
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
(void)hist;
|
||||||
|
for (int j = 0; j < nb; j += k) {
|
||||||
|
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
||||||
|
quantize_row_q5_K_reference(src + j, y, k);
|
||||||
|
}
|
||||||
|
return (n/QK_K*sizeof(block_q5_K));
|
||||||
|
}
|
||||||
|
|
||||||
// ====================== 6-bit (de)-quantization
|
// ====================== 6-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
||||||
|
@ -1058,6 +1187,176 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
|
assert(n % QK_K == 0);
|
||||||
|
|
||||||
|
const block_q5_K * restrict x = vx;
|
||||||
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
|
#ifdef z__ARM_NEON
|
||||||
|
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
|
||||||
|
#elif defined __AVX2__
|
||||||
|
|
||||||
|
static const uint32_t kmask1 = 0x3f3f3f3f;
|
||||||
|
static const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
|
static const uint32_t kmask3 = 0x03030303;
|
||||||
|
|
||||||
|
const __m256i m4 = _mm256_set1_epi8(0xF);
|
||||||
|
const __m128i mzero = _mm_setzero_si128();
|
||||||
|
const __m256i mone = _mm256_set1_epi8(1);
|
||||||
|
|
||||||
|
__m256 acc = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
uint32_t utmp[4];
|
||||||
|
|
||||||
|
float summs = 0.f;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||||
|
const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
|
||||||
|
|
||||||
|
const uint8_t * restrict q5 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
memcpy(utmp, x[i].scales, 12);
|
||||||
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||||
|
const uint32_t uaux = utmp[1] & kmask1;
|
||||||
|
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||||
|
utmp[2] = uaux;
|
||||||
|
utmp[0] &= kmask1;
|
||||||
|
|
||||||
|
const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
|
||||||
|
|
||||||
|
const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
|
||||||
|
const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
|
||||||
|
const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
|
||||||
|
const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
|
||||||
|
summs += dmin * _mm_extract_epi32(hsum, 0);
|
||||||
|
|
||||||
|
const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
|
||||||
|
const __m256i scales = _mm256_set_m128i(sc128, sc128);
|
||||||
|
|
||||||
|
const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
|
||||||
|
__m256i hmask = mone;
|
||||||
|
|
||||||
|
__m256i sumi = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
int bit = 0;
|
||||||
|
|
||||||
|
for (int j = 0; j < QK_K/64; ++j) {
|
||||||
|
|
||||||
|
const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
|
||||||
|
const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
|
||||||
|
|
||||||
|
const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
|
||||||
|
|
||||||
|
const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
|
||||||
|
const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
|
||||||
|
const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
|
||||||
|
hmask = _mm256_slli_epi16(hmask, 1);
|
||||||
|
|
||||||
|
const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
|
||||||
|
const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
|
||||||
|
const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
|
||||||
|
hmask = _mm256_slli_epi16(hmask, 1);
|
||||||
|
|
||||||
|
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||||
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||||
|
|
||||||
|
__m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
|
||||||
|
__m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
|
||||||
|
|
||||||
|
p16_0 = _mm256_madd_epi16(scale_0, p16_0);
|
||||||
|
p16_1 = _mm256_madd_epi16(scale_1, p16_1);
|
||||||
|
|
||||||
|
sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
__m256 vd = _mm256_set1_ps(d);
|
||||||
|
acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(acc) + summs;
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
static const uint32_t kmask1 = 0x3f3f3f3f;
|
||||||
|
static const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
|
static const uint32_t kmask3 = 0x03030303;
|
||||||
|
|
||||||
|
uint32_t utmp[4];
|
||||||
|
|
||||||
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||||
|
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||||
|
|
||||||
|
int8_t aux8[QK_K];
|
||||||
|
int16_t aux16[8];
|
||||||
|
float sums [8];
|
||||||
|
int32_t aux32[8];
|
||||||
|
memset(sums, 0, 8*sizeof(float));
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const uint8_t * restrict q4 = x[i].qs;
|
||||||
|
const uint8_t * restrict hm = x[i].qh;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
memset(aux32, 0, 8*sizeof(int32_t));
|
||||||
|
int8_t * restrict a = aux8;
|
||||||
|
uint8_t m = 1;
|
||||||
|
for (int j = 0; j < QK_K/64; ++j) {
|
||||||
|
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
||||||
|
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||||
|
a += 32; m <<= 1;
|
||||||
|
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
||||||
|
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
||||||
|
a += 32; m <<= 1;
|
||||||
|
q4 += 32;
|
||||||
|
}
|
||||||
|
memcpy(utmp, x[i].scales, 12);
|
||||||
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||||
|
const uint32_t uaux = utmp[1] & kmask1;
|
||||||
|
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||||
|
utmp[2] = uaux;
|
||||||
|
utmp[0] &= kmask1;
|
||||||
|
|
||||||
|
int sumi = 0;
|
||||||
|
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||||
|
a = aux8;
|
||||||
|
int is = 0;
|
||||||
|
for (int j = 0; j < QK_K/32; ++j) {
|
||||||
|
int32_t scale = scales[is++];
|
||||||
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||||
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||||
|
q8 += 8; a += 8;
|
||||||
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||||
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||||
|
q8 += 8; a += 8;
|
||||||
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||||
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||||
|
q8 += 8; a += 8;
|
||||||
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
||||||
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||||
|
q8 += 8; a += 8;
|
||||||
|
}
|
||||||
|
const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
|
||||||
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||||
|
const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
|
||||||
|
sumf -= dmin * sumi;
|
||||||
|
}
|
||||||
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||||
|
*s = sumf;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
||||||
|
|
|
@ -509,6 +509,7 @@ struct llama_file_loader {
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
break;
|
break;
|
||||||
default: {
|
default: {
|
||||||
|
@ -586,7 +587,7 @@ struct llama_file_saver {
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
break;
|
break;
|
||||||
default: LLAMA_ASSERT(false);
|
default: LLAMA_ASSERT(false);
|
||||||
|
@ -906,6 +907,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K: return "mostly Q3_K";
|
case LLAMA_FTYPE_MOSTLY_Q3_K: return "mostly Q3_K";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_K: return "mostly Q4_K";
|
case LLAMA_FTYPE_MOSTLY_Q4_K: return "mostly Q4_K";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q5_K: return "mostly Q5_K";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
|
case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
|
||||||
default: return "unknown, may not work";
|
default: return "unknown, may not work";
|
||||||
}
|
}
|
||||||
|
@ -2074,6 +2076,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K: quantized_type = GGML_TYPE_Q3_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q3_K: quantized_type = GGML_TYPE_Q3_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_K: quantized_type = GGML_TYPE_Q4_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q4_K: quantized_type = GGML_TYPE_Q4_K; break;
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q5_K: quantized_type = GGML_TYPE_Q5_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
||||||
default: throw format("invalid output file type %d\n", ftype);
|
default: throw format("invalid output file type %d\n", ftype);
|
||||||
};
|
};
|
||||||
|
|
3
llama.h
3
llama.h
|
@ -96,7 +96,8 @@ extern "C" {
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K = 10,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q3_K = 10,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_K = 11,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q4_K = 11,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q6_K = 12,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q5_K = 12,// except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q6_K = 13,// except 1d tensors
|
||||||
};
|
};
|
||||||
|
|
||||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue