Adding Q2_K - just CUDA for now
Token prediction is pretty good - about 15.5 ms on an RTX 4080. Perplexity is about the same as Q4_K.
This commit is contained in:
parent
4faa040c20
commit
b439efb712
8 changed files with 275 additions and 26 deletions
|
@ -12,6 +12,7 @@ static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
|
||||||
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
||||||
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
||||||
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
||||||
|
{"q2_K", LLAMA_FTYPE_MOSTLY_Q2_K},
|
||||||
{"q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M},
|
{"q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M},
|
||||||
{"q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S},
|
{"q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S},
|
||||||
{"q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M},
|
{"q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M},
|
||||||
|
|
76
ggml-cuda.cu
76
ggml-cuda.cu
|
@ -89,6 +89,14 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
|
||||||
|
|
||||||
#define QK_K 256
|
#define QK_K 256
|
||||||
|
|
||||||
|
// Q2_K super-block: QK_K weights stored as 2-bit quants, organised in
// sub-blocks of 16 weights. Each byte of scales[] carries a 4-bit scale in
// its low nibble and a 4-bit min in its high nibble; d and dmin are the
// super-block multipliers applied to those nibbles when dequantizing.
typedef struct {
    uint8_t scales[QK_K/16]; // low nibble: scale, high nibble: min (one byte per 16 weights)
    uint8_t qs[QK_K/4];      // 2-bit quants, four packed per byte
    half    d;               // multiplier for the 4-bit scales
    half    dmin;            // multiplier for the 4-bit mins
} block_q2_K;
// The struct must be free of padding: its size is exactly the sum of its members.
static_assert(sizeof(block_q2_K) == QK_K/16 + QK_K/4 + 2*sizeof(ggml_fp16_t), "wrong q2_K block size/padding");
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t hmask[QK_K/8];
|
uint8_t hmask[QK_K/8];
|
||||||
uint8_t qs[QK_K/4]; // nibbles / quants
|
uint8_t qs[QK_K/4]; // nibbles / quants
|
||||||
|
@ -225,6 +233,59 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int
|
||||||
|
|
||||||
//================================== k-quants
|
//================================== k-quants
|
||||||
|
|
||||||
|
// Expands Q2_K super-blocks into floats.
//
// Layout: one CUDA block per super-block (blockIdx.x selects it). The host
// wrapper dequantize_row_q2_K_cuda launches 64 threads per block; each thread
// loads a single byte of qs (four 2-bit quants) and stores four floats that
// lie 32 elements apart inside one 128-element half of the super-block.
//
//   vx - array of block_q2_K
//   yy - output, QK_K floats per super-block
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

    const int ib   = blockIdx.x;          // super-block handled by this CUDA block
    const int t    = threadIdx.x;
    const int grp  = t/32;                // 0: first 128 outputs, 1: second 128 outputs
    const int lane = t%32;                // byte of qs inside that group
    const int sc0  = 8*grp + lane/16;     // first of the four scale bytes this thread uses

    const block_q2_K * blk = (const block_q2_K *) vx + ib;

    const uint8_t packed = blk->qs[32*grp + lane];
    float * out = yy + ib*QK_K + 128*grp + lane;

    float d_all = blk->d;
    float d_min = blk->dmin;

    // Bit pair j of the packed byte belongs to output lane + 32*j and uses scale byte sc0 + 2*j.
    for (int j = 0; j < 4; ++j) {
        const uint8_t sc = blk->scales[sc0 + 2*j];
        out[32*j] = d_all * (sc & 0xF) * ((packed >> (2*j)) & 3) - d_min * (sc >> 4);
    }

}
|
||||||
|
|
||||||
|
// Partial dot product between one Q2_K super-block and float activations.
//
// Each call accumulates 8 weights of super-block ib: the ones at offsets
// l, l+16, l+32, ..., l+112 inside the 128-weight half selected by iqs.
//
//   vx     - array of block_q2_K
//   ib     - index of the super-block to read
//   iqs    - position inside the super-block: iqs/128 picks the half,
//            (iqs%128)/8 picks l
//   yy     - activations; indexed relative to the start of the super-block
//            (NOTE(review): the caller is not visible here - confirm)
//   result - overwritten with the sum of the 8 products
//
// Weight group g (16 weights each, g = 0..7 within the half) takes its scale
// from the low nibble and its min from the high nibble of the SAME byte s[g].
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {

    const block_q2_K * x = (const block_q2_K *) vx;

    // if n is 0, we want to do the lower 128, else the upper 128,
    // covering y[l+0],  y[l+32], y[l+64], y[l+96] and
    //          y[l+16], y[l+48], y[l+80], y[l+112]
    int n = iqs/128;                // 0 or 1
    int r = iqs - 128*n;            // 0...120 in steps of 8
    int l = r/8;                    // 0...15 in steps of 1

    const float   * y = yy + 128*n + l;
    const uint8_t * q = x[ib].qs + 32*n + l;
    const uint8_t * s = x[ib].scales + 8*n;

    const float dall = x[ib].d;
    const float dmin = x[ib].dmin;

    // q[0] holds the quants for groups 0,2,4,6 and q[16] those for groups 1,3,5,7
    // (bit pairs 0-1, 2-3, 4-5, 6-7 respectively).
    float sum = y[  0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
              + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
              + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
              + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
              + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
              // fixed: the min for this group comes from s[3], not s[1]
              + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
              + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
              + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));

    result = sum;

}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
||||||
|
|
||||||
int r = threadIdx.x/4;
|
int r = threadIdx.x/4;
|
||||||
|
@ -625,6 +686,11 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
|
||||||
dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Host launcher for dequantize_block_q2_K: converts k Q2_K-quantized values
// in vx to floats in y on the given stream. One CUDA block of 64 threads is
// launched per super-block; k / QK_K truncates, so any tail shorter than a
// full super-block is not processed.
static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int threads_per_block = 64;
    const int num_super_blocks  = k / QK_K;
    dequantize_block_q2_K<<<num_super_blocks, threads_per_block, 0, stream>>>(vx, y);
}
|
||||||
|
|
||||||
static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
|
@ -685,6 +751,12 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
|
||||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Host launcher for the Q2_K matrix-vector product: runs
// dequantize_mul_mat_vec_k<32, vec_dot_q2_K> on the given stream.
//
//   vx    - Q2_K-quantized matrix
//   y     - float input vector
//   dst   - float output
//   ncols - columns per row; must be a multiple of QK_K
//   nrows - number of rows; the grid has nrows/2 blocks (integer division)
//
// NOTE(review): the block shape is 32 x 2, presumably two rows per block to
// match the nrows/2 grid - confirm against dequantize_mul_mat_vec_k.
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int  ny = 2;
    const dim3 block_dims(32, ny, 1);
    const int  num_blocks = nrows/ny;
    dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<num_blocks, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const dim3 block_dims(32, 2, 1);
|
const dim3 block_dims(32, 2, 1);
|
||||||
|
@ -734,6 +806,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||||
return dequantize_row_q5_1_cuda;
|
return dequantize_row_q5_1_cuda;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
return dequantize_row_q8_0_cuda;
|
return dequantize_row_q8_0_cuda;
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
return dequantize_row_q2_K_cuda;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
return dequantize_row_q3_K_cuda;
|
return dequantize_row_q3_K_cuda;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
@ -761,6 +835,8 @@ static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_t
|
||||||
return dequantize_mul_mat_vec_q5_1_cuda;
|
return dequantize_mul_mat_vec_q5_1_cuda;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
return dequantize_mul_mat_vec_q8_0_cuda;
|
return dequantize_mul_mat_vec_q8_0_cuda;
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
return dequantize_mul_mat_vec_q2_K_cuda;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
return dequantize_mul_mat_vec_q3_K_cuda;
|
return dequantize_mul_mat_vec_q3_K_cuda;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
|
35
ggml.c
35
ggml.c
|
@ -1566,6 +1566,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
||||||
.vec_dot_q = NULL, // TODO
|
.vec_dot_q = NULL, // TODO
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
},
|
},
|
||||||
|
[GGML_TYPE_Q2_K] = {
|
||||||
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K,
|
||||||
|
.quantize_row_q = quantize_row_q2_K,
|
||||||
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference,
|
||||||
|
.quantize_row_q_dot = quantize_row_q8_K,
|
||||||
|
.vec_dot_q = NULL, //ggml_vec_dot_q2_K_q8_K,
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
},
|
||||||
[GGML_TYPE_Q3_K] = {
|
[GGML_TYPE_Q3_K] = {
|
||||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
|
||||||
.quantize_row_q = quantize_row_q3_K,
|
.quantize_row_q = quantize_row_q3_K,
|
||||||
|
@ -3477,6 +3485,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = QK5_1,
|
[GGML_TYPE_Q5_1] = QK5_1,
|
||||||
[GGML_TYPE_Q8_0] = QK8_0,
|
[GGML_TYPE_Q8_0] = QK8_0,
|
||||||
[GGML_TYPE_Q8_1] = QK8_1,
|
[GGML_TYPE_Q8_1] = QK8_1,
|
||||||
|
[GGML_TYPE_Q2_K] = QK_K,
|
||||||
[GGML_TYPE_Q3_K] = QK_K,
|
[GGML_TYPE_Q3_K] = QK_K,
|
||||||
[GGML_TYPE_Q4_K] = QK_K,
|
[GGML_TYPE_Q4_K] = QK_K,
|
||||||
[GGML_TYPE_Q5_K] = QK_K,
|
[GGML_TYPE_Q5_K] = QK_K,
|
||||||
|
@ -3486,7 +3495,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I16] = 1,
|
[GGML_TYPE_I16] = 1,
|
||||||
[GGML_TYPE_I32] = 1,
|
[GGML_TYPE_I32] = 1,
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 18, "GGML_BLCK_SIZE is outdated");
|
static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
|
||||||
|
|
||||||
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_F32] = sizeof(float),
|
[GGML_TYPE_F32] = sizeof(float),
|
||||||
|
@ -3497,6 +3506,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
||||||
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
||||||
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
||||||
|
[GGML_TYPE_Q2_K] = sizeof(block_q2_K),
|
||||||
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
|
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
|
||||||
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
|
[GGML_TYPE_Q4_K] = sizeof(block_q4_K),
|
||||||
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
|
[GGML_TYPE_Q5_K] = sizeof(block_q5_K),
|
||||||
|
@ -3506,7 +3516,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I16] = sizeof(int16_t),
|
[GGML_TYPE_I16] = sizeof(int16_t),
|
||||||
[GGML_TYPE_I32] = sizeof(int32_t),
|
[GGML_TYPE_I32] = sizeof(int32_t),
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 18, "GGML_TYPE_SIZE is outdated");
|
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
|
||||||
|
|
||||||
|
|
||||||
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||||
|
@ -3518,6 +3528,7 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = "q5_1",
|
[GGML_TYPE_Q5_1] = "q5_1",
|
||||||
[GGML_TYPE_Q8_0] = "q8_0",
|
[GGML_TYPE_Q8_0] = "q8_0",
|
||||||
[GGML_TYPE_Q8_1] = "q8_1",
|
[GGML_TYPE_Q8_1] = "q8_1",
|
||||||
|
[GGML_TYPE_Q2_K] = "q2_K",
|
||||||
[GGML_TYPE_Q3_K] = "q3_K",
|
[GGML_TYPE_Q3_K] = "q3_K",
|
||||||
[GGML_TYPE_Q4_K] = "q4_K",
|
[GGML_TYPE_Q4_K] = "q4_K",
|
||||||
[GGML_TYPE_Q5_K] = "q5_K",
|
[GGML_TYPE_Q5_K] = "q5_K",
|
||||||
|
@ -3527,7 +3538,7 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I16] = "i16",
|
[GGML_TYPE_I16] = "i16",
|
||||||
[GGML_TYPE_I32] = "i32",
|
[GGML_TYPE_I32] = "i32",
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 18, "GGML_TYPE_NAME is outdated");
|
static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
|
||||||
|
|
||||||
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_F32] = false,
|
[GGML_TYPE_F32] = false,
|
||||||
|
@ -3538,6 +3549,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = true,
|
[GGML_TYPE_Q5_1] = true,
|
||||||
[GGML_TYPE_Q8_0] = true,
|
[GGML_TYPE_Q8_0] = true,
|
||||||
[GGML_TYPE_Q8_1] = true,
|
[GGML_TYPE_Q8_1] = true,
|
||||||
|
[GGML_TYPE_Q2_K] = true,
|
||||||
[GGML_TYPE_Q3_K] = true,
|
[GGML_TYPE_Q3_K] = true,
|
||||||
[GGML_TYPE_Q4_K] = true,
|
[GGML_TYPE_Q4_K] = true,
|
||||||
[GGML_TYPE_Q5_K] = true,
|
[GGML_TYPE_Q5_K] = true,
|
||||||
|
@ -3547,7 +3559,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I16] = false,
|
[GGML_TYPE_I16] = false,
|
||||||
[GGML_TYPE_I32] = false,
|
[GGML_TYPE_I32] = false,
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 18, "GGML_IS_QUANTIZED is outdated");
|
static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
|
||||||
|
|
||||||
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"NONE",
|
"NONE",
|
||||||
|
@ -3854,6 +3866,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||||
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
|
case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
|
||||||
|
@ -7641,6 +7654,7 @@ static void ggml_compute_forward_add(
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -7948,6 +7962,7 @@ static void ggml_compute_forward_add1(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -8074,6 +8089,7 @@ static void ggml_compute_forward_acc(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -10171,6 +10187,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -10358,6 +10375,7 @@ static void ggml_compute_forward_set(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -10527,6 +10545,7 @@ static void ggml_compute_forward_get_rows(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -11077,6 +11096,7 @@ static void ggml_compute_forward_alibi(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -11153,6 +11173,7 @@ static void ggml_compute_forward_clamp(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -16161,6 +16182,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
||||||
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
{
|
||||||
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
|
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
|
||||||
|
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
|
||||||
|
} break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
|
|
20
ggml.h
20
ggml.h
|
@ -242,11 +242,12 @@ extern "C" {
|
||||||
GGML_TYPE_Q8_0 = 8,
|
GGML_TYPE_Q8_0 = 8,
|
||||||
GGML_TYPE_Q8_1 = 9,
|
GGML_TYPE_Q8_1 = 9,
|
||||||
// k-quantizations
|
// k-quantizations
|
||||||
GGML_TYPE_Q3_K = 10,
|
GGML_TYPE_Q2_K = 10,
|
||||||
GGML_TYPE_Q4_K = 11,
|
GGML_TYPE_Q3_K = 11,
|
||||||
GGML_TYPE_Q5_K = 12,
|
GGML_TYPE_Q4_K = 12,
|
||||||
GGML_TYPE_Q6_K = 13,
|
GGML_TYPE_Q5_K = 13,
|
||||||
GGML_TYPE_Q8_K = 14,
|
GGML_TYPE_Q6_K = 14,
|
||||||
|
GGML_TYPE_Q8_K = 15,
|
||||||
GGML_TYPE_I8,
|
GGML_TYPE_I8,
|
||||||
GGML_TYPE_I16,
|
GGML_TYPE_I16,
|
||||||
GGML_TYPE_I32,
|
GGML_TYPE_I32,
|
||||||
|
@ -270,10 +271,11 @@ extern "C" {
|
||||||
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q3_K = 10, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q4_K = 11, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q5_K = 12, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q6_K = 13, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
||||||
};
|
};
|
||||||
|
|
||||||
// available tensor operations:
|
// available tensor operations:
|
||||||
|
|
121
k_quants.c
121
k_quants.c
|
@ -270,6 +270,127 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//========================- 2-bit (de)-quantization
|
||||||
|
|
||||||
|
// Reference (scalar) Q2_K quantizer.
//
// Converts k floats from x into k/QK_K super-blocks written to y; k must be a
// multiple of QK_K. For every super-block:
//   1. make_qkx1_quants (defined elsewhere) yields a scale and a min for each
//      16-weight sub-block;
//   2. scales and mins are each re-expressed as 4-bit integers relative to
//      their largest value in the super-block (stored in d / dmin as fp16);
//   3. the weights are re-quantized to 2 bits with those rounded values;
//   4. the 2-bit quants are packed four per byte.
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
    assert(k % QK_K == 0);
    const int n_super = k / QK_K;
    const int n_sub   = QK_K/16;

    uint8_t L[QK_K];               // unpacked quants, one byte per weight
    float   sub_min[QK_K/16];
    float   sub_scale[QK_K/16];

    const float q4scale = 15.f;    // largest value representable in 4 bits

    for (int i = 0; i < n_super; ++i) {

        block_q2_K  * blk = y + i;
        const float * xb  = x + i*QK_K;

        // Per-sub-block scale/min plus their maxima over the super-block.
        // As the min is deducted, scales are always positive.
        float max_scale = 0;
        float max_min   = 0;
        for (int j = 0; j < n_sub; ++j) {
            sub_scale[j] = make_qkx1_quants(16, 3, xb + 16*j, L + 16*j, &sub_min[j], 5);
            if (sub_scale[j] > max_scale) max_scale = sub_scale[j];
            if (sub_min[j]   > max_min)   max_min   = sub_min[j];
        }

        // 4-bit scales go into the low nibbles.
        if (max_scale > 0) {
            const float inv = q4scale/max_scale;
            for (int j = 0; j < n_sub; ++j) {
                const int qv = nearest_int(inv*sub_scale[j]);
                blk->scales[j] = qv;
            }
            blk->d = ggml_fp32_to_fp16(max_scale/q4scale);
        } else {
            for (int j = 0; j < n_sub; ++j) {
                blk->scales[j] = 0;
            }
            blk->d = ggml_fp32_to_fp16(0.f);
        }

        // 4-bit mins are OR-ed into the high nibbles.
        if (max_min > 0) {
            const float inv = q4scale/max_min;
            for (int j = 0; j < n_sub; ++j) {
                const int qv = nearest_int(inv*sub_min[j]);
                blk->scales[j] |= (qv << 4);
            }
            blk->dmin = ggml_fp32_to_fp16(max_min/q4scale);
        } else {
            blk->dmin = ggml_fp32_to_fp16(0.f);
        }

        // Re-quantize with the values that will actually be stored (fp16 d/dmin, 4-bit nibbles).
        const float d_super    = ggml_fp16_to_fp32(blk->d);
        const float dmin_super = ggml_fp16_to_fp32(blk->dmin);
        for (int j = 0; j < n_sub; ++j) {
            const float d = d_super * (blk->scales[j] & 0xF);
            if (d == 0) {
                continue;          // zero scale: keep the quants produced in the first pass
            }
            const float dm = dmin_super * (blk->scales[j] >> 4);
            for (int ii = 0; ii < 16; ++ii) {
                int qv = nearest_int((xb[16*j + ii] + dm)/d);
                if (qv < 0) {
                    qv = 0;
                } else if (qv > 3) {
                    qv = 3;
                }
                L[16*j + ii] = qv;
            }
        }

        // Pack: within each run of 128 weights, byte l holds weights l, l+32, l+64, l+96.
        for (int base = 0; base < QK_K; base += 128) {
            const uint8_t * src = L + base;
            uint8_t       * dst = blk->qs + base/4;
            for (int l = 0; l < 32; ++l) {
                dst[l] = src[l] | (src[l + 32] << 2) | (src[l + 64] << 4) | (src[l + 96] << 6);
            }
        }

    }
}
|
||||||
|
|
||||||
|
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
|
||||||
|
assert(k % QK_K == 0);
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
|
||||||
|
const float d = ggml_fp16_to_fp32(x[i].d);
|
||||||
|
const float min = ggml_fp16_to_fp32(x[i].dmin);
|
||||||
|
|
||||||
|
const uint8_t * q = x[i].qs;
|
||||||
|
|
||||||
|
int is = 0;
|
||||||
|
float dl, ml;
|
||||||
|
for (int n = 0; n < QK_K; n += 128) {
|
||||||
|
int shift = 0;
|
||||||
|
for (int j = 0; j < 4; ++j) {
|
||||||
|
|
||||||
|
uint8_t sc = x[i].scales[is++];
|
||||||
|
dl = d * (sc & 0xF); ml = min * (sc >> 4);
|
||||||
|
for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
|
||||||
|
|
||||||
|
sc = x[i].scales[is++];
|
||||||
|
dl = d * (sc & 0xF); ml = min * (sc >> 4);
|
||||||
|
for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
|
||||||
|
|
||||||
|
shift += 2;
|
||||||
|
}
|
||||||
|
q += 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
quantize_row_q2_K_reference(x, vy, k);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
||||||
|
(void)hist;
|
||||||
|
|
||||||
|
for (int j = 0; j < nb; j += k) {
|
||||||
|
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
||||||
|
quantize_row_q2_K_reference(src + j, y, k);
|
||||||
|
}
|
||||||
|
return (n/QK_K*sizeof(block_q2_K));
|
||||||
|
}
|
||||||
|
|
||||||
//========================= 3-bit (de)-quantization
|
//========================= 3-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
||||||
|
|
21
k_quants.h
21
k_quants.h
|
@ -13,6 +13,18 @@
|
||||||
// Super-block quantization structures
|
// Super-block quantization structures
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// 2-bit quantization
|
||||||
|
// weight is represented as x = a * q + b
|
||||||
|
// 16 blocks of 16 elements each
|
||||||
|
// Effectively 2.625 bits per weight (84 bytes per 256 weights)
|
||||||
|
typedef struct {
|
||||||
|
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||||
|
uint8_t qs[QK_K/4]; // quants
|
||||||
|
ggml_fp16_t d; // super-block scale for quantized scales
|
||||||
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||||
|
} block_q2_K;
|
||||||
|
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||||
|
|
||||||
// 3-bit quantization
|
// 3-bit quantization
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
// 16 blocks of 16 elements each
|
// 16 blocks of 16 elements each
|
||||||
|
@ -32,7 +44,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K /
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_fp16_t d; // super-block scale for quantized scales
|
ggml_fp16_t d; // super-block scale for quantized scales
|
||||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||||
} block_q4_K;
|
} block_q4_K;
|
||||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
||||||
|
@ -44,7 +56,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2,
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_fp16_t d; // super-block scale for quantized scales
|
ggml_fp16_t d; // super-block scale for quantized scales
|
||||||
ggml_fp16_t dmin; // super-block scale for quantized mins
|
ggml_fp16_t dmin; // super-block scale for quantized mins
|
||||||
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
||||||
uint8_t qh[QK_K/8]; // quants, high bit
|
uint8_t qh[QK_K/8]; // quants, high bit
|
||||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||||
} block_q5_K;
|
} block_q5_K;
|
||||||
|
@ -72,12 +84,14 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
|
||||||
|
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
|
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
||||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||||
|
|
||||||
|
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
@ -85,6 +99,7 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
|
||||||
// Dequantization
|
// Dequantization
|
||||||
|
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||||
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
||||||
|
@ -92,12 +107,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
|
||||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||||
|
|
||||||
// Dot product
|
// Dot product
|
||||||
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
|
||||||
// Quantization with histogram collection
|
// Quantization with histogram collection
|
||||||
|
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
|
10
llama.cpp
10
llama.cpp
|
@ -507,6 +507,7 @@ struct llama_file_loader {
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -586,6 +587,7 @@ struct llama_file_saver {
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -907,6 +909,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
||||||
// K-quants
|
// K-quants
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
|
||||||
|
@ -2081,6 +2084,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
||||||
// K-quants
|
// K-quants
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
||||||
|
@ -2158,7 +2162,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
new_type = quantized_type;
|
new_type = quantized_type;
|
||||||
if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
|
if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
|
||||||
else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
|
else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
|
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
|
||||||
|
@ -2166,7 +2170,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
++i_attention_wv;
|
++i_attention_wv;
|
||||||
}
|
}
|
||||||
else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
|
else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||||
(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
|
(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
|
||||||
|
@ -2174,7 +2178,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
++i_feed_forward_w2;
|
++i_feed_forward_w2;
|
||||||
}
|
}
|
||||||
else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
|
else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||||
}
|
}
|
||||||
float * f32_data;
|
float * f32_data;
|
||||||
|
|
17
llama.h
17
llama.h
|
@ -94,14 +94,15 @@ extern "C" {
|
||||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K_S = 10,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K_M = 11,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q3_K_L = 12,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_K_S = 13,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_K_M = 14,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_K_S = 15,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_K_M = 16,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q6_K = 17,// except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
|
||||||
};
|
};
|
||||||
|
|
||||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue