imatrix: WIP

This commit is contained in:
Iwan Kawrakow 2024-01-12 12:53:16 +02:00
parent e9372e4098
commit 8da2b25b0b
8 changed files with 771 additions and 85 deletions

View file

@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
// Set up a the benchmark matrices // Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n"); // printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data()); ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
// Set up a the compute graph // Set up a the compute graph
// printf("Creating new tensor q31\n"); // printf("Creating new tensor q31\n");
@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines // Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n"); // printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data()); ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
// printf("Creating new tensor q32\n"); // printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

View file

@ -82,7 +82,7 @@ static void usage(const char * executable) {
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); printf(" --imatrixfile_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@ -93,7 +93,7 @@ static void usage(const char * executable) {
} else { } else {
printf(" "); printf(" ");
} }
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str()); printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
} }
exit(1); exit(1);
} }
@ -147,11 +147,11 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std
static void prepare_imatrix(const std::string& imatrix_file, static void prepare_imatrix(const std::string& imatrix_file,
const std::vector<std::string>& included_weights, const std::vector<std::string>& included_weights,
const std::vector<std::string>& excluded_weights, const std::vector<std::string>& excluded_weights,
std::unordered_map<std::string, std::vector<float>> imatrix_data) { std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
if (!imatrix_file.empty()) { if (!imatrix_file.empty()) {
load_imatrix(imatrix_file, imatrix_data); load_imatrix(imatrix_file, imatrix_data);
} }
if (imatrix_file.empty()) { if (imatrix_data.empty()) {
return; return;
} }
if (!excluded_weights.empty()) { if (!excluded_weights.empty()) {
@ -175,6 +175,9 @@ static void prepare_imatrix(const std::string& imatrix_file,
} }
imatrix_data = std::move(tmp); imatrix_data = std::move(tmp);
} }
if (!imatrix_data.empty()) {
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
}
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -195,19 +198,19 @@ int main(int argc, char ** argv) {
params.allow_requantize = true; params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) { } else if (strcmp(argv[arg_idx], "--pure") == 0) {
params.pure = true; params.pure = true;
} else if (strcmp(argv[arg_idx], "-im") == 0 || strcmp(argv[arg_idx], "--importance-matrix") == 0) { } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
if (arg_idx < argc-1) { if (arg_idx < argc-1) {
imatrix_file = argv[++arg_idx]; imatrix_file = argv[++arg_idx];
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "-iw") == 0 || strcmp(argv[arg_idx], "--include-weights") == 0) { } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
if (arg_idx < argc-1) { if (arg_idx < argc-1) {
included_weights.push_back(argv[++arg_idx]); included_weights.push_back(argv[++arg_idx]);
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "-ew") == 0 || strcmp(argv[arg_idx], "--exclude-weights") == 0) { } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
if (arg_idx < argc-1) { if (arg_idx < argc-1) {
excluded_weights.push_back(argv[++arg_idx]); excluded_weights.push_back(argv[++arg_idx]);
} else { } else {
@ -219,6 +222,7 @@ int main(int argc, char ** argv) {
} }
if (argc - arg_idx < 2) { if (argc - arg_idx < 2) {
printf("%s: bad arguments\n", argv[0]);
usage(argv[0]); usage(argv[0]);
} }
if (!included_weights.empty() && !excluded_weights.empty()) { if (!included_weights.empty() && !excluded_weights.empty()) {

View file

@ -5,6 +5,8 @@
#include <string.h> #include <string.h>
#include <assert.h> #include <assert.h>
#include <float.h> #include <float.h>
#include <stdlib.h> // for qsort
#include <stdio.h> // for GGML_ASSERT
#ifdef __ARM_NEON #ifdef __ARM_NEON
@ -2553,14 +2555,6 @@ static const uint8_t ksigns_iq2xs[128] = {
static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
(void)x;
(void)y;
(void)k;
assert(k % QK_K == 0);
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
}
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0); assert(k % QK_K == 0);
const int nb = k / QK_K; const int nb = k / QK_K;
@ -2587,33 +2581,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
} }
} }
void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
assert(k % QK_K == 0);
block_iq2_xxs * restrict y = vy;
quantize_row_iq2_xxs_reference(x, y, k);
}
size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK_K == 0);
(void)hist; // TODO: collect histograms
for (int j = 0; j < n; j += k) {
block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
quantize_row_iq2_xxs_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_iq2_xxs));
}
// ====================== 2.3125 bpw (de)-quantization // ====================== 2.3125 bpw (de)-quantization
void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
(void)x;
(void)y;
(void)k;
assert(k % QK_K == 0);
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
}
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) { void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0); assert(k % QK_K == 0);
const int nb = k / QK_K; const int nb = k / QK_K;
@ -2639,23 +2608,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
} }
} }
void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
assert(k % QK_K == 0);
block_iq2_xs * restrict y = vy;
quantize_row_iq2_xs_reference(x, y, k);
}
size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK_K == 0);
(void)hist; // TODO: collect histograms
for (int j = 0; j < n; j += k) {
block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
quantize_row_iq2_xs_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_iq2_xs));
}
//===================================== Q8_K ============================================== //===================================== Q8_K ==============================================
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@ -7699,3 +7651,665 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
*s = 0.125f * sumf; *s = 0.125f * sumf;
#endif #endif
} }
// ================================ IQ2 quantization =============================================
typedef struct {
uint64_t * grid;
int * map;
uint16_t * neighbours;
} iq2_entry_t;
static iq2_entry_t iq2_data[2] = {
{NULL, NULL, NULL},
{NULL, NULL, NULL},
};
static inline int iq2_data_index(int grid_size) {
GGML_ASSERT(grid_size == 256 || grid_size == 512);
return grid_size == 256 ? 0 : 1;
}
static int iq2_compare_func(const void * left, const void * right) {
const int * l = (const int *)left;
const int * r = (const int *)right;
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
}
static void q2xs_init_impl(int grid_size) {
const int gindex = iq2_data_index(grid_size);
if (iq2_data[gindex].grid) {
return;
}
static const uint16_t kgrid_256[256] = {
0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
};
static const uint16_t kgrid_512[512] = {
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
};
const int kmap_size = 43692;
const int nwant = 2;
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
uint64_t * kgrid_q2xs;
int * kmap_q2xs;
uint16_t * kneighbors_q2xs;
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
for (int k = 0; k < grid_size; ++k) {
int8_t * pos = (int8_t *)(the_grid + k);
for (int i = 0; i < 8; ++i) {
int l = (kgrid[k] >> 2*i) & 0x3;
pos[i] = 2*l + 1;
}
}
kgrid_q2xs = the_grid;
iq2_data[gindex].grid = the_grid;
kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
iq2_data[gindex].map = kmap_q2xs;
for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
uint64_t aux64;
uint8_t * aux8 = (uint8_t *)&aux64;
for (int i = 0; i < grid_size; ++i) {
aux64 = kgrid_q2xs[i];
uint16_t index = 0;
for (int k=0; k<8; ++k) {
uint16_t q = (aux8[k] - 1)/2;
index |= (q << 2*k);
}
kmap_q2xs[index] = i;
}
int8_t pos[8];
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
int num_neighbors = 0, num_not_in_map = 0;
for (int i = 0; i < kmap_size; ++i) {
if (kmap_q2xs[i] >= 0) continue;
++num_not_in_map;
for (int k = 0; k < 8; ++k) {
int l = (i >> 2*k) & 0x3;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
int d2 = 0;
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
int n = 0; int d2 = dist2[0];
int nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
++n;
}
num_neighbors += n;
}
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
iq2_data[gindex].neighbours = kneighbors_q2xs;
int counter = 0;
for (int i = 0; i < kmap_size; ++i) {
if (kmap_q2xs[i] >= 0) continue;
for (int k = 0; k < 8; ++k) {
int l = (i >> 2*k) & 0x3;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
int d2 = 0;
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
kmap_q2xs[i] = -(counter + 1);
int d2 = dist2[0];
uint16_t * start = &kneighbors_q2xs[counter++];
int n = 0, nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
kneighbors_q2xs[counter++] = dist2[2*j+1];
++n;
}
*start = n;
}
free(dist2);
}
void ggml_init_iq2_quantization(enum ggml_type type) {
if (type == GGML_TYPE_IQ2_XXS) {
q2xs_init_impl(256);
}
else if (type == GGML_TYPE_IQ2_XS) {
q2xs_init_impl(512);
}
else {
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
}
}
static void q2xs_deinit_impl(int grid_size) {
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
const int gindex = iq2_data_index(grid_size);
if (iq2_data[gindex].grid) {
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
}
}
void ggml_deinit_iq2_quantization(enum ggml_type type) {
if (type == GGML_TYPE_IQ2_XXS) {
q2xs_deinit_impl(256);
}
else if (type == GGML_TYPE_IQ2_XS) {
q2xs_deinit_impl(512);
}
else {
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
}
}
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
int num_neighbors = neighbours[0];
GGML_ASSERT(num_neighbors > 0);
float best_d2 = FLT_MAX;
int grid_index = -1;
for (int j = 1; j <= num_neighbors; ++j) {
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
float d2 = 0;
for (int i = 0; i < 8; ++i) {
float q = pg[i];
float diff = scale*q - xval[i];
d2 += weight[i]*diff*diff;
}
if (d2 < best_d2) {
best_d2 = d2; grid_index = neighbours[j];
}
}
GGML_ASSERT(grid_index >= 0);
const int8_t * pg = (const int8_t *)(grid + grid_index);
for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
return grid_index;
}
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
const int gindex = iq2_data_index(256);
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
GGML_ASSERT(quant_weights);
GGML_ASSERT(kgrid_q2xs);
GGML_ASSERT(kmap_q2xs);
GGML_ASSERT(kneighbors_q2xs);
GGML_ASSERT(n%QK_K == 0);
const int kMaxQ = 3;
const int nbl = n/256;
block_iq2_xxs * y = vy;
float scales[QK_K/32];
float weight[32];
float xval[32];
int8_t L[32];
int8_t Laux[32];
float waux[32];
bool is_on_grid[4];
bool is_on_grid_aux[4];
uint8_t block_signs[4];
uint32_t q2[2*(QK_K/32)];
for (int ibl = 0; ibl < nbl; ++ibl) {
y[ibl].d = GGML_FP32_TO_FP16(0.f);
memset(q2, 0, QK_K/4);
float max_scale = 0;
const float * xbl = x + QK_K*ibl;
float sumx2 = 0;
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
float sigma2 = sumx2/QK_K;
for (int ib = 0; ib < QK_K/32; ++ib) {
const float * xb = xbl + 32*ib;
const float * qw = quant_weights + QK_K*ibl + 32*ib;
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
for (int k = 0; k < 4; ++k) {
int nflip = 0;
uint8_t s = 0;
for (int i = 0; i < 8; ++i) {
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
else {
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
}
}
if (nflip%2) {
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
for (int i = 1; i < 8; ++i) {
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
if (ax < min) {
min = ax; imin = i;
}
}
xval[8*k+imin] = -xval[8*k+imin];
s ^= (1 << imin);
}
block_signs[k] = s & 127;
}
float max = xval[0];
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
if (!max) {
scales[ib] = 0;
memset(L, 0, 32);
continue;
}
float best = 0;
float scale = max/(2*kMaxQ-1);
for (int is = -9; is <= 9; ++is) {
float id = (2*kMaxQ-1+is*0.1f)/max;
float this_scale = 1/id;
for (int k = 0; k < 4; ++k) {
for (int i = 0; i < 8; ++i) {
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
}
uint16_t u = 0;
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
int grid_index = kmap_q2xs[u];
is_on_grid_aux[k] = true;
if (grid_index < 0) {
is_on_grid_aux[k] = false;
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
}
}
float sumqx = 0, sumq2 = 0;
for (int i = 0; i < 32; ++i) {
float w = weight[i];
float q = 2*Laux[i] + 1;
sumqx += w*xval[i]*q;
sumq2 += w*q*q;
}
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
scale = sumqx/sumq2; best = scale*sumqx;
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
}
}
int n_not_ongrid = 0;
for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
if (n_not_ongrid > 0 && scale > 0) {
float id = 1/scale;
for (int k = 0; k < 4; ++k) {
if (is_on_grid[k]) continue;
uint16_t u = 0;
for (int i = 0; i < 8; ++i) {
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
l = MAX(0, MIN(kMaxQ-1, l));
u |= (l << 2*i);
}
int grid_index = kmap_q2xs[u];
if (grid_index < 0) {
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
}
const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
}
float sumqx = 0, sumq2 = 0;
for (int i = 0; i < 32; ++i) {
float w = weight[i];
float q = 2*L[i] + 1;
sumqx += w*xval[i]*q;
sumq2 += w*q*q;
}
if (sumq2 > 0) scale = sumqx/sumq2;
}
if (scale < 0) {
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
// and correspondingly flip quant signs.
scale = -scale;
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
}
for (int k = 0; k < 4; ++k) {
uint16_t u = 0;
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
int grid_index = kmap_q2xs[u];
if (grid_index < 0) {
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
GGML_ASSERT(false);
}
q2[2*ib+0] |= (grid_index << 8*k);
q2[2*ib+1] |= (block_signs[k] << 7*k);
}
GGML_ASSERT(scale >= 0);
scales[ib] = scale;
max_scale = MAX(max_scale, scale);
}
if (!max_scale) {
memset(y[ibl].qs, 0, QK_K/4);
continue;
}
float d = max_scale/31;
y[ibl].d = GGML_FP32_TO_FP16(d);
float id = 1/d;
float sumqx = 0, sumq2 = 0;
for (int ib = 0; ib < QK_K/32; ++ib) {
int l = nearest_int(0.5f*(id*scales[ib]-1));
l = MAX(0, MIN(15, l));
q2[2*ib+1] |= ((uint32_t)l << 28);
const float * xb = xbl + 32*ib;
const float * qw = quant_weights + QK_K*ibl + 32*ib;
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
const float db = d * (1 + 2*l);
uint32_t u = 0;
for (int k = 0; k < 4; ++k) {
const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
const float * xk = xb + 8*k;
const float * wk = weight + 8*k;
const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
float best_mse = 0; int best_index = aux8[k];
for (int j = 0; j < 8; ++j) {
float diff = db * grid[j] * signs[j] - xk[j];
best_mse += wk[j] * diff * diff;
}
for (int idx = 0; idx < 256; ++idx) {
grid = (const uint8_t *)(kgrid_q2xs + idx);
float mse = 0;
for (int j = 0; j < 8; ++j) {
float diff = db * grid[j] * signs[j] - xk[j];
mse += wk[j] * diff * diff;
}
if (mse < best_mse) {
best_mse = mse; best_index = idx;
}
}
u |= (best_index << 8*k);
grid = (const uint8_t *)(kgrid_q2xs + best_index);
for (int j = 0; j < 8; ++j) {
float q = db * grid[j] * signs[j];
sumqx += wk[j] * q * xk[j];
sumq2 += wk[j] * q * q;
}
}
q2[2*ib] = u;
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
}
memcpy(y[ibl].qs, q2, QK_K/4);
}
}
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
const int gindex = iq2_data_index(512);
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
const int * kmap_q2xs = iq2_data[gindex].map;
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
GGML_ASSERT(quant_weights);
GGML_ASSERT(kmap_q2xs);
GGML_ASSERT(kgrid_q2xs);
GGML_ASSERT(kneighbors_q2xs);
GGML_ASSERT(n%QK_K == 0);
const int kMaxQ = 3;
const int nbl = n/256;
block_iq2_xs * y = vy;
float scales[QK_K/16];
float weight[16];
float xval[16];
int8_t L[16];
int8_t Laux[16];
float waux[16];
bool is_on_grid[2];
bool is_on_grid_aux[2];
uint8_t block_signs[2];
uint16_t q2[2*(QK_K/16)];
for (int ibl = 0; ibl < nbl; ++ibl) {
y[ibl].d = GGML_FP32_TO_FP16(0.f);
memset(q2, 0, QK_K/4);
memset(y[ibl].scales, 0, QK_K/32);
float max_scale = 0;
const float * xbl = x + QK_K*ibl;
float sumx2 = 0;
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
float sigma2 = sumx2/QK_K;
for (int ib = 0; ib < QK_K/16; ++ib) {
const float * xb = xbl + 16*ib;
const float * qw = quant_weights + QK_K*ibl + 16*ib;
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
for (int k = 0; k < 2; ++k) {
int nflip = 0;
uint8_t s = 0;
for (int i = 0; i < 8; ++i) {
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
else {
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
}
}
if (nflip%2) {
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
for (int i = 1; i < 8; ++i) {
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
if (ax < min) {
min = ax; imin = i;
}
}
xval[8*k+imin] = -xval[8*k+imin];
s ^= (1 << imin);
}
block_signs[k] = s & 127;
}
float max = xval[0];
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
if (!max) {
scales[ib] = 0;
memset(L, 0, 16);
continue;
}
float best = 0;
float scale = max/(2*kMaxQ-1);
is_on_grid[0] = is_on_grid[1] = true;
for (int is = -9; is <= 9; ++is) {
float id = (2*kMaxQ-1+is*0.1f)/max;
float this_scale = 1/id;
for (int k = 0; k < 2; ++k) {
for (int i = 0; i < 8; ++i) {
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
}
uint16_t u = 0;
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
int grid_index = kmap_q2xs[u];
is_on_grid_aux[k] = true;
if (grid_index < 0) {
is_on_grid_aux[k] = false;
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
}
}
float sumqx = 0, sumq2 = 0;
for (int i = 0; i < 16; ++i) {
float w = weight[i];
float q = 2*Laux[i] + 1;
sumqx += w*xval[i]*q;
sumq2 += w*q*q;
}
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
scale = sumqx/sumq2; best = scale*sumqx;
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
}
}
int n_not_ongrid = 0;
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
if (n_not_ongrid > 0 && scale > 0) {
float id = 1/scale;
for (int k = 0; k < 2; ++k) {
if (is_on_grid[k]) continue;
uint16_t u = 0;
for (int i = 0; i < 8; ++i) {
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
l = MAX(0, MIN(kMaxQ-1, l));
u |= (l << 2*i);
L[8*k + i] = l;
}
int grid_index = kmap_q2xs[u];
if (grid_index < 0) {
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
}
}
float sumqx = 0, sumq2 = 0;
for (int i = 0; i < 16; ++i) {
float w = weight[i];
float q = 2*L[i] + 1;
sumqx += w*xval[i]*q;
sumq2 += w*q*q;
}
if (sumq2 > 0) scale = sumqx/sumq2;
}
if (scale < 0) {
scale = -scale;
for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
}
for (int k = 0; k < 2; ++k) {
uint16_t u = 0;
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
int grid_index = kmap_q2xs[u];
if (grid_index < 0) {
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
GGML_ASSERT(false);
}
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
}
GGML_ASSERT(scale >= 0);
scales[ib] = scale;
max_scale = MAX(max_scale, scale);
}
if (!max_scale) {
memset(y[ibl].qs, 0, QK_K/4);
continue;
}
float d = max_scale/31;
y[ibl].d = GGML_FP32_TO_FP16(d);
float id = 1/d;
for (int ib = 0; ib < QK_K/16; ++ib) {
int l = nearest_int(0.5f*(id*scales[ib]-1));
l = MAX(0, MIN(15, l));
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
else y[ibl].scales[ib/2] |= (l << 4);
}
memcpy(y[ibl].qs, q2, QK_K/4);
}
}
size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
(void)hist;
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += nblock*sizeof(block_iq2_xxs);
}
return nrow * nblock * sizeof(block_iq2_xxs);
}
size_t ggml_quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
(void)hist;
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
char * qrow = (char *)dst;
for (int row = 0; row < nrow; ++row) {
quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += nblock*sizeof(block_iq2_xs);
}
return nrow * nblock * sizeof(block_iq2_xs);
}

View file

@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
// Dequantization // Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);

29
ggml.c
View file

@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq2_xxs), .type_size = sizeof(block_iq2_xxs),
.is_quantized = true, .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs, .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
.from_float = quantize_row_iq2_xxs, .from_float = NULL,
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference, .from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
}, },
@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq2_xs), .type_size = sizeof(block_iq2_xs),
.is_quantized = true, .is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs, .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
.from_float = quantize_row_iq2_xs, .from_float = NULL,
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference, .from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
}, },
@ -18625,8 +18625,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
return (n/QK8_0*sizeof(block_q8_0)); return (n/QK8_0*sizeof(block_q8_0));
} }
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
(void)imatrix;
size_t result = 0; size_t result = 0;
int n = nrows * n_per_row;
switch (type) { switch (type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
{ {
@ -18691,14 +18694,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS:
{ {
GGML_ASSERT(start % QK_K == 0); GGML_ASSERT(start % QK_K == 0);
block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K; GGML_ASSERT(start % n_per_row == 0);
result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist); GGML_ASSERT(imatrix);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = ggml_quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break; } break;
case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_XS:
{ {
GGML_ASSERT(start % QK_K == 0); GGML_ASSERT(start % QK_K == 0);
block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K; GGML_ASSERT(start % n_per_row == 0);
result = ggml_quantize_iq2_xs(src + start, block, n, n, hist); GGML_ASSERT(imatrix);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = ggml_quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break; } break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {

11
ggml.h
View file

@ -2062,10 +2062,15 @@ extern "C" {
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
// These are needed for IQ2_XS and IQ2_XXS quantizations
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
// //
// Importance matrix // Importance matrix

View file

@ -8924,9 +8924,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0; new_type = GGML_TYPE_Q8_0;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
new_type = GGML_TYPE_Q5_K;
}
else if (new_type != GGML_TYPE_Q8_0) { else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K; new_type = GGML_TYPE_Q6_K;
} }
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
if (name.find("attn_v.weight") != std::string::npos) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
else new_type = GGML_TYPE_Q2_K;
++qs.i_attention_wv;
}
else if (name.find("ffn_down") != std::string::npos) {
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
++qs.i_feed_forward_w2;
}
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
} else if (name.find("attn_v.weight") != std::string::npos) { } else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@ -9089,6 +9103,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (params->only_copy) { if (params->only_copy) {
ftype = model.ftype; ftype = model.ftype;
} }
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
if (params->imatrix) {
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
printf("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
}
}
const size_t align = GGUF_DEFAULT_ALIGNMENT; const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty(); struct gguf_context * ctx_out = gguf_init_empty();
@ -9146,6 +9167,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// placeholder for the meta data // placeholder for the meta data
::zeros(fout, meta_size); ::zeros(fout, meta_size);
std::set<ggml_type> used_iq2;
for (int i = 0; i < ml.n_tensors; ++i) { for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * tensor = ml.get_tensor_meta(i); struct ggml_tensor * tensor = ml.get_tensor_meta(i);
@ -9198,6 +9221,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else { } else {
const size_t nelements = ggml_nelements(tensor); const size_t nelements = ggml_nelements(tensor);
if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
ggml_init_iq2_quantization(new_type);
used_iq2.insert(new_type);
}
const float * imatrix = nullptr;
if (imatrix_data) {
auto it = imatrix_data->find(tensor->name);
if (it == imatrix_data->end()) {
printf("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
} else {
if (it->second.size() == (size_t)tensor->ne[0]) {
imatrix = it->second.data();
} else {
printf("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
int(it->second.size()), int(tensor->ne[0]), tensor->name);
}
}
}
float * f32_data; float * f32_data;
if (tensor->type == GGML_TYPE_F32) { if (tensor->type == GGML_TYPE_F32) {
@ -9218,21 +9261,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_data = work.data(); new_data = work.data();
std::array<int64_t, 1 << 4> hist_cur = {}; std::array<int64_t, 1 << 4> hist_cur = {};
static const int chunk_size = 32 * 512; const int n_per_row = tensor->ne[0];
const int nrows = nelements / n_per_row;
static const int min_chunk_size = 32 * 512;
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nchunk = (nelements + chunk_size - 1)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
if (nthread_use < 2) { if (nthread_use < 2) {
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
} else { } else {
size_t counter = 0; int counter = 0;
new_size = 0; new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() { auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
std::array<int64_t, 1 << 4> local_hist = {}; std::array<int64_t, 1 << 4> local_hist = {};
const int nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0; size_t local_size = 0;
while (true) { while (true) {
std::unique_lock<std::mutex> lock(mutex); std::unique_lock<std::mutex> lock(mutex);
size_t first = counter; counter += chunk_size; int first_row = counter; counter += nrows_per_chunk;
if (first >= nelements) { if (first_row >= nrows) {
if (local_size > 0) { if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) { for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j]; hist_cur[j] += local_hist[j];
@ -9242,8 +9292,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
break; break;
} }
lock.unlock(); lock.unlock();
size_t last = std::min(nelements, first + chunk_size); const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
} }
}; };
for (int it = 0; it < nthread_use - 1; ++it) { for (int it = 0; it < nthread_use - 1; ++it) {
@ -9254,7 +9305,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
workers.clear(); workers.clear();
} }
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
int64_t tot_count = 0; int64_t tot_count = 0;
for (size_t i = 0; i < hist_cur.size(); i++) { for (size_t i = 0; i < hist_cur.size(); i++) {
hist_all[i] += hist_cur[i]; hist_all[i] += hist_cur[i];
@ -9262,6 +9313,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} }
if (tot_count > 0) { if (tot_count > 0) {
LLAMA_LOG_INFO(" | hist: ");
for (size_t i = 0; i < hist_cur.size(); i++) { for (size_t i = 0; i < hist_cur.size(); i++) {
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
} }
@ -9290,6 +9342,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
fout.close(); fout.close();
for (auto type : used_iq2) {
ggml_deinit_iq2_quantization(type);
}
gguf_free(ctx_out); gguf_free(ctx_out);
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);

View file

@ -56,7 +56,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size)); std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
int64_t hist[16]; int64_t hist[16];
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist); ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, nullptr);
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though. // This is going to create some weird integers though.