Remove Q4/Q5 bit shuffling without breaking compatibility
This commit is contained in:
parent
fe60904eef
commit
7beb59a80b
10 changed files with 151 additions and 664 deletions
24
README.md
24
README.md
|
@ -330,18 +330,18 @@ As the models are currently fully loaded into memory, you will need adequate dis
|
||||||
|
|
||||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
||||||
|
|
||||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 |
|
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|
||||||
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
|
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
|
||||||
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 |
|
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
|
||||||
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G |
|
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G |
|
||||||
| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 |
|
| 7B | ms/tok @ 4th | 128 | 56 | 61 | 91 | 95 | 75 |
|
||||||
| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 |
|
| 7B | ms/tok @ 8th | 128 | 47 | 55 | 53 | 59 | 75 |
|
||||||
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
|
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
|
||||||
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 |
|
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
|
||||||
| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G |
|
| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G |
|
||||||
| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 |
|
| 13B | ms/tok @ 4th | 239 | 104 | 113 | 176 | 185 | 141 |
|
||||||
| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 |
|
| 13B | ms/tok @ 8th | 240 | 85 | 99 | 108 | 117 | 147 |
|
||||||
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
|
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
|
||||||
|
|
||||||
### Perplexity (measuring model quality)
|
### Perplexity (measuring model quality)
|
||||||
|
|
||||||
|
|
|
@ -2,14 +2,12 @@
|
||||||
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
|
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
|
||||||
99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin
|
99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin
|
||||||
cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin
|
cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin
|
||||||
25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin
|
|
||||||
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
|
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
|
||||||
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
|
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
|
||||||
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
|
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
|
||||||
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
|
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
|
||||||
eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin
|
eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin
|
||||||
d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin
|
d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin
|
||||||
75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin
|
|
||||||
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
|
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
|
||||||
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
|
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
|
||||||
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
|
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
|
||||||
|
@ -18,7 +16,6 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con
|
||||||
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
|
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
|
||||||
517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin
|
517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin
|
||||||
7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin
|
7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin
|
||||||
aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin
|
|
||||||
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
|
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
|
||||||
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
|
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
|
||||||
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
|
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
|
||||||
|
@ -31,6 +28,5 @@ d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/con
|
||||||
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
|
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
|
||||||
01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin
|
01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin
|
||||||
4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin
|
4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin
|
||||||
1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin
|
|
||||||
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
|
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
|
||||||
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model
|
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model
|
||||||
|
|
|
@ -7,12 +7,11 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
|
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
|
||||||
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
|
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
|
||||||
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
|
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
|
||||||
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
|
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
||||||
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
|
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
||||||
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
|
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
||||||
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
|
bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||||
|
|
37
ggml-cuda.cu
37
ggml-cuda.cu
|
@ -49,13 +49,6 @@ typedef struct {
|
||||||
} block_q4_1;
|
} block_q4_1;
|
||||||
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||||
|
|
||||||
#define QK4_2 16
|
|
||||||
typedef struct {
|
|
||||||
half d; // delta
|
|
||||||
uint8_t qs[QK4_2 / 2]; // nibbles / quants
|
|
||||||
} block_q4_2;
|
|
||||||
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
|
|
||||||
|
|
||||||
#define QK5_0 32
|
#define QK5_0 32
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // delta
|
half d; // delta
|
||||||
|
@ -127,29 +120,6 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
|
|
||||||
const block_q4_2 * x = (const block_q4_2 *) vx;
|
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
|
||||||
|
|
||||||
const float d = x[i].d;
|
|
||||||
|
|
||||||
const uint8_t * pp = x[i].qs;
|
|
||||||
|
|
||||||
for (int l = 0; l < QK4_2; l += 2) {
|
|
||||||
const uint8_t vi = pp[l/2];
|
|
||||||
|
|
||||||
const int8_t vi0 = vi & 0xf;
|
|
||||||
const int8_t vi1 = vi >> 4;
|
|
||||||
|
|
||||||
const float v0 = (vi0 - 8)*d;
|
|
||||||
const float v1 = (vi1 - 8)*d;
|
|
||||||
|
|
||||||
y[i*QK4_2 + l + 0] = v0;
|
|
||||||
y[i*QK4_2 + l + 1] = v1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
|
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
|
||||||
const block_q5_0 * x = (const block_q5_0 *) vx;
|
const block_q5_0 * x = (const block_q5_0 *) vx;
|
||||||
|
|
||||||
|
@ -235,11 +205,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
|
||||||
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
|
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
|
||||||
const int nb = k / QK4_2;
|
|
||||||
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||||
const int nb = k / QK5_0;
|
const int nb = k / QK5_0;
|
||||||
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
|
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||||
|
@ -274,8 +239,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||||
return dequantize_row_q4_0_cuda;
|
return dequantize_row_q4_0_cuda;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
return dequantize_row_q4_1_cuda;
|
return dequantize_row_q4_1_cuda;
|
||||||
case GGML_TYPE_Q4_2:
|
|
||||||
return dequantize_row_q4_2_cuda;
|
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
return dequantize_row_q5_0_cuda;
|
return dequantize_row_q5_0_cuda;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
|
|
|
@ -52,26 +52,6 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global f
|
||||||
result[index + 1] = (vi >> 4) * d + m;
|
result[index + 1] = (vi >> 4) * d + m;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct block_q4_2
|
|
||||||
{
|
|
||||||
ushort d;
|
|
||||||
uchar qs[8];
|
|
||||||
};
|
|
||||||
|
|
||||||
__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
|
|
||||||
const uint i = get_global_id(0) / 16;
|
|
||||||
const uint l = get_local_id(0);
|
|
||||||
|
|
||||||
const float d = vload_half(0, (__global half*) &blocks[i].d);
|
|
||||||
|
|
||||||
const uchar vi = blocks[i].qs[l];
|
|
||||||
|
|
||||||
const uint index = i*16 + l*2;
|
|
||||||
result[index + 0] = ((vi & 0xf) - 8)*d;
|
|
||||||
result[index + 1] = ((vi >> 4) - 8)*d;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
struct block_q5_0
|
struct block_q5_0
|
||||||
{
|
{
|
||||||
float d;
|
float d;
|
||||||
|
@ -167,7 +147,7 @@ static cl_device_id device;
|
||||||
static cl_context context;
|
static cl_context context;
|
||||||
static cl_command_queue queue;
|
static cl_command_queue queue;
|
||||||
static cl_program program;
|
static cl_program program;
|
||||||
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
|
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
|
||||||
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
||||||
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
||||||
|
|
||||||
|
@ -238,8 +218,6 @@ void ggml_cl_init(void) {
|
||||||
CL_CHECK(err, "clCreateKernel");
|
CL_CHECK(err, "clCreateKernel");
|
||||||
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
||||||
CL_CHECK(err, "clCreateKernel");
|
CL_CHECK(err, "clCreateKernel");
|
||||||
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
|
|
||||||
CL_CHECK(err, "clCreateKernel");
|
|
||||||
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
||||||
CL_CHECK(err, "clCreateKernel");
|
CL_CHECK(err, "clCreateKernel");
|
||||||
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
||||||
|
@ -292,12 +270,6 @@ void ggml_cl_sgemm_wrapper(
|
||||||
local = 16;
|
local = 16;
|
||||||
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_2:
|
|
||||||
dequant = true;
|
|
||||||
kernel = kernel_q4_2;
|
|
||||||
local = 8;
|
|
||||||
size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
dequant = true;
|
dequant = true;
|
||||||
kernel = kernel_q5_0;
|
kernel = kernel_q5_0;
|
||||||
|
|
4
ggml.h
4
ggml.h
|
@ -231,7 +231,7 @@ extern "C" {
|
||||||
GGML_TYPE_F16 = 1,
|
GGML_TYPE_F16 = 1,
|
||||||
GGML_TYPE_Q4_0 = 2,
|
GGML_TYPE_Q4_0 = 2,
|
||||||
GGML_TYPE_Q4_1 = 3,
|
GGML_TYPE_Q4_1 = 3,
|
||||||
GGML_TYPE_Q4_2 = 4,
|
// GGML_TYPE_Q4_2 = 4, support has been removed
|
||||||
// GGML_TYPE_Q4_3 (5) support has been removed
|
// GGML_TYPE_Q4_3 (5) support has been removed
|
||||||
GGML_TYPE_Q5_0 = 6,
|
GGML_TYPE_Q5_0 = 6,
|
||||||
GGML_TYPE_Q5_1 = 7,
|
GGML_TYPE_Q5_1 = 7,
|
||||||
|
@ -251,7 +251,6 @@ extern "C" {
|
||||||
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||||
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
|
||||||
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||||
|
@ -876,7 +875,6 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
|
|
@ -482,7 +482,6 @@ struct llama_file_loader {
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
case GGML_TYPE_Q4_2:
|
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
|
@ -558,7 +557,6 @@ struct llama_file_saver {
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
case GGML_TYPE_Q4_2:
|
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
|
@ -852,7 +850,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||||
return "mostly Q4_1, some F16";
|
return "mostly Q4_1, some F16";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
|
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
||||||
|
@ -1905,7 +1902,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
switch (ftype) {
|
switch (ftype) {
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
|
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -78,7 +78,7 @@ extern "C" {
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||||
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||||
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
|
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
|
||||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||||
|
|
|
@ -7,7 +7,6 @@
|
||||||
# 7B
|
# 7B
|
||||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
|
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
|
||||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
|
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
|
||||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-7b-q4_2.txt
|
|
||||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
|
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
|
||||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
|
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
|
||||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
|
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
|
||||||
|
@ -15,7 +14,6 @@ time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0
|
||||||
# 13B
|
# 13B
|
||||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
|
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
|
||||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
|
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
|
||||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-13b-q4_2.txt
|
|
||||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
|
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
|
||||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
|
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
|
||||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
|
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
|
||||||
|
@ -28,7 +26,6 @@ time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8
|
||||||
time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt
|
time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt
|
||||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt
|
time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt
|
||||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt
|
time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt
|
||||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_2.txt
|
|
||||||
time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt
|
time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt
|
||||||
time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt
|
time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt
|
||||||
time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt
|
time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt
|
||||||
|
@ -37,7 +34,6 @@ time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --n
|
||||||
time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt
|
time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt
|
||||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt
|
time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt
|
||||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt
|
time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt
|
||||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_2.txt
|
|
||||||
time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt
|
time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt
|
||||||
time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt
|
time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt
|
||||||
time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt
|
time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue