Fixed wrong-sized struct from legacy q8_1; fixed OpenCL variable-size arrays

This commit is contained in:
Concedo 2023-05-13 23:56:08 +08:00
parent c9eb2ba1c5
commit e05455f852
7 changed files with 25 additions and 16 deletions

View file

@ -20,7 +20,7 @@ constant uint QK4_0 = 32;
struct block_q4_0 struct block_q4_0
{ {
float d; float d;
uint8_t qs[QK4_0 / 2]; uint8_t qs[16];
}; };
constant uint QK4_1 = 32; constant uint QK4_1 = 32;
@ -28,7 +28,7 @@ struct block_q4_1
{ {
float d; float d;
float m; float m;
uint8_t qs[QK4_1 / 2]; uint8_t qs[16];
}; };
constant uint QK5_0 = 32; constant uint QK5_0 = 32;
@ -36,7 +36,7 @@ struct __attribute__ ((packed)) block_q5_0
{ {
half d; half d;
uint32_t qh; uint32_t qh;
uint8_t qs[QK5_0 / 2]; uint8_t qs[16];
}; };
constant uint QK5_1 = 32; constant uint QK5_1 = 32;
@ -45,14 +45,14 @@ struct block_q5_1
half d; half d;
half m; half m;
uint32_t qh; uint32_t qh;
uint8_t qs[QK5_1 / 2]; uint8_t qs[16];
}; };
constant uint QK8_0 = 32; constant uint QK8_0 = 32;
struct block_q8_0 struct block_q8_0
{ {
float d; float d;
uint8_t qs[QK8_0]; uint8_t qs[32];
}; };

15
ggml.c
View file

@ -3102,11 +3102,12 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q5_1] = QK5_1,
[GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_Q8_0] = QK8_0,
[GGML_TYPE_Q8_1] = QK8_1, [GGML_TYPE_Q8_1] = QK8_1,
[GGML_TYPE_Q8_1B] = QK8_1,
[GGML_TYPE_I8] = 1, [GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1, [GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1, [GGML_TYPE_I32] = 1,
}; };
static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated"); static_assert(GGML_TYPE_COUNT == 14, "GGML_BLCK_SIZE is outdated");
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = sizeof(float), [GGML_TYPE_F32] = sizeof(float),
@ -3119,11 +3120,12 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
[GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
[GGML_TYPE_Q8_1] = sizeof(block_q8_1), [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
[GGML_TYPE_Q8_1B] = sizeof(block_q8_1_v2),
[GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t), [GGML_TYPE_I32] = sizeof(int32_t),
}; };
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated"); static_assert(GGML_TYPE_COUNT == 14, "GGML_TYPE_SIZE is outdated");
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@ -3137,11 +3139,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q5_1] = "q5_1",
[GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_Q8_0] = "q8_0",
[GGML_TYPE_Q8_1] = "q8_1", [GGML_TYPE_Q8_1] = "q8_1",
[GGML_TYPE_Q8_1B] = "q8_1b",
[GGML_TYPE_I8] = "i8", [GGML_TYPE_I8] = "i8",
[GGML_TYPE_I16] = "i16", [GGML_TYPE_I16] = "i16",
[GGML_TYPE_I32] = "i32", [GGML_TYPE_I32] = "i32",
}; };
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated"); static_assert(GGML_TYPE_COUNT == 14, "GGML_TYPE_NAME is outdated");
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = false, [GGML_TYPE_F32] = false,
@ -3154,11 +3157,12 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = true, [GGML_TYPE_Q5_1] = true,
[GGML_TYPE_Q8_0] = true, [GGML_TYPE_Q8_0] = true,
[GGML_TYPE_Q8_1] = true, [GGML_TYPE_Q8_1] = true,
[GGML_TYPE_Q8_1B] = true,
[GGML_TYPE_I8] = false, [GGML_TYPE_I8] = false,
[GGML_TYPE_I16] = false, [GGML_TYPE_I16] = false,
[GGML_TYPE_I32] = false, [GGML_TYPE_I32] = false,
}; };
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated"); static_assert(GGML_TYPE_COUNT == 14, "GGML_IS_QUANTIZED is outdated");
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE", "NONE",
@ -8041,6 +8045,7 @@ static void ggml_compute_forward_mul_mat(
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_1B:
{ {
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
} break; } break;
@ -8273,6 +8278,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_1B:
{ {
ggml_compute_forward_get_rows_q(params, src0, src1, dst); ggml_compute_forward_get_rows_q(params, src0, src1, dst);
} break; } break;
@ -8599,6 +8605,7 @@ static void ggml_compute_forward_alibi(
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_1B:
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:

1
ggml.h
View file

@ -240,6 +240,7 @@ extern "C" {
GGML_TYPE_I8, GGML_TYPE_I8,
GGML_TYPE_I16, GGML_TYPE_I16,
GGML_TYPE_I32, GGML_TYPE_I32,
GGML_TYPE_Q8_1B = 13, //legacy q8_1
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };

View file

@ -1592,7 +1592,7 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference_v2,
.quantize_row_q_dot = quantize_row_q8_1_v2, .quantize_row_q_dot = quantize_row_q8_1_v2,
.vec_dot_q = ggml_vec_dot_q4_1_q8_1_v2, .vec_dot_q = ggml_vec_dot_q4_1_q8_1_v2,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1B,
}, },
[GGML_TYPE_Q4_2] = { [GGML_TYPE_Q4_2] = {
.dequantize_row_q = dequantize_row_q4_2_v2, .dequantize_row_q = dequantize_row_q4_2_v2,
@ -1608,7 +1608,7 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference_v2,
.quantize_row_q_dot = quantize_row_q8_1_v2, .quantize_row_q_dot = quantize_row_q8_1_v2,
.vec_dot_q = ggml_vec_dot_q4_3_q8_1_v2, .vec_dot_q = ggml_vec_dot_q4_3_q8_1_v2,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1B,
}, },
[GGML_TYPE_Q5_0] = { [GGML_TYPE_Q5_0] = {
.dequantize_row_q = dequantize_row_q5_0_v2, .dequantize_row_q = dequantize_row_q5_0_v2,
@ -1624,7 +1624,7 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference_v2,
.quantize_row_q_dot = quantize_row_q8_1_v2, .quantize_row_q_dot = quantize_row_q8_1_v2,
.vec_dot_q = ggml_vec_dot_q5_1_q8_1_v2, .vec_dot_q = ggml_vec_dot_q5_1_q8_1_v2,
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1B,
}, },
[GGML_TYPE_Q8_0] = { [GGML_TYPE_Q8_0] = {
.dequantize_row_q = dequantize_row_q8_0_v2, .dequantize_row_q = dequantize_row_q8_0_v2,
@ -1634,13 +1634,13 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {
.vec_dot_q = ggml_vec_dot_q8_0_q8_0_v2, .vec_dot_q = ggml_vec_dot_q8_0_q8_0_v2,
.vec_dot_type = GGML_TYPE_Q8_0, .vec_dot_type = GGML_TYPE_Q8_0,
}, },
[GGML_TYPE_Q8_1] = { [GGML_TYPE_Q8_1B] = {
.dequantize_row_q = NULL, // TODO .dequantize_row_q = NULL, // TODO
.quantize_row_q = quantize_row_q8_1_v2, .quantize_row_q = quantize_row_q8_1_v2,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference_v2,
.quantize_row_q_dot = quantize_row_q8_1_v2, .quantize_row_q_dot = quantize_row_q8_1_v2,
.vec_dot_q = NULL, // TODO .vec_dot_q = NULL, // TODO
.vec_dot_type = GGML_TYPE_Q8_1, .vec_dot_type = GGML_TYPE_Q8_1B,
}, },
}; };

View file

@ -224,7 +224,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
neox_ctx.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = params.n_ctx; neox_ctx.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = params.n_ctx;
printf("System Info: %s\n", llama_print_system_info()); printf("System Info: %s\n", llama_print_system_info());
SetQuantsUnshuffled(false);
if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2) if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
{ {
//newer format has bit unshuffling //newer format has bit unshuffling

View file

@ -198,7 +198,7 @@ maxctx = 2048
maxlen = 128 maxlen = 128
modelbusy = False modelbusy = False
defaultport = 5001 defaultport = 5001
KcppVersion = "1.21.1" KcppVersion = "1.21.2"
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
sys_version = "" sys_version = ""

View file

@ -198,6 +198,7 @@ bool ggml_common_quantize_0(
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:
case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_1B:
case GGML_TYPE_COUNT: case GGML_TYPE_COUNT:
{ {
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));