diff --git a/ggml-opencl.c b/ggml-opencl.c index 31ab13b25..598b9a1e9 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -20,7 +20,7 @@ constant uint QK4_0 = 32; struct block_q4_0 { float d; - uint8_t qs[QK4_0 / 2]; + uint8_t qs[16]; }; constant uint QK4_1 = 32; @@ -28,7 +28,7 @@ struct block_q4_1 { float d; float m; - uint8_t qs[QK4_1 / 2]; + uint8_t qs[16]; }; constant uint QK5_0 = 32; @@ -36,7 +36,7 @@ struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; - uint8_t qs[QK5_0 / 2]; + uint8_t qs[16]; }; constant uint QK5_1 = 32; @@ -45,14 +45,14 @@ struct block_q5_1 half d; half m; uint32_t qh; - uint8_t qs[QK5_1 / 2]; + uint8_t qs[16]; }; constant uint QK8_0 = 32; struct block_q8_0 { float d; - uint8_t qs[QK8_0]; + uint8_t qs[32]; }; diff --git a/ggml.c b/ggml.c index c5d820fec..49fe226d0 100644 --- a/ggml.c +++ b/ggml.c @@ -3102,11 +3102,12 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_Q8_1] = QK8_1, + [GGML_TYPE_Q8_1B] = QK8_1, [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, }; -static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 14, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = sizeof(float), @@ -3119,11 +3120,12 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_Q8_1] = sizeof(block_q8_1), + [GGML_TYPE_Q8_1B] = sizeof(block_q8_1_v2), [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), }; -static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 14, "GGML_TYPE_SIZE is outdated"); static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { @@ -3137,11 +3139,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_Q8_1] = "q8_1", + [GGML_TYPE_Q8_1B] = "q8_1b", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", }; -static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated"); +static_assert(GGML_TYPE_COUNT == 14, "GGML_TYPE_NAME is outdated"); static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = false, @@ -3154,11 +3157,12 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = true, [GGML_TYPE_Q8_0] = true, [GGML_TYPE_Q8_1] = true, + [GGML_TYPE_Q8_1B] = true, [GGML_TYPE_I8] = false, [GGML_TYPE_I16] = false, [GGML_TYPE_I32] = false, }; -static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated"); +static_assert(GGML_TYPE_COUNT == 14, "GGML_IS_QUANTIZED is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -8041,6 +8045,7 @@ static void ggml_compute_forward_mul_mat( case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_1B: { ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); } break; @@ -8273,6 +8278,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_1B: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -8599,6 +8605,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_1B: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml.h b/ggml.h index 35bd45e8a..d6cc480c7 100644 --- a/ggml.h +++ b/ggml.h @@ -240,6 +240,7 @@ extern "C" { GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, + GGML_TYPE_Q8_1B = 13, //legacy q8_1 GGML_TYPE_COUNT, }; diff --git a/ggml_v2.c b/ggml_v2.c index 38bf9108d..ffc6ab552 100644 --- a/ggml_v2.c +++ b/ggml_v2.c @@ -1592,7 +1592,7 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = { .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, .vec_dot_q = ggml_vec_dot_q4_1_q8_1_v2, - .vec_dot_type = GGML_TYPE_Q8_1, + .vec_dot_type = GGML_TYPE_Q8_1B, }, [GGML_TYPE_Q4_2] = { .dequantize_row_q = dequantize_row_q4_2_v2, @@ -1608,7 +1608,7 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = { .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, .vec_dot_q = ggml_vec_dot_q4_3_q8_1_v2, - .vec_dot_type = GGML_TYPE_Q8_1, + .vec_dot_type = GGML_TYPE_Q8_1B, }, [GGML_TYPE_Q5_0] = { .dequantize_row_q = dequantize_row_q5_0_v2, @@ -1624,7 +1624,7 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = { .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, .vec_dot_q = ggml_vec_dot_q5_1_q8_1_v2, - .vec_dot_type = GGML_TYPE_Q8_1, + .vec_dot_type = GGML_TYPE_Q8_1B, }, [GGML_TYPE_Q8_0] = { .dequantize_row_q = dequantize_row_q8_0_v2, @@ -1634,13 +1634,13 @@ static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = { .vec_dot_q = ggml_vec_dot_q8_0_q8_0_v2, .vec_dot_type = GGML_TYPE_Q8_0, }, - [GGML_TYPE_Q8_1] = { + [GGML_TYPE_Q8_1B] = { .dequantize_row_q = NULL, // TODO .quantize_row_q = quantize_row_q8_1_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, .vec_dot_q = NULL, // TODO - .vec_dot_type = GGML_TYPE_Q8_1, + .vec_dot_type = GGML_TYPE_Q8_1B, }, }; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 92a42a2fa..08b60b3ef 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -224,7 +224,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in neox_ctx.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = params.n_ctx; printf("System Info: %s\n", llama_print_system_info()); - + SetQuantsUnshuffled(false); if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2) { //newer format has bit unshuffling diff --git a/koboldcpp.py b/koboldcpp.py index 71f1f8b6c..7b8e766ca 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -198,7 +198,7 @@ maxctx = 2048 maxlen = 128 modelbusy = False defaultport = 5001 -KcppVersion = "1.21.1" +KcppVersion = "1.21.2" class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sys_version = "" diff --git a/otherarch/tools/common-ggml.cpp b/otherarch/tools/common-ggml.cpp index 350e95039..3c0bfe286 100644 --- a/otherarch/tools/common-ggml.cpp +++ b/otherarch/tools/common-ggml.cpp @@ -198,6 +198,7 @@ bool ggml_common_quantize_0( case GGML_TYPE_I16: case GGML_TYPE_I32: case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_1B: case GGML_TYPE_COUNT: { fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));