// Patch summary (leading commentary; everything from the first "diff --git" header onward is the unmodified patch).
//
// ggml.c
//   * Adds two static row converters:
//       ggml_i32_to_f32_row(x, y, n): y[i] = (float)   x[i] for i in [0, n)
//       ggml_f32_to_i32_row(x, y, n): y[i] = (int32_t) x[i] for i in [0, n)
//     A loop with n <= 0 does nothing in either helper.
//   * Registers them as .to_float / .from_float / .from_float_reference on the type_traits entry
//     whose .type_size is sizeof(int32_t) (presumably [GGML_TYPE_I32]; the entry label is outside the hunk - confirm).
//   * Adds an explicit (ggml_from_float_t) cast on .from_float for the quantized entries shown in the hunks.
//     The -/+ pairs on .to_float and .vec_dot look textually identical in this view, so they appear to be
//     whitespace/alignment-only changes - TODO confirm against the raw patch.
//
// llama.cpp
//   * In the LLM_ARCH_BERT branch, inp_pos is passed through ggml_cast(ctx0, inp_pos, GGML_TYPE_I32)
//     before being used as the row index tensor for ggml_get_rows on model.pos_embd.
//
// tests/test-quantize-fns.cpp, tests/test-quantize-perf.cpp
//   * The per-type test body now also requires qfns.vec_dot (in addition to from_float and to_float).
//     In test-quantize-fns.cpp the "Testing %s" printf and ggml_quantize_init(ei) move inside that condition.
//
// NOTE(review): in ggml_f32_to_i32_row the conversion (int32_t) x[i] truncates toward zero, and is undefined
// behaviour in C when x[i] is NaN, +/-inf, or outside the int32_t range. Confirm callers guarantee in-range
// input or add clamping.
// NOTE(review): the helpers are stored via casts to ggml_to_float_t / ggml_from_float_t; those typedefs are not
// visible here, so confirm their parameter types are call-compatible with (const int32_t *, float *, int)
// and (const float *, int32_t *, int).
diff --git a/ggml.c b/ggml.c index 6fc1fc1aa..4fd7a4141 100644 --- a/ggml.c +++ b/ggml.c @@ -355,6 +355,18 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { } } +static void ggml_i32_to_f32_row(const int32_t * x, float * y, int n) { + for (int i = 0; i < n; i++) { + y[i] = (float) x[i]; + } +} + +static void ggml_f32_to_i32_row(const float * x, int32_t * y, int n) { + for (int i = 0; i < n; i++) { + y[i] = (int32_t) x[i]; + } +} + // // timing // @@ -454,6 +466,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(int32_t), .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_i32_to_f32_row, + .from_float = (ggml_from_float_t) ggml_f32_to_i32_row, + .from_float_reference = (ggml_from_float_t) ggml_f32_to_i32_row, }, [GGML_TYPE_F32] = { .type_name = "f32", @@ -469,10 +484,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(ggml_fp16_t), .is_quantized = false, - .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, - .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, .nrows = 1, }, @@ -481,8 +496,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK4_0, .type_size = sizeof(block_q4_0), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q4_0, - .from_float = quantize_row_q4_0, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = (ggml_from_float_t) quantize_row_q4_0, .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, .vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, @@ -497,8 +512,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK4_1, .type_size = 
sizeof(block_q4_1), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q4_1, - .from_float = quantize_row_q4_1, + .to_float = (ggml_to_float_t) dequantize_row_q4_1, + .from_float = (ggml_from_float_t) quantize_row_q4_1, .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, @@ -537,8 +552,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK5_0, .type_size = sizeof(block_q5_0), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q5_0, - .from_float = quantize_row_q5_0, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = (ggml_from_float_t) quantize_row_q5_0, .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, @@ -549,8 +564,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK5_1, .type_size = sizeof(block_q5_1), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q5_1, - .from_float = quantize_row_q5_1, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float = (ggml_from_float_t) quantize_row_q5_1, .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, @@ -561,8 +576,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK8_0, .type_size = sizeof(block_q8_0), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q8_0, - .from_float = quantize_row_q8_0, + .to_float = (ggml_to_float_t) dequantize_row_q8_0, + .from_float = (ggml_from_float_t) quantize_row_q8_0, .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, @@ -577,7 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK8_1, .type_size = 
sizeof(block_q8_1), .is_quantized = true, - .from_float = quantize_row_q8_1, + .from_float = (ggml_from_float_t) quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, .nrows = 1, @@ -587,8 +602,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK_K, .type_size = sizeof(block_q2_K), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q2_K, - .from_float = quantize_row_q2_K, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = (ggml_from_float_t) quantize_row_q2_K, .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, @@ -599,8 +614,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK_K, .type_size = sizeof(block_q3_K), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q3_K, - .from_float = quantize_row_q3_K, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = (ggml_from_float_t) quantize_row_q3_K, .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, @@ -611,8 +626,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK_K, .type_size = sizeof(block_q4_K), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q4_K, - .from_float = quantize_row_q4_K, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = (ggml_from_float_t) quantize_row_q4_K, .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, @@ -623,8 +638,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK_K, .type_size = sizeof(block_q5_K), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q5_K, - .from_float = quantize_row_q5_K, + .to_float = 
(ggml_to_float_t) dequantize_row_q5_K, + .from_float = (ggml_from_float_t) quantize_row_q5_K, .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, @@ -635,8 +650,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK_K, .type_size = sizeof(block_q6_K), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q6_K, - .from_float = quantize_row_q6_K, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = (ggml_from_float_t) quantize_row_q6_K, .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, @@ -671,9 +686,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK_K, .type_size = sizeof(block_iq3_xxs), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, - .from_float = quantize_row_iq3_xxs, - .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference, + .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, + .from_float = (ggml_from_float_t) quantize_row_iq3_xxs, + .from_float_reference = (ggml_from_float_t) quantize_row_iq3_xxs_reference, .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, @@ -695,9 +710,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .blck_size = QK4_NL, .type_size = sizeof(block_iq4_nl), .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, - .from_float = quantize_row_iq4_nl, - .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference, + .to_float = (ggml_to_float_t) dequantize_row_iq4_nl, + .from_float = (ggml_from_float_t) quantize_row_iq4_nl, + .from_float_reference = (ggml_from_float_t) quantize_row_iq4_nl_reference, .vec_dot = ggml_vec_dot_iq4_nl_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, diff --git a/llama.cpp b/llama.cpp index 788bf3fbc..174ecf899 
100644 --- a/llama.cpp +++ b/llama.cpp @@ -5928,9 +5928,10 @@ struct llm_build_context { // get input vectors with right size const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type); - struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0); - struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); + struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0); // construct input embeddings (token, type, position) inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); @@ -5938,8 +5939,9 @@ struct llm_build_context { // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); inpL = ggml_add(ctx0, inpL, type_row0); + if (model.arch == LLM_ARCH_BERT) { - inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, ggml_cast(ctx0, inp_pos, GGML_TYPE_I32)), inpL); } cb(inpL, "inp_embd", -1); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 5e92d5742..0b90e560c 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -143,10 +143,10 @@ int main(int argc, char * argv[]) { continue; } - printf("Testing %s\n", ggml_type_name((ggml_type) i)); - ggml_quantize_init(ei); + if (qfns.from_float && qfns.to_float && qfns.vec_dot) { + printf("Testing %s\n", ggml_type_name((ggml_type) i)); + ggml_quantize_init(ei); - if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = type == GGML_TYPE_Q2_K ? 
MAX_QUANTIZATION_TOTAL_ERROR_2BITS : diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 48d9fae3d..ca4c156f3 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -275,7 +275,7 @@ int main(int argc, char * argv[]) { continue; } - if (qfns.from_float && qfns.to_float) { + if (qfns.from_float && qfns.to_float && qfns.vec_dot) { printf("%s\n", ggml_type_name(type)); ggml_quantize_init(type);