diff --git a/ggml-metal.m b/ggml-metal.m
index 017b72ce9..78cac5041 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -803,7 +803,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_GET_ROWS:
             {
-                return op->ne[3] == 1;
+                return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
             }
         default:
             return false;
diff --git a/ggml.c b/ggml.c
index 5145a7876..093d38d00 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19765,7 +19765,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    if (ggml_is_quantized(node->type)) {
+                    if (ggml_is_quantized(node->type) ||
+                        // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+                        (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
+                        (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
                 } break;
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 86c5492ad..6e968fc4e 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -863,7 +863,6 @@ QK_K = 256
 GGML_QUANT_SIZES = {
     GGMLQuantizationType.F32: (1, 4),
     GGMLQuantizationType.F16: (1, 2),
-    GGMLQuantizationType.BF16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
     GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
@@ -890,6 +889,7 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.I64: (1, 8),
     GGMLQuantizationType.F64: (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
+    GGMLQuantizationType.BF16: (1, 2),
 }
 
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index b27c1291e..41718e001 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
+    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
@@ -92,6 +92,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                     if (t->type == GGML_TYPE_F16) {
                         tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
+                    } else if (t->type == GGML_TYPE_BF16) {
+                        tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                     } else if (t->type == GGML_TYPE_F32) {
                         tv.push_back(*(float *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I32) {
@@ -1898,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     std::default_random_engine rng(0);
 
     const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
         GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
         GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
         GGML_TYPE_Q8_0,