Introduce bfloat16 support

Many models on Hugging Face (e.g. Mistral, TinyLLaMA) use bfloat16 as their canonical floating point format.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───┐
    0b0000000000000000 brain16

This encoding has the same number of exponent bits as float32. That makes conversion relatively straightforward, even in the absence of hardware support. For example, converting brain16 to binary32 means simply shifting 16 bits to the left.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───────────────────┐
    0b00000000000000000000000000000000 IEEE binary32

The issue is that converting bf16 to fp16 can result in information loss. Only 13% of bf16 numbers can be precisely represented in fp16, which in practice ends up being 99.71% of Mistral 7b v0.2's weights; however, there is currently no way other than fp32 to get the others.

      ┌sign
      │
      │  ┌exponent
      │  │
      │  │    ┌mantissa
      │  │    │
      │┌─┴─┐┌─┴──────┐
    0b0000000000000000 IEEE binary16

This change fixes that by adding a bf16 data type to GGML. Support for CPU inference has been implemented along with optimizations for the AVX2, AVX512, and AVX512BF16 ISAs. Perplexity on Mistral 7b 0.2 improves somewhere around -0.0024 to -0.0046 compared to using fp16.
2024-03-31 07:42:59 -07:00 · 2024-03-31 07:42:59 -07:00 · 55e962a26b
commit 55e962a26b
parent c780e75305
8 changed files with 1577 additions and 16 deletions
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
        } else if (a->type == GGML_TYPE_F32) {
            return ggml_add(ctx, a, b);
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B", },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
--- a/ggml-impl.h
+++ b/ggml-impl.h
@ -518,6 +518,9 @@ size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml
 // return index, asserts if table is full
 size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 #define GGML_FP32_TO_BF16(x) ggml_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_bf16_to_fp32(x)
 #ifdef __cplusplus
 }
 #endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -370,6 +370,7 @@ extern "C" {
        GGML_TYPE_I64     = 27,
        GGML_TYPE_F64     = 28,
        GGML_TYPE_IQ1_M   = 29,
        GGML_TYPE_BF16    = 30,
        GGML_TYPE_COUNT,
    };
@ -410,6 +411,7 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ2_S   = 21, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
    };
    // available tensor operations:
@ -2390,6 +2392,90 @@ extern "C" {
    GGML_API int ggml_cpu_has_vsx        (void);
    GGML_API int ggml_cpu_has_matmul_int8(void);
    /**
     * Google Brain 16-bit floating point number.
     *
     *       ┌sign
     *       │
     *       │   ┌exponent
     *       │   │
     *       │   │      ┌mantissa
     *       │   │      │
     *       │┌──┴───┐┌─┴───┐
     *     0b0000000000000000 brain16
     *
     * Since bf16 has the same number of exponent bits as a 32bit float,
     * encoding and decoding numbers becomes relatively straightforward.
     *
     *       ┌sign
     *       │
     *       │   ┌exponent
     *       │   │
     *       │   │      ┌mantissa
     *       │   │      │
     *       │┌──┴───┐┌─┴───────────────────┐
     *     0b00000000000000000000000000000000 IEEE binary32
     *
     * For comparison, the standard fp16 format has fewer exponent bits.
     *
     *       ┌sign
     *       │
     *       │  ┌exponent
     *       │  │
     *       │  │    ┌mantissa
     *       │  │    │
     *       │┌─┴─┐┌─┴──────┐
     *     0b0000000000000000 IEEE binary16
     *
     * So be warned that converting between them destroys several bits.
     *
     * @see IEEE 754-2008
     */
    /**
     * Storage for one brain16 value; `x` holds the raw 16-bit encoding.
     */
    typedef struct {
        uint16_t x;
    } ggml_bf16_t;

    /**
     * Widens a brain16 value to float32.
     *
     * The 16 stored bits become the upper half of the binary32 bit pattern
     * and the lower half is filled with zeros.
     */
    static inline float ggml_bf16_to_fp32(ggml_bf16_t h) {
        union {
            uint32_t bits;
            float value;
        } pun;
        pun.bits = h.x;
        pun.bits <<= 16;
        return pun.value;
    }

    /**
     * Narrows a float32 value to brain16.
     *
     * The result is binary identical to AMD Zen4 VCVTNEPS2BF16:
     *  - NANs keep their upper payload bits and are made quiet;
     *  - inputs whose exponent field is zero (zeros and subnormals) are
     *    flushed to a zero that carries the input's sign;
     *  - everything else is rounded to nearest, with ties going to even.
     *
     * The code is branch-light and should vectorize nicely with modern
     * compilers.
     */
    static inline ggml_bf16_t ggml_fp32_to_bf16(float s) {
        union {
            uint32_t bits;
            float value;
        } pun;
        ggml_bf16_t out;
        uint32_t magnitude;
        uint32_t upper;

        pun.value = s;
        magnitude = pun.bits & 0x7fffffff; /* bit pattern without the sign */
        upper = pun.bits >> 16;            /* the half that brain16 keeps */

        if (magnitude > 0x7f800000) {
            /* nan: set the top mantissa bit so the result is quiet */
            out.x = (uint16_t)(upper | 64);
        } else if ((pun.bits & 0x7f800000) == 0) {
            /* zero exponent field: flush to zero, keeping only the sign */
            out.x = (uint16_t)(upper & 0x8000);
        } else {
            /* add just under half an ulp, plus one more when the lowest
             * kept bit is odd, so exact ties land on the even neighbour */
            uint32_t rounding = 0x7fff + (upper & 1);
            out.x = (uint16_t)((pun.bits + rounding) >> 16);
        }
        return out;
    }
    GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int n);
    GGML_API void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int n);
    //
    // Internal types and functions exposed for tests and benchmarks
    //
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -817,6 +817,7 @@ class GGMLQuantizationType(IntEnum):
    I64     = 27
    F64     = 28
    IQ1_M   = 29
    BF16    = 30
 class GGUFEndian(IntEnum):
@ -862,6 +863,7 @@ QK_K = 256
 GGML_QUANT_SIZES = {
    GGMLQuantizationType.F32:     (1, 4),
    GGMLQuantizationType.F16:     (1, 2),
    GGMLQuantizationType.BF16:    (1, 2),
    GGMLQuantizationType.Q4_0:    (32, 2 + 16),
    GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
    GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
--- a/llama.cpp
+++ b/llama.cpp
@ -3175,6 +3175,7 @@ struct llama_model_loader {
            switch (type_max) {
                case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
                case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
                case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
                case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
                case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
                case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:     return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@ -6129,6 +6131,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
            || !(
                model.ftype == LLAMA_FTYPE_ALL_F32 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
            )
@ -14158,13 +14161,16 @@ static void llama_tensor_dequantize_internal(
        if (qtype.to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
-    } else if (tensor->type != GGML_TYPE_F16) {
+    } else if (tensor->type != GGML_TYPE_F16 &&
               tensor->type != GGML_TYPE_BF16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }
    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (tensor->type == GGML_TYPE_BF16) {
            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype.to_float(tensor->data, f32_output, nelements);
        } else {
@ -14173,7 +14179,14 @@ static void llama_tensor_dequantize_internal(
        return;
    }
-    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size;
    if (tensor->type == GGML_TYPE_F16 ||
        tensor->type == GGML_TYPE_BF16) {
        block_size = 1;
    } else {
        block_size = (size_t)ggml_blck_size(tensor->type);
    }
    size_t block_size_bytes = ggml_type_size(tensor->type);
    GGML_ASSERT(nelements % block_size == 0);
@ -14192,6 +14205,8 @@ static void llama_tensor_dequantize_internal(
        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
            } else if (typ == GGML_TYPE_BF16) {
                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
            } else {
                qtype.to_float(inbuf, outbuf, nels);
            }
@ -14552,6 +14567,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
        // K-quants
--- a/llama.h
+++ b/llama.h
@ -137,6 +137,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };