llama : support for falcon-mamba architecture (#9074)

* feat: initial support for llama.cpp * fix: lint * refactor: better refactor * Update src/llama.cpp Co-authored-by: compilade <git@compilade.net> * Update src/llama.cpp Co-authored-by: compilade <git@compilade.net> * fix: address comments * Update convert_hf_to_gguf.py Co-authored-by: compilade <git@compilade.net> * fix: add more cleanup and harmonization * fix: lint * Update gguf-py/gguf/gguf_writer.py Co-authored-by: compilade <git@compilade.net> * fix: change name * Apply suggestions from code review Co-authored-by: compilade <git@compilade.net> * add in operator * fix: add `dt_b_c_rms` in `llm_load_print_meta` * fix: correct printf format for bool * fix: correct print format * Update src/llama.cpp Co-authored-by: compilade <git@compilade.net> * llama : quantize more Mamba tensors * llama : use f16 as the fallback of fallback quant types --------- Co-authored-by: compilade <git@compilade.net>
2024-08-21 12:06:36 +04:00 · 2024-08-21 12:06:36 +04:00 · b40eb84895
commit b40eb84895
parent f63f603c87
5 changed files with 36 additions and 24 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -328,6 +328,7 @@ enum llm_kv {
    LLM_KV_SSM_CONV_KERNEL,
    LLM_KV_SSM_STATE_SIZE,
    LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_DT_B_C_RMS,

    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_PRE,
@ -426,6 +427,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_SSM_INNER_SIZE,                "%s.ssm.inner_size"     },
    { LLM_KV_SSM_STATE_SIZE,                "%s.ssm.state_size"     },
    { LLM_KV_SSM_TIME_STEP_RANK,            "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS,                "%s.ssm.dt_b_c_rms" },

    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
@ -2237,6 +2239,7 @@ struct llama_hparams {
    uint32_t ssm_d_inner = 0;
    uint32_t ssm_d_state = 0;
    uint32_t ssm_dt_rank = 0;
+    bool ssm_dt_b_c_rms = false;

    float f_clamp_kqv      = 0.0f;
    float f_max_alibi_bias = 0.0f;
@ -2286,6 +2289,7 @@ struct llama_hparams {
        if (this->ssm_d_inner != other.ssm_d_inner) return true;
        if (this->ssm_d_state != other.ssm_d_state) return true;
        if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;

        if (this->dec_start_token_id != other.dec_start_token_id) return true;

@ -5052,6 +5056,7 @@ static void llm_load_hparams(
                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@ -5907,6 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
        LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n",     __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms   = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
    }

    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
@ -12161,6 +12167,10 @@ struct llm_build_context {
        GGML_ASSERT(2 * d_model == d_inner);
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;
+        // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
+        const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+        // Use the same RMS norm as the final layer norm
+        const float norm_rms_eps = hparams.f_norm_rms_eps;

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@ -12241,6 +12251,13 @@ struct llm_build_context {
                struct ggml_tensor * B  = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
                struct ggml_tensor * C  = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));

+                // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
+                if (ssm_dt_b_c_rms) {
+                    dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
+                    B = ggml_rms_norm(ctx0, B, norm_rms_eps);
+                    C = ggml_rms_norm(ctx0, C, norm_rms_eps);
+                }
+
                // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
                dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
                dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
@ -16105,6 +16122,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
+        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+            new_type = GGML_TYPE_F16;
+        }
        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
        ++qs.n_fallback;
    }
@ -16433,8 +16453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // do not quantize Mamba's small yet 2D weights
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-        quantize &= name.find("ssm_x.weight")      == std::string::npos;
-        quantize &= name.find("ssm_dt.weight")     == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;