llama : support for falcon-mamba architecture (#9074)
* feat: initial support for llama.cpp
* fix: lint
* refactor: better refactor
* Update src/llama.cpp
  Co-authored-by: compilade <git@compilade.net>
* Update src/llama.cpp
  Co-authored-by: compilade <git@compilade.net>
* fix: address comments
* Update convert_hf_to_gguf.py
  Co-authored-by: compilade <git@compilade.net>
* fix: add more cleanup and harmonization
* fix: lint
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* fix: change name
* Apply suggestions from code review
  Co-authored-by: compilade <git@compilade.net>
* add in operator
* fix: add `dt_b_c_rms` in `llm_load_print_meta`
* fix: correct printf format for bool
* fix: correct print format
* Update src/llama.cpp
  Co-authored-by: compilade <git@compilade.net>
* llama : quantize more Mamba tensors
* llama : use f16 as the fallback of fallback quant types

---------

Co-authored-by: compilade <git@compilade.net>
Parent: f63f603c87
Commit: b40eb84895
5 changed files with 36 additions and 24 deletions
src/llama.cpp:

@@ -328,6 +328,7 @@ enum llm_kv {
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
     LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_DT_B_C_RMS,
 
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
@@ -426,6 +427,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_INNER_SIZE,     "%s.ssm.inner_size" },
     { LLM_KV_SSM_STATE_SIZE,     "%s.ssm.state_size" },
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS,     "%s.ssm.dt_b_c_rms" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre" },
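For context, the strings in LLM_KV_NAMES are printf-style templates; the architecture name is substituted when a key is looked up. A minimal sketch of that expansion, assuming (not confirmed by this hunk) that FalconMamba models reuse the existing "mamba" architecture string; expand_kv is a hypothetical helper, not a llama.cpp function:

    #include <cstdio>
    #include <string>

    // Expand a KV name template such as "%s.ssm.dt_b_c_rms" with an
    // architecture string, mirroring how the lookup table is consumed.
    static std::string expand_kv(const char * tmpl, const char * arch) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), tmpl, arch); // "%s" -> arch
        return std::string(buf);
    }

    int main() {
        // Assuming the "mamba" architecture string, the new metadata key
        // would appear in the GGUF file as "mamba.ssm.dt_b_c_rms".
        std::printf("%s\n", expand_kv("%s.ssm.dt_b_c_rms", "mamba").c_str());
        return 0;
    }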
@@ -2237,6 +2239,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    bool ssm_dt_b_c_rms = false;
 
     float f_clamp_kqv      = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -2286,6 +2289,7 @@ struct llama_hparams {
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
         if (this->ssm_d_state != other.ssm_d_state) return true;
         if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
         if (this->dec_start_token_id != other.dec_start_token_id) return true;
@@ -5052,6 +5056,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                 ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
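The extra false argument in the new get_key call appears to mark the key as optional, so GGUF files written before this change (which lack ssm.dt_b_c_rms) still load and hparams.ssm_dt_b_c_rms keeps its default of false. A minimal sketch of that pattern, assuming a plain map-backed store rather than llama.cpp's actual llama_model_loader:

    #include <map>
    #include <stdexcept>
    #include <string>

    // Sketch of an "optional key" getter: when required is false and the key
    // is missing, the destination keeps its current (default) value.
    template <typename T>
    bool get_key_sketch(const std::map<std::string, T> & kv,
                        const std::string & key, T & dst, bool required = true) {
        auto it = kv.find(key);
        if (it == kv.end()) {
            if (required) {
                throw std::runtime_error("key not found: " + key);
            }
            return false; // e.g. ssm_dt_b_c_rms stays false for older GGUFs
        }
        dst = it->second;
        return true;
    }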
@@ -5907,6 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
         LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }
 
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
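One of the squashed fixes above ("correct printf format for bool") concerns this new log line: a bool passed through a variadic call undergoes integer promotion, so %d is the matching conversion specifier. A trivial illustration:

    #include <cstdio>

    int main() {
        bool ssm_dt_b_c_rms = true;
        // bool is promoted to int in variadic calls, so %d prints 1 or 0
        std::printf("ssm_dt_b_c_rms = %d\n", ssm_dt_b_c_rms);
        return 0;
    }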
@@ -12161,6 +12167,10 @@ struct llm_build_context {
         GGML_ASSERT(2 * d_model == d_inner);
         const int64_t d_state = hparams.ssm_d_state;
         const int64_t dt_rank = hparams.ssm_dt_rank;
+        // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on B and Dt layers
+        const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+        // Use the same RMS norm as the final layer norm
+        const float norm_rms_eps = hparams.f_norm_rms_eps;
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -12241,6 +12251,13 @@
             struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
             struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
+
+            // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
+            if (ssm_dt_b_c_rms) {
+                dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
+                B  = ggml_rms_norm(ctx0, B,  norm_rms_eps);
+                C  = ggml_rms_norm(ctx0, C,  norm_rms_eps);
+            }
 
             // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
             dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
             dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
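ggml_rms_norm only builds a graph node here; the math it encodes is ordinary RMS normalization along the first dimension, with no learned scale applied to dt, B, or C in this path. An illustrative scalar sketch of that computation for a single row (assuming the usual definition of RMS norm, not the actual ggml kernel):

    #include <cmath>
    #include <vector>

    // RMS-normalize one row in place: x_i <- x_i / sqrt(mean(x^2) + eps).
    // This mirrors what the new ggml_rms_norm nodes do to each dt, B and C
    // column when ssm_dt_b_c_rms is set.
    static void rms_norm_row(std::vector<float> & x, float eps) {
        double sum_sq = 0.0;
        for (float v : x) {
            sum_sq += (double) v * v;
        }
        const float scale = 1.0f / std::sqrt((float) (sum_sq / x.size()) + eps);
        for (float & v : x) {
            v *= scale;
        }
    }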
@@ -16105,6 +16122,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+            new_type = GGML_TYPE_F16;
+        }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
         ++qs.n_fallback;
     }
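This is the "f16 as the fallback of fallback quant types" change: if a tensor's row length is not a multiple of the chosen fallback's block size (k-quants pack 256 values per super-block, Q8_0 packs 32), the code now drops to F16, which stores one value per element and therefore always divides evenly. A small sketch of the decision, with block sizes written as literals for illustration (llama.cpp queries ggml_blck_size instead):

    #include <cstdint>
    #include <cstdio>

    enum class qtype { q8_0, q6_k, f16 };

    // Illustrative block sizes only.
    static int64_t blck_size_sketch(qtype t) {
        switch (t) {
            case qtype::q8_0: return 32;   // 32 values per Q8_0 block
            case qtype::q6_k: return 256;  // 256 values per k-quant super-block
            case qtype::f16:  return 1;    // one value per element, always fits
        }
        return 1;
    }

    // Mirror of the new fallback rule: if the row length ne0 is not a multiple
    // of the block size, fall back to F16 rather than failing.
    static qtype pick_fallback(qtype wanted, int64_t ne0) {
        return (ne0 % blck_size_sketch(wanted) != 0) ? qtype::f16 : wanted;
    }

    int main() {
        // a row length of 48 is not a multiple of 32, so F16 is chosen
        std::printf("%d\n", pick_fallback(qtype::q8_0, 48) == qtype::f16);
        return 0;
    }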
@@ -16433,8 +16453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize Mamba's small yet 2D weights
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-        quantize &= name.find("ssm_x.weight") == std::string::npos;
-        quantize &= name.find("ssm_dt.weight") == std::string::npos;
 
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
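This hunk implements "quantize more Mamba tensors": with ssm_x.weight and ssm_dt.weight removed from the exclusion list, only ssm_conv1d.weight remains unquantized among the SSM 2D weights. A sketch of the resulting name filter (ssm_weight_quantizable is a hypothetical helper, not a function in llama.cpp):

    #include <cstdio>
    #include <string>

    // After this change, only the small conv1d tensor keeps its original type.
    static bool ssm_weight_quantizable(const std::string & name) {
        return name.find("ssm_conv1d.weight") == std::string::npos;
    }

    int main() {
        std::printf("%d\n", ssm_weight_quantizable("blk.0.ssm_conv1d.weight")); // 0: kept as-is
        std::printf("%d\n", ssm_weight_quantizable("blk.0.ssm_x.weight"));      // 1: now quantized
        std::printf("%d\n", ssm_weight_quantizable("blk.0.ssm_dt.weight"));     // 1: now quantized
        return 0;
    }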