diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ffdba7444..28e865e5c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1844,6 +1844,17 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("MambaForCausalLM")
+class MambaModel(Model):
+    model_arch = gguf.MODEL_ARCH.MAMBA
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+
 ###### CONVERSION LOGIC ######
diff --git a/ggml.c b/ggml.c
index f29b9f13f..597bf319e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1577,6 +1577,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
 inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_exp_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]);  }
 inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
 inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
@@ -1778,6 +1779,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "DIV",
     "SQR",
     "SQRT",
+    "EXP",
     "LOG",
     "SUM",
     "SUM_ROWS",
@@ -1811,6 +1813,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "DIAG_MASK_ZERO",
     "SOFT_MAX",
     "SOFT_MAX_BACK",
+    "SOFT_PLUS",
     "ROPE",
     "ROPE_BACK",
     "ALIBI",
@@ -1850,7 +1853,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1864,6 +1867,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "x/y",
     "x^2",
     "√x",
+    "e^x", // or should this be "exp(x)"?
     "log(x)",
     "Σx",
     "Σx_k",
@@ -1897,6 +1901,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "diag_mask_zero(x)",
     "soft_max(x)",
     "soft_max_back(x)",
+    "soft_plus(x)",
     "rope(x)",
     "rope_back(x)",
     "alibi(x)",
@@ -1936,7 +1941,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -3796,6 +3801,39 @@ struct ggml_tensor * ggml_sqrt_inplace(
     return ggml_sqrt_impl(ctx, a, true);
 }
 
+// ggml_exp
+
+static struct ggml_tensor * ggml_exp_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
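+    // note: the backward pass for GGML_OP_EXP is left unimplemented for now
+    // (ggml_compute_backward asserts on it); it would be cheap to add, since
+    // d/dx exp(x) = exp(x), so grad(a) only needs to accumulate grad(result) * result
+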
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_EXP;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_exp(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_exp_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_exp_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_exp_impl(ctx, a, true);
+}
+
 // ggml_log
 
 static struct ggml_tensor * ggml_log_impl(
@@ -5291,6 +5329,42 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
     return ggml_soft_max_back_impl(ctx, a, b, true);
 }
 
+// ggml_soft_plus
+
+static struct ggml_tensor * ggml_soft_plus_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool                  inplace) {
+
+    // TODO: does `a` need to be contiguous?
+
+    bool is_node = false;
+
+    if (a->grad) {
+        is_node = true; // TODO: implement backward pass
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_SOFT_PLUS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_soft_plus(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_soft_plus_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_soft_plus_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_soft_plus_impl(ctx, a, true);
+}
+
 // ggml_rope
 
 static struct ggml_tensor * ggml_rope_impl(
@@ -8593,6 +8667,49 @@ static void ggml_compute_forward_sqrt(
     }
 }
 
+// ggml_compute_forward_exp
+
+static void ggml_compute_forward_exp_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(params->ith == 0);
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_exp_f32(nc,
+                (float *) ((char *)  dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_exp(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_exp_f32(params, src0, dst);
+            } break;
+        case GGML_TYPE_F16: // TODO: use ggml_table_exp_f16
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_log
 
 static void ggml_compute_forward_log_f32(
@@ -12052,6 +12169,48 @@ static void ggml_compute_forward_soft_max_back(
     }
 }
 
+static void ggml_compute_forward_soft_plus_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(params->ith == 0);
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < nr; ++i) {
+        float * x = (float *) ((char *)  dst->data + i*( dst->nb[1]));
+        float * y = (float *) ((char *) src0->data + i*(src0->nb[1]));
+        for (int j = 0; j < nc; ++j) {
+            x[j] = logf(1.0f + expf(y[j]));
+        }
+    }
+}
+
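+// note: logf(1.0f + expf(v)) overflows to inf for large v; if that turns out to matter
+// here, a numerically safer form (an untested sketch, not taken from any reference
+// implementation) would be
+//     x[j] = fmaxf(y[j], 0.0f) + log1pf(expf(-fabsf(y[j])));
+// using the identity softplus(v) = max(v, 0) + log(1 + exp(-|v|))
+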
+static void ggml_compute_forward_soft_plus(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_soft_plus_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_alibi
 
 static void ggml_compute_forward_alibi_f32(
@@ -15447,6 +15606,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_sqrt(params, tensor);
             } break;
+        case GGML_OP_EXP:
+            {
+                ggml_compute_forward_exp(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_LOG:
             {
                 ggml_compute_forward_log(params, tensor);
@@ -15571,6 +15734,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_soft_max_back(params, tensor);
             } break;
+        case GGML_OP_SOFT_PLUS:
+            {
+                ggml_compute_forward_soft_plus(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_ROPE:
             {
                 ggml_compute_forward_rope(params, tensor);
@@ -16123,6 +16290,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_OP_EXP:
+            {
+                GGML_ASSERT(false); // TODO: implement
+            } break;
         case GGML_OP_LOG:
             {
                 if (src0->grad) {
@@ -16501,6 +16672,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_SOFT_PLUS:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ROPE:
             {
                 // necessary for llama
@@ -17243,6 +17418,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_SUB:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
+        case GGML_OP_EXP:
         case GGML_OP_LOG:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
@@ -17343,6 +17519,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
            } break;
+        case GGML_OP_SOFT_PLUS:
+            {
+                n_tasks = 1; // TODO
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;
@@ -17715,6 +17895,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     }
                 } break;
             case GGML_OP_SOFT_MAX:
+            case GGML_OP_SOFT_PLUS:
             case GGML_OP_ROPE:
                 {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
diff --git a/ggml.h b/ggml.h
index 0a6d3c051..efb62c598 100644
--- a/ggml.h
+++ b/ggml.h
@@ -410,6 +410,7 @@ extern "C" {
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_EXP,
         GGML_OP_LOG,
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
@@ -443,6 +444,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_SOFT_MAX_BACK,
+        GGML_OP_SOFT_PLUS,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
@@ -932,6 +934,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_exp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_exp_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_log(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
@@ -1420,6 +1430,15 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // soft_plus(x) = log(1 + exp(x)), element-wise
+    GGML_API struct ggml_tensor * ggml_soft_plus(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_plus_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements (DEPRECATED)
     // if mode & 2 == 1, GPT-NeoX style
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index a62139811..a28108383 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -113,6 +113,7 @@ class MODEL_ARCH(IntEnum):
     MINICPM    = auto()
     GEMMA      = auto()
     STARCODER2 = auto()
+    MAMBA      = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -144,6 +145,13 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM    = auto()
     ATTN_K_NORM    = auto()
     LAYER_OUT_NORM = auto()
+    SSM_IN         = auto()
+    SSM_CONV1D     = auto()
+    SSM_X          = auto()
+    SSM_DT         = auto()
+    SSM_A          = auto()
+    SSM_D          = auto()
+    SSM_OUT        = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -171,6 +179,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MINICPM:    "minicpm",
     MODEL_ARCH.GEMMA:      "gemma",
     MODEL_ARCH.STARCODER2: "starcoder2",
+    MODEL_ARCH.MAMBA:      "mamba",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -202,6 +211,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_DOWN_EXP:   "blk.{bid}.ffn_down.{xid}",
     MODEL_TENSOR.FFN_UP_EXP:     "blk.{bid}.ffn_up.{xid}",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
+    # FIXME: NAMES FOR MAMBA ARE NOT FINAL
+    MODEL_TENSOR.SSM_IN:         "blk.{bid}.ssm_in",
+    MODEL_TENSOR.SSM_CONV1D:     "blk.{bid}.ssm_conv1d",
+    MODEL_TENSOR.SSM_X:          "blk.{bid}.ssm_x",
+    MODEL_TENSOR.SSM_DT:         "blk.{bid}.ssm_dt",
+    MODEL_TENSOR.SSM_A:          "blk.{bid}.ssm_a",
+    MODEL_TENSOR.SSM_D:          "blk.{bid}.ssm_d",
+    MODEL_TENSOR.SSM_OUT:        "blk.{bid}.ssm_out",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -543,6 +560,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.MAMBA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_X,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_D,
+        MODEL_TENSOR.SSM_OUT,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index db2ec9704..d24d10dcb 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -20,6 +20,8 @@ class TensorNameMap:
             "wte",                   # gpt2
             "transformer.embd.wte",  # phi2
             "model.tok_embeddings",  # internlm2
+            "model.embedding",       # mamba
+            "backbone.embedding",    # mamba
         ),
 
         # Token type embeddings
@@ -44,7 +46,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -61,6 +63,8 @@ class TensorNameMap:
             "language_model.encoder.final_layernorm",  # persimmon
             "model.final_layernorm",                   # persimmon
             "lm_head.ln",                              # phi2
+            "model.norm_f",                            # mamba
+            "backbone.norm_f",                         # mamba
         ),
 
         # Rope frequencies
@@ -86,6 +90,8 @@ class TensorNameMap:
             "transformer.h.{bid}.ln",             # phi2
             "model.layers.layers.{bid}.norm",     # plamo
             "model.layers.{bid}.attention_norm",  # internlm2
+            "model.layers.{bid}.norm",            # mamba
+            "backbone.layers.{bid}.norm",         # mamba
         ),
 
         # Attention norm 2
@@ -282,7 +288,42 @@ class TensorNameMap:
         MODEL_TENSOR.LAYER_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm2",            # nomic-bert
-        )
+        ),
+
+        MODEL_TENSOR.SSM_IN: (
+            "model.layers.{bid}.in_proj",
+            "backbone.layers.{bid}.mixer.in_proj",
+        ),
+
+        MODEL_TENSOR.SSM_CONV1D: (
+            "model.layers.{bid}.conv1d",
+            "backbone.layers.{bid}.mixer.conv1d",
+        ),
+
+        MODEL_TENSOR.SSM_X: (
+            "model.layers.{bid}.x_proj",
+            "backbone.layers.{bid}.mixer.x_proj",
+        ),
+
+        MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj", + "backbone.layers.{bid}.mixer.dt_proj", + ), + + MODEL_TENSOR.SSM_A: ( + "model.layers.{bid}.A_log", + "backbone.layers.{bid}.mixer.A_log", + ), + + MODEL_TENSOR.SSM_D: ( + "model.layers.{bid}.D", + "backbone.layers.{bid}.mixer.D", + ), + + MODEL_TENSOR.SSM_OUT: ( + "model.layers.{bid}.out_proj", + "backbone.layers.{bid}.mixer.out_proj", + ), } mapping: dict[str, tuple[MODEL_TENSOR, str]] diff --git a/llama.cpp b/llama.cpp index c1f015791..2dc1cc1b3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -213,6 +213,7 @@ enum llm_arch { LLM_ARCH_MINICPM, LLM_ARCH_GEMMA, LLM_ARCH_STARCODER2, + LLM_ARCH_MAMBA, LLM_ARCH_UNKNOWN, }; @@ -241,6 +242,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -399,6 +401,15 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, + // TODO: maybe use longer names? + // TODO: can the in_proj and/or the out_proj instead re-use some of the above types? + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_X, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_OUT, }; static const std::map> LLM_TENSOR_NAMES = { @@ -801,6 +812,22 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_MAMBA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in"}, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d"}, + { LLM_TENSOR_SSM_X, "blk.%d.ssm_x"}, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt"}, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a"}, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d"}, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out"}, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1737,6 +1764,22 @@ struct llama_layer { struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_act; + + + // mamba proj + struct ggml_tensor * ssm_in; + struct ggml_tensor * ssm_x; + struct ggml_tensor * ssm_dt; + struct ggml_tensor * ssm_out; + + // mamba + struct ggml_tensor * ssm_conv1d; + struct ggml_tensor * ssm_a; + struct ggml_tensor * ssm_d; + + // mamba bias + struct ggml_tensor * ssm_conv1d_b; + struct ggml_tensor * ssm_dt_b; }; struct llama_kv_cell { @@ -3376,6 +3419,29 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MAMBA: + { + switch (hparams.n_layer) { + case 24: + switch (hparams.n_embd) { + case 768: model.type = e_model::MODEL_SMALL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 48: + switch (hparams.n_embd) { + case 1024: model.type = e_model::MODEL_MEDIUM; break; + case 1536: model.type = e_model::MODEL_LARGE; break; + case 2048: model.type = e_model::MODEL_XL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 64: + switch (hparams.n_embd) { + case 2560: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } default: (void)0; } @@ -4596,6 +4662,36 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}); } } break; + case LLM_ARCH_MAMBA: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // 
+                        // out_proj
+                        layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {2*n_embd, n_embd});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -7779,6 +7875,92 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_mamba() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // d_model
+        const int64_t n_embd  = hparams.n_embd;
+        const int64_t d_state = 16;
+        const int64_t d_conv  = 4;
+        // expand = 2
+        // d_inner = expand * d_model
+        const int64_t d_inner = 2 * n_embd; // FIXME: this is wrong
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // TODO: give it the right size
+        struct ggml_tensor * state;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            // FIXME: init attn_norm
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            // TODO: that's probably the wrong name.
+            cb(cur, "attn_norm", il);
+
+            // conv
+            {
+                // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens} (x and z concatenated)
+                struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur);
+                // split the above in two
+                struct ggml_tensor * x = ggml_view_1d(ctx0, xz, d_inner, 0);
+                struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, d_inner*ggml_element_size(xz));
+
+                // FIXME: this is wrong
+                cur = ggml_conv_1d(ctx0, model.layers[il].ssm_conv1d, cur, 1, d_conv - 1, 1);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].ssm_conv1d_b);
+
+                // TODO: there's some SiLU in there (but no ffn? or is the conv an ffn?)
+                cur = ggml_silu(ctx0, cur);
+            }
+
+            // ssm
+            {
+                // TODO: use ggml_soft_plus here
+            }
+
+            // TODO: there's some SiLU again towards the end. Can the `llm_build_ffn` helper be used?
+            // Maybe the best way is to implement it, _then_ check if that helper would do the same thing.
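+
+            // a hedged sketch of where the new ops are meant to fit, following the reference
+            // Mamba recurrence (dt, B, C name pieces of the x_proj split; none of them are
+            // tensors built above):
+            //   dt    = ggml_soft_plus(dt_proj(x_db) + dt_bias)   -- per-token step size
+            //   dA    = ggml_exp(dt * A), with A = -exp(ssm_a)
+            //   state = dA * state + (dt * B) * x
+            //   y     = C * state + ssm_d * x, then y = y * silu(z)
+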
+            // discretize
+            {
+            }
+
+            // residual
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        // the last step of each layer already makes these equivalent
+        // cur = inpL;
+
+        // final rmsnorm
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -12321,6 +12503,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
+        case LLM_ARCH_MAMBA:
            return LLAMA_ROPE_TYPE_NONE;
 
        // use what we call a normal RoPE, operating on pairs of consecutive head values
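
A quick way to sanity-check the two new ops in isolation is a tiny standalone program built against the patched ggml. This is a hypothetical test harness, not part of the patch; expected output is soft_plus(0) = log(2) ≈ 0.693 and exp(0) = 1:

    // softplus_test.c -- hypothetical smoke test for GGML_OP_EXP and GGML_OP_SOFT_PLUS
    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // one row of four test values
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        float * xd = (float *) x->data;
        xd[0] = -2.0f; xd[1] = 0.0f; xd[2] = 1.0f; xd[3] = 2.0f;

        struct ggml_tensor * y = ggml_soft_plus(ctx, x); // log(1 + exp(x))
        struct ggml_tensor * z = ggml_exp(ctx, x);       // exp(x)

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_build_forward_expand(gf, z);
        ggml_graph_compute_with_ctx(ctx, gf, 1); // single-threaded; SOFT_PLUS uses n_tasks = 1 anyway

        for (int i = 0; i < 4; ++i) {
            printf("x = %5.2f  soft_plus = %8.5f  exp = %8.5f\n",
                    xd[i], ((float *) y->data)[i], ((float *) z->data)[i]);
        }

        ggml_free(ctx);
        return 0;
    }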