mamba : begin working on support for Mamba SSM

parent 67be2ce101
commit 8cd0a286b4

6 changed files with 469 additions and 4 deletions
@@ -1844,6 +1844,17 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("MambaForCausalLM")
+class MambaModel(Model):
+    model_arch = gguf.MODEL_ARCH.MAMBA
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+
 ###### CONVERSION LOGIC ######
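Note: at this stage set_gguf_parameters only records the model name, the embedding width (d_model), the layer count (n_layer) and the file type. The Mamba-specific shape parameters (d_state, d_conv and the expansion factor, hard-coded further down in build_mamba as 16, 4 and 2) are not yet written to the GGUF metadata, which is why the C++ side cannot derive d_inner from metadata yet (see the FIXME in build_mamba).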
ggml.c (185)

@@ -1577,6 +1577,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
 inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }

@@ -1778,6 +1779,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "DIV",
     "SQR",
     "SQRT",
+    "EXP",
     "LOG",
     "SUM",
     "SUM_ROWS",

@@ -1811,6 +1813,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "DIAG_MASK_ZERO",
     "SOFT_MAX",
     "SOFT_MAX_BACK",
+    "SOFT_PLUS",
     "ROPE",
     "ROPE_BACK",
     "ALIBI",

@@ -1850,7 +1853,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1864,6 +1867,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "x/y",
     "x^2",
     "√x",
+    "e^x", // or should this be "exp(x)"?
     "log(x)",
     "Σx",
     "Σx_k",

@@ -1897,6 +1901,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "diag_mask_zero(x)",
     "soft_max(x)",
     "soft_max_back(x)",
+    "soft_plus(x)",
     "rope(x)",
     "rope_back(x)",
     "alibi(x)",

@@ -1936,7 +1941,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
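The two new operators, GGML_OP_EXP and GGML_OP_SOFT_PLUS, raise GGML_OP_COUNT from 72 to 74, so the GGML_OP_NAME and GGML_OP_SYMBOL tables above each gain one entry in the same position as the enum values added to ggml.h below; the updated static_asserts are what keep these tables from silently drifting out of sync with the enum.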
@@ -3796,6 +3801,39 @@ struct ggml_tensor * ggml_sqrt_inplace(
     return ggml_sqrt_impl(ctx, a, true);
 }
 
+// ggml_exp
+
+static struct ggml_tensor * ggml_exp_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_EXP;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_exp(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_exp_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_exp_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_exp_impl(ctx, a, true);
+}
+
 // ggml_log
 
 static struct ggml_tensor * ggml_log_impl(

@@ -5291,6 +5329,42 @@ struct ggml_tensor * ggml_soft_max_back_inplace(
     return ggml_soft_max_back_impl(ctx, a, b, true);
 }
 
+// ggml_soft_plus
+
+struct ggml_tensor * ggml_soft_plus_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+
+    // TODO: does `a` need to be contiguous?
+
+    bool is_node = false;
+
+    if (a->grad) {
+        is_node = true; // TODO : implement backward pass
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_SOFT_PLUS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_soft_plus(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_soft_plus_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_soft_plus_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_soft_plus_impl(ctx, a, true);
+}
+
 // ggml_rope
 
 static struct ggml_tensor * ggml_rope_impl(
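One small inconsistency visible above: ggml_soft_plus_impl is not declared static, unlike ggml_exp_impl and the neighbouring ggml_log_impl, even though only ggml_soft_plus and ggml_soft_plus_inplace are exposed in ggml.h; marking it static would match the existing pattern.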
@@ -8593,6 +8667,49 @@ static void ggml_compute_forward_sqrt(
     }
 }
 
+// ggml_compute_forward_exp
+
+static void ggml_compute_forward_exp_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(params->ith == 0);
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_exp_f32(nc,
+                (float *) ((char *)  dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_exp(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_exp_f32(params, src0, dst);
+            } break;
+        case GGML_TYPE_F16: // TODO: use ggml_table_exp_f16
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_log
 
 static void ggml_compute_forward_log_f32(
@@ -12052,6 +12169,48 @@ static void ggml_compute_forward_soft_max_back(
     }
 }
 
+static void ggml_compute_forward_soft_plus_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(params->ith == 0);
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < nr; ++i) {
+        float * x = (float *) ((char *)  dst->data + i*( dst->nb[1]));
+        float * y = (float *) ((char *) src0->data + i*(src0->nb[1]));
+        for (int j = 0; j < nc; ++j) {
+            x[j] = logf(1.0f + expf(y[j]));
+        }
+    }
+}
+
+static void ggml_compute_forward_soft_plus(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_soft_plus_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_alibi
 
 static void ggml_compute_forward_alibi_f32(
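One hedged observation on the forward pass above: computing softplus directly as logf(1.0f + expf(v)) overflows to inf for large positive inputs. If that ever matters for Mamba's dt values, a numerically stable helper (not part of this commit, just a possible follow-up sketch) could look like this:

#include <math.h>

// Numerically stable softplus: clamp the regimes where log(1 + e^x) is
// indistinguishable from x (large x) or from e^x (very negative x), and
// only evaluate exp where it cannot overflow.
static inline float softplus_f32(float x) {
    if (x > 20.0f) {
        return x;               // log(1 + e^x) ~= x for large x
    }
    if (x < -20.0f) {
        return expf(x);         // log(1 + e^x) ~= e^x for very negative x
    }
    return log1pf(expf(x));     // safe in the middle range
}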
@@ -15447,6 +15606,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_sqrt(params, tensor);
             } break;
+        case GGML_OP_EXP:
+            {
+                ggml_compute_forward_exp(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_LOG:
             {
                 ggml_compute_forward_log(params, tensor);
@@ -15571,6 +15734,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_soft_max_back(params, tensor);
             } break;
+        case GGML_OP_SOFT_PLUS:
+            {
+                ggml_compute_forward_soft_plus(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_ROPE:
             {
                 ggml_compute_forward_rope(params, tensor);
@@ -16123,6 +16290,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_OP_EXP:
+            {
+                GGML_ASSERT(false); // TODO: implement
+            } break;
         case GGML_OP_LOG:
             {
                 if (src0->grad) {

@@ -16501,6 +16672,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_SOFT_PLUS:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ROPE:
             {
                 // necessary for llama

@@ -17243,6 +17418,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_SUB:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
+        case GGML_OP_EXP:
         case GGML_OP_LOG:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:

@@ -17343,6 +17519,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            {
                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
            } break;
+        case GGML_OP_SOFT_PLUS:
+            {
+                n_tasks = 1; //TODO
+            } break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
                n_tasks = n_threads;

@@ -17715,6 +17895,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                    }
                } break;
            case GGML_OP_SOFT_MAX:
+            case GGML_OP_SOFT_PLUS:
            case GGML_OP_ROPE:
                {
                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
ggml.h (19)

@@ -410,6 +410,7 @@ extern "C" {
         GGML_OP_DIV,
         GGML_OP_SQR,
         GGML_OP_SQRT,
+        GGML_OP_EXP,
         GGML_OP_LOG,
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,

@@ -443,6 +444,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_ZERO,
         GGML_OP_SOFT_MAX,
         GGML_OP_SOFT_MAX_BACK,
+        GGML_OP_SOFT_PLUS,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,

@@ -932,6 +934,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_exp(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_exp_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_log(
             struct ggml_context * ctx,
             struct ggml_tensor * a);

@@ -1420,6 +1430,15 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_soft_plus(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_plus_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // rotary position embedding
     // if mode & 1 == 1, skip n_past elements (DEPRECATED)
     // if mode & 2 == 1, GPT-NeoX style
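The declarations above complete the public surface for the two new operators. As a rough illustration (a hypothetical standalone test, not part of this commit; the context size and input values are arbitrary), they can be driven through the usual ggml graph API like any other unary op:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // softplus(x) = log(1 + exp(x)) and exp(x), both applied element-wise
    struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * sp = ggml_soft_plus(ctx, x);
    struct ggml_tensor * ex = ggml_exp(ctx, x);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, sp);
    ggml_build_forward_expand(gf, ex);

    ggml_set_f32(x, 1.0f);                     // fill the input with ones
    ggml_graph_compute_with_ctx(ctx, gf, 1);   // single thread is enough here

    printf("softplus(1) = %f, exp(1) = %f\n",
            ggml_get_f32_1d(sp, 0), ggml_get_f32_1d(ex, 0));

    ggml_free(ctx);
    return 0;
}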
@@ -113,6 +113,7 @@ class MODEL_ARCH(IntEnum):
     MINICPM = auto()
     GEMMA = auto()
     STARCODER2 = auto()
+    MAMBA = auto()
 
 
 class MODEL_TENSOR(IntEnum):

@@ -144,6 +145,13 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
+    SSM_IN = auto()
+    SSM_CONV1D = auto()
+    SSM_X = auto()
+    SSM_DT = auto()
+    SSM_A = auto()
+    SSM_D = auto()
+    SSM_OUT = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

@@ -171,6 +179,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MINICPM: "minicpm",
     MODEL_ARCH.GEMMA: "gemma",
     MODEL_ARCH.STARCODER2: "starcoder2",
+    MODEL_ARCH.MAMBA: "mamba",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -202,6 +211,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
+    # FIXME: NAMES FOR MAMBA ARE NOT FINAL
+    MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
+    MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
+    MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
+    MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
+    MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
+    MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
+    MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -543,6 +560,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.MAMBA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.SSM_IN,
+        MODEL_TENSOR.SSM_CONV1D,
+        MODEL_TENSOR.SSM_X,
+        MODEL_TENSOR.SSM_DT,
+        MODEL_TENSOR.SSM_A,
+        MODEL_TENSOR.SSM_D,
+        MODEL_TENSOR.SSM_OUT,
+    ],
     # TODO
 }
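The seven SSM tensor types line up one-to-one with the per-layer parameters of the reference Mamba implementation: in_proj, conv1d, x_proj, dt_proj, A_log, D and out_proj, as the HF-to-GGUF name mappings added to TensorNameMap below make explicit.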
@@ -20,6 +20,8 @@ class TensorNameMap:
             "wte", # gpt2
             "transformer.embd.wte", # phi2
             "model.tok_embeddings", # internlm2
+            "model.embedding", # mamba
+            "backbone.embedding", # mamba
         ),
 
         # Token type embeddings

@@ -44,7 +46,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out", # gptneox
-            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
+            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba
             "output", # llama-pth bloom internlm2
             "word_embeddings_for_head", # persimmon
             "lm_head.linear", # phi2

@@ -61,6 +63,8 @@ class TensorNameMap:
             "language_model.encoder.final_layernorm", # persimmon
             "model.final_layernorm", # persimmon
             "lm_head.ln", # phi2
+            "model.norm_f", # mamba
+            "backbone.norm_f", # mamba
         ),
 
         # Rope frequencies

@@ -86,6 +90,8 @@ class TensorNameMap:
             "transformer.h.{bid}.ln", # phi2
             "model.layers.layers.{bid}.norm", # plamo
             "model.layers.{bid}.attention_norm", # internlm2
+            "model.layers.{bid}.norm", # mamba
+            "backbone.layers.{bid}.mixer.norm", # mamba
         ),
 
         # Attention norm 2

@@ -282,7 +288,42 @@ class TensorNameMap:
         MODEL_TENSOR.LAYER_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm", # bert
             "encoder.layers.{bid}.norm2", # nomic-bert
-        )
+        ),
+
+        MODEL_TENSOR.SSM_IN: (
+            "model.layers.{bid}.in_proj",
+            "backbone.layers.{bid}.mixer.in_proj",
+        ),
+
+        MODEL_TENSOR.SSM_CONV1D: (
+            "model.layers.{bid}.conv1d",
+            "backbone.layers.{bid}.mixer.conv1d",
+        ),
+
+        MODEL_TENSOR.SSM_X: (
+            "model.layers.{bid}.x_proj",
+            "backbone.layers.{bid}.mixer.x_proj",
+        ),
+
+        MODEL_TENSOR.SSM_DT: (
+            "model.layers.{bid}.dt_proj",
+            "backbone.layers.{bid}.mixer.dt_proj",
+        ),
+
+        MODEL_TENSOR.SSM_A: (
+            "model.layers.{bid}.A_log",
+            "backbone.layers.{bid}.mixer.A_log",
+        ),
+
+        MODEL_TENSOR.SSM_D: (
+            "model.layers.{bid}.D",
+            "backbone.layers.{bid}.mixer.D",
+        ),
+
+        MODEL_TENSOR.SSM_OUT: (
+            "model.layers.{bid}.out_proj",
+            "backbone.layers.{bid}.mixer.out_proj",
+        ),
     }
 
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
llama.cpp (183)

@@ -213,6 +213,7 @@ enum llm_arch {
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
     LLM_ARCH_STARCODER2,
+    LLM_ARCH_MAMBA,
     LLM_ARCH_UNKNOWN,
 };
 

@@ -241,6 +242,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM, "minicpm" },
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 

@@ -399,6 +401,15 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    // TODO: maybe use longer names?
+    // TODO: can the in_proj and/or the out_proj instead re-use some of the above types?
+    LLM_TENSOR_SSM_IN,
+    LLM_TENSOR_SSM_CONV1D,
+    LLM_TENSOR_SSM_X,
+    LLM_TENSOR_SSM_DT,
+    LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_D,
+    LLM_TENSOR_SSM_OUT,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {

@@ -801,6 +812,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_MAMBA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in"},
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d"},
+            { LLM_TENSOR_SSM_X, "blk.%d.ssm_x"},
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt"},
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a"},
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d"},
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out"},
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {

@@ -1737,6 +1764,22 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
     struct ggml_tensor * ffn_act;
+
+    // mamba proj
+    struct ggml_tensor * ssm_in;
+    struct ggml_tensor * ssm_x;
+    struct ggml_tensor * ssm_dt;
+    struct ggml_tensor * ssm_out;
+
+    // mamba
+    struct ggml_tensor * ssm_conv1d;
+    struct ggml_tensor * ssm_a;
+    struct ggml_tensor * ssm_d;
+
+    // mamba bias
+    struct ggml_tensor * ssm_conv1d_b;
+    struct ggml_tensor * ssm_dt_b;
 };
 
 struct llama_kv_cell {

@@ -3376,6 +3419,29 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MAMBA:
+            {
+                switch (hparams.n_layer) {
+                    case 24:
+                        switch (hparams.n_embd) {
+                            case 768: model.type = e_model::MODEL_SMALL; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 48:
+                        switch (hparams.n_embd) {
+                            case 1024: model.type = e_model::MODEL_MEDIUM; break;
+                            case 1536: model.type = e_model::MODEL_LARGE; break;
+                            case 2048: model.type = e_model::MODEL_XL; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 64:
+                        switch (hparams.n_embd) {
+                            case 2560: model.type = e_model::MODEL_3B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            }
         default: (void)0;
     }
 
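The (n_layer, n_embd) combinations recognized here appear to match the published state-spaces/mamba checkpoints: 24/768 for mamba-130m, 48/1024 for mamba-370m, 48/1536 for mamba-790m, 48/2048 for mamba-1.4b and 64/2560 for mamba-2.8b.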
@@ -4596,6 +4662,36 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
                     }
                 } break;
+            case LLM_ARCH_MAMBA:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    }
+                    // TODO: MAMBA
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        // norm
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        // TODO: D, in_proj, conv1d, x_proj, dt_proj, A_log, out_proj
+                        // TODO: what's the difference between ctx_layer and ctx_split?
+                        // A: It seems that ctx_split is for matrices (2d???) while ctx_layer is for other things (like 1D bias and norms, probably.)
+
+                        // out_proj
+                        layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {2*n_embd, n_embd});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -7779,6 +7875,92 @@ struct llm_build_context {
 
         return gf;
     }
 
+    struct ggml_cgraph * build_mamba() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // d_model
+        const int64_t n_embd  = hparams.n_embd;
+        const int64_t d_state = 16;
+        const int64_t d_conv  = 4;
+        // expand = 2
+        // d_inner = expand * d_model
+        const int64_t d_inner = 2 * n_embd; // FIXME: this is wrong
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // TODO: give it the right size
+        struct ggml_tensor * state;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+            // FIXME: init attn_norm
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            // TODO: that's probably the wrong name.
+            cb(cur, "attn_norm", il);
+
+            // conv
+            {
+                // [] * [] = [2*n_embd]
+                struct ggml_tensor * xz = ggml_mul_mat(ctx0, cur, model.layers[il].ssm_in);
+                // split the above in two
+                struct ggml_tensor * x = ggml_view_1d(ctx0, xz, d_inner, 0);
+                struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, d_inner);
+
+                // FIXME: this is wrong
+                cur = ggml_conv_1d(ctx0, cur, model.layers[il].ssm_conv1d, 1, d_conv - 1, 1);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].ssm_conv1d_b);
+
+                // TODO: there's some SiLU in there (but no ffn? or is the conv an ffn?)
+                cur = ggml_silu(ctx0, cur);
+            }
+
+            // ssm
+            {
+                // TODO: use ggml_soft_plus here
+            }
+
+            // TODO: there's some SiLU again towards the end. Can the `llm_build_ffn` helper be used?
+            // Maybe the best way is to implement it, _then_ check if that helper would do the same thing.
+            // discretize
+            {
+            }
+
+            // residual
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        // the last step of each layer already makes these equivalent
+        // cur = inpL;
+
+        // final rmsnorm
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
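For orientation, the step that the empty `// ssm` and `// discretize` blocks in build_mamba still need to express is the standard selective-state-space recurrence. A scalar reference sketch follows (plain C with hypothetical float arrays standing in for the ggml tensors, not the eventual graph code): dt comes out of the new soft_plus operator, and exp(dt * A) is where the new exp operator is needed.

#include <math.h>

// One Mamba SSM step for a single channel, written out in plain C.
// d_state is the per-channel state size (16 above); A, B, C, D, dt_raw and
// the hidden state h are hypothetical arrays/scalars, not the ggml tensors.
static float mamba_ssm_step(float x, float dt_raw, const float * A, const float * B,
                            const float * C, float D, float * h, int d_state) {
    // discretization step size: dt = softplus(dt_raw), i.e. GGML_OP_SOFT_PLUS
    const float dt = logf(1.0f + expf(dt_raw));

    float y = 0.0f;
    for (int n = 0; n < d_state; ++n) {
        // dA = exp(dt * A[n]) is where GGML_OP_EXP comes in;
        // dB = dt * B[n] is the simplified discretization used by Mamba
        const float dA = expf(dt * A[n]);
        const float dB = dt * B[n];

        // state update and output: h' = dA*h + dB*x,  y += C[n]*h'
        h[n] = dA * h[n] + dB * x;
        y   += C[n] * h[n];
    }

    // skip connection through D
    return y + D * x;
}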
@@ -12321,6 +12503,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
+        case LLM_ARCH_MAMBA:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values