Fix offloading layers to CUDA

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
Molly Sophia 2024-08-07 16:40:41 +08:00
parent 903089b5eb
commit 98ce5f43f0
2 changed files with 21 additions and 9 deletions

ggml.c

@@ -7675,7 +7675,8 @@ struct ggml_tensor * ggml_rwkv_wkv(
         is_node = true;
     }
 
-    const int64_t ne[4] = { S * H, n_tokens, 1, 1 };
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
     result->op = GGML_OP_RWKV_WKV;
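With this change, ggml_rwkv_wkv produces a single tensor that holds both the per-token wkv output and the updated recurrent state, so the second dimension grows from n_tokens to n_tokens + S. A minimal sketch of the resulting element counts, with made-up sizes that are not taken from any real model:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        const int64_t S = 64;          // head size (illustrative)
        const int64_t H = 32;          // head count (illustrative)
        const int64_t n_tokens = 5;    // tokens in the current batch (illustrative)
        const int64_t C = S * H;       // channels

        const int64_t ne0 = C;              // first dim of the result
        const int64_t ne1 = n_tokens + S;   // second dim: output rows followed by state rows

        printf("result tensor: %lld x %lld floats\n", (long long) ne0, (long long) ne1);
        printf("  wkv output: %lld floats (C * n_tokens)\n", (long long) (C * n_tokens));
        printf("  new state : %lld floats (C * S)\n", (long long) (C * S));
        return 0;
    }

Returning the state as part of the op's own output is what later lets the graph copy it back into the cache without writing into one of the op's input tensors.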
@@ -16853,11 +16854,12 @@ static void ggml_compute_forward_add_rel_pos(
 static void ggml_compute_forward_rwkv_wkv_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
-    const size_t T = dst->ne[1];
+    const size_t T = dst->src[1]->ne[3];
     const size_t C = dst->ne[0];
     const size_t H = dst->src[1]->ne[2];
 
     float * dst_data = (float *) dst->data;
+    float * state = ((float *) dst->data) + C * T;
 
     if (params->ith != 0) {
         return;
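With the enlarged output tensor, the CPU kernel can carve its state buffer directly out of dst->data: the first C * T floats are the per-token output and the trailing C * (C / H) floats hold the new state. A tiny standalone sketch of that split, using toy sizes rather than the real kernel:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        const size_t C = 8, H = 4, T = 3;      // channels, heads, tokens (toy sizes)
        const size_t S = C / H;                // head size
        float * dst_data = calloc(C * (T + S), sizeof(float));

        // same split as the kernel: output first, state right after it
        float * out   = dst_data;
        float * state = dst_data + C * T;

        state[0] = 1.0f;                       // writes land in the trailing state region
        printf("out starts at element 0, state at element %zu of %zu total\n",
               (size_t) (state - dst_data), C * (T + S));
        (void) out;
        free(dst_data);
        return 0;
    }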
@@ -16870,7 +16872,7 @@ static void ggml_compute_forward_rwkv_wkv_f32(
     float * r = (float *) dst->src[2]->data;
     float * time_faaaa = (float *) dst->src[3]->data;
     float * time_decay = (float *) dst->src[4]->data;
-    float * state = (float *) dst->src[5]->data;
+    memcpy(state, dst->src[5]->data, (C / H) * C * sizeof(float));
 
     size_t t_stride = H * (C / H);
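Instead of treating src[5] as a writable state buffer, the kernel now copies the incoming state into its own output region and updates it there, leaving the input tensor untouched. A toy illustration of that copy-then-update pattern, with the actual wkv recurrence replaced by a placeholder:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const size_t n = 4;
        float prev_state[4] = {1, 2, 3, 4};   // plays the role of dst->src[5]->data
        float out_state[4];                   // plays the role of dst->data + C * T

        memcpy(out_state, prev_state, n * sizeof(float));
        for (size_t i = 0; i < n; ++i) {
            out_state[i] += 0.5f;             // stand-in for the actual wkv recurrence
        }
        printf("prev_state[0] = %g, out_state[0] = %g\n", prev_state[0], out_state[0]);
        return 0;
    }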

llama.cpp

@@ -9366,7 +9366,7 @@ static struct ggml_tensor * llm_build_time_mix(
         const struct llama_layer * layer,
         struct ggml_tensor * current,
         struct ggml_tensor * x_prev,
-        struct ggml_tensor * wkv_state) {
+        struct ggml_tensor ** wkv_state) {
     size_t n_embed = current->ne[0];
     size_t n_tokens = current->ne[1];
     size_t head_size = layer->time_mix_first->ne[0];
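Passing wkv_state as ggml_tensor ** turns it into an out-parameter: llm_build_time_mix can now hand a new tensor (the state view created further down) back to its caller. A minimal, self-contained sketch of that pointer-to-pointer pattern, with a hypothetical tensor_t type standing in for ggml_tensor:

    #include <stdio.h>

    typedef struct { int id; } tensor_t;     // hypothetical stand-in for ggml_tensor

    static tensor_t old_state = { 0 };
    static tensor_t new_state = { 1 };

    // the callee replaces the caller's handle, in the same way llm_build_time_mix
    // replaces *wkv_state with a view of the wkv output
    static void time_mix_like(tensor_t ** state) {
        *state = &new_state;
    }

    int main(void) {
        tensor_t * wkv_state = &old_state;
        time_mix_like(&wkv_state);
        printf("caller's state handle now points at tensor %d\n", wkv_state->id);
        return 0;
    }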
@@ -9509,13 +9509,15 @@ static struct ggml_tensor * llm_build_time_mix(
         w,
         ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)
     );
-    w = ggml_exp(ctx, ggml_neg_inplace(ctx, ggml_exp(ctx, w)));
+    w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
     w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
     k = ggml_transpose(ctx, k);
     v = ggml_transpose(ctx, v);
     r = ggml_transpose(ctx, r);
 
-    current = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, wkv_state);
+    struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+    current = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
+    *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size, n_embed * n_tokens * sizeof(float));
 
     // ggml_group_norm considers groups in the third dimension.
     current = ggml_reshape_4d(ctx, current, 1, 1, n_embed, n_tokens);
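The combined result is then split with two 1-D views: the first n_embed * n_tokens floats are the per-token output, and the state view starts right after it, at a byte offset of n_embed * n_tokens * sizeof(float). A small sketch of that offset arithmetic, with illustrative sizes only:

    #include <stdio.h>

    int main(void) {
        const size_t n_embed   = 2048;   // illustrative
        const size_t n_tokens  = 5;      // illustrative
        const size_t head_size = 64;     // illustrative

        const size_t out_elems    = n_embed * n_tokens;          // length of the output view
        const size_t state_elems  = n_embed * head_size;         // length of the state view
        const size_t state_offset = out_elems * sizeof(float);   // byte offset of the state view

        printf("output view: %zu floats at byte offset 0\n", out_elems);
        printf("state  view: %zu floats at byte offset %zu\n", state_elems, state_offset);
        return 0;
    }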
@@ -15110,7 +15112,7 @@ struct llm_build_context {
                     n_embd * ggml_type_size(x_prev->type)
                 );
-                x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, wkv_state));
+                x = ggml_add(ctx0, x, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_state));
                 ggml_build_forward_expand(gf, x);
 
                 ggml_build_forward_expand(
                     gf,
@@ -15125,6 +15127,14 @@ struct llm_build_context {
                         att_shift
                     )
                 );
+                ggml_build_forward_expand(
+                    gf,
+                    ggml_cpy(
+                        ctx0,
+                        wkv_state,
+                        ggml_view_1d(ctx0, kv_self.v_l[layer_i], hparams.n_embd_v_s(), (kv_self.size - 1) * hparams.n_embd_v_s() * ggml_type_size(kv_self.k_l[layer_i]->type))
+                    )
+                );
 
                 x_norm = llm_build_norm(ctx0, x, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i);
                 x_prev = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
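The copy added here stores the updated wkv state into the last cell of the per-layer value cache, at element offset (kv_self.size - 1) * n_embd_v_s(). A toy, library-free equivalent of that write, with made-up buffer names and sizes:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const size_t n_state = 8;            // stands in for hparams.n_embd_v_s()
        const size_t n_cells = 4;            // stands in for kv_self.size
        float cache[4 * 8] = {0};            // stands in for kv_self.v_l[layer_i]
        float new_state[8] = {1, 2, 3, 4, 5, 6, 7, 8};

        // copy into the last cell, i.e. element offset (n_cells - 1) * n_state
        memcpy(cache + (n_cells - 1) * n_state, new_state, n_state * sizeof(float));
        printf("cache[last cell][0] = %g\n", cache[(n_cells - 1) * n_state]);
        return 0;
    }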
@@ -15151,7 +15161,7 @@ struct llm_build_context {
                     )
                 );
 
                 if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) {
-                    x = ggml_scale_inplace(ctx0, x, 0.5F);
+                    x = ggml_scale(ctx0, x, 0.5F);
                 }
             }
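The switch from ggml_scale_inplace to ggml_scale (and from ggml_neg_inplace to ggml_neg earlier) replaces in-place variants, which write back into their input's buffer, with out-of-place copies; presumably this avoids clobbering tensors that the partially offloaded graph still needs elsewhere. A plain-C sketch of the hazard the in-place form can create:

    #include <stdio.h>

    int main(void) {
        float x[3] = {2, 4, 6};

        // in-place: the original values are gone once another consumer reads x
        for (int i = 0; i < 3; ++i) x[i] *= 0.5f;
        printf("after in-place scale, x[0] = %g\n", x[0]);

        // out-of-place: the input stays intact for any other reader
        float a[3] = {2, 4, 6};
        float y[3];
        for (int i = 0; i < 3; ++i) y[i] = a[i] * 0.5f;
        printf("out-of-place: a[0] = %g, y[0] = %g\n", a[0], y[0]);
        return 0;
    }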