llama : keep same graph topology even when n_outputs == 0

This commit is contained in:
Francis Couture-Harpin 2024-03-17 22:04:42 -04:00
parent 711b0bcb11
commit d100502251
2 changed files with 12 additions and 30 deletions

19 changed lines
ggml.c
View file

@ -2542,6 +2542,11 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
static inline bool ggml_is_empty(const struct ggml_tensor * tensor) {
    // a tensor with no elements along the last dimension holds no data at all
    if (tensor->ne[3] == 0) {
        return true;
    }
    // nb[3] is derived from the lower-order nb and ne values, so a zero stride
    // there likewise indicates an empty layout
    return tensor->nb[3] == 0;
}
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@ -2556,11 +2561,11 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    // An empty t0 can be "repeated" into any shape (it contributes nothing),
    // which also sidesteps the modulo-by-zero that a zero-sized dimension of
    // t0 would otherwise trigger below.
    // Otherwise, every dimension of t1 must be an integer multiple of the
    // corresponding dimension of t0 for broadcasting/repeat to be valid.
    return ggml_is_empty(t0) ||
           ((t1->ne[0]%t0->ne[0] == 0) &&
            (t1->ne[1]%t0->ne[1] == 0) &&
            (t1->ne[2]%t0->ne[2] == 0) &&
            (t1->ne[3]%t0->ne[3] == 0));
}
static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
@ -16047,7 +16052,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);
if (tensor->op == GGML_OP_NONE) {
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
return;
}
@ -18011,7 +18016,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
{
// FIXME: the cost of launching additional threads decreases performance with GPU offloading
//n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
n_tasks = MIN(n_cur_threads, MAX(ggml_nelements(node->src[1]), 1));
} break;
case GGML_OP_SCALE:
case GGML_OP_SET:

llama.cpp
View file

@ -5894,7 +5894,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@ -6082,7 +6081,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -6205,7 +6203,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -6308,7 +6305,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -6513,7 +6509,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
@ -6610,7 +6605,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -6775,7 +6769,6 @@ struct llm_build_context {
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -6905,7 +6898,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7011,7 +7003,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7132,7 +7123,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -7246,7 +7236,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -7366,7 +7355,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -7492,7 +7480,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7598,7 +7585,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
@ -7699,7 +7685,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7807,7 +7792,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7924,7 +7908,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8042,7 +8025,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8173,7 +8155,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8295,7 +8276,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -8416,7 +8396,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8571,7 +8550,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
x = ggml_get_rows(ctx0, x, inp_out_ids);
y = ggml_get_rows(ctx0, y, inp_out_ids);
@ -8683,7 +8661,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);