llama : keep same graph topology even when n_outputs == 0

This commit is contained in:
Francis Couture-Harpin 2024-03-17 22:04:42 -04:00
parent 711b0bcb11
commit d100502251
2 changed files with 12 additions and 30 deletions

19 changed lines
ggml.c
View file

@ -2542,6 +2542,11 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
static inline bool ggml_is_empty(const struct ggml_tensor * tensor) {
    // a tensor with no elements along the last dimension holds no data at all
    if (tensor->ne[3] == 0) {
        return true;
    }
    // nb[3] is derived from the lower-order nb and ne values, so a zero stride
    // there likewise indicates an empty layout
    return tensor->nb[3] == 0;
}
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@ -2556,11 +2561,11 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    // An empty t0 can be "repeated" into any shape (it contributes nothing),
    // which also sidesteps the modulo-by-zero that a zero-sized dimension of
    // t0 would otherwise trigger below.
    // Otherwise, every dimension of t1 must be an integer multiple of the
    // corresponding dimension of t0 for broadcasting/repeat to be valid.
    return ggml_is_empty(t0) ||
           ((t1->ne[0]%t0->ne[0] == 0) &&
            (t1->ne[1]%t0->ne[1] == 0) &&
            (t1->ne[2]%t0->ne[2] == 0) &&
            (t1->ne[3]%t0->ne[3] == 0));
}
static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
@ -16047,7 +16052,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);
if (tensor->op == GGML_OP_NONE) {
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
return;
}
@ -18011,7 +18016,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
{
// FIXME: the cost of launching additional threads decreases performance with GPU offloading
//n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
n_tasks = MIN(n_cur_threads, MAX(ggml_nelements(node->src[1]), 1));
} break;
case GGML_OP_SCALE:
case GGML_OP_SET:

llama.cpp
View file

@ -5894,7 +5894,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@ -6082,7 +6081,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -6205,7 +6203,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -6308,7 +6305,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -6513,7 +6509,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
@ -6610,7 +6605,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -6775,7 +6769,6 @@ struct llm_build_context {
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -6905,7 +6898,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7011,7 +7003,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7132,7 +7123,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -7246,7 +7236,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -7366,7 +7355,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -7492,7 +7480,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7598,7 +7585,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
@ -7699,7 +7685,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7807,7 +7792,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -7924,7 +7908,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8042,7 +8025,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8173,7 +8155,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8295,7 +8276,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@ -8416,7 +8396,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@ -8571,7 +8550,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
x = ggml_get_rows(ctx0, x, inp_out_ids);
y = ggml_get_rows(ctx0, y, inp_out_ids);
@ -8683,7 +8661,6 @@ struct llm_build_context {
if (il == n_layer - 1) {
// skip computing output for unused tokens
if (n_outputs == 0) { return gf; }
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);