From 8febfc73af9ab97afc62fdcd0c6851b1d86b8ca5 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 1 Apr 2023 01:26:48 +0800
Subject: [PATCH] Fix in-place versions of operators

Use the in-place version when possible.
---
 ggml.c    | 73 ++++++++++++++++++++++++++++++++++++++++++--------------
 ggml.h    | 33 ++++++++++++++++++++++++++---
 llama.cpp |  8 +++----
 3 files changed, 90 insertions(+), 24 deletions(-)

diff --git a/ggml.c b/ggml.c
index ffd54ec41..3a225eaf1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4278,9 +4278,7 @@ struct ggml_tensor * ggml_scale_impl(
         is_node = true;
     }
 
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     result->op   = GGML_OP_SCALE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4593,10 +4591,11 @@ struct ggml_tensor * ggml_get_rows(
 
 // ggml_diag_mask_inf
 
-struct ggml_tensor * ggml_diag_mask_inf(
+struct ggml_tensor * ggml_diag_mask_inf_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        int n_past) {
+        int n_past,
+        bool inplace) {
     bool is_node = false;
 
     if (a->grad) {
@@ -4604,9 +4603,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
         is_node = true;
     }
 
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
     struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
 
     result->op   = GGML_OP_DIAG_MASK_INF;
@@ -4617,11 +4614,26 @@ struct ggml_tensor * ggml_diag_mask_inf(
     return result;
 }
 
+struct ggml_tensor * ggml_diag_mask_inf(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past) {
+    return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
+}
+
+struct ggml_tensor * ggml_diag_mask_inf_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past) {
+    return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
+}
+
 // ggml_soft_max
 
-struct ggml_tensor * ggml_soft_max(
+struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
+        struct ggml_tensor * a,
+        bool inplace) {
     bool is_node = false;
 
     if (a->grad) {
@@ -4629,9 +4641,7 @@ struct ggml_tensor * ggml_soft_max(
         is_node = true;
     }
 
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4641,14 +4651,26 @@ struct ggml_tensor * ggml_soft_max(
     return result;
 }
 
+struct ggml_tensor * ggml_soft_max(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_soft_max_impl(ctx, a, false);
+}
+struct ggml_tensor * ggml_soft_max_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_soft_max_impl(ctx, a, true);
+}
+
 // ggml_rope
 
-struct ggml_tensor * ggml_rope(
+struct ggml_tensor * ggml_rope_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode) {
+        int mode,
+        bool inplace) {
     GGML_ASSERT(n_past >= 0);
 
     bool is_node = false;
@@ -4657,9 +4679,7 @@ struct ggml_tensor * ggml_rope(
         is_node = true;
     }
 
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
     ((int32_t *) b->data)[0] = n_past;
@@ -4673,6 +4693,23 @@ struct ggml_tensor * ggml_rope(
 
     return result;
 }
+struct ggml_tensor * ggml_rope(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past,
+        int n_dims,
+        int mode) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
+}
+
+struct ggml_tensor * ggml_rope_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past,
+        int n_dims,
+        int mode) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
+}
 
 // ggml_conv_1d_1s
 
diff --git a/ggml.h b/ggml.h
index 5f5542d0f..d877258c3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -470,27 +470,45 @@ struct ggml_tensor * ggml_repeat(
 struct ggml_tensor * ggml_abs(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_abs_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 struct ggml_tensor * ggml_sgn(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_sgn_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 struct ggml_tensor * ggml_neg(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_neg_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 struct ggml_tensor * ggml_step(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_step_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 struct ggml_tensor * ggml_relu(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_relu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 // TODO: double-check this computation is correct
 struct ggml_tensor * ggml_gelu(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_gelu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 struct ggml_tensor * ggml_silu(
         struct ggml_context * ctx,
@@ -605,16 +623,22 @@ struct ggml_tensor * ggml_get_rows(
         struct ggml_tensor * b);
 
 // set elements above the diagonal to -INF
-// in-place, returns view(a)
 struct ggml_tensor * ggml_diag_mask_inf(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int n_past);
+struct ggml_tensor * ggml_diag_mask_inf_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past);
 
 // in-place, returns view(a)
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_soft_max_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 // rotary position embedding
 // in-place, returns view(a)
@@ -626,7 +650,12 @@ struct ggml_tensor * ggml_rope(
         int n_past,
         int n_dims,
         int mode);
-
+struct ggml_tensor * ggml_rope_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past,
+        int n_dims,
+        int mode);
 // padding = 1
 // TODO: we don't support extra parameters for now
 // that's why we are hard-coding the stride, padding, and dilation
diff --git a/llama.cpp b/llama.cpp
index 1eabed13d..e46147b17 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -826,7 +826,7 @@ static bool llama_eval_internal(
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
+                        ggml_rope_inplace(ctx0,
                             ggml_reshape_3d(ctx0,
                                 Qcur,
                                 n_embd/n_head, n_head, N),
@@ -836,7 +836,7 @@
             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
+                        ggml_rope_inplace(ctx0,
                             ggml_reshape_3d(ctx0,
                                 ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                                 n_embd/n_head, n_head, n_past + N),
@@ -853,10 +853,10 @@
                         ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
 
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             struct ggml_tensor * V_trans =
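The sketch below restates, with stand-in names, the _impl + wrapper pattern this patch applies to ggml_diag_mask_inf, ggml_soft_max and ggml_rope: one shared implementation takes a bool inplace flag and chooses between a view of the input (aliases its storage) and a fresh copy, and two thin wrappers fix the flag. tensor_t, dup_tensor, view_tensor and scale_impl are illustrative placeholders, not ggml APIs, and the sketch computes eagerly while ggml only records the op in a graph; it is meant only to show the ownership difference and the detail that the non-void wrappers must return the result of the _impl call.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in tensor type; illustrative only, not the ggml API. */
typedef struct tensor {
    float * data;
    int     n;
} tensor_t;

/* Allocate a new tensor that copies a's data (analogue of ggml_dup_tensor). */
static tensor_t * dup_tensor(const tensor_t * a) {
    tensor_t * t = malloc(sizeof *t);
    t->n    = a->n;
    t->data = malloc(sizeof(float) * a->n);
    memcpy(t->data, a->data, sizeof(float) * a->n);
    return t;
}

/* Create a tensor header that aliases a's data (analogue of ggml_view_tensor). */
static tensor_t * view_tensor(tensor_t * a) {
    tensor_t * t = malloc(sizeof *t);
    t->n    = a->n;
    t->data = a->data;   /* shares storage with a */
    return t;
}

/* Shared implementation: the inplace flag decides whether the result
 * aliases the input or is a fresh copy, mirroring
 * result = inplace ? ggml_view_tensor(...) : ggml_dup_tensor(...). */
static tensor_t * scale_impl(tensor_t * a, float s, bool inplace) {
    tensor_t * result = inplace ? view_tensor(a) : dup_tensor(a);
    for (int i = 0; i < result->n; i++) {
        result->data[i] *= s;
    }
    return result;
}

/* Thin wrappers: note that both must *return* the _impl result. */
static tensor_t * scale(tensor_t * a, float s)         { return scale_impl(a, s, false); }
static tensor_t * scale_inplace(tensor_t * a, float s) { return scale_impl(a, s, true);  }

int main(void) {
    float buf[3] = { 1.0f, 2.0f, 3.0f };
    tensor_t a = { buf, 3 };

    tensor_t * copy = scale(&a, 2.0f);          /* a is left untouched    */
    tensor_t * same = scale_inplace(&a, 2.0f);  /* a's data is modified   */

    printf("a[0]=%g copy[0]=%g same aliases a: %d\n",
           a.data[0], copy->data[0], same->data == a.data);

    free(copy->data); free(copy); free(same);
    return 0;
}

Switching llama_eval_internal to ggml_rope_inplace, ggml_diag_mask_inf_inplace and ggml_soft_max_inplace keeps the previous behavior and memory footprint: those operators used to return a view unconditionally, so now that the plain variants duplicate their input, using them for these intermediates would only add extra tensors to the context pool without changing the result.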