Compare commits

...
Sign in to create a new pull request.

3 commits

Author SHA1 Message Date
Georgi Gerganov
cb79f8a2d8
llama : add SKIP_KQ_KQV option 2023-10-22 09:58:29 +03:00
Georgi Gerganov
ed9fde7a1e
ggml : skip nops 2023-10-22 09:55:37 +03:00
Georgi Gerganov
2471d56a2e
llama : profiling the attention compute 2023-10-22 09:22:54 +03:00
2 changed files with 31 additions and 0 deletions

4
ggml.c
View file

@@ -16602,6 +16602,10 @@ static void ggml_compute_forward_cross_entropy_loss_back(
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);
if (tensor->op == GGML_OP_NONE) {
return;
}
#ifdef GGML_USE_CUBLAS
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
if (skip_cpu) {

View file

@@ -5815,6 +5815,33 @@ static struct ggml_cgraph * llama_build_graph(
GGML_ASSERT(false);
}
#if 1
for (int i = 0; i < result->n_nodes; ++i) {
struct ggml_tensor * node = result->nodes[i];
if (getenv("SKIP_KQ_ALL")) {
if (
strcmp(node->name, "KQ") == 0 ||
strcmp(node->name, "KQ_scaled") == 0 ||
strcmp(node->name, "KQ_masked") == 0 ||
strcmp(node->name, "KQ_soft_max") == 0 ||
strcmp(node->name, "KQV") == 0 ||
false) {
//printf("skipping %s\n", dst->name);
node->op = GGML_OP_NONE;
}
}
if (getenv("SKIP_KQ_KQV")) {
if (
strcmp(node->name, "KQ") == 0 ||
strcmp(node->name, "KQV") == 0 ||
false) {
//printf("skipping %s\n", dst->name);
node->op = GGML_OP_NONE;
}
}
}
#endif
return result;
}