diff --git a/common/common.cpp b/common/common.cpp index b9aa97419..859c1aad1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1144,8 +1144,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); + fprintf(stderr, "%s: warming up\n", __func__); std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), }; - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + // llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); llama_reset_timings(lctx); } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 5fa4cb9d4..e23f71a75 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -361,8 +361,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } - if (j > 10) - exit(-1); + exit(-1); // restore the original token in case it was set to BOS tokens[batch_start] = token_org; diff --git a/ggml-backend.c b/ggml-backend.c index 526ce732b..8fb997698 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -196,10 +196,15 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_ } void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + printf("%s: compute graph in the backend\n{\n", __func__); backend->iface.graph_compute(backend, cgraph); + printf("}\n%s: finish computing graph in the backend\n", __func__); + // printf("%s: synchronize results\n", __func__); + // const int64_t start_us = ggml_time_us(); // TODO: optional sync ggml_backend_synchronize(backend); + // printf("%s: finish synchronizing %.3f ms\n", __func__, (ggml_time_us() - start_us) / 1000); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
op) { diff --git a/ggml.c b/ggml.c index c46884d3c..29a2e18a4 100644 --- a/ggml.c +++ b/ggml.c @@ -14438,6 +14438,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } + const int64_t start_us = ggml_time_us(); + #ifdef GGML_USE_CUBLAS bool skip_cpu = ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { @@ -14767,6 +14769,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(false); } break; } + + printf("\t%s: OP [%s], Tensor [%s], Time [%.3f] ms\n", __func__, ggml_op_string(tensor->op), tensor->name, (ggml_time_us() - start_us) / 1000.0); } //////////////////////////////////////////////////////////////////////////////// @@ -16801,7 +16805,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { } } - const int n_threads = cplan->n_threads; + const int n_threads = 1; // cplan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, diff --git a/llama.cpp b/llama.cpp index 54b7c6d4d..1abfbaed0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4500,13 +4500,13 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - LLAMA_LOG_INFO("%s: Start to print tensors in the computation graph\n", __func__); + LLAMA_LOG_INFO("{\n\t%s: Start to print tensors in the computation graph\n", __func__); for (int i = 0; i < gf->n_nodes; ++ i) { ggml_tensor * t = gf->nodes[i]; - LLAMA_LOG_INFO("%s: Tensor name [%s]\n", __func__, t->name); - LLAMA_LOG_INFO("%s: \tOP [%s]\n", __func__, ggml_op_string(t->op)); - LLAMA_LOG_INFO("%s: \tBackend [%s]\n", __func__, ggml_backend_type_string(t->backend)); - LLAMA_LOG_INFO("%s: \tShape (", __func__); + LLAMA_LOG_INFO("\t%s: Tensor name [%s]\n", __func__, t->name); + LLAMA_LOG_INFO("\t%s: \tOP [%s]\n", __func__, ggml_op_string(t->op)); + LLAMA_LOG_INFO("\t%s: \tBackend [%s]\n", __func__, ggml_backend_type_string(t->backend)); + LLAMA_LOG_INFO("\t%s: \tShape (", __func__); for (int j = 0; j <
GGML_MAX_DIMS; ++ j) { LLAMA_LOG_INFO("%ld", t->ne[GGML_MAX_DIMS - 1 - j]); if (j != GGML_MAX_DIMS - 1) { @@ -4515,7 +4515,7 @@ struct llm_build_context { LLAMA_LOG_INFO(")\n"); } } - LLAMA_LOG_INFO("%s: \tSource tensor [", __func__); + LLAMA_LOG_INFO("\t%s: \tSource tensor [", __func__); switch (t->op) { case GGML_OP_VIEW: case GGML_OP_RMS_NORM: @@ -4541,8 +4541,7 @@ struct llm_build_context { } LLAMA_LOG_INFO("]\n"); } - LLAMA_LOG_INFO("%s: Finish printing tensors in the computation graph\n", __func__); - exit(-1); + LLAMA_LOG_INFO("\t%s: Finish printing tensors in the computation graph\n}\n", __func__); return gf; } @@ -6481,7 +6480,7 @@ static int llama_decode_internal( res->backend = GGML_BACKEND_CPU; #endif - LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + LLAMA_LOG_INFO("%s: graph build time: %.3f ms (%d nodes, %d leafs)\n", __func__, (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance diff --git a/analyze_cgf.py b/py/analyze_cgf.py similarity index 100% rename from analyze_cgf.py rename to py/analyze_cgf.py diff --git a/py/analyze_time.py b/py/analyze_time.py new file mode 100644 index 000000000..e80929e79 --- /dev/null +++ b/py/analyze_time.py @@ -0,0 +1,65 @@ +import os +import argparse + +def read_data(file_path): + fin = open(file_path, "r") + lines = fin.readlines() + fin.close() + + def split(context): + data_list = [] + read = False + data = "" + for c in context: + if c == "[": + read = True + continue + elif c == "]": + read = False + data_list.append(data) + data = "" + continue + if read: + data += c + return data_list + + time_summary = {} + time_info = {} + start = False + for i, line in enumerate(lines): + line = line.strip() + if line == "{": + start = True + continue + 
elif line == "}": + start = False + continue + if start and "ggml_compute_forward" in line: + data = split(line) + tensor = data[0] + op = data[1] + time_cost = float(data[2]) + if op == "inp_embd": + layer = 0 + elif "-" in op: + layer = int(op.split(" ")[0].split("-")[1]) + 1 + else: + layer = 100 + if layer in time_summary: + time_info[layer].append({"name": tensor, "op": op, "time": time_cost}) + time_summary[layer] += time_cost + else: + time_info[layer] = [{"name": tensor, "op": op, "time": time_cost}] + time_summary[layer] = time_cost + + return time_info, time_summary + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--log_file", required=True, type=str) + args = parser.parse_args() + + time_info, time_summary = read_data(args.log_file) + for k, v in time_summary.items(): + print(k, v)