diff --git a/common/common.cpp b/common/common.cpp index b9aa97419..859c1aad1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1144,8 +1144,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); + fprintf(stderr, "%s: warming up\n", __func__); std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), }; - llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); + // llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); llama_reset_timings(lctx); } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 5fa4cb9d4..e23f71a75 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -361,8 +361,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } - if (j > 10) - exit(-1); + exit(-1); // restore the original token in case it was set to BOS tokens[batch_start] = token_org; diff --git a/ggml-backend.c b/ggml-backend.c index 526ce732b..8fb997698 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -196,10 +196,15 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_ } void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + printf("%s: compute graph in the backend\n{\n", __func__); backend->iface.graph_compute(backend, cgraph); + printf("}\n%s: finish computing graph in the backend\n", __func__); + // printf("%s: synchronize results\n", __func__); + // const int64_t start_us = ggml_time_us(); // TODO: optional sync ggml_backend_synchronize(backend); + // printf("%s: finish synchronizing %.3f ms\n", __func__, (ggml_time_us() - start_us) / 1000); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
op) { diff --git a/ggml.c b/ggml.c index c46884d3c..29a2e18a4 100644 --- a/ggml.c +++ b/ggml.c @@ -14438,6 +14438,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } + const int64_t start_us = ggml_time_us(); + #ifdef GGML_USE_CUBLAS bool skip_cpu = ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { @@ -14767,6 +14769,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(false); } break; } + + printf("\t%s: OP [%s], Tensor [%s], Time [%.3f] ms\n", __func__, ggml_op_string(tensor->op), tensor->name, (ggml_time_us() - start_us) / 1000.0); } //////////////////////////////////////////////////////////////////////////////// @@ -16801,7 +16805,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { } } - const int n_threads = cplan->n_threads; + const int n_threads = 1; // cplan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, diff --git a/llama.cpp b/llama.cpp index 54b7c6d4d..1abfbaed0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4500,13 +4500,13 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - LLAMA_LOG_INFO("%s: Start to print tensors in the computation graph\n", __func__); + LLAMA_LOG_INFO("{\n\t%s: Start to print tensors in the computation graph\n", __func__); for (int i = 0; i < gf->n_nodes; ++ i) { ggml_tensor * t = gf->nodes[i]; - LLAMA_LOG_INFO("%s: Tensor name [%s]\n", __func__, t->name); - LLAMA_LOG_INFO("%s: \tOP [%s]\n", __func__, ggml_op_string(t->op)); - LLAMA_LOG_INFO("%s: \tBackend [%s]\n", __func__, ggml_backend_type_string(t->backend)); - LLAMA_LOG_INFO("%s: \tShape (", __func__); + LLAMA_LOG_INFO("\t%s: Tensor name [%s]\n", __func__, t->name); + LLAMA_LOG_INFO("\t%s: \tOP [%s]\n", __func__, ggml_op_string(t->op)); + LLAMA_LOG_INFO("\t%s: \tBackend [%s]\n", __func__, ggml_backend_type_string(t->backend)); + LLAMA_LOG_INFO("\t%s: \tShape (", __func__); for (int j = 0; j <
GGML_MAX_DIMS; ++ j) { LLAMA_LOG_INFO("%ld", t->ne[GGML_MAX_DIMS - 1 - j]); if (j != GGML_MAX_DIMS - 1) { @@ -4515,7 +4515,7 @@ struct llm_build_context { LLAMA_LOG_INFO(")\n"); } } - LLAMA_LOG_INFO("%s: \tSource tensor [", __func__); + LLAMA_LOG_INFO("\t%s: \tSource tensor [", __func__); switch (t->op) { case GGML_OP_VIEW: case GGML_OP_RMS_NORM: @@ -4541,8 +4541,7 @@ struct llm_build_context { } LLAMA_LOG_INFO("]\n"); } - LLAMA_LOG_INFO("%s: Finish printing tensors in the computation graph\n", __func__); - exit(-1); + LLAMA_LOG_INFO("\t%s: Finish printing tensors in the computation graph\n}\n", __func__); return gf; } @@ -6481,7 +6480,7 @@ static int llama_decode_internal( res->backend = GGML_BACKEND_CPU; #endif - LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + LLAMA_LOG_INFO("%s: graph build time: %.3f ms (%d nodes, %d leafs)\n", __func__, (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance diff --git a/analyze_cgf.py b/py/analyze_cgf.py similarity index 100% rename from analyze_cgf.py rename to py/analyze_cgf.py diff --git a/py/analyze_time.py b/py/analyze_time.py new file mode 100644 index 000000000..e80929e79 --- /dev/null +++ b/py/analyze_time.py @@ -0,0 +1,65 @@ +import os +import argparse + +def read_data(file_path): + fin = open(file_path, "r") + lines = fin.readlines() + fin.close() + + def split(context): + data_list = [] + read = False + data = "" + for c in context: + if c == "[": + read = True + continue + elif c == "]": + read = False + data_list.append(data) + data = "" + continue + if read: + data += c + return data_list + + time_summary = {} + time_info = {} + start = False + for i, line in enumerate(lines): + line = line.strip() + if line == "{": + start = True + continue + 
elif line == "}": + start = False + continue + if start and "ggml_compute_forward" in line: + data = split(line) + tensor = data[0] + op = data[1] + time_cost = float(data[2]) + if op == "inp_embd": + layer = 0 + elif "-" in op: + layer = int(op.split(" ")[0].split("-")[1]) + 1 + else: + layer = 100 + if layer in time_summary: + time_info[layer].append({"name": tensor, "op": op, "time": time_cost}) + time_summary[layer] += time_cost + else: + time_info[layer] = [{"name": tensor, "op": op, "time": time_cost}] + time_summary[layer] = time_cost + + return time_info, time_summary + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--log_file", required=True, type=str) + args = parser.parse_args() + + time_info, time_summary = read_data(args.log_file) + for k, v in time_summary.items(): + print(k, v)