add print functions and timing-analysis script
parent 6215c33a2b
commit 0da77c8d2a

7 changed files with 86 additions and 13 deletions
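The intended workflow, pieced together from the diffs below, is: run an inference with the new timing prints, capture stdout and stderr to a file, then feed that file to the new analyzer. A plausible invocation (model path, prompt, and log name are hypothetical; any example that calls llama_decode will do):

    ./main -m models/llama-7b.gguf -p "hello" -n 1 > timing.log 2>&1
    python py/analyze_time.py --log_file timing.log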
@@ -1144,8 +1144,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     {
         LOG("warming up the model with an empty run\n");

+        fprintf(stderr, "%s: warming up\n", __func__);
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        // llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
         llama_reset_timings(lctx);
     }

@@ -361,7 +361,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return {tokens, -1, logit_history, prob_history};
         }
-        if (j > 10)
         exit(-1);

         // restore the original token in case it was set to BOS

@@ -196,10 +196,15 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
 }

 void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    printf("%s: compute graph in the backend\n{\n", __func__);
     backend->iface.graph_compute(backend, cgraph);
+    printf("}\n%s: finish computing graph in the backend\n", __func__);

+    // printf("%s: synchronize results\n", __func__);
+    // const int64_t start_us = ggml_time_us();
     // TODO: optional sync
     ggml_backend_synchronize(backend);
+    // printf("%s: finish synchronizing %.3f ms\n", __func__, (ggml_time_us() - start_us) / 1000);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

ggml.c (6 changes)
@@ -14438,6 +14438,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }

+    const int64_t start_us = ggml_time_us();
+
 #ifdef GGML_USE_CUBLAS
     bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
     if (skip_cpu) {
@@ -14767,6 +14769,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             GGML_ASSERT(false);
         } break;
     }
+
+    printf("\t%s: Tensor [%s], OP [%s], Time [%.3f] ms\n", __func__, tensor->name, ggml_op_string(tensor->op), (ggml_time_us() - start_us) / 1000.0);
 }

 ////////////////////////////////////////////////////////////////////////////////
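Note: the format string labels the first field `Tensor` and the second `OP`, so the arguments are ordered `tensor->name` then `ggml_op_string(tensor->op)` here, and py/analyze_time.py below reads them in that order. Combined with the braces printed by ggml_backend_graph_compute above, each graph evaluation then emits a block shaped like this (tensor names, ops, and timings are illustrative, not real output):

    ggml_backend_graph_compute: compute graph in the backend
    {
    	ggml_compute_forward: Tensor [inp_embd], OP [GET_ROWS], Time [0.084] ms
    	ggml_compute_forward: Tensor [attn_norm-0], OP [RMS_NORM], Time [0.031] ms
    	...
    }
    ggml_backend_graph_compute: finish computing graph in the backend

The `{`/`}` sentinel lines and the three bracketed fields are exactly what the analyzer keys on.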
@@ -16801,7 +16805,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }

-    const int n_threads = cplan->n_threads;
+    const int n_threads = 1; // cplan->n_threads;

     struct ggml_compute_state_shared state_shared = {
         /*.cgraph      =*/ cgraph,

llama.cpp (17 changes)
@@ -4500,13 +4500,13 @@ struct llm_build_context {

         ggml_build_forward_expand(gf, cur);

-        LLAMA_LOG_INFO("%s: Start to print tensors in the computation graph\n", __func__);
+        LLAMA_LOG_INFO("{\n\t%s: Start to print tensors in the computation graph\n", __func__);
         for (int i = 0; i < gf->n_nodes; ++ i) {
             ggml_tensor * t = gf->nodes[i];
-            LLAMA_LOG_INFO("%s: Tensor name [%s]\n", __func__, t->name);
-            LLAMA_LOG_INFO("%s: \tOP [%s]\n", __func__, ggml_op_string(t->op));
-            LLAMA_LOG_INFO("%s: \tBackend [%s]\n", __func__, ggml_backend_type_string(t->backend));
-            LLAMA_LOG_INFO("%s: \tShape (", __func__);
+            LLAMA_LOG_INFO("\t%s: Tensor name [%s]\n", __func__, t->name);
+            LLAMA_LOG_INFO("\t%s: \tOP [%s]\n", __func__, ggml_op_string(t->op));
+            LLAMA_LOG_INFO("\t%s: \tBackend [%s]\n", __func__, ggml_backend_type_string(t->backend));
+            LLAMA_LOG_INFO("\t%s: \tShape (", __func__);
             for (int j = 0; j < GGML_MAX_DIMS; ++ j) {
                 LLAMA_LOG_INFO("%ld", t->ne[GGML_MAX_DIMS - 1 - j]);
                 if (j != GGML_MAX_DIMS - 1) {
@@ -4515,7 +4515,7 @@ struct llm_build_context {
                     LLAMA_LOG_INFO(")\n");
                 }
             }
-            LLAMA_LOG_INFO("%s: \tSource tensor [", __func__);
+            LLAMA_LOG_INFO("\t%s: \tSource tensor [", __func__);
             switch (t->op) {
                 case GGML_OP_VIEW:
                 case GGML_OP_RMS_NORM:
@@ -4541,8 +4541,7 @@ struct llm_build_context {
             }
             LLAMA_LOG_INFO("]\n");
         }
-        LLAMA_LOG_INFO("%s: Finish printing tensors in the computation graph\n", __func__);
-        exit(-1);
+        LLAMA_LOG_INFO("\t%s: Finish printing tensors in the computation graph\n}\n", __func__);

         return gf;
     }
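With the braces added to the first and last log lines, the per-graph tensor dump is itself brace-wrapped. A hypothetical excerpt, assuming `__func__` is one of the llm_build_context graph builders (function name, shapes, and backend are illustrative only):

    {
    	build_llama: Start to print tensors in the computation graph
    	build_llama: Tensor name [inp_embd]
    	build_llama: 	OP [GET_ROWS]
    	build_llama: 	Backend [CPU]
    	build_llama: 	Shape (1, 1, 512, 4096)
    	build_llama: 	Source tensor [token_embd.weight]
    	...
    	build_llama: Finish printing tensors in the computation graph
    }

These lines never contain the substring "ggml_compute_forward", so py/analyze_time.py skips them even though they sit between `{` and `}`.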
@@ -6481,7 +6480,7 @@ static int llama_decode_internal(
     res->backend = GGML_BACKEND_CPU;
 #endif

-    LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+    LLAMA_LOG_INFO("%s: graph build time: %.3f ms (%d nodes, %d leafs)\n", __func__, (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance

py/analyze_time.py (new file, 65 additions)
@@ -0,0 +1,65 @@
+import os
+import argparse
+
+def read_data(file_path):
+    fin = open(file_path, "r")
+    lines = fin.readlines()
+    fin.close()
+
+    def split(context):  # collect the bracketed fields of a log line
+        data_list = []
+        read = False
+        data = ""
+        for c in context:
+            if c == "[":
+                read = True
+                continue
+            elif c == "]":
+                read = False
+                data_list.append(data)
+                data = ""
+                continue
+            if read:
+                data += c
+        return data_list
+
+    time_summary = {}
+    time_info = {}
+    start = False
+    for i, line in enumerate(lines):
+        line = line.strip()
+        if line == "{":
+            start = True
+            continue
+        elif line == "}":
+            start = False
+            continue
+        if start and "ggml_compute_forward" in line:
+            data = split(line)  # [tensor name, op, time in ms]
+            tensor = data[0]
+            op = data[1]
+            time_cost = float(data[2])
+            if tensor == "inp_embd":
+                layer = 0
+            elif "-" in tensor:
+                layer = int(tensor.split(" ")[0].split("-")[1]) + 1
+            else:
+                layer = 100
+            if layer in time_summary:
+                time_info[layer].append({"name": tensor, "op": op, "time": time_cost})
+                time_summary[layer] += time_cost
+            else:
+                time_info[layer] = [{"name": tensor, "op": op, "time": time_cost}]
+                time_summary[layer] = time_cost
+
+    return time_info, time_summary
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--log_file", required=True, type=str)
+    args = parser.parse_args()
+
+    time_info, time_summary = read_data(args.log_file)
+    for k, v in time_summary.items():
+        print(k, v)
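The bucketing rule used by read_data(): layer 0 collects the token-embedding lookup, bucket i+1 collects per-block tensors named like "<thing>-<i>", and 100 is a catch-all for everything outside the blocks (output norm, lm_head, and so on). A standalone restatement with hypothetical tensor names:

    def layer_of(name: str) -> int:
        # same bucketing as read_data() above
        if name == "inp_embd":   # token-embedding lookup
            return 0
        if "-" in name:          # per-block tensors are named "<thing>-<i>"
            return int(name.split(" ")[0].split("-")[1]) + 1
        return 100               # catch-all: tensors outside the blocks

    assert layer_of("inp_embd") == 0
    assert layer_of("ffn_out-15") == 16   # block 15 lands in bucket 16
    assert layer_of("result_norm") == 100

time_summary then maps each bucket to its total milliseconds, which is what the __main__ block prints, one "layer total_ms" pair per line.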