diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9a77beca6..20e0133ac 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -325,6 +325,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
+    fprintf(stderr, "%s: n_tokens: %ld\n", __func__, tokens.size());
+    fprintf(stderr, "%s: n_ctx: %d\n", __func__, n_ctx);
+    fprintf(stderr, "%s: n_batch: %d\n", __func__, n_batch);
+    fprintf(stderr, "%s: num batches per chunk processing: %d\n", __func__, int((n_ctx + n_batch - 1) / n_batch));
+
     for (int i = 0; i < n_chunk; ++i) {
         const int start = i * n_ctx;
         const int end = start + n_ctx;
diff --git a/ggml.c b/ggml.c
index ed56e60a8..c46884d3c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -259,6 +259,178 @@ typedef double ggml_float;
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+//
+// enum strings
+//
+
+const char * ggml_backend_type_string(enum ggml_backend_type backend_type) {
+    switch (backend_type) {
+        case 0:
+            return "GGML_BACKEND_CPU";
+        case 10:
+            return "GGML_BACKEND_GPU";
+        case 20:
+            return "GGML_BACKEND_GPU_SPLIT";
+        default:
+            return "WRONG_BACKEND_TYPE";
+    }
+    return "";
+}
+
+const char * ggml_op_string(enum ggml_op op) {
+    switch (op) {
+        case 0:
+            return "GGML_OP_NONE";
+        case 1:
+            return "GGML_OP_DUP";
+        case 2:
+            return "GGML_OP_ADD";
+        case 3:
+            return "GGML_OP_ADD1";
+        case 4:
+            return "GGML_OP_ACC";
+        case 5:
+            return "GGML_OP_SUB";
+        case 6:
+            return "GGML_OP_MUL";
+        case 7:
+            return "GGML_OP_DIV";
+        case 8:
+            return "GGML_OP_SQR";
+        case 9:
+            return "GGML_OP_SQRT";
+        case 10:
+            return "GGML_OP_LOG";
+        case 11:
+            return "GGML_OP_SUM";
+        case 12:
+            return "GGML_OP_SUM_ROWS";
+        case 13:
+            return "GGML_OP_MEAN";
+        case 14:
+            return "GGML_OP_ARGMAX";
+        case 15:
+            return "GGML_OP_REPEAT";
+        case 16:
+            return "GGML_OP_REPEAT_BACK";
+        case 17:
+            return "GGML_OP_CONCAT";
+        case 18:
+            return "GGML_OP_SILU_BACK";
+        case 19:
+            return "GGML_OP_NORM"; // normalize
+        case 20:
+            return "GGML_OP_RMS_NORM";
+        case 21:
+            return "GGML_OP_RMS_NORM_BACK";
+        case 22:
+            return "GGML_OP_GROUP_NORM";
+        case 23:
+            return "GGML_OP_MUL_MAT";
+        case 24:
+            return "GGML_OP_MUL_MAT_ID";
+        case 25:
+            return "GGML_OP_OUT_PROD";
+        case 26:
+            return "GGML_OP_SCALE";
+        case 27:
+            return "GGML_OP_SET";
+        case 28:
+            return "GGML_OP_CPY";
+        case 29:
+            return "GGML_OP_CONT";
+        case 30:
+            return "GGML_OP_RESHAPE";
+        case 31:
+            return "GGML_OP_VIEW";
+        case 32:
+            return "GGML_OP_PERMUTE";
+        case 33:
+            return "GGML_OP_TRANSPOSE";
+        case 34:
+            return "GGML_OP_GET_ROWS";
+        case 35:
+            return "GGML_OP_GET_ROWS_BACK";
+        case 36:
+            return "GGML_OP_DIAG";
+        case 37:
+            return "GGML_OP_DIAG_MASK_INF";
+        case 38:
+            return "GGML_OP_DIAG_MASK_ZERO";
+        case 39:
+            return "GGML_OP_SOFT_MAX";
+        case 40:
+            return "GGML_OP_SOFT_MAX_BACK";
+        case 41:
+            return "GGML_OP_ROPE";
+        case 42:
+            return "GGML_OP_ROPE_BACK";
+        case 43:
+            return "GGML_OP_ALIBI";
+        case 44:
+            return "GGML_OP_CLAMP";
+        case 45:
+            return "GGML_OP_CONV_TRANSPOSE_1D";
+        case 46:
+            return "GGML_OP_IM2COL";
+        case 47:
+            return "GGML_OP_CONV_TRANSPOSE_2D";
+        case 48:
+            return "GGML_OP_POOL_1D";
+        case 49:
+            return "GGML_OP_POOL_2D";
+        case 50:
+            return "GGML_OP_UPSCALE"; // nearest interpolate
+        case 51:
+            return "GGML_OP_PAD";
+        case 52:
+            return "GGML_OP_ARGSORT";
+        case 53:
+            return "GGML_OP_LEAKY_RELU";
+        case 54:
+            return "GGML_OP_FLASH_ATTN";
+        case 55:
+            return "GGML_OP_FLASH_FF";
+        case 56:
+            return "GGML_OP_FLASH_ATTN_BACK";
+        case 57:
+            return "GGML_OP_WIN_PART";
+        case 58:
+            return "GGML_OP_WIN_UNPART";
+        case 59:
+            return "GGML_OP_GET_REL_POS";
+        case 60:
+            return "GGML_OP_ADD_REL_POS";
+        case 61:
+            return "GGML_OP_UNARY";
+        case 62:
+            return "GGML_OP_MAP_UNARY";
+        case 63:
+            return "GGML_OP_MAP_BINARY";
+        case 64:
+            return "GGML_OP_MAP_CUSTOM1_F32";
+        case 65:
+            return "GGML_OP_MAP_CUSTOM2_F32";
+        case 66:
+            return "GGML_OP_MAP_CUSTOM3_F32";
+        case 67:
+            return "GGML_OP_MAP_CUSTOM1";
+        case 68:
+            return "GGML_OP_MAP_CUSTOM2";
+        case 69:
+            return "GGML_OP_MAP_CUSTOM3";
+        case 70:
+            return "GGML_OP_CROSS_ENTROPY_LOSS";
+        case 71:
+            return "GGML_OP_CROSS_ENTROPY_LOSS_BACK";
+        case 72:
+            return "GGML_OP_COUNT";
+        default:
+            return "WRONG_OP";
+    }
+    return "";
+}
+
 //
 // global data
 //
diff --git a/ggml.h b/ggml.h
index 67d6bc4f1..3ef708c11 100644
--- a/ggml.h
+++ b/ggml.h
@@ -490,6 +490,10 @@ extern "C" {
         GGML_LOG_LEVEL_DEBUG = 5
     };
 
+    const char * ggml_backend_type_string(enum ggml_backend_type backend_type);
+
+    const char * ggml_op_string(enum ggml_op op);
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -511,7 +515,7 @@ extern "C" {
 
         struct ggml_backend_buffer * buffer;
 
-        int64_t ne[GGML_MAX_DIMS]; // number of elements
+        int64_t ne[GGML_MAX_DIMS]; // number of elements, [batch size, , seq len, hidden dim]
         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                    // nb[0] = ggml_type_size(type)
                                    // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
diff --git a/llama.cpp b/llama.cpp
index bf1b01a90..8778193a6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4500,6 +4500,23 @@ struct llm_build_context {
 
         ggml_build_forward_expand(gf, cur);
 
+        for (int i = 0; i < gf->n_nodes; ++ i) {
+            ggml_tensor * t = gf->nodes[i];
+            LLAMA_LOG_INFO("%s: Tensor name [%s]\n", __func__, t->name);
+            LLAMA_LOG_INFO("%s: \tOP [%s]\n", __func__, ggml_op_string(t->op));
+            LLAMA_LOG_INFO("%s: \tBackend [%s]\n", __func__, ggml_backend_type_string(t->backend));
+            LLAMA_LOG_INFO("%s: \tShape (", __func__);
+            for (int j = 0; j < GGML_MAX_DIMS; ++ j) {
+                LLAMA_LOG_INFO("%ld", t->ne[GGML_MAX_DIMS - 1 - j]);
+                if (j != GGML_MAX_DIMS - 1) {
+                    LLAMA_LOG_INFO(", ");
+                } else {
+                    LLAMA_LOG_INFO(")\n");
+                }
+            }
+        }
+        exit(-1);
+
         return gf;
     }
 
@@ -6437,7 +6454,7 @@ static int llama_decode_internal(
     res->backend = GGML_BACKEND_CPU;
 #endif
 
-    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+    LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance