From 4c6a7bb0b0b4d3d541361396da7199900955cdad Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Tue, 6 Aug 2024 21:34:04 +0200
Subject: [PATCH] Add GGML_VULKAN_PERF option to output performance data per operator

---
 Makefile                 |  4 +++
 ggml/CMakeLists.txt      |  1 +
 ggml/src/CMakeLists.txt  |  4 +++
 ggml/src/ggml-vulkan.cpp | 73 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index f4ce4f1fb..488e9dd99 100644
--- a/Makefile
+++ b/Makefile
@@ -762,6 +762,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
 	MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
 endif
 
+ifdef GGML_VULKAN_PERF
+	MK_CPPFLAGS += -DGGML_VULKAN_PERF
+endif
+
 ifdef GGML_VULKAN_VALIDATE
 	MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
 endif
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 7fe1661bb..2ca1ac880 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -136,6 +136,7 @@ option(GGML_VULKAN "ggml: use Vulkan"
 option(GGML_VULKAN_CHECK_RESULTS  "ggml: run Vulkan op checks"              OFF)
 option(GGML_VULKAN_DEBUG          "ggml: enable Vulkan debug output"        OFF)
 option(GGML_VULKAN_MEMORY_DEBUG   "ggml: enable Vulkan memory debug output" OFF)
+option(GGML_VULKAN_PERF           "ggml: enable Vulkan perf output"         OFF)
 option(GGML_VULKAN_VALIDATE       "ggml: enable Vulkan validation"          OFF)
 option(GGML_VULKAN_RUN_TESTS      "ggml: run Vulkan tests"                  OFF)
 option(GGML_KOMPUTE               "ggml: use Kompute"                       OFF)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 425a25895..1775ef3cc 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -602,6 +602,10 @@ if (GGML_VULKAN)
         add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
     endif()
 
+    if (GGML_VULKAN_PERF)
+        add_compile_definitions(GGML_VULKAN_PERF)
+    endif()
+
     if (GGML_VULKAN_VALIDATE)
         add_compile_definitions(GGML_VULKAN_VALIDATE)
     endif()
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 32d66a0c3..a3539ff68 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -1,6 +1,6 @@
 #include "ggml-vulkan.h"
 #include <vulkan/vulkan_core.h>
-#ifdef GGML_VULKAN_RUN_TESTS
+#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF)
 #include <chrono>
 #endif
 
@@ -133,6 +133,9 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
 #endif
+#ifdef GGML_VULKAN_PERF
+class vk_perf_logger;
+#endif
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
 
 struct vk_device_struct {
@@ -218,6 +221,9 @@ struct vk_device_struct {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
     std::unique_ptr<vk_memory_logger> memory_logger;
 #endif
+#ifdef GGML_VULKAN_PERF
+    std::unique_ptr<vk_perf_logger> perf_logger;
+#endif
 
     ~vk_device_struct() {
         VK_LOG_DEBUG("destroy device " << name);
@@ -476,6 +482,48 @@ private:
 #define VK_LOG_MEMORY(msg) ((void) 0)
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
+#if defined(GGML_VULKAN_PERF)
+
+class vk_perf_logger {
+public:
+    void print_timings() {
+        std::cerr << "----------------\nVulkan Timings:" << std::endl;
+        for (const auto& t : timings) {
+            uint64_t total = 0;
+            for (const auto& time : t.second) {
+                total += time;
+            }
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
+        }
+
+        timings.clear();
+    }
+
+    void log_timing(const ggml_tensor * node, uint64_t time) {
+        if (node->op == GGML_OP_UNARY) {
+            timings[ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
+            return;
+        }
+        if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
+            const uint64_t m = node->src[0]->ne[1];
+            const uint64_t n = node->src[1]->ne[1];
+            const uint64_t k = node->src[1]->ne[0];
+            std::string name = ggml_op_name(node->op);
+            if (n == 1) {
+                name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
+            } else {
+                name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
+            }
+            timings[name].push_back(time);
+            return;
+        }
+        timings[ggml_op_name(node->op)].push_back(time);
+    }
+private:
+    std::map<std::string, std::vector<uint64_t>> timings;
+};
+#endif // GGML_VULKAN_PERF
+
 struct ggml_backend_vk_context {
     std::string name;
 
@@ -1701,6 +1749,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
         device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
 #endif
+#ifdef GGML_VULKAN_PERF
+        device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
+#endif
 
         size_t dev_num = vk_instance.device_indices[idx];
 
@@ -5851,7 +5902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node
     ctx->tensor_ctxs[node_idx] = compute_ctx;
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
+#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
     // Force context reset on each node so that each tensor ends up in its own context
     // and can be run and compared to its CPU equivalent separately
     last_node = true;
 #endif
@@ -5935,6 +5986,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor
 
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
+#ifdef GGML_VULKAN_PERF
+    std::chrono::steady_clock::time_point start;
+#endif // GGML_VULKAN_PERF
+
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
         // Do staging buffer copies
@@ -5942,11 +5997,21 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
+#ifdef GGML_VULKAN_PERF
+        start = std::chrono::steady_clock::now();
+#endif // GGML_VULKAN_PERF
+
         ggml_vk_submit(subctx, ctx->fence);
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+#ifdef GGML_VULKAN_PERF
+        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start);
+        ctx->device->perf_logger->log_timing(tensor, duration.count());
+#endif // GGML_VULKAN_PERF
+
         ctx->device->device.resetFences({ ctx->fence });
 
         // Do staging buffer copies
@@ -6479,6 +6544,10 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         GGML_ASSERT(ok);
     }
 
+#ifdef GGML_VULKAN_PERF
+    ctx->device->perf_logger->print_timings();
+#endif
+
     ggml_vk_graph_cleanup(ctx);
 
     return GGML_STATUS_SUCCESS;
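
For reference, a minimal usage sketch of the new switch, based only on the Makefile and CMake changes above: the build directory name and flag combinations are assumptions for illustration, and GGML_VULKAN itself must also be enabled for GGML_VULKAN_PERF to have any effect.

    # CMake build (assumed invocation)
    cmake -B build -DGGML_VULKAN=ON -DGGML_VULKAN_PERF=ON
    cmake --build build

    # Makefile build (assumed invocation using the new ifdef block)
    make GGML_VULKAN=1 GGML_VULKAN_PERF=1

With the define set, ggml_backend_vk_graph_compute calls vk_perf_logger::print_timings() after each graph evaluation, so the per-operator call counts and average times (in ms) collected by log_timing() are written to stderr.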