diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 89fdf9d1c..1bee9e313 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -75,6 +75,7 @@ option(GGML_CCACHE "ggml: use ccache if available"       ON)
 option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
 option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
 option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
+option(GGML_GRAPH_PROFILER         "ggml: enable internal Graph and Op profiler"          OFF)
 
 # build
 option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index e1424fc6a..098a89ac1 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -9,6 +9,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
 endif()
 
+if (GGML_GRAPH_PROFILER)
+    add_compile_definitions(GGML_GRAPH_PROFILER) # macro tested in ggml-profile.h / ggml-profile.cpp: real profiler vs. stubs
+endif()
+
 if (NOT MSVC)
     if (GGML_SANITIZE_THREAD)
         add_compile_options(-fsanitize=thread)
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index de634bdee..4007b4339 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -157,17 +157,6 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
     GGML_ABORT("fatal error");
 }
 
-// op profile data (per op / per thread)
-enum ggml_profile_event {
-    GGML_PROF_OP_START,
-    GGML_PROF_OP_SYNC,
-    GGML_PROF_OP_END
-};
-
-struct ggml_profile_data {
-    uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec
-};
-
 // computation graph
 
 enum ggml_cgraph_eval_order {
@@ -176,6 +165,8 @@ enum ggml_cgraph_eval_order {
     GGML_CGRAPH_EVAL_ORDER_COUNT
 };
 
+struct ggml_profile_data; // forward declaration only; the full definition lives in ggml-profile.h
+
 struct ggml_cgraph {
     int size;
     int n_nodes;
@@ -194,12 +185,6 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
-void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads);
-void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads);
-void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads);
-void ggml_profile_graph_free(struct ggml_cgraph *cg);
-void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-profile.cpp b/ggml/src/ggml-profile.cpp
index e6ee6aea0..0f59455a5 100644
--- a/ggml/src/ggml-profile.cpp
+++ b/ggml/src/ggml-profile.cpp
@@ -1,9 +1,12 @@
-#include "ggml-impl.h"
+#include "ggml-profile.h"
+
 #include <stdint.h>
 #include <stdlib.h>
 
 #include <chrono>
 
+#ifdef GGML_GRAPH_PROFILER
+
 extern "C" void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
 {
     if (!getenv("GGML_GRAPH_PROFILE")) { return; }
@@ -138,3 +141,5 @@ extern "C" void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_pr
     using clock = std::chrono::high_resolution_clock;
     cg->prof[node_n][ith].nsec[e] = std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
 }
+
+#endif // GGML_GRAPH_PROFILER
diff --git a/ggml/src/ggml-profile.h b/ggml/src/ggml-profile.h
new file mode 100644
index 000000000..e572b91da
--- /dev/null
+++ b/ggml/src/ggml-profile.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "ggml-impl.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// profiling events recorded per op / per thread; each value is an index into ggml_profile_data::nsec
+enum ggml_profile_event {
+    GGML_PROF_OP_START, // presumably recorded when a thread starts the op -- confirm at call sites
+    GGML_PROF_OP_SYNC,  // presumably recorded at the op's sync point -- confirm at call sites
+    GGML_PROF_OP_END    // also the highest value: nsec[] holds GGML_PROF_OP_END + 1 slots
+};
+
+struct ggml_profile_data { // the profiler keeps one of these per graph node and thread: cg->prof[node_n][ith]
+    uint64_t nsec[GGML_PROF_OP_END + 1]; // event times in nsec, one slot per enum ggml_profile_event value
+};
+
+#ifndef GGML_GRAPH_PROFILER
+
+// Profiler compiled out (GGML_GRAPH_PROFILER not defined): every profiler function is an inline no-op stub
+
+static inline void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads)
+{ // stub used when GGML_GRAPH_PROFILER is not defined: performs no profiling setup
+    GGML_UNUSED(cg);
+    GGML_UNUSED(n_threads);
+}
+
+static inline void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads)
+{ // stub used when GGML_GRAPH_PROFILER is not defined: performs no profiling work
+    GGML_UNUSED(cg);
+    GGML_UNUSED(n_threads);
+}
+
+static inline void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads)
+{ // stub used when GGML_GRAPH_PROFILER is not defined: performs no profiling work
+    GGML_UNUSED(cg);
+    GGML_UNUSED(n_threads);
+}
+
+static inline void ggml_profile_graph_free(struct ggml_cgraph *cg)
+{ // stub used when GGML_GRAPH_PROFILER is not defined: nothing to release
+    GGML_UNUSED(cg);
+}
+
+static inline void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith)
+{ // stub used when GGML_GRAPH_PROFILER is not defined: records no timestamp
+    GGML_UNUSED(cg);
+    GGML_UNUSED(e);
+    GGML_UNUSED(node_n);
+    GGML_UNUSED(ith);
+}
+
+#else
+
+void ggml_profile_graph_init(struct ggml_cgraph *cg, int n_threads); // returns immediately unless env var GGML_GRAPH_PROFILE is set
+void ggml_profile_graph_start(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_finish(struct ggml_cgraph *cg, int n_threads);
+void ggml_profile_graph_free(struct ggml_cgraph *cg);
+void ggml_profile_op_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith); // stores current clock time (nsec) in cg->prof[node_n][ith].nsec[e]
+
+#endif // GGML_GRAPH_PROFILER
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9be8341ed..3d00124f8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7,6 +7,7 @@
 #include "ggml-quants.h"
 #include "ggml.h"
 #include "ggml-aarch64.h"
+#include "ggml-profile.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW