diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index efe88fb20..0b1019386 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -37,6 +37,7 @@
 #include "diagmask.hpp"
 #include "scale.hpp"
 #include "clamp.hpp"
+#include "pool2d.hpp"
 #include "gla.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 0992a1b58..346a32260 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1463,67 +1463,7 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
     }
 }
 
-template <typename Ti, typename To>
-static  void pool2d_nchw_kernel(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const Ti* src, To* dst, const enum ggml_op_pool op,
-        const sycl::nd_item<3> &item_ct1) {
-        int idx = item_ct1.get_local_id(2) +
-                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
-        if (idx >= parallel_elements) {
-            return;
-        }
 
-        const int I_HW = ih * iw;
-        const int O_HW = oh * ow;
-        const int nc = idx / O_HW;
-        const int cur_oh = idx % O_HW / ow;
-        const int cur_ow = idx % O_HW % ow;
-        const Ti* i_ptr = src + nc * I_HW;
-        To* o_ptr = dst + nc * O_HW;
-        const int start_h = cur_oh * sh - ph;
-        const int bh = sycl::max(0, start_h);
-        const int eh = sycl::min(ih, start_h + kh);
-        const int start_w = cur_ow * sw - pw;
-        const int bw = sycl::max(0, start_w);
-        const int ew = sycl::min(iw, start_w + kw);
-
-        To res = 0;
-
-        switch (op) {
-            case GGML_OP_POOL_AVG: res = 0; break;
-            case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-            default:
-                res      = (To) sycl::nan(uint32_t(0));
-                break;
-        }
-
-        for (int i = bh; i < eh; i += 1) {
-            for (int j = bw; j < ew; j += 1) {
-#if DPCT_COMPATIBILITY_TEMP >= 350
-                /*
-                DPCT1098:106: The '*' expression is used instead of the __ldg
-                call. These two expressions do not provide the exact same
-                functionality. Check the generated code for potential precision
-                and/or performance issues.
-                */
-                Ti cur = *(i_ptr + i * iw + j);
-#else
-                Ti cur = i_ptr[i * iw + j];
-#endif
-                switch (op) {
-                    case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
-                    case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
-                    default:
-                        res = (To) sycl::nan(uint32_t(0));
-                        break;
-                }
-            }
-        }
-        o_ptr[cur_oh * ow + cur_ow] = res;
-}
 
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
@@ -1812,46 +1752,6 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx,  ggml_tensor *dst) {
-
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    const int64_t IH = dst->src[0]->ne[1];
-    const int64_t IW = dst->src[0]->ne[0];
-
-    const int64_t N = dst->ne[3];
-    const int64_t OC = dst->ne[2];
-    const int64_t OH = dst->ne[1];
-    const int64_t OW = dst->ne[0];
-
-    const int parallel_elements = N * OC * OH * OW;
-    const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE;
-    dpct::queue_ptr main_stream = ctx.stream();
-    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
-    float * dst_dd = static_cast<float *>(dst->data);
-    sycl::range<3>  block_nums(1, 1, num_blocks);
-    main_stream->parallel_for(
-        sycl::nd_range<3>(block_nums *
-                              sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0,
-                               parallel_elements, src0_dd, dst_dd, op,
-                               item_ct1);
-        });
-}
-
 inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -2801,10 +2701,6 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_pool2d(ctx, dst);
-}
-
 static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
     ggml_sycl_op_sum(ctx, dst);
diff --git a/ggml/src/ggml-sycl/pool2d.cpp b/ggml/src/ggml-sycl/pool2d.cpp
new file mode 100644
index 000000000..dd11ee6b5
--- /dev/null
+++ b/ggml/src/ggml-sycl/pool2d.cpp
@@ -0,0 +1,114 @@
+#include "pool2d.hpp"
+#include <float.h>
+
+// Pools a contiguous stack of 2D planes: each work-item computes exactly one element of `dst`.
+//   Ti / To            : element types of the input and output buffers
+//   ih, iw / oh, ow    : height and width of one input / output plane
+//   kh, kw / sh, sw / ph, pw : window size, stride and padding along height and width
+//   parallel_elements  : total number of output elements; work-items past that count return immediately
+//   src / dst          : first element of the input / output planes
+//   op                 : AVG adds each in-bounds tap divided by kh*kw, MAX keeps the largest tap, others give NaN
+//   item_ct1           : nd_item whose dimension 2 supplies the flat output index
+template <typename Ti, typename To>
+static void pool2d_nchw_kernel(const int ih, const int iw, const int oh, const int ow, const int kh, const int kw,
+                               const int sh, const int sw, const int ph, const int pw, const int parallel_elements,
+                               const Ti * src, To * dst, const enum ggml_op_pool op,
+                               const sycl::nd_item<3> & item_ct1) {
+    const int gid = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
+    if (gid >= parallel_elements) {
+        return;
+    }
+
+    // Decompose the flat index into (plane, output row, output column).
+    const int in_plane_size  = ih * iw;
+    const int out_plane_size = oh * ow;
+    const int plane          = gid / out_plane_size;
+    const int in_plane_pos   = gid % out_plane_size;
+    const int out_row        = in_plane_pos / ow;
+    const int out_col        = in_plane_pos % ow;
+
+    const Ti * in_plane  = src + plane * in_plane_size;
+    To *       out_plane = dst + plane * out_plane_size;
+
+    // Clip the pooling window to the input plane so padded positions are never read.
+    const int win_top   = out_row * sh - ph;
+    const int win_left  = out_col * sw - pw;
+    const int row_begin = sycl::max(0, win_top);
+    const int row_end   = sycl::min(ih, win_top + kh);
+    const int col_begin = sycl::max(0, win_left);
+    const int col_end   = sycl::min(iw, win_left + kw);
+
+    // AVG divides every tap by the full window area, whether or not the window was clipped.
+    const int window_len = kh * kw;
+
+    // Seed the accumulator according to the requested reduction.
+    To acc = 0;
+    if (op == GGML_OP_POOL_MAX) {
+        acc = -FLT_MAX;
+    } else if (op != GGML_OP_POOL_AVG) {
+        acc = (To) sycl::nan(uint32_t(0));
+    }
+
+    // Fold every in-bounds tap of the window into the accumulator.
+    for (int r = row_begin; r < row_end; ++r) {
+        const Ti * in_row = in_plane + r * iw;
+        for (int c = col_begin; c < col_end; ++c) {
+            // One input element inside the clipped window.
+            const Ti tap = in_row[c];
+            if (op == GGML_OP_POOL_AVG) {
+                acc += (tap / window_len);
+            } else if (op == GGML_OP_POOL_MAX) {
+                acc = sycl::max(acc, (To) tap);
+            } else {
+                acc = (To) sycl::nan(uint32_t(0));
+            }
+        }
+    }
+
+    out_plane[out_row * ow + out_col] = acc;
+}
+
+// Launches pool2d_nchw_kernel on ctx.stream(), one work-item per dst element; src[0]/dst must be F32, dst not split.
+// op_params = {op, k0, k1, s0, s1, p0, p1}: *0 is the width axis, *1 the height axis. SYCL exceptions log and exit.
+static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+
+    const int32_t *   opts = (const int32_t *) dst->op_params;
+    enum ggml_op_pool op   = static_cast<ggml_op_pool>(opts[0]);
+    const int         k0   = opts[1];
+    const int         k1   = opts[2];
+    const int         s0   = opts[3];
+    const int         s1   = opts[4];
+    const int         p0   = opts[5];
+    const int         p1   = opts[6];
+
+    const int64_t IH = dst->src[0]->ne[1];
+    const int64_t IW = dst->src[0]->ne[0];
+    const int64_t N  = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int       parallel_elements = N * OC * OH * OW;
+    const int       num_blocks        = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE;
+    dpct::queue_ptr main_stream       = ctx.stream();
+    const float *   src0_dd           = static_cast<const float *>(dst->src[0]->data);
+    float *         dst_dd            = static_cast<float *>(dst->data);
+    // Work-group size must be the constant num_blocks was derived from, or the launch may not cover every element.
+    const sycl::range<3> block_dims(1, 1, SYCL_POOL2D_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, 1, num_blocks);
+    main_stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+        pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op, item_ct1);
+    });
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {  // entry point declared in pool2d.hpp
+    GGML_SYCL_DEBUG("call %s\n", __func__);       // trace entry
+    ggml_sycl_op_pool2d(ctx, dst);                // asserts, kernel launch and SYCL error handling live there
+    GGML_SYCL_DEBUG("call %s done\n", __func__);  // trace exit
+}
diff --git a/ggml/src/ggml-sycl/pool2d.hpp b/ggml/src/ggml-sycl/pool2d.hpp
new file mode 100644
index 000000000..6b2ce8043
--- /dev/null
+++ b/ggml/src/ggml-sycl/pool2d.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_POOL2D_HPP
+#define GGML_SYCL_POOL2D_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);  // 2D pooling op; defined in pool2d.cpp
+
+#endif // GGML_SYCL_POOL2D_HPP