diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 4b6be9adc..84117c842 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -10774,14 +10774,10 @@ static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_ten
     ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
-    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
-
     // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float * dst_ddf = nullptr;
+    float * src0_ddf = (float *) src0->data;
+    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
+    float * dst_ddf = (float *) dst->data;
 
     ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
     ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
@@ -10792,48 +10788,8 @@ static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_ten
 
     // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
     //     ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[ctx.device];
-    } else {
-        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        // GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
-        SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
-    if (use_src1) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[ctx.device];
-        } else {
-            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[ctx.device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n",
-    //     src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     // do the computation
     op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    /*
-    DPCT1010:89: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    SYCL_CHECK(0);
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)).wait()));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
     // print_ggml_tensor("tensor", dst);
 }
 catch (sycl::exception const &exc) {
@@ -10923,7 +10879,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
 
@@ -10995,20 +10950,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         used_devices++;
 
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
+        const bool src1_on_device = i == ctx.device;
+        const bool dst_on_device = i == ctx.device;
 
         ggml_sycl_set_device(i);
        queue_ptr stream = ctx.stream(i, 0);
 
-        if (src0_on_device && src0_is_contiguous) {
-            dev[i].src0_dd = (char *) src0_extra->data_device[i];
+        if (src0_is_contiguous) {
+            dev[i].src0_dd = (char *) src0->data;
         } else {
             dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0));
         }
 
         if (src1_on_device && src1_is_contiguous) {
-            dev[i].src1_ddf = (float *) src1_extra->data_device[i];
+            dev[i].src1_ddf = (float *) src1->data;
         } else {
             dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
         }
 
@@ -11028,7 +10983,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         }
 
         if (dst_on_device) {
-            dev[i].dst_dd = (float *) dst_extra->data_device[i];
+            dev[i].dst_dd = (float *) dst->data;
         } else {
             const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst);
             dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf);
@@ -11059,8 +11014,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
             continue;
         }
 
-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
-        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
+        const bool src1_on_device = i == ctx.device;
+        const bool dst_on_device = i == ctx.device;
         const int64_t row_diff = dev[i].row_high - dev[i].row_low;
 
         ggml_sycl_set_device(i);
@@ -11091,12 +11046,12 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
             // the main device memory buffer can be on VRAM scratch, with space for all partial results
            // in that case an offset on dst_ddf_i is needed
-            if (dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device) {
+            if (i == ctx.device) {
                 dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split
             }
 
             // copy src0, src1 to device if necessary
-            if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
+            if (src1_is_contiguous) {
                 if (i != ctx.device) {
                     if (convert_src1_to_q8_1) {
                         char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
@@ -11114,14 +11069,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                             src1_ncols * ne10 * sizeof(float))));
                     }
                 }
-            } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
+            } else if (src1_on_device && !src1_is_contiguous) {
                 SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
                     src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
                 GGML_ASSERT(false);
             }
 
-            if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
+            if (convert_src1_to_q8_1 && !src1_is_contiguous) {
                 quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 /*
                 DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -11131,7 +11086,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                 SYCL_CHECK(0);
             }
 
-            if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+            if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
                 SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream));
             }
             if (src1->type == GGML_TYPE_F16) {
@@ -11149,17 +11104,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
             // copy dst to host or other device if necessary
             if (!dst_on_device) {
-                void * dst_off_device;
-                dpct::memcpy_direction kind;
-                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                    dst_off_device = dst->data;
-                    kind = dpct::device_to_host;
-                } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
-                    dst_off_device = dst_extra->data_device[ctx.device];
-                    kind = dpct::device_to_device;
-                } else {
-                    GGML_ASSERT(false);
-                }
+                void * dst_off_device = dst->data;
                 if (split) {
                     // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                     // dst is NOT transposed.
@@ -11170,27 +11115,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                     GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                     dhf_dst_i += src1_col_0*ne0 + dev[i].row_low;
 
-                    //todo, dirty solution. Need be updated when device2device memcpy() is supported.
-                    if (kind == dpct::device_to_device) {
-                        size_t dst_size = ggml_nbytes_pad(dst);
-                        float *host_buf = (float *)malloc(dst_size);
-                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                            host_buf, ne0 * sizeof(float), dst_dd_i,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, dpct::device_to_host, *stream)));
-                        dpct::dev_mgr::instance().get_device(i).queues_wait_and_throw();
-                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                            dhf_dst_i, ne0 * sizeof(float), host_buf,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, dpct::host_to_device, *main_stream)));
-                        dpct::dev_mgr::instance().get_device(ctx.device).queues_wait_and_throw();
-                        free(host_buf);
-                    } else {
-                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
-                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, kind, *stream)));
-                    }
+                    SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
+                        dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
+                        row_diff * sizeof(float), row_diff * sizeof(float),
+                        src1_ncols, dpct::device_to_device, *stream)));
                 } else {
                     float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                     GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -11234,12 +11162,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                 }
             }
         }
-
-        if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-            SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                dpct::get_current_device().queues_wait_and_throw()));
-        }
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
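
Note on why the on-device checks can simply be dropped: the change assumes every tensor handled by the SYCL backend lives in a backend buffer whose storage is a USM device allocation, so `tensor->data` is already a device pointer and per-op staging copies, `SYCL_CHECK(0)` placeholders, and trailing host syncs are no longer the op's job. The following is a minimal, self-contained sketch of that invariant in plain SYCL 2020, not ggml code; the names and structure are illustrative only.

#include <sycl/sycl.hpp>
#include <cstdio>
#include <vector>

// Sketch (not ggml code): with a USM *device* allocation, the raw pointer
// stored alongside a tensor can be handed straight to a kernel -- no
// backend-type check, no pool staging, no copy back to the host in the op.
int main() {
    // in-order queue, so the kernel and the final download cannot race
    sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order{}};
    const size_t n = 1024;

    // conceptually what a SYCL backend buffer allocation provides
    float * data = sycl::malloc_device<float>(n, q);

    std::vector<float> host(n, 2.0f);
    q.memcpy(data, host.data(), n * sizeof(float)); // explicit upload, once

    // the "op": consumes the device pointer directly, as the simplified
    // ggml_sycl_op_flatten now does with src0->data / src1->data / dst->data
    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
        data[i] *= data[i];
    });

    q.memcpy(host.data(), data, n * sizeof(float)).wait(); // explicit download
    std::printf("host[0] = %f\n", host[0]); // prints 4.000000
    sycl::free(data, q);
    return 0;
}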
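On the `split` path, the old code bounced partial results through a `malloc`'d host buffer because the removed "todo, dirty solution" comment said device-to-device memcpy was not yet supported; the new code issues the pitched 2D copy directly with `dpct::device_to_device`. As a rough plain-SYCL picture of what such a pitched copy does (the dpct helper may well implement it differently; `memcpy2d_sketch` is a hypothetical name):

#include <sycl/sycl.hpp>

// Hedged sketch of a pitched 2D copy: move `height` rows of `width` bytes
// from a tightly packed partial result (src_pitch) into rows of a larger
// destination matrix (dst_pitch). In the diff this corresponds to
// dst_pitch = ne0*sizeof(float), src_pitch = width = row_diff*sizeof(float),
// height = src1_ncols. Assumes an in-order queue.
void memcpy2d_sketch(sycl::queue & q,
                     void * dst, size_t dst_pitch,
                     const void * src, size_t src_pitch,
                     size_t width, size_t height) {
    for (size_t row = 0; row < height; ++row) {
        q.memcpy(static_cast<char *>(dst) + row * dst_pitch,
                 static_cast<const char *>(src) + row * src_pitch, width);
    }
    // asynchronous like the dpct call; the caller synchronizes the queue
}

The `dhf_dst_i += src1_col_0*ne0 + dev[i].row_low` adjustment just before the copy positions the write at the correct column block and row slice of the full `dst` matrix, so each device's contiguous partial result lands strided into place.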