Summary: consolidate the dpct helper definitions into a single header file, replacing the dpct/ folder.
This commit is contained in:
parent
5b5389941e
commit
a47f5ec42e
27 changed files with 2836 additions and 25652 deletions
842
dpct/atomic.hpp
842
dpct/atomic.hpp
|
@ -1,842 +0,0 @@
|
|||
//==---- atomic.hpp -------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_ATOMIC_HPP__
|
||||
#define __DPCT_ATOMIC_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
namespace dpct {
|
||||
|
||||
/// Atomically add the value operand to the value at the addr and assign the
|
||||
/// result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to add to the value at \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_add(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_add(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_add(operand);
|
||||
}
|
||||
|
||||
/// Atomically add the value operand to the value at the addr and assign the
|
||||
/// result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to add to the value at \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_add(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_add(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically subtract the value operand from the value at the addr and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to subtract from the value at \p addr
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_sub(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_sub(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_sub(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_sub(operand);
|
||||
}
|
||||
|
||||
/// Atomically subtract the value operand from the value at the addr and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to subtract from the value at \p addr
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_sub(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_sub(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_sub<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically perform a bitwise AND between the value operand and the value at the addr
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_and(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_and(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_and(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_and(operand);
|
||||
}
|
||||
|
||||
/// Atomically perform a bitwise AND between the value operand and the value at the addr
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_and(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_and<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_and<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_and<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_and(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_and<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically or the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_or(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_or(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_or(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_or(operand);
|
||||
}
|
||||
|
||||
/// Atomically or the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_or(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_or<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_or<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_or<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_or(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_or<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically xor the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_xor(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_xor(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_xor(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_xor(operand);
|
||||
}
|
||||
|
||||
/// Atomically xor the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_xor(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_xor(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_xor<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically calculate the minimum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_min(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_min(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_min(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_min(operand);
|
||||
}
|
||||
|
||||
/// Atomically calculate the minimum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_min(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_min<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_min<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_min<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_min(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_min<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically calculate the maximum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_max(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_max(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_max(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_max(operand);
|
||||
}
|
||||
|
||||
/// Atomically calculate the maximum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_max(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_max<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_max<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_max<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_max(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_max<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically set \p operand to the value stored in \p addr, if old value stored in
|
||||
/// \p addr is equal to zero or greater than \p operand, else decrease the value stored
|
||||
/// in \p addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The threshold value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The old value stored in \p addr.
|
||||
template <sycl::access::address_space addressSpace = sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline unsigned int atomic_fetch_compare_dec(unsigned int *addr,
|
||||
unsigned int operand) {
|
||||
auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
|
||||
addressSpace>(addr[0]);
|
||||
unsigned int old;
|
||||
|
||||
while (true) {
|
||||
old = atm.load();
|
||||
if (old == 0 || old > operand) {
|
||||
if (atm.compare_exchange_strong(old, operand))
|
||||
break;
|
||||
} else if (atm.compare_exchange_strong(old, old - 1))
|
||||
break;
|
||||
}
|
||||
|
||||
return old;
|
||||
}
|
||||
|
||||
/// Atomically increment the value stored in \p addr if old value stored in \p
|
||||
/// addr is less than \p operand, else set 0 to the value stored in \p addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The threshold value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The old value stored in \p addr.
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline unsigned int atomic_fetch_compare_inc(unsigned int *addr,
|
||||
unsigned int operand) {
|
||||
auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
|
||||
addressSpace>(addr[0]);
|
||||
unsigned int old;
|
||||
while (true) {
|
||||
old = atm.load();
|
||||
if (old >= operand) {
|
||||
if (atm.compare_exchange_strong(old, 0))
|
||||
break;
|
||||
} else if (atm.compare_exchange_strong(old, old + 1))
|
||||
break;
|
||||
}
|
||||
return old;
|
||||
}
|
||||
|
||||
/// Atomically increment the value stored in \p addr if old value stored in \p
|
||||
/// addr is less than \p operand, else set 0 to the value stored in \p addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The threshold value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The old value stored in \p addr.
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline unsigned int
|
||||
atomic_fetch_compare_inc(unsigned int *addr, unsigned int operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr,
|
||||
operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr,
|
||||
operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr,
|
||||
operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
/// Atomically exchange the value at the address addr with the value operand.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to be exchanged with the value pointed by \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_exchange(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.exchange(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_exchange(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.exchange(operand);
|
||||
}
|
||||
|
||||
/// Atomically exchange the value at the address addr with the value operand.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to be exchanged with the value pointed by \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_exchange(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_exchange<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_exchange<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_exchange<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_exchange(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_exchange<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically compare the value at \p addr to the value expected and exchange
|
||||
/// with the value desired if the value at \p addr is equal to the value expected.
|
||||
/// Returns the value at the \p addr before the call.
|
||||
/// \param [in, out] addr Multi_ptr.
|
||||
/// \param expected The value to compare against the value at \p addr.
|
||||
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
|
||||
/// \param success The memory ordering used when comparison succeeds.
|
||||
/// \param fail The memory ordering used when comparison fails.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
T atomic_compare_exchange_strong(
|
||||
sycl::multi_ptr<T, addressSpace> addr, T expected, T desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
auto atm = sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(*addr);
|
||||
|
||||
atm.compare_exchange_strong(expected, desired, success, fail);
|
||||
return expected;
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2, typename T3>
|
||||
T1 atomic_compare_exchange_strong(
|
||||
sycl::multi_ptr<T1, addressSpace> addr, T2 expected, T3 desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(*addr);
|
||||
T1 expected_value = expected;
|
||||
atm.compare_exchange_strong(expected_value, desired, success, fail);
|
||||
return expected_value;
|
||||
}
|
||||
|
||||
/// Atomically compare the value at \p addr to the value expected and exchange
|
||||
/// with the value desired if the value at \p addr is equal to the value expected.
|
||||
/// Returns the value at the \p addr before the call.
|
||||
/// \param [in] addr The pointer to the data.
|
||||
/// \param expected The value to compare against the value at \p addr.
|
||||
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
|
||||
/// \param success The memory ordering used when comparison succeeds.
|
||||
/// \param fail The memory ordering used when comparison fails.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
T atomic_compare_exchange_strong(
|
||||
T *addr, T expected, T desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
atm.compare_exchange_strong(expected, desired, success, fail);
|
||||
return expected;
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2, typename T3>
|
||||
T1 atomic_compare_exchange_strong(
|
||||
T1 *addr, T2 expected, T3 desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
T1 expected_value = expected;
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
atm.compare_exchange_strong(expected_value, desired, success, fail);
|
||||
return expected_value;
|
||||
}
|
||||
|
||||
/// Atomic extension to implement standard APIs in std::atomic
|
||||
namespace detail {
/// Trait reporting whether \p T is usable with dpct::atomic.
/// Valid types are int, unsigned int, long, unsigned long, long long,
/// unsigned long long, float, double and pointer types.
template <typename T> struct IsValidAtomicType {
  static constexpr bool value =
      std::is_same_v<T, int> || std::is_same_v<T, unsigned int> ||
      std::is_same_v<T, long> || std::is_same_v<T, unsigned long> ||
      std::is_same_v<T, long long> ||
      std::is_same_v<T, unsigned long long> ||
      std::is_same_v<T, float> || std::is_same_v<T, double> ||
      std::is_pointer_v<T>;
};
} // namespace detail
|
||||
|
||||
template <typename T,
|
||||
sycl::memory_scope DefaultScope = sycl::memory_scope::system,
|
||||
sycl::memory_order DefaultOrder = sycl::memory_order::seq_cst,
|
||||
sycl::access::address_space Space =
|
||||
sycl::access::address_space::generic_space>
|
||||
class atomic{
|
||||
static_assert(
|
||||
detail::IsValidAtomicType<T>::value,
|
||||
"Invalid atomic type. Valid types are int, unsigned int, long, "
|
||||
"unsigned long, long long, unsigned long long, float, double "
|
||||
"and pointer types");
|
||||
T __d;
|
||||
|
||||
public:
|
||||
/// default memory synchronization order
|
||||
static constexpr sycl::memory_order default_read_order =
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_read_order;
|
||||
static constexpr sycl::memory_order default_write_order =
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_write_order;
|
||||
static constexpr sycl::memory_scope default_scope = DefaultScope;
|
||||
static constexpr sycl::memory_order default_read_modify_write_order =
|
||||
DefaultOrder;
|
||||
|
||||
|
||||
/// Default constructor.
|
||||
constexpr atomic() noexcept = default;
|
||||
/// Constructor with initialize value.
|
||||
constexpr atomic(T d) noexcept : __d(d){};
|
||||
|
||||
/// atomically replaces the value of the referenced object with a non-atomic argument
|
||||
/// \param operand The value to replace the pointed value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
void store(T operand, sycl::memory_order memoryOrder = default_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
atm.store(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically obtains the value of the referenced object
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object
|
||||
T load(sycl::memory_order memoryOrder = default_read_order,
|
||||
sycl::memory_scope memoryScope = default_scope) const noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(
|
||||
const_cast<T &>(__d));
|
||||
return atm.load(memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically replaces the value of the referenced object and obtains the value held previously
|
||||
/// \param operand The value to replace the pointed value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object before the call.
|
||||
T exchange(T operand,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.exchange(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically compares the value of the referenced object with non-atomic argument
|
||||
/// and performs atomic exchange if equal or atomic load if not
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param success The memory models for the read-modify-write
|
||||
/// \param failure The memory models for load operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_weak(
|
||||
T &expected, T desired,
|
||||
sycl::memory_order success, sycl::memory_order failure,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_weak(expected, desired, success, failure, memoryScope);
|
||||
}
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param memoryOrder The memory synchronization ordering for operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_weak(T &expected, T desired,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_weak(expected, desired, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically compares the value of the referenced object with non-atomic argument
|
||||
/// and performs atomic exchange if equal or atomic load if not
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param success The memory models for the read-modify-write
|
||||
/// \param failure The memory models for load operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_strong(
|
||||
T &expected, T desired,
|
||||
sycl::memory_order success, sycl::memory_order failure,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_strong(expected, desired, success, failure, memoryScope);
|
||||
}
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param memoryOrder The memory synchronization ordering for operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_strong(T &expected, T desired,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_strong(expected, desired, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically adds the argument to the value stored in the atomic object and obtains the value held previously
|
||||
/// \param operand The other argument of arithmetic addition
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object before the call.
|
||||
T fetch_add(T operand,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.fetch_add(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically subtracts the argument from the value stored in the atomic object and obtains the value held previously
|
||||
/// \param operand The other argument of arithmetic subtraction
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object before the call.
|
||||
T fetch_sub(T operand,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.fetch_sub(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace dpct
|
||||
#endif // __DPCT_ATOMIC_HPP__
|
1792
dpct/blas_utils.hpp
1792
dpct/blas_utils.hpp
File diff suppressed because it is too large
Load diff
|
@ -1,286 +0,0 @@
|
|||
//==---- ccl_utils.hpp----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_CCL_UTILS_HPP__
|
||||
#define __DPCT_CCL_UTILS_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/ccl.hpp>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
|
||||
#include "device.hpp"
|
||||
|
||||
namespace dpct {
|
||||
namespace ccl {
|
||||
namespace detail {
|
||||
|
||||
/// Get stored kvs with specified kvs address.
|
||||
inline std::shared_ptr<oneapi::ccl::kvs> &
|
||||
get_kvs(const oneapi::ccl::kvs::address_type &addr) {
|
||||
struct hash {
|
||||
std::size_t operator()(const oneapi::ccl::kvs::address_type &in) const {
|
||||
return std::hash<std::string_view>()(std::string_view(in.data(), in.size()));
|
||||
}
|
||||
};
|
||||
static std::unordered_map<oneapi::ccl::kvs::address_type,
|
||||
std::shared_ptr<oneapi::ccl::kvs>, hash>
|
||||
kvs_map;
|
||||
return kvs_map[addr];
|
||||
}
|
||||
|
||||
/// Help class to init ccl environment.
|
||||
class ccl_init_helper {
|
||||
public:
|
||||
ccl_init_helper() { oneapi::ccl::init(); }
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Get concatenated library version as an integer.
|
||||
static inline int get_version() {
|
||||
oneapi::ccl::init();
|
||||
auto ver = oneapi::ccl::get_library_version();
|
||||
return ver.major * 10000 + ver.minor * 100 + ver.update;
|
||||
}
|
||||
|
||||
/// Create main kvs and return its address.
|
||||
static inline oneapi::ccl::kvs::address_type create_kvs_address() {
|
||||
oneapi::ccl::init();
|
||||
auto ptr = oneapi::ccl::create_main_kvs();
|
||||
auto addr = ptr->get_address();
|
||||
detail::get_kvs(addr) = ptr;
|
||||
return addr;
|
||||
}
|
||||
|
||||
/// Get stored kvs with /p addr if exist. Otherwise, create kvs with /p addr.
|
||||
static inline std::shared_ptr<oneapi::ccl::kvs>
|
||||
create_kvs(const oneapi::ccl::kvs::address_type &addr) {
|
||||
oneapi::ccl::init();
|
||||
auto &ptr = detail::get_kvs(addr);
|
||||
if (!ptr)
|
||||
ptr = oneapi::ccl::create_kvs(addr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// dpct communicator extension
|
||||
class communicator_wrapper : public dpct::ccl::detail::ccl_init_helper {
|
||||
public:
|
||||
communicator_wrapper(
|
||||
int size, int rank, oneapi::ccl::kvs::address_type id,
|
||||
const oneapi::ccl::comm_attr &attr = oneapi::ccl::default_comm_attr)
|
||||
: _device_comm(oneapi::ccl::create_device(
|
||||
static_cast<sycl::device &>(dpct::get_current_device()))),
|
||||
_context_comm(oneapi::ccl::create_context(dpct::get_default_context())),
|
||||
_comm(oneapi::ccl::create_communicator(
|
||||
size, rank, _device_comm, _context_comm, dpct::ccl::create_kvs(id),
|
||||
attr)) {
|
||||
_queue_init = false;
|
||||
_ccl_stream_ptr = nullptr;
|
||||
}
|
||||
|
||||
~communicator_wrapper() {
|
||||
delete _ccl_stream_ptr;
|
||||
};
|
||||
|
||||
/// Return the rank in a oneapi::ccl::communicator
|
||||
/// \returns The rank corresponding to communicator object
|
||||
int rank() const {
|
||||
return _comm.rank();
|
||||
}
|
||||
|
||||
/// Retrieves the number of rank in oneapi::ccl::communicator
|
||||
/// \returns The number of the ranks
|
||||
int size() const {
|
||||
return _comm.size();
|
||||
}
|
||||
|
||||
/// Return underlying native device, which was used in oneapi::ccl::communicator
|
||||
sycl::device get_device() const {
|
||||
return _comm.get_device().get_native();
|
||||
}
|
||||
|
||||
/// \brief allreduce is a collective communication operation that performs the global reduction operation
|
||||
/// on values from all ranks of communicator and distributes the result back to all ranks.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
|
||||
/// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
|
||||
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
|
||||
/// \param rtype the type of the reduction operation to be applied
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void allreduce(const void *sendbuff, void *recvbuff, size_t count,
|
||||
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::allreduce(sendbuff, recvbuff, count, dtype, rtype,
|
||||
_comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief reduce is a collective communication operation that performs the
|
||||
/// global reduction operation on values from all ranks of the communicator
|
||||
/// and returns the result to the root rank.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores
|
||||
/// local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result,
|
||||
/// must have the same dimension as @c sendbuff
|
||||
/// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
|
||||
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
|
||||
/// \param root the rank that gets the result of reduction
|
||||
/// \param rtype the type of the reduction operation to be applied
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void reduce(const void *sendbuff, void *recvbuff, size_t count,
|
||||
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
|
||||
int root, sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::reduce(sendbuff, recvbuff, count, dtype, rtype,
|
||||
root, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief broadcast is a collective communication operation that broadcasts data
|
||||
/// from one rank of communicator (denoted as root) to all other ranks.
|
||||
/// Only support in-place operation
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores
|
||||
/// local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result
|
||||
/// \param count the number of elements of type @c dtype in @c buf
|
||||
/// \param dtype thedatatype of elements in @c buf
|
||||
/// \param root the rank that broadcasts @c buf
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void broadcast(void *sendbuff, void *recvbuff, size_t count,
|
||||
oneapi::ccl::datatype dtype, int root,
|
||||
sycl::queue *queue_ptr) {
|
||||
if (sendbuff != recvbuff) {
|
||||
throw std::runtime_error(
|
||||
"oneCCL broadcast only support in-place operation. "
|
||||
"sendbuff and recvbuff must be same.");
|
||||
return;
|
||||
}
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::broadcast(recvbuff, count, dtype, root, _comm,
|
||||
stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief reduce_scatter is a collective communication operation that performs the global reduction operation
|
||||
/// on values from all ranks of the communicator and scatters the result in blocks back to all ranks.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
|
||||
/// \param recv_count the number of elements of type @c dtype in receive block
|
||||
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
|
||||
/// \param rtype the type of the reduction operation to be applied
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void reduce_scatter(const void *sendbuff, void *recvbuff, size_t recv_count,
|
||||
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::reduce_scatter(sendbuff, recvbuff, recv_count,
|
||||
dtype, rtype, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief send is a pt2pt communication operation that sends data from one rank of communicator.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype serves as send buffer for root
|
||||
/// \param count the number of elements of type @c dtype in @c sendbuff
|
||||
/// \param dtype the datatype of elements in @c sendbuff
|
||||
/// \param peer the rank that receives @c sendbuff
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void send(void *sendbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::send(sendbuff, count, dtype, peer, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief recv is a pt2pt communication operation that sends data from one rank of communicator.
|
||||
/// \param recvbuff the buffer with @c count elements of @c dtype serves as receive buffer
|
||||
/// \param count the number of elements of type @c dtype in @c recvbuff
|
||||
/// \param dtype the datatype of elements in @c recvbuff
|
||||
/// \param peer the rank that receives @c recvbuff
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void recv(void *recvbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::recv(recvbuff, count, dtype, peer, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
private:
|
||||
oneapi::ccl::device _device_comm;
|
||||
oneapi::ccl::context _context_comm;
|
||||
oneapi::ccl::communicator _comm;
|
||||
sycl::queue _queue;
|
||||
bool _queue_init;
|
||||
oneapi::ccl::stream *_ccl_stream_ptr;
|
||||
|
||||
template <class Fn>
|
||||
void call_func_wrapper(Fn func, sycl::queue *qptr) {
|
||||
if (_queue_init && *qptr != _queue) {
|
||||
call_func_async(func, qptr);
|
||||
} else {
|
||||
if(!_queue_init) {
|
||||
_queue = *qptr;
|
||||
_queue_init = true;
|
||||
_ccl_stream_ptr = new oneapi::ccl::stream(oneapi::ccl::create_stream(_queue));
|
||||
}
|
||||
std::invoke(func, *_ccl_stream_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
class call_func_async {
|
||||
sycl::queue *_q_ptr;
|
||||
struct call_async_impl {
|
||||
oneapi::ccl::stream _ccl_stream_impl;
|
||||
oneapi::ccl::event _ccl_event_impl;
|
||||
template <class Fn>
|
||||
explicit call_async_impl(Fn func, sycl::queue *qptr)
|
||||
: _ccl_stream_impl(oneapi::ccl::create_stream(*qptr)),
|
||||
_ccl_event_impl(std::invoke(func, _ccl_stream_impl)) {}
|
||||
};
|
||||
call_async_impl *_imp;
|
||||
|
||||
public:
|
||||
template <class Fn>
|
||||
explicit call_func_async(Fn func, sycl::queue *qptr)
|
||||
: _q_ptr(qptr),
|
||||
_imp(new call_async_impl(func, qptr)) {}
|
||||
~call_func_async() {
|
||||
_q_ptr->submit([&](sycl::handler &cgh)
|
||||
{ cgh.host_task([=]
|
||||
{
|
||||
_imp->_ccl_event_impl.wait();
|
||||
delete _imp; }); });
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
typedef dpct::ccl::communicator_wrapper *comm_ptr;
|
||||
|
||||
} // namespace ccl
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_CCL_UTILS_HPP__
|
781
dpct/device.hpp
781
dpct/device.hpp
|
@ -1,781 +0,0 @@
|
|||
//==---- device.hpp -------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_DEVICE_HPP__
|
||||
#define __DPCT_DEVICE_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
#if defined(__linux__)
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#endif
|
||||
#if defined(_WIN64)
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
namespace dpct {
|
||||
namespace detail {
|
||||
/// Parse the major and minor version of \p dev from its version string.
/// Version string has the following format:
///   a. OpenCL<space><major.minor><space><vendor-specific-information>
///   b. <major.minor>
/// \param [in] dev The device to query.
/// \param [out] major Parsed major version; 0 if no digits were found.
/// \param [out] minor Parsed minor version; 0 if no ".minor" part was found.
static void get_version(const sycl::device &dev, int &major, int &minor) {
  std::string ver = dev.get_info<sycl::info::device::version>();
  std::string::size_type i = 0;
  // Skip to the first digit. Cast to unsigned char: isdigit on a negative
  // char value is undefined behavior.
  while (i < ver.size() && !isdigit(static_cast<unsigned char>(ver[i])))
    ++i;
  if (i == ver.size()) {
    // No digits at all (the original code would read past the end here).
    major = 0;
    minor = 0;
    return;
  }
  major = std::stoi(ver.substr(i));
  // Skip to the character following the '.' separator.
  while (i < ver.size() && ver[i] != '.')
    ++i;
  ++i;
  if (i >= ver.size()) {
    // Malformed string without a ".minor" component.
    minor = 0;
    return;
  }
  minor = std::stoi(ver.substr(i));
}
|
||||
} // namespace detail
|
||||
|
||||
/// SYCL default exception handler
|
||||
inline auto exception_handler = [](sycl::exception_list exceptions) {
|
||||
for (std::exception_ptr const &e : exceptions) {
|
||||
try {
|
||||
std::rethrow_exception(e);
|
||||
} catch (sycl::exception const &e) {
|
||||
std::cerr << "Caught asynchronous SYCL exception:" << std::endl
|
||||
<< e.what() << std::endl
|
||||
<< "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
typedef sycl::event *event_ptr;
|
||||
|
||||
typedef sycl::queue *queue_ptr;
|
||||
|
||||
typedef char *device_ptr;
|
||||
|
||||
/// Destroy \p event pointed memory.
|
||||
///
|
||||
/// \param event Pointer to the sycl::event address.
|
||||
static void destroy_event(event_ptr event) {
|
||||
delete event;
|
||||
}
|
||||
|
||||
class device_info {
|
||||
public:
|
||||
// get interface
|
||||
const char *get_name() const { return _name; }
|
||||
char *get_name() { return _name; }
|
||||
template <typename WorkItemSizesTy = sycl::range<3>,
|
||||
std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
|
||||
std::is_same_v<WorkItemSizesTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_work_item_sizes() const {
|
||||
if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
|
||||
return sycl::range<3>(_max_work_item_sizes_i[0],
|
||||
_max_work_item_sizes_i[1],
|
||||
_max_work_item_sizes_i[2]);
|
||||
else {
|
||||
return _max_work_item_sizes_i;
|
||||
}
|
||||
}
|
||||
template <typename WorkItemSizesTy = sycl::range<3>,
|
||||
std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
|
||||
std::is_same_v<WorkItemSizesTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_work_item_sizes() {
|
||||
if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
|
||||
return sycl::range<3>(_max_work_item_sizes_i[0],
|
||||
_max_work_item_sizes_i[1],
|
||||
_max_work_item_sizes_i[2]);
|
||||
else {
|
||||
return _max_work_item_sizes_i;
|
||||
}
|
||||
}
|
||||
bool get_host_unified_memory() const { return _host_unified_memory; }
|
||||
int get_major_version() const { return _major; }
|
||||
int get_minor_version() const { return _minor; }
|
||||
int get_integrated() const { return _integrated; }
|
||||
int get_max_clock_frequency() const { return _frequency; }
|
||||
int get_max_compute_units() const { return _max_compute_units; }
|
||||
int get_max_work_group_size() const { return _max_work_group_size; }
|
||||
int get_max_sub_group_size() const { return _max_sub_group_size; }
|
||||
int get_max_work_items_per_compute_unit() const {
|
||||
return _max_work_items_per_compute_unit;
|
||||
}
|
||||
int get_max_register_size_per_work_group() const {
|
||||
return _max_register_size_per_work_group;
|
||||
}
|
||||
template <typename NDRangeSizeTy = size_t *,
|
||||
std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
|
||||
std::is_same_v<NDRangeSizeTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_nd_range_size() const {
|
||||
if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
|
||||
return _max_nd_range_size;
|
||||
else
|
||||
return _max_nd_range_size_i;
|
||||
}
|
||||
template <typename NDRangeSizeTy = size_t *,
|
||||
std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
|
||||
std::is_same_v<NDRangeSizeTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_nd_range_size() {
|
||||
if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
|
||||
return _max_nd_range_size;
|
||||
else
|
||||
return _max_nd_range_size_i;
|
||||
}
|
||||
size_t get_global_mem_size() const { return _global_mem_size; }
|
||||
size_t get_local_mem_size() const { return _local_mem_size; }
|
||||
/// Returns the maximum clock rate of device's global memory in kHz. If
|
||||
/// compiler does not support this API then returns default value 3200000 kHz.
|
||||
unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
|
||||
/// Returns the maximum bus width between device and memory in bits. If
|
||||
/// compiler does not support this API then returns default value 64 bits.
|
||||
unsigned int get_memory_bus_width() const { return _memory_bus_width; }
|
||||
uint32_t get_device_id() const { return _device_id; }
|
||||
std::array<unsigned char, 16> get_uuid() const { return _uuid; }
|
||||
/// Returns global memory cache size in bytes.
|
||||
unsigned int get_global_mem_cache_size() const {
|
||||
return _global_mem_cache_size;
|
||||
}
|
||||
|
||||
// set interface
|
||||
void set_name(const char* name) {
|
||||
size_t length = strlen(name);
|
||||
if (length < 256) {
|
||||
std::memcpy(_name, name, length + 1);
|
||||
} else {
|
||||
std::memcpy(_name, name, 255);
|
||||
_name[255] = '\0';
|
||||
}
|
||||
}
|
||||
void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) {
|
||||
for (int i = 0; i < 3; ++i)
|
||||
_max_work_item_sizes_i[i] = max_work_item_sizes[i];
|
||||
}
|
||||
[[deprecated]] void
|
||||
set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) {
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
_max_work_item_sizes_i[i] = max_work_item_sizes[i];
|
||||
}
|
||||
}
|
||||
void set_host_unified_memory(bool host_unified_memory) {
|
||||
_host_unified_memory = host_unified_memory;
|
||||
}
|
||||
void set_major_version(int major) { _major = major; }
|
||||
void set_minor_version(int minor) { _minor = minor; }
|
||||
void set_integrated(int integrated) { _integrated = integrated; }
|
||||
void set_max_clock_frequency(int frequency) { _frequency = frequency; }
|
||||
void set_max_compute_units(int max_compute_units) {
|
||||
_max_compute_units = max_compute_units;
|
||||
}
|
||||
void set_global_mem_size(size_t global_mem_size) {
|
||||
_global_mem_size = global_mem_size;
|
||||
}
|
||||
void set_local_mem_size(size_t local_mem_size) {
|
||||
_local_mem_size = local_mem_size;
|
||||
}
|
||||
void set_max_work_group_size(int max_work_group_size) {
|
||||
_max_work_group_size = max_work_group_size;
|
||||
}
|
||||
void set_max_sub_group_size(int max_sub_group_size) {
|
||||
_max_sub_group_size = max_sub_group_size;
|
||||
}
|
||||
void
|
||||
set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) {
|
||||
_max_work_items_per_compute_unit = max_work_items_per_compute_unit;
|
||||
}
|
||||
void set_max_nd_range_size(int max_nd_range_size[]) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
_max_nd_range_size[i] = max_nd_range_size[i];
|
||||
_max_nd_range_size_i[i] = max_nd_range_size[i];
|
||||
}
|
||||
}
|
||||
void set_memory_clock_rate(unsigned int memory_clock_rate) {
|
||||
_memory_clock_rate = memory_clock_rate;
|
||||
}
|
||||
void set_memory_bus_width(unsigned int memory_bus_width) {
|
||||
_memory_bus_width = memory_bus_width;
|
||||
}
|
||||
void
|
||||
set_max_register_size_per_work_group(int max_register_size_per_work_group) {
|
||||
_max_register_size_per_work_group = max_register_size_per_work_group;
|
||||
}
|
||||
void set_device_id(uint32_t device_id) {
|
||||
_device_id = device_id;
|
||||
}
|
||||
void set_uuid(std::array<unsigned char, 16> uuid) {
|
||||
_uuid = std::move(uuid);
|
||||
}
|
||||
void set_global_mem_cache_size(unsigned int global_mem_cache_size) {
|
||||
_global_mem_cache_size = global_mem_cache_size;
|
||||
}
|
||||
|
||||
private:
|
||||
char _name[256];
|
||||
int _max_work_item_sizes_i[3];
|
||||
bool _host_unified_memory = false;
|
||||
int _major;
|
||||
int _minor;
|
||||
int _integrated = 0;
|
||||
int _frequency;
|
||||
// Set estimated value 3200000 kHz as default value.
|
||||
unsigned int _memory_clock_rate = 3200000;
|
||||
// Set estimated value 64 bits as default value.
|
||||
unsigned int _memory_bus_width = 64;
|
||||
unsigned int _global_mem_cache_size;
|
||||
int _max_compute_units;
|
||||
int _max_work_group_size;
|
||||
int _max_sub_group_size;
|
||||
int _max_work_items_per_compute_unit;
|
||||
int _max_register_size_per_work_group;
|
||||
size_t _global_mem_size;
|
||||
size_t _local_mem_size;
|
||||
size_t _max_nd_range_size[3];
|
||||
int _max_nd_range_size_i[3];
|
||||
uint32_t _device_id;
|
||||
std::array<unsigned char, 16> _uuid;
|
||||
};
|
||||
|
||||
/// Return the major component of \p dev's version string.
static int get_major_version(const sycl::device &dev) {
  int major = 0, minor = 0;
  detail::get_version(dev, major, minor);
  return major;
}
|
||||
|
||||
/// Return the minor component of \p dev's version string.
static int get_minor_version(const sycl::device &dev) {
  int major = 0, minor = 0;
  detail::get_version(dev, major, minor);
  return minor;
}
|
||||
|
||||
/// Populate \p out with the queryable properties of \p dev.
/// Builds the result in a local device_info and assigns it at the end so a
/// partially-filled object is never observed through \p out.
/// \param [out] out Receives the populated device_info.
/// \param [in] dev The SYCL device to query.
static void get_device_info(device_info &out, const sycl::device &dev) {
  device_info prop;
  prop.set_name(dev.get_info<sycl::info::device::name>().c_str());

  int major, minor;
  detail::get_version(dev, major, minor);
  prop.set_major_version(major);
  prop.set_minor_version(minor);

  prop.set_max_work_item_sizes(
#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
      // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
      // is an enum class element
      dev.get_info<sycl::info::device::max_work_item_sizes>());
#else
      // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
      // an int
      dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
#endif
  prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));

  // SYCL reports the clock in MHz; dpct stores kHz, hence the * 1000.
  prop.set_max_clock_frequency(
      dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);

  prop.set_max_compute_units(
      dev.get_info<sycl::info::device::max_compute_units>());
  prop.set_max_work_group_size(
      dev.get_info<sycl::info::device::max_work_group_size>());
  prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
  prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());

#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
  // Intel device-info extension queries; each one is gated on the matching
  // aspect so unsupported devices keep the estimated defaults.
  if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) {
    unsigned int tmp =
        dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
    if (tmp != 0)
      prop.set_memory_clock_rate(1000 * tmp);
  }
  if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) {
    prop.set_memory_bus_width(
        dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
  }
  if (dev.has(sycl::aspect::ext_intel_device_id)) {
    prop.set_device_id(
        dev.get_info<sycl::ext::intel::info::device::device_id>());
  }
  if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) {
    prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
  }
#elif defined(_MSC_VER) && !defined(__clang__)
#pragma message("get_device_info: querying memory_clock_rate and \
memory_bus_width are not supported by the compiler used. \
Use 3200000 kHz as memory_clock_rate default value. \
Use 64 bits as memory_bus_width default value.")
#else
#warning "get_device_info: querying memory_clock_rate and \
memory_bus_width are not supported by the compiler used. \
Use 3200000 kHz as memory_clock_rate default value. \
Use 64 bits as memory_bus_width default value."
#endif

  // Largest sub-group size offered by the device (at least 1).
  size_t max_sub_group_size = 1;
  std::vector<size_t> sub_group_sizes =
      dev.get_info<sycl::info::device::sub_group_sizes>();

  for (const auto &sub_group_size : sub_group_sizes) {
    if (max_sub_group_size < sub_group_size)
      max_sub_group_size = sub_group_size;
  }

  prop.set_max_sub_group_size(max_sub_group_size);

  prop.set_max_work_items_per_compute_unit(
      dev.get_info<sycl::info::device::max_work_group_size>());
  // No direct SYCL equivalent of a CUDA grid limit; report INT_MAX per axis.
  int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
  prop.set_max_nd_range_size(max_nd_range_size);

  // Estimates max register size per work group, feel free to update the value
  // according to device properties.
  prop.set_max_register_size_per_work_group(65536);

  prop.set_global_mem_cache_size(
      dev.get_info<sycl::info::device::global_mem_cache_size>());
  out = prop;
}
|
||||
|
||||
/// dpct device extension
|
||||
/// dpct device extension
///
/// Wraps a sycl::device with a dedicated sycl::context and a set of owned
/// queues (one default in-order and one default out-of-order queue, plus any
/// user-created ones). All queue bookkeeping is guarded by m_mutex.
class device_ext : public sycl::device {
  typedef std::mutex mutex_type;

public:
  device_ext() : sycl::device(), _ctx(*this) {}
  ~device_ext() {
    std::lock_guard<mutex_type> lock(m_mutex);
    clear_queues();
  }
  device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) {
    std::lock_guard<mutex_type> lock(m_mutex);
    init_queues();
  }

  // Always reports 0; native atomics support is not detected here.
  int is_native_atomic_supported() { return 0; }
  int get_major_version() const {
    return dpct::get_major_version(*this);
  }

  int get_minor_version() const {
    return dpct::get_minor_version(*this);
  }

  int get_max_compute_units() const {
    return get_device_info().get_max_compute_units();
  }

  /// Return the maximum clock frequency of this device in KHz.
  int get_max_clock_frequency() const {
    return get_device_info().get_max_clock_frequency();
  }

  int get_integrated() const { return get_device_info().get_integrated(); }

  int get_max_sub_group_size() const {
    return get_device_info().get_max_sub_group_size();
  }

  int get_max_register_size_per_work_group() const {
    return get_device_info().get_max_register_size_per_work_group();
  }

  int get_max_work_group_size() const {
    return get_device_info().get_max_work_group_size();
  }

  int get_mem_base_addr_align() const {
    return get_info<sycl::info::device::mem_base_addr_align>();
  }

  size_t get_global_mem_size() const {
    return get_device_info().get_global_mem_size();
  }

  /// Get the number of bytes of free and total memory on the SYCL device.
  /// Free memory requires the ext_intel_free_memory aspect; otherwise 0 is
  /// reported and a diagnostic is printed to stderr.
  /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
  /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
  void get_memory_info(size_t &free_memory, size_t &total_memory) {
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
    if (!has(sycl::aspect::ext_intel_free_memory)) {
      std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
      free_memory = 0;
    } else {
      free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
    }
#else
    // Compiler too old for the free-memory query: report 0 and warn at build
    // time as well as at run time.
    std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
    free_memory = 0;
#if defined(_MSC_VER) && !defined(__clang__)
#pragma message("Querying the number of bytes of free memory is not supported")
#else
#warning "Querying the number of bytes of free memory is not supported"
#endif
#endif
    total_memory = get_device_info().get_global_mem_size();
  }

  void get_device_info(device_info &out) const {
    dpct::get_device_info(out, *this);
  }

  // Convenience overload: queries the device anew on every call.
  device_info get_device_info() const {
    device_info prop;
    dpct::get_device_info(prop, *this);
    return prop;
  }

  /// Drop all queues (including user-created ones) and recreate the defaults.
  void reset() {
    std::lock_guard<mutex_type> lock(m_mutex);
    clear_queues();
    init_queues();
  }

  sycl::queue &in_order_queue() { return *_q_in_order; }

  sycl::queue &out_of_order_queue() { return *_q_out_of_order; }

  // The "default" queue depends on the USM configuration.
  sycl::queue &default_queue() {
#ifdef DPCT_USM_LEVEL_NONE
    return out_of_order_queue();
#else
    return in_order_queue();
#endif // DPCT_USM_LEVEL_NONE
  }

  /// Wait on every owned queue. The queue list is snapshotted under the lock,
  /// then waited on unlocked so new queues can still be created meanwhile.
  void queues_wait_and_throw() {
    std::unique_lock<mutex_type> lock(m_mutex);
    std::vector<std::shared_ptr<sycl::queue>> current_queues(
        _queues);
    lock.unlock();
    for (const auto &q : current_queues) {
      q->wait_and_throw();
    }
    // Guard the destruct of current_queues to make sure the ref count is safe.
    lock.lock();
  }

  sycl::queue *create_queue(bool enable_exception_handler = false) {
#ifdef DPCT_USM_LEVEL_NONE
    return create_out_of_order_queue(enable_exception_handler);
#else
    return create_in_order_queue(enable_exception_handler);
#endif // DPCT_USM_LEVEL_NONE
  }

  sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
    std::lock_guard<mutex_type> lock(m_mutex);
    return create_queue_impl(enable_exception_handler,
                             sycl::property::queue::in_order());
  }

  sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
    std::lock_guard<mutex_type> lock(m_mutex);
    return create_queue_impl(enable_exception_handler);
  }

  /// Remove \p queue from the owned list and null the caller's pointer.
  void destroy_queue(sycl::queue *&queue) {
    std::lock_guard<mutex_type> lock(m_mutex);
    _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
                                 [=](const std::shared_ptr<sycl::queue> &q) -> bool {
                                   return q.get() == queue;
                                 }),
                  _queues.end());
    queue = nullptr;
  }
  void set_saved_queue(sycl::queue* q) {
    std::lock_guard<mutex_type> lock(m_mutex);
    _saved_queue = q;
  }
  sycl::queue *get_saved_queue() const {
    std::lock_guard<mutex_type> lock(m_mutex);
    return _saved_queue;
  }
  sycl::context get_context() const { return _ctx; }

private:
  // Caller must hold m_mutex. Resets queue pointers to a known-dead state.
  void clear_queues() {
    _queues.clear();
    _q_in_order = _q_out_of_order = _saved_queue = nullptr;
  }

  // Caller must hold m_mutex. Exception handlers are always enabled on the
  // two default queues.
  void init_queues() {
    _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
    _q_out_of_order = create_queue_impl(true);
    _saved_queue = &default_queue();
  }

  /// Caller should acquire resource \p m_mutex before calling this function.
  template <class... Properties>
  sycl::queue *create_queue_impl(bool enable_exception_handler,
                                 Properties... properties) {
    sycl::async_handler eh = {};
    if (enable_exception_handler) {
      eh = exception_handler;
    }
    _queues.push_back(std::make_shared<sycl::queue>(
        _ctx, *this, eh,
        sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
            sycl::property::queue::enable_profiling(),
#endif
            properties...)));

    return _queues.back().get();
  }

  void get_version(int &major, int &minor) const {
    detail::get_version(*this, major, minor);
  }
  sycl::queue *_q_in_order, *_q_out_of_order; // default queues, owned via _queues
  sycl::queue *_saved_queue;                  // last queue saved by the user
  sycl::context _ctx;                         // context private to this device
  std::vector<std::shared_ptr<sycl::queue>> _queues; // all owned queues
  mutable mutex_type m_mutex;                 // guards all queue state above
};
|
||||
|
||||
/// Return an OS-level id for the calling thread; used as the key of the
/// thread-to-device map in dev_mgr.
static inline unsigned int get_tid() {
#if defined(__linux__)
  return syscall(SYS_gettid);
#elif defined(_WIN64)
  return GetCurrentThreadId();
#else
#error "Only support Windows and Linux."
#endif
}
|
||||
|
||||
/// device manager
|
||||
class dev_mgr {
|
||||
public:
|
||||
device_ext ¤t_device() {
|
||||
unsigned int dev_id=current_device_id();
|
||||
check_id(dev_id);
|
||||
return *_devs[dev_id];
|
||||
}
|
||||
device_ext &cpu_device() const {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
if (_cpu_device == -1) {
|
||||
throw std::runtime_error("no valid cpu device");
|
||||
} else {
|
||||
return *_devs[_cpu_device];
|
||||
}
|
||||
}
|
||||
device_ext &get_device(unsigned int id) const {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
check_id(id);
|
||||
return *_devs[id];
|
||||
}
|
||||
unsigned int current_device_id() const {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
auto it=_thread2dev_map.find(get_tid());
|
||||
if(it != _thread2dev_map.end())
|
||||
return it->second;
|
||||
return DEFAULT_DEVICE_ID;
|
||||
}
|
||||
|
||||
/// Select device with a device ID.
|
||||
/// \param [in] id The id of the device which can
|
||||
/// be obtained through get_device_id(const sycl::device).
|
||||
void select_device(unsigned int id) {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
check_id(id);
|
||||
_thread2dev_map[get_tid()]=id;
|
||||
}
|
||||
unsigned int device_count() { return _devs.size(); }
|
||||
|
||||
unsigned int get_device_id(const sycl::device &dev) {
|
||||
unsigned int id = 0;
|
||||
for(auto dev_item : _devs) {
|
||||
if (*dev_item == dev) {
|
||||
break;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
return id;
|
||||
}
|
||||
|
||||
template <class DeviceSelector>
|
||||
std::enable_if_t<
|
||||
std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
|
||||
select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
|
||||
sycl::device selected_device = sycl::device(selector);
|
||||
unsigned int selected_device_id = get_device_id(selected_device);
|
||||
select_device(selected_device_id);
|
||||
}
|
||||
|
||||
/// Returns the instance of device manager singleton.
|
||||
static dev_mgr &instance() {
|
||||
static dev_mgr d_m;
|
||||
return d_m;
|
||||
}
|
||||
dev_mgr(const dev_mgr &) = delete;
|
||||
dev_mgr &operator=(const dev_mgr &) = delete;
|
||||
dev_mgr(dev_mgr &&) = delete;
|
||||
dev_mgr &operator=(dev_mgr &&) = delete;
|
||||
|
||||
private:
|
||||
mutable std::recursive_mutex m_mutex;
|
||||
dev_mgr() {
|
||||
sycl::device default_device =
|
||||
sycl::device(sycl::default_selector_v);
|
||||
_devs.push_back(std::make_shared<device_ext>(default_device));
|
||||
|
||||
std::vector<sycl::device> sycl_all_devs =
|
||||
sycl::device::get_devices(sycl::info::device_type::all);
|
||||
// Collect other devices except for the default device.
|
||||
if (default_device.is_cpu())
|
||||
_cpu_device = 0;
|
||||
for (auto &dev : sycl_all_devs) {
|
||||
if (dev == default_device) {
|
||||
continue;
|
||||
}
|
||||
_devs.push_back(std::make_shared<device_ext>(dev));
|
||||
if (_cpu_device == -1 && dev.is_cpu()) {
|
||||
_cpu_device = _devs.size() - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
void check_id(unsigned int id) const {
|
||||
if (id >= _devs.size()) {
|
||||
throw std::runtime_error("invalid device id");
|
||||
}
|
||||
}
|
||||
std::vector<std::shared_ptr<device_ext>> _devs;
|
||||
/// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
|
||||
/// thread id in _thread2dev_map, which means default device should be used
|
||||
/// for the current thread.
|
||||
const unsigned int DEFAULT_DEVICE_ID = 0;
|
||||
/// thread-id to device-id map.
|
||||
std::map<unsigned int, unsigned int> _thread2dev_map;
|
||||
int _cpu_device = -1;
|
||||
};
|
||||
|
||||
/// Util function to get the default queue of current selected device depends on
|
||||
/// the USM config. Return the default out-of-ordered queue when USM-none is
|
||||
/// enabled, otherwise return the default in-ordered queue.
|
||||
static inline sycl::queue &get_default_queue() {
|
||||
return dev_mgr::instance().current_device().default_queue();
|
||||
}
|
||||
|
||||
/// Util function to get the default in-ordered queue of current device in
|
||||
/// dpct device manager.
|
||||
static inline sycl::queue &get_in_order_queue() {
|
||||
return dev_mgr::instance().current_device().in_order_queue();
|
||||
}
|
||||
|
||||
/// Util function to get the default out-of-ordered queue of current device in
|
||||
/// dpct device manager.
|
||||
static inline sycl::queue &get_out_of_order_queue() {
|
||||
return dev_mgr::instance().current_device().out_of_order_queue();
|
||||
}
|
||||
|
||||
/// Util function to get the id of current device in
|
||||
/// dpct device manager.
|
||||
static inline unsigned int get_current_device_id() {
|
||||
return dev_mgr::instance().current_device_id();
|
||||
}
|
||||
|
||||
/// Util function to get the current device.
|
||||
static inline device_ext &get_current_device() {
|
||||
return dev_mgr::instance().current_device();
|
||||
}
|
||||
|
||||
/// Util function to get a device by id.
|
||||
static inline device_ext &get_device(unsigned int id) {
|
||||
return dev_mgr::instance().get_device(id);
|
||||
}
|
||||
|
||||
/// Util function to get the context of the default queue of current
|
||||
/// device in dpct device manager.
|
||||
static inline sycl::context get_default_context() {
|
||||
return dpct::get_current_device().get_context();
|
||||
}
|
||||
|
||||
/// Util function to get a CPU device.
|
||||
static inline device_ext &cpu_device() {
|
||||
return dev_mgr::instance().cpu_device();
|
||||
}
|
||||
|
||||
static inline unsigned int select_device(unsigned int id) {
|
||||
dev_mgr::instance().select_device(id);
|
||||
return id;
|
||||
}
|
||||
|
||||
/// Select the device matching an invocable SYCL device selector for the
/// calling thread. Participates in overload resolution only when
/// \p DeviceSelector is callable as int(const sycl::device &).
template <class DeviceSelector>
static inline std::enable_if_t<
    std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
  dev_mgr::instance().select_device(selector);
}
|
||||
|
||||
/// Index of \p dev in the dpct device manager's device list.
static inline unsigned int get_device_id(const sycl::device &dev) {
  dev_mgr &mgr = dev_mgr::instance();
  return mgr.get_device_id(dev);
}
|
||||
|
||||
/// Util function to check whether a device supports some kinds of sycl::aspect.
/// Throws std::runtime_error naming the first unsupported aspect; returns
/// normally when every aspect in \p props is supported.
inline void
has_capability_or_fail(const sycl::device &dev,
                       const std::initializer_list<sycl::aspect> &props) {
  for (const auto &it : props) {
    if (dev.has(it))
      continue;
    switch (it) {
    case sycl::aspect::fp64:
      throw std::runtime_error("'double' is not supported in '" +
                               dev.get_info<sycl::info::device::name>() +
                               "' device");
      break;
    case sycl::aspect::fp16:
      throw std::runtime_error("'half' is not supported in '" +
                               dev.get_info<sycl::info::device::name>() +
                               "' device");
      break;
    default:
// Expand the SYCL aspect .def lists into switch cases that map each aspect
// enumerator to its name; the macros are undefined again right after use.
#define __SYCL_ASPECT(ASPECT, ID)                                              \
  case sycl::aspect::ASPECT:                                                   \
    return #ASPECT;
#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
      auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string {
        switch (AspectNum) {
#include <sycl/info/aspects.def>
#include <sycl/info/aspects_deprecated.def>
        default:
          return "unknown aspect";
        }
      };
#undef __SYCL_ASPECT_DEPRECATED_ALIAS
#undef __SYCL_ASPECT_DEPRECATED
#undef __SYCL_ASPECT
      throw std::runtime_error(
          "'" + getAspectNameStr(it) + "' is not supported in '" +
          dev.get_info<sycl::info::device::name>() + "' device");
    }
    // Unreachable: every case above throws.
    break;
  }
}
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_DEVICE_HPP__
|
4921
dpct/dnnl_utils.hpp
4921
dpct/dnnl_utils.hpp
File diff suppressed because it is too large
Load diff
|
@ -1,62 +0,0 @@
|
|||
//==---- dpct.hpp ---------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//

#ifndef __DPCT_HPP__
#define __DPCT_HPP__

#include <sycl/sycl.hpp>
#include <iostream>
#include <limits.h>
#include <math.h>

// Tag types used to form unique SYCL kernel names from template arguments.
template <class... Args> class dpct_kernel_name;
template <int Arg> class dpct_kernel_scalar;

#include "atomic.hpp"
#include "device.hpp"
#include "image.hpp"
#include "kernel.hpp"
#include "math.hpp"
#include "memory.hpp"
#include "util.hpp"

// Compiler-portable alignment and forced-inlining attributes.
#if defined(_MSC_VER)
#define __dpct_align__(n) __declspec(align(n))
#define __dpct_inline__ __forceinline
#else
#define __dpct_align__(n) __attribute__((aligned(n)))
#define __dpct_inline__ __inline__ __attribute__((always_inline))
#endif

#if defined(_MSC_VER)
#define __dpct_noinline__ __declspec(noinline)
#else
#define __dpct_noinline__ __attribute__((noinline))
#endif

#define DPCT_COMPATIBILITY_TEMP (900)

namespace dpct{
// Status codes returned by DPCT_CHECK_ERROR.
enum error_code { success = 0, default_error = 999 };
}

// Evaluate \p expr inside an immediately-invoked lambda; yields dpct::success
// on normal completion, or prints the exception and yields
// dpct::default_error when \p expr throws a std::exception.
#define DPCT_CHECK_ERROR(expr)                                                 \
  [&]() {                                                                      \
    try {                                                                      \
      expr;                                                                    \
      return dpct::success;                                                    \
    } catch (std::exception const &e) {                                        \
      std::cerr << e.what() << std::endl;                                      \
      return dpct::default_error;                                              \
    }                                                                          \
  }()

#define DPCT_PI_F (3.14159274101257f)
#define DPCT_PI (3.141592653589793115998)

#endif // __DPCT_HPP__
|
File diff suppressed because it is too large
Load diff
|
@ -1,747 +0,0 @@
|
|||
//==---- dpcpp_extensions.h ------------------*- C++ -*---------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_DPCPP_EXTENSIONS_H__
|
||||
#define __DPCT_DPCPP_EXTENSIONS_H__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
|
||||
#include <sycl/ext/oneapi/experimental/user_defined_reductions.hpp>
|
||||
#endif
|
||||
|
||||
#include "../dpct.hpp"
|
||||
#include "functional.h"
|
||||
|
||||
namespace dpct {
|
||||
namespace group {
|
||||
namespace detail {
|
||||
|
||||
// Thin forwarding wrappers around the SYCL group algorithms used below.

template <typename... _Args>
constexpr auto __reduce_over_group(_Args... __args) {
  return sycl::reduce_over_group(__args...);
}

template <typename... _Args> constexpr auto __group_broadcast(_Args... __args) {
  return sycl::group_broadcast(__args...);
}

template <typename... _Args>
constexpr auto __exclusive_scan_over_group(_Args... __args) {
  return sycl::exclusive_scan_over_group(__args...);
}

template <typename... _Args>
constexpr auto __inclusive_scan_over_group(_Args... __args) {
  return sycl::inclusive_scan_over_group(__args...);
}
|
||||
|
||||
} // end namespace detail
|
||||
|
||||
/// Perform an exclusive scan over the values of inputs from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// Each work-item first reduces its own VALUES_PER_THREAD inputs, a group
/// exclusive scan combines the per-item totals, and then each work-item
/// expands its portion of the scan sequentially.
///
/// \param item A work-item in a group.
/// \param inputs Pointer to the input data for the scan operation.
/// \param outputs Pointer to the location where scan results will be stored.
/// \param init initial value of the scan result.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan.
template <typename Item, typename T, class BinaryOperation,
          int VALUES_PER_THREAD>
__dpct_inline__ void
exclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
               T (&outputs)[VALUES_PER_THREAD], T init,
               BinaryOperation binary_op) {
  // Per-item reduction of all locally held values.
  T result = inputs[0];

#pragma unroll
  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
    result = binary_op(result, inputs[i]);
  }

  // Group-wide exclusive scan over the per-item totals.
  T exclusive_result =
      detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);

  // Seed the first output: the group's first item starts from init, every
  // other item starts from the group-scan prefix.
  T input = inputs[0];
  if (item.get_local_linear_id() == 0) {
    outputs[0] = init;
  } else {
    outputs[0] = exclusive_result;
  }

  // Sequentially expand the remaining local outputs.
#pragma unroll
  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
    T output = binary_op(input, outputs[i - 1]);
    input = inputs[i];
    outputs[i] = output;
  }
}
|
||||
|
||||
/// Perform an exclusive scan over the values of input from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Input data for the scan operation.
/// \param init initial value of the scan result.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
/// in the work-items of the group. \returns exclusive scan of the first i
/// work-items where item is the i-th work item.
template <typename Item, typename T, class BinaryOperation>
__dpct_inline__ T
exclusive_scan(const Item &item, T input, T init, BinaryOperation binary_op,
               T &group_aggregate) {
  T output = detail::__exclusive_scan_over_group(item.get_group(), input, init,
                                                 binary_op);
  // The last work-item holds the full aggregate: its exclusive prefix plus
  // its own input.
  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
    group_aggregate = binary_op(output, input);
  }

  // Broadcast the aggregate from the last work-item to the whole group.
  group_aggregate = detail::__group_broadcast(
      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
  return output;
}
|
||||
|
||||
/// Perform an exclusive scan over the values of input from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Input data for the scan operation.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param prefix_callback_op functor invoked by the first
/// work-item in the group that returns the
/// initial value in the resulting scan of the work-items in the group.
/// \returns exclusive scan of the input elements assigned to work-items in the
/// group.
template <typename Item, typename T, class BinaryOperation,
          class GroupPrefixCallbackOperation>
__dpct_inline__ T
exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
               GroupPrefixCallbackOperation &prefix_callback_op) {
  T group_aggregate;

  T output =
      detail::__exclusive_scan_over_group(item.get_group(), input, binary_op);
  // Last work-item computes the group-wide aggregate (its prefix + input).
  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
    group_aggregate = binary_op(output, input);
  }

  group_aggregate = detail::__group_broadcast(
      item.get_group(), group_aggregate, item.get_local_range().size() - 1);

  // The callback turns the aggregate into a prefix to seed this scan.
  T group_prefix = prefix_callback_op(group_aggregate);
  if (item.get_local_linear_id() == 0) {
    output = group_prefix;
  } else {
    output = binary_op(group_prefix, output);
  }

  return output;
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
typedef uint16_t digit_counter_type;
|
||||
typedef uint32_t packed_counter_type;
|
||||
|
||||
/// Compile-time ceil(log2(N)): shift N right until it reaches zero while
/// counting the shifts, then correct down by one when N is an exact power of
/// two.
template <int N, int Remaining = N, int Shifts = 0> struct log2 {
  enum { VALUE = log2<N, (Remaining >> 1), Shifts + 1>::VALUE };
};

/// Base case: all bits of N consumed after `Shifts` right-shifts.
template <int N, int Shifts> struct log2<N, 0, Shifts> {
  enum { VALUE = (1 << (Shifts - 1) < N) ? Shifts : Shifts - 1 };
};
|
||||
|
||||
/// Cooperative radix ranking for a group: given each work-item's keys, compute
/// the rank (output position) of every key for the current RADIX_BITS-wide
/// digit. Digit counters are packed several-per-word in local memory and
/// combined with a group exclusive scan.
template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
public:
  // Bytes of local memory the caller must provide for `group_threads` items.
  static size_t get_local_memory_size(size_t group_threads) {
    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
  }

  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}

  /// Rank each key's current digit; `ranks[i]` receives the key's position.
  /// Must be called by all work-items of the group (contains barriers).
  template <typename Item, int VALUES_PER_THREAD>
  __dpct_inline__ void
  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {

    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
    digit_counter_type *digit_counters[VALUES_PER_THREAD];
    digit_counter_type *buffer =
        reinterpret_cast<digit_counter_type *>(_local_memory);

    reset_local_memory(item);

    item.barrier(sycl::access::fence_space::local_space);

    // Histogram phase: count this item's keys per digit, remembering each
    // key's position within its digit counter.
#pragma unroll
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
      uint32_t counter_lane = digit & (COUNTER_LANES - 1);

      if (DESCENDING) {
        // Mirror the digit mapping so larger digits rank first.
        sub_counter = PACKING_RATIO - 1 - sub_counter;
        counter_lane = COUNTER_LANES - 1 - counter_lane;
      }

      digit_counters[i] =
          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
      thread_prefixes[i] = *digit_counters[i];
      *digit_counters[i] = thread_prefixes[i] + 1;
    }

    item.barrier(sycl::access::fence_space::local_space);

    // Turn the per-digit counts into exclusive offsets.
    scan_counters(item);

    item.barrier(sycl::access::fence_space::local_space);

    // Rank = digit's scanned base offset + this key's position in the digit.
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      ranks[i] = thread_prefixes[i] + *digit_counters[i];
    }
  }

private:
  // Zero this item's slice of the packed counters.
  template <typename Item>
  __dpct_inline__ void reset_local_memory(const Item &item) {
    packed_counter_type *ptr =
        reinterpret_cast<packed_counter_type *>(_local_memory);

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
    }
  }

  // Load this item's counter segment into cached_segment and return its sum.
  template <typename Item>
  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
    packed_counter_type sum = 0;
    packed_counter_type *ptr =
        reinterpret_cast<packed_counter_type *>(_local_memory);

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
      cached_segment[i] =
          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
    }

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      sum += cached_segment[i];
    }

    return sum;
  }

  // Write back an exclusive running sum over cached_segment, seeded with
  // this item's raking partial from the group scan.
  template <typename Item>
  __dpct_inline__ void
  exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
    packed_counter_type *ptr =
        reinterpret_cast<packed_counter_type *>(_local_memory);
    packed_counter_type sum = raking_partial;

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      packed_counter_type value = cached_segment[i];
      cached_segment[i] = sum;
      sum += value;
    }

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
          cached_segment[i];
    }
  }

  // Converts the group aggregate of packed counters into the prefix each
  // packed sub-counter needs (shift the aggregate into higher sub-fields).
  struct prefix_callback {
    __dpct_inline__ packed_counter_type
    operator()(packed_counter_type block_aggregate) {
      packed_counter_type block_prefix = 0;

#pragma unroll
      for (int packed = 1; packed < PACKING_RATIO; packed++) {
        block_prefix += block_aggregate
                        << (sizeof(digit_counter_type) * 8 * packed);
      }

      return block_prefix;
    }
  };

  // Upsweep / group exclusive scan / downsweep over the packed counters.
  template <typename Item>
  __dpct_inline__ void scan_counters(const Item &item) {
    packed_counter_type raking_partial = upsweep(item);

    prefix_callback callback;
    packed_counter_type exclusive_partial = exclusive_scan(
        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
        callback);

    exclusive_downsweep(item, exclusive_partial);
  }

private:
  // How many digit counters fit into one packed counter word.
  static constexpr int PACKING_RATIO =
      sizeof(packed_counter_type) / sizeof(digit_counter_type);
  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
  // +1 lane of padding.
  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;

  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
  uint8_t *_local_memory; // caller-provided scratch, see get_local_memory_size
};
|
||||
|
||||
/// Fallback key-twiddling traits. Unspecialized key types are not supported:
/// both conversions fail at run time if instantiated and called.
template <typename T, typename U> struct base_traits {

  // Convert a key to its radix-sortable unsigned representation.
  static __dpct_inline__ U twiddle_in(U key) {
    throw std::runtime_error("Not implemented");
  }
  // Convert the unsigned representation back to the original key bits.
  static __dpct_inline__ U twiddle_out(U key) {
    throw std::runtime_error("Not implemented");
  }
};
|
||||
|
||||
/// uint32_t keys already compare correctly as unsigned integers, so both
/// conversions are the identity.
template <typename U> struct base_traits<uint32_t, U> {
  static __dpct_inline__ U twiddle_in(U key) { return key; }
  static __dpct_inline__ U twiddle_out(U key) { return key; }
};
|
||||
|
||||
/// Signed int keys: XOR-ing the sign bit maps the signed range onto the
/// unsigned range so that unsigned radix comparison yields signed order.
template <typename U> struct base_traits<int, U> {
  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
  // The transform is an involution, so decoding re-applies the same XOR.
  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
};
|
||||
|
||||
/// float keys (operating on their raw bits held in U): positive values get
/// the sign bit set, negative values are fully inverted, so unsigned
/// comparison of the result matches IEEE-754 ordering.
template <typename U> struct base_traits<float, U> {
  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
  static __dpct_inline__ U twiddle_in(U key) {
    // Negative (sign bit set): flip every bit; positive: flip only the sign.
    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
    return key ^ mask;
  }
  static __dpct_inline__ U twiddle_out(U key) {
    // Inverse mapping: a set high bit here marks an originally positive value.
    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
    return key ^ mask;
  }
};
|
||||
|
||||
/// Public traits entry point: maps each supported key type to the unsigned
/// bit representation used for radix comparison (uint32_t for int and float).
template <typename T> struct traits : base_traits<T, T> {};
template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
template <> struct traits<int> : base_traits<int, uint32_t> {};
template <> struct traits<float> : base_traits<float, uint32_t> {};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// Compile-time predicate: VALUE is nonzero iff N is a power of two
/// (clearing the lowest set bit of a power of two leaves zero).
template <int N> struct power_of_two {
  static constexpr int VALUE = ((N & (N - 1)) == 0);
};
|
||||
|
||||
/// Shift \p x right by \p shift bits, then add \p addend (the fused
/// shift-and-add idiom used for padded local-memory offsets).
__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
  return addend + (x >> shift);
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Implements scatter to blocked exchange pattern used in radix sort algorithm.
|
||||
///
|
||||
/// \tparam T type of the data elements exchanges
|
||||
/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
|
||||
template <typename T, int VALUES_PER_THREAD> class exchange {
|
||||
public:
|
||||
static size_t get_local_memory_size(size_t group_threads) {
|
||||
size_t padding_values =
|
||||
(INSERT_PADDING)
|
||||
? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
|
||||
: 0;
|
||||
return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
|
||||
}
|
||||
|
||||
exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
|
||||
|
||||
/// Rearrange elements from rank order to blocked order
|
||||
template <typename Item>
|
||||
__dpct_inline__ void
|
||||
scatter_to_blocked(Item item, T (&keys)[VALUES_PER_THREAD],
|
||||
int (&ranks)[VALUES_PER_THREAD]) {
|
||||
T *buffer = reinterpret_cast<T *>(_local_memory);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VALUES_PER_THREAD; i++) {
|
||||
int offset = ranks[i];
|
||||
if (INSERT_PADDING)
|
||||
offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
|
||||
buffer[offset] = keys[i];
|
||||
}
|
||||
|
||||
item.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VALUES_PER_THREAD; i++) {
|
||||
int offset = (item.get_local_id(0) * VALUES_PER_THREAD) + i;
|
||||
if (INSERT_PADDING)
|
||||
offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
|
||||
keys[i] = buffer[offset];
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr int LOG_LOCAL_MEMORY_BANKS = 5;
|
||||
static constexpr bool INSERT_PADDING =
|
||||
(VALUES_PER_THREAD > 4) &&
|
||||
(detail::power_of_two<VALUES_PER_THREAD>::VALUE);
|
||||
|
||||
uint8_t *_local_memory;
|
||||
};
|
||||
|
||||
/// Implements radix sort to sort integer data elements assigned to all threads
/// in the group.
///
/// \tparam T type of the data elements exchanged
/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
/// \tparam DESCENDING boolean value indicating if data elements are sorted in
/// descending order.
template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
class radix_sort {
public:
  // Local memory is reused across the rank and exchange phases, so only the
  // larger of the two requirements is needed.
  static size_t get_local_memory_size(size_t group_threads) {
    size_t ranks_size =
        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
    size_t exchange_size =
        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
    return sycl::max(ranks_size, exchange_size);
  }

  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}

  /// Sort this work-item's keys (cooperatively with the whole group) over the
  /// bit range [begin_bit, end_bit).
  template <typename Item>
  __dpct_inline__ void
  sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
       int end_bit = 8 * sizeof(T)) {

    // Reinterpret keys as uint32_t for bit twiddling and ranking.
    // NOTE(review): this assumes sizeof(T) == 4 (matches the supported
    // traits: uint32_t/int/float) — confirm before adding wider key types.
    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);

    // Map keys into radix-comparable unsigned form.
#pragma unroll
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
    }

    // One pass per RADIX_BITS digit, low bits first.
    while (true) {
      // The final pass may cover fewer than RADIX_BITS bits.
      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);

      int ranks[VALUES_PER_THREAD];
      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
          .template rank_keys(item, unsigned_keys, ranks, begin_bit, pass_bits);
      begin_bit += RADIX_BITS;

      item.barrier(sycl::access::fence_space::local_space);

      // Redistribute keys into blocked order according to their ranks;
      // local memory is reused, hence the surrounding barriers.
      exchange<T, VALUES_PER_THREAD>(_local_memory)
          .scatter_to_blocked(item, keys, ranks);

      item.barrier(sycl::access::fence_space::local_space);

      if (begin_bit >= end_bit)
        break;
    }

    // Restore the original key bit patterns.
#pragma unroll
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
    }
  }

private:
  static constexpr int RADIX_BITS = 4;

  uint8_t *_local_memory;
};
|
||||
|
||||
/// Perform a reduction of the data elements assigned to all threads in the
|
||||
/// group.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param inputs Pointer to the input data for the reduce operation.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \returns value of the reduction using binary_op
|
||||
template <typename Item, typename T, class BinaryOperation,
|
||||
int VALUES_PER_THREAD>
|
||||
__dpct_inline__ T
|
||||
reduce(Item item, T (&inputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
|
||||
T result = inputs[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < VALUES_PER_THREAD; i++) {
|
||||
result = binary_op(result, inputs[i]);
|
||||
}
|
||||
return detail::__reduce_over_group(item.get_group(), result, binary_op);
|
||||
}
|
||||
|
||||
/// Perform a reduction on a limited number of the work items in a subgroup
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param value value per work item which is to be reduced
|
||||
/// \param items_to_reduce num work items at the start of the subgroup to reduce
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \returns value of the reduction using binary_op
|
||||
template <typename Item, typename T, class BinaryOperation>
|
||||
__dpct_inline__
|
||||
typename ::std::enable_if_t<sycl::has_known_identity_v<BinaryOperation, T>, T>
|
||||
reduce_over_partial_group(const Item &item, const T &value,
|
||||
const ::std::uint16_t &items_to_reduce,
|
||||
BinaryOperation binary_op) {
|
||||
T value_temp = (item.get_local_linear_id() < items_to_reduce)
|
||||
? value
|
||||
: sycl::known_identity_v<BinaryOperation, T>;
|
||||
return detail::__reduce_over_group(item.get_sub_group(), value_temp,
|
||||
binary_op);
|
||||
}
|
||||
|
||||
/// Perform an inclusive scan over the values of inputs from all work-items in
|
||||
/// the group using the operator binary_op, which must be one of the SYCL 2020
|
||||
/// group algorithms library function objects.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param inputs Pointer to the input data for the scan operation.
|
||||
/// \param outputs Pointer to the location where scan results will be stored.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \returns inclusive scan of the input elements assigned to
|
||||
/// work-items in the group.
|
||||
template <typename Item, typename T, class BinaryOperation,
|
||||
int VALUES_PER_THREAD>
|
||||
__dpct_inline__ void
|
||||
inclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
|
||||
T (&outputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
|
||||
T result = inputs[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
|
||||
result = binary_op(result, inputs[i]);
|
||||
}
|
||||
|
||||
T exclusive_result =
|
||||
detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
|
||||
|
||||
if (item.get_local_linear_id() == 0) {
|
||||
outputs[0] = inputs[0];
|
||||
} else {
|
||||
outputs[0] = binary_op(inputs[0], exclusive_result);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
|
||||
outputs[i] = binary_op(inputs[i], outputs[i - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform an inclusive scan over the values of inputs from all work-items in
|
||||
/// the group using the operator binary_op, which must be one of the SYCL 2020
|
||||
/// group algorithms library function objects.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param input Pointer to the input data for the scan operation.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
|
||||
/// in the work-items of the group. \returns inclusive scan of the input
|
||||
/// elements assigned to work-items in the group.
|
||||
template <typename Item, typename T, class BinaryOperation>
|
||||
__dpct_inline__ T inclusive_scan(const Item &item, T input,
|
||||
BinaryOperation binary_op,
|
||||
T &group_aggregate) {
|
||||
T output =
|
||||
detail::__inclusive_scan_over_group(item.get_group(), input, binary_op);
|
||||
if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
|
||||
group_aggregate = output;
|
||||
}
|
||||
|
||||
group_aggregate = detail::__group_broadcast(
|
||||
item.get_group(), group_aggregate, item.get_local_range().size() - 1);
|
||||
return output;
|
||||
}
|
||||
|
||||
/// Perform an inclusive scan over the values of input from all work-items in
|
||||
/// the group using the operator binary_op, which must be one of the SYCL 2020
|
||||
/// group algorithms library function objects.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param input Input data for the scan operation.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \param prefix_callback_op functor invoked by the first
|
||||
/// work-item in the group that returns the
|
||||
/// initial value in the resulting scan of the work-items in the group.
|
||||
/// \returns inclusive scan of the input elements assigned to work-items in the
|
||||
/// group.
|
||||
template <typename Item, typename T, class BinaryOperation,
|
||||
class GroupPrefixCallbackOperation>
|
||||
__dpct_inline__ T
|
||||
inclusive_scan(const Item &item, T input, BinaryOperation binary_op,
|
||||
GroupPrefixCallbackOperation &prefix_callback_op) {
|
||||
T group_aggregate;
|
||||
|
||||
T output = inclusive_scan(item, input, binary_op, group_aggregate);
|
||||
T group_prefix = prefix_callback_op(group_aggregate);
|
||||
|
||||
return binary_op(group_prefix, output);
|
||||
}
|
||||
|
||||
} // namespace group
|
||||
|
||||
namespace device {
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Thin forwarding wrapper giving dpct code a stable internal entry point for
// sycl::joint_reduce.
template <typename... _Args> constexpr auto __joint_reduce(_Args... __args) {
  return sycl::joint_reduce(__args...);
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Perform a reduce on each of the segments specified within data stored on
/// the device.
///
/// \param queue Command queue used to access device used for reduction
/// \param inputs Pointer to the data elements on the device to be reduced
/// \param outputs Pointer to the storage where the reduced value for each
/// segment will be stored \param segment_count number of segments to be reduced
/// \param begin_offsets Pointer to the set of indices that are the first
/// element in each segment \param end_offsets Pointer to the set of indices
/// that are one past the last element in each segment \param binary_op functor
/// that implements the binary operation used to perform the reduction. \param
/// init initial value of the reduction for each segment.
template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
                      size_t segment_count, OffsetT *begin_offsets,
                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {

  // One work-group of GROUP_SIZE work-items is launched per segment.
  sycl::range<1> global_size(segment_count * GROUP_SIZE);
  sycl::range<1> local_size(GROUP_SIZE);

  // NOTE(review): the submitted command is not waited on here — callers
  // appear to be responsible for synchronizing on the queue; confirm.
  queue.submit([&](sycl::handler &cgh) {
    cgh.parallel_for(
        sycl::nd_range<1>(global_size, local_size), [=](sycl::nd_item<1> item) {
          // The group's linear id selects the segment this group reduces.
          OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
          OffsetT segment_end = end_offsets[item.get_group_linear_id()];
          // Empty segment: the group leader writes the initial value and
          // the whole group exits early.
          if (segment_begin == segment_end) {
            if (item.get_local_linear_id() == 0) {
              outputs[item.get_group_linear_id()] = init;
            }
            return;
          }

          sycl::multi_ptr<T, sycl::access::address_space::global_space>
              input_ptr = inputs;
          T group_aggregate = detail::__joint_reduce(
              item.get_group(), input_ptr + segment_begin,
              input_ptr + segment_end, init, binary_op);

          // Only the group leader publishes the segment's result.
          if (item.get_local_linear_id() == 0) {
            outputs[item.get_group_linear_id()] = group_aggregate;
          }
        });
  });
}
|
||||
|
||||
|
||||
#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
|
||||
|
||||
namespace experimental {
|
||||
namespace detail {
|
||||
// True when _Tp matches at least one of _Ts, ignoring top-level cv-qualifiers
// on both sides.
template <typename _Tp, typename... _Ts> struct __is_any {
  constexpr static bool value =
      (std::is_same_v<std::remove_cv_t<_Tp>, std::remove_cv_t<_Ts>> || ...);
};
|
||||
|
||||
// True when _Bp is one of the SYCL function objects (instantiated for _Tp)
// that the SYCL group algorithms support natively.
template <typename _Tp, typename _Bp> struct __in_native_op_list {
  constexpr static bool value =
      __is_any<_Bp, sycl::plus<_Tp>, sycl::bit_or<_Tp>, sycl::bit_xor<_Tp>,
               sycl::bit_and<_Tp>, sycl::maximum<_Tp>, sycl::minimum<_Tp>,
               sycl::multiplies<_Tp>>::value;
};
|
||||
|
||||
// True when _Bp is a natively supported operation for _Tp, either as the
// typed instantiation (e.g. sycl::plus<_Tp>) or the transparent void form
// (e.g. sycl::plus<void>).
template <typename _Tp, typename _Bp> struct __is_native_op {
  constexpr static bool value = __in_native_op_list<_Tp, _Bp>::value ||
                                __in_native_op_list<void, _Bp>::value;
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Perform a reduce on each of the segments specified within data stored on
/// the device. Compared with dpct::device::segmented_reduce, this experimental
/// feature support user define reductions.
///
/// \param queue Command queue used to access device used for reduction
/// \param inputs Pointer to the data elements on the device to be reduced
/// \param outputs Pointer to the storage where the reduced value for each
/// segment will be stored \param segment_count number of segments to be reduced
/// \param begin_offsets Pointer to the set of indices that are the first
/// element in each segment \param end_offsets Pointer to the set of indices
/// that are one past the last element in each segment \param binary_op functor
/// that implements the binary operation used to perform the reduction. \param
/// init initial value of the reduction for each segment.
template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
                      size_t segment_count, OffsetT *begin_offsets,
                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {

  // One work-group of GROUP_SIZE work-items per segment.
  sycl::range<1> global_size(segment_count * GROUP_SIZE);
  sycl::range<1> local_size(GROUP_SIZE);

  // User-defined (non-native) operations need the scratchpad-based oneAPI
  // extension; native operations delegate to the regular implementation.
  if constexpr (!detail::__is_native_op<T, BinaryOperation>::value) {
    queue.submit([&](sycl::handler &cgh) {
      // Per-group scratch space required by the user-defined reduction.
      size_t temp_memory_size = GROUP_SIZE * sizeof(T);
      auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, cgh);
      cgh.parallel_for(
          sycl::nd_range<1>(global_size, local_size),
          [=](sycl::nd_item<1> item) {
            OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
            OffsetT segment_end = end_offsets[item.get_group_linear_id()];
            // Empty segment: leader writes the initial value; group exits.
            if (segment_begin == segment_end) {
              if (item.get_local_linear_id() == 0) {
                outputs[item.get_group_linear_id()] = init;
              }
              return;
            }
            // Create a handle that associates the group with an allocation it
            // can use
            auto handle =
                sycl::ext::oneapi::experimental::group_with_scratchpad(
                    item.get_group(),
                    sycl::span(&scratch[0], temp_memory_size));
            T group_aggregate = sycl::ext::oneapi::experimental::joint_reduce(
                handle, inputs + segment_begin, inputs + segment_end, init,
                binary_op);
            // Only the group leader publishes the segment's result.
            if (item.get_local_linear_id() == 0) {
              outputs[item.get_group_linear_id()] = group_aggregate;
            }
          });
    });
  } else {
    dpct::device::segmented_reduce<GROUP_SIZE>(queue, inputs, outputs,
                                               segment_count, begin_offsets,
                                               end_offsets, binary_op, init);
  }
}
|
||||
} // namespace experimental
|
||||
|
||||
#endif // SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
|
||||
|
||||
|
||||
} // namespace device
|
||||
} // namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,453 +0,0 @@
|
|||
//==---- functional.h -----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_FUNCTIONAL_H__
|
||||
#define __DPCT_FUNCTIONAL_H__
|
||||
|
||||
#include <functional>
|
||||
#include <oneapi/dpl/functional>
|
||||
#include <oneapi/dpl/iterator>
|
||||
|
||||
#if ONEDPL_USE_DPCPP_BACKEND
|
||||
#include <oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h>
|
||||
#endif
|
||||
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "../dpct.hpp"
|
||||
#define _DPCT_GCC_VERSION \
|
||||
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
|
||||
// Portability "#pragma" definition
|
||||
#ifdef _MSC_VER
|
||||
#define _DPCT_PRAGMA(x) __pragma(x)
|
||||
#else
|
||||
#define _DPCT_PRAGMA(x) _Pragma(#x)
|
||||
#endif
|
||||
|
||||
// Enable loop unrolling pragmas where supported
|
||||
#if (__INTEL_COMPILER || \
|
||||
(!defined(__INTEL_COMPILER) && _DPCT_GCC_VERSION >= 80000))
|
||||
#define _DPCT_PRAGMA_UNROLL _DPCT_PRAGMA(unroll)
|
||||
#else // no pragma unroll
|
||||
#define _DPCT_PRAGMA_UNROLL
|
||||
#endif
|
||||
|
||||
namespace dpct {
|
||||
|
||||
// Empty tag type used as a placeholder (e.g. for unused tuple slots).
struct null_type {};
|
||||
|
||||
// Function object to wrap user defined functors to provide compile time "const"
// workaround for user function objects.
// The SYCL spec (4.12) states that writing to a function object during a SYCL
// kernel is undefined behavior. This wrapper is provided as a compile-time
// work around, but functors used in SYCL kernels must be `const` in practice.
template <typename _Op> struct mark_functor_const {
  // `mutable` lets the wrapped functor be invoked through a const wrapper
  // even if its call operator is non-const.
  mutable _Op op;
  mark_functor_const() : op() {}
  mark_functor_const(const _Op &__op) : op(__op) {}
  mark_functor_const(_Op &&__op) : op(::std::move(__op)) {}
  // Perfectly forwards all arguments to the wrapped functor.
  template <typename... _T> auto operator()(_T &&...x) const {
    return op(std::forward<_T>(x)...);
  }
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
// SFINAE helper: enables an overload only when _ExecPolicy is a oneDPL
// execution policy.
template <class _ExecPolicy, class _T>
using enable_if_execution_policy =
    typename std::enable_if<oneapi::dpl::execution::is_execution_policy<
                                typename std::decay<_ExecPolicy>::type>::value,
                            _T>::type;

// Trait: true for heterogeneous (device) execution policies.
template <typename _T> struct is_hetero_execution_policy : ::std::false_type {};

template <typename... PolicyParams>
struct is_hetero_execution_policy<
    oneapi::dpl::execution::device_policy<PolicyParams...>> : ::std::true_type {
};

// Trait: true for FPGA execution policies. Note: no specialization ever sets
// this to true here — the FPGA case below specializes
// is_hetero_execution_policy instead.
template <typename _T> struct is_fpga_execution_policy : ::std::false_type {};

#if _ONEDPL_FPGA_DEVICE
template <unsigned int unroll_factor, typename... PolicyParams>
struct is_hetero_execution_policy<
    execution::fpga_policy<unroll_factor, PolicyParams...>> : ::std::true_type {
};
#endif

// SFINAE helper: enables an overload only for hetero (device/FPGA) policies.
template <class _ExecPolicy, class _T>
using enable_if_hetero_execution_policy = typename std::enable_if<
    is_hetero_execution_policy<typename std::decay<_ExecPolicy>::type>::value,
    _T>::type;
|
||||
|
||||
#if _ONEDPL_CPP14_INTEGER_SEQUENCE_PRESENT

// Use the standard C++14 utilities when the library provides them.
template <std::size_t... _Sp>
using index_sequence = std::index_sequence<_Sp...>;
template <std::size_t _Np>
using make_index_sequence = std::make_index_sequence<_Np>;

#else

// Minimal local re-implementation of std::index_sequence for library modes
// without the C++14 integer-sequence support.
template <std::size_t... _Sp> class index_sequence {};

// Recursively prepends _Np - 1 until the counter reaches 0.
template <std::size_t _Np, std::size_t... _Sp>
struct make_index_sequence_impl
    : make_index_sequence_impl<_Np - 1, _Np - 1, _Sp...> {};

// Base case: all indices generated.
template <std::size_t... _Sp> struct make_index_sequence_impl<0, _Sp...> {
  using type = index_sequence<_Sp...>;
};

template <std::size_t _Np>
using make_index_sequence = typename make_index_sequence_impl<_Np>::type;
#endif
|
||||
|
||||
// Minimal buffer implementations for temporary storage in mapping rules
// Some of our algorithms need to start with raw memory buffer,
// not an initialized array, because initialization/destruction
// would make the span be at least O(N).
#if ONEDPL_USE_DPCPP_BACKEND
template <typename _Tp> class __buffer {
  sycl::buffer<_Tp, 1> __buf;

  __buffer(const __buffer &) = delete;

  void operator=(const __buffer &) = delete;

public:
  // Try to obtain buffer of given size to store objects of _Tp type
  __buffer(std::size_t __n) : __buf(sycl::range<1>(__n)) {}

  // Return pointer to buffer, or NULL if buffer could not be obtained.
  // NOTE(review): `const` here trails the return type, so it qualifies the
  // returned type rather than making get() a const member function — this
  // looks unintended (the non-DPCPP version below is a const member); confirm.
  auto get() -> decltype(oneapi::dpl::begin(__buf)) const {
    return oneapi::dpl::begin(__buf);
  }
};
#else
template <typename _Tp> class __buffer {
  // Array form so destruction uses delete[], matching the `new _Tp[__n]`
  // allocation below. (Previously std::unique_ptr<_Tp>, whose deleter calls
  // plain `delete` on new[]-allocated memory — undefined behavior.)
  std::unique_ptr<_Tp[]> _M_ptr;

  __buffer(const __buffer &) = delete;

  void operator=(const __buffer &) = delete;

public:
  // Try to obtain buffer of given size to store objects of _Tp type
  __buffer(const std::size_t __n) : _M_ptr(new _Tp[__n]) {}

  // Return pointer to buffer, or NULL if buffer could not be obtained.
  _Tp *get() const { return _M_ptr.get(); }
};
#endif
|
||||
|
||||
// Transparent less-than comparator in the spirit of C++14 std::less<void>:
// parameter types are deduced at the call site, so mixed-type comparisons
// work without conversions.
class __less {
public:
  template <typename _Lhs, typename _Rhs>
  bool operator()(_Lhs &&lhs, _Rhs &&rhs) const {
    return std::forward<_Lhs>(lhs) < std::forward<_Rhs>(rhs);
  }
};
|
||||
|
||||
// Rebinds an execution policy to a new kernel name; the primary template
// leaves non-named policies unchanged.
template <typename Policy, typename NewName> struct rebind_policy {
  using type = Policy;
};

// Device policy: swap the kernel name parameter for NewName.
template <typename KernelName, typename NewName>
struct rebind_policy<oneapi::dpl::execution::device_policy<KernelName>,
                     NewName> {
  using type = oneapi::dpl::execution::device_policy<NewName>;
};

#if _ONEDPL_FPGA_DEVICE
// FPGA policy: preserve the unroll factor, swap the kernel name.
template <unsigned int factor, typename KernelName, typename NewName>
struct rebind_policy<oneapi::dpl::execution::fpga_policy<factor, KernelName>,
                     NewName> {
  using type = oneapi::dpl::execution::fpga_policy<factor, NewName>;
};
#endif
|
||||
|
||||
// Permutation functor: given an index x, returns the element of `source`
// at offset x, i.e. *(source + x). T1 is the source iterator, T2 supplies
// the result reference type.
template <typename T1, typename T2,
          typename R1 = typename std::iterator_traits<T1>::reference,
          typename R2 = typename std::iterator_traits<T2>::reference>
struct perm_fun {
  typedef R2 result_of;
  perm_fun(T1 input) : source(input) {}

  R2 operator()(R1 x) const { return *(source + x); }

private:
  T1 source;
};
|
||||
|
||||
// Functor compares first element (key) from tied sequence.
template <typename Compare = class internal::__less> struct compare_key_fun {
  typedef bool result_of;
  compare_key_fun(Compare _comp = internal::__less()) : comp(_comp) {}

  // Compares only the 0th (key) components of the two tuple-like arguments;
  // ADL-enabled `get` supports both std and oneDPL tuples.
  template <typename _T1, typename _T2>
  result_of operator()(_T1 &&a, _T2 &&b) const {
    using std::get;
    return comp(get<0>(a), get<0>(b));
  }

private:
  // mutable: allows stateful comparators to be invoked through const objects.
  mutable Compare comp;
};
|
||||
|
||||
// Functor evaluates second element of tied sequence with predicate.
// Used by: copy_if, remove_copy_if, stable_partition_copy
// Lambda:
template <typename Predicate> struct predicate_key_fun {
  typedef bool result_of;
  predicate_key_fun(Predicate _pred) : pred(_pred) {}

  // Applies the predicate to element 1 of the tuple-like argument.
  template <typename _T1> result_of operator()(_T1 &&a) const {
    using std::get;
    return pred(get<1>(a));
  }

private:
  // mutable: allows stateful predicates to be invoked through const objects.
  mutable Predicate pred;
};
|
||||
|
||||
// Negated form of predicate_key_fun. Used by: remove_if
template <typename Predicate> struct negate_predicate_key_fun {
  typedef bool result_of;
  negate_predicate_key_fun(Predicate _pred) : pred(_pred) {}

  // Applies the predicate to element 1 of the tuple-like argument and
  // returns the logical negation.
  template <typename _T1> result_of operator()(_T1 &&a) const {
    using std::get;
    return !pred(get<1>(a));
  }

private:
  // mutable: allows stateful predicates to be invoked through const objects.
  mutable Predicate pred;
};
|
||||
|
||||
// Generates the arithmetic sequence init + step * i for an index-like i,
// converting the result back to T.
template <typename T> struct sequence_fun {
  using result_type = T;
  sequence_fun(T first, T stride) : init(first), step(stride) {}

  template <typename _T> result_type operator()(_T &&i) const {
    return static_cast<T>(init + step * i);
  }

private:
  const T init;
  const T step;
};
|
||||
|
||||
// Equality-style functor over the key (0th) components of two tuple-like
// arguments.
//[binary_pred](Ref a, Ref b){ return(binary_pred(get<0>(a),get<0>(b)));
template <typename Predicate> struct unique_fun {
  typedef bool result_of;
  unique_fun(Predicate _pred) : pred(_pred) {}
  template <typename _T> result_of operator()(_T &&a, _T &&b) const {
    using std::get;
    return pred(get<0>(a), get<0>(b));
  }

private:
  // mutable: allows stateful predicates to be invoked through const objects.
  mutable Predicate pred;
};
|
||||
|
||||
// Lambda: [pred, &new_value](Ref1 a, Ref2 s) {return pred(s) ? new_value : a;
// });
template <typename T, typename Predicate> struct replace_if_fun {
public:
  typedef T result_of;
  replace_if_fun(Predicate _pred, T _new_value)
      : pred(_pred), new_value(_new_value) {}

  // Returns the replacement value when the stencil `s` satisfies the
  // predicate; otherwise passes the element `a` through unchanged.
  template <typename _T1, typename _T2> T operator()(_T1 &&a, _T2 &&s) const {
    if (pred(s))
      return new_value;
    return a;
  }

private:
  mutable Predicate pred;
  const T new_value;
};
|
||||
|
||||
//[pred,op](Ref a){return pred(a) ? op(a) : a; }
template <typename T, typename Predicate, typename Operator>
struct transform_if_fun {
  transform_if_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}

  // When element 0 of the tuple-like argument satisfies the predicate,
  // writes op(element 0) into element 1; otherwise element 1 is untouched.
  template <typename _T> void operator()(_T &&t) const {
    using std::get;
    auto &&source = get<0>(t);
    if (pred(source))
      get<1>(t) = op(source);
  }

private:
  // mutable: allows stateful functors to be invoked through const objects.
  mutable Predicate pred;
  mutable Operator op;
};
|
||||
|
||||
// Masked unary transform over a zipped (input, mask, output) tuple.
//[pred, op](Ref1 a, Ref2 s) { return pred(s) ? op(a) : a; });
template <typename T, typename Predicate, typename Operator>
struct transform_if_unary_zip_mask_fun {
  transform_if_unary_zip_mask_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
  // When the mask (element 1) satisfies the predicate, writes op(element 0)
  // into element 2; otherwise the output slot is left untouched.
  template <typename _T>
  void operator()(_T&& t) const {
    using std::get;
    if (pred(get<1>(t)))
      get<2>(t) = op(get<0>(t));
  }

private:
  // mutable: allows stateful functors to be invoked through const objects.
  mutable Predicate pred;
  mutable Operator op;
};
|
||||
|
||||
// Masked binary transform over a zipped (lhs, rhs, mask, output) tuple.
template <typename T, typename Predicate, typename BinaryOperation>
class transform_if_zip_mask_fun {
public:
  transform_if_zip_mask_fun(Predicate _pred = oneapi::dpl::identity(),
                            BinaryOperation _op = oneapi::dpl::identity())
      : pred(_pred), op(_op) {}
  // When the mask (element 2) satisfies the predicate, writes
  // op(element 0, element 1) into element 3; otherwise leaves it untouched.
  template <typename _T> void operator()(_T &&t) const {
    using std::get;
    if (pred(get<2>(t)))
      get<3>(t) = op(get<0>(t), get<1>(t));
  }

private:
  // mutable: allows stateful functors to be invoked through const objects.
  mutable Predicate pred;
  mutable BinaryOperation op;
};
|
||||
|
||||
// This following code is similar to a section of code in
|
||||
// oneDPL/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
|
||||
// It has a similar approach, and could be consolidated.
|
||||
// Outside of some differences in approach, there are two significant
|
||||
// differences in function.
|
||||
//
|
||||
// 1) This code allows the output type of the bit range translation to be fit
|
||||
// into to the minimal type required to provide that many bits. The code in
|
||||
// oneDPL to calculate the bucket for the radix is similar but its output is
|
||||
// always std::uint32_t. The assumption that the bit range desired will fit in
|
||||
// 32 bits is not true for this code.
|
||||
//
|
||||
// 2) This code ensures that for floating point type, -0.0f and 0.0f map to the
|
||||
// same value. This allows the output of this translation to be used to provide
|
||||
// a sort which ensures the stability of these values for floating point types.
|
||||
|
||||
// Maps a byte width to the unsigned integer type of exactly that size.
template <int N> struct uint_byte_map {};
template <> struct uint_byte_map<1> { using type = uint8_t; };
template <> struct uint_byte_map<2> { using type = uint16_t; };
template <> struct uint_byte_map<4> { using type = uint32_t; };
template <> struct uint_byte_map<8> { using type = uint64_t; };

// Maps any type T to the unsigned integer type with the same size, used for
// reinterpreting key bits.
template <typename T> struct uint_map {
  using type = typename uint_byte_map<sizeof(T)>::type;
};
|
||||
|
||||
// Translates a key of type T into an OutKeyT holding the bits
// [begin_bit, end_bit) of the key's order-preserving unsigned representation.
// Floating point keys map -0.0 and 0.0 to the same value (see the commentary
// above this class).
template <typename T, typename OutKeyT> class translate_key {
  using uint_type_t = typename uint_map<T>::type;

public:
  translate_key(int begin_bit, int end_bit) {
    shift = begin_bit;
    mask = ~OutKeyT(0); // all ones
    mask = mask >> (sizeof(OutKeyT) * 8 -
                    (end_bit - begin_bit)); // setup appropriate mask
    flip_sign = uint_type_t(1) << (sizeof(uint_type_t) * 8 - 1); // sign bit
    flip_key = ~uint_type_t(0);                                  // 0xF...F
  }

  // Returns the selected bit range of the order-preserving unsigned image of
  // `key`.
  inline OutKeyT operator()(const T &key) const {
    uint_type_t intermediate;
    if constexpr (std::is_floating_point<T>::value) {
      // normal case (both -0.0f and 0.0f equal -0.0f)
      if (key != T(-0.0f)) {
        // Negative floats: flip all bits; positive: flip only the sign bit.
        uint_type_t is_negative = reinterpret_cast<const uint_type_t &>(key) >>
                                  (sizeof(uint_type_t) * 8 - 1);
        intermediate = reinterpret_cast<const uint_type_t &>(key) ^
                       ((is_negative * flip_key) | flip_sign);
      } else // special case for -0.0f to keep stability with 0.0f
      {
        T negzero = T(-0.0f);
        intermediate = reinterpret_cast<const uint_type_t &>(negzero);
      }
    } else if constexpr (std::is_signed<T>::value) {
      // Signed integers: flipping the sign bit yields unsigned order.
      intermediate = reinterpret_cast<const uint_type_t &>(key) ^ flip_sign;
    } else {
      // Unsigned keys are already in order.
      intermediate = key;
    }

    return static_cast<OutKeyT>(intermediate >> shift) &
           mask; // shift, cast, and mask
  }

private:
  uint8_t shift;     // first bit of the selected range
  OutKeyT mask;      // (end_bit - begin_bit) low bits set
  uint_type_t flip_sign; // sign-bit mask for T's unsigned image
  uint_type_t flip_key;  // all-ones mask for negative floats
};
|
||||
|
||||
// Unary operator that returns reference to its argument. Ported from
|
||||
// oneDPL: oneapi/dpl/pstl/utils.h
|
||||
struct no_op_fun {
|
||||
template <typename Tp> Tp &&operator()(Tp &&a) const {
|
||||
return ::std::forward<Tp>(a);
|
||||
}
|
||||
};
|
||||
|
||||
// Unary functor which composes a pair of functors by calling them in succession
|
||||
// on an input
|
||||
template <typename FunctorInner, typename FunctorOuter>
|
||||
struct __composition_functor {
|
||||
__composition_functor(FunctorInner in, FunctorOuter out)
|
||||
: _in(in), _out(out) {}
|
||||
template <typename T> T operator()(const T &i) const {
|
||||
return _out(_in(i));
|
||||
}
|
||||
FunctorInner _in;
|
||||
FunctorOuter _out;
|
||||
};
|
||||
|
||||
// Unary functor which maps an index of a ROI into a 2D flattened array
|
||||
template <typename OffsetT> struct __roi_2d_index_functor {
|
||||
__roi_2d_index_functor(const OffsetT &num_cols,
|
||||
const ::std::size_t &row_stride)
|
||||
: _num_cols(num_cols), _row_stride(row_stride) {}
|
||||
|
||||
template <typename Index> Index operator()(const Index &i) const {
|
||||
return _row_stride * (i / _num_cols) + (i % _num_cols);
|
||||
}
|
||||
|
||||
OffsetT _num_cols;
|
||||
::std::size_t _row_stride;
|
||||
};
|
||||
|
||||
// Unary functor which maps and index into an interleaved array by its active
|
||||
// channel
|
||||
template <typename OffsetT> struct __interleaved_index_functor {
|
||||
__interleaved_index_functor(const OffsetT &total_channels,
|
||||
const OffsetT &active_channel)
|
||||
: _total_channels(total_channels), _active_channel(active_channel) {}
|
||||
|
||||
template <typename Index> Index operator()(const Index &i) const {
|
||||
return i * _total_channels + _active_channel;
|
||||
}
|
||||
|
||||
OffsetT _total_channels;
|
||||
OffsetT _active_channel;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,347 +0,0 @@
|
|||
//==---- iterators.h ------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_ITERATORS_H__
|
||||
#define __DPCT_ITERATORS_H__
|
||||
|
||||
#include <oneapi/dpl/iterator>
|
||||
|
||||
#include "functional.h"
|
||||
|
||||
namespace dpct {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Wrapper class returned from a dereferenced transform_iterator which was
|
||||
// created using
|
||||
// make_transform_output_iterator(). Used to apply the supplied transform
|
||||
// function when writing into an object of this class.
|
||||
//
|
||||
// Example:
|
||||
// int a[] = {0, 1, 2, 3, 4};
|
||||
// int* p = a;
|
||||
// auto f = [](auto v) {return v*v;};
|
||||
// auto tr_out = dpct::make_transform_output_iterator(p+1, f);
|
||||
// auto wrap = *tr_out; // wrap is a transform_output_ref_wrapper
|
||||
// std::cout<<*(p+1)<<std::endl; // '1'
|
||||
// wrap = 2; // apply function, store 2*2=4
|
||||
// std::cout<<*(p+1)<<std::endl; // '4'
|
||||
template <typename T, typename _UnaryFunc> class transform_output_ref_wrapper {
|
||||
private:
|
||||
T __my_reference_;
|
||||
_UnaryFunc __my_unary_func_;
|
||||
|
||||
public:
|
||||
template <typename U>
|
||||
transform_output_ref_wrapper(U &&__reference, _UnaryFunc __unary_func)
|
||||
: __my_reference_(std::forward<U>(__reference)),
|
||||
__my_unary_func_(__unary_func) {}
|
||||
|
||||
// When writing to an object of this type, apply the supplied unary function,
|
||||
// then write to the wrapped reference
|
||||
template <typename UnaryInputType>
|
||||
transform_output_ref_wrapper &operator=(const UnaryInputType &e) {
|
||||
__my_reference_ = __my_unary_func_(e);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
// Unary functor to create a transform_output_reference_wrapper when a
|
||||
// transform_iterator is dereferenced, so that a
|
||||
// the supplied unary function may be applied on write, resulting in a
|
||||
// transform_output_iterator
|
||||
template <typename _UnaryFunc> struct _Unary_Out {
|
||||
_Unary_Out(_UnaryFunc __f_) : __f(__f_) {}
|
||||
_UnaryFunc __f;
|
||||
template <typename T> auto operator()(T &&val) const {
|
||||
return transform_output_ref_wrapper<T, _UnaryFunc>(std::forward<T>(val),
|
||||
__f);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
using std::advance;
|
||||
|
||||
using std::distance;
|
||||
|
||||
template <typename T>
|
||||
oneapi::dpl::counting_iterator<T> make_counting_iterator(const T &input) {
|
||||
return oneapi::dpl::counting_iterator<T>(input);
|
||||
}
|
||||
|
||||
template <typename _Tp> class constant_iterator {
|
||||
public:
|
||||
typedef std::false_type is_hetero;
|
||||
typedef std::true_type is_passed_directly;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef _Tp value_type;
|
||||
typedef _Tp *pointer;
|
||||
// There is no storage behind the iterator, so we return a value instead of
|
||||
// reference.
|
||||
typedef const _Tp reference;
|
||||
typedef const _Tp const_reference;
|
||||
typedef std::random_access_iterator_tag iterator_category;
|
||||
|
||||
explicit constant_iterator(_Tp __init)
|
||||
: __my_value_(__init), __my_counter_(0) {}
|
||||
|
||||
private:
|
||||
// used to construct iterator instances with different counter values required
|
||||
// by arithmetic operators
|
||||
constant_iterator(const _Tp &__value, const difference_type &__offset)
|
||||
: __my_value_(__value), __my_counter_(__offset) {}
|
||||
|
||||
public:
|
||||
// non-const variants of access operators are not provided so unintended
|
||||
// writes are caught at compile time.
|
||||
const_reference operator*() const { return __my_value_; }
|
||||
const_reference operator[](difference_type) const { return __my_value_; }
|
||||
|
||||
difference_type operator-(const constant_iterator &__it) const {
|
||||
return __my_counter_ - __it.__my_counter_;
|
||||
}
|
||||
|
||||
constant_iterator &operator+=(difference_type __forward) {
|
||||
__my_counter_ += __forward;
|
||||
return *this;
|
||||
}
|
||||
constant_iterator &operator-=(difference_type __backward) {
|
||||
return *this += -__backward;
|
||||
}
|
||||
constant_iterator &operator++() { return *this += 1; }
|
||||
constant_iterator &operator--() { return *this -= 1; }
|
||||
|
||||
constant_iterator operator++(int) {
|
||||
constant_iterator __it(*this);
|
||||
++(*this);
|
||||
return __it;
|
||||
}
|
||||
constant_iterator operator--(int) {
|
||||
constant_iterator __it(*this);
|
||||
--(*this);
|
||||
return __it;
|
||||
}
|
||||
|
||||
constant_iterator operator-(difference_type __backward) const {
|
||||
return constant_iterator(__my_value_, __my_counter_ - __backward);
|
||||
}
|
||||
constant_iterator operator+(difference_type __forward) const {
|
||||
return constant_iterator(__my_value_, __my_counter_ + __forward);
|
||||
}
|
||||
friend constant_iterator operator+(difference_type __forward,
|
||||
const constant_iterator __it) {
|
||||
return __it + __forward;
|
||||
}
|
||||
|
||||
bool operator==(const constant_iterator &__it) const {
|
||||
return __my_value_ == __it.__my_value_ &&
|
||||
this->__my_counter_ == __it.__my_counter_;
|
||||
}
|
||||
bool operator!=(const constant_iterator &__it) const {
|
||||
return !(*this == __it);
|
||||
}
|
||||
bool operator<(const constant_iterator &__it) const {
|
||||
return *this - __it < 0;
|
||||
}
|
||||
bool operator>(const constant_iterator &__it) const { return __it < *this; }
|
||||
bool operator<=(const constant_iterator &__it) const {
|
||||
return !(*this > __it);
|
||||
}
|
||||
bool operator>=(const constant_iterator &__it) const {
|
||||
return !(*this < __it);
|
||||
}
|
||||
|
||||
private:
|
||||
_Tp __my_value_;
|
||||
uint64_t __my_counter_;
|
||||
};
|
||||
|
||||
template <typename _Tp>
|
||||
constant_iterator<_Tp> make_constant_iterator(_Tp __value) {
|
||||
return constant_iterator<_Tp>(__value);
|
||||
}
|
||||
|
||||
// key_value_pair class to represent a key and value, specifically a
|
||||
// dereferenced arg_index_input_iterator
|
||||
template <typename _KeyTp, typename _ValueTp> class key_value_pair {
|
||||
public:
|
||||
key_value_pair() = default;
|
||||
|
||||
key_value_pair(const _KeyTp &_key, const _ValueTp &_value)
|
||||
: key(_key), value(_value) {}
|
||||
|
||||
bool operator==(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
|
||||
return (key == _kvp.key) && (value == _kvp.value);
|
||||
}
|
||||
|
||||
bool operator!=(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
|
||||
return (key != _kvp.key) || (value != _kvp.value);
|
||||
}
|
||||
|
||||
_KeyTp key;
|
||||
_ValueTp value;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename KeyTp, typename _ValueTp> struct make_key_value_pair {
|
||||
template <typename ValRefTp>
|
||||
key_value_pair<KeyTp, _ValueTp>
|
||||
operator()(const oneapi::dpl::__internal::tuple<KeyTp, ValRefTp> &tup) const {
|
||||
return ::dpct::key_value_pair<KeyTp, _ValueTp>(::std::get<0>(tup),
|
||||
::std::get<1>(tup));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T> struct __zip_iterator_impl;
|
||||
template <class... Ts> struct __zip_iterator_impl<std::tuple<Ts...>> {
|
||||
using type = oneapi::dpl::zip_iterator<Ts...>;
|
||||
};
|
||||
|
||||
} // end namespace detail
|
||||
|
||||
// dpct::zip_iterator can only accept std::tuple type as template argument for
|
||||
// compatibility purpose. Please use oneapi::dpl::zip_iterator if you want to
|
||||
// pass iterator's types directly.
|
||||
template <typename... Ts>
|
||||
using zip_iterator = typename detail::__zip_iterator_impl<Ts...>::type;
|
||||
|
||||
// arg_index_input_iterator is an iterator over a input iterator, with a index.
|
||||
// When dereferenced, it returns a key_value_pair, which can be interrogated for
|
||||
// the index key or the value from the input iterator
|
||||
template <typename InputIteratorT, typename OffsetT = ptrdiff_t,
|
||||
typename OutputValueT =
|
||||
typename ::std::iterator_traits<InputIteratorT>::value_type>
|
||||
class arg_index_input_iterator
|
||||
: public oneapi::dpl::transform_iterator<
|
||||
oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
|
||||
InputIteratorT>,
|
||||
detail::make_key_value_pair<OffsetT, OutputValueT>> {
|
||||
using arg_index_input_iterator_wrap = oneapi::dpl::transform_iterator<
|
||||
oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
|
||||
InputIteratorT>,
|
||||
detail::make_key_value_pair<OffsetT, OutputValueT>>;
|
||||
|
||||
public:
|
||||
typedef OffsetT difference_type;
|
||||
|
||||
// signal to __get_sycl_range that this iterator is as a direct pass iterator
|
||||
using is_zip = ::std::true_type;
|
||||
|
||||
arg_index_input_iterator(const arg_index_input_iterator_wrap &__arg_wrap)
|
||||
: arg_index_input_iterator_wrap(__arg_wrap) {}
|
||||
arg_index_input_iterator(InputIteratorT __iter)
|
||||
: arg_index_input_iterator_wrap(
|
||||
oneapi::dpl::make_zip_iterator(
|
||||
oneapi::dpl::counting_iterator(OffsetT(0)), __iter),
|
||||
detail::make_key_value_pair<OffsetT, OutputValueT>()) {}
|
||||
|
||||
arg_index_input_iterator &operator=(const arg_index_input_iterator &__input) {
|
||||
arg_index_input_iterator_wrap::operator=(__input);
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator &operator++() {
|
||||
arg_index_input_iterator_wrap::operator++();
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator &operator--() {
|
||||
arg_index_input_iterator_wrap::operator--();
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator operator++(int) {
|
||||
arg_index_input_iterator __it(*this);
|
||||
++(*this);
|
||||
return __it;
|
||||
}
|
||||
arg_index_input_iterator operator--(int) {
|
||||
arg_index_input_iterator __it(*this);
|
||||
--(*this);
|
||||
return __it;
|
||||
}
|
||||
arg_index_input_iterator operator+(difference_type __forward) const {
|
||||
return arg_index_input_iterator(
|
||||
arg_index_input_iterator_wrap::operator+(__forward));
|
||||
}
|
||||
arg_index_input_iterator operator-(difference_type __backward) const {
|
||||
return arg_index_input_iterator(
|
||||
arg_index_input_iterator_wrap::operator-(__backward));
|
||||
}
|
||||
arg_index_input_iterator &operator+=(difference_type __forward) {
|
||||
arg_index_input_iterator_wrap::operator+=(__forward);
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator &operator-=(difference_type __backward) {
|
||||
arg_index_input_iterator_wrap::operator-=(__backward);
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend arg_index_input_iterator
|
||||
operator+(difference_type __forward, const arg_index_input_iterator &__it) {
|
||||
return __it + __forward;
|
||||
}
|
||||
|
||||
difference_type operator-(const arg_index_input_iterator &__it) const {
|
||||
return arg_index_input_iterator_wrap::operator-(__it);
|
||||
}
|
||||
bool operator==(const arg_index_input_iterator &__it) const {
|
||||
return arg_index_input_iterator_wrap::operator==(__it);
|
||||
}
|
||||
bool operator!=(const arg_index_input_iterator &__it) const {
|
||||
return !(*this == __it);
|
||||
}
|
||||
bool operator<(const arg_index_input_iterator &__it) const {
|
||||
return *this - __it < 0;
|
||||
}
|
||||
bool operator>(const arg_index_input_iterator &__it) const {
|
||||
return __it < *this;
|
||||
}
|
||||
bool operator<=(const arg_index_input_iterator &__it) const {
|
||||
return !(*this > __it);
|
||||
}
|
||||
bool operator>=(const arg_index_input_iterator &__it) const {
|
||||
return !(*this < __it);
|
||||
}
|
||||
|
||||
// returns an arg_index_input_iterator with the same iter position, but a
|
||||
// count reset to 0
|
||||
arg_index_input_iterator create_normalized() {
|
||||
return arg_index_input_iterator(
|
||||
::std::get<1>(arg_index_input_iterator_wrap::base().base()));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename IterT> struct io_iterator_pair {
|
||||
inline io_iterator_pair() : selector(false) {}
|
||||
|
||||
inline io_iterator_pair(const IterT &first, const IterT &second)
|
||||
: selector(false) {
|
||||
iter[0] = first;
|
||||
iter[1] = second;
|
||||
}
|
||||
|
||||
inline IterT first() const { return selector ? iter[1] : iter[0]; }
|
||||
|
||||
inline IterT second() const { return selector ? iter[0] : iter[1]; }
|
||||
|
||||
inline void swap() { selector = !selector; }
|
||||
|
||||
bool selector;
|
||||
|
||||
IterT iter[2];
|
||||
};
|
||||
|
||||
template <typename _Iter, typename _UnaryFunc>
|
||||
auto make_transform_output_iterator(_Iter __it, _UnaryFunc __unary_func) {
|
||||
return oneapi::dpl::transform_iterator(
|
||||
__it, internal::_Unary_Out<_UnaryFunc>(__unary_func));
|
||||
}
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load diff
|
@ -1,32 +0,0 @@
|
|||
//==---- numeric.h --------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_NUMERIC_H__
|
||||
#define __DPCT_NUMERIC_H__
|
||||
|
||||
namespace dpct {
|
||||
|
||||
template <typename Policy, typename InputIt1, typename InputIt2, typename T>
|
||||
T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
|
||||
InputIt2 first2, T init) {
|
||||
return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
|
||||
first2, init);
|
||||
}
|
||||
|
||||
template <typename Policy, typename InputIt1, typename InputIt2, typename T,
|
||||
typename BinaryOperation1, typename BinaryOperation2>
|
||||
T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
|
||||
InputIt2 first2, T init, BinaryOperation1 op1,
|
||||
BinaryOperation2 op2) {
|
||||
return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
|
||||
first2, init, op1, op2);
|
||||
}
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,752 +0,0 @@
|
|||
//==---- vector.h ---------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_VECTOR_H__
|
||||
#define __DPCT_VECTOR_H__
|
||||
|
||||
#include <oneapi/dpl/algorithm>
|
||||
#include <oneapi/dpl/execution>
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include "memory.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
#include "../device.hpp"
|
||||
|
||||
namespace dpct {
|
||||
|
||||
namespace internal {
|
||||
template <typename Iter, typename Void = void> // for non-iterators
|
||||
struct is_iterator : std::false_type {};
|
||||
|
||||
template <typename Iter> // For iterators
|
||||
struct is_iterator<
|
||||
Iter,
|
||||
typename std::enable_if<
|
||||
!std::is_void<typename Iter::iterator_category>::value, void>::type>
|
||||
: std::true_type {};
|
||||
|
||||
template <typename T> // For pointers
|
||||
struct is_iterator<T *> : std::true_type {};
|
||||
} // end namespace internal
|
||||
|
||||
#ifndef DPCT_USM_LEVEL_NONE
|
||||
|
||||
template <typename T,
|
||||
typename Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>>
|
||||
class device_vector {
|
||||
public:
|
||||
using iterator = device_iterator<T>;
|
||||
using const_iterator = const iterator;
|
||||
using reference = device_reference<T>;
|
||||
using const_reference = const reference;
|
||||
using value_type = T;
|
||||
using pointer = T *;
|
||||
using const_pointer = const T *;
|
||||
using difference_type =
|
||||
typename ::std::iterator_traits<iterator>::difference_type;
|
||||
using size_type = ::std::size_t;
|
||||
|
||||
private:
|
||||
Allocator _alloc;
|
||||
size_type _size;
|
||||
size_type _capacity;
|
||||
pointer _storage;
|
||||
|
||||
size_type _min_capacity() const { return size_type(1); }
|
||||
|
||||
void _set_capacity_and_alloc() {
|
||||
_capacity = ::std::max(_size * 2, _min_capacity());
|
||||
_storage = _alloc.allocate(_capacity);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename OtherA> operator ::std::vector<T, OtherA>() const {
|
||||
auto __tmp = ::std::vector<T, OtherA>(this->size());
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
this->begin(), this->end(), __tmp.begin());
|
||||
return __tmp;
|
||||
}
|
||||
device_vector()
|
||||
: _alloc(get_default_queue()), _size(0), _capacity(_min_capacity()) {
|
||||
_set_capacity_and_alloc();
|
||||
}
|
||||
~device_vector() /*= default*/ { _alloc.deallocate(_storage, _capacity); };
|
||||
explicit device_vector(size_type n) : device_vector(n, T()) {}
|
||||
explicit device_vector(size_type n, const T &value)
|
||||
: _alloc(get_default_queue()), _size(n) {
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), end(), T(value));
|
||||
}
|
||||
}
|
||||
device_vector(const device_vector &other) : _alloc(get_default_queue()) {
|
||||
_size = other.size();
|
||||
_capacity = other.capacity();
|
||||
_storage = _alloc.allocate(_capacity);
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
other.begin(), other.end(), begin());
|
||||
}
|
||||
device_vector(device_vector &&other)
|
||||
: _alloc(get_default_queue()), _size(other.size()),
|
||||
_capacity(other.capacity()), _storage(other._storage) {
|
||||
other._size = 0;
|
||||
other._capacity = 0;
|
||||
other._storage = nullptr;
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename ::std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!::std::is_pointer<InputIterator>::value &&
|
||||
::std::is_same<typename ::std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
::std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _alloc(get_default_queue()) {
|
||||
_size = ::std::distance(first, last);
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, begin());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename ::std::enable_if<::std::is_pointer<InputIterator>::value,
|
||||
InputIterator>::type last)
|
||||
: _alloc(get_default_queue()) {
|
||||
_size = ::std::distance(first, last);
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
auto ptr_type = sycl::get_pointer_type(first, get_default_context());
|
||||
if (ptr_type != sycl::usm::alloc::host &&
|
||||
ptr_type != sycl::usm::alloc::unknown) {
|
||||
::std::copy(
|
||||
oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, begin());
|
||||
} else {
|
||||
sycl::buffer<typename ::std::iterator_traits<InputIterator>::value_type,
|
||||
1>
|
||||
buf(first, last);
|
||||
auto buf_first = oneapi::dpl::begin(buf);
|
||||
auto buf_last = oneapi::dpl::end(buf);
|
||||
::std::copy(
|
||||
oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
buf_first, buf_last, begin());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename ::std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!::std::is_pointer<InputIterator>::value &&
|
||||
!::std::is_same<typename ::std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
::std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _alloc(get_default_queue()), _size(::std::distance(first, last)) {
|
||||
_set_capacity_and_alloc();
|
||||
::std::vector<T> _tmp(first, last);
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
_tmp.begin(), _tmp.end(), this->begin());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(const device_vector<T, OtherAllocator> &v)
|
||||
: _alloc(get_default_queue()), _storage(v.real_begin()), _size(v.size()),
|
||||
_capacity(v.capacity()) {}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(::std::vector<T, OtherAllocator> &v)
|
||||
: _alloc(get_default_queue()), _size(v.size()) {
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
v.begin(), v.end(), this->begin());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector &operator=(const ::std::vector<T, OtherAllocator> &v) {
|
||||
resize(v.size());
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
v.begin(), v.end(), begin());
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
device_vector &operator=(const device_vector &other) {
|
||||
// Copy assignment operator:
|
||||
resize(other.size());
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
other.begin(), other.end(), begin());
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
device_vector &operator=(device_vector &&other) {
|
||||
// Move assignment operator:
|
||||
device_vector dummy(::std::move(other));
|
||||
this->swap(dummy);
|
||||
return *this;
|
||||
}
|
||||
size_type size() const { return _size; }
|
||||
iterator begin() noexcept { return device_iterator<T>(_storage, 0); }
|
||||
iterator end() { return device_iterator<T>(_storage, size()); }
|
||||
const_iterator begin() const noexcept {
|
||||
return device_iterator<T>(_storage, 0);
|
||||
}
|
||||
const_iterator cbegin() const noexcept { return begin(); }
|
||||
const_iterator end() const { return device_iterator<T>(_storage, size()); }
|
||||
const_iterator cend() const { return end(); }
|
||||
T *real_begin() { return _storage; }
|
||||
const T *real_begin() const { return _storage; }
|
||||
void swap(device_vector &v) {
|
||||
::std::swap(_size, v._size);
|
||||
::std::swap(_capacity, v._capacity);
|
||||
::std::swap(_storage, v._storage);
|
||||
::std::swap(_alloc, v._alloc);
|
||||
}
|
||||
reference operator[](size_type n) { return _storage[n]; }
|
||||
const_reference operator[](size_type n) const { return _storage[n]; }
|
||||
void reserve(size_type n) {
|
||||
if (n > capacity()) {
|
||||
// allocate buffer for new size
|
||||
auto tmp = _alloc.allocate(2 * n);
|
||||
// copy content (old buffer to new buffer)
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), end(), tmp);
|
||||
// deallocate old memory
|
||||
_alloc.deallocate(_storage, _capacity);
|
||||
_storage = tmp;
|
||||
_capacity = 2 * n;
|
||||
}
|
||||
}
|
||||
void resize(size_type new_size, const T &x = T()) {
|
||||
reserve(new_size);
|
||||
if (_size < new_size) {
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin() + _size, begin() + new_size, x);
|
||||
}
|
||||
_size = new_size;
|
||||
}
|
||||
size_type max_size(void) const {
|
||||
return ::std::numeric_limits<size_type>::max() / sizeof(T);
|
||||
}
|
||||
size_type capacity() const { return _capacity; }
|
||||
const_reference front() const { return *begin(); }
|
||||
reference front() { return *begin(); }
|
||||
const_reference back(void) const { return *(end() - 1); }
|
||||
reference back(void) { return *(end() - 1); }
|
||||
pointer data(void) { return _storage; }
|
||||
const_pointer data(void) const { return _storage; }
|
||||
void shrink_to_fit(void) {
|
||||
if (_size != capacity()) {
|
||||
size_type tmp_capacity = ::std::max(_size, _min_capacity());
|
||||
auto tmp = _alloc.allocate(tmp_capacity);
|
||||
if (_size > 0) {
|
||||
::std::copy(
|
||||
oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), end(), tmp);
|
||||
}
|
||||
_alloc.deallocate(_storage, _capacity);
|
||||
_storage = tmp;
|
||||
_capacity = tmp_capacity;
|
||||
}
|
||||
}
|
||||
void assign(size_type n, const T &x) {
|
||||
resize(n);
|
||||
if (_size > 0) {
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), begin() + n, x);
|
||||
}
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
assign(InputIterator first,
|
||||
typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = ::std::distance(first, last);
|
||||
resize(n);
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, begin());
|
||||
}
|
||||
}
|
||||
void clear(void) { _size = 0; }
|
||||
bool empty(void) const { return (size() == 0); }
|
||||
void push_back(const T &x) { insert(end(), size_type(1), x); }
|
||||
void pop_back(void) {
|
||||
if (_size > 0)
|
||||
--_size;
|
||||
}
|
||||
iterator erase(iterator first, iterator last) {
|
||||
auto n = ::std::distance(first, last);
|
||||
if (last == end()) {
|
||||
_size = _size - n;
|
||||
return end();
|
||||
}
|
||||
auto m = ::std::distance(last, end());
|
||||
if (m <= 0) {
|
||||
return end();
|
||||
}
|
||||
auto tmp = _alloc.allocate(m);
|
||||
// copy remainder to temporary buffer.
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
last, end(), tmp);
|
||||
// override (erase) subsequence in storage.
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
tmp, tmp + m, first);
|
||||
_alloc.deallocate(tmp, m);
|
||||
_size -= n;
|
||||
return begin() + first.get_idx() + n;
|
||||
}
|
||||
iterator erase(iterator pos) { return erase(pos, pos + 1); }
|
||||
iterator insert(iterator position, const T &x) {
|
||||
auto n = ::std::distance(begin(), position);
|
||||
insert(position, size_type(1), x);
|
||||
return begin() + n;
|
||||
}
|
||||
void insert(iterator position, size_type n, const T &x) {
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
end() - n, end(), x);
|
||||
} else {
|
||||
auto i_n = ::std::distance(begin(), position);
|
||||
// allocate temporary storage
|
||||
auto m = ::std::distance(position, end());
|
||||
// will throw if position is not inside active vector
|
||||
auto tmp = _alloc.allocate(m);
|
||||
// copy remainder
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
position, end(), tmp);
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
position, position + n, x);
|
||||
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
tmp, tmp + m, position + n);
|
||||
_alloc.deallocate(tmp, m);
|
||||
}
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
insert(iterator position, InputIterator first,
|
||||
typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = ::std::distance(first, last);
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, end());
|
||||
} else {
|
||||
auto m = ::std::distance(position, end());
|
||||
// will throw if position is not inside active vector
|
||||
auto tmp = _alloc.allocate(m);
|
||||
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
position, end(), tmp);
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, position);
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
tmp, tmp + m, position + n);
|
||||
_alloc.deallocate(tmp, m);
|
||||
}
|
||||
}
|
||||
Allocator get_allocator() const { return _alloc; }
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
template <typename T, typename Allocator = detail::__buffer_allocator<T>>
|
||||
class device_vector {
|
||||
static_assert(
|
||||
std::is_same<Allocator, detail::__buffer_allocator<T>>::value,
|
||||
"device_vector doesn't support custom allocator when USM is not used.");
|
||||
|
||||
public:
|
||||
using iterator = device_iterator<T>;
|
||||
using const_iterator = const iterator;
|
||||
using reference = device_reference<T>;
|
||||
using const_reference = const reference;
|
||||
using value_type = T;
|
||||
using pointer = T *;
|
||||
using const_pointer = const T *;
|
||||
using difference_type =
|
||||
typename std::iterator_traits<iterator>::difference_type;
|
||||
using size_type = std::size_t;
|
||||
|
||||
private:
|
||||
using Buffer = sycl::buffer<T, 1>;
|
||||
using Range = sycl::range<1>;
|
||||
// Using mem_mgr to handle memory allocation
|
||||
void *_storage;
|
||||
size_type _size;
|
||||
|
||||
size_type _min_capacity() const { return size_type(1); }
|
||||
|
||||
void *alloc_store(size_type num_bytes) {
|
||||
return detail::mem_mgr::instance().mem_alloc(num_bytes);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename OtherA> operator std::vector<T, OtherA>() const {
|
||||
auto __tmp = std::vector<T, OtherA>(this->size());
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, this->begin(), this->end(),
|
||||
__tmp.begin());
|
||||
return __tmp;
|
||||
}
|
||||
device_vector()
|
||||
: _storage(alloc_store(_min_capacity() * sizeof(T))), _size(0) {}
|
||||
~device_vector() = default;
|
||||
explicit device_vector(size_type n) : device_vector(n, T()) {}
|
||||
explicit device_vector(size_type n, const T &value)
|
||||
: _storage(alloc_store(std::max(n, _min_capacity()) * sizeof(T))),
|
||||
_size(n) {
|
||||
auto buf = get_buffer();
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(buf),
|
||||
oneapi::dpl::begin(buf) + n, T(value));
|
||||
}
|
||||
device_vector(const device_vector &other)
|
||||
: _storage(other._storage), _size(other.size()) {}
|
||||
device_vector(device_vector &&other)
|
||||
: _storage(std::move(other._storage)), _size(other.size()) {}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!std::is_pointer<InputIterator>::value &&
|
||||
std::is_same<typename std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
|
||||
_size(std::distance(first, last)) {
|
||||
auto buf = get_buffer();
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, dst);
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename std::enable_if<std::is_pointer<InputIterator>::value,
|
||||
InputIterator>::type last)
|
||||
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
|
||||
_size(std::distance(first, last)) {
|
||||
auto buf = get_buffer();
|
||||
Buffer tmp_buf(first, last);
|
||||
auto start = oneapi::dpl::begin(tmp_buf);
|
||||
auto end = oneapi::dpl::end(tmp_buf);
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
start, end, dst);
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!std::is_same<typename std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
|
||||
_size(std::distance(first, last)) {
|
||||
auto buf = get_buffer();
|
||||
std::vector<T> tmp(first, last);
|
||||
Buffer tmp_buf(tmp);
|
||||
auto start = oneapi::dpl::begin(tmp_buf);
|
||||
auto end = oneapi::dpl::end(tmp_buf);
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
start, end, dst);
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(const device_vector<T, OtherAllocator> &v)
|
||||
: _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
|
||||
auto buf = get_buffer();
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
v.real_begin(), v.real_begin() + v.size(), dst);
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(std::vector<T, OtherAllocator> &v)
|
||||
: _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, v.begin(), v.end(),
|
||||
oneapi::dpl::begin(get_buffer()));
|
||||
}
|
||||
|
||||
device_vector &operator=(const device_vector &other) {
|
||||
// Copy assignment operator:
|
||||
_size = other.size();
|
||||
void *tmp = alloc_store(_size * sizeof(T));
|
||||
auto tmp_buf =
|
||||
detail::mem_mgr::instance()
|
||||
.translate_ptr(tmp)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(other.get_buffer()),
|
||||
oneapi::dpl::end(other.get_buffer()),
|
||||
oneapi::dpl::begin(tmp_buf));
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = tmp;
|
||||
return *this;
|
||||
}
|
||||
device_vector &operator=(device_vector &&other) {
|
||||
// Move assignment operator:
|
||||
_size = other.size();
|
||||
this->_storage = std::move(other._storage);
|
||||
return *this;
|
||||
}
|
||||
template <typename OtherAllocator>
|
||||
device_vector &operator=(const std::vector<T, OtherAllocator> &v) {
|
||||
Buffer data(v.begin(), v.end());
|
||||
_size = v.size();
|
||||
void *tmp = alloc_store(_size * sizeof(T));
|
||||
auto tmp_buf =
|
||||
detail::mem_mgr::instance()
|
||||
.translate_ptr(tmp)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(data),
|
||||
oneapi::dpl::end(data), oneapi::dpl::begin(tmp_buf));
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = tmp;
|
||||
|
||||
return *this;
|
||||
}
|
||||
Buffer get_buffer() const {
|
||||
return detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(capacity()));
|
||||
}
|
||||
size_type size() const { return _size; }
|
||||
iterator begin() noexcept { return device_iterator<T>(get_buffer(), 0); }
|
||||
iterator end() { return device_iterator<T>(get_buffer(), _size); }
|
||||
const_iterator begin() const noexcept {
|
||||
return device_iterator<T>(get_buffer(), 0);
|
||||
}
|
||||
const_iterator cbegin() const noexcept { return begin(); }
|
||||
const_iterator end() const { return device_iterator<T>(get_buffer(), _size); }
|
||||
const_iterator cend() const { return end(); }
|
||||
T *real_begin() {
|
||||
return (detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.template get_access<sycl::access_mode::read_write>())
|
||||
.get_pointer();
|
||||
}
|
||||
const T *real_begin() const {
|
||||
return const_cast<device_vector *>(this)
|
||||
->detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.template get_access<sycl::access_mode::read_write>()
|
||||
.get_pointer();
|
||||
}
|
||||
void swap(device_vector &v) {
|
||||
void *temp = v._storage;
|
||||
v._storage = this->_storage;
|
||||
this->_storage = temp;
|
||||
std::swap(_size, v._size);
|
||||
}
|
||||
reference operator[](size_type n) { return *(begin() + n); }
|
||||
const_reference operator[](size_type n) const { return *(begin() + n); }
|
||||
void reserve(size_type n) {
|
||||
if (n > capacity()) {
|
||||
// create new buffer (allocate for new size)
|
||||
void *a = alloc_store(n * sizeof(T));
|
||||
|
||||
// copy content (old buffer to new buffer)
|
||||
if (_storage != nullptr) {
|
||||
auto tmp = detail::mem_mgr::instance()
|
||||
.translate_ptr(a)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(n));
|
||||
auto src_buf = get_buffer();
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(src_buf), oneapi::dpl::end(src_buf),
|
||||
oneapi::dpl::begin(tmp));
|
||||
|
||||
// deallocate old memory
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
}
|
||||
_storage = a;
|
||||
}
|
||||
}
|
||||
void resize(size_type new_size, const T &x = T()) {
|
||||
reserve(new_size);
|
||||
if (_size < new_size) {
|
||||
auto src_buf = get_buffer();
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(src_buf) + _size,
|
||||
oneapi::dpl::begin(src_buf) + new_size, x);
|
||||
}
|
||||
_size = new_size;
|
||||
}
|
||||
size_type max_size(void) const {
|
||||
return std::numeric_limits<size_type>::max() / sizeof(T);
|
||||
}
|
||||
size_type capacity() const {
|
||||
return _storage != nullptr ? detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.size() /
|
||||
sizeof(T)
|
||||
: 0;
|
||||
}
|
||||
const_reference front() const { return *begin(); }
|
||||
reference front() { return *begin(); }
|
||||
const_reference back(void) const { return *(end() - 1); }
|
||||
reference back(void) { return *(end() - 1); }
|
||||
pointer data(void) { return reinterpret_cast<pointer>(_storage); }
|
||||
const_pointer data(void) const {
|
||||
return reinterpret_cast<const_pointer>(_storage);
|
||||
}
|
||||
void shrink_to_fit(void) {
|
||||
if (_size != capacity()) {
|
||||
void *a = alloc_store(_size * sizeof(T));
|
||||
auto tmp = detail::mem_mgr::instance()
|
||||
.translate_ptr(a)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(get_buffer()),
|
||||
oneapi::dpl::begin(get_buffer()) + _size,
|
||||
oneapi::dpl::begin(tmp));
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = a;
|
||||
}
|
||||
}
|
||||
void assign(size_type n, const T &x) {
|
||||
resize(n);
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, begin(), begin() + n, x);
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
assign(InputIterator first,
|
||||
typename std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = std::distance(first, last);
|
||||
resize(n);
|
||||
if (internal::is_iterator<InputIterator>::value &&
|
||||
!std::is_pointer<InputIterator>::value)
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, begin());
|
||||
else {
|
||||
Buffer tmp(first, last);
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), begin());
|
||||
}
|
||||
}
|
||||
void clear(void) {
|
||||
_size = 0;
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = nullptr;
|
||||
}
|
||||
bool empty(void) const { return (size() == 0); }
|
||||
void push_back(const T &x) { insert(end(), size_type(1), x); }
|
||||
void pop_back(void) {
|
||||
if (_size > 0)
|
||||
--_size;
|
||||
}
|
||||
iterator erase(iterator first, iterator last) {
|
||||
auto n = std::distance(first, last);
|
||||
if (last == end()) {
|
||||
_size = _size - n;
|
||||
return end();
|
||||
}
|
||||
Buffer tmp{Range(std::distance(last, end()))};
|
||||
// copy remainder to temporary buffer.
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, last, end(),
|
||||
oneapi::dpl::begin(tmp));
|
||||
// override (erase) subsequence in storage.
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), first);
|
||||
resize(_size - n);
|
||||
return begin() + first.get_idx() + n;
|
||||
}
|
||||
iterator erase(iterator pos) { return erase(pos, pos + 1); }
|
||||
iterator insert(iterator position, const T &x) {
|
||||
auto n = std::distance(begin(), position);
|
||||
insert(position, size_type(1), x);
|
||||
return begin() + n;
|
||||
}
|
||||
void insert(iterator position, size_type n, const T &x) {
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, end() - n, end(), x);
|
||||
} else {
|
||||
auto i_n = std::distance(begin(), position);
|
||||
// allocate temporary storage
|
||||
Buffer tmp{Range(std::distance(position, end()))};
|
||||
// copy remainder
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
|
||||
oneapi::dpl::begin(tmp));
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, position, position + n,
|
||||
x);
|
||||
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), position + n);
|
||||
}
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
insert(iterator position, InputIterator first,
|
||||
typename std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = std::distance(first, last);
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, end());
|
||||
} else {
|
||||
Buffer tmp{Range(std::distance(position, end()))};
|
||||
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
|
||||
oneapi::dpl::begin(tmp));
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, position);
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), position + n);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,26 +0,0 @@
|
|||
//==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_DPL_UTILS_HPP__
|
||||
#define __DPCT_DPL_UTILS_HPP__
|
||||
|
||||
#define ONEDPL_USE_DPCPP_BACKEND 1
|
||||
#define __USE_DPCT 1
|
||||
|
||||
#include <oneapi/dpl/execution>
|
||||
#include <oneapi/dpl/algorithm>
|
||||
#include <oneapi/dpl/numeric>
|
||||
|
||||
#include "dpl_extras/memory.h"
|
||||
#include "dpl_extras/algorithm.h"
|
||||
#include "dpl_extras/numeric.h"
|
||||
#include "dpl_extras/iterators.h"
|
||||
#include "dpl_extras/vector.h"
|
||||
#include "dpl_extras/dpcpp_extensions.h"
|
||||
|
||||
#endif // __DPCT_DPL_UTILS_HPP__
|
1376
dpct/fft_utils.hpp
1376
dpct/fft_utils.hpp
File diff suppressed because it is too large
Load diff
901
dpct/image.hpp
901
dpct/image.hpp
|
@ -1,901 +0,0 @@
|
|||
//==---- image.hpp --------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_IMAGE_HPP__
|
||||
#define __DPCT_IMAGE_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include "memory.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
namespace dpct {
|
||||
|
||||
enum class image_channel_data_type {
|
||||
signed_int,
|
||||
unsigned_int,
|
||||
fp,
|
||||
};
|
||||
|
||||
class image_channel;
|
||||
class image_wrapper_base;
|
||||
namespace detail {
|
||||
/// Image object type traits, with accessor type and sampled data type defined.
|
||||
/// The data type of an image accessor must be one of sycl::int4, sycl::uint4,
|
||||
/// sycl::float4 and sycl::half4. The data type of accessors with 8bits/16bits
|
||||
/// channel width will be 32 bits. sycl::half is an exception.
|
||||
template <class T> struct image_trait {
|
||||
using acc_data_t = sycl::vec<T, 4>;
|
||||
template <int dimensions>
|
||||
using accessor_t =
|
||||
sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
|
||||
sycl::access::target::image>;
|
||||
template <int dimensions>
|
||||
using array_accessor_t =
|
||||
sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
|
||||
sycl::access::target::image_array>;
|
||||
using data_t = T;
|
||||
using elem_t = T;
|
||||
static constexpr image_channel_data_type data_type =
|
||||
std::is_integral<T>::value
|
||||
? (std::is_signed<T>::value ? image_channel_data_type::signed_int
|
||||
: image_channel_data_type::unsigned_int)
|
||||
: image_channel_data_type::fp;
|
||||
static constexpr int channel_num = 1;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::uint8_t> : public image_trait<std::uint32_t> {
|
||||
using data_t = std::uint8_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::uint16_t>
|
||||
: public image_trait<std::uint32_t> {
|
||||
using data_t = std::uint16_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::int8_t> : public image_trait<std::int32_t> {
|
||||
using data_t = std::int8_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::int16_t> : public image_trait<std::int32_t> {
|
||||
using data_t = std::int16_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<char>
|
||||
: public image_trait<typename std::conditional<
|
||||
std::is_signed<char>::value, signed char, unsigned char>::type> {};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 1>> : public image_trait<T> {};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 2>> : public image_trait<T> {
|
||||
using data_t = sycl::vec<T, 2>;
|
||||
static constexpr int channel_num = 2;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 3>>
|
||||
: public image_trait<sycl::vec<T, 4>> {
|
||||
static constexpr int channel_num = 3;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 4>> : public image_trait<T> {
|
||||
using data_t = sycl::vec<T, 4>;
|
||||
static constexpr int channel_num = 4;
|
||||
};
|
||||
|
||||
/// Functor to fetch data from read result of an image accessor.
|
||||
template <class T> struct fetch_data {
|
||||
using return_t = typename image_trait<T>::data_t;
|
||||
using acc_data_t = typename image_trait<T>::acc_data_t;
|
||||
|
||||
return_t operator()(acc_data_t &&original_data) {
|
||||
return (return_t)original_data.r();
|
||||
}
|
||||
};
|
||||
template <class T>
|
||||
struct fetch_data<sycl::vec<T, 1>> : public fetch_data<T> {};
|
||||
template <class T> struct fetch_data<sycl::vec<T, 2>> {
|
||||
using return_t = typename image_trait<sycl::vec<T, 2>>::data_t;
|
||||
using acc_data_t = typename image_trait<sycl::vec<T, 2>>::acc_data_t;
|
||||
|
||||
return_t operator()(acc_data_t &&origin_data) {
|
||||
return return_t(origin_data.r(), origin_data.g());
|
||||
}
|
||||
};
|
||||
template <class T>
|
||||
struct fetch_data<sycl::vec<T, 3>>
|
||||
: public fetch_data<sycl::vec<T, 4>> {};
|
||||
template <class T> struct fetch_data<sycl::vec<T, 4>> {
|
||||
using return_t = typename image_trait<sycl::vec<T, 4>>::data_t;
|
||||
using acc_data_t = typename image_trait<sycl::vec<T, 4>>::acc_data_t;
|
||||
|
||||
return_t operator()(acc_data_t &&origin_data) {
|
||||
return return_t(origin_data.r(), origin_data.g(), origin_data.b(),
|
||||
origin_data.a());
|
||||
}
|
||||
};
|
||||
|
||||
/// Create image according with given type \p T and \p dims.
|
||||
template <class T> static image_wrapper_base *create_image_wrapper(int dims);
|
||||
|
||||
/// Create image with given data type \p T, channel order and dims
|
||||
template <class T>
|
||||
static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims);
|
||||
|
||||
/// Create image with channel info and specified dimensions.
|
||||
static image_wrapper_base *create_image_wrapper(image_channel channel, int dims);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Image channel info, include channel number, order, data width and type
|
||||
class image_channel {
|
||||
image_channel_data_type _type = image_channel_data_type::signed_int;
|
||||
/// Number of channels.
|
||||
unsigned _channel_num = 0;
|
||||
/// Total size of all channels in bytes.
|
||||
unsigned _total_size = 0;
|
||||
/// Size of each channel in bytes.
|
||||
unsigned _channel_size = 0;
|
||||
|
||||
public:
|
||||
/// Create image channel info according to template argument \p T.
|
||||
template <class T> static image_channel create() {
|
||||
image_channel channel;
|
||||
channel.set_channel_size(detail::image_trait<T>::channel_num,
|
||||
sizeof(typename detail::image_trait<T>::elem_t) *
|
||||
8);
|
||||
channel.set_channel_data_type(detail::image_trait<T>::data_type);
|
||||
return channel;
|
||||
}
|
||||
|
||||
image_channel() = default;
|
||||
|
||||
image_channel_data_type get_channel_data_type() { return _type; }
|
||||
void set_channel_data_type(image_channel_data_type type) { _type = type; }
|
||||
|
||||
unsigned get_total_size() { return _total_size; }
|
||||
|
||||
unsigned get_channel_num() { return _channel_num; }
|
||||
void set_channel_num(unsigned channel_num) {
|
||||
_channel_num = channel_num;
|
||||
_total_size = _channel_size * _channel_num;
|
||||
}
|
||||
|
||||
/// image_channel constructor.
|
||||
/// \param r Channel r width in bits.
|
||||
/// \param g Channel g width in bits. Should be same with \p r, or zero.
|
||||
/// \param b Channel b width in bits. Should be same with \p g, or zero.
|
||||
/// \param a Channel a width in bits. Should be same with \p b, or zero.
|
||||
/// \param data_type Image channel data type: signed_nt, unsigned_int or fp.
|
||||
image_channel(int r, int g, int b, int a, image_channel_data_type data_type) {
|
||||
_type = data_type;
|
||||
if (a) {
|
||||
assert(r == a && "SYCL doesn't support different channel size");
|
||||
assert(r == b && "SYCL doesn't support different channel size");
|
||||
assert(r == g && "SYCL doesn't support different channel size");
|
||||
set_channel_size(4, a);
|
||||
} else if (b) {
|
||||
assert(r == b && "SYCL doesn't support different channel size");
|
||||
assert(r == g && "SYCL doesn't support different channel size");
|
||||
set_channel_size(3, b);
|
||||
} else if (g) {
|
||||
assert(r == g && "SYCL doesn't support different channel size");
|
||||
set_channel_size(2, g);
|
||||
} else {
|
||||
set_channel_size(1, r);
|
||||
}
|
||||
}
|
||||
|
||||
sycl::image_channel_type get_channel_type() const {
|
||||
if (_channel_size == 4) {
|
||||
if (_type == image_channel_data_type::signed_int)
|
||||
return sycl::image_channel_type::signed_int32;
|
||||
else if (_type == image_channel_data_type::unsigned_int)
|
||||
return sycl::image_channel_type::unsigned_int32;
|
||||
else if (_type == image_channel_data_type::fp)
|
||||
return sycl::image_channel_type::fp32;
|
||||
} else if (_channel_size == 2) {
|
||||
if (_type == image_channel_data_type::signed_int)
|
||||
return sycl::image_channel_type::signed_int16;
|
||||
else if (_type == image_channel_data_type::unsigned_int)
|
||||
return sycl::image_channel_type::unsigned_int16;
|
||||
else if (_type == image_channel_data_type::fp)
|
||||
return sycl::image_channel_type::fp16;
|
||||
} else {
|
||||
if (_type == image_channel_data_type::signed_int)
|
||||
return sycl::image_channel_type::signed_int8;
|
||||
else if (_type == image_channel_data_type::unsigned_int)
|
||||
return sycl::image_channel_type::unsigned_int8;
|
||||
}
|
||||
assert(false && "unexpected channel data kind and channel size");
|
||||
return sycl::image_channel_type::signed_int32;
|
||||
}
|
||||
void set_channel_type(sycl::image_channel_type type) {
|
||||
switch (type) {
|
||||
case sycl::image_channel_type::unsigned_int8:
|
||||
_type = image_channel_data_type::unsigned_int;
|
||||
_channel_size = 1;
|
||||
break;
|
||||
case sycl::image_channel_type::unsigned_int16:
|
||||
_type = image_channel_data_type::unsigned_int;
|
||||
_channel_size = 2;
|
||||
break;
|
||||
case sycl::image_channel_type::unsigned_int32:
|
||||
_type = image_channel_data_type::unsigned_int;
|
||||
_channel_size = 4;
|
||||
break;
|
||||
case sycl::image_channel_type::signed_int8:
|
||||
_type = image_channel_data_type::signed_int;
|
||||
_channel_size = 1;
|
||||
break;
|
||||
case sycl::image_channel_type::signed_int16:
|
||||
_type = image_channel_data_type::signed_int;
|
||||
_channel_size = 2;
|
||||
break;
|
||||
case sycl::image_channel_type::signed_int32:
|
||||
_type = image_channel_data_type::signed_int;
|
||||
_channel_size = 4;
|
||||
break;
|
||||
case sycl::image_channel_type::fp16:
|
||||
_type = image_channel_data_type::fp;
|
||||
_channel_size = 2;
|
||||
break;
|
||||
case sycl::image_channel_type::fp32:
|
||||
_type = image_channel_data_type::fp;
|
||||
_channel_size = 4;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
_total_size = _channel_size * _channel_num;
|
||||
}
|
||||
|
||||
sycl::image_channel_order get_channel_order() const {
|
||||
switch (_channel_num) {
|
||||
case 1:
|
||||
return sycl::image_channel_order::r;
|
||||
case 2:
|
||||
return sycl::image_channel_order::rg;
|
||||
case 3:
|
||||
return sycl::image_channel_order::rgb;
|
||||
case 4:
|
||||
return sycl::image_channel_order::rgba;
|
||||
default:
|
||||
return sycl::image_channel_order::r;
|
||||
}
|
||||
}
|
||||
/// Get the size for each channel in bits.
|
||||
unsigned get_channel_size() const { return _channel_size * 8; }
|
||||
|
||||
/// Set channel size.
|
||||
/// \param in_channel_num Channels number to set.
|
||||
/// \param channel_size Size for each channel in bits.
|
||||
void set_channel_size(unsigned in_channel_num,
|
||||
unsigned channel_size) {
|
||||
if (in_channel_num < _channel_num)
|
||||
return;
|
||||
_channel_num = in_channel_num;
|
||||
_channel_size = channel_size / 8;
|
||||
_total_size = _channel_size * _channel_num;
|
||||
}
|
||||
};
|
||||
|
||||
/// 2D or 3D matrix data for image.
|
||||
class image_matrix {
|
||||
image_channel _channel;
|
||||
int _range[3] = {1, 1, 1};
|
||||
int _dims = 0;
|
||||
void *_host_data = nullptr;
|
||||
|
||||
/// Set range of each dimension.
|
||||
template <int dimensions> void set_range(sycl::range<dimensions> range) {
|
||||
for (int i = 0; i < dimensions; ++i)
|
||||
_range[i] = range[i];
|
||||
_dims = dimensions;
|
||||
}
|
||||
|
||||
template <int... DimIdx>
|
||||
sycl::range<sizeof...(DimIdx)> get_range(integer_sequence<DimIdx...>) {
|
||||
return sycl::range<sizeof...(DimIdx)>(_range[DimIdx]...);
|
||||
}
|
||||
|
||||
public:
|
||||
/// Constructor with channel info and dimension size info.
|
||||
template <int dimensions>
|
||||
image_matrix(image_channel channel, sycl::range<dimensions> range)
|
||||
: _channel(channel) {
|
||||
set_range(range);
|
||||
_host_data = std::malloc(range.size() * _channel.get_total_size());
|
||||
}
|
||||
image_matrix(sycl::image_channel_type channel_type, unsigned channel_num,
|
||||
size_t x, size_t y) {
|
||||
_channel.set_channel_type(channel_type);
|
||||
_channel.set_channel_num(channel_num);
|
||||
_dims = 1;
|
||||
_range[0] = x;
|
||||
if (y) {
|
||||
_dims = 2;
|
||||
_range[1] = y;
|
||||
}
|
||||
_host_data = std::malloc(_range[0] * _range[1] * _channel.get_total_size());
|
||||
}
|
||||
|
||||
/// Construct a new image class with the matrix data.
|
||||
template <int dimensions> sycl::image<dimensions> *create_image() {
|
||||
return create_image<dimensions>(_channel);
|
||||
}
|
||||
/// Construct a new image class with the matrix data.
|
||||
template <int dimensions>
|
||||
sycl::image<dimensions> *create_image(image_channel channel) {
|
||||
return new sycl::image<dimensions>(
|
||||
_host_data, channel.get_channel_order(), channel.get_channel_type(),
|
||||
get_range(make_index_sequence<dimensions>()),
|
||||
sycl::property::image::use_host_ptr());
|
||||
}
|
||||
|
||||
/// Get channel info.
|
||||
inline image_channel get_channel() { return _channel; }
|
||||
/// Get range of the image.
|
||||
sycl::range<3> get_range() {
|
||||
return sycl::range<3>(_range[0], _range[1], _range[2]);
|
||||
}
|
||||
/// Get matrix dims.
|
||||
inline int get_dims() { return _dims; }
|
||||
/// Convert to pitched data.
|
||||
pitched_data to_pitched_data() {
|
||||
return pitched_data(_host_data, _range[0] * _channel.get_total_size(),
|
||||
_range[0], _range[1]);
|
||||
}
|
||||
|
||||
~image_matrix() {
|
||||
if (_host_data)
|
||||
std::free(_host_data);
|
||||
_host_data = nullptr;
|
||||
}
|
||||
};
|
||||
using image_matrix_p = image_matrix *;
|
||||
|
||||
enum class image_data_type { matrix, linear, pitch, unsupport };
|
||||
|
||||
/// Image data info.
|
||||
class image_data {
|
||||
public:
|
||||
image_data() { _type = image_data_type::unsupport; }
|
||||
image_data(image_matrix_p matrix_data) { set_data(matrix_data); }
|
||||
image_data(void *data_ptr, size_t x_size, image_channel channel) {
|
||||
set_data(data_ptr, x_size, channel);
|
||||
}
|
||||
image_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
|
||||
image_channel channel) {
|
||||
set_data(data_ptr, x_size, y_size, pitch_size, channel);
|
||||
}
|
||||
void set_data(image_matrix_p matrix_data) {
|
||||
_type = image_data_type::matrix;
|
||||
_data = matrix_data;
|
||||
_channel = matrix_data->get_channel();
|
||||
}
|
||||
void set_data(void *data_ptr, size_t x_size, image_channel channel) {
|
||||
_type = image_data_type::linear;
|
||||
_data = data_ptr;
|
||||
_x = x_size;
|
||||
_channel = channel;
|
||||
}
|
||||
void set_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
|
||||
image_channel channel) {
|
||||
_type = image_data_type::pitch;
|
||||
_data = data_ptr;
|
||||
_x = x_size;
|
||||
_y = y_size;
|
||||
_pitch = pitch_size;
|
||||
_channel = channel;
|
||||
}
|
||||
|
||||
// Kind of memory this image_data describes (matrix / linear / pitch).
image_data_type get_data_type() const { return _type; }
void set_data_type(image_data_type type) { _type = type; }

// Raw data pointer; its interpretation depends on get_data_type()
// (image_matrix* for matrix, device/host memory otherwise).
void *get_data_ptr() const { return _data; }
void set_data_ptr(void *data) { _data = data; }

// Size in the x dimension (bytes for linear data; see set_data overloads).
size_t get_x() const { return _x; }
void set_x(size_t x) { _x = x; }

// Height in rows; only meaningful for pitched data.
size_t get_y() const { return _y; }
void set_y(size_t y) { _y = y; }

// Row pitch in bytes; only meaningful for pitched data.
size_t get_pitch() const { return _pitch; }
void set_pitch(size_t pitch) { _pitch = pitch; }

// Channel format describing element layout.
image_channel get_channel() const { return _channel; }
void set_channel(image_channel channel) { _channel = channel; }

// Convenience pass-throughs to the stored image_channel.
image_channel_data_type get_channel_data_type() {
  return _channel.get_channel_data_type();
}
void set_channel_data_type(image_channel_data_type type) {
  _channel.set_channel_data_type(type);
}

unsigned get_channel_size() { return _channel.get_channel_size(); }
void set_channel_size(unsigned channel_num, unsigned channel_size) {
  return _channel.set_channel_size(channel_num, channel_size);
}

unsigned get_channel_num() { return _channel.get_channel_num(); }
void set_channel_num(unsigned num) {
  return _channel.set_channel_num(num);
}

sycl::image_channel_type get_channel_type() {
  return _channel.get_channel_type();
}
void set_channel_type(sycl::image_channel_type type) {
  return _channel.set_channel_type(type);
}

private:
image_data_type _type;
void *_data = nullptr;
// NOTE(review): _x/_y/_pitch are only initialized by the set_data overloads
// for the data types that use them; they are read back via the getters above.
size_t _x, _y, _pitch;
image_channel _channel;
};
|
||||
|
||||
/// Image sampling info, include addressing mode, filtering mode and
/// normalization info.
class sampling_info {
  sycl::addressing_mode _addressing_mode =
      sycl::addressing_mode::clamp_to_edge;
  sycl::filtering_mode _filtering_mode = sycl::filtering_mode::nearest;
  sycl::coordinate_normalization_mode _coordinate_normalization_mode =
      sycl::coordinate_normalization_mode::unnormalized;

public:
  // The set() overloads below are distinguished purely by argument type.
  sycl::addressing_mode get_addressing_mode() { return _addressing_mode; }
  void set(sycl::addressing_mode addressing_mode) { _addressing_mode = addressing_mode; }

  sycl::filtering_mode get_filtering_mode() { return _filtering_mode; }
  void set(sycl::filtering_mode filtering_mode) { _filtering_mode = filtering_mode; }

  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
    return _coordinate_normalization_mode;
  }
  void set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    _coordinate_normalization_mode = coordinate_normalization_mode;
  }

  bool is_coordinate_normalized() {
    return _coordinate_normalization_mode ==
           sycl::coordinate_normalization_mode::normalized;
  }
  // Integer flag form: nonzero selects normalized coordinates.
  void set_coordinate_normalization_mode(int is_normalized) {
    _coordinate_normalization_mode =
        is_normalized ? sycl::coordinate_normalization_mode::normalized
                      : sycl::coordinate_normalization_mode::unnormalized;
  }
  // Set all three properties at once.
  void
  set(sycl::addressing_mode addressing_mode,
      sycl::filtering_mode filtering_mode,
      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    set(addressing_mode);
    set(filtering_mode);
    set(coordinate_normalization_mode);
  }
  // Same, with the normalization given as an integer flag.
  void set(sycl::addressing_mode addressing_mode,
           sycl::filtering_mode filtering_mode, int is_normalized) {
    set(addressing_mode);
    set(filtering_mode);
    set_coordinate_normalization_mode(is_normalized);
  }

  /// Build a sycl::sampler from the stored settings.
  sycl::sampler get_sampler() {
    return sycl::sampler(_coordinate_normalization_mode, _addressing_mode,
                         _filtering_mode);
  }
};
|
||||
|
||||
/// Image base class.
/// Holds the image payload (image_data) plus the sampling configuration
/// (sampling_info) and exposes convenience accessors that delegate to both.
class image_wrapper_base {
  sampling_info _sampling_info;
  image_data _data;

public:
  // Pure virtual to make the class abstract; an out-of-line definition is
  // still provided below so derived destructors can chain to it.
  virtual ~image_wrapper_base() = 0;

  void attach(image_data data) { set_data(data); }
  /// Attach matrix data to this class.
  void attach(image_matrix *matrix) {
    detach();
    image_wrapper_base::set_data(image_data(matrix));
  }
  /// Attach matrix data to this class.
  void attach(image_matrix *matrix, image_channel channel) {
    attach(matrix);
    image_wrapper_base::set_channel(channel);
  }
  /// Attach linear data to this class.
  void attach(const void *ptr, size_t count) {
    attach(ptr, count, get_channel());
  }
  /// Attach linear data to this class.
  void attach(const void *ptr, size_t count, image_channel channel) {
    detach();
    image_wrapper_base::set_data(image_data(const_cast<void *>(ptr), count, channel));
  }
  /// Attach 2D data to this class.
  void attach(const void *data, size_t x, size_t y, size_t pitch) {
    attach(data, x, y, pitch, get_channel());
  }
  /// Attach 2D data to this class.
  void attach(const void *data, size_t x, size_t y, size_t pitch,
              image_channel channel) {
    detach();
    image_wrapper_base::set_data(
        image_data(const_cast<void *>(data), x, y, pitch, channel));
  }
  /// Detach data. Default is a no-op; image_wrapper overrides it to release
  /// the underlying sycl::image.
  virtual void detach() {}

  sampling_info get_sampling_info() { return _sampling_info; }
  void set_sampling_info(sampling_info info) {
    _sampling_info = info;
  }
  const image_data &get_data() { return _data; }
  void set_data(image_data data) { _data = data; }

  // Channel accessors delegate to the stored image_data.
  image_channel get_channel() { return _data.get_channel(); }
  void set_channel(image_channel channel) { _data.set_channel(channel); }

  image_channel_data_type get_channel_data_type() {
    return _data.get_channel_data_type();
  }
  void set_channel_data_type(image_channel_data_type type) {
    _data.set_channel_data_type(type);
  }

  unsigned get_channel_size() { return _data.get_channel_size(); }
  void set_channel_size(unsigned channel_num, unsigned channel_size) {
    return _data.set_channel_size(channel_num, channel_size);
  }

  // Sampling accessors delegate to the stored sampling_info.
  sycl::addressing_mode get_addressing_mode() {
    return _sampling_info.get_addressing_mode();
  }
  void set(sycl::addressing_mode addressing_mode) {
    _sampling_info.set(addressing_mode);
  }

  sycl::filtering_mode get_filtering_mode() {
    return _sampling_info.get_filtering_mode();
  }
  void set(sycl::filtering_mode filtering_mode) {
    _sampling_info.set(filtering_mode);
  }

  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
    return _sampling_info.get_coordinate_normalization_mode();
  }
  void
  set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    _sampling_info.set(coordinate_normalization_mode);
  }

  bool is_coordinate_normalized() {
    return _sampling_info.is_coordinate_normalized();
  }
  void set_coordinate_normalization_mode(int is_normalized) {
    _sampling_info.set_coordinate_normalization_mode(is_normalized);
  }
  // Set all three sampling properties at once.
  void
  set(sycl::addressing_mode addressing_mode,
      sycl::filtering_mode filtering_mode,
      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    set(addressing_mode);
    set(filtering_mode);
    set(coordinate_normalization_mode);
  }
  void set(sycl::addressing_mode addressing_mode,
           sycl::filtering_mode filtering_mode, int is_normalized) {
    set(addressing_mode);
    set(filtering_mode);
    set_coordinate_normalization_mode(is_normalized);
  }

  unsigned get_channel_num() { return _data.get_channel_num(); }
  void set_channel_num(unsigned num) {
    return _data.set_channel_num(num);
  }

  sycl::image_channel_type get_channel_type() {
    return _data.get_channel_type();
  }
  void set_channel_type(sycl::image_channel_type type) {
    return _data.set_channel_type(type);
  }

  /// Build the sampler for this image, forcing nearest filtering for linear
  /// memory (see comment below).
  sycl::sampler get_sampler() {
    sycl::sampler smp = _sampling_info.get_sampler();
    /// linear memory only used for sycl::filtering_mode::nearest.
    if (_data.get_data_type() == image_data_type::linear) {
      smp = sycl::sampler(smp.get_coordinate_normalization_mode(),
                          smp.get_addressing_mode(),
                          sycl::filtering_mode::nearest);
    }
    return smp;
  }
};
|
||||
// Out-of-line definition for the pure virtual destructor declared above;
// required because derived destructors invoke it.
inline image_wrapper_base::~image_wrapper_base() {}
using image_wrapper_base_p = image_wrapper_base *;

// Forward declaration; the primary template and the IsImageArray=true
// specialization are defined later in this file.
template <class T, int dimensions, bool IsImageArray> class image_accessor_ext;
|
||||
|
||||
/// Image class, wrapper of sycl::image.
/// Lazily creates the underlying sycl::image on first get_access() from the
/// attached image_data, and owns it until detach()/destruction.
template <class T, int dimensions, bool IsImageArray = false> class image_wrapper : public image_wrapper_base {
  sycl::image<dimensions> *_image = nullptr;

#ifndef DPCT_USM_LEVEL_NONE
  // Host-side staging copy used when the attached pointer is device-only USM.
  std::vector<char> _host_buffer;
#endif

  /// Materialize _image from the attached image_data, using \p q to stage
  /// device-only memory to the host when necessary.
  void create_image(sycl::queue q) {
    auto &data = get_data();
    // Matrix data already knows how to build its own sycl::image.
    if (data.get_data_type() == image_data_type::matrix) {
      _image = static_cast<image_matrix_p>(data.get_data_ptr())
                   ->create_image<dimensions>(data.get_channel());
      return;
    }
    auto ptr = data.get_data_ptr();
    auto channel = data.get_channel();

    if (detail::get_pointer_attribute(q, ptr) == detail::pointer_access_attribute::device_only) {
#ifdef DPCT_USM_LEVEL_NONE
      // Buffer mode: get a host-accessible pointer through the buffer.
      ptr = get_buffer(ptr)
                .template get_access<sycl::access_mode::read_write>()
                .get_pointer();
#else
      // USM mode: copy the device data into a host staging buffer.
      // NOTE(review): byte count assumes get_x() is in bytes for linear data
      // and an element count for pitched data — confirm against set_data.
      auto sz = data.get_x();
      if (data.get_data_type() == image_data_type::pitch)
        sz *= channel.get_total_size() * data.get_y();
      _host_buffer.resize(sz);
      q.memcpy(_host_buffer.data(), ptr, sz).wait();
      ptr = _host_buffer.data();
#endif
    }

    if constexpr (dimensions == 1) {
      assert(data.get_data_type() == image_data_type::linear);
      _image = new sycl::image<1>(
          ptr, channel.get_channel_order(), channel.get_channel_type(),
          sycl::range<1>(data.get_x() / channel.get_total_size()));
    } else if constexpr (dimensions == 2) {
      assert(data.get_data_type() == image_data_type::pitch);
      _image = new sycl::image<2>(ptr, channel.get_channel_order(),
                                  channel.get_channel_type(),
                                  sycl::range<2>(data.get_x(), data.get_y()),
                                  sycl::range<1>(data.get_pitch()));
    } else {
      throw std::runtime_error("3D image only support matrix data");
    }
    return;
  }

public:
  using acc_data_t = typename detail::image_trait<T>::acc_data_t;
  // For image arrays, one dimension indexes the array, so the accessor
  // operates on (dimensions - 1)-dimensional slices.
  using accessor_t =
      typename image_accessor_ext<T, IsImageArray ? (dimensions - 1) : dimensions,
                                  IsImageArray>::accessor_t;

  image_wrapper() { set_channel(image_channel::create<T>()); }
  ~image_wrapper() { detach(); }

  /// Get image accessor.
  accessor_t get_access(sycl::handler &cgh, sycl::queue &q = get_default_queue()) {
    if (!_image)
      create_image(q);
    return accessor_t(*_image, cgh);
  }

  /// Detach data.
  void detach() override {
    if (_image)
      delete _image;
    _image = nullptr;
  }
};
|
||||
|
||||
/// Wrap sampler and image accessor together.
/// Each read() overload samples the image at the given coordinates and
/// converts the raw 4-component result to data_t via detail::fetch_data.
/// Float coordinates use sampled reads; integral coordinates use the int
/// coordinate forms. Overloads are enabled per dimensionality via SFINAE.
template <class T, int dimensions, bool IsImageArray = false>
class image_accessor_ext {
public:
  using accessor_t =
      typename detail::image_trait<T>::template accessor_t<dimensions>;
  using data_t = typename detail::image_trait<T>::data_t;
  sycl::sampler _sampler;
  accessor_t _img_acc;

public:
  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
      : _sampler(sampler), _img_acc(acc) {}

  /// Read data from accessor (3D, float coordinates).
  template <bool Available = dimensions == 3>
  typename std::enable_if<Available, data_t>::type read(float x, float y,
                                                        float z) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::float4(x, y, z, 0), _sampler));
  }
  /// Read data from accessor (3D, integral coordinates).
  template <class Coord0, class Coord1, class Coord2,
            bool Available = dimensions == 3 &&
                             std::is_integral<Coord0>::value
                                 &&std::is_integral<Coord1>::value
                                     &&std::is_integral<Coord2>::value>
  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y,
                                                        Coord2 z) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::int4(x, y, z, 0), _sampler));
  }
  /// Read data from accessor (2D, float coordinates).
  template <bool Available = dimensions == 2>
  typename std::enable_if<Available, data_t>::type read(float x, float y) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::float2(x, y), _sampler));
  }
  /// Read data from accessor (2D, integral coordinates).
  template <class Coord0, class Coord1,
            bool Available = dimensions == 2 &&
                             std::is_integral<Coord0>::value
                                 &&std::is_integral<Coord1>::value>
  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::int2(x, y), _sampler));
  }
  /// Read data from accessor (1D, float coordinate).
  template <bool Available = dimensions == 1>
  typename std::enable_if<Available, data_t>::type read(float x) {
    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
  }
  /// Read data from accessor (1D, integral coordinate).
  template <class CoordT,
            bool Available = dimensions == 1 && std::is_integral<CoordT>::value>
  typename std::enable_if<Available, data_t>::type read(CoordT x) {
    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
  }
};
|
||||
|
||||
/// Specialization for image arrays (IsImageArray == true): the first read()
/// argument selects the array slice, the remaining coordinates sample within
/// that slice.
template <class T, int dimensions> class image_accessor_ext<T, dimensions, true> {
public:
  using accessor_t =
      typename detail::image_trait<T>::template array_accessor_t<dimensions>;
  using data_t = typename detail::image_trait<T>::data_t;
  sycl::sampler _sampler;
  accessor_t _img_acc;

public:
  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
      : _sampler(sampler), _img_acc(acc) {}

  /// Read data from accessor (2D slice, float coordinates).
  template <bool Available = dimensions == 2>
  typename std::enable_if<Available, data_t>::type read(int index, float x,
                                                        float y) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(sycl::float2(x, y), _sampler));
  }
  /// Read data from accessor (2D slice, integer coordinates).
  template <bool Available = dimensions == 2>
  typename std::enable_if<Available, data_t>::type read(int index, int x, int y) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(sycl::int2(x, y), _sampler));
  }
  /// Read data from accessor (1D slice, float coordinate).
  template <bool Available = dimensions == 1>
  typename std::enable_if<Available, data_t>::type read(int index, float x) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(x, _sampler));
  }
  /// Read data from accessor (1D slice, integer coordinate).
  template <bool Available = dimensions == 1>
  typename std::enable_if<Available, data_t>::type read(int index, int x) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(x, _sampler));
  }
};
|
||||
|
||||
/// Create image wrapper according to image data and sampling info.
|
||||
/// \return Pointer to image wrapper base class.
|
||||
/// \param data Image data used to create image wrapper.
|
||||
/// \param info Image sampling info used to create image wrapper.
|
||||
/// \returns Pointer to base class of created image wrapper object.
|
||||
static inline image_wrapper_base *create_image_wrapper(image_data data,
|
||||
sampling_info info) {
|
||||
image_channel channel;
|
||||
int dims = 1;
|
||||
if (data.get_data_type() == image_data_type::matrix) {
|
||||
auto matrix = (image_matrix_p)data.get_data_ptr();
|
||||
channel = matrix->get_channel();
|
||||
dims = matrix->get_dims();
|
||||
} else {
|
||||
if (data.get_data_type() == image_data_type::pitch) {
|
||||
dims = 2;
|
||||
}
|
||||
channel = data.get_channel();
|
||||
}
|
||||
|
||||
if (auto ret = detail::create_image_wrapper(channel, dims)) {
|
||||
ret->set_sampling_info(info);
|
||||
ret->set_data(data);
|
||||
return ret;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
/// Create image according with given type \p T and \p dims.
|
||||
template <class T> static image_wrapper_base *create_image_wrapper(int dims) {
|
||||
switch (dims) {
|
||||
case 1:
|
||||
return new image_wrapper<T, 1>();
|
||||
case 2:
|
||||
return new image_wrapper<T, 2>();
|
||||
case 3:
|
||||
return new image_wrapper<T, 3>();
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
/// Create image with given data type \p T, channel order and dims
|
||||
template <class T>
|
||||
static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims) {
|
||||
switch (channel_num) {
|
||||
case 1:
|
||||
return create_image_wrapper<T>(dims);
|
||||
case 2:
|
||||
return create_image_wrapper<sycl::vec<T, 2>>(dims);
|
||||
case 3:
|
||||
return create_image_wrapper<sycl::vec<T, 3>>(dims);
|
||||
case 4:
|
||||
return create_image_wrapper<sycl::vec<T, 4>>(dims);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Create image with channel info and specified dimensions.
|
||||
static image_wrapper_base *create_image_wrapper(image_channel channel, int dims) {
|
||||
switch (channel.get_channel_type()) {
|
||||
case sycl::image_channel_type::fp16:
|
||||
return create_image_wrapper<sycl::half>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::fp32:
|
||||
return create_image_wrapper<float>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::signed_int8:
|
||||
return create_image_wrapper<std::int8_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::signed_int16:
|
||||
return create_image_wrapper<std::int16_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::signed_int32:
|
||||
return create_image_wrapper<std::int32_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::unsigned_int8:
|
||||
return create_image_wrapper<std::uint8_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::unsigned_int16:
|
||||
return create_image_wrapper<std::uint16_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::unsigned_int32:
|
||||
return create_image_wrapper<std::uint32_t>(channel.get_channel_num(), dims);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
} // namespace dpct
|
||||
|
||||
#endif // !__DPCT_IMAGE_HPP__
|
459
dpct/kernel.hpp
459
dpct/kernel.hpp
|
@ -1,459 +0,0 @@
|
|||
//==---- kernel.hpp -------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_KERNEL_HPP__
|
||||
#define __DPCT_KERNEL_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#ifdef _WIN32
|
||||
#include <unordered_set>
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#if defined(__has_include) && __has_include(<filesystem>)
|
||||
#include <filesystem>
|
||||
#elif defined(__has_include) && __has_include(<experimental/filesystem>)
|
||||
#include <experimental/filesystem>
|
||||
#else
|
||||
#error "SYCLomatic runtime requires C++ filesystem support"
|
||||
#endif
|
||||
|
||||
#include <fstream>
|
||||
#include <image.hpp>
|
||||
#include <random>
|
||||
|
||||
namespace dpct {
|
||||
|
||||
// Signature of the "<name>_wrapper" entry points exported by kernel
// libraries: (queue, launch nd_range, local-memory byte count, kernel
// argument pointers, extra argument pointers).
typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &,
                               unsigned int, void **, void **);

// Result record for get_kernel_function_info().
struct kernel_function_info {
  // Maximum work-group size reported by the current device.
  int max_work_group_size = 0;
};
|
||||
|
||||
/// Fill \p kernel_info for kernel \p function.
/// NOTE(review): \p function is not consulted — the reported value is the
/// current device's max_work_group_size, not a per-kernel limit; confirm
/// this device-wide approximation is intended.
static inline void get_kernel_function_info(kernel_function_info *kernel_info,
                                            const void *function) {
  kernel_info->max_work_group_size =
      dpct::dev_mgr::instance()
          .current_device()
          .get_info<sycl::info::device::max_work_group_size>();
}
||||
/// Value-returning overload of get_kernel_function_info().
/// NOTE(review): as above, \p function is unused; the value is the current
/// device's max_work_group_size.
static inline kernel_function_info
get_kernel_function_info(const void *function) {
  kernel_function_info kernel_info;
  kernel_info.max_work_group_size =
      dpct::dev_mgr::instance()
          .current_device()
          .get_info<sycl::info::device::max_work_group_size>();
  return kernel_info;
}
|
||||
|
||||
|
||||
namespace detail {
|
||||
|
||||
#if defined(__has_include) && __has_include(<filesystem>)
|
||||
namespace fs = std::filesystem;
|
||||
#else
|
||||
namespace fs = std::experimental::filesystem;
|
||||
#endif
|
||||
|
||||
/// Write data to temporary file and return absolute path to temporary file.
/// Temporary file is created in a temporary directory both of which have random
/// names with only the user having access permissions. Only one temporary file
/// will be created in the temporary directory.
/// \param data Bytes to write.
/// \param size Number of bytes.
/// \throws std::runtime_error on any filesystem or verification failure.
static inline fs::path write_data_to_file(char const *const data, size_t size) {
  std::error_code ec;

  // Guard the size_t -> std::streamsize narrowing done by ofstream::write.
  if (sizeof(size_t) >= sizeof(std::streamsize) &&
      size > (std::numeric_limits<std::streamsize>::max)())
    throw std::runtime_error("data file too large");

  // random number generator
  std::random_device dev;
  std::mt19937 prng(dev());
  std::uniform_int_distribution<uint64_t> rand(0);

  // find temporary directory
  auto tmp_dir = fs::temp_directory_path(ec);
  if (ec)
    throw std::runtime_error("could not find temporary directory");

  // create private directory
  std::stringstream directory;
  fs::path directory_path;
  constexpr int max_attempts = 5;
  int i;

  for (i = 0; i < max_attempts; i++) {
    // NOTE: the stream is not cleared between attempts, so each retry
    // appends more hex digits — the candidate name just grows.
    directory << std::hex << rand(prng);
    directory_path = tmp_dir / directory.str();
    if (fs::create_directory(directory_path)) {
      break;
    }
  }
  if (i == max_attempts)
    throw std::runtime_error("could not create directory");

  // only allow owner permissions to private directory
  fs::permissions(directory_path, fs::perms::owner_all, ec);
  if (ec)
    throw std::runtime_error("could not set directory permissions");

  // random filename in private directory
  std::stringstream filename;
  filename << std::hex << rand(prng);
#ifdef _WIN32
  // LoadLibrary expects a .dll suffix.
  auto filepath = directory_path / (filename.str() + ".dll");
#else
  auto filepath = directory_path / filename.str();
#endif

  // write data to temporary file
  auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary);
  if (outfile) {
    // only allow program to write file
    fs::permissions(filepath, fs::perms::owner_write, ec);
    if (ec)
      throw std::runtime_error("could not set permissions");

    outfile.write(data, size);
    if (!outfile.good())
      throw std::runtime_error("could not write data");
    outfile.close();

    // only allow program to read/execute file
    fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec,
                    ec);
    if (ec)
      throw std::runtime_error("could not set permissions");
  } else
    throw std::runtime_error("could not write data");

  // check temporary file contents: re-read and compare byte-for-byte.
  auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary);
  if (infile) {
    bool mismatch = false;
    size_t cnt = 0;

    while (1) {
      char c;
      infile.get(c);
      if (infile.eof())
        break;
      if (c != data[cnt++])
        mismatch = true;
    }
    if (cnt != size || mismatch)
      throw std::runtime_error("file contents not written correctly");
  } else
    throw std::runtime_error("could not validate file");

  if (!filepath.is_absolute())
    throw std::runtime_error("temporary filepath is not absolute");

  return filepath;
}
|
||||
|
||||
/// Decode a little-endian 16-bit value from \p ptr.
static inline uint16_t extract16(unsigned char const *const ptr) {
  uint16_t value = 0;
  for (int byte = 1; byte >= 0; --byte)
    value = static_cast<uint16_t>((value << 8) | ptr[byte]);
  return value;
}
|
||||
|
||||
/// Decode a little-endian 32-bit value from \p ptr.
static inline uint32_t extract32(unsigned char const *const ptr) {
  uint32_t value = 0;
  for (int byte = 3; byte >= 0; --byte)
    value = (value << 8) | ptr[byte];
  return value;
}
|
||||
|
||||
/// Decode a little-endian 64-bit value from \p ptr.
static inline uint64_t extract64(unsigned char const *const ptr) {
  uint64_t value = 0;
  for (int byte = 7; byte >= 0; --byte)
    value = (value << 8) | ptr[byte];
  return value;
}
|
||||
|
||||
/// Compute the total size in bytes of a shared-library image already in
/// memory, by parsing its PE (Windows) or ELF64 (other platforms) headers.
/// \param blob Pointer to the start of the library image.
/// \throws std::runtime_error if the image is not a supported format.
static inline uint64_t get_lib_size(char const *const blob) {
#ifdef _WIN32
  ///////////////////////////////////////////////////////////////////////
  // Analyze DOS stub
  unsigned char const *const ublob =
      reinterpret_cast<unsigned char const *const>(blob);
  // 'MZ' DOS magic.
  if (ublob[0] != 0x4d || ublob[1] != 0x5a) {
    throw std::runtime_error("Blob is not a Windows DLL.");
  }
  // e_lfanew: offset of the PE header, stored at 0x3c.
  uint32_t pe_header_offset = extract32(ublob + 0x3c);

  ///////////////////////////////////////////////////////////////////////
  // Ananlyze PE-header
  unsigned char const *const pe_header = ublob + pe_header_offset;

  // signature: "PE\0\0"
  uint32_t pe_signature = extract32(pe_header + 0);
  if (pe_signature != 0x00004550) {
    throw std::runtime_error("PE-header signature is not 0x00004550");
  }

  // machine: 0x8664 = AMD64
  uint16_t machine = extract16(pe_header + 4);
  if (machine != 0x8664) {
    throw std::runtime_error("Only DLLs for x64 supported");
  }

  // number of sections
  uint16_t number_of_sections = extract16(pe_header + 6);

  // sizeof optional header
  uint16_t sizeof_optional_header = extract16(pe_header + 20);

  // magic: 0x10b = PE32, 0x20b = PE32+
  uint16_t magic = extract16(pe_header + 24);
  if (magic != 0x10b && magic != 0x20b) {
    throw std::runtime_error("MAGIC is not 0x010b or 0x020b");
  }

  ///////////////////////////////////////////////////////////////////////
  // Analyze tail of optional header
  constexpr int coff_header_size = 24;

  // Sanity check: the optional header should end with zero padding.
  unsigned char const *const tail_of_optional_header =
      pe_header + coff_header_size + sizeof_optional_header;
  if (extract64(tail_of_optional_header - 8) != 0) {
    throw std::runtime_error("Optional header not zero-padded");
  }

  ///////////////////////////////////////////////////////////////////////
  // Analyze last section header: the file ends where the last section's
  // raw data ends.
  constexpr int section_header_size = 40;
  unsigned char const *const last_section_header =
      tail_of_optional_header + section_header_size * (number_of_sections - 1);

  uint32_t sizeof_raw_data = extract32(last_section_header + 16);
  uint32_t pointer_to_raw_data = extract32(last_section_header + 20);

  return sizeof_raw_data + pointer_to_raw_data;
#else
  // ELF magic: 0x7F 'E' 'L' 'F'
  if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F')
    throw std::runtime_error("Blob is not in ELF format");

  // EI_CLASS: 2 = 64-bit
  if (blob[4] != 0x02)
    throw std::runtime_error("Only 64-bit headers are supported");

  // EI_DATA: 1 = little-endian
  if (blob[5] != 0x01)
    throw std::runtime_error("Only little-endian headers are supported");

  unsigned char const *const ublob =
      reinterpret_cast<unsigned char const *const>(blob);
  // The section header table is conventionally the last thing in the file:
  // its offset plus its total size gives the file size.
  uint64_t e_shoff = extract64(ublob + 0x28);
  uint16_t e_shentsize = extract16(ublob + 0x3A);
  uint16_t e_shnum = extract16(ublob + 0x3C);

  return e_shoff + (e_shentsize * e_shnum);
#endif
}
|
||||
|
||||
#ifdef _WIN32
|
||||
class path_lib_record {
|
||||
public:
|
||||
void operator=(const path_lib_record &) = delete;
|
||||
~path_lib_record() {
|
||||
for (auto entry : lib_to_path) {
|
||||
FreeLibrary(static_cast<HMODULE>(entry.first));
|
||||
fs::permissions(entry.second, fs::perms::owner_all);
|
||||
fs::remove_all(entry.second.remove_filename());
|
||||
}
|
||||
}
|
||||
static void record_lib_path(fs::path path, void *library) {
|
||||
lib_to_path[library] = path;
|
||||
}
|
||||
static void remove_lib(void *library) {
|
||||
auto path = lib_to_path[library];
|
||||
std::error_code ec;
|
||||
|
||||
FreeLibrary(static_cast<HMODULE>(library));
|
||||
fs::permissions(path, fs::perms::owner_all);
|
||||
if (fs::remove_all(path.remove_filename(), ec) != 2 || ec)
|
||||
// one directory and one temporary file should have been deleted
|
||||
throw std::runtime_error("Directory delete failed");
|
||||
|
||||
lib_to_path.erase(library);
|
||||
}
|
||||
|
||||
private:
|
||||
static inline std::unordered_map<void *, fs::path> lib_to_path;
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Opaque handle to a loaded kernel library (HMODULE on Windows, dlopen
// handle elsewhere). Implicitly convertible to void * for use with the
// platform loader APIs.
class kernel_library {
public:
  kernel_library() : ptr{nullptr} {}
  kernel_library(void *ptr) : ptr{ptr} {}

  operator void *() const { return ptr; }

private:
  void *ptr;
#ifdef _WIN32
  // Sole static path_lib_record instance; its destructor performs the
  // end-of-program cleanup of still-loaded DLLs (see path_lib_record).
  static inline detail::path_lib_record single_instance_to_trigger_destructor;
#endif
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// Write \p data (a shared-library image of \p size bytes) to a private
/// temporary file and load it with the platform loader.
/// \throws std::runtime_error if loading or temp-file cleanup fails.
static inline kernel_library load_dl_from_data(char const *const data,
                                               size_t size) {
  fs::path filename = write_data_to_file(data, size);
#ifdef _WIN32
  void *so = LoadLibraryW(filename.wstring().c_str());
#else
  void *so = dlopen(filename.c_str(), RTLD_LAZY);
#endif
  if (so == nullptr)
    throw std::runtime_error("Failed to load kernel library");

#ifdef _WIN32
  // Defer deletion: Windows cannot delete a mapped DLL, so record the path
  // for cleanup at unload/exit.
  detail::path_lib_record::record_lib_path(filename, so);
#else
  std::error_code ec;

  // Windows DLL cannot be deleted while in use
  // (POSIX allows unlinking a mapped file, so delete the temp dir now.)
  if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec)
    // one directory and one temporary file should have been deleted
    throw std::runtime_error("Directory delete failed");
#endif

  return so;
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Load kernel library and return a handle to use the library.
|
||||
/// \param [in] name The name of the library.
|
||||
static inline kernel_library load_kernel_library(const std::string &name) {
|
||||
std::ifstream ifs;
|
||||
ifs.open(name, std::ios::in | std::ios::binary);
|
||||
|
||||
std::stringstream buffer;
|
||||
buffer << ifs.rdbuf();
|
||||
|
||||
const std::string buffer_string = buffer.str();
|
||||
return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size());
|
||||
}
|
||||
|
||||
/// Load kernel library whose image is alreay in memory and return a handle to
|
||||
/// use the library.
|
||||
/// \param [in] image A pointer to the image in memory.
|
||||
static inline kernel_library load_kernel_library_mem(char const *const image) {
|
||||
const size_t size = detail::get_lib_size(image);
|
||||
|
||||
return detail::load_dl_from_data(image, size);
|
||||
}
|
||||
|
||||
/// Unload kernel library.
/// On Windows this also deletes the library's temporary file (deferred at
/// load time); on POSIX the temp file was already removed, so dlclose is
/// sufficient.
/// \param [in,out] library Handle to the library to be closed.
static inline void unload_kernel_library(const kernel_library &library) {
#ifdef _WIN32
  detail::path_lib_record::remove_lib(library);
#else
  dlclose(library);
#endif
}
|
||||
|
||||
// Callable handle to a kernel wrapper function resolved from a kernel
// library; invoking it launches the kernel (see kernel_functor).
class kernel_function {
public:
  kernel_function() : ptr{nullptr} {}
  kernel_function(dpct::kernel_functor ptr) : ptr{ptr} {}

  operator void *() const { return ((void *)ptr); }

  // Forward the launch to the wrapped function pointer.
  void operator()(sycl::queue &q, const sycl::nd_range<3> &range,
                  unsigned int a, void **args, void **extra) {
    ptr(q, range, a, args, extra);
  }

private:
  dpct::kernel_functor ptr;
};
|
||||
|
||||
/// Find kernel function in a kernel library and return its address.
/// The exported symbol looked up is \p name with a "_wrapper" suffix.
/// \param [in] library Handle to the kernel library.
/// \param [in] name Name of the kernel function.
/// \throws std::runtime_error if the symbol is not found.
static inline dpct::kernel_function
get_kernel_function(kernel_library &library, const std::string &name) {
#ifdef _WIN32
  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
      GetProcAddress(static_cast<HMODULE>(static_cast<void *>(library)),
                     (name + std::string("_wrapper")).c_str()));
#else
  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
      dlsym(library, (name + std::string("_wrapper")).c_str()));
#endif
  if (fn == nullptr)
    throw std::runtime_error("Failed to get function");
  return fn;
}
|
||||
|
||||
/// Invoke a kernel function.
|
||||
/// \param [in] function kernel function.
|
||||
/// \param [in] queue SYCL queue used to execute kernel
|
||||
/// \param [in] groupRange SYCL group range
|
||||
/// \param [in] localRange SYCL local range
|
||||
/// \param [in] localMemSize The size of local memory required by the kernel
|
||||
/// function.
|
||||
/// \param [in] kernelParams Array of pointers to kernel arguments.
|
||||
/// \param [in] extra Extra arguments.
|
||||
static inline void invoke_kernel_function(dpct::kernel_function &function,
|
||||
sycl::queue &queue,
|
||||
sycl::range<3> groupRange,
|
||||
sycl::range<3> localRange,
|
||||
unsigned int localMemSize,
|
||||
void **kernelParams, void **extra) {
|
||||
function(queue, sycl::nd_range<3>(groupRange * localRange, localRange),
|
||||
localMemSize, kernelParams, extra);
|
||||
}
|
||||
|
||||
/// Find image wrapper in a kernel library and return its address.
|
||||
/// \param [in] library Handle to the kernel library.
|
||||
/// \param [in] name Name of the target image wrapper.
|
||||
static inline dpct::image_wrapper_base_p
|
||||
get_image_wrapper(dpct::kernel_library &library, const std::string &name) {
|
||||
#ifdef _WIN32
|
||||
dpct::image_wrapper_base_p fn =
|
||||
reinterpret_cast<dpct::image_wrapper_base_p>(GetProcAddress(
|
||||
static_cast<HMODULE>(static_cast<void *>(library)), name.c_str()));
|
||||
#else
|
||||
dpct::image_wrapper_base_p fn = reinterpret_cast<dpct::image_wrapper_base_p>(
|
||||
dlsym(library, name.c_str()));
|
||||
#endif
|
||||
if (fn == nullptr)
|
||||
throw std::runtime_error("Failed to get image");
|
||||
return fn;
|
||||
}
|
||||
|
||||
} // namespace dpct
|
||||
#endif // __DPCT_KERNEL_HPP__
|
File diff suppressed because it is too large
Load diff
|
@ -1,174 +0,0 @@
|
|||
//==---- lib_common_utils.hpp ---------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_LIB_COMMON_UTILS_HPP__
|
||||
#define __DPCT_LIB_COMMON_UTILS_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
#include "memory.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
namespace dpct {
|
||||
namespace detail {
|
||||
template <typename T> inline auto get_memory(const void *x) {
|
||||
T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
|
||||
#ifdef DPCT_USM_LEVEL_NONE
|
||||
return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
|
||||
#else
|
||||
return new_x;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q) {
|
||||
using Ty = typename DataType<T>::T2;
|
||||
Ty s_h;
|
||||
if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
|
||||
detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
|
||||
.wait();
|
||||
else
|
||||
s_h = *reinterpret_cast<const Ty *>(s);
|
||||
return s_h;
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
enum class version_field : int { major, minor, update, patch };
|
||||
|
||||
/// Returns the requested field of Intel(R) oneAPI Math Kernel Library version.
|
||||
/// \param field The version information field (major, minor, update or patch).
|
||||
/// \param result The result value.
|
||||
inline void mkl_get_version(version_field field, int *result) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
|
||||
"Project does not support this API.");
|
||||
#else
|
||||
MKLVersion version;
|
||||
mkl_get_version(&version);
|
||||
if (version_field::major == field) {
|
||||
*result = version.MajorVersion;
|
||||
} else if (version_field::minor == field) {
|
||||
*result = version.MinorVersion;
|
||||
} else if (version_field::update == field) {
|
||||
*result = version.UpdateVersion;
|
||||
} else if (version_field::patch == field) {
|
||||
*result = 0;
|
||||
} else {
|
||||
throw std::runtime_error("unknown field");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
enum class library_data_t : unsigned char {
|
||||
real_float = 0,
|
||||
complex_float,
|
||||
real_double,
|
||||
complex_double,
|
||||
real_half,
|
||||
complex_half,
|
||||
real_bfloat16,
|
||||
complex_bfloat16,
|
||||
real_int4,
|
||||
complex_int4,
|
||||
real_uint4,
|
||||
complex_uint4,
|
||||
real_int8,
|
||||
complex_int8,
|
||||
real_uint8,
|
||||
complex_uint8,
|
||||
real_int16,
|
||||
complex_int16,
|
||||
real_uint16,
|
||||
complex_uint16,
|
||||
real_int32,
|
||||
complex_int32,
|
||||
real_uint32,
|
||||
complex_uint32,
|
||||
real_int64,
|
||||
complex_int64,
|
||||
real_uint64,
|
||||
complex_uint64,
|
||||
real_int8_4,
|
||||
real_int8_32,
|
||||
real_uint8_4,
|
||||
library_data_t_size
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
template <typename ArgT>
|
||||
inline constexpr std::uint64_t get_type_combination_id(ArgT Val) {
|
||||
static_assert((unsigned char)library_data_t::library_data_t_size <=
|
||||
std::numeric_limits<unsigned char>::max() &&
|
||||
"library_data_t size exceeds limit.");
|
||||
static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
|
||||
return (std::uint64_t)Val;
|
||||
}
|
||||
|
||||
template <typename FirstT, typename... RestT>
|
||||
inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
|
||||
RestT... RestVal) {
|
||||
static_assert((std::uint8_t)library_data_t::library_data_t_size <=
|
||||
std::numeric_limits<unsigned char>::max() &&
|
||||
"library_data_t size exceeds limit.");
|
||||
static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
|
||||
static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
|
||||
return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
|
||||
}
|
||||
|
||||
inline constexpr std::size_t library_data_size[] = {
|
||||
8 * sizeof(float), // real_float
|
||||
8 * sizeof(std::complex<float>), // complex_float
|
||||
8 * sizeof(double), // real_double
|
||||
8 * sizeof(std::complex<double>), // complex_double
|
||||
8 * sizeof(sycl::half), // real_half
|
||||
8 * sizeof(std::complex<sycl::half>), // complex_half
|
||||
16, // real_bfloat16
|
||||
16 * 2, // complex_bfloat16
|
||||
4, // real_int4
|
||||
4 * 2, // complex_int4
|
||||
4, // real_uint4
|
||||
4 * 2, // complex_uint4
|
||||
8, // real_int8
|
||||
8 * 2, // complex_int8
|
||||
8, // real_uint8
|
||||
8 * 2, // complex_uint8
|
||||
16, // real_int16
|
||||
16 * 2, // complex_int16
|
||||
16, // real_uint16
|
||||
16 * 2, // complex_uint16
|
||||
32, // real_int32
|
||||
32 * 2, // complex_int32
|
||||
32, // real_uint32
|
||||
32 * 2, // complex_uint32
|
||||
64, // real_int64
|
||||
64 * 2, // complex_int64
|
||||
64, // real_uint64
|
||||
64 * 2, // complex_uint64
|
||||
8, // real_int8_4
|
||||
8, // real_int8_32
|
||||
8 // real_uint8_4
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
#ifdef DPCT_USM_LEVEL_NONE
|
||||
/// Cast a "rvalue reference to a temporary object" to an "lvalue reference to
|
||||
/// that temporary object".
|
||||
/// CAUTION:
|
||||
/// The returned lvalue reference is available only before the last step in
|
||||
/// evaluating the full-expression that contains this function call.
|
||||
/// \param [in] temporary_object The rvalue reference to a temporary object.
|
||||
/// \returns The lvalue reference to that temporary object.
|
||||
template <typename T>
|
||||
inline typename std::enable_if_t<std::is_rvalue_reference_v<T &&>, T &>
|
||||
rvalue_ref_to_lvalue_ref(T &&temporary_object) {
|
||||
return temporary_object;
|
||||
}
|
||||
#endif
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_LIB_COMMON_UTILS_HPP__
|
1814
dpct/math.hpp
1814
dpct/math.hpp
File diff suppressed because it is too large
Load diff
1497
dpct/memory.hpp
1497
dpct/memory.hpp
File diff suppressed because it is too large
Load diff
|
@ -1,535 +0,0 @@
|
|||
//==---- rng_utils.hpp ----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_RNG_UTILS_HPP__
|
||||
#define __DPCT_RNG_UTILS_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
|
||||
#include <oneapi/mkl/rng/device.hpp>
|
||||
#endif
|
||||
#include "device.hpp"
|
||||
#include "lib_common_utils.hpp"
|
||||
|
||||
namespace dpct {
|
||||
namespace rng {
|
||||
#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
|
||||
namespace device {
|
||||
/// The random number generator on device.
|
||||
/// \tparam engine_t The device random number generator engine. It can only be
|
||||
/// oneapi::mkl::rng::device::mrg32k3a<1> or
|
||||
/// oneapi::mkl::rng::device::mrg32k3a<4> or
|
||||
/// oneapi::mkl::rng::device::philox4x32x10<1> or
|
||||
/// oneapi::mkl::rng::device::philox4x32x10<4>.
|
||||
template <typename engine_t> class rng_generator {
|
||||
static_assert(
|
||||
std::disjunction_v<
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<4>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<4>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>,
|
||||
"engine_t can only be oneapi::mkl::rng::device::mrg32k3a<1> or "
|
||||
"oneapi::mkl::rng::device::mrg32k3a<4> or "
|
||||
"oneapi::mkl::rng::device::philox4x32x10<1> or "
|
||||
"oneapi::mkl::rng::device::philox4x32x10<4> or "
|
||||
"oneapi::mkl::rng::device::mcg59<1>.");
|
||||
static constexpr bool _is_engine_vec_size_one = std::disjunction_v<
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>;
|
||||
static constexpr std::uint64_t default_seed = 0;
|
||||
oneapi::mkl::rng::device::bits<std::uint32_t> _distr_bits;
|
||||
oneapi::mkl::rng::device::uniform_bits<std::uint32_t> _distr_uniform_bits;
|
||||
oneapi::mkl::rng::device::gaussian<float> _distr_gaussian_float;
|
||||
oneapi::mkl::rng::device::gaussian<double> _distr_gaussian_double;
|
||||
oneapi::mkl::rng::device::lognormal<float> _distr_lognormal_float;
|
||||
oneapi::mkl::rng::device::lognormal<double> _distr_lognormal_double;
|
||||
oneapi::mkl::rng::device::poisson<std::uint32_t> _distr_poisson;
|
||||
oneapi::mkl::rng::device::uniform<float> _distr_uniform_float;
|
||||
oneapi::mkl::rng::device::uniform<double> _distr_uniform_double;
|
||||
engine_t _engine;
|
||||
|
||||
public:
|
||||
/// Default constructor of rng_generator
|
||||
rng_generator() { _engine = engine_t(default_seed); }
|
||||
/// Constructor of rng_generator if engine type is not mcg59
|
||||
/// \param [in] seed The seed to initialize the engine state.
|
||||
/// \param [in] num_to_skip Set the number of elements need to be skipped.
|
||||
/// The number is calculated as: num_to_skip[0] + num_to_skip[1] * 2^64 +
|
||||
/// num_to_skip[2] * 2^128 + ... + num_to_skip[n-1] * 2^(64*(n-1))
|
||||
template <typename T = engine_t,
|
||||
typename std::enable_if<!std::is_same_v<
|
||||
T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
|
||||
rng_generator(std::uint64_t seed,
|
||||
std::initializer_list<std::uint64_t> num_to_skip) {
|
||||
_engine = engine_t(seed, num_to_skip);
|
||||
}
|
||||
/// Constructor of rng_generator if engine type is mcg59
|
||||
/// \param [in] seed The seed to initialize the engine state.
|
||||
/// \param [in] num_to_skip Set the number of elements need to be skipped.
|
||||
template <typename T = engine_t,
|
||||
typename std::enable_if<std::is_same_v<
|
||||
T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
|
||||
rng_generator(std::uint64_t seed, std::uint64_t num_to_skip) {
|
||||
_engine = engine_t(seed, num_to_skip);
|
||||
}
|
||||
|
||||
/// Generate random number(s) obeys distribution \tparam distr_t.
|
||||
/// \tparam T The distribution of the random number. It can only be
|
||||
/// oneapi::mkl::rng::device::bits<std::uint32_t>,
|
||||
/// oneapi::mkl::rng::device::uniform_bits<std::uint32_t>,
|
||||
/// oneapi::mkl::rng::device::gaussian<float>,
|
||||
/// oneapi::mkl::rng::device::gaussian<double>,
|
||||
/// oneapi::mkl::rng::device::lognormal<float>,
|
||||
/// oneapi::mkl::rng::device::lognormal<double>,
|
||||
/// oneapi::mkl::rng::device::poisson<std::uint32_t>,
|
||||
/// oneapi::mkl::rng::device::uniform<float> or
|
||||
/// oneapi::mkl::rng::device::uniform<double>
|
||||
/// \tparam vec_size The length of the return vector. It can only be 1, 2
|
||||
/// or 4.
|
||||
/// \param distr_params The parameter(s) for lognormal or poisson
|
||||
/// distribution.
|
||||
/// \return The vector of the random number(s).
|
||||
template <typename distr_t, int vec_size, class... distr_params_t>
|
||||
auto generate(distr_params_t... distr_params) {
|
||||
static_assert(vec_size == 1 || vec_size == 2 || vec_size == 4,
|
||||
"vec_size is not supported.");
|
||||
static_assert(
|
||||
std::disjunction_v<
|
||||
std::is_same<distr_t,
|
||||
oneapi::mkl::rng::device::bits<std::uint32_t>>,
|
||||
std::is_same<distr_t,
|
||||
oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<float>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<double>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<float>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<double>>,
|
||||
std::is_same<distr_t,
|
||||
oneapi::mkl::rng::device::poisson<std::uint32_t>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::uniform<float>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::uniform<double>>>,
|
||||
"distribution is not supported.");
|
||||
|
||||
if constexpr (std::is_same_v<
|
||||
distr_t, oneapi::mkl::rng::device::bits<std::uint32_t>>) {
|
||||
return generate_vec<vec_size>(_distr_bits);
|
||||
}
|
||||
if constexpr (std::is_same_v<
|
||||
distr_t,
|
||||
oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>) {
|
||||
return generate_vec<vec_size>(_distr_uniform_bits);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::gaussian<float>>) {
|
||||
return generate_vec<vec_size>(_distr_gaussian_float);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::gaussian<double>>) {
|
||||
return generate_vec<vec_size>(_distr_gaussian_double);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::lognormal<float>>) {
|
||||
return generate_vec<vec_size>(_distr_lognormal_float, distr_params...,
|
||||
0.0f, 1.0f);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::lognormal<double>>) {
|
||||
return generate_vec<vec_size>(_distr_lognormal_double, distr_params...,
|
||||
0.0, 1.0);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t, oneapi::mkl::rng::device::poisson<
|
||||
std::uint32_t>>) {
|
||||
return generate_vec<vec_size>(_distr_poisson, distr_params...);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::uniform<float>>) {
|
||||
return generate_vec<vec_size>(_distr_uniform_float);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::uniform<double>>) {
|
||||
return generate_vec<vec_size>(_distr_uniform_double);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the random number generator engine.
|
||||
/// \return The reference of the internal random number generator engine.
|
||||
engine_t &get_engine() { return _engine; }
|
||||
|
||||
private:
|
||||
template <int vec_size, typename distr_t, class... distr_params_t>
|
||||
auto generate_vec(distr_t &distr, distr_params_t... distr_params) {
|
||||
if constexpr (sizeof...(distr_params_t)) {
|
||||
typename distr_t::param_type pt(distr_params...);
|
||||
distr.param(pt);
|
||||
}
|
||||
if constexpr (vec_size == 4) {
|
||||
if constexpr (_is_engine_vec_size_one) {
|
||||
sycl::vec<typename distr_t::result_type, 4> res;
|
||||
res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.z() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.w() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
return res;
|
||||
} else {
|
||||
return oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
}
|
||||
} else if constexpr (vec_size == 1) {
|
||||
if constexpr (_is_engine_vec_size_one) {
|
||||
return oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
} else {
|
||||
return oneapi::mkl::rng::device::generate_single(distr, _engine);
|
||||
}
|
||||
} else if constexpr (vec_size == 2) {
|
||||
if constexpr (_is_engine_vec_size_one) {
|
||||
sycl::vec<typename distr_t::result_type, 2> res;
|
||||
res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
return res;
|
||||
} else {
|
||||
sycl::vec<typename distr_t::result_type, 2> res;
|
||||
res.x() = oneapi::mkl::rng::device::generate_single(distr, _engine);
|
||||
res.y() = oneapi::mkl::rng::device::generate_single(distr, _engine);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
#endif
|
||||
|
||||
namespace host {
|
||||
namespace detail {
|
||||
class rng_generator_base {
|
||||
public:
|
||||
/// Set the seed of host rng_generator.
|
||||
/// \param seed The engine seed.
|
||||
virtual void set_seed(const std::uint64_t seed) = 0;
|
||||
|
||||
/// Set the dimensions of host rng_generator.
|
||||
/// \param dimensions The engine dimensions.
|
||||
virtual void set_dimensions(const std::uint32_t dimensions) = 0;
|
||||
|
||||
/// Set the queue of host rng_generator.
|
||||
/// \param queue The engine queue.
|
||||
virtual void set_queue(sycl::queue *queue) = 0;
|
||||
|
||||
/// Generate unsigned int random number(s) with 'uniform_bits' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform_bits(unsigned int *output,
|
||||
std::int64_t n) = 0;
|
||||
|
||||
/// Generate unsigned long long random number(s) with 'uniform_bits'
|
||||
/// distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform_bits(unsigned long long *output,
|
||||
std::int64_t n) = 0;
|
||||
|
||||
/// Generate float random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
virtual inline void generate_lognormal(float *output, std::int64_t n, float m,
|
||||
float s) = 0;
|
||||
|
||||
/// Generate double random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
virtual inline void generate_lognormal(double *output, std::int64_t n,
|
||||
double m, double s) = 0;
|
||||
|
||||
/// Generate float random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
virtual inline void generate_gaussian(float *output, std::int64_t n,
|
||||
float mean, float stddev) = 0;
|
||||
|
||||
/// Generate double random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
virtual inline void generate_gaussian(double *output, std::int64_t n,
|
||||
double mean, double stddev) = 0;
|
||||
|
||||
/// Generate unsigned int random number(s) with 'poisson' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param lambda Lambda for the Poisson distribution.
|
||||
virtual inline void generate_poisson(unsigned int *output, std::int64_t n,
|
||||
double lambda) = 0;
|
||||
|
||||
/// Generate float random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform(float *output, std::int64_t n) = 0;
|
||||
|
||||
/// Generate double random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform(double *output, std::int64_t n) = 0;
|
||||
|
||||
/// Skip ahead several random number(s).
|
||||
/// \param num_to_skip The number of random numbers to be skipped.
|
||||
virtual void skip_ahead(const std::uint64_t num_to_skip) = 0;
|
||||
|
||||
/// Set the direction numbers of host rng_generator. Only Sobol engine
|
||||
/// supports this method.
|
||||
/// \param direction_numbers The engine direction numbers.
|
||||
virtual void set_direction_numbers(
|
||||
const std::vector<std::uint32_t> &direction_numbers) = 0;
|
||||
|
||||
protected:
|
||||
sycl::queue *_queue{&dpct::get_default_queue()};
|
||||
std::uint64_t _seed{0};
|
||||
std::uint32_t _dimensions{1};
|
||||
std::vector<std::uint32_t> _direction_numbers;
|
||||
};
|
||||
|
||||
/// The random number generator on host.
|
||||
template <typename engine_t = oneapi::mkl::rng::philox4x32x10>
|
||||
class rng_generator : public rng_generator_base {
|
||||
public:
|
||||
/// Constructor of rng_generator.
|
||||
rng_generator() : _engine(create_engine(_queue, _seed, _dimensions)) {}
|
||||
|
||||
/// Set the seed of host rng_generator.
|
||||
/// \param seed The engine seed.
|
||||
void set_seed(const std::uint64_t seed) {
|
||||
if (seed == _seed) {
|
||||
return;
|
||||
}
|
||||
_seed = seed;
|
||||
_engine = create_engine(_queue, _seed, _dimensions);
|
||||
}
|
||||
|
||||
/// Set the dimensions of host rng_generator.
|
||||
/// \param dimensions The engine dimensions.
|
||||
void set_dimensions(const std::uint32_t dimensions) {
|
||||
if (dimensions == _dimensions) {
|
||||
return;
|
||||
}
|
||||
_dimensions = dimensions;
|
||||
_engine = create_engine(_queue, _seed, _dimensions);
|
||||
}
|
||||
|
||||
/// Set the queue of host rng_generator.
|
||||
/// \param queue The engine queue.
|
||||
void set_queue(sycl::queue *queue) {
|
||||
if (queue == _queue) {
|
||||
return;
|
||||
}
|
||||
_queue = queue;
|
||||
_engine = create_engine(_queue, _seed, _dimensions);
|
||||
}
|
||||
|
||||
/// Set the direction numbers of Sobol host rng_generator.
|
||||
/// \param direction_numbers The user-defined direction numbers.
|
||||
void
|
||||
set_direction_numbers(const std::vector<std::uint32_t> &direction_numbers) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::sobol>) {
|
||||
if (direction_numbers == _direction_numbers) {
|
||||
return;
|
||||
}
|
||||
_direction_numbers = direction_numbers;
|
||||
_engine = oneapi::mkl::rng::sobol(*_queue, _direction_numbers);
|
||||
} else {
|
||||
throw std::runtime_error("Only Sobol engine supports this method.");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Generate unsigned int random number(s) with 'uniform_bits' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform_bits(unsigned int *output, std::int64_t n) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
static_assert(sizeof(unsigned int) == sizeof(std::uint32_t));
|
||||
generate<oneapi::mkl::rng::uniform_bits<std::uint32_t>>(
|
||||
(std::uint32_t *)output, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Generate unsigned long long random number(s) with 'uniform_bits'
|
||||
/// distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform_bits(unsigned long long *output,
|
||||
std::int64_t n) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
static_assert(sizeof(unsigned long long) == sizeof(std::uint64_t));
|
||||
generate<oneapi::mkl::rng::uniform_bits<std::uint64_t>>(
|
||||
(std::uint64_t *)output, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Generate float random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
inline void generate_lognormal(float *output, std::int64_t n, float m,
|
||||
float s) {
|
||||
generate<oneapi::mkl::rng::lognormal<float>>(output, n, m, s);
|
||||
}
|
||||
|
||||
/// Generate double random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
inline void generate_lognormal(double *output, std::int64_t n, double m,
|
||||
double s) {
|
||||
generate<oneapi::mkl::rng::lognormal<double>>(output, n, m, s);
|
||||
}
|
||||
|
||||
/// Generate float random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
inline void generate_gaussian(float *output, std::int64_t n, float mean,
|
||||
float stddev) {
|
||||
generate<oneapi::mkl::rng::gaussian<float>>(output, n, mean, stddev);
|
||||
}
|
||||
|
||||
/// Generate double random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
inline void generate_gaussian(double *output, std::int64_t n, double mean,
|
||||
double stddev) {
|
||||
generate<oneapi::mkl::rng::gaussian<double>>(output, n, mean, stddev);
|
||||
}
|
||||
|
||||
/// Generate unsigned int random number(s) with 'poisson' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param lambda Lambda for the Poisson distribution.
|
||||
inline void generate_poisson(unsigned int *output, std::int64_t n,
|
||||
double lambda) {
|
||||
generate<oneapi::mkl::rng::poisson<unsigned int>>(output, n, lambda);
|
||||
}
|
||||
|
||||
/// Generate float random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform(float *output, std::int64_t n) {
|
||||
generate<oneapi::mkl::rng::uniform<float>>(output, n);
|
||||
}
|
||||
|
||||
/// Generate double random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform(double *output, std::int64_t n) {
|
||||
generate<oneapi::mkl::rng::uniform<double>>(output, n);
|
||||
}
|
||||
|
||||
/// Skip ahead several random number(s).
|
||||
/// \param num_to_skip The number of random numbers to be skipped.
|
||||
void skip_ahead(const std::uint64_t num_to_skip) {
|
||||
#ifndef __INTEL_MKL__
|
||||
oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
|
||||
#else
|
||||
if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::mt2203>)
|
||||
throw std::runtime_error("no skip_ahead method of mt2203 engine.");
|
||||
else
|
||||
oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
static inline engine_t create_engine(sycl::queue *queue,
|
||||
const std::uint64_t seed,
|
||||
const std::uint32_t dimensions) {
|
||||
#ifdef __INTEL_MKL__
|
||||
return std::is_same_v<engine_t, oneapi::mkl::rng::sobol>
|
||||
? engine_t(*queue, dimensions)
|
||||
: engine_t(*queue, seed);
|
||||
#else
|
||||
return engine_t(*queue, seed);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename distr_t, typename buffer_t, class... distr_params_t>
|
||||
void generate(buffer_t *output, const std::int64_t n,
|
||||
const distr_params_t... distr_params) {
|
||||
auto output_buf = dpct::detail::get_memory<buffer_t>(output);
|
||||
oneapi::mkl::rng::generate(distr_t(distr_params...), _engine, n,
|
||||
output_buf);
|
||||
}
|
||||
engine_t _engine{};
|
||||
};
|
||||
} // namespace detail
|
||||
} // namespace host
|
||||
|
||||
enum class random_engine_type {
|
||||
philox4x32x10,
|
||||
mrg32k3a,
|
||||
mt2203,
|
||||
mt19937,
|
||||
sobol,
|
||||
mcg59
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<rng::host::detail::rng_generator_base> host_rng_ptr;
|
||||
|
||||
/// Create a host random number generator.
|
||||
/// \param type The random engine type.
|
||||
/// \return The pointer of random number generator.
|
||||
inline host_rng_ptr create_host_rng(const random_engine_type type) {
|
||||
switch (type) {
|
||||
case random_engine_type::philox4x32x10:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::philox4x32x10>>();
|
||||
case random_engine_type::mrg32k3a:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mrg32k3a>>();
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
case random_engine_type::mt2203:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mt2203>>();
|
||||
case random_engine_type::mt19937:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mt19937>>();
|
||||
case random_engine_type::sobol:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::sobol>>();
|
||||
case random_engine_type::mcg59:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mcg59>>();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
} // namespace rng
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_RNG_UTILS_HPP__
|
File diff suppressed because it is too large
Load diff
1070
dpct/util.hpp
1070
dpct/util.hpp
File diff suppressed because it is too large
Load diff
|
@ -19,9 +19,10 @@
|
|||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
#include <dpct/dpct.hpp>
|
||||
#include <dpct/blas_utils.hpp>
|
||||
#include <dpct/lib_common_utils.hpp>
|
||||
// #include <dpct/dpct.hpp>
|
||||
// #include <dpct/blas_utils.hpp>
|
||||
// #include <dpct/lib_common_utils.hpp>
|
||||
#include "dpct.hpp"
|
||||
#include "ggml-sycl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
|
2
run.sh
2
run.sh
|
@ -14,6 +14,6 @@ echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
|
|||
#export GGML_SYCL_DEBUG=1
|
||||
#export GGML_SYCL_LIST_DEVICE=1
|
||||
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
|
||||
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
|
||||
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue