Summary: consolidate the dpct helper definitions into a single header file, replacing the dpct/ folder.
This commit is contained in:
parent
5b5389941e
commit
a47f5ec42e
27 changed files with 2836 additions and 25652 deletions
842
dpct/atomic.hpp
842
dpct/atomic.hpp
|
@ -1,842 +0,0 @@
|
|||
//==---- atomic.hpp -------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_ATOMIC_HPP__
|
||||
#define __DPCT_ATOMIC_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
namespace dpct {
|
||||
|
||||
/// Atomically add the value operand to the value at the addr and assign the
|
||||
/// result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to add to the value at \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_add(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_add(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_add(operand);
|
||||
}
|
||||
|
||||
/// Atomically add the value operand to the value at the addr and assign the
|
||||
/// result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to add to the value at \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_add(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_add(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically subtract the value operand from the value at the addr and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to subtract from the value at \p addr
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_sub(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_sub(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_sub(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_sub(operand);
|
||||
}
|
||||
|
||||
/// Atomically subtract the value operand from the value at the addr and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to subtract from the value at \p addr
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_sub(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_sub(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_sub<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically perform a bitwise AND between the value operand and the value at the addr
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_and(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_and(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_and(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_and(operand);
|
||||
}
|
||||
|
||||
/// Atomically perform a bitwise AND between the value operand and the value at the addr
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_and(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_and<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_and<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_and<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_and(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_and<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically or the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_or(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_or(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_or(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_or(operand);
|
||||
}
|
||||
|
||||
/// Atomically or the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_or(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_or<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_or<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_or<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_or(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_or<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically xor the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_xor(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_xor(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_xor(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_xor(operand);
|
||||
}
|
||||
|
||||
/// Atomically xor the value at the addr with the value operand, and assign
|
||||
/// the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_xor(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_xor(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_xor<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically calculate the minimum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_min(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_min(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_min(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_min(operand);
|
||||
}
|
||||
|
||||
/// Atomically calculate the minimum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_min(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_min<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_min<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_min<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_min(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_min<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically calculate the maximum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_fetch_max(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_max(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_max(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.fetch_max(operand);
|
||||
}
|
||||
|
||||
/// Atomically calculate the maximum of the value at addr and the value operand
|
||||
/// and assign the result to the value at addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_fetch_max(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_max<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_max<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_max<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_fetch_max(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_fetch_max<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically set \p operand to the value stored in \p addr, if old value stored in
|
||||
/// \p addr is equal to zero or greater than \p operand, else decrease the value stored
|
||||
/// in \p addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The threshold value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The old value stored in \p addr.
|
||||
template <sycl::access::address_space addressSpace = sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline unsigned int atomic_fetch_compare_dec(unsigned int *addr,
|
||||
unsigned int operand) {
|
||||
auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
|
||||
addressSpace>(addr[0]);
|
||||
unsigned int old;
|
||||
|
||||
while (true) {
|
||||
old = atm.load();
|
||||
if (old == 0 || old > operand) {
|
||||
if (atm.compare_exchange_strong(old, operand))
|
||||
break;
|
||||
} else if (atm.compare_exchange_strong(old, old - 1))
|
||||
break;
|
||||
}
|
||||
|
||||
return old;
|
||||
}
|
||||
|
||||
/// Atomically increment the value stored in \p addr if old value stored in \p
|
||||
/// addr is less than \p operand, else set 0 to the value stored in \p addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The threshold value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The old value stored in \p addr.
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline unsigned int atomic_fetch_compare_inc(unsigned int *addr,
|
||||
unsigned int operand) {
|
||||
auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
|
||||
addressSpace>(addr[0]);
|
||||
unsigned int old;
|
||||
while (true) {
|
||||
old = atm.load();
|
||||
if (old >= operand) {
|
||||
if (atm.compare_exchange_strong(old, 0))
|
||||
break;
|
||||
} else if (atm.compare_exchange_strong(old, old + 1))
|
||||
break;
|
||||
}
|
||||
return old;
|
||||
}
|
||||
|
||||
/// Atomically increment the value stored in \p addr if old value stored in \p
|
||||
/// addr is less than \p operand, else set 0 to the value stored in \p addr.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The threshold value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The old value stored in \p addr.
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline unsigned int
|
||||
atomic_fetch_compare_inc(unsigned int *addr, unsigned int operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr,
|
||||
operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr,
|
||||
operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr,
|
||||
operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
/// Atomically exchange the value at the address addr with the value operand.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to be exchanged with the value pointed by \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
inline T atomic_exchange(T *addr, T operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.exchange(operand);
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_exchange(T1 *addr, T2 operand) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
return atm.exchange(operand);
|
||||
}
|
||||
|
||||
/// Atomically exchange the value at the address addr with the value operand.
|
||||
/// \param [in, out] addr The pointer to the data.
|
||||
/// \param operand The value to be exchanged with the value pointed by \p addr.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T, sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space>
|
||||
inline T atomic_exchange(T *addr, T operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
switch (memoryOrder) {
|
||||
case sycl::memory_order::relaxed:
|
||||
return atomic_exchange<T, addressSpace, sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::acq_rel:
|
||||
return atomic_exchange<T, addressSpace, sycl::memory_order::acq_rel,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
case sycl::memory_order::seq_cst:
|
||||
return atomic_exchange<T, addressSpace, sycl::memory_order::seq_cst,
|
||||
sycl::memory_scope::device>(addr, operand);
|
||||
default:
|
||||
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
|
||||
"atomics are: sycl::memory_order::relaxed, "
|
||||
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
|
||||
}
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
typename T1, typename T2>
|
||||
inline T1 atomic_exchange(T1 *addr, T2 operand,
|
||||
sycl::memory_order memoryOrder) {
|
||||
atomic_exchange<T1, addressSpace>(addr, operand, memoryOrder);
|
||||
}
|
||||
|
||||
/// Atomically compare the value at \p addr to the value expected and exchange
|
||||
/// with the value desired if the value at \p addr is equal to the value expected.
|
||||
/// Returns the value at the \p addr before the call.
|
||||
/// \param [in, out] addr Multi_ptr.
|
||||
/// \param expected The value to compare against the value at \p addr.
|
||||
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
|
||||
/// \param success The memory ordering used when comparison succeeds.
|
||||
/// \param fail The memory ordering used when comparison fails.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
T atomic_compare_exchange_strong(
|
||||
sycl::multi_ptr<T, addressSpace> addr, T expected, T desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
auto atm = sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(*addr);
|
||||
|
||||
atm.compare_exchange_strong(expected, desired, success, fail);
|
||||
return expected;
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2, typename T3>
|
||||
T1 atomic_compare_exchange_strong(
|
||||
sycl::multi_ptr<T1, addressSpace> addr, T2 expected, T3 desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(*addr);
|
||||
T1 expected_value = expected;
|
||||
atm.compare_exchange_strong(expected_value, desired, success, fail);
|
||||
return expected_value;
|
||||
}
|
||||
|
||||
/// Atomically compare the value at \p addr to the value expected and exchange
|
||||
/// with the value desired if the value at \p addr is equal to the value expected.
|
||||
/// Returns the value at the \p addr before the call.
|
||||
/// \param [in] addr The pointer to the data.
|
||||
/// \param expected The value to compare against the value at \p addr.
|
||||
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
|
||||
/// \param success The memory ordering used when comparison succeeds.
|
||||
/// \param fail The memory ordering used when comparison fails.
|
||||
/// \returns The value at the \p addr before the call.
|
||||
template <typename T,
|
||||
sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device>
|
||||
T atomic_compare_exchange_strong(
|
||||
T *addr, T expected, T desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
auto atm =
|
||||
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
atm.compare_exchange_strong(expected, desired, success, fail);
|
||||
return expected;
|
||||
}
|
||||
|
||||
template <sycl::access::address_space addressSpace =
|
||||
sycl::access::address_space::global_space,
|
||||
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
|
||||
sycl::memory_scope memoryScope = sycl::memory_scope::device,
|
||||
typename T1, typename T2, typename T3>
|
||||
T1 atomic_compare_exchange_strong(
|
||||
T1 *addr, T2 expected, T3 desired,
|
||||
sycl::memory_order success = sycl::memory_order::relaxed,
|
||||
sycl::memory_order fail = sycl::memory_order::relaxed) {
|
||||
T1 expected_value = expected;
|
||||
auto atm =
|
||||
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
|
||||
atm.compare_exchange_strong(expected_value, desired, success, fail);
|
||||
return expected_value;
|
||||
}
|
||||
|
||||
/// Atomic extension to implement standard APIs in std::atomic
|
||||
namespace detail {
/// Trait reporting whether \p T is usable with dpct::atomic.
/// Valid types are int, unsigned int, long, unsigned long, long long,
/// unsigned long long, float, double and pointer types.
template <typename T> struct IsValidAtomicType {
  static constexpr bool value =
      std::is_same_v<T, int> || std::is_same_v<T, unsigned int> ||
      std::is_same_v<T, long> || std::is_same_v<T, unsigned long> ||
      std::is_same_v<T, long long> ||
      std::is_same_v<T, unsigned long long> ||
      std::is_same_v<T, float> || std::is_same_v<T, double> ||
      std::is_pointer_v<T>;
};
} // namespace detail
|
||||
|
||||
template <typename T,
|
||||
sycl::memory_scope DefaultScope = sycl::memory_scope::system,
|
||||
sycl::memory_order DefaultOrder = sycl::memory_order::seq_cst,
|
||||
sycl::access::address_space Space =
|
||||
sycl::access::address_space::generic_space>
|
||||
class atomic{
|
||||
static_assert(
|
||||
detail::IsValidAtomicType<T>::value,
|
||||
"Invalid atomic type. Valid types are int, unsigned int, long, "
|
||||
"unsigned long, long long, unsigned long long, float, double "
|
||||
"and pointer types");
|
||||
T __d;
|
||||
|
||||
public:
|
||||
/// default memory synchronization order
|
||||
static constexpr sycl::memory_order default_read_order =
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_read_order;
|
||||
static constexpr sycl::memory_order default_write_order =
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_write_order;
|
||||
static constexpr sycl::memory_scope default_scope = DefaultScope;
|
||||
static constexpr sycl::memory_order default_read_modify_write_order =
|
||||
DefaultOrder;
|
||||
|
||||
|
||||
/// Default constructor.
|
||||
constexpr atomic() noexcept = default;
|
||||
/// Constructor with initialize value.
|
||||
constexpr atomic(T d) noexcept : __d(d){};
|
||||
|
||||
/// atomically replaces the value of the referenced object with a non-atomic argument
|
||||
/// \param operand The value to replace the pointed value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
void store(T operand, sycl::memory_order memoryOrder = default_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
atm.store(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically obtains the value of the referenced object
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object
|
||||
T load(sycl::memory_order memoryOrder = default_read_order,
|
||||
sycl::memory_scope memoryScope = default_scope) const noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(
|
||||
const_cast<T &>(__d));
|
||||
return atm.load(memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically replaces the value of the referenced object and obtains the value held previously
|
||||
/// \param operand The value to replace the pointed value.
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object before the call.
|
||||
T exchange(T operand,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.exchange(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically compares the value of the referenced object with non-atomic argument
|
||||
/// and performs atomic exchange if equal or atomic load if not
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param success The memory models for the read-modify-write
|
||||
/// \param failure The memory models for load operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_weak(
|
||||
T &expected, T desired,
|
||||
sycl::memory_order success, sycl::memory_order failure,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_weak(expected, desired, success, failure, memoryScope);
|
||||
}
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param memoryOrder The memory synchronization ordering for operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_weak(T &expected, T desired,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_weak(expected, desired, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically compares the value of the referenced object with non-atomic argument
|
||||
/// and performs atomic exchange if equal or atomic load if not
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param success The memory models for the read-modify-write
|
||||
/// \param failure The memory models for load operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_strong(
|
||||
T &expected, T desired,
|
||||
sycl::memory_order success, sycl::memory_order failure,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_strong(expected, desired, success, failure, memoryScope);
|
||||
}
|
||||
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
|
||||
/// \param desired The value to store in the referenced object if it is as expected
|
||||
/// \param memoryOrder The memory synchronization ordering for operations
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns true if the referenced object was successfully changed, false otherwise.
|
||||
bool compare_exchange_strong(T &expected, T desired,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.compare_exchange_strong(expected, desired, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically adds the argument to the value stored in the atomic object and obtains the value held previously
|
||||
/// \param operand The other argument of arithmetic addition
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object before the call.
|
||||
T fetch_add(T operand,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.fetch_add(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
|
||||
/// atomically subtracts the argument from the value stored in the atomic object and obtains the value held previously
|
||||
/// \param operand The other argument of arithmetic subtraction
|
||||
/// \param memoryOrder The memory ordering used.
|
||||
/// \param memoryScope The memory scope used.
|
||||
/// \returns The value of the referenced object before the call.
|
||||
T fetch_sub(T operand,
|
||||
sycl::memory_order memoryOrder = default_read_modify_write_order,
|
||||
sycl::memory_scope memoryScope = default_scope) noexcept {
|
||||
|
||||
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
|
||||
return atm.fetch_sub(operand, memoryOrder, memoryScope);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace dpct
|
||||
#endif // __DPCT_ATOMIC_HPP__
|
1792
dpct/blas_utils.hpp
1792
dpct/blas_utils.hpp
File diff suppressed because it is too large
Load diff
|
@ -1,286 +0,0 @@
|
|||
//==---- ccl_utils.hpp----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_CCL_UTILS_HPP__
|
||||
#define __DPCT_CCL_UTILS_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/ccl.hpp>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
|
||||
#include "device.hpp"
|
||||
|
||||
namespace dpct {
|
||||
namespace ccl {
|
||||
namespace detail {
|
||||
|
||||
/// Get stored kvs with specified kvs address.
|
||||
inline std::shared_ptr<oneapi::ccl::kvs> &
|
||||
get_kvs(const oneapi::ccl::kvs::address_type &addr) {
|
||||
struct hash {
|
||||
std::size_t operator()(const oneapi::ccl::kvs::address_type &in) const {
|
||||
return std::hash<std::string_view>()(std::string_view(in.data(), in.size()));
|
||||
}
|
||||
};
|
||||
static std::unordered_map<oneapi::ccl::kvs::address_type,
|
||||
std::shared_ptr<oneapi::ccl::kvs>, hash>
|
||||
kvs_map;
|
||||
return kvs_map[addr];
|
||||
}
|
||||
|
||||
/// Help class to init ccl environment.
|
||||
class ccl_init_helper {
|
||||
public:
|
||||
ccl_init_helper() { oneapi::ccl::init(); }
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Get concatenated library version as an integer.
|
||||
static inline int get_version() {
|
||||
oneapi::ccl::init();
|
||||
auto ver = oneapi::ccl::get_library_version();
|
||||
return ver.major * 10000 + ver.minor * 100 + ver.update;
|
||||
}
|
||||
|
||||
/// Create main kvs and return its address.
|
||||
static inline oneapi::ccl::kvs::address_type create_kvs_address() {
|
||||
oneapi::ccl::init();
|
||||
auto ptr = oneapi::ccl::create_main_kvs();
|
||||
auto addr = ptr->get_address();
|
||||
detail::get_kvs(addr) = ptr;
|
||||
return addr;
|
||||
}
|
||||
|
||||
/// Get stored kvs with /p addr if exist. Otherwise, create kvs with /p addr.
|
||||
static inline std::shared_ptr<oneapi::ccl::kvs>
|
||||
create_kvs(const oneapi::ccl::kvs::address_type &addr) {
|
||||
oneapi::ccl::init();
|
||||
auto &ptr = detail::get_kvs(addr);
|
||||
if (!ptr)
|
||||
ptr = oneapi::ccl::create_kvs(addr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// dpct communicator extension
|
||||
class communicator_wrapper : public dpct::ccl::detail::ccl_init_helper {
|
||||
public:
|
||||
communicator_wrapper(
|
||||
int size, int rank, oneapi::ccl::kvs::address_type id,
|
||||
const oneapi::ccl::comm_attr &attr = oneapi::ccl::default_comm_attr)
|
||||
: _device_comm(oneapi::ccl::create_device(
|
||||
static_cast<sycl::device &>(dpct::get_current_device()))),
|
||||
_context_comm(oneapi::ccl::create_context(dpct::get_default_context())),
|
||||
_comm(oneapi::ccl::create_communicator(
|
||||
size, rank, _device_comm, _context_comm, dpct::ccl::create_kvs(id),
|
||||
attr)) {
|
||||
_queue_init = false;
|
||||
_ccl_stream_ptr = nullptr;
|
||||
}
|
||||
|
||||
~communicator_wrapper() {
|
||||
delete _ccl_stream_ptr;
|
||||
};
|
||||
|
||||
/// Return the rank in a oneapi::ccl::communicator
|
||||
/// \returns The rank corresponding to communicator object
|
||||
int rank() const {
|
||||
return _comm.rank();
|
||||
}
|
||||
|
||||
/// Retrieves the number of rank in oneapi::ccl::communicator
|
||||
/// \returns The number of the ranks
|
||||
int size() const {
|
||||
return _comm.size();
|
||||
}
|
||||
|
||||
/// Return underlying native device, which was used in oneapi::ccl::communicator
|
||||
sycl::device get_device() const {
|
||||
return _comm.get_device().get_native();
|
||||
}
|
||||
|
||||
/// \brief allreduce is a collective communication operation that performs the global reduction operation
|
||||
/// on values from all ranks of communicator and distributes the result back to all ranks.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
|
||||
/// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
|
||||
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
|
||||
/// \param rtype the type of the reduction operation to be applied
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void allreduce(const void *sendbuff, void *recvbuff, size_t count,
|
||||
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::allreduce(sendbuff, recvbuff, count, dtype, rtype,
|
||||
_comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief reduce is a collective communication operation that performs the
|
||||
/// global reduction operation on values from all ranks of the communicator
|
||||
/// and returns the result to the root rank.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores
|
||||
/// local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result,
|
||||
/// must have the same dimension as @c sendbuff
|
||||
/// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
|
||||
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
|
||||
/// \param root the rank that gets the result of reduction
|
||||
/// \param rtype the type of the reduction operation to be applied
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void reduce(const void *sendbuff, void *recvbuff, size_t count,
|
||||
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
|
||||
int root, sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::reduce(sendbuff, recvbuff, count, dtype, rtype,
|
||||
root, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief broadcast is a collective communication operation that broadcasts data
|
||||
/// from one rank of communicator (denoted as root) to all other ranks.
|
||||
/// Only support in-place operation
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores
|
||||
/// local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result
|
||||
/// \param count the number of elements of type @c dtype in @c buf
|
||||
/// \param dtype thedatatype of elements in @c buf
|
||||
/// \param root the rank that broadcasts @c buf
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void broadcast(void *sendbuff, void *recvbuff, size_t count,
|
||||
oneapi::ccl::datatype dtype, int root,
|
||||
sycl::queue *queue_ptr) {
|
||||
if (sendbuff != recvbuff) {
|
||||
throw std::runtime_error(
|
||||
"oneCCL broadcast only support in-place operation. "
|
||||
"sendbuff and recvbuff must be same.");
|
||||
return;
|
||||
}
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::broadcast(recvbuff, count, dtype, root, _comm,
|
||||
stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief reduce_scatter is a collective communication operation that performs the global reduction operation
|
||||
/// on values from all ranks of the communicator and scatters the result in blocks back to all ranks.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
|
||||
/// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
|
||||
/// \param recv_count the number of elements of type @c dtype in receive block
|
||||
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
|
||||
/// \param rtype the type of the reduction operation to be applied
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void reduce_scatter(const void *sendbuff, void *recvbuff, size_t recv_count,
|
||||
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::reduce_scatter(sendbuff, recvbuff, recv_count,
|
||||
dtype, rtype, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief send is a pt2pt communication operation that sends data from one rank of communicator.
|
||||
/// \param sendbuff the buffer with @c count elements of @c dtype serves as send buffer for root
|
||||
/// \param count the number of elements of type @c dtype in @c sendbuff
|
||||
/// \param dtype the datatype of elements in @c sendbuff
|
||||
/// \param peer the rank that receives @c sendbuff
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void send(void *sendbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::send(sendbuff, count, dtype, peer, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
/// \brief recv is a pt2pt communication operation that sends data from one rank of communicator.
|
||||
/// \param recvbuff the buffer with @c count elements of @c dtype serves as receive buffer
|
||||
/// \param count the number of elements of type @c dtype in @c recvbuff
|
||||
/// \param dtype the datatype of elements in @c recvbuff
|
||||
/// \param peer the rank that receives @c recvbuff
|
||||
/// \param queue_ptr a sycl::queue ptr associated with the operation
|
||||
/// \return @ref void
|
||||
void recv(void *recvbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
|
||||
sycl::queue *queue_ptr) {
|
||||
call_func_wrapper(
|
||||
[=](const oneapi::ccl::stream &stream) {
|
||||
return oneapi::ccl::recv(recvbuff, count, dtype, peer, _comm, stream);
|
||||
},
|
||||
queue_ptr);
|
||||
}
|
||||
|
||||
private:
|
||||
oneapi::ccl::device _device_comm;
|
||||
oneapi::ccl::context _context_comm;
|
||||
oneapi::ccl::communicator _comm;
|
||||
sycl::queue _queue;
|
||||
bool _queue_init;
|
||||
oneapi::ccl::stream *_ccl_stream_ptr;
|
||||
|
||||
template <class Fn>
|
||||
void call_func_wrapper(Fn func, sycl::queue *qptr) {
|
||||
if (_queue_init && *qptr != _queue) {
|
||||
call_func_async(func, qptr);
|
||||
} else {
|
||||
if(!_queue_init) {
|
||||
_queue = *qptr;
|
||||
_queue_init = true;
|
||||
_ccl_stream_ptr = new oneapi::ccl::stream(oneapi::ccl::create_stream(_queue));
|
||||
}
|
||||
std::invoke(func, *_ccl_stream_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
class call_func_async {
|
||||
sycl::queue *_q_ptr;
|
||||
struct call_async_impl {
|
||||
oneapi::ccl::stream _ccl_stream_impl;
|
||||
oneapi::ccl::event _ccl_event_impl;
|
||||
template <class Fn>
|
||||
explicit call_async_impl(Fn func, sycl::queue *qptr)
|
||||
: _ccl_stream_impl(oneapi::ccl::create_stream(*qptr)),
|
||||
_ccl_event_impl(std::invoke(func, _ccl_stream_impl)) {}
|
||||
};
|
||||
call_async_impl *_imp;
|
||||
|
||||
public:
|
||||
template <class Fn>
|
||||
explicit call_func_async(Fn func, sycl::queue *qptr)
|
||||
: _q_ptr(qptr),
|
||||
_imp(new call_async_impl(func, qptr)) {}
|
||||
~call_func_async() {
|
||||
_q_ptr->submit([&](sycl::handler &cgh)
|
||||
{ cgh.host_task([=]
|
||||
{
|
||||
_imp->_ccl_event_impl.wait();
|
||||
delete _imp; }); });
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
typedef dpct::ccl::communicator_wrapper *comm_ptr;
|
||||
|
||||
} // namespace ccl
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_CCL_UTILS_HPP__
|
781
dpct/device.hpp
781
dpct/device.hpp
|
@ -1,781 +0,0 @@
|
|||
//==---- device.hpp -------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_DEVICE_HPP__
|
||||
#define __DPCT_DEVICE_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
#if defined(__linux__)
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#endif
|
||||
#if defined(_WIN64)
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
namespace dpct {
|
||||
namespace detail {
|
||||
/// Parse the major and minor version of \p dev from its version string.
/// Version string has the following format:
///   a. OpenCL<space><major.minor><space><vendor-specific-information>
///   b. <major.minor>
/// \param [in] dev The device to query.
/// \param [out] major Parsed major version; 0 if no digits were found.
/// \param [out] minor Parsed minor version; 0 if no ".minor" part was found.
static void get_version(const sycl::device &dev, int &major, int &minor) {
  std::string ver = dev.get_info<sycl::info::device::version>();
  std::string::size_type i = 0;
  // Skip to the first digit. Cast to unsigned char: isdigit on a negative
  // char value is undefined behavior.
  while (i < ver.size() && !isdigit(static_cast<unsigned char>(ver[i])))
    ++i;
  if (i == ver.size()) {
    // No digits at all (the original code would read past the end here).
    major = 0;
    minor = 0;
    return;
  }
  major = std::stoi(ver.substr(i));
  // Skip to the character following the '.' separator.
  while (i < ver.size() && ver[i] != '.')
    ++i;
  ++i;
  if (i >= ver.size()) {
    // Malformed string without a ".minor" component.
    minor = 0;
    return;
  }
  minor = std::stoi(ver.substr(i));
}
|
||||
} // namespace detail
|
||||
|
||||
/// SYCL default exception handler
|
||||
inline auto exception_handler = [](sycl::exception_list exceptions) {
|
||||
for (std::exception_ptr const &e : exceptions) {
|
||||
try {
|
||||
std::rethrow_exception(e);
|
||||
} catch (sycl::exception const &e) {
|
||||
std::cerr << "Caught asynchronous SYCL exception:" << std::endl
|
||||
<< e.what() << std::endl
|
||||
<< "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
typedef sycl::event *event_ptr;
|
||||
|
||||
typedef sycl::queue *queue_ptr;
|
||||
|
||||
typedef char *device_ptr;
|
||||
|
||||
/// Destroy \p event pointed memory.
|
||||
///
|
||||
/// \param event Pointer to the sycl::event address.
|
||||
static void destroy_event(event_ptr event) {
|
||||
delete event;
|
||||
}
|
||||
|
||||
class device_info {
|
||||
public:
|
||||
// get interface
|
||||
const char *get_name() const { return _name; }
|
||||
char *get_name() { return _name; }
|
||||
template <typename WorkItemSizesTy = sycl::range<3>,
|
||||
std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
|
||||
std::is_same_v<WorkItemSizesTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_work_item_sizes() const {
|
||||
if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
|
||||
return sycl::range<3>(_max_work_item_sizes_i[0],
|
||||
_max_work_item_sizes_i[1],
|
||||
_max_work_item_sizes_i[2]);
|
||||
else {
|
||||
return _max_work_item_sizes_i;
|
||||
}
|
||||
}
|
||||
template <typename WorkItemSizesTy = sycl::range<3>,
|
||||
std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
|
||||
std::is_same_v<WorkItemSizesTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_work_item_sizes() {
|
||||
if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
|
||||
return sycl::range<3>(_max_work_item_sizes_i[0],
|
||||
_max_work_item_sizes_i[1],
|
||||
_max_work_item_sizes_i[2]);
|
||||
else {
|
||||
return _max_work_item_sizes_i;
|
||||
}
|
||||
}
|
||||
bool get_host_unified_memory() const { return _host_unified_memory; }
|
||||
int get_major_version() const { return _major; }
|
||||
int get_minor_version() const { return _minor; }
|
||||
int get_integrated() const { return _integrated; }
|
||||
int get_max_clock_frequency() const { return _frequency; }
|
||||
int get_max_compute_units() const { return _max_compute_units; }
|
||||
int get_max_work_group_size() const { return _max_work_group_size; }
|
||||
int get_max_sub_group_size() const { return _max_sub_group_size; }
|
||||
int get_max_work_items_per_compute_unit() const {
|
||||
return _max_work_items_per_compute_unit;
|
||||
}
|
||||
int get_max_register_size_per_work_group() const {
|
||||
return _max_register_size_per_work_group;
|
||||
}
|
||||
template <typename NDRangeSizeTy = size_t *,
|
||||
std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
|
||||
std::is_same_v<NDRangeSizeTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_nd_range_size() const {
|
||||
if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
|
||||
return _max_nd_range_size;
|
||||
else
|
||||
return _max_nd_range_size_i;
|
||||
}
|
||||
template <typename NDRangeSizeTy = size_t *,
|
||||
std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
|
||||
std::is_same_v<NDRangeSizeTy, int *>,
|
||||
int> = 0>
|
||||
auto get_max_nd_range_size() {
|
||||
if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
|
||||
return _max_nd_range_size;
|
||||
else
|
||||
return _max_nd_range_size_i;
|
||||
}
|
||||
size_t get_global_mem_size() const { return _global_mem_size; }
|
||||
size_t get_local_mem_size() const { return _local_mem_size; }
|
||||
/// Returns the maximum clock rate of device's global memory in kHz. If
|
||||
/// compiler does not support this API then returns default value 3200000 kHz.
|
||||
unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
|
||||
/// Returns the maximum bus width between device and memory in bits. If
|
||||
/// compiler does not support this API then returns default value 64 bits.
|
||||
unsigned int get_memory_bus_width() const { return _memory_bus_width; }
|
||||
uint32_t get_device_id() const { return _device_id; }
|
||||
std::array<unsigned char, 16> get_uuid() const { return _uuid; }
|
||||
/// Returns global memory cache size in bytes.
|
||||
unsigned int get_global_mem_cache_size() const {
|
||||
return _global_mem_cache_size;
|
||||
}
|
||||
|
||||
// set interface
|
||||
void set_name(const char* name) {
|
||||
size_t length = strlen(name);
|
||||
if (length < 256) {
|
||||
std::memcpy(_name, name, length + 1);
|
||||
} else {
|
||||
std::memcpy(_name, name, 255);
|
||||
_name[255] = '\0';
|
||||
}
|
||||
}
|
||||
void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) {
|
||||
for (int i = 0; i < 3; ++i)
|
||||
_max_work_item_sizes_i[i] = max_work_item_sizes[i];
|
||||
}
|
||||
[[deprecated]] void
|
||||
set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) {
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
_max_work_item_sizes_i[i] = max_work_item_sizes[i];
|
||||
}
|
||||
}
|
||||
void set_host_unified_memory(bool host_unified_memory) {
|
||||
_host_unified_memory = host_unified_memory;
|
||||
}
|
||||
void set_major_version(int major) { _major = major; }
|
||||
void set_minor_version(int minor) { _minor = minor; }
|
||||
void set_integrated(int integrated) { _integrated = integrated; }
|
||||
void set_max_clock_frequency(int frequency) { _frequency = frequency; }
|
||||
void set_max_compute_units(int max_compute_units) {
|
||||
_max_compute_units = max_compute_units;
|
||||
}
|
||||
void set_global_mem_size(size_t global_mem_size) {
|
||||
_global_mem_size = global_mem_size;
|
||||
}
|
||||
void set_local_mem_size(size_t local_mem_size) {
|
||||
_local_mem_size = local_mem_size;
|
||||
}
|
||||
void set_max_work_group_size(int max_work_group_size) {
|
||||
_max_work_group_size = max_work_group_size;
|
||||
}
|
||||
void set_max_sub_group_size(int max_sub_group_size) {
|
||||
_max_sub_group_size = max_sub_group_size;
|
||||
}
|
||||
void
|
||||
set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) {
|
||||
_max_work_items_per_compute_unit = max_work_items_per_compute_unit;
|
||||
}
|
||||
void set_max_nd_range_size(int max_nd_range_size[]) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
_max_nd_range_size[i] = max_nd_range_size[i];
|
||||
_max_nd_range_size_i[i] = max_nd_range_size[i];
|
||||
}
|
||||
}
|
||||
void set_memory_clock_rate(unsigned int memory_clock_rate) {
|
||||
_memory_clock_rate = memory_clock_rate;
|
||||
}
|
||||
void set_memory_bus_width(unsigned int memory_bus_width) {
|
||||
_memory_bus_width = memory_bus_width;
|
||||
}
|
||||
void
|
||||
set_max_register_size_per_work_group(int max_register_size_per_work_group) {
|
||||
_max_register_size_per_work_group = max_register_size_per_work_group;
|
||||
}
|
||||
void set_device_id(uint32_t device_id) {
|
||||
_device_id = device_id;
|
||||
}
|
||||
void set_uuid(std::array<unsigned char, 16> uuid) {
|
||||
_uuid = std::move(uuid);
|
||||
}
|
||||
void set_global_mem_cache_size(unsigned int global_mem_cache_size) {
|
||||
_global_mem_cache_size = global_mem_cache_size;
|
||||
}
|
||||
|
||||
private:
|
||||
char _name[256];
|
||||
int _max_work_item_sizes_i[3];
|
||||
bool _host_unified_memory = false;
|
||||
int _major;
|
||||
int _minor;
|
||||
int _integrated = 0;
|
||||
int _frequency;
|
||||
// Set estimated value 3200000 kHz as default value.
|
||||
unsigned int _memory_clock_rate = 3200000;
|
||||
// Set estimated value 64 bits as default value.
|
||||
unsigned int _memory_bus_width = 64;
|
||||
unsigned int _global_mem_cache_size;
|
||||
int _max_compute_units;
|
||||
int _max_work_group_size;
|
||||
int _max_sub_group_size;
|
||||
int _max_work_items_per_compute_unit;
|
||||
int _max_register_size_per_work_group;
|
||||
size_t _global_mem_size;
|
||||
size_t _local_mem_size;
|
||||
size_t _max_nd_range_size[3];
|
||||
int _max_nd_range_size_i[3];
|
||||
uint32_t _device_id;
|
||||
std::array<unsigned char, 16> _uuid;
|
||||
};
|
||||
|
||||
/// Return the major component of \p dev's version string.
static int get_major_version(const sycl::device &dev) {
  int major = 0, minor = 0;
  detail::get_version(dev, major, minor);
  return major;
}
|
||||
|
||||
/// Return the minor component of \p dev's version string.
static int get_minor_version(const sycl::device &dev) {
  int major = 0, minor = 0;
  detail::get_version(dev, major, minor);
  return minor;
}
|
||||
|
||||
/// Populate \p out with the queryable properties of \p dev.
/// Builds the result in a local device_info and assigns it at the end so a
/// partially-filled object is never observed through \p out.
/// \param [out] out Receives the populated device_info.
/// \param [in] dev The SYCL device to query.
static void get_device_info(device_info &out, const sycl::device &dev) {
  device_info prop;
  prop.set_name(dev.get_info<sycl::info::device::name>().c_str());

  int major, minor;
  detail::get_version(dev, major, minor);
  prop.set_major_version(major);
  prop.set_minor_version(minor);

  prop.set_max_work_item_sizes(
#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
      // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
      // is an enum class element
      dev.get_info<sycl::info::device::max_work_item_sizes>());
#else
      // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
      // an int
      dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
#endif
  prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));

  // SYCL reports the clock in MHz; dpct stores kHz, hence the * 1000.
  prop.set_max_clock_frequency(
      dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);

  prop.set_max_compute_units(
      dev.get_info<sycl::info::device::max_compute_units>());
  prop.set_max_work_group_size(
      dev.get_info<sycl::info::device::max_work_group_size>());
  prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
  prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());

#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
  // Intel device-info extension queries; each one is gated on the matching
  // aspect so unsupported devices keep the estimated defaults.
  if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) {
    unsigned int tmp =
        dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
    if (tmp != 0)
      prop.set_memory_clock_rate(1000 * tmp);
  }
  if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) {
    prop.set_memory_bus_width(
        dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
  }
  if (dev.has(sycl::aspect::ext_intel_device_id)) {
    prop.set_device_id(
        dev.get_info<sycl::ext::intel::info::device::device_id>());
  }
  if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) {
    prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
  }
#elif defined(_MSC_VER) && !defined(__clang__)
#pragma message("get_device_info: querying memory_clock_rate and \
memory_bus_width are not supported by the compiler used. \
Use 3200000 kHz as memory_clock_rate default value. \
Use 64 bits as memory_bus_width default value.")
#else
#warning "get_device_info: querying memory_clock_rate and \
memory_bus_width are not supported by the compiler used. \
Use 3200000 kHz as memory_clock_rate default value. \
Use 64 bits as memory_bus_width default value."
#endif

  // Largest sub-group size offered by the device (at least 1).
  size_t max_sub_group_size = 1;
  std::vector<size_t> sub_group_sizes =
      dev.get_info<sycl::info::device::sub_group_sizes>();

  for (const auto &sub_group_size : sub_group_sizes) {
    if (max_sub_group_size < sub_group_size)
      max_sub_group_size = sub_group_size;
  }

  prop.set_max_sub_group_size(max_sub_group_size);

  prop.set_max_work_items_per_compute_unit(
      dev.get_info<sycl::info::device::max_work_group_size>());
  // No direct SYCL equivalent of a CUDA grid limit; report INT_MAX per axis.
  int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
  prop.set_max_nd_range_size(max_nd_range_size);

  // Estimates max register size per work group, feel free to update the value
  // according to device properties.
  prop.set_max_register_size_per_work_group(65536);

  prop.set_global_mem_cache_size(
      dev.get_info<sycl::info::device::global_mem_cache_size>());
  out = prop;
}
|
||||
|
||||
/// dpct device extension
|
||||
/// dpct device extension
///
/// Wraps a sycl::device with a dedicated sycl::context and a set of owned
/// queues (one default in-order and one default out-of-order queue, plus any
/// user-created ones). All queue bookkeeping is guarded by m_mutex.
class device_ext : public sycl::device {
  typedef std::mutex mutex_type;

public:
  device_ext() : sycl::device(), _ctx(*this) {}
  ~device_ext() {
    std::lock_guard<mutex_type> lock(m_mutex);
    clear_queues();
  }
  device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) {
    std::lock_guard<mutex_type> lock(m_mutex);
    init_queues();
  }

  // Always reports 0; native atomics support is not detected here.
  int is_native_atomic_supported() { return 0; }
  int get_major_version() const {
    return dpct::get_major_version(*this);
  }

  int get_minor_version() const {
    return dpct::get_minor_version(*this);
  }

  int get_max_compute_units() const {
    return get_device_info().get_max_compute_units();
  }

  /// Return the maximum clock frequency of this device in KHz.
  int get_max_clock_frequency() const {
    return get_device_info().get_max_clock_frequency();
  }

  int get_integrated() const { return get_device_info().get_integrated(); }

  int get_max_sub_group_size() const {
    return get_device_info().get_max_sub_group_size();
  }

  int get_max_register_size_per_work_group() const {
    return get_device_info().get_max_register_size_per_work_group();
  }

  int get_max_work_group_size() const {
    return get_device_info().get_max_work_group_size();
  }

  int get_mem_base_addr_align() const {
    return get_info<sycl::info::device::mem_base_addr_align>();
  }

  size_t get_global_mem_size() const {
    return get_device_info().get_global_mem_size();
  }

  /// Get the number of bytes of free and total memory on the SYCL device.
  /// Free memory requires the ext_intel_free_memory aspect; otherwise 0 is
  /// reported and a diagnostic is printed to stderr.
  /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
  /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
  void get_memory_info(size_t &free_memory, size_t &total_memory) {
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
    if (!has(sycl::aspect::ext_intel_free_memory)) {
      std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
      free_memory = 0;
    } else {
      free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
    }
#else
    // Compiler too old for the free-memory query: report 0 and warn at build
    // time as well as at run time.
    std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
    free_memory = 0;
#if defined(_MSC_VER) && !defined(__clang__)
#pragma message("Querying the number of bytes of free memory is not supported")
#else
#warning "Querying the number of bytes of free memory is not supported"
#endif
#endif
    total_memory = get_device_info().get_global_mem_size();
  }

  void get_device_info(device_info &out) const {
    dpct::get_device_info(out, *this);
  }

  // Convenience overload: queries the device anew on every call.
  device_info get_device_info() const {
    device_info prop;
    dpct::get_device_info(prop, *this);
    return prop;
  }

  /// Drop all queues (including user-created ones) and recreate the defaults.
  void reset() {
    std::lock_guard<mutex_type> lock(m_mutex);
    clear_queues();
    init_queues();
  }

  sycl::queue &in_order_queue() { return *_q_in_order; }

  sycl::queue &out_of_order_queue() { return *_q_out_of_order; }

  // The "default" queue depends on the USM configuration.
  sycl::queue &default_queue() {
#ifdef DPCT_USM_LEVEL_NONE
    return out_of_order_queue();
#else
    return in_order_queue();
#endif // DPCT_USM_LEVEL_NONE
  }

  /// Wait on every owned queue. The queue list is snapshotted under the lock,
  /// then waited on unlocked so new queues can still be created meanwhile.
  void queues_wait_and_throw() {
    std::unique_lock<mutex_type> lock(m_mutex);
    std::vector<std::shared_ptr<sycl::queue>> current_queues(
        _queues);
    lock.unlock();
    for (const auto &q : current_queues) {
      q->wait_and_throw();
    }
    // Guard the destruct of current_queues to make sure the ref count is safe.
    lock.lock();
  }

  sycl::queue *create_queue(bool enable_exception_handler = false) {
#ifdef DPCT_USM_LEVEL_NONE
    return create_out_of_order_queue(enable_exception_handler);
#else
    return create_in_order_queue(enable_exception_handler);
#endif // DPCT_USM_LEVEL_NONE
  }

  sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
    std::lock_guard<mutex_type> lock(m_mutex);
    return create_queue_impl(enable_exception_handler,
                             sycl::property::queue::in_order());
  }

  sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
    std::lock_guard<mutex_type> lock(m_mutex);
    return create_queue_impl(enable_exception_handler);
  }

  /// Remove \p queue from the owned list and null the caller's pointer.
  void destroy_queue(sycl::queue *&queue) {
    std::lock_guard<mutex_type> lock(m_mutex);
    _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
                                 [=](const std::shared_ptr<sycl::queue> &q) -> bool {
                                   return q.get() == queue;
                                 }),
                  _queues.end());
    queue = nullptr;
  }
  void set_saved_queue(sycl::queue* q) {
    std::lock_guard<mutex_type> lock(m_mutex);
    _saved_queue = q;
  }
  sycl::queue *get_saved_queue() const {
    std::lock_guard<mutex_type> lock(m_mutex);
    return _saved_queue;
  }
  sycl::context get_context() const { return _ctx; }

private:
  // Caller must hold m_mutex. Resets queue pointers to a known-dead state.
  void clear_queues() {
    _queues.clear();
    _q_in_order = _q_out_of_order = _saved_queue = nullptr;
  }

  // Caller must hold m_mutex. Exception handlers are always enabled on the
  // two default queues.
  void init_queues() {
    _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
    _q_out_of_order = create_queue_impl(true);
    _saved_queue = &default_queue();
  }

  /// Caller should acquire resource \p m_mutex before calling this function.
  template <class... Properties>
  sycl::queue *create_queue_impl(bool enable_exception_handler,
                                 Properties... properties) {
    sycl::async_handler eh = {};
    if (enable_exception_handler) {
      eh = exception_handler;
    }
    _queues.push_back(std::make_shared<sycl::queue>(
        _ctx, *this, eh,
        sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
            sycl::property::queue::enable_profiling(),
#endif
            properties...)));

    return _queues.back().get();
  }

  void get_version(int &major, int &minor) const {
    detail::get_version(*this, major, minor);
  }
  sycl::queue *_q_in_order, *_q_out_of_order; // default queues, owned via _queues
  sycl::queue *_saved_queue;                  // last queue saved by the user
  sycl::context _ctx;                         // context private to this device
  std::vector<std::shared_ptr<sycl::queue>> _queues; // all owned queues
  mutable mutex_type m_mutex;                 // guards all queue state above
};
|
||||
|
||||
/// Return an OS-level id for the calling thread; used as the key of the
/// thread-to-device map in dev_mgr.
static inline unsigned int get_tid() {
#if defined(__linux__)
  return syscall(SYS_gettid);
#elif defined(_WIN64)
  return GetCurrentThreadId();
#else
#error "Only support Windows and Linux."
#endif
}
|
||||
|
||||
/// device manager
|
||||
class dev_mgr {
|
||||
public:
|
||||
device_ext ¤t_device() {
|
||||
unsigned int dev_id=current_device_id();
|
||||
check_id(dev_id);
|
||||
return *_devs[dev_id];
|
||||
}
|
||||
device_ext &cpu_device() const {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
if (_cpu_device == -1) {
|
||||
throw std::runtime_error("no valid cpu device");
|
||||
} else {
|
||||
return *_devs[_cpu_device];
|
||||
}
|
||||
}
|
||||
device_ext &get_device(unsigned int id) const {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
check_id(id);
|
||||
return *_devs[id];
|
||||
}
|
||||
unsigned int current_device_id() const {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
auto it=_thread2dev_map.find(get_tid());
|
||||
if(it != _thread2dev_map.end())
|
||||
return it->second;
|
||||
return DEFAULT_DEVICE_ID;
|
||||
}
|
||||
|
||||
/// Select device with a device ID.
|
||||
/// \param [in] id The id of the device which can
|
||||
/// be obtained through get_device_id(const sycl::device).
|
||||
void select_device(unsigned int id) {
|
||||
std::lock_guard<std::recursive_mutex> lock(m_mutex);
|
||||
check_id(id);
|
||||
_thread2dev_map[get_tid()]=id;
|
||||
}
|
||||
unsigned int device_count() { return _devs.size(); }
|
||||
|
||||
unsigned int get_device_id(const sycl::device &dev) {
|
||||
unsigned int id = 0;
|
||||
for(auto dev_item : _devs) {
|
||||
if (*dev_item == dev) {
|
||||
break;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
return id;
|
||||
}
|
||||
|
||||
template <class DeviceSelector>
|
||||
std::enable_if_t<
|
||||
std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
|
||||
select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
|
||||
sycl::device selected_device = sycl::device(selector);
|
||||
unsigned int selected_device_id = get_device_id(selected_device);
|
||||
select_device(selected_device_id);
|
||||
}
|
||||
|
||||
/// Returns the instance of device manager singleton.
|
||||
static dev_mgr &instance() {
|
||||
static dev_mgr d_m;
|
||||
return d_m;
|
||||
}
|
||||
dev_mgr(const dev_mgr &) = delete;
|
||||
dev_mgr &operator=(const dev_mgr &) = delete;
|
||||
dev_mgr(dev_mgr &&) = delete;
|
||||
dev_mgr &operator=(dev_mgr &&) = delete;
|
||||
|
||||
private:
|
||||
mutable std::recursive_mutex m_mutex;
|
||||
dev_mgr() {
|
||||
sycl::device default_device =
|
||||
sycl::device(sycl::default_selector_v);
|
||||
_devs.push_back(std::make_shared<device_ext>(default_device));
|
||||
|
||||
std::vector<sycl::device> sycl_all_devs =
|
||||
sycl::device::get_devices(sycl::info::device_type::all);
|
||||
// Collect other devices except for the default device.
|
||||
if (default_device.is_cpu())
|
||||
_cpu_device = 0;
|
||||
for (auto &dev : sycl_all_devs) {
|
||||
if (dev == default_device) {
|
||||
continue;
|
||||
}
|
||||
_devs.push_back(std::make_shared<device_ext>(dev));
|
||||
if (_cpu_device == -1 && dev.is_cpu()) {
|
||||
_cpu_device = _devs.size() - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
void check_id(unsigned int id) const {
|
||||
if (id >= _devs.size()) {
|
||||
throw std::runtime_error("invalid device id");
|
||||
}
|
||||
}
|
||||
std::vector<std::shared_ptr<device_ext>> _devs;
|
||||
/// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
|
||||
/// thread id in _thread2dev_map, which means default device should be used
|
||||
/// for the current thread.
|
||||
const unsigned int DEFAULT_DEVICE_ID = 0;
|
||||
/// thread-id to device-id map.
|
||||
std::map<unsigned int, unsigned int> _thread2dev_map;
|
||||
int _cpu_device = -1;
|
||||
};
|
||||
|
||||
/// Util function to get the default queue of current selected device depends on
|
||||
/// the USM config. Return the default out-of-ordered queue when USM-none is
|
||||
/// enabled, otherwise return the default in-ordered queue.
|
||||
static inline sycl::queue &get_default_queue() {
|
||||
return dev_mgr::instance().current_device().default_queue();
|
||||
}
|
||||
|
||||
/// Util function to get the default in-ordered queue of current device in
|
||||
/// dpct device manager.
|
||||
static inline sycl::queue &get_in_order_queue() {
|
||||
return dev_mgr::instance().current_device().in_order_queue();
|
||||
}
|
||||
|
||||
/// Util function to get the default out-of-ordered queue of current device in
|
||||
/// dpct device manager.
|
||||
static inline sycl::queue &get_out_of_order_queue() {
|
||||
return dev_mgr::instance().current_device().out_of_order_queue();
|
||||
}
|
||||
|
||||
/// Util function to get the id of current device in
|
||||
/// dpct device manager.
|
||||
static inline unsigned int get_current_device_id() {
|
||||
return dev_mgr::instance().current_device_id();
|
||||
}
|
||||
|
||||
/// Util function to get the current device.
|
||||
static inline device_ext &get_current_device() {
|
||||
return dev_mgr::instance().current_device();
|
||||
}
|
||||
|
||||
/// Util function to get a device by id.
|
||||
static inline device_ext &get_device(unsigned int id) {
|
||||
return dev_mgr::instance().get_device(id);
|
||||
}
|
||||
|
||||
/// Util function to get the context of the default queue of current
|
||||
/// device in dpct device manager.
|
||||
static inline sycl::context get_default_context() {
|
||||
return dpct::get_current_device().get_context();
|
||||
}
|
||||
|
||||
/// Util function to get a CPU device.
|
||||
static inline device_ext &cpu_device() {
|
||||
return dev_mgr::instance().cpu_device();
|
||||
}
|
||||
|
||||
static inline unsigned int select_device(unsigned int id) {
|
||||
dev_mgr::instance().select_device(id);
|
||||
return id;
|
||||
}
|
||||
|
||||
/// Select the device matching an invocable SYCL device selector for the
/// calling thread. Participates in overload resolution only when
/// \p DeviceSelector is callable as int(const sycl::device &).
template <class DeviceSelector>
static inline std::enable_if_t<
    std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
  dev_mgr::instance().select_device(selector);
}
|
||||
|
||||
/// Index of \p dev in the dpct device manager's device list.
static inline unsigned int get_device_id(const sycl::device &dev) {
  dev_mgr &mgr = dev_mgr::instance();
  return mgr.get_device_id(dev);
}
|
||||
|
||||
/// Util function to check whether a device supports some kinds of sycl::aspect.
/// Throws std::runtime_error naming the first unsupported aspect; returns
/// normally when every aspect in \p props is supported.
inline void
has_capability_or_fail(const sycl::device &dev,
                       const std::initializer_list<sycl::aspect> &props) {
  for (const auto &it : props) {
    if (dev.has(it))
      continue;
    switch (it) {
    case sycl::aspect::fp64:
      throw std::runtime_error("'double' is not supported in '" +
                               dev.get_info<sycl::info::device::name>() +
                               "' device");
      break;
    case sycl::aspect::fp16:
      throw std::runtime_error("'half' is not supported in '" +
                               dev.get_info<sycl::info::device::name>() +
                               "' device");
      break;
    default:
// Expand the SYCL aspect .def lists into switch cases that map each aspect
// enumerator to its name; the macros are undefined again right after use.
#define __SYCL_ASPECT(ASPECT, ID)                                              \
  case sycl::aspect::ASPECT:                                                   \
    return #ASPECT;
#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
      auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string {
        switch (AspectNum) {
#include <sycl/info/aspects.def>
#include <sycl/info/aspects_deprecated.def>
        default:
          return "unknown aspect";
        }
      };
#undef __SYCL_ASPECT_DEPRECATED_ALIAS
#undef __SYCL_ASPECT_DEPRECATED
#undef __SYCL_ASPECT
      throw std::runtime_error(
          "'" + getAspectNameStr(it) + "' is not supported in '" +
          dev.get_info<sycl::info::device::name>() + "' device");
    }
    // Unreachable: every case above throws.
    break;
  }
}
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_DEVICE_HPP__
|
4921
dpct/dnnl_utils.hpp
4921
dpct/dnnl_utils.hpp
File diff suppressed because it is too large
Load diff
|
@ -1,62 +0,0 @@
|
|||
//==---- dpct.hpp ---------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//

#ifndef __DPCT_HPP__
#define __DPCT_HPP__

#include <sycl/sycl.hpp>
#include <iostream>
#include <limits.h>
#include <math.h>

// Tag types used to form unique SYCL kernel names from template arguments.
template <class... Args> class dpct_kernel_name;
template <int Arg> class dpct_kernel_scalar;

#include "atomic.hpp"
#include "device.hpp"
#include "image.hpp"
#include "kernel.hpp"
#include "math.hpp"
#include "memory.hpp"
#include "util.hpp"

// Compiler-portable alignment and forced-inlining attributes.
#if defined(_MSC_VER)
#define __dpct_align__(n) __declspec(align(n))
#define __dpct_inline__ __forceinline
#else
#define __dpct_align__(n) __attribute__((aligned(n)))
#define __dpct_inline__ __inline__ __attribute__((always_inline))
#endif

#if defined(_MSC_VER)
#define __dpct_noinline__ __declspec(noinline)
#else
#define __dpct_noinline__ __attribute__((noinline))
#endif

#define DPCT_COMPATIBILITY_TEMP (900)

namespace dpct{
// Status codes returned by DPCT_CHECK_ERROR.
enum error_code { success = 0, default_error = 999 };
}

// Evaluate \p expr inside an immediately-invoked lambda; yields dpct::success
// on normal completion, or prints the exception and yields
// dpct::default_error when \p expr throws a std::exception.
#define DPCT_CHECK_ERROR(expr)                                                 \
  [&]() {                                                                      \
    try {                                                                      \
      expr;                                                                    \
      return dpct::success;                                                    \
    } catch (std::exception const &e) {                                        \
      std::cerr << e.what() << std::endl;                                      \
      return dpct::default_error;                                              \
    }                                                                          \
  }()

#define DPCT_PI_F (3.14159274101257f)
#define DPCT_PI (3.141592653589793115998)

#endif // __DPCT_HPP__
|
File diff suppressed because it is too large
Load diff
|
@ -1,747 +0,0 @@
|
|||
//==---- dpcpp_extensions.h ------------------*- C++ -*---------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_DPCPP_EXTENSIONS_H__
|
||||
#define __DPCT_DPCPP_EXTENSIONS_H__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
|
||||
#include <sycl/ext/oneapi/experimental/user_defined_reductions.hpp>
|
||||
#endif
|
||||
|
||||
#include "../dpct.hpp"
|
||||
#include "functional.h"
|
||||
|
||||
namespace dpct {
|
||||
namespace group {
|
||||
namespace detail {
|
||||
|
||||
// Thin forwarding wrappers around the SYCL group algorithms used below.

template <typename... _Args>
constexpr auto __reduce_over_group(_Args... __args) {
  return sycl::reduce_over_group(__args...);
}

template <typename... _Args> constexpr auto __group_broadcast(_Args... __args) {
  return sycl::group_broadcast(__args...);
}

template <typename... _Args>
constexpr auto __exclusive_scan_over_group(_Args... __args) {
  return sycl::exclusive_scan_over_group(__args...);
}

template <typename... _Args>
constexpr auto __inclusive_scan_over_group(_Args... __args) {
  return sycl::inclusive_scan_over_group(__args...);
}
|
||||
|
||||
} // end namespace detail
|
||||
|
||||
/// Perform an exclusive scan over the values of inputs from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// Each work-item first reduces its own VALUES_PER_THREAD inputs, a group
/// exclusive scan combines the per-item totals, and then each work-item
/// expands its portion of the scan sequentially.
///
/// \param item A work-item in a group.
/// \param inputs Pointer to the input data for the scan operation.
/// \param outputs Pointer to the location where scan results will be stored.
/// \param init initial value of the scan result.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan.
template <typename Item, typename T, class BinaryOperation,
          int VALUES_PER_THREAD>
__dpct_inline__ void
exclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
               T (&outputs)[VALUES_PER_THREAD], T init,
               BinaryOperation binary_op) {
  // Per-item reduction of all locally held values.
  T result = inputs[0];

#pragma unroll
  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
    result = binary_op(result, inputs[i]);
  }

  // Group-wide exclusive scan over the per-item totals.
  T exclusive_result =
      detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);

  // Seed the first output: the group's first item starts from init, every
  // other item starts from the group-scan prefix.
  T input = inputs[0];
  if (item.get_local_linear_id() == 0) {
    outputs[0] = init;
  } else {
    outputs[0] = exclusive_result;
  }

  // Sequentially expand the remaining local outputs.
#pragma unroll
  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
    T output = binary_op(input, outputs[i - 1]);
    input = inputs[i];
    outputs[i] = output;
  }
}
|
||||
|
||||
/// Perform an exclusive scan over the values of input from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Input data for the scan operation.
/// \param init initial value of the scan result.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
/// in the work-items of the group. \returns exclusive scan of the first i
/// work-items where item is the i-th work item.
template <typename Item, typename T, class BinaryOperation>
__dpct_inline__ T
exclusive_scan(const Item &item, T input, T init, BinaryOperation binary_op,
               T &group_aggregate) {
  T output = detail::__exclusive_scan_over_group(item.get_group(), input, init,
                                                 binary_op);
  // The last work-item holds the full aggregate: its exclusive prefix plus
  // its own input.
  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
    group_aggregate = binary_op(output, input);
  }

  // Broadcast the aggregate from the last work-item to the whole group.
  group_aggregate = detail::__group_broadcast(
      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
  return output;
}
|
||||
|
||||
/// Perform an exclusive scan over the values of input from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Input data for the scan operation.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param prefix_callback_op functor invoked by the first
/// work-item in the group that returns the
/// initial value in the resulting scan of the work-items in the group.
/// \returns exclusive scan of the input elements assigned to work-items in the
/// group.
template <typename Item, typename T, class BinaryOperation,
          class GroupPrefixCallbackOperation>
__dpct_inline__ T
exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
               GroupPrefixCallbackOperation &prefix_callback_op) {
  T group_aggregate;

  T output =
      detail::__exclusive_scan_over_group(item.get_group(), input, binary_op);
  // Last work-item computes the group-wide aggregate (its prefix + input).
  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
    group_aggregate = binary_op(output, input);
  }

  group_aggregate = detail::__group_broadcast(
      item.get_group(), group_aggregate, item.get_local_range().size() - 1);

  // The callback turns the aggregate into a prefix to seed this scan.
  T group_prefix = prefix_callback_op(group_aggregate);
  if (item.get_local_linear_id() == 0) {
    output = group_prefix;
  } else {
    output = binary_op(group_prefix, output);
  }

  return output;
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
typedef uint16_t digit_counter_type;
|
||||
typedef uint32_t packed_counter_type;
|
||||
|
||||
/// Compile-time ceil(log2(N)): shift N right until it reaches zero while
/// counting the shifts, then correct down by one when N is an exact power of
/// two.
template <int N, int Remaining = N, int Shifts = 0> struct log2 {
  enum { VALUE = log2<N, (Remaining >> 1), Shifts + 1>::VALUE };
};

/// Base case: all bits of N consumed after `Shifts` right-shifts.
template <int N, int Shifts> struct log2<N, 0, Shifts> {
  enum { VALUE = (1 << (Shifts - 1) < N) ? Shifts : Shifts - 1 };
};
|
||||
|
||||
/// Cooperative radix ranking for a group: given each work-item's keys, compute
/// the rank (output position) of every key for the current RADIX_BITS-wide
/// digit. Digit counters are packed several-per-word in local memory and
/// combined with a group exclusive scan.
template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
public:
  // Bytes of local memory the caller must provide for `group_threads` items.
  static size_t get_local_memory_size(size_t group_threads) {
    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
  }

  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}

  /// Rank each key's current digit; `ranks[i]` receives the key's position.
  /// Must be called by all work-items of the group (contains barriers).
  template <typename Item, int VALUES_PER_THREAD>
  __dpct_inline__ void
  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {

    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
    digit_counter_type *digit_counters[VALUES_PER_THREAD];
    digit_counter_type *buffer =
        reinterpret_cast<digit_counter_type *>(_local_memory);

    reset_local_memory(item);

    item.barrier(sycl::access::fence_space::local_space);

    // Histogram phase: count this item's keys per digit, remembering each
    // key's position within its digit counter.
#pragma unroll
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
      uint32_t counter_lane = digit & (COUNTER_LANES - 1);

      if (DESCENDING) {
        // Mirror the digit mapping so larger digits rank first.
        sub_counter = PACKING_RATIO - 1 - sub_counter;
        counter_lane = COUNTER_LANES - 1 - counter_lane;
      }

      digit_counters[i] =
          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
      thread_prefixes[i] = *digit_counters[i];
      *digit_counters[i] = thread_prefixes[i] + 1;
    }

    item.barrier(sycl::access::fence_space::local_space);

    // Turn the per-digit counts into exclusive offsets.
    scan_counters(item);

    item.barrier(sycl::access::fence_space::local_space);

    // Rank = digit's scanned base offset + this key's position in the digit.
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      ranks[i] = thread_prefixes[i] + *digit_counters[i];
    }
  }

private:
  // Zero this item's slice of the packed counters.
  template <typename Item>
  __dpct_inline__ void reset_local_memory(const Item &item) {
    packed_counter_type *ptr =
        reinterpret_cast<packed_counter_type *>(_local_memory);

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
    }
  }

  // Load this item's counter segment into cached_segment and return its sum.
  template <typename Item>
  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
    packed_counter_type sum = 0;
    packed_counter_type *ptr =
        reinterpret_cast<packed_counter_type *>(_local_memory);

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
      cached_segment[i] =
          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
    }

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      sum += cached_segment[i];
    }

    return sum;
  }

  // Write back an exclusive running sum over cached_segment, seeded with
  // this item's raking partial from the group scan.
  template <typename Item>
  __dpct_inline__ void
  exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
    packed_counter_type *ptr =
        reinterpret_cast<packed_counter_type *>(_local_memory);
    packed_counter_type sum = raking_partial;

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      packed_counter_type value = cached_segment[i];
      cached_segment[i] = sum;
      sum += value;
    }

#pragma unroll
    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
          cached_segment[i];
    }
  }

  // Converts the group aggregate of packed counters into the prefix each
  // packed sub-counter needs (shift the aggregate into higher sub-fields).
  struct prefix_callback {
    __dpct_inline__ packed_counter_type
    operator()(packed_counter_type block_aggregate) {
      packed_counter_type block_prefix = 0;

#pragma unroll
      for (int packed = 1; packed < PACKING_RATIO; packed++) {
        block_prefix += block_aggregate
                        << (sizeof(digit_counter_type) * 8 * packed);
      }

      return block_prefix;
    }
  };

  // Upsweep / group exclusive scan / downsweep over the packed counters.
  template <typename Item>
  __dpct_inline__ void scan_counters(const Item &item) {
    packed_counter_type raking_partial = upsweep(item);

    prefix_callback callback;
    packed_counter_type exclusive_partial = exclusive_scan(
        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
        callback);

    exclusive_downsweep(item, exclusive_partial);
  }

private:
  // How many digit counters fit into one packed counter word.
  static constexpr int PACKING_RATIO =
      sizeof(packed_counter_type) / sizeof(digit_counter_type);
  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
  // +1 lane of padding.
  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;

  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
  uint8_t *_local_memory; // caller-provided scratch, see get_local_memory_size
};
|
||||
|
||||
/// Fallback key-twiddling traits. Unspecialized key types are not supported:
/// both conversions fail at run time if instantiated and called.
template <typename T, typename U> struct base_traits {

  // Convert a key to its radix-sortable unsigned representation.
  static __dpct_inline__ U twiddle_in(U key) {
    throw std::runtime_error("Not implemented");
  }
  // Convert the unsigned representation back to the original key bits.
  static __dpct_inline__ U twiddle_out(U key) {
    throw std::runtime_error("Not implemented");
  }
};
|
||||
|
||||
/// uint32_t keys already compare correctly as unsigned integers, so both
/// conversions are the identity.
template <typename U> struct base_traits<uint32_t, U> {
  static __dpct_inline__ U twiddle_in(U key) { return key; }
  static __dpct_inline__ U twiddle_out(U key) { return key; }
};
|
||||
|
||||
/// Signed int keys: XOR-ing the sign bit maps the signed range onto the
/// unsigned range so that unsigned radix comparison yields signed order.
template <typename U> struct base_traits<int, U> {
  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
  // The transform is an involution, so decoding re-applies the same XOR.
  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
};
|
||||
|
||||
/// float keys (operating on their raw bits held in U): positive values get
/// the sign bit set, negative values are fully inverted, so unsigned
/// comparison of the result matches IEEE-754 ordering.
template <typename U> struct base_traits<float, U> {
  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
  static __dpct_inline__ U twiddle_in(U key) {
    // Negative (sign bit set): flip every bit; positive: flip only the sign.
    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
    return key ^ mask;
  }
  static __dpct_inline__ U twiddle_out(U key) {
    // Inverse mapping: a set high bit here marks an originally positive value.
    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
    return key ^ mask;
  }
};
|
||||
|
||||
/// Public traits entry point: maps each supported key type to the unsigned
/// bit representation used for radix comparison (uint32_t for int and float).
template <typename T> struct traits : base_traits<T, T> {};
template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
template <> struct traits<int> : base_traits<int, uint32_t> {};
template <> struct traits<float> : base_traits<float, uint32_t> {};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// Compile-time predicate: VALUE is nonzero iff N is a power of two
/// (clearing the lowest set bit of a power of two leaves zero).
template <int N> struct power_of_two {
  static constexpr int VALUE = ((N & (N - 1)) == 0);
};
|
||||
|
||||
/// Shift \p x right by \p shift bits, then add \p addend (the fused
/// shift-and-add idiom used for padded local-memory offsets).
__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
  return addend + (x >> shift);
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Implements scatter to blocked exchange pattern used in radix sort algorithm.
|
||||
///
|
||||
/// \tparam T type of the data elements exchanges
|
||||
/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
|
||||
template <typename T, int VALUES_PER_THREAD> class exchange {
|
||||
public:
|
||||
static size_t get_local_memory_size(size_t group_threads) {
|
||||
size_t padding_values =
|
||||
(INSERT_PADDING)
|
||||
? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
|
||||
: 0;
|
||||
return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
|
||||
}
|
||||
|
||||
exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
|
||||
|
||||
/// Rearrange elements from rank order to blocked order
|
||||
template <typename Item>
|
||||
__dpct_inline__ void
|
||||
scatter_to_blocked(Item item, T (&keys)[VALUES_PER_THREAD],
|
||||
int (&ranks)[VALUES_PER_THREAD]) {
|
||||
T *buffer = reinterpret_cast<T *>(_local_memory);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VALUES_PER_THREAD; i++) {
|
||||
int offset = ranks[i];
|
||||
if (INSERT_PADDING)
|
||||
offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
|
||||
buffer[offset] = keys[i];
|
||||
}
|
||||
|
||||
item.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VALUES_PER_THREAD; i++) {
|
||||
int offset = (item.get_local_id(0) * VALUES_PER_THREAD) + i;
|
||||
if (INSERT_PADDING)
|
||||
offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
|
||||
keys[i] = buffer[offset];
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr int LOG_LOCAL_MEMORY_BANKS = 5;
|
||||
static constexpr bool INSERT_PADDING =
|
||||
(VALUES_PER_THREAD > 4) &&
|
||||
(detail::power_of_two<VALUES_PER_THREAD>::VALUE);
|
||||
|
||||
uint8_t *_local_memory;
|
||||
};
|
||||
|
||||
/// Implements radix sort to sort integer data elements assigned to all threads
/// in the group.
///
/// \tparam T type of the data elements exchanged
/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
/// \tparam DESCENDING boolean value indicating if data elements are sorted in
/// descending order.
template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
class radix_sort {
public:
  // Local memory is reused across the rank and exchange phases, so only the
  // larger of the two requirements is needed.
  static size_t get_local_memory_size(size_t group_threads) {
    size_t ranks_size =
        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
    size_t exchange_size =
        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
    return sycl::max(ranks_size, exchange_size);
  }

  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}

  /// Sort this work-item's keys (cooperatively with the whole group) over the
  /// bit range [begin_bit, end_bit).
  template <typename Item>
  __dpct_inline__ void
  sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
       int end_bit = 8 * sizeof(T)) {

    // Reinterpret keys as uint32_t for bit twiddling and ranking.
    // NOTE(review): this assumes sizeof(T) == 4 (matches the supported
    // traits: uint32_t/int/float) — confirm before adding wider key types.
    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);

    // Map keys into radix-comparable unsigned form.
#pragma unroll
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
    }

    // One pass per RADIX_BITS digit, low bits first.
    while (true) {
      // The final pass may cover fewer than RADIX_BITS bits.
      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);

      int ranks[VALUES_PER_THREAD];
      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
          .template rank_keys(item, unsigned_keys, ranks, begin_bit, pass_bits);
      begin_bit += RADIX_BITS;

      item.barrier(sycl::access::fence_space::local_space);

      // Redistribute keys into blocked order according to their ranks;
      // local memory is reused, hence the surrounding barriers.
      exchange<T, VALUES_PER_THREAD>(_local_memory)
          .scatter_to_blocked(item, keys, ranks);

      item.barrier(sycl::access::fence_space::local_space);

      if (begin_bit >= end_bit)
        break;
    }

    // Restore the original key bit patterns.
#pragma unroll
    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
    }
  }

private:
  static constexpr int RADIX_BITS = 4;

  uint8_t *_local_memory;
};
|
||||
|
||||
/// Perform a reduction of the data elements assigned to all threads in the
|
||||
/// group.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param inputs Pointer to the input data for the reduce operation.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \returns value of the reduction using binary_op
|
||||
template <typename Item, typename T, class BinaryOperation,
|
||||
int VALUES_PER_THREAD>
|
||||
__dpct_inline__ T
|
||||
reduce(Item item, T (&inputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
|
||||
T result = inputs[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < VALUES_PER_THREAD; i++) {
|
||||
result = binary_op(result, inputs[i]);
|
||||
}
|
||||
return detail::__reduce_over_group(item.get_group(), result, binary_op);
|
||||
}
|
||||
|
||||
/// Perform a reduction on a limited number of the work items in a subgroup
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param value value per work item which is to be reduced
|
||||
/// \param items_to_reduce num work items at the start of the subgroup to reduce
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \returns value of the reduction using binary_op
|
||||
template <typename Item, typename T, class BinaryOperation>
|
||||
__dpct_inline__
|
||||
typename ::std::enable_if_t<sycl::has_known_identity_v<BinaryOperation, T>, T>
|
||||
reduce_over_partial_group(const Item &item, const T &value,
|
||||
const ::std::uint16_t &items_to_reduce,
|
||||
BinaryOperation binary_op) {
|
||||
T value_temp = (item.get_local_linear_id() < items_to_reduce)
|
||||
? value
|
||||
: sycl::known_identity_v<BinaryOperation, T>;
|
||||
return detail::__reduce_over_group(item.get_sub_group(), value_temp,
|
||||
binary_op);
|
||||
}
|
||||
|
||||
/// Perform an inclusive scan over the values of inputs from all work-items in
|
||||
/// the group using the operator binary_op, which must be one of the SYCL 2020
|
||||
/// group algorithms library function objects.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param inputs Pointer to the input data for the scan operation.
|
||||
/// \param outputs Pointer to the location where scan results will be stored.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \returns inclusive scan of the input elements assigned to
|
||||
/// work-items in the group.
|
||||
template <typename Item, typename T, class BinaryOperation,
|
||||
int VALUES_PER_THREAD>
|
||||
__dpct_inline__ void
|
||||
inclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
|
||||
T (&outputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
|
||||
T result = inputs[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
|
||||
result = binary_op(result, inputs[i]);
|
||||
}
|
||||
|
||||
T exclusive_result =
|
||||
detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
|
||||
|
||||
if (item.get_local_linear_id() == 0) {
|
||||
outputs[0] = inputs[0];
|
||||
} else {
|
||||
outputs[0] = binary_op(inputs[0], exclusive_result);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
|
||||
outputs[i] = binary_op(inputs[i], outputs[i - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform an inclusive scan over the values of inputs from all work-items in
|
||||
/// the group using the operator binary_op, which must be one of the SYCL 2020
|
||||
/// group algorithms library function objects.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param input Pointer to the input data for the scan operation.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
|
||||
/// in the work-items of the group. \returns inclusive scan of the input
|
||||
/// elements assigned to work-items in the group.
|
||||
template <typename Item, typename T, class BinaryOperation>
|
||||
__dpct_inline__ T inclusive_scan(const Item &item, T input,
|
||||
BinaryOperation binary_op,
|
||||
T &group_aggregate) {
|
||||
T output =
|
||||
detail::__inclusive_scan_over_group(item.get_group(), input, binary_op);
|
||||
if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
|
||||
group_aggregate = output;
|
||||
}
|
||||
|
||||
group_aggregate = detail::__group_broadcast(
|
||||
item.get_group(), group_aggregate, item.get_local_range().size() - 1);
|
||||
return output;
|
||||
}
|
||||
|
||||
/// Perform an inclusive scan over the values of input from all work-items in
|
||||
/// the group using the operator binary_op, which must be one of the SYCL 2020
|
||||
/// group algorithms library function objects.
|
||||
///
|
||||
/// \param item A work-item in a group.
|
||||
/// \param input Input data for the scan operation.
|
||||
/// \param binary_op functor that implements the binary operation used to
|
||||
/// perform the scan. \param prefix_callback_op functor invoked by the first
|
||||
/// work-item in the group that returns the
|
||||
/// initial value in the resulting scan of the work-items in the group.
|
||||
/// \returns inclusive scan of the input elements assigned to work-items in the
|
||||
/// group.
|
||||
template <typename Item, typename T, class BinaryOperation,
|
||||
class GroupPrefixCallbackOperation>
|
||||
__dpct_inline__ T
|
||||
inclusive_scan(const Item &item, T input, BinaryOperation binary_op,
|
||||
GroupPrefixCallbackOperation &prefix_callback_op) {
|
||||
T group_aggregate;
|
||||
|
||||
T output = inclusive_scan(item, input, binary_op, group_aggregate);
|
||||
T group_prefix = prefix_callback_op(group_aggregate);
|
||||
|
||||
return binary_op(group_prefix, output);
|
||||
}
|
||||
|
||||
} // namespace group
|
||||
|
||||
namespace device {
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Thin forwarding wrapper giving dpct code a stable internal entry point for
// sycl::joint_reduce.
template <typename... _Args> constexpr auto __joint_reduce(_Args... __args) {
  return sycl::joint_reduce(__args...);
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Perform a reduce on each of the segments specified within data stored on
/// the device.
///
/// \param queue Command queue used to access device used for reduction
/// \param inputs Pointer to the data elements on the device to be reduced
/// \param outputs Pointer to the storage where the reduced value for each
/// segment will be stored \param segment_count number of segments to be reduced
/// \param begin_offsets Pointer to the set of indices that are the first
/// element in each segment \param end_offsets Pointer to the set of indices
/// that are one past the last element in each segment \param binary_op functor
/// that implements the binary operation used to perform the reduction. \param
/// init initial value of the reduction for each segment.
template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
                      size_t segment_count, OffsetT *begin_offsets,
                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {

  // One work-group of GROUP_SIZE work-items is launched per segment.
  sycl::range<1> global_size(segment_count * GROUP_SIZE);
  sycl::range<1> local_size(GROUP_SIZE);

  // NOTE(review): the submitted command is not waited on here — callers
  // appear to be responsible for synchronizing on the queue; confirm.
  queue.submit([&](sycl::handler &cgh) {
    cgh.parallel_for(
        sycl::nd_range<1>(global_size, local_size), [=](sycl::nd_item<1> item) {
          // The group's linear id selects the segment this group reduces.
          OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
          OffsetT segment_end = end_offsets[item.get_group_linear_id()];
          // Empty segment: the group leader writes the initial value and
          // the whole group exits early.
          if (segment_begin == segment_end) {
            if (item.get_local_linear_id() == 0) {
              outputs[item.get_group_linear_id()] = init;
            }
            return;
          }

          sycl::multi_ptr<T, sycl::access::address_space::global_space>
              input_ptr = inputs;
          T group_aggregate = detail::__joint_reduce(
              item.get_group(), input_ptr + segment_begin,
              input_ptr + segment_end, init, binary_op);

          // Only the group leader publishes the segment's result.
          if (item.get_local_linear_id() == 0) {
            outputs[item.get_group_linear_id()] = group_aggregate;
          }
        });
  });
}
|
||||
|
||||
|
||||
#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
|
||||
|
||||
namespace experimental {
|
||||
namespace detail {
|
||||
// True when _Tp matches at least one of _Ts, ignoring top-level cv-qualifiers
// on both sides.
template <typename _Tp, typename... _Ts> struct __is_any {
  constexpr static bool value =
      (std::is_same_v<std::remove_cv_t<_Tp>, std::remove_cv_t<_Ts>> || ...);
};
|
||||
|
||||
// True when _Bp is one of the SYCL function objects (instantiated for _Tp)
// that the SYCL group algorithms support natively.
template <typename _Tp, typename _Bp> struct __in_native_op_list {
  constexpr static bool value =
      __is_any<_Bp, sycl::plus<_Tp>, sycl::bit_or<_Tp>, sycl::bit_xor<_Tp>,
               sycl::bit_and<_Tp>, sycl::maximum<_Tp>, sycl::minimum<_Tp>,
               sycl::multiplies<_Tp>>::value;
};
|
||||
|
||||
// True when _Bp is a natively supported operation for _Tp, either as the
// typed instantiation (e.g. sycl::plus<_Tp>) or the transparent void form
// (e.g. sycl::plus<void>).
template <typename _Tp, typename _Bp> struct __is_native_op {
  constexpr static bool value = __in_native_op_list<_Tp, _Bp>::value ||
                                __in_native_op_list<void, _Bp>::value;
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Perform a reduce on each of the segments specified within data stored on
/// the device. Compared with dpct::device::segmented_reduce, this experimental
/// feature support user define reductions.
///
/// \param queue Command queue used to access device used for reduction
/// \param inputs Pointer to the data elements on the device to be reduced
/// \param outputs Pointer to the storage where the reduced value for each
/// segment will be stored \param segment_count number of segments to be reduced
/// \param begin_offsets Pointer to the set of indices that are the first
/// element in each segment \param end_offsets Pointer to the set of indices
/// that are one past the last element in each segment \param binary_op functor
/// that implements the binary operation used to perform the reduction. \param
/// init initial value of the reduction for each segment.
template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
                      size_t segment_count, OffsetT *begin_offsets,
                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {

  // One work-group of GROUP_SIZE work-items per segment.
  sycl::range<1> global_size(segment_count * GROUP_SIZE);
  sycl::range<1> local_size(GROUP_SIZE);

  // User-defined (non-native) operations need the scratchpad-based oneAPI
  // extension; native operations delegate to the regular implementation.
  if constexpr (!detail::__is_native_op<T, BinaryOperation>::value) {
    queue.submit([&](sycl::handler &cgh) {
      // Per-group scratch space required by the user-defined reduction.
      size_t temp_memory_size = GROUP_SIZE * sizeof(T);
      auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, cgh);
      cgh.parallel_for(
          sycl::nd_range<1>(global_size, local_size),
          [=](sycl::nd_item<1> item) {
            OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
            OffsetT segment_end = end_offsets[item.get_group_linear_id()];
            // Empty segment: leader writes the initial value; group exits.
            if (segment_begin == segment_end) {
              if (item.get_local_linear_id() == 0) {
                outputs[item.get_group_linear_id()] = init;
              }
              return;
            }
            // Create a handle that associates the group with an allocation it
            // can use
            auto handle =
                sycl::ext::oneapi::experimental::group_with_scratchpad(
                    item.get_group(),
                    sycl::span(&scratch[0], temp_memory_size));
            T group_aggregate = sycl::ext::oneapi::experimental::joint_reduce(
                handle, inputs + segment_begin, inputs + segment_end, init,
                binary_op);
            // Only the group leader publishes the segment's result.
            if (item.get_local_linear_id() == 0) {
              outputs[item.get_group_linear_id()] = group_aggregate;
            }
          });
    });
  } else {
    dpct::device::segmented_reduce<GROUP_SIZE>(queue, inputs, outputs,
                                               segment_count, begin_offsets,
                                               end_offsets, binary_op, init);
  }
}
|
||||
} // namespace experimental
|
||||
|
||||
#endif // SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
|
||||
|
||||
|
||||
} // namespace device
|
||||
} // namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,453 +0,0 @@
|
|||
//==---- functional.h -----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_FUNCTIONAL_H__
|
||||
#define __DPCT_FUNCTIONAL_H__
|
||||
|
||||
#include <functional>
|
||||
#include <oneapi/dpl/functional>
|
||||
#include <oneapi/dpl/iterator>
|
||||
|
||||
#if ONEDPL_USE_DPCPP_BACKEND
|
||||
#include <oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h>
|
||||
#endif
|
||||
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "../dpct.hpp"
|
||||
#define _DPCT_GCC_VERSION \
|
||||
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
|
||||
// Portability "#pragma" definition
|
||||
#ifdef _MSC_VER
|
||||
#define _DPCT_PRAGMA(x) __pragma(x)
|
||||
#else
|
||||
#define _DPCT_PRAGMA(x) _Pragma(#x)
|
||||
#endif
|
||||
|
||||
// Enable loop unrolling pragmas where supported
|
||||
#if (__INTEL_COMPILER || \
|
||||
(!defined(__INTEL_COMPILER) && _DPCT_GCC_VERSION >= 80000))
|
||||
#define _DPCT_PRAGMA_UNROLL _DPCT_PRAGMA(unroll)
|
||||
#else // no pragma unroll
|
||||
#define _DPCT_PRAGMA_UNROLL
|
||||
#endif
|
||||
|
||||
namespace dpct {
|
||||
|
||||
// Empty tag type used as a placeholder (e.g. for unused tuple slots).
struct null_type {};
|
||||
|
||||
// Function object to wrap user defined functors to provide compile time "const"
// workaround for user function objects.
// The SYCL spec (4.12) states that writing to a function object during a SYCL
// kernel is undefined behavior. This wrapper is provided as a compile-time
// work around, but functors used in SYCL kernels must be `const` in practice.
template <typename _Op> struct mark_functor_const {
  // `mutable` lets the wrapped functor be invoked through a const wrapper
  // even if its call operator is non-const.
  mutable _Op op;
  mark_functor_const() : op() {}
  mark_functor_const(const _Op &__op) : op(__op) {}
  mark_functor_const(_Op &&__op) : op(::std::move(__op)) {}
  // Perfectly forwards all arguments to the wrapped functor.
  template <typename... _T> auto operator()(_T &&...x) const {
    return op(std::forward<_T>(x)...);
  }
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
// SFINAE helper: enables an overload only when _ExecPolicy is a oneDPL
// execution policy.
template <class _ExecPolicy, class _T>
using enable_if_execution_policy =
    typename std::enable_if<oneapi::dpl::execution::is_execution_policy<
                                typename std::decay<_ExecPolicy>::type>::value,
                            _T>::type;

// Trait: true for heterogeneous (device) execution policies.
template <typename _T> struct is_hetero_execution_policy : ::std::false_type {};

template <typename... PolicyParams>
struct is_hetero_execution_policy<
    oneapi::dpl::execution::device_policy<PolicyParams...>> : ::std::true_type {
};

// Trait: true for FPGA execution policies. Note: no specialization ever sets
// this to true here — the FPGA case below specializes
// is_hetero_execution_policy instead.
template <typename _T> struct is_fpga_execution_policy : ::std::false_type {};

#if _ONEDPL_FPGA_DEVICE
template <unsigned int unroll_factor, typename... PolicyParams>
struct is_hetero_execution_policy<
    execution::fpga_policy<unroll_factor, PolicyParams...>> : ::std::true_type {
};
#endif

// SFINAE helper: enables an overload only for hetero (device/FPGA) policies.
template <class _ExecPolicy, class _T>
using enable_if_hetero_execution_policy = typename std::enable_if<
    is_hetero_execution_policy<typename std::decay<_ExecPolicy>::type>::value,
    _T>::type;
|
||||
|
||||
#if _ONEDPL_CPP14_INTEGER_SEQUENCE_PRESENT

// Use the standard C++14 utilities when the library provides them.
template <std::size_t... _Sp>
using index_sequence = std::index_sequence<_Sp...>;
template <std::size_t _Np>
using make_index_sequence = std::make_index_sequence<_Np>;

#else

// Minimal local re-implementation of std::index_sequence for library modes
// without the C++14 integer-sequence support.
template <std::size_t... _Sp> class index_sequence {};

// Recursively prepends _Np - 1 until the counter reaches 0.
template <std::size_t _Np, std::size_t... _Sp>
struct make_index_sequence_impl
    : make_index_sequence_impl<_Np - 1, _Np - 1, _Sp...> {};

// Base case: all indices generated.
template <std::size_t... _Sp> struct make_index_sequence_impl<0, _Sp...> {
  using type = index_sequence<_Sp...>;
};

template <std::size_t _Np>
using make_index_sequence = typename make_index_sequence_impl<_Np>::type;
#endif
|
||||
|
||||
// Minimal buffer implementations for temporary storage in mapping rules
// Some of our algorithms need to start with raw memory buffer,
// not an initialized array, because initialization/destruction
// would make the span be at least O(N).
#if ONEDPL_USE_DPCPP_BACKEND
template <typename _Tp> class __buffer {
  sycl::buffer<_Tp, 1> __buf;

  __buffer(const __buffer &) = delete;

  void operator=(const __buffer &) = delete;

public:
  // Try to obtain buffer of given size to store objects of _Tp type
  __buffer(std::size_t __n) : __buf(sycl::range<1>(__n)) {}

  // Return pointer to buffer, or NULL if buffer could not be obtained.
  // NOTE(review): `const` here trails the return type, so it qualifies the
  // returned type rather than making get() a const member function — this
  // looks unintended (the non-DPCPP version below is a const member); confirm.
  auto get() -> decltype(oneapi::dpl::begin(__buf)) const {
    return oneapi::dpl::begin(__buf);
  }
};
#else
template <typename _Tp> class __buffer {
  // Array form so destruction uses delete[], matching the `new _Tp[__n]`
  // allocation below. (Previously std::unique_ptr<_Tp>, whose deleter calls
  // plain `delete` on new[]-allocated memory — undefined behavior.)
  std::unique_ptr<_Tp[]> _M_ptr;

  __buffer(const __buffer &) = delete;

  void operator=(const __buffer &) = delete;

public:
  // Try to obtain buffer of given size to store objects of _Tp type
  __buffer(const std::size_t __n) : _M_ptr(new _Tp[__n]) {}

  // Return pointer to buffer, or NULL if buffer could not be obtained.
  _Tp *get() const { return _M_ptr.get(); }
};
#endif
|
||||
|
||||
// Transparent less-than comparator in the spirit of C++14 std::less<void>:
// parameter types are deduced at the call site, so mixed-type comparisons
// work without conversions.
class __less {
public:
  template <typename _Lhs, typename _Rhs>
  bool operator()(_Lhs &&lhs, _Rhs &&rhs) const {
    return std::forward<_Lhs>(lhs) < std::forward<_Rhs>(rhs);
  }
};
|
||||
|
||||
// Rebinds an execution policy to a new kernel name; the primary template
// leaves non-named policies unchanged.
template <typename Policy, typename NewName> struct rebind_policy {
  using type = Policy;
};

// Device policy: swap the kernel name parameter for NewName.
template <typename KernelName, typename NewName>
struct rebind_policy<oneapi::dpl::execution::device_policy<KernelName>,
                     NewName> {
  using type = oneapi::dpl::execution::device_policy<NewName>;
};

#if _ONEDPL_FPGA_DEVICE
// FPGA policy: preserve the unroll factor, swap the kernel name.
template <unsigned int factor, typename KernelName, typename NewName>
struct rebind_policy<oneapi::dpl::execution::fpga_policy<factor, KernelName>,
                     NewName> {
  using type = oneapi::dpl::execution::fpga_policy<factor, NewName>;
};
#endif
|
||||
|
||||
// Permutation functor: given an index x, returns the element of `source`
// at offset x, i.e. *(source + x). T1 is the source iterator, T2 supplies
// the result reference type.
template <typename T1, typename T2,
          typename R1 = typename std::iterator_traits<T1>::reference,
          typename R2 = typename std::iterator_traits<T2>::reference>
struct perm_fun {
  typedef R2 result_of;
  perm_fun(T1 input) : source(input) {}

  R2 operator()(R1 x) const { return *(source + x); }

private:
  T1 source;
};
|
||||
|
||||
// Functor compares first element (key) from tied sequence.
template <typename Compare = class internal::__less> struct compare_key_fun {
  typedef bool result_of;
  compare_key_fun(Compare _comp = internal::__less()) : comp(_comp) {}

  // Compares only the 0th (key) components of the two tuple-like arguments;
  // ADL-enabled `get` supports both std and oneDPL tuples.
  template <typename _T1, typename _T2>
  result_of operator()(_T1 &&a, _T2 &&b) const {
    using std::get;
    return comp(get<0>(a), get<0>(b));
  }

private:
  // mutable: allows stateful comparators to be invoked through const objects.
  mutable Compare comp;
};
|
||||
|
||||
// Functor evaluates second element of tied sequence with predicate.
// Used by: copy_if, remove_copy_if, stable_partition_copy
// Lambda:
template <typename Predicate> struct predicate_key_fun {
  typedef bool result_of;
  predicate_key_fun(Predicate _pred) : pred(_pred) {}

  // Applies the predicate to element 1 of the tuple-like argument.
  template <typename _T1> result_of operator()(_T1 &&a) const {
    using std::get;
    return pred(get<1>(a));
  }

private:
  // mutable: allows stateful predicates to be invoked through const objects.
  mutable Predicate pred;
};
|
||||
|
||||
// Negated form of predicate_key_fun. Used by: remove_if
template <typename Predicate> struct negate_predicate_key_fun {
  typedef bool result_of;
  negate_predicate_key_fun(Predicate _pred) : pred(_pred) {}

  // Applies the predicate to element 1 of the tuple-like argument and
  // returns the logical negation.
  template <typename _T1> result_of operator()(_T1 &&a) const {
    using std::get;
    return !pred(get<1>(a));
  }

private:
  // mutable: allows stateful predicates to be invoked through const objects.
  mutable Predicate pred;
};
|
||||
|
||||
// Generates the arithmetic sequence init + step * i for an index-like i,
// converting the result back to T.
template <typename T> struct sequence_fun {
  using result_type = T;
  sequence_fun(T first, T stride) : init(first), step(stride) {}

  template <typename _T> result_type operator()(_T &&i) const {
    return static_cast<T>(init + step * i);
  }

private:
  const T init;
  const T step;
};
|
||||
|
||||
// Equality-style functor over the key (0th) components of two tuple-like
// arguments.
//[binary_pred](Ref a, Ref b){ return(binary_pred(get<0>(a),get<0>(b)));
template <typename Predicate> struct unique_fun {
  typedef bool result_of;
  unique_fun(Predicate _pred) : pred(_pred) {}
  template <typename _T> result_of operator()(_T &&a, _T &&b) const {
    using std::get;
    return pred(get<0>(a), get<0>(b));
  }

private:
  // mutable: allows stateful predicates to be invoked through const objects.
  mutable Predicate pred;
};
|
||||
|
||||
// Lambda: [pred, &new_value](Ref1 a, Ref2 s) {return pred(s) ? new_value : a;
// });
template <typename T, typename Predicate> struct replace_if_fun {
public:
  typedef T result_of;
  replace_if_fun(Predicate _pred, T _new_value)
      : pred(_pred), new_value(_new_value) {}

  // Returns the replacement value when the stencil `s` satisfies the
  // predicate; otherwise passes the element `a` through unchanged.
  template <typename _T1, typename _T2> T operator()(_T1 &&a, _T2 &&s) const {
    if (pred(s))
      return new_value;
    return a;
  }

private:
  mutable Predicate pred;
  const T new_value;
};
|
||||
|
||||
//[pred,op](Ref a){return pred(a) ? op(a) : a; }
template <typename T, typename Predicate, typename Operator>
struct transform_if_fun {
  transform_if_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}

  // When element 0 of the tuple-like argument satisfies the predicate,
  // writes op(element 0) into element 1; otherwise element 1 is untouched.
  template <typename _T> void operator()(_T &&t) const {
    using std::get;
    auto &&source = get<0>(t);
    if (pred(source))
      get<1>(t) = op(source);
  }

private:
  // mutable: allows stateful functors to be invoked through const objects.
  mutable Predicate pred;
  mutable Operator op;
};
|
||||
|
||||
// Masked unary transform over a zipped (input, mask, output) tuple.
//[pred, op](Ref1 a, Ref2 s) { return pred(s) ? op(a) : a; });
template <typename T, typename Predicate, typename Operator>
struct transform_if_unary_zip_mask_fun {
  transform_if_unary_zip_mask_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
  // When the mask (element 1) satisfies the predicate, writes op(element 0)
  // into element 2; otherwise the output slot is left untouched.
  template <typename _T>
  void operator()(_T&& t) const {
    using std::get;
    if (pred(get<1>(t)))
      get<2>(t) = op(get<0>(t));
  }

private:
  // mutable: allows stateful functors to be invoked through const objects.
  mutable Predicate pred;
  mutable Operator op;
};
|
||||
|
||||
// Masked binary transform over a zipped (lhs, rhs, mask, output) tuple.
template <typename T, typename Predicate, typename BinaryOperation>
class transform_if_zip_mask_fun {
public:
  transform_if_zip_mask_fun(Predicate _pred = oneapi::dpl::identity(),
                            BinaryOperation _op = oneapi::dpl::identity())
      : pred(_pred), op(_op) {}
  // When the mask (element 2) satisfies the predicate, writes
  // op(element 0, element 1) into element 3; otherwise leaves it untouched.
  template <typename _T> void operator()(_T &&t) const {
    using std::get;
    if (pred(get<2>(t)))
      get<3>(t) = op(get<0>(t), get<1>(t));
  }

private:
  // mutable: allows stateful functors to be invoked through const objects.
  mutable Predicate pred;
  mutable BinaryOperation op;
};
|
||||
|
||||
// This following code is similar to a section of code in
|
||||
// oneDPL/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
|
||||
// It has a similar approach, and could be consolidated.
|
||||
// Outside of some differences in approach, there are two significant
|
||||
// differences in function.
|
||||
//
|
||||
// 1) This code allows the output type of the bit range translation to be fit
|
||||
// into to the minimal type required to provide that many bits. The code in
|
||||
// oneDPL to calculate the bucket for the radix is similar but its output is
|
||||
// always std::uint32_t. The assumption that the bit range desired will fit in
|
||||
// 32 bits is not true for this code.
|
||||
//
|
||||
// 2) This code ensures that for floating point type, -0.0f and 0.0f map to the
|
||||
// same value. This allows the output of this translation to be used to provide
|
||||
// a sort which ensures the stability of these values for floating point types.
|
||||
|
||||
// Maps a byte width to the unsigned integer type of exactly that size.
template <int N> struct uint_byte_map {};
template <> struct uint_byte_map<1> { using type = uint8_t; };
template <> struct uint_byte_map<2> { using type = uint16_t; };
template <> struct uint_byte_map<4> { using type = uint32_t; };
template <> struct uint_byte_map<8> { using type = uint64_t; };

// Maps any type T to the unsigned integer type with the same size, used for
// reinterpreting key bits.
template <typename T> struct uint_map {
  using type = typename uint_byte_map<sizeof(T)>::type;
};
|
||||
|
||||
// Translates a key of type T into an OutKeyT holding the bits
// [begin_bit, end_bit) of the key's order-preserving unsigned representation.
// Floating point keys map -0.0 and 0.0 to the same value (see the commentary
// above this class).
template <typename T, typename OutKeyT> class translate_key {
  using uint_type_t = typename uint_map<T>::type;

public:
  translate_key(int begin_bit, int end_bit) {
    shift = begin_bit;
    mask = ~OutKeyT(0); // all ones
    mask = mask >> (sizeof(OutKeyT) * 8 -
                    (end_bit - begin_bit)); // setup appropriate mask
    flip_sign = uint_type_t(1) << (sizeof(uint_type_t) * 8 - 1); // sign bit
    flip_key = ~uint_type_t(0);                                  // 0xF...F
  }

  // Returns the selected bit range of the order-preserving unsigned image of
  // `key`.
  inline OutKeyT operator()(const T &key) const {
    uint_type_t intermediate;
    if constexpr (std::is_floating_point<T>::value) {
      // normal case (both -0.0f and 0.0f equal -0.0f)
      if (key != T(-0.0f)) {
        // Negative floats: flip all bits; positive: flip only the sign bit.
        uint_type_t is_negative = reinterpret_cast<const uint_type_t &>(key) >>
                                  (sizeof(uint_type_t) * 8 - 1);
        intermediate = reinterpret_cast<const uint_type_t &>(key) ^
                       ((is_negative * flip_key) | flip_sign);
      } else // special case for -0.0f to keep stability with 0.0f
      {
        T negzero = T(-0.0f);
        intermediate = reinterpret_cast<const uint_type_t &>(negzero);
      }
    } else if constexpr (std::is_signed<T>::value) {
      // Signed integers: flipping the sign bit yields unsigned order.
      intermediate = reinterpret_cast<const uint_type_t &>(key) ^ flip_sign;
    } else {
      // Unsigned keys are already in order.
      intermediate = key;
    }

    return static_cast<OutKeyT>(intermediate >> shift) &
           mask; // shift, cast, and mask
  }

private:
  uint8_t shift;     // first bit of the selected range
  OutKeyT mask;      // (end_bit - begin_bit) low bits set
  uint_type_t flip_sign; // sign-bit mask for T's unsigned image
  uint_type_t flip_key;  // all-ones mask for negative floats
};
|
||||
|
||||
// Unary operator that returns reference to its argument. Ported from
|
||||
// oneDPL: oneapi/dpl/pstl/utils.h
|
||||
struct no_op_fun {
|
||||
template <typename Tp> Tp &&operator()(Tp &&a) const {
|
||||
return ::std::forward<Tp>(a);
|
||||
}
|
||||
};
|
||||
|
||||
// Unary functor which composes a pair of functors by calling them in succession
|
||||
// on an input
|
||||
template <typename FunctorInner, typename FunctorOuter>
|
||||
struct __composition_functor {
|
||||
__composition_functor(FunctorInner in, FunctorOuter out)
|
||||
: _in(in), _out(out) {}
|
||||
template <typename T> T operator()(const T &i) const {
|
||||
return _out(_in(i));
|
||||
}
|
||||
FunctorInner _in;
|
||||
FunctorOuter _out;
|
||||
};
|
||||
|
||||
// Unary functor which maps an index of a ROI into a 2D flattened array
|
||||
template <typename OffsetT> struct __roi_2d_index_functor {
|
||||
__roi_2d_index_functor(const OffsetT &num_cols,
|
||||
const ::std::size_t &row_stride)
|
||||
: _num_cols(num_cols), _row_stride(row_stride) {}
|
||||
|
||||
template <typename Index> Index operator()(const Index &i) const {
|
||||
return _row_stride * (i / _num_cols) + (i % _num_cols);
|
||||
}
|
||||
|
||||
OffsetT _num_cols;
|
||||
::std::size_t _row_stride;
|
||||
};
|
||||
|
||||
// Unary functor which maps and index into an interleaved array by its active
|
||||
// channel
|
||||
template <typename OffsetT> struct __interleaved_index_functor {
|
||||
__interleaved_index_functor(const OffsetT &total_channels,
|
||||
const OffsetT &active_channel)
|
||||
: _total_channels(total_channels), _active_channel(active_channel) {}
|
||||
|
||||
template <typename Index> Index operator()(const Index &i) const {
|
||||
return i * _total_channels + _active_channel;
|
||||
}
|
||||
|
||||
OffsetT _total_channels;
|
||||
OffsetT _active_channel;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,347 +0,0 @@
|
|||
//==---- iterators.h ------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_ITERATORS_H__
|
||||
#define __DPCT_ITERATORS_H__
|
||||
|
||||
#include <oneapi/dpl/iterator>
|
||||
|
||||
#include "functional.h"
|
||||
|
||||
namespace dpct {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Wrapper class returned from a dereferenced transform_iterator which was
|
||||
// created using
|
||||
// make_transform_output_iterator(). Used to apply the supplied transform
|
||||
// function when writing into an object of this class.
|
||||
//
|
||||
// Example:
|
||||
// int a[] = {0, 1, 2, 3, 4};
|
||||
// int* p = a;
|
||||
// auto f = [](auto v) {return v*v;};
|
||||
// auto tr_out = dpct::make_transform_output_iterator(p+1, f);
|
||||
// auto wrap = *tr_out; // wrap is a transform_output_ref_wrapper
|
||||
// std::cout<<*(p+1)<<std::endl; // '1'
|
||||
// wrap = 2; // apply function, store 2*2=4
|
||||
// std::cout<<*(p+1)<<std::endl; // '4'
|
||||
template <typename T, typename _UnaryFunc> class transform_output_ref_wrapper {
|
||||
private:
|
||||
T __my_reference_;
|
||||
_UnaryFunc __my_unary_func_;
|
||||
|
||||
public:
|
||||
template <typename U>
|
||||
transform_output_ref_wrapper(U &&__reference, _UnaryFunc __unary_func)
|
||||
: __my_reference_(std::forward<U>(__reference)),
|
||||
__my_unary_func_(__unary_func) {}
|
||||
|
||||
// When writing to an object of this type, apply the supplied unary function,
|
||||
// then write to the wrapped reference
|
||||
template <typename UnaryInputType>
|
||||
transform_output_ref_wrapper &operator=(const UnaryInputType &e) {
|
||||
__my_reference_ = __my_unary_func_(e);
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
// Unary functor to create a transform_output_reference_wrapper when a
|
||||
// transform_iterator is dereferenced, so that a
|
||||
// the supplied unary function may be applied on write, resulting in a
|
||||
// transform_output_iterator
|
||||
template <typename _UnaryFunc> struct _Unary_Out {
|
||||
_Unary_Out(_UnaryFunc __f_) : __f(__f_) {}
|
||||
_UnaryFunc __f;
|
||||
template <typename T> auto operator()(T &&val) const {
|
||||
return transform_output_ref_wrapper<T, _UnaryFunc>(std::forward<T>(val),
|
||||
__f);
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
using std::advance;
|
||||
|
||||
using std::distance;
|
||||
|
||||
template <typename T>
|
||||
oneapi::dpl::counting_iterator<T> make_counting_iterator(const T &input) {
|
||||
return oneapi::dpl::counting_iterator<T>(input);
|
||||
}
|
||||
|
||||
template <typename _Tp> class constant_iterator {
|
||||
public:
|
||||
typedef std::false_type is_hetero;
|
||||
typedef std::true_type is_passed_directly;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef _Tp value_type;
|
||||
typedef _Tp *pointer;
|
||||
// There is no storage behind the iterator, so we return a value instead of
|
||||
// reference.
|
||||
typedef const _Tp reference;
|
||||
typedef const _Tp const_reference;
|
||||
typedef std::random_access_iterator_tag iterator_category;
|
||||
|
||||
explicit constant_iterator(_Tp __init)
|
||||
: __my_value_(__init), __my_counter_(0) {}
|
||||
|
||||
private:
|
||||
// used to construct iterator instances with different counter values required
|
||||
// by arithmetic operators
|
||||
constant_iterator(const _Tp &__value, const difference_type &__offset)
|
||||
: __my_value_(__value), __my_counter_(__offset) {}
|
||||
|
||||
public:
|
||||
// non-const variants of access operators are not provided so unintended
|
||||
// writes are caught at compile time.
|
||||
const_reference operator*() const { return __my_value_; }
|
||||
const_reference operator[](difference_type) const { return __my_value_; }
|
||||
|
||||
difference_type operator-(const constant_iterator &__it) const {
|
||||
return __my_counter_ - __it.__my_counter_;
|
||||
}
|
||||
|
||||
constant_iterator &operator+=(difference_type __forward) {
|
||||
__my_counter_ += __forward;
|
||||
return *this;
|
||||
}
|
||||
constant_iterator &operator-=(difference_type __backward) {
|
||||
return *this += -__backward;
|
||||
}
|
||||
constant_iterator &operator++() { return *this += 1; }
|
||||
constant_iterator &operator--() { return *this -= 1; }
|
||||
|
||||
constant_iterator operator++(int) {
|
||||
constant_iterator __it(*this);
|
||||
++(*this);
|
||||
return __it;
|
||||
}
|
||||
constant_iterator operator--(int) {
|
||||
constant_iterator __it(*this);
|
||||
--(*this);
|
||||
return __it;
|
||||
}
|
||||
|
||||
constant_iterator operator-(difference_type __backward) const {
|
||||
return constant_iterator(__my_value_, __my_counter_ - __backward);
|
||||
}
|
||||
constant_iterator operator+(difference_type __forward) const {
|
||||
return constant_iterator(__my_value_, __my_counter_ + __forward);
|
||||
}
|
||||
friend constant_iterator operator+(difference_type __forward,
|
||||
const constant_iterator __it) {
|
||||
return __it + __forward;
|
||||
}
|
||||
|
||||
bool operator==(const constant_iterator &__it) const {
|
||||
return __my_value_ == __it.__my_value_ &&
|
||||
this->__my_counter_ == __it.__my_counter_;
|
||||
}
|
||||
bool operator!=(const constant_iterator &__it) const {
|
||||
return !(*this == __it);
|
||||
}
|
||||
bool operator<(const constant_iterator &__it) const {
|
||||
return *this - __it < 0;
|
||||
}
|
||||
bool operator>(const constant_iterator &__it) const { return __it < *this; }
|
||||
bool operator<=(const constant_iterator &__it) const {
|
||||
return !(*this > __it);
|
||||
}
|
||||
bool operator>=(const constant_iterator &__it) const {
|
||||
return !(*this < __it);
|
||||
}
|
||||
|
||||
private:
|
||||
_Tp __my_value_;
|
||||
uint64_t __my_counter_;
|
||||
};
|
||||
|
||||
template <typename _Tp>
|
||||
constant_iterator<_Tp> make_constant_iterator(_Tp __value) {
|
||||
return constant_iterator<_Tp>(__value);
|
||||
}
|
||||
|
||||
// key_value_pair class to represent a key and value, specifically a
|
||||
// dereferenced arg_index_input_iterator
|
||||
template <typename _KeyTp, typename _ValueTp> class key_value_pair {
|
||||
public:
|
||||
key_value_pair() = default;
|
||||
|
||||
key_value_pair(const _KeyTp &_key, const _ValueTp &_value)
|
||||
: key(_key), value(_value) {}
|
||||
|
||||
bool operator==(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
|
||||
return (key == _kvp.key) && (value == _kvp.value);
|
||||
}
|
||||
|
||||
bool operator!=(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
|
||||
return (key != _kvp.key) || (value != _kvp.value);
|
||||
}
|
||||
|
||||
_KeyTp key;
|
||||
_ValueTp value;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename KeyTp, typename _ValueTp> struct make_key_value_pair {
|
||||
template <typename ValRefTp>
|
||||
key_value_pair<KeyTp, _ValueTp>
|
||||
operator()(const oneapi::dpl::__internal::tuple<KeyTp, ValRefTp> &tup) const {
|
||||
return ::dpct::key_value_pair<KeyTp, _ValueTp>(::std::get<0>(tup),
|
||||
::std::get<1>(tup));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T> struct __zip_iterator_impl;
|
||||
template <class... Ts> struct __zip_iterator_impl<std::tuple<Ts...>> {
|
||||
using type = oneapi::dpl::zip_iterator<Ts...>;
|
||||
};
|
||||
|
||||
} // end namespace detail
|
||||
|
||||
// dpct::zip_iterator can only accept std::tuple type as template argument for
|
||||
// compatibility purpose. Please use oneapi::dpl::zip_iterator if you want to
|
||||
// pass iterator's types directly.
|
||||
template <typename... Ts>
|
||||
using zip_iterator = typename detail::__zip_iterator_impl<Ts...>::type;
|
||||
|
||||
// arg_index_input_iterator is an iterator over a input iterator, with a index.
|
||||
// When dereferenced, it returns a key_value_pair, which can be interrogated for
|
||||
// the index key or the value from the input iterator
|
||||
template <typename InputIteratorT, typename OffsetT = ptrdiff_t,
|
||||
typename OutputValueT =
|
||||
typename ::std::iterator_traits<InputIteratorT>::value_type>
|
||||
class arg_index_input_iterator
|
||||
: public oneapi::dpl::transform_iterator<
|
||||
oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
|
||||
InputIteratorT>,
|
||||
detail::make_key_value_pair<OffsetT, OutputValueT>> {
|
||||
using arg_index_input_iterator_wrap = oneapi::dpl::transform_iterator<
|
||||
oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
|
||||
InputIteratorT>,
|
||||
detail::make_key_value_pair<OffsetT, OutputValueT>>;
|
||||
|
||||
public:
|
||||
typedef OffsetT difference_type;
|
||||
|
||||
// signal to __get_sycl_range that this iterator is as a direct pass iterator
|
||||
using is_zip = ::std::true_type;
|
||||
|
||||
arg_index_input_iterator(const arg_index_input_iterator_wrap &__arg_wrap)
|
||||
: arg_index_input_iterator_wrap(__arg_wrap) {}
|
||||
arg_index_input_iterator(InputIteratorT __iter)
|
||||
: arg_index_input_iterator_wrap(
|
||||
oneapi::dpl::make_zip_iterator(
|
||||
oneapi::dpl::counting_iterator(OffsetT(0)), __iter),
|
||||
detail::make_key_value_pair<OffsetT, OutputValueT>()) {}
|
||||
|
||||
arg_index_input_iterator &operator=(const arg_index_input_iterator &__input) {
|
||||
arg_index_input_iterator_wrap::operator=(__input);
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator &operator++() {
|
||||
arg_index_input_iterator_wrap::operator++();
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator &operator--() {
|
||||
arg_index_input_iterator_wrap::operator--();
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator operator++(int) {
|
||||
arg_index_input_iterator __it(*this);
|
||||
++(*this);
|
||||
return __it;
|
||||
}
|
||||
arg_index_input_iterator operator--(int) {
|
||||
arg_index_input_iterator __it(*this);
|
||||
--(*this);
|
||||
return __it;
|
||||
}
|
||||
arg_index_input_iterator operator+(difference_type __forward) const {
|
||||
return arg_index_input_iterator(
|
||||
arg_index_input_iterator_wrap::operator+(__forward));
|
||||
}
|
||||
arg_index_input_iterator operator-(difference_type __backward) const {
|
||||
return arg_index_input_iterator(
|
||||
arg_index_input_iterator_wrap::operator-(__backward));
|
||||
}
|
||||
arg_index_input_iterator &operator+=(difference_type __forward) {
|
||||
arg_index_input_iterator_wrap::operator+=(__forward);
|
||||
return *this;
|
||||
}
|
||||
arg_index_input_iterator &operator-=(difference_type __backward) {
|
||||
arg_index_input_iterator_wrap::operator-=(__backward);
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend arg_index_input_iterator
|
||||
operator+(difference_type __forward, const arg_index_input_iterator &__it) {
|
||||
return __it + __forward;
|
||||
}
|
||||
|
||||
difference_type operator-(const arg_index_input_iterator &__it) const {
|
||||
return arg_index_input_iterator_wrap::operator-(__it);
|
||||
}
|
||||
bool operator==(const arg_index_input_iterator &__it) const {
|
||||
return arg_index_input_iterator_wrap::operator==(__it);
|
||||
}
|
||||
bool operator!=(const arg_index_input_iterator &__it) const {
|
||||
return !(*this == __it);
|
||||
}
|
||||
bool operator<(const arg_index_input_iterator &__it) const {
|
||||
return *this - __it < 0;
|
||||
}
|
||||
bool operator>(const arg_index_input_iterator &__it) const {
|
||||
return __it < *this;
|
||||
}
|
||||
bool operator<=(const arg_index_input_iterator &__it) const {
|
||||
return !(*this > __it);
|
||||
}
|
||||
bool operator>=(const arg_index_input_iterator &__it) const {
|
||||
return !(*this < __it);
|
||||
}
|
||||
|
||||
// returns an arg_index_input_iterator with the same iter position, but a
|
||||
// count reset to 0
|
||||
arg_index_input_iterator create_normalized() {
|
||||
return arg_index_input_iterator(
|
||||
::std::get<1>(arg_index_input_iterator_wrap::base().base()));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename IterT> struct io_iterator_pair {
|
||||
inline io_iterator_pair() : selector(false) {}
|
||||
|
||||
inline io_iterator_pair(const IterT &first, const IterT &second)
|
||||
: selector(false) {
|
||||
iter[0] = first;
|
||||
iter[1] = second;
|
||||
}
|
||||
|
||||
inline IterT first() const { return selector ? iter[1] : iter[0]; }
|
||||
|
||||
inline IterT second() const { return selector ? iter[0] : iter[1]; }
|
||||
|
||||
inline void swap() { selector = !selector; }
|
||||
|
||||
bool selector;
|
||||
|
||||
IterT iter[2];
|
||||
};
|
||||
|
||||
template <typename _Iter, typename _UnaryFunc>
|
||||
auto make_transform_output_iterator(_Iter __it, _UnaryFunc __unary_func) {
|
||||
return oneapi::dpl::transform_iterator(
|
||||
__it, internal::_Unary_Out<_UnaryFunc>(__unary_func));
|
||||
}
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load diff
|
@ -1,32 +0,0 @@
|
|||
//==---- numeric.h --------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_NUMERIC_H__
|
||||
#define __DPCT_NUMERIC_H__
|
||||
|
||||
namespace dpct {
|
||||
|
||||
template <typename Policy, typename InputIt1, typename InputIt2, typename T>
|
||||
T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
|
||||
InputIt2 first2, T init) {
|
||||
return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
|
||||
first2, init);
|
||||
}
|
||||
|
||||
template <typename Policy, typename InputIt1, typename InputIt2, typename T,
|
||||
typename BinaryOperation1, typename BinaryOperation2>
|
||||
T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
|
||||
InputIt2 first2, T init, BinaryOperation1 op1,
|
||||
BinaryOperation2 op2) {
|
||||
return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
|
||||
first2, init, op1, op2);
|
||||
}
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,752 +0,0 @@
|
|||
//==---- vector.h ---------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_VECTOR_H__
|
||||
#define __DPCT_VECTOR_H__
|
||||
|
||||
#include <oneapi/dpl/algorithm>
|
||||
#include <oneapi/dpl/execution>
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include "memory.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
#include "../device.hpp"
|
||||
|
||||
namespace dpct {
|
||||
|
||||
namespace internal {
|
||||
template <typename Iter, typename Void = void> // for non-iterators
|
||||
struct is_iterator : std::false_type {};
|
||||
|
||||
template <typename Iter> // For iterators
|
||||
struct is_iterator<
|
||||
Iter,
|
||||
typename std::enable_if<
|
||||
!std::is_void<typename Iter::iterator_category>::value, void>::type>
|
||||
: std::true_type {};
|
||||
|
||||
template <typename T> // For pointers
|
||||
struct is_iterator<T *> : std::true_type {};
|
||||
} // end namespace internal
|
||||
|
||||
#ifndef DPCT_USM_LEVEL_NONE
|
||||
|
||||
template <typename T,
|
||||
typename Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>>
|
||||
class device_vector {
|
||||
public:
|
||||
using iterator = device_iterator<T>;
|
||||
using const_iterator = const iterator;
|
||||
using reference = device_reference<T>;
|
||||
using const_reference = const reference;
|
||||
using value_type = T;
|
||||
using pointer = T *;
|
||||
using const_pointer = const T *;
|
||||
using difference_type =
|
||||
typename ::std::iterator_traits<iterator>::difference_type;
|
||||
using size_type = ::std::size_t;
|
||||
|
||||
private:
|
||||
Allocator _alloc;
|
||||
size_type _size;
|
||||
size_type _capacity;
|
||||
pointer _storage;
|
||||
|
||||
size_type _min_capacity() const { return size_type(1); }
|
||||
|
||||
void _set_capacity_and_alloc() {
|
||||
_capacity = ::std::max(_size * 2, _min_capacity());
|
||||
_storage = _alloc.allocate(_capacity);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename OtherA> operator ::std::vector<T, OtherA>() const {
|
||||
auto __tmp = ::std::vector<T, OtherA>(this->size());
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
this->begin(), this->end(), __tmp.begin());
|
||||
return __tmp;
|
||||
}
|
||||
device_vector()
|
||||
: _alloc(get_default_queue()), _size(0), _capacity(_min_capacity()) {
|
||||
_set_capacity_and_alloc();
|
||||
}
|
||||
~device_vector() /*= default*/ { _alloc.deallocate(_storage, _capacity); };
|
||||
explicit device_vector(size_type n) : device_vector(n, T()) {}
|
||||
explicit device_vector(size_type n, const T &value)
|
||||
: _alloc(get_default_queue()), _size(n) {
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), end(), T(value));
|
||||
}
|
||||
}
|
||||
device_vector(const device_vector &other) : _alloc(get_default_queue()) {
|
||||
_size = other.size();
|
||||
_capacity = other.capacity();
|
||||
_storage = _alloc.allocate(_capacity);
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
other.begin(), other.end(), begin());
|
||||
}
|
||||
device_vector(device_vector &&other)
|
||||
: _alloc(get_default_queue()), _size(other.size()),
|
||||
_capacity(other.capacity()), _storage(other._storage) {
|
||||
other._size = 0;
|
||||
other._capacity = 0;
|
||||
other._storage = nullptr;
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename ::std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!::std::is_pointer<InputIterator>::value &&
|
||||
::std::is_same<typename ::std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
::std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _alloc(get_default_queue()) {
|
||||
_size = ::std::distance(first, last);
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, begin());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename ::std::enable_if<::std::is_pointer<InputIterator>::value,
|
||||
InputIterator>::type last)
|
||||
: _alloc(get_default_queue()) {
|
||||
_size = ::std::distance(first, last);
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
auto ptr_type = sycl::get_pointer_type(first, get_default_context());
|
||||
if (ptr_type != sycl::usm::alloc::host &&
|
||||
ptr_type != sycl::usm::alloc::unknown) {
|
||||
::std::copy(
|
||||
oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, begin());
|
||||
} else {
|
||||
sycl::buffer<typename ::std::iterator_traits<InputIterator>::value_type,
|
||||
1>
|
||||
buf(first, last);
|
||||
auto buf_first = oneapi::dpl::begin(buf);
|
||||
auto buf_last = oneapi::dpl::end(buf);
|
||||
::std::copy(
|
||||
oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
buf_first, buf_last, begin());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename ::std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!::std::is_pointer<InputIterator>::value &&
|
||||
!::std::is_same<typename ::std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
::std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _alloc(get_default_queue()), _size(::std::distance(first, last)) {
|
||||
_set_capacity_and_alloc();
|
||||
::std::vector<T> _tmp(first, last);
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
_tmp.begin(), _tmp.end(), this->begin());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(const device_vector<T, OtherAllocator> &v)
|
||||
: _alloc(get_default_queue()), _storage(v.real_begin()), _size(v.size()),
|
||||
_capacity(v.capacity()) {}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(::std::vector<T, OtherAllocator> &v)
|
||||
: _alloc(get_default_queue()), _size(v.size()) {
|
||||
_set_capacity_and_alloc();
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
v.begin(), v.end(), this->begin());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector &operator=(const ::std::vector<T, OtherAllocator> &v) {
|
||||
resize(v.size());
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
v.begin(), v.end(), begin());
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
device_vector &operator=(const device_vector &other) {
|
||||
// Copy assignment operator:
|
||||
resize(other.size());
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
other.begin(), other.end(), begin());
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
device_vector &operator=(device_vector &&other) {
|
||||
// Move assignment operator:
|
||||
device_vector dummy(::std::move(other));
|
||||
this->swap(dummy);
|
||||
return *this;
|
||||
}
|
||||
size_type size() const { return _size; }
|
||||
iterator begin() noexcept { return device_iterator<T>(_storage, 0); }
|
||||
iterator end() { return device_iterator<T>(_storage, size()); }
|
||||
const_iterator begin() const noexcept {
|
||||
return device_iterator<T>(_storage, 0);
|
||||
}
|
||||
const_iterator cbegin() const noexcept { return begin(); }
|
||||
const_iterator end() const { return device_iterator<T>(_storage, size()); }
|
||||
const_iterator cend() const { return end(); }
|
||||
T *real_begin() { return _storage; }
|
||||
const T *real_begin() const { return _storage; }
|
||||
void swap(device_vector &v) {
|
||||
::std::swap(_size, v._size);
|
||||
::std::swap(_capacity, v._capacity);
|
||||
::std::swap(_storage, v._storage);
|
||||
::std::swap(_alloc, v._alloc);
|
||||
}
|
||||
reference operator[](size_type n) { return _storage[n]; }
|
||||
const_reference operator[](size_type n) const { return _storage[n]; }
|
||||
void reserve(size_type n) {
|
||||
if (n > capacity()) {
|
||||
// allocate buffer for new size
|
||||
auto tmp = _alloc.allocate(2 * n);
|
||||
// copy content (old buffer to new buffer)
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), end(), tmp);
|
||||
// deallocate old memory
|
||||
_alloc.deallocate(_storage, _capacity);
|
||||
_storage = tmp;
|
||||
_capacity = 2 * n;
|
||||
}
|
||||
}
|
||||
void resize(size_type new_size, const T &x = T()) {
|
||||
reserve(new_size);
|
||||
if (_size < new_size) {
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin() + _size, begin() + new_size, x);
|
||||
}
|
||||
_size = new_size;
|
||||
}
|
||||
size_type max_size(void) const {
|
||||
return ::std::numeric_limits<size_type>::max() / sizeof(T);
|
||||
}
|
||||
size_type capacity() const { return _capacity; }
|
||||
const_reference front() const { return *begin(); }
|
||||
reference front() { return *begin(); }
|
||||
const_reference back(void) const { return *(end() - 1); }
|
||||
reference back(void) { return *(end() - 1); }
|
||||
pointer data(void) { return _storage; }
|
||||
const_pointer data(void) const { return _storage; }
|
||||
void shrink_to_fit(void) {
|
||||
if (_size != capacity()) {
|
||||
size_type tmp_capacity = ::std::max(_size, _min_capacity());
|
||||
auto tmp = _alloc.allocate(tmp_capacity);
|
||||
if (_size > 0) {
|
||||
::std::copy(
|
||||
oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), end(), tmp);
|
||||
}
|
||||
_alloc.deallocate(_storage, _capacity);
|
||||
_storage = tmp;
|
||||
_capacity = tmp_capacity;
|
||||
}
|
||||
}
|
||||
void assign(size_type n, const T &x) {
|
||||
resize(n);
|
||||
if (_size > 0) {
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
begin(), begin() + n, x);
|
||||
}
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
assign(InputIterator first,
|
||||
typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = ::std::distance(first, last);
|
||||
resize(n);
|
||||
if (_size > 0) {
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, begin());
|
||||
}
|
||||
}
|
||||
void clear(void) { _size = 0; }
|
||||
bool empty(void) const { return (size() == 0); }
|
||||
void push_back(const T &x) { insert(end(), size_type(1), x); }
|
||||
void pop_back(void) {
|
||||
if (_size > 0)
|
||||
--_size;
|
||||
}
|
||||
iterator erase(iterator first, iterator last) {
|
||||
auto n = ::std::distance(first, last);
|
||||
if (last == end()) {
|
||||
_size = _size - n;
|
||||
return end();
|
||||
}
|
||||
auto m = ::std::distance(last, end());
|
||||
if (m <= 0) {
|
||||
return end();
|
||||
}
|
||||
auto tmp = _alloc.allocate(m);
|
||||
// copy remainder to temporary buffer.
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
last, end(), tmp);
|
||||
// override (erase) subsequence in storage.
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
tmp, tmp + m, first);
|
||||
_alloc.deallocate(tmp, m);
|
||||
_size -= n;
|
||||
return begin() + first.get_idx() + n;
|
||||
}
|
||||
iterator erase(iterator pos) { return erase(pos, pos + 1); }
|
||||
iterator insert(iterator position, const T &x) {
|
||||
auto n = ::std::distance(begin(), position);
|
||||
insert(position, size_type(1), x);
|
||||
return begin() + n;
|
||||
}
|
||||
void insert(iterator position, size_type n, const T &x) {
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
end() - n, end(), x);
|
||||
} else {
|
||||
auto i_n = ::std::distance(begin(), position);
|
||||
// allocate temporary storage
|
||||
auto m = ::std::distance(position, end());
|
||||
// will throw if position is not inside active vector
|
||||
auto tmp = _alloc.allocate(m);
|
||||
// copy remainder
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
position, end(), tmp);
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
position, position + n, x);
|
||||
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
tmp, tmp + m, position + n);
|
||||
_alloc.deallocate(tmp, m);
|
||||
}
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
insert(iterator position, InputIterator first,
|
||||
typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = ::std::distance(first, last);
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, end());
|
||||
} else {
|
||||
auto m = ::std::distance(position, end());
|
||||
// will throw if position is not inside active vector
|
||||
auto tmp = _alloc.allocate(m);
|
||||
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
position, end(), tmp);
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, position);
|
||||
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
tmp, tmp + m, position + n);
|
||||
_alloc.deallocate(tmp, m);
|
||||
}
|
||||
}
|
||||
Allocator get_allocator() const { return _alloc; }
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
template <typename T, typename Allocator = detail::__buffer_allocator<T>>
|
||||
class device_vector {
|
||||
static_assert(
|
||||
std::is_same<Allocator, detail::__buffer_allocator<T>>::value,
|
||||
"device_vector doesn't support custom allocator when USM is not used.");
|
||||
|
||||
public:
|
||||
using iterator = device_iterator<T>;
|
||||
using const_iterator = const iterator;
|
||||
using reference = device_reference<T>;
|
||||
using const_reference = const reference;
|
||||
using value_type = T;
|
||||
using pointer = T *;
|
||||
using const_pointer = const T *;
|
||||
using difference_type =
|
||||
typename std::iterator_traits<iterator>::difference_type;
|
||||
using size_type = std::size_t;
|
||||
|
||||
private:
|
||||
using Buffer = sycl::buffer<T, 1>;
|
||||
using Range = sycl::range<1>;
|
||||
// Using mem_mgr to handle memory allocation
|
||||
void *_storage;
|
||||
size_type _size;
|
||||
|
||||
size_type _min_capacity() const { return size_type(1); }
|
||||
|
||||
void *alloc_store(size_type num_bytes) {
|
||||
return detail::mem_mgr::instance().mem_alloc(num_bytes);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename OtherA> operator std::vector<T, OtherA>() const {
|
||||
auto __tmp = std::vector<T, OtherA>(this->size());
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, this->begin(), this->end(),
|
||||
__tmp.begin());
|
||||
return __tmp;
|
||||
}
|
||||
device_vector()
|
||||
: _storage(alloc_store(_min_capacity() * sizeof(T))), _size(0) {}
|
||||
~device_vector() = default;
|
||||
explicit device_vector(size_type n) : device_vector(n, T()) {}
|
||||
explicit device_vector(size_type n, const T &value)
|
||||
: _storage(alloc_store(std::max(n, _min_capacity()) * sizeof(T))),
|
||||
_size(n) {
|
||||
auto buf = get_buffer();
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(buf),
|
||||
oneapi::dpl::begin(buf) + n, T(value));
|
||||
}
|
||||
device_vector(const device_vector &other)
|
||||
: _storage(other._storage), _size(other.size()) {}
|
||||
device_vector(device_vector &&other)
|
||||
: _storage(std::move(other._storage)), _size(other.size()) {}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!std::is_pointer<InputIterator>::value &&
|
||||
std::is_same<typename std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
|
||||
_size(std::distance(first, last)) {
|
||||
auto buf = get_buffer();
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
first, last, dst);
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename std::enable_if<std::is_pointer<InputIterator>::value,
|
||||
InputIterator>::type last)
|
||||
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
|
||||
_size(std::distance(first, last)) {
|
||||
auto buf = get_buffer();
|
||||
Buffer tmp_buf(first, last);
|
||||
auto start = oneapi::dpl::begin(tmp_buf);
|
||||
auto end = oneapi::dpl::end(tmp_buf);
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
start, end, dst);
|
||||
}
|
||||
|
||||
template <typename InputIterator>
|
||||
device_vector(InputIterator first,
|
||||
typename std::enable_if<
|
||||
internal::is_iterator<InputIterator>::value &&
|
||||
!std::is_same<typename std::iterator_traits<
|
||||
InputIterator>::iterator_category,
|
||||
std::random_access_iterator_tag>::value,
|
||||
InputIterator>::type last)
|
||||
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
|
||||
_size(std::distance(first, last)) {
|
||||
auto buf = get_buffer();
|
||||
std::vector<T> tmp(first, last);
|
||||
Buffer tmp_buf(tmp);
|
||||
auto start = oneapi::dpl::begin(tmp_buf);
|
||||
auto end = oneapi::dpl::end(tmp_buf);
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
start, end, dst);
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(const device_vector<T, OtherAllocator> &v)
|
||||
: _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
|
||||
auto buf = get_buffer();
|
||||
auto dst = oneapi::dpl::begin(buf);
|
||||
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
|
||||
v.real_begin(), v.real_begin() + v.size(), dst);
|
||||
}
|
||||
|
||||
template <typename OtherAllocator>
|
||||
device_vector(std::vector<T, OtherAllocator> &v)
|
||||
: _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, v.begin(), v.end(),
|
||||
oneapi::dpl::begin(get_buffer()));
|
||||
}
|
||||
|
||||
device_vector &operator=(const device_vector &other) {
|
||||
// Copy assignment operator:
|
||||
_size = other.size();
|
||||
void *tmp = alloc_store(_size * sizeof(T));
|
||||
auto tmp_buf =
|
||||
detail::mem_mgr::instance()
|
||||
.translate_ptr(tmp)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(other.get_buffer()),
|
||||
oneapi::dpl::end(other.get_buffer()),
|
||||
oneapi::dpl::begin(tmp_buf));
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = tmp;
|
||||
return *this;
|
||||
}
|
||||
device_vector &operator=(device_vector &&other) {
|
||||
// Move assignment operator:
|
||||
_size = other.size();
|
||||
this->_storage = std::move(other._storage);
|
||||
return *this;
|
||||
}
|
||||
template <typename OtherAllocator>
|
||||
device_vector &operator=(const std::vector<T, OtherAllocator> &v) {
|
||||
Buffer data(v.begin(), v.end());
|
||||
_size = v.size();
|
||||
void *tmp = alloc_store(_size * sizeof(T));
|
||||
auto tmp_buf =
|
||||
detail::mem_mgr::instance()
|
||||
.translate_ptr(tmp)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(data),
|
||||
oneapi::dpl::end(data), oneapi::dpl::begin(tmp_buf));
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = tmp;
|
||||
|
||||
return *this;
|
||||
}
|
||||
Buffer get_buffer() const {
|
||||
return detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(capacity()));
|
||||
}
|
||||
size_type size() const { return _size; }
|
||||
iterator begin() noexcept { return device_iterator<T>(get_buffer(), 0); }
|
||||
iterator end() { return device_iterator<T>(get_buffer(), _size); }
|
||||
const_iterator begin() const noexcept {
|
||||
return device_iterator<T>(get_buffer(), 0);
|
||||
}
|
||||
const_iterator cbegin() const noexcept { return begin(); }
|
||||
const_iterator end() const { return device_iterator<T>(get_buffer(), _size); }
|
||||
const_iterator cend() const { return end(); }
|
||||
T *real_begin() {
|
||||
return (detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.template get_access<sycl::access_mode::read_write>())
|
||||
.get_pointer();
|
||||
}
|
||||
const T *real_begin() const {
|
||||
return const_cast<device_vector *>(this)
|
||||
->detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.template get_access<sycl::access_mode::read_write>()
|
||||
.get_pointer();
|
||||
}
|
||||
void swap(device_vector &v) {
|
||||
void *temp = v._storage;
|
||||
v._storage = this->_storage;
|
||||
this->_storage = temp;
|
||||
std::swap(_size, v._size);
|
||||
}
|
||||
reference operator[](size_type n) { return *(begin() + n); }
|
||||
const_reference operator[](size_type n) const { return *(begin() + n); }
|
||||
void reserve(size_type n) {
|
||||
if (n > capacity()) {
|
||||
// create new buffer (allocate for new size)
|
||||
void *a = alloc_store(n * sizeof(T));
|
||||
|
||||
// copy content (old buffer to new buffer)
|
||||
if (_storage != nullptr) {
|
||||
auto tmp = detail::mem_mgr::instance()
|
||||
.translate_ptr(a)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(n));
|
||||
auto src_buf = get_buffer();
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(src_buf), oneapi::dpl::end(src_buf),
|
||||
oneapi::dpl::begin(tmp));
|
||||
|
||||
// deallocate old memory
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
}
|
||||
_storage = a;
|
||||
}
|
||||
}
|
||||
void resize(size_type new_size, const T &x = T()) {
|
||||
reserve(new_size);
|
||||
if (_size < new_size) {
|
||||
auto src_buf = get_buffer();
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(src_buf) + _size,
|
||||
oneapi::dpl::begin(src_buf) + new_size, x);
|
||||
}
|
||||
_size = new_size;
|
||||
}
|
||||
size_type max_size(void) const {
|
||||
return std::numeric_limits<size_type>::max() / sizeof(T);
|
||||
}
|
||||
size_type capacity() const {
|
||||
return _storage != nullptr ? detail::mem_mgr::instance()
|
||||
.translate_ptr(_storage)
|
||||
.buffer.size() /
|
||||
sizeof(T)
|
||||
: 0;
|
||||
}
|
||||
const_reference front() const { return *begin(); }
|
||||
reference front() { return *begin(); }
|
||||
const_reference back(void) const { return *(end() - 1); }
|
||||
reference back(void) { return *(end() - 1); }
|
||||
pointer data(void) { return reinterpret_cast<pointer>(_storage); }
|
||||
const_pointer data(void) const {
|
||||
return reinterpret_cast<const_pointer>(_storage);
|
||||
}
|
||||
void shrink_to_fit(void) {
|
||||
if (_size != capacity()) {
|
||||
void *a = alloc_store(_size * sizeof(T));
|
||||
auto tmp = detail::mem_mgr::instance()
|
||||
.translate_ptr(a)
|
||||
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default,
|
||||
oneapi::dpl::begin(get_buffer()),
|
||||
oneapi::dpl::begin(get_buffer()) + _size,
|
||||
oneapi::dpl::begin(tmp));
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = a;
|
||||
}
|
||||
}
|
||||
void assign(size_type n, const T &x) {
|
||||
resize(n);
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, begin(), begin() + n, x);
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
assign(InputIterator first,
|
||||
typename std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = std::distance(first, last);
|
||||
resize(n);
|
||||
if (internal::is_iterator<InputIterator>::value &&
|
||||
!std::is_pointer<InputIterator>::value)
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, begin());
|
||||
else {
|
||||
Buffer tmp(first, last);
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), begin());
|
||||
}
|
||||
}
|
||||
void clear(void) {
|
||||
_size = 0;
|
||||
detail::mem_mgr::instance().mem_free(_storage);
|
||||
_storage = nullptr;
|
||||
}
|
||||
bool empty(void) const { return (size() == 0); }
|
||||
void push_back(const T &x) { insert(end(), size_type(1), x); }
|
||||
void pop_back(void) {
|
||||
if (_size > 0)
|
||||
--_size;
|
||||
}
|
||||
iterator erase(iterator first, iterator last) {
|
||||
auto n = std::distance(first, last);
|
||||
if (last == end()) {
|
||||
_size = _size - n;
|
||||
return end();
|
||||
}
|
||||
Buffer tmp{Range(std::distance(last, end()))};
|
||||
// copy remainder to temporary buffer.
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, last, end(),
|
||||
oneapi::dpl::begin(tmp));
|
||||
// override (erase) subsequence in storage.
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), first);
|
||||
resize(_size - n);
|
||||
return begin() + first.get_idx() + n;
|
||||
}
|
||||
iterator erase(iterator pos) { return erase(pos, pos + 1); }
|
||||
iterator insert(iterator position, const T &x) {
|
||||
auto n = std::distance(begin(), position);
|
||||
insert(position, size_type(1), x);
|
||||
return begin() + n;
|
||||
}
|
||||
void insert(iterator position, size_type n, const T &x) {
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, end() - n, end(), x);
|
||||
} else {
|
||||
auto i_n = std::distance(begin(), position);
|
||||
// allocate temporary storage
|
||||
Buffer tmp{Range(std::distance(position, end()))};
|
||||
// copy remainder
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
|
||||
oneapi::dpl::begin(tmp));
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
std::fill(oneapi::dpl::execution::dpcpp_default, position, position + n,
|
||||
x);
|
||||
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), position + n);
|
||||
}
|
||||
}
|
||||
template <typename InputIterator>
|
||||
void
|
||||
insert(iterator position, InputIterator first,
|
||||
typename std::enable_if<internal::is_iterator<InputIterator>::value,
|
||||
InputIterator>::type last) {
|
||||
auto n = std::distance(first, last);
|
||||
if (position == end()) {
|
||||
resize(size() + n);
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, end());
|
||||
} else {
|
||||
Buffer tmp{Range(std::distance(position, end()))};
|
||||
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
|
||||
oneapi::dpl::begin(tmp));
|
||||
|
||||
resize(size() + n);
|
||||
// resizing might invalidate position
|
||||
position = begin() + position.get_idx();
|
||||
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, position);
|
||||
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
|
||||
oneapi::dpl::end(tmp), position + n);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace dpct
|
||||
|
||||
#endif
|
|
@ -1,26 +0,0 @@
|
|||
//==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_DPL_UTILS_HPP__
|
||||
#define __DPCT_DPL_UTILS_HPP__
|
||||
|
||||
#define ONEDPL_USE_DPCPP_BACKEND 1
|
||||
#define __USE_DPCT 1
|
||||
|
||||
#include <oneapi/dpl/execution>
|
||||
#include <oneapi/dpl/algorithm>
|
||||
#include <oneapi/dpl/numeric>
|
||||
|
||||
#include "dpl_extras/memory.h"
|
||||
#include "dpl_extras/algorithm.h"
|
||||
#include "dpl_extras/numeric.h"
|
||||
#include "dpl_extras/iterators.h"
|
||||
#include "dpl_extras/vector.h"
|
||||
#include "dpl_extras/dpcpp_extensions.h"
|
||||
|
||||
#endif // __DPCT_DPL_UTILS_HPP__
|
1376
dpct/fft_utils.hpp
1376
dpct/fft_utils.hpp
File diff suppressed because it is too large
Load diff
901
dpct/image.hpp
901
dpct/image.hpp
|
@ -1,901 +0,0 @@
|
|||
//==---- image.hpp --------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_IMAGE_HPP__
|
||||
#define __DPCT_IMAGE_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include "memory.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
namespace dpct {
|
||||
|
||||
enum class image_channel_data_type {
|
||||
signed_int,
|
||||
unsigned_int,
|
||||
fp,
|
||||
};
|
||||
|
||||
class image_channel;
|
||||
class image_wrapper_base;
|
||||
namespace detail {
|
||||
/// Image object type traits, with accessor type and sampled data type defined.
|
||||
/// The data type of an image accessor must be one of sycl::int4, sycl::uint4,
|
||||
/// sycl::float4 and sycl::half4. The data type of accessors with 8bits/16bits
|
||||
/// channel width will be 32 bits. sycl::half is an exception.
|
||||
template <class T> struct image_trait {
|
||||
using acc_data_t = sycl::vec<T, 4>;
|
||||
template <int dimensions>
|
||||
using accessor_t =
|
||||
sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
|
||||
sycl::access::target::image>;
|
||||
template <int dimensions>
|
||||
using array_accessor_t =
|
||||
sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
|
||||
sycl::access::target::image_array>;
|
||||
using data_t = T;
|
||||
using elem_t = T;
|
||||
static constexpr image_channel_data_type data_type =
|
||||
std::is_integral<T>::value
|
||||
? (std::is_signed<T>::value ? image_channel_data_type::signed_int
|
||||
: image_channel_data_type::unsigned_int)
|
||||
: image_channel_data_type::fp;
|
||||
static constexpr int channel_num = 1;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::uint8_t> : public image_trait<std::uint32_t> {
|
||||
using data_t = std::uint8_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::uint16_t>
|
||||
: public image_trait<std::uint32_t> {
|
||||
using data_t = std::uint16_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::int8_t> : public image_trait<std::int32_t> {
|
||||
using data_t = std::int8_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<std::int16_t> : public image_trait<std::int32_t> {
|
||||
using data_t = std::int16_t;
|
||||
using elem_t = data_t;
|
||||
};
|
||||
template <>
|
||||
struct image_trait<char>
|
||||
: public image_trait<typename std::conditional<
|
||||
std::is_signed<char>::value, signed char, unsigned char>::type> {};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 1>> : public image_trait<T> {};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 2>> : public image_trait<T> {
|
||||
using data_t = sycl::vec<T, 2>;
|
||||
static constexpr int channel_num = 2;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 3>>
|
||||
: public image_trait<sycl::vec<T, 4>> {
|
||||
static constexpr int channel_num = 3;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct image_trait<sycl::vec<T, 4>> : public image_trait<T> {
|
||||
using data_t = sycl::vec<T, 4>;
|
||||
static constexpr int channel_num = 4;
|
||||
};
|
||||
|
||||
/// Functor to fetch data from read result of an image accessor.
|
||||
template <class T> struct fetch_data {
|
||||
using return_t = typename image_trait<T>::data_t;
|
||||
using acc_data_t = typename image_trait<T>::acc_data_t;
|
||||
|
||||
return_t operator()(acc_data_t &&original_data) {
|
||||
return (return_t)original_data.r();
|
||||
}
|
||||
};
|
||||
template <class T>
|
||||
struct fetch_data<sycl::vec<T, 1>> : public fetch_data<T> {};
|
||||
template <class T> struct fetch_data<sycl::vec<T, 2>> {
|
||||
using return_t = typename image_trait<sycl::vec<T, 2>>::data_t;
|
||||
using acc_data_t = typename image_trait<sycl::vec<T, 2>>::acc_data_t;
|
||||
|
||||
return_t operator()(acc_data_t &&origin_data) {
|
||||
return return_t(origin_data.r(), origin_data.g());
|
||||
}
|
||||
};
|
||||
template <class T>
|
||||
struct fetch_data<sycl::vec<T, 3>>
|
||||
: public fetch_data<sycl::vec<T, 4>> {};
|
||||
template <class T> struct fetch_data<sycl::vec<T, 4>> {
|
||||
using return_t = typename image_trait<sycl::vec<T, 4>>::data_t;
|
||||
using acc_data_t = typename image_trait<sycl::vec<T, 4>>::acc_data_t;
|
||||
|
||||
return_t operator()(acc_data_t &&origin_data) {
|
||||
return return_t(origin_data.r(), origin_data.g(), origin_data.b(),
|
||||
origin_data.a());
|
||||
}
|
||||
};
|
||||
|
||||
/// Create image according with given type \p T and \p dims.
|
||||
template <class T> static image_wrapper_base *create_image_wrapper(int dims);
|
||||
|
||||
/// Create image with given data type \p T, channel order and dims
|
||||
template <class T>
|
||||
static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims);
|
||||
|
||||
/// Create image with channel info and specified dimensions.
|
||||
static image_wrapper_base *create_image_wrapper(image_channel channel, int dims);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Image channel info, include channel number, order, data width and type
|
||||
class image_channel {
|
||||
image_channel_data_type _type = image_channel_data_type::signed_int;
|
||||
/// Number of channels.
|
||||
unsigned _channel_num = 0;
|
||||
/// Total size of all channels in bytes.
|
||||
unsigned _total_size = 0;
|
||||
/// Size of each channel in bytes.
|
||||
unsigned _channel_size = 0;
|
||||
|
||||
public:
|
||||
/// Create image channel info according to template argument \p T.
|
||||
template <class T> static image_channel create() {
|
||||
image_channel channel;
|
||||
channel.set_channel_size(detail::image_trait<T>::channel_num,
|
||||
sizeof(typename detail::image_trait<T>::elem_t) *
|
||||
8);
|
||||
channel.set_channel_data_type(detail::image_trait<T>::data_type);
|
||||
return channel;
|
||||
}
|
||||
|
||||
image_channel() = default;
|
||||
|
||||
image_channel_data_type get_channel_data_type() { return _type; }
|
||||
void set_channel_data_type(image_channel_data_type type) { _type = type; }
|
||||
|
||||
unsigned get_total_size() { return _total_size; }
|
||||
|
||||
unsigned get_channel_num() { return _channel_num; }
|
||||
void set_channel_num(unsigned channel_num) {
|
||||
_channel_num = channel_num;
|
||||
_total_size = _channel_size * _channel_num;
|
||||
}
|
||||
|
||||
/// image_channel constructor.
|
||||
/// \param r Channel r width in bits.
|
||||
/// \param g Channel g width in bits. Should be same with \p r, or zero.
|
||||
/// \param b Channel b width in bits. Should be same with \p g, or zero.
|
||||
/// \param a Channel a width in bits. Should be same with \p b, or zero.
|
||||
/// \param data_type Image channel data type: signed_nt, unsigned_int or fp.
|
||||
image_channel(int r, int g, int b, int a, image_channel_data_type data_type) {
|
||||
_type = data_type;
|
||||
if (a) {
|
||||
assert(r == a && "SYCL doesn't support different channel size");
|
||||
assert(r == b && "SYCL doesn't support different channel size");
|
||||
assert(r == g && "SYCL doesn't support different channel size");
|
||||
set_channel_size(4, a);
|
||||
} else if (b) {
|
||||
assert(r == b && "SYCL doesn't support different channel size");
|
||||
assert(r == g && "SYCL doesn't support different channel size");
|
||||
set_channel_size(3, b);
|
||||
} else if (g) {
|
||||
assert(r == g && "SYCL doesn't support different channel size");
|
||||
set_channel_size(2, g);
|
||||
} else {
|
||||
set_channel_size(1, r);
|
||||
}
|
||||
}
|
||||
|
||||
sycl::image_channel_type get_channel_type() const {
|
||||
if (_channel_size == 4) {
|
||||
if (_type == image_channel_data_type::signed_int)
|
||||
return sycl::image_channel_type::signed_int32;
|
||||
else if (_type == image_channel_data_type::unsigned_int)
|
||||
return sycl::image_channel_type::unsigned_int32;
|
||||
else if (_type == image_channel_data_type::fp)
|
||||
return sycl::image_channel_type::fp32;
|
||||
} else if (_channel_size == 2) {
|
||||
if (_type == image_channel_data_type::signed_int)
|
||||
return sycl::image_channel_type::signed_int16;
|
||||
else if (_type == image_channel_data_type::unsigned_int)
|
||||
return sycl::image_channel_type::unsigned_int16;
|
||||
else if (_type == image_channel_data_type::fp)
|
||||
return sycl::image_channel_type::fp16;
|
||||
} else {
|
||||
if (_type == image_channel_data_type::signed_int)
|
||||
return sycl::image_channel_type::signed_int8;
|
||||
else if (_type == image_channel_data_type::unsigned_int)
|
||||
return sycl::image_channel_type::unsigned_int8;
|
||||
}
|
||||
assert(false && "unexpected channel data kind and channel size");
|
||||
return sycl::image_channel_type::signed_int32;
|
||||
}
|
||||
void set_channel_type(sycl::image_channel_type type) {
|
||||
switch (type) {
|
||||
case sycl::image_channel_type::unsigned_int8:
|
||||
_type = image_channel_data_type::unsigned_int;
|
||||
_channel_size = 1;
|
||||
break;
|
||||
case sycl::image_channel_type::unsigned_int16:
|
||||
_type = image_channel_data_type::unsigned_int;
|
||||
_channel_size = 2;
|
||||
break;
|
||||
case sycl::image_channel_type::unsigned_int32:
|
||||
_type = image_channel_data_type::unsigned_int;
|
||||
_channel_size = 4;
|
||||
break;
|
||||
case sycl::image_channel_type::signed_int8:
|
||||
_type = image_channel_data_type::signed_int;
|
||||
_channel_size = 1;
|
||||
break;
|
||||
case sycl::image_channel_type::signed_int16:
|
||||
_type = image_channel_data_type::signed_int;
|
||||
_channel_size = 2;
|
||||
break;
|
||||
case sycl::image_channel_type::signed_int32:
|
||||
_type = image_channel_data_type::signed_int;
|
||||
_channel_size = 4;
|
||||
break;
|
||||
case sycl::image_channel_type::fp16:
|
||||
_type = image_channel_data_type::fp;
|
||||
_channel_size = 2;
|
||||
break;
|
||||
case sycl::image_channel_type::fp32:
|
||||
_type = image_channel_data_type::fp;
|
||||
_channel_size = 4;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
_total_size = _channel_size * _channel_num;
|
||||
}
|
||||
|
||||
sycl::image_channel_order get_channel_order() const {
|
||||
switch (_channel_num) {
|
||||
case 1:
|
||||
return sycl::image_channel_order::r;
|
||||
case 2:
|
||||
return sycl::image_channel_order::rg;
|
||||
case 3:
|
||||
return sycl::image_channel_order::rgb;
|
||||
case 4:
|
||||
return sycl::image_channel_order::rgba;
|
||||
default:
|
||||
return sycl::image_channel_order::r;
|
||||
}
|
||||
}
|
||||
/// Get the size for each channel in bits.
|
||||
unsigned get_channel_size() const { return _channel_size * 8; }
|
||||
|
||||
/// Set channel size.
|
||||
/// \param in_channel_num Channels number to set.
|
||||
/// \param channel_size Size for each channel in bits.
|
||||
void set_channel_size(unsigned in_channel_num,
|
||||
unsigned channel_size) {
|
||||
if (in_channel_num < _channel_num)
|
||||
return;
|
||||
_channel_num = in_channel_num;
|
||||
_channel_size = channel_size / 8;
|
||||
_total_size = _channel_size * _channel_num;
|
||||
}
|
||||
};
|
||||
|
||||
/// 2D or 3D matrix data for image.
|
||||
class image_matrix {
|
||||
image_channel _channel;
|
||||
int _range[3] = {1, 1, 1};
|
||||
int _dims = 0;
|
||||
void *_host_data = nullptr;
|
||||
|
||||
/// Set range of each dimension.
|
||||
template <int dimensions> void set_range(sycl::range<dimensions> range) {
|
||||
for (int i = 0; i < dimensions; ++i)
|
||||
_range[i] = range[i];
|
||||
_dims = dimensions;
|
||||
}
|
||||
|
||||
template <int... DimIdx>
|
||||
sycl::range<sizeof...(DimIdx)> get_range(integer_sequence<DimIdx...>) {
|
||||
return sycl::range<sizeof...(DimIdx)>(_range[DimIdx]...);
|
||||
}
|
||||
|
||||
public:
|
||||
/// Constructor with channel info and dimension size info.
|
||||
template <int dimensions>
|
||||
image_matrix(image_channel channel, sycl::range<dimensions> range)
|
||||
: _channel(channel) {
|
||||
set_range(range);
|
||||
_host_data = std::malloc(range.size() * _channel.get_total_size());
|
||||
}
|
||||
image_matrix(sycl::image_channel_type channel_type, unsigned channel_num,
|
||||
size_t x, size_t y) {
|
||||
_channel.set_channel_type(channel_type);
|
||||
_channel.set_channel_num(channel_num);
|
||||
_dims = 1;
|
||||
_range[0] = x;
|
||||
if (y) {
|
||||
_dims = 2;
|
||||
_range[1] = y;
|
||||
}
|
||||
_host_data = std::malloc(_range[0] * _range[1] * _channel.get_total_size());
|
||||
}
|
||||
|
||||
/// Construct a new image class with the matrix data.
|
||||
template <int dimensions> sycl::image<dimensions> *create_image() {
|
||||
return create_image<dimensions>(_channel);
|
||||
}
|
||||
/// Construct a new image class with the matrix data.
|
||||
template <int dimensions>
|
||||
sycl::image<dimensions> *create_image(image_channel channel) {
|
||||
return new sycl::image<dimensions>(
|
||||
_host_data, channel.get_channel_order(), channel.get_channel_type(),
|
||||
get_range(make_index_sequence<dimensions>()),
|
||||
sycl::property::image::use_host_ptr());
|
||||
}
|
||||
|
||||
/// Get channel info.
|
||||
inline image_channel get_channel() { return _channel; }
|
||||
/// Get range of the image.
|
||||
sycl::range<3> get_range() {
|
||||
return sycl::range<3>(_range[0], _range[1], _range[2]);
|
||||
}
|
||||
/// Get matrix dims.
|
||||
inline int get_dims() { return _dims; }
|
||||
/// Convert to pitched data.
|
||||
pitched_data to_pitched_data() {
|
||||
return pitched_data(_host_data, _range[0] * _channel.get_total_size(),
|
||||
_range[0], _range[1]);
|
||||
}
|
||||
|
||||
~image_matrix() {
|
||||
if (_host_data)
|
||||
std::free(_host_data);
|
||||
_host_data = nullptr;
|
||||
}
|
||||
};
|
||||
using image_matrix_p = image_matrix *;
|
||||
|
||||
enum class image_data_type { matrix, linear, pitch, unsupport };
|
||||
|
||||
/// Image data info.
|
||||
class image_data {
|
||||
public:
|
||||
image_data() { _type = image_data_type::unsupport; }
|
||||
image_data(image_matrix_p matrix_data) { set_data(matrix_data); }
|
||||
image_data(void *data_ptr, size_t x_size, image_channel channel) {
|
||||
set_data(data_ptr, x_size, channel);
|
||||
}
|
||||
image_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
|
||||
image_channel channel) {
|
||||
set_data(data_ptr, x_size, y_size, pitch_size, channel);
|
||||
}
|
||||
void set_data(image_matrix_p matrix_data) {
|
||||
_type = image_data_type::matrix;
|
||||
_data = matrix_data;
|
||||
_channel = matrix_data->get_channel();
|
||||
}
|
||||
void set_data(void *data_ptr, size_t x_size, image_channel channel) {
|
||||
_type = image_data_type::linear;
|
||||
_data = data_ptr;
|
||||
_x = x_size;
|
||||
_channel = channel;
|
||||
}
|
||||
void set_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
|
||||
image_channel channel) {
|
||||
_type = image_data_type::pitch;
|
||||
_data = data_ptr;
|
||||
_x = x_size;
|
||||
_y = y_size;
|
||||
_pitch = pitch_size;
|
||||
_channel = channel;
|
||||
}
|
||||
|
||||
// Kind of memory this image_data describes (matrix / linear / pitch).
image_data_type get_data_type() const { return _type; }
void set_data_type(image_data_type type) { _type = type; }

// Raw data pointer; its interpretation depends on get_data_type()
// (image_matrix* for matrix, device/host memory otherwise).
void *get_data_ptr() const { return _data; }
void set_data_ptr(void *data) { _data = data; }

// Size in the x dimension (bytes for linear data; see set_data overloads).
size_t get_x() const { return _x; }
void set_x(size_t x) { _x = x; }

// Height in rows; only meaningful for pitched data.
size_t get_y() const { return _y; }
void set_y(size_t y) { _y = y; }

// Row pitch in bytes; only meaningful for pitched data.
size_t get_pitch() const { return _pitch; }
void set_pitch(size_t pitch) { _pitch = pitch; }

// Channel format describing element layout.
image_channel get_channel() const { return _channel; }
void set_channel(image_channel channel) { _channel = channel; }

// Convenience pass-throughs to the stored image_channel.
image_channel_data_type get_channel_data_type() {
  return _channel.get_channel_data_type();
}
void set_channel_data_type(image_channel_data_type type) {
  _channel.set_channel_data_type(type);
}

unsigned get_channel_size() { return _channel.get_channel_size(); }
void set_channel_size(unsigned channel_num, unsigned channel_size) {
  return _channel.set_channel_size(channel_num, channel_size);
}

unsigned get_channel_num() { return _channel.get_channel_num(); }
void set_channel_num(unsigned num) {
  return _channel.set_channel_num(num);
}

sycl::image_channel_type get_channel_type() {
  return _channel.get_channel_type();
}
void set_channel_type(sycl::image_channel_type type) {
  return _channel.set_channel_type(type);
}

private:
image_data_type _type;
void *_data = nullptr;
// NOTE(review): _x/_y/_pitch are only initialized by the set_data overloads
// for the data types that use them; they are read back via the getters above.
size_t _x, _y, _pitch;
image_channel _channel;
};
|
||||
|
||||
/// Image sampling info, include addressing mode, filtering mode and
/// normalization info.
class sampling_info {
  sycl::addressing_mode _addressing_mode =
      sycl::addressing_mode::clamp_to_edge;
  sycl::filtering_mode _filtering_mode = sycl::filtering_mode::nearest;
  sycl::coordinate_normalization_mode _coordinate_normalization_mode =
      sycl::coordinate_normalization_mode::unnormalized;

public:
  // The set() overloads below are distinguished purely by argument type.
  sycl::addressing_mode get_addressing_mode() { return _addressing_mode; }
  void set(sycl::addressing_mode addressing_mode) { _addressing_mode = addressing_mode; }

  sycl::filtering_mode get_filtering_mode() { return _filtering_mode; }
  void set(sycl::filtering_mode filtering_mode) { _filtering_mode = filtering_mode; }

  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
    return _coordinate_normalization_mode;
  }
  void set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    _coordinate_normalization_mode = coordinate_normalization_mode;
  }

  bool is_coordinate_normalized() {
    return _coordinate_normalization_mode ==
           sycl::coordinate_normalization_mode::normalized;
  }
  // Integer flag form: nonzero selects normalized coordinates.
  void set_coordinate_normalization_mode(int is_normalized) {
    _coordinate_normalization_mode =
        is_normalized ? sycl::coordinate_normalization_mode::normalized
                      : sycl::coordinate_normalization_mode::unnormalized;
  }
  // Set all three properties at once.
  void
  set(sycl::addressing_mode addressing_mode,
      sycl::filtering_mode filtering_mode,
      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    set(addressing_mode);
    set(filtering_mode);
    set(coordinate_normalization_mode);
  }
  // Same, with the normalization given as an integer flag.
  void set(sycl::addressing_mode addressing_mode,
           sycl::filtering_mode filtering_mode, int is_normalized) {
    set(addressing_mode);
    set(filtering_mode);
    set_coordinate_normalization_mode(is_normalized);
  }

  /// Build a sycl::sampler from the stored settings.
  sycl::sampler get_sampler() {
    return sycl::sampler(_coordinate_normalization_mode, _addressing_mode,
                         _filtering_mode);
  }
};
|
||||
|
||||
/// Image base class.
/// Holds the image payload (image_data) plus the sampling configuration
/// (sampling_info) and exposes convenience accessors that delegate to both.
class image_wrapper_base {
  sampling_info _sampling_info;
  image_data _data;

public:
  // Pure virtual to make the class abstract; an out-of-line definition is
  // still provided below so derived destructors can chain to it.
  virtual ~image_wrapper_base() = 0;

  void attach(image_data data) { set_data(data); }
  /// Attach matrix data to this class.
  void attach(image_matrix *matrix) {
    detach();
    image_wrapper_base::set_data(image_data(matrix));
  }
  /// Attach matrix data to this class.
  void attach(image_matrix *matrix, image_channel channel) {
    attach(matrix);
    image_wrapper_base::set_channel(channel);
  }
  /// Attach linear data to this class.
  void attach(const void *ptr, size_t count) {
    attach(ptr, count, get_channel());
  }
  /// Attach linear data to this class.
  void attach(const void *ptr, size_t count, image_channel channel) {
    detach();
    image_wrapper_base::set_data(image_data(const_cast<void *>(ptr), count, channel));
  }
  /// Attach 2D data to this class.
  void attach(const void *data, size_t x, size_t y, size_t pitch) {
    attach(data, x, y, pitch, get_channel());
  }
  /// Attach 2D data to this class.
  void attach(const void *data, size_t x, size_t y, size_t pitch,
              image_channel channel) {
    detach();
    image_wrapper_base::set_data(
        image_data(const_cast<void *>(data), x, y, pitch, channel));
  }
  /// Detach data. Default is a no-op; image_wrapper overrides it to release
  /// the underlying sycl::image.
  virtual void detach() {}

  sampling_info get_sampling_info() { return _sampling_info; }
  void set_sampling_info(sampling_info info) {
    _sampling_info = info;
  }
  const image_data &get_data() { return _data; }
  void set_data(image_data data) { _data = data; }

  // Channel accessors delegate to the stored image_data.
  image_channel get_channel() { return _data.get_channel(); }
  void set_channel(image_channel channel) { _data.set_channel(channel); }

  image_channel_data_type get_channel_data_type() {
    return _data.get_channel_data_type();
  }
  void set_channel_data_type(image_channel_data_type type) {
    _data.set_channel_data_type(type);
  }

  unsigned get_channel_size() { return _data.get_channel_size(); }
  void set_channel_size(unsigned channel_num, unsigned channel_size) {
    return _data.set_channel_size(channel_num, channel_size);
  }

  // Sampling accessors delegate to the stored sampling_info.
  sycl::addressing_mode get_addressing_mode() {
    return _sampling_info.get_addressing_mode();
  }
  void set(sycl::addressing_mode addressing_mode) {
    _sampling_info.set(addressing_mode);
  }

  sycl::filtering_mode get_filtering_mode() {
    return _sampling_info.get_filtering_mode();
  }
  void set(sycl::filtering_mode filtering_mode) {
    _sampling_info.set(filtering_mode);
  }

  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
    return _sampling_info.get_coordinate_normalization_mode();
  }
  void
  set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    _sampling_info.set(coordinate_normalization_mode);
  }

  bool is_coordinate_normalized() {
    return _sampling_info.is_coordinate_normalized();
  }
  void set_coordinate_normalization_mode(int is_normalized) {
    _sampling_info.set_coordinate_normalization_mode(is_normalized);
  }
  // Set all three sampling properties at once.
  void
  set(sycl::addressing_mode addressing_mode,
      sycl::filtering_mode filtering_mode,
      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
    set(addressing_mode);
    set(filtering_mode);
    set(coordinate_normalization_mode);
  }
  void set(sycl::addressing_mode addressing_mode,
           sycl::filtering_mode filtering_mode, int is_normalized) {
    set(addressing_mode);
    set(filtering_mode);
    set_coordinate_normalization_mode(is_normalized);
  }

  unsigned get_channel_num() { return _data.get_channel_num(); }
  void set_channel_num(unsigned num) {
    return _data.set_channel_num(num);
  }

  sycl::image_channel_type get_channel_type() {
    return _data.get_channel_type();
  }
  void set_channel_type(sycl::image_channel_type type) {
    return _data.set_channel_type(type);
  }

  /// Build the sampler for this image, forcing nearest filtering for linear
  /// memory (see comment below).
  sycl::sampler get_sampler() {
    sycl::sampler smp = _sampling_info.get_sampler();
    /// linear memory only used for sycl::filtering_mode::nearest.
    if (_data.get_data_type() == image_data_type::linear) {
      smp = sycl::sampler(smp.get_coordinate_normalization_mode(),
                          smp.get_addressing_mode(),
                          sycl::filtering_mode::nearest);
    }
    return smp;
  }
};
|
||||
// Out-of-line definition for the pure virtual destructor declared above;
// required because derived destructors invoke it.
inline image_wrapper_base::~image_wrapper_base() {}
using image_wrapper_base_p = image_wrapper_base *;

// Forward declaration; the primary template and the IsImageArray=true
// specialization are defined later in this file.
template <class T, int dimensions, bool IsImageArray> class image_accessor_ext;
|
||||
|
||||
/// Image class, wrapper of sycl::image.
/// Lazily creates the underlying sycl::image on first get_access() from the
/// attached image_data, and owns it until detach()/destruction.
template <class T, int dimensions, bool IsImageArray = false> class image_wrapper : public image_wrapper_base {
  sycl::image<dimensions> *_image = nullptr;

#ifndef DPCT_USM_LEVEL_NONE
  // Host-side staging copy used when the attached pointer is device-only USM.
  std::vector<char> _host_buffer;
#endif

  /// Materialize _image from the attached image_data, using \p q to stage
  /// device-only memory to the host when necessary.
  void create_image(sycl::queue q) {
    auto &data = get_data();
    // Matrix data already knows how to build its own sycl::image.
    if (data.get_data_type() == image_data_type::matrix) {
      _image = static_cast<image_matrix_p>(data.get_data_ptr())
                   ->create_image<dimensions>(data.get_channel());
      return;
    }
    auto ptr = data.get_data_ptr();
    auto channel = data.get_channel();

    if (detail::get_pointer_attribute(q, ptr) == detail::pointer_access_attribute::device_only) {
#ifdef DPCT_USM_LEVEL_NONE
      // Buffer mode: get a host-accessible pointer through the buffer.
      ptr = get_buffer(ptr)
                .template get_access<sycl::access_mode::read_write>()
                .get_pointer();
#else
      // USM mode: copy the device data into a host staging buffer.
      // NOTE(review): byte count assumes get_x() is in bytes for linear data
      // and an element count for pitched data — confirm against set_data.
      auto sz = data.get_x();
      if (data.get_data_type() == image_data_type::pitch)
        sz *= channel.get_total_size() * data.get_y();
      _host_buffer.resize(sz);
      q.memcpy(_host_buffer.data(), ptr, sz).wait();
      ptr = _host_buffer.data();
#endif
    }

    if constexpr (dimensions == 1) {
      assert(data.get_data_type() == image_data_type::linear);
      _image = new sycl::image<1>(
          ptr, channel.get_channel_order(), channel.get_channel_type(),
          sycl::range<1>(data.get_x() / channel.get_total_size()));
    } else if constexpr (dimensions == 2) {
      assert(data.get_data_type() == image_data_type::pitch);
      _image = new sycl::image<2>(ptr, channel.get_channel_order(),
                                  channel.get_channel_type(),
                                  sycl::range<2>(data.get_x(), data.get_y()),
                                  sycl::range<1>(data.get_pitch()));
    } else {
      throw std::runtime_error("3D image only support matrix data");
    }
    return;
  }

public:
  using acc_data_t = typename detail::image_trait<T>::acc_data_t;
  // For image arrays, one dimension indexes the array, so the accessor
  // operates on (dimensions - 1)-dimensional slices.
  using accessor_t =
      typename image_accessor_ext<T, IsImageArray ? (dimensions - 1) : dimensions,
                                  IsImageArray>::accessor_t;

  image_wrapper() { set_channel(image_channel::create<T>()); }
  ~image_wrapper() { detach(); }

  /// Get image accessor.
  accessor_t get_access(sycl::handler &cgh, sycl::queue &q = get_default_queue()) {
    if (!_image)
      create_image(q);
    return accessor_t(*_image, cgh);
  }

  /// Detach data.
  void detach() override {
    if (_image)
      delete _image;
    _image = nullptr;
  }
};
|
||||
|
||||
/// Wrap sampler and image accessor together.
/// Each read() overload samples the image at the given coordinates and
/// converts the raw 4-component result to data_t via detail::fetch_data.
/// Float coordinates use sampled reads; integral coordinates use the int
/// coordinate forms. Overloads are enabled per dimensionality via SFINAE.
template <class T, int dimensions, bool IsImageArray = false>
class image_accessor_ext {
public:
  using accessor_t =
      typename detail::image_trait<T>::template accessor_t<dimensions>;
  using data_t = typename detail::image_trait<T>::data_t;
  sycl::sampler _sampler;
  accessor_t _img_acc;

public:
  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
      : _sampler(sampler), _img_acc(acc) {}

  /// Read data from accessor (3D, float coordinates).
  template <bool Available = dimensions == 3>
  typename std::enable_if<Available, data_t>::type read(float x, float y,
                                                        float z) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::float4(x, y, z, 0), _sampler));
  }
  /// Read data from accessor (3D, integral coordinates).
  template <class Coord0, class Coord1, class Coord2,
            bool Available = dimensions == 3 &&
                             std::is_integral<Coord0>::value
                                 &&std::is_integral<Coord1>::value
                                     &&std::is_integral<Coord2>::value>
  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y,
                                                        Coord2 z) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::int4(x, y, z, 0), _sampler));
  }
  /// Read data from accessor (2D, float coordinates).
  template <bool Available = dimensions == 2>
  typename std::enable_if<Available, data_t>::type read(float x, float y) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::float2(x, y), _sampler));
  }
  /// Read data from accessor (2D, integral coordinates).
  template <class Coord0, class Coord1,
            bool Available = dimensions == 2 &&
                             std::is_integral<Coord0>::value
                                 &&std::is_integral<Coord1>::value>
  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y) {
    return detail::fetch_data<T>()(
        _img_acc.read(sycl::int2(x, y), _sampler));
  }
  /// Read data from accessor (1D, float coordinate).
  template <bool Available = dimensions == 1>
  typename std::enable_if<Available, data_t>::type read(float x) {
    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
  }
  /// Read data from accessor (1D, integral coordinate).
  template <class CoordT,
            bool Available = dimensions == 1 && std::is_integral<CoordT>::value>
  typename std::enable_if<Available, data_t>::type read(CoordT x) {
    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
  }
};
|
||||
|
||||
/// Specialization for image arrays (IsImageArray == true): the first read()
/// argument selects the array slice, the remaining coordinates sample within
/// that slice.
template <class T, int dimensions> class image_accessor_ext<T, dimensions, true> {
public:
  using accessor_t =
      typename detail::image_trait<T>::template array_accessor_t<dimensions>;
  using data_t = typename detail::image_trait<T>::data_t;
  sycl::sampler _sampler;
  accessor_t _img_acc;

public:
  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
      : _sampler(sampler), _img_acc(acc) {}

  /// Read data from accessor (2D slice, float coordinates).
  template <bool Available = dimensions == 2>
  typename std::enable_if<Available, data_t>::type read(int index, float x,
                                                        float y) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(sycl::float2(x, y), _sampler));
  }
  /// Read data from accessor (2D slice, integer coordinates).
  template <bool Available = dimensions == 2>
  typename std::enable_if<Available, data_t>::type read(int index, int x, int y) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(sycl::int2(x, y), _sampler));
  }
  /// Read data from accessor (1D slice, float coordinate).
  template <bool Available = dimensions == 1>
  typename std::enable_if<Available, data_t>::type read(int index, float x) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(x, _sampler));
  }
  /// Read data from accessor (1D slice, integer coordinate).
  template <bool Available = dimensions == 1>
  typename std::enable_if<Available, data_t>::type read(int index, int x) {
    return detail::fetch_data<T>()(
        _img_acc[index].read(x, _sampler));
  }
};
|
||||
|
||||
/// Create image wrapper according to image data and sampling info.
|
||||
/// \return Pointer to image wrapper base class.
|
||||
/// \param data Image data used to create image wrapper.
|
||||
/// \param info Image sampling info used to create image wrapper.
|
||||
/// \returns Pointer to base class of created image wrapper object.
|
||||
static inline image_wrapper_base *create_image_wrapper(image_data data,
|
||||
sampling_info info) {
|
||||
image_channel channel;
|
||||
int dims = 1;
|
||||
if (data.get_data_type() == image_data_type::matrix) {
|
||||
auto matrix = (image_matrix_p)data.get_data_ptr();
|
||||
channel = matrix->get_channel();
|
||||
dims = matrix->get_dims();
|
||||
} else {
|
||||
if (data.get_data_type() == image_data_type::pitch) {
|
||||
dims = 2;
|
||||
}
|
||||
channel = data.get_channel();
|
||||
}
|
||||
|
||||
if (auto ret = detail::create_image_wrapper(channel, dims)) {
|
||||
ret->set_sampling_info(info);
|
||||
ret->set_data(data);
|
||||
return ret;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
/// Create image according with given type \p T and \p dims.
|
||||
template <class T> static image_wrapper_base *create_image_wrapper(int dims) {
|
||||
switch (dims) {
|
||||
case 1:
|
||||
return new image_wrapper<T, 1>();
|
||||
case 2:
|
||||
return new image_wrapper<T, 2>();
|
||||
case 3:
|
||||
return new image_wrapper<T, 3>();
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
/// Create image with given data type \p T, channel order and dims
|
||||
template <class T>
|
||||
static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims) {
|
||||
switch (channel_num) {
|
||||
case 1:
|
||||
return create_image_wrapper<T>(dims);
|
||||
case 2:
|
||||
return create_image_wrapper<sycl::vec<T, 2>>(dims);
|
||||
case 3:
|
||||
return create_image_wrapper<sycl::vec<T, 3>>(dims);
|
||||
case 4:
|
||||
return create_image_wrapper<sycl::vec<T, 4>>(dims);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/// Create image with channel info and specified dimensions.
|
||||
static image_wrapper_base *create_image_wrapper(image_channel channel, int dims) {
|
||||
switch (channel.get_channel_type()) {
|
||||
case sycl::image_channel_type::fp16:
|
||||
return create_image_wrapper<sycl::half>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::fp32:
|
||||
return create_image_wrapper<float>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::signed_int8:
|
||||
return create_image_wrapper<std::int8_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::signed_int16:
|
||||
return create_image_wrapper<std::int16_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::signed_int32:
|
||||
return create_image_wrapper<std::int32_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::unsigned_int8:
|
||||
return create_image_wrapper<std::uint8_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::unsigned_int16:
|
||||
return create_image_wrapper<std::uint16_t>(channel.get_channel_num(), dims);
|
||||
case sycl::image_channel_type::unsigned_int32:
|
||||
return create_image_wrapper<std::uint32_t>(channel.get_channel_num(), dims);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
} // namespace dpct
|
||||
|
||||
#endif // !__DPCT_IMAGE_HPP__
|
459
dpct/kernel.hpp
459
dpct/kernel.hpp
|
@ -1,459 +0,0 @@
|
|||
//==---- kernel.hpp -------------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_KERNEL_HPP__
|
||||
#define __DPCT_KERNEL_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#ifdef _WIN32
|
||||
#include <unordered_set>
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#if defined(__has_include) && __has_include(<filesystem>)
|
||||
#include <filesystem>
|
||||
#elif defined(__has_include) && __has_include(<experimental/filesystem>)
|
||||
#include <experimental/filesystem>
|
||||
#else
|
||||
#error "SYCLomatic runtime requires C++ filesystem support"
|
||||
#endif
|
||||
|
||||
#include <fstream>
|
||||
#include <image.hpp>
|
||||
#include <random>
|
||||
|
||||
namespace dpct {
|
||||
|
||||
// Signature of the "<name>_wrapper" entry points exported by kernel
// libraries: (queue, launch nd_range, local-memory byte count, kernel
// argument pointers, extra argument pointers).
typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &,
                               unsigned int, void **, void **);

// Result record for get_kernel_function_info().
struct kernel_function_info {
  // Maximum work-group size reported by the current device.
  int max_work_group_size = 0;
};
|
||||
|
||||
/// Fill \p kernel_info for kernel \p function.
/// NOTE(review): \p function is not consulted — the reported value is the
/// current device's max_work_group_size, not a per-kernel limit; confirm
/// this device-wide approximation is intended.
static inline void get_kernel_function_info(kernel_function_info *kernel_info,
                                            const void *function) {
  kernel_info->max_work_group_size =
      dpct::dev_mgr::instance()
          .current_device()
          .get_info<sycl::info::device::max_work_group_size>();
}
||||
/// Value-returning overload of get_kernel_function_info().
/// NOTE(review): as above, \p function is unused; the value is the current
/// device's max_work_group_size.
static inline kernel_function_info
get_kernel_function_info(const void *function) {
  kernel_function_info kernel_info;
  kernel_info.max_work_group_size =
      dpct::dev_mgr::instance()
          .current_device()
          .get_info<sycl::info::device::max_work_group_size>();
  return kernel_info;
}
|
||||
|
||||
|
||||
namespace detail {
|
||||
|
||||
#if defined(__has_include) && __has_include(<filesystem>)
|
||||
namespace fs = std::filesystem;
|
||||
#else
|
||||
namespace fs = std::experimental::filesystem;
|
||||
#endif
|
||||
|
||||
/// Write data to temporary file and return absolute path to temporary file.
/// Temporary file is created in a temporary directory both of which have random
/// names with only the user having access permissions. Only one temporary file
/// will be created in the temporary directory.
/// \param data Bytes to write.
/// \param size Number of bytes.
/// \throws std::runtime_error on any filesystem or verification failure.
static inline fs::path write_data_to_file(char const *const data, size_t size) {
  std::error_code ec;

  // Guard the size_t -> std::streamsize narrowing done by ofstream::write.
  if (sizeof(size_t) >= sizeof(std::streamsize) &&
      size > (std::numeric_limits<std::streamsize>::max)())
    throw std::runtime_error("data file too large");

  // random number generator
  std::random_device dev;
  std::mt19937 prng(dev());
  std::uniform_int_distribution<uint64_t> rand(0);

  // find temporary directory
  auto tmp_dir = fs::temp_directory_path(ec);
  if (ec)
    throw std::runtime_error("could not find temporary directory");

  // create private directory
  std::stringstream directory;
  fs::path directory_path;
  constexpr int max_attempts = 5;
  int i;

  for (i = 0; i < max_attempts; i++) {
    // NOTE: the stream is not cleared between attempts, so each retry
    // appends more hex digits — the candidate name just grows.
    directory << std::hex << rand(prng);
    directory_path = tmp_dir / directory.str();
    if (fs::create_directory(directory_path)) {
      break;
    }
  }
  if (i == max_attempts)
    throw std::runtime_error("could not create directory");

  // only allow owner permissions to private directory
  fs::permissions(directory_path, fs::perms::owner_all, ec);
  if (ec)
    throw std::runtime_error("could not set directory permissions");

  // random filename in private directory
  std::stringstream filename;
  filename << std::hex << rand(prng);
#ifdef _WIN32
  // LoadLibrary expects a .dll suffix.
  auto filepath = directory_path / (filename.str() + ".dll");
#else
  auto filepath = directory_path / filename.str();
#endif

  // write data to temporary file
  auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary);
  if (outfile) {
    // only allow program to write file
    fs::permissions(filepath, fs::perms::owner_write, ec);
    if (ec)
      throw std::runtime_error("could not set permissions");

    outfile.write(data, size);
    if (!outfile.good())
      throw std::runtime_error("could not write data");
    outfile.close();

    // only allow program to read/execute file
    fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec,
                    ec);
    if (ec)
      throw std::runtime_error("could not set permissions");
  } else
    throw std::runtime_error("could not write data");

  // check temporary file contents: re-read and compare byte-for-byte.
  auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary);
  if (infile) {
    bool mismatch = false;
    size_t cnt = 0;

    while (1) {
      char c;
      infile.get(c);
      if (infile.eof())
        break;
      if (c != data[cnt++])
        mismatch = true;
    }
    if (cnt != size || mismatch)
      throw std::runtime_error("file contents not written correctly");
  } else
    throw std::runtime_error("could not validate file");

  if (!filepath.is_absolute())
    throw std::runtime_error("temporary filepath is not absolute");

  return filepath;
}
|
||||
|
||||
/// Decode a little-endian 16-bit value from \p ptr.
static inline uint16_t extract16(unsigned char const *const ptr) {
  uint16_t value = 0;
  for (int byte = 1; byte >= 0; --byte)
    value = static_cast<uint16_t>((value << 8) | ptr[byte]);
  return value;
}
|
||||
|
||||
/// Decode a little-endian 32-bit value from \p ptr.
static inline uint32_t extract32(unsigned char const *const ptr) {
  uint32_t value = 0;
  for (int byte = 3; byte >= 0; --byte)
    value = (value << 8) | ptr[byte];
  return value;
}
|
||||
|
||||
/// Decode a little-endian 64-bit value from \p ptr.
static inline uint64_t extract64(unsigned char const *const ptr) {
  uint64_t value = 0;
  for (int byte = 7; byte >= 0; --byte)
    value = (value << 8) | ptr[byte];
  return value;
}
|
||||
|
||||
/// Compute the total size in bytes of a shared-library image already in
/// memory, by parsing its PE (Windows) or ELF64 (other platforms) headers.
/// \param blob Pointer to the start of the library image.
/// \throws std::runtime_error if the image is not a supported format.
static inline uint64_t get_lib_size(char const *const blob) {
#ifdef _WIN32
  ///////////////////////////////////////////////////////////////////////
  // Analyze DOS stub
  unsigned char const *const ublob =
      reinterpret_cast<unsigned char const *const>(blob);
  // 'MZ' DOS magic.
  if (ublob[0] != 0x4d || ublob[1] != 0x5a) {
    throw std::runtime_error("Blob is not a Windows DLL.");
  }
  // e_lfanew: offset of the PE header, stored at 0x3c.
  uint32_t pe_header_offset = extract32(ublob + 0x3c);

  ///////////////////////////////////////////////////////////////////////
  // Ananlyze PE-header
  unsigned char const *const pe_header = ublob + pe_header_offset;

  // signature: "PE\0\0"
  uint32_t pe_signature = extract32(pe_header + 0);
  if (pe_signature != 0x00004550) {
    throw std::runtime_error("PE-header signature is not 0x00004550");
  }

  // machine: 0x8664 = AMD64
  uint16_t machine = extract16(pe_header + 4);
  if (machine != 0x8664) {
    throw std::runtime_error("Only DLLs for x64 supported");
  }

  // number of sections
  uint16_t number_of_sections = extract16(pe_header + 6);

  // sizeof optional header
  uint16_t sizeof_optional_header = extract16(pe_header + 20);

  // magic: 0x10b = PE32, 0x20b = PE32+
  uint16_t magic = extract16(pe_header + 24);
  if (magic != 0x10b && magic != 0x20b) {
    throw std::runtime_error("MAGIC is not 0x010b or 0x020b");
  }

  ///////////////////////////////////////////////////////////////////////
  // Analyze tail of optional header
  constexpr int coff_header_size = 24;

  // Sanity check: the optional header should end with zero padding.
  unsigned char const *const tail_of_optional_header =
      pe_header + coff_header_size + sizeof_optional_header;
  if (extract64(tail_of_optional_header - 8) != 0) {
    throw std::runtime_error("Optional header not zero-padded");
  }

  ///////////////////////////////////////////////////////////////////////
  // Analyze last section header: the file ends where the last section's
  // raw data ends.
  constexpr int section_header_size = 40;
  unsigned char const *const last_section_header =
      tail_of_optional_header + section_header_size * (number_of_sections - 1);

  uint32_t sizeof_raw_data = extract32(last_section_header + 16);
  uint32_t pointer_to_raw_data = extract32(last_section_header + 20);

  return sizeof_raw_data + pointer_to_raw_data;
#else
  // ELF magic: 0x7F 'E' 'L' 'F'
  if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F')
    throw std::runtime_error("Blob is not in ELF format");

  // EI_CLASS: 2 = 64-bit
  if (blob[4] != 0x02)
    throw std::runtime_error("Only 64-bit headers are supported");

  // EI_DATA: 1 = little-endian
  if (blob[5] != 0x01)
    throw std::runtime_error("Only little-endian headers are supported");

  unsigned char const *const ublob =
      reinterpret_cast<unsigned char const *const>(blob);
  // The section header table is conventionally the last thing in the file:
  // its offset plus its total size gives the file size.
  uint64_t e_shoff = extract64(ublob + 0x28);
  uint16_t e_shentsize = extract16(ublob + 0x3A);
  uint16_t e_shnum = extract16(ublob + 0x3C);

  return e_shoff + (e_shentsize * e_shnum);
#endif
}
|
||||
|
||||
#ifdef _WIN32
|
||||
class path_lib_record {
|
||||
public:
|
||||
void operator=(const path_lib_record &) = delete;
|
||||
~path_lib_record() {
|
||||
for (auto entry : lib_to_path) {
|
||||
FreeLibrary(static_cast<HMODULE>(entry.first));
|
||||
fs::permissions(entry.second, fs::perms::owner_all);
|
||||
fs::remove_all(entry.second.remove_filename());
|
||||
}
|
||||
}
|
||||
static void record_lib_path(fs::path path, void *library) {
|
||||
lib_to_path[library] = path;
|
||||
}
|
||||
static void remove_lib(void *library) {
|
||||
auto path = lib_to_path[library];
|
||||
std::error_code ec;
|
||||
|
||||
FreeLibrary(static_cast<HMODULE>(library));
|
||||
fs::permissions(path, fs::perms::owner_all);
|
||||
if (fs::remove_all(path.remove_filename(), ec) != 2 || ec)
|
||||
// one directory and one temporary file should have been deleted
|
||||
throw std::runtime_error("Directory delete failed");
|
||||
|
||||
lib_to_path.erase(library);
|
||||
}
|
||||
|
||||
private:
|
||||
static inline std::unordered_map<void *, fs::path> lib_to_path;
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Opaque handle to a loaded kernel library (HMODULE on Windows, dlopen
// handle elsewhere). Implicitly convertible to void * for use with the
// platform loader APIs.
class kernel_library {
public:
  kernel_library() : ptr{nullptr} {}
  kernel_library(void *ptr) : ptr{ptr} {}

  operator void *() const { return ptr; }

private:
  void *ptr;
#ifdef _WIN32
  // Sole static path_lib_record instance; its destructor performs the
  // end-of-program cleanup of still-loaded DLLs (see path_lib_record).
  static inline detail::path_lib_record single_instance_to_trigger_destructor;
#endif
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// Write \p data (a shared-library image of \p size bytes) to a private
/// temporary file and load it with the platform loader.
/// \throws std::runtime_error if loading or temp-file cleanup fails.
static inline kernel_library load_dl_from_data(char const *const data,
                                               size_t size) {
  fs::path filename = write_data_to_file(data, size);
#ifdef _WIN32
  void *so = LoadLibraryW(filename.wstring().c_str());
#else
  void *so = dlopen(filename.c_str(), RTLD_LAZY);
#endif
  if (so == nullptr)
    throw std::runtime_error("Failed to load kernel library");

#ifdef _WIN32
  // Defer deletion: Windows cannot delete a mapped DLL, so record the path
  // for cleanup at unload/exit.
  detail::path_lib_record::record_lib_path(filename, so);
#else
  std::error_code ec;

  // Windows DLL cannot be deleted while in use
  // (POSIX allows unlinking a mapped file, so delete the temp dir now.)
  if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec)
    // one directory and one temporary file should have been deleted
    throw std::runtime_error("Directory delete failed");
#endif

  return so;
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Load kernel library and return a handle to use the library.
|
||||
/// \param [in] name The name of the library.
|
||||
static inline kernel_library load_kernel_library(const std::string &name) {
|
||||
std::ifstream ifs;
|
||||
ifs.open(name, std::ios::in | std::ios::binary);
|
||||
|
||||
std::stringstream buffer;
|
||||
buffer << ifs.rdbuf();
|
||||
|
||||
const std::string buffer_string = buffer.str();
|
||||
return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size());
|
||||
}
|
||||
|
||||
/// Load kernel library whose image is alreay in memory and return a handle to
|
||||
/// use the library.
|
||||
/// \param [in] image A pointer to the image in memory.
|
||||
static inline kernel_library load_kernel_library_mem(char const *const image) {
|
||||
const size_t size = detail::get_lib_size(image);
|
||||
|
||||
return detail::load_dl_from_data(image, size);
|
||||
}
|
||||
|
||||
/// Unload kernel library.
/// On Windows this also deletes the library's temporary file (deferred at
/// load time); on POSIX the temp file was already removed, so dlclose is
/// sufficient.
/// \param [in,out] library Handle to the library to be closed.
static inline void unload_kernel_library(const kernel_library &library) {
#ifdef _WIN32
  detail::path_lib_record::remove_lib(library);
#else
  dlclose(library);
#endif
}
|
||||
|
||||
// Callable handle to a kernel wrapper function resolved from a kernel
// library; invoking it launches the kernel (see kernel_functor).
class kernel_function {
public:
  kernel_function() : ptr{nullptr} {}
  kernel_function(dpct::kernel_functor ptr) : ptr{ptr} {}

  operator void *() const { return ((void *)ptr); }

  // Forward the launch to the wrapped function pointer.
  void operator()(sycl::queue &q, const sycl::nd_range<3> &range,
                  unsigned int a, void **args, void **extra) {
    ptr(q, range, a, args, extra);
  }

private:
  dpct::kernel_functor ptr;
};
|
||||
|
||||
/// Find kernel function in a kernel library and return its address.
/// The exported symbol looked up is \p name with a "_wrapper" suffix.
/// \param [in] library Handle to the kernel library.
/// \param [in] name Name of the kernel function.
/// \throws std::runtime_error if the symbol is not found.
static inline dpct::kernel_function
get_kernel_function(kernel_library &library, const std::string &name) {
#ifdef _WIN32
  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
      GetProcAddress(static_cast<HMODULE>(static_cast<void *>(library)),
                     (name + std::string("_wrapper")).c_str()));
#else
  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
      dlsym(library, (name + std::string("_wrapper")).c_str()));
#endif
  if (fn == nullptr)
    throw std::runtime_error("Failed to get function");
  return fn;
}
|
||||
|
||||
/// Invoke a kernel function.
|
||||
/// \param [in] function kernel function.
|
||||
/// \param [in] queue SYCL queue used to execute kernel
|
||||
/// \param [in] groupRange SYCL group range
|
||||
/// \param [in] localRange SYCL local range
|
||||
/// \param [in] localMemSize The size of local memory required by the kernel
|
||||
/// function.
|
||||
/// \param [in] kernelParams Array of pointers to kernel arguments.
|
||||
/// \param [in] extra Extra arguments.
|
||||
static inline void invoke_kernel_function(dpct::kernel_function &function,
|
||||
sycl::queue &queue,
|
||||
sycl::range<3> groupRange,
|
||||
sycl::range<3> localRange,
|
||||
unsigned int localMemSize,
|
||||
void **kernelParams, void **extra) {
|
||||
function(queue, sycl::nd_range<3>(groupRange * localRange, localRange),
|
||||
localMemSize, kernelParams, extra);
|
||||
}
|
||||
|
||||
/// Find image wrapper in a kernel library and return its address.
|
||||
/// \param [in] library Handle to the kernel library.
|
||||
/// \param [in] name Name of the target image wrapper.
|
||||
static inline dpct::image_wrapper_base_p
|
||||
get_image_wrapper(dpct::kernel_library &library, const std::string &name) {
|
||||
#ifdef _WIN32
|
||||
dpct::image_wrapper_base_p fn =
|
||||
reinterpret_cast<dpct::image_wrapper_base_p>(GetProcAddress(
|
||||
static_cast<HMODULE>(static_cast<void *>(library)), name.c_str()));
|
||||
#else
|
||||
dpct::image_wrapper_base_p fn = reinterpret_cast<dpct::image_wrapper_base_p>(
|
||||
dlsym(library, name.c_str()));
|
||||
#endif
|
||||
if (fn == nullptr)
|
||||
throw std::runtime_error("Failed to get image");
|
||||
return fn;
|
||||
}
|
||||
|
||||
} // namespace dpct
|
||||
#endif // __DPCT_KERNEL_HPP__
|
File diff suppressed because it is too large
Load diff
|
@ -1,174 +0,0 @@
|
|||
//==---- lib_common_utils.hpp ---------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_LIB_COMMON_UTILS_HPP__
|
||||
#define __DPCT_LIB_COMMON_UTILS_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
#include "memory.hpp"
|
||||
#include "util.hpp"
|
||||
|
||||
namespace dpct {
|
||||
namespace detail {
|
||||
template <typename T> inline auto get_memory(const void *x) {
|
||||
T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
|
||||
#ifdef DPCT_USM_LEVEL_NONE
|
||||
return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
|
||||
#else
|
||||
return new_x;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q) {
|
||||
using Ty = typename DataType<T>::T2;
|
||||
Ty s_h;
|
||||
if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
|
||||
detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
|
||||
.wait();
|
||||
else
|
||||
s_h = *reinterpret_cast<const Ty *>(s);
|
||||
return s_h;
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
enum class version_field : int { major, minor, update, patch };
|
||||
|
||||
/// Returns the requested field of Intel(R) oneAPI Math Kernel Library version.
|
||||
/// \param field The version information field (major, minor, update or patch).
|
||||
/// \param result The result value.
|
||||
inline void mkl_get_version(version_field field, int *result) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
|
||||
"Project does not support this API.");
|
||||
#else
|
||||
MKLVersion version;
|
||||
mkl_get_version(&version);
|
||||
if (version_field::major == field) {
|
||||
*result = version.MajorVersion;
|
||||
} else if (version_field::minor == field) {
|
||||
*result = version.MinorVersion;
|
||||
} else if (version_field::update == field) {
|
||||
*result = version.UpdateVersion;
|
||||
} else if (version_field::patch == field) {
|
||||
*result = 0;
|
||||
} else {
|
||||
throw std::runtime_error("unknown field");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
enum class library_data_t : unsigned char {
|
||||
real_float = 0,
|
||||
complex_float,
|
||||
real_double,
|
||||
complex_double,
|
||||
real_half,
|
||||
complex_half,
|
||||
real_bfloat16,
|
||||
complex_bfloat16,
|
||||
real_int4,
|
||||
complex_int4,
|
||||
real_uint4,
|
||||
complex_uint4,
|
||||
real_int8,
|
||||
complex_int8,
|
||||
real_uint8,
|
||||
complex_uint8,
|
||||
real_int16,
|
||||
complex_int16,
|
||||
real_uint16,
|
||||
complex_uint16,
|
||||
real_int32,
|
||||
complex_int32,
|
||||
real_uint32,
|
||||
complex_uint32,
|
||||
real_int64,
|
||||
complex_int64,
|
||||
real_uint64,
|
||||
complex_uint64,
|
||||
real_int8_4,
|
||||
real_int8_32,
|
||||
real_uint8_4,
|
||||
library_data_t_size
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
template <typename ArgT>
|
||||
inline constexpr std::uint64_t get_type_combination_id(ArgT Val) {
|
||||
static_assert((unsigned char)library_data_t::library_data_t_size <=
|
||||
std::numeric_limits<unsigned char>::max() &&
|
||||
"library_data_t size exceeds limit.");
|
||||
static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
|
||||
return (std::uint64_t)Val;
|
||||
}
|
||||
|
||||
template <typename FirstT, typename... RestT>
|
||||
inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
|
||||
RestT... RestVal) {
|
||||
static_assert((std::uint8_t)library_data_t::library_data_t_size <=
|
||||
std::numeric_limits<unsigned char>::max() &&
|
||||
"library_data_t size exceeds limit.");
|
||||
static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
|
||||
static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
|
||||
return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
|
||||
}
|
||||
|
||||
inline constexpr std::size_t library_data_size[] = {
|
||||
8 * sizeof(float), // real_float
|
||||
8 * sizeof(std::complex<float>), // complex_float
|
||||
8 * sizeof(double), // real_double
|
||||
8 * sizeof(std::complex<double>), // complex_double
|
||||
8 * sizeof(sycl::half), // real_half
|
||||
8 * sizeof(std::complex<sycl::half>), // complex_half
|
||||
16, // real_bfloat16
|
||||
16 * 2, // complex_bfloat16
|
||||
4, // real_int4
|
||||
4 * 2, // complex_int4
|
||||
4, // real_uint4
|
||||
4 * 2, // complex_uint4
|
||||
8, // real_int8
|
||||
8 * 2, // complex_int8
|
||||
8, // real_uint8
|
||||
8 * 2, // complex_uint8
|
||||
16, // real_int16
|
||||
16 * 2, // complex_int16
|
||||
16, // real_uint16
|
||||
16 * 2, // complex_uint16
|
||||
32, // real_int32
|
||||
32 * 2, // complex_int32
|
||||
32, // real_uint32
|
||||
32 * 2, // complex_uint32
|
||||
64, // real_int64
|
||||
64 * 2, // complex_int64
|
||||
64, // real_uint64
|
||||
64 * 2, // complex_uint64
|
||||
8, // real_int8_4
|
||||
8, // real_int8_32
|
||||
8 // real_uint8_4
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
#ifdef DPCT_USM_LEVEL_NONE
|
||||
/// Cast a "rvalue reference to a temporary object" to an "lvalue reference to
|
||||
/// that temporary object".
|
||||
/// CAUTION:
|
||||
/// The returned lvalue reference is available only before the last step in
|
||||
/// evaluating the full-expression that contains this function call.
|
||||
/// \param [in] temporary_object The rvalue reference to a temporary object.
|
||||
/// \returns The lvalue reference to that temporary object.
|
||||
template <typename T>
|
||||
inline typename std::enable_if_t<std::is_rvalue_reference_v<T &&>, T &>
|
||||
rvalue_ref_to_lvalue_ref(T &&temporary_object) {
|
||||
return temporary_object;
|
||||
}
|
||||
#endif
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_LIB_COMMON_UTILS_HPP__
|
1814
dpct/math.hpp
1814
dpct/math.hpp
File diff suppressed because it is too large
Load diff
1497
dpct/memory.hpp
1497
dpct/memory.hpp
File diff suppressed because it is too large
Load diff
|
@ -1,535 +0,0 @@
|
|||
//==---- rng_utils.hpp ----------------------------*- C++ -*----------------==//
|
||||
//
|
||||
// Copyright (C) Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __DPCT_RNG_UTILS_HPP__
|
||||
#define __DPCT_RNG_UTILS_HPP__
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
|
||||
#include <oneapi/mkl/rng/device.hpp>
|
||||
#endif
|
||||
#include "device.hpp"
|
||||
#include "lib_common_utils.hpp"
|
||||
|
||||
namespace dpct {
|
||||
namespace rng {
|
||||
#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
|
||||
namespace device {
|
||||
/// The random number generator on device.
|
||||
/// \tparam engine_t The device random number generator engine. It can only be
|
||||
/// oneapi::mkl::rng::device::mrg32k3a<1> or
|
||||
/// oneapi::mkl::rng::device::mrg32k3a<4> or
|
||||
/// oneapi::mkl::rng::device::philox4x32x10<1> or
|
||||
/// oneapi::mkl::rng::device::philox4x32x10<4>.
|
||||
template <typename engine_t> class rng_generator {
|
||||
static_assert(
|
||||
std::disjunction_v<
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<4>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<4>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>,
|
||||
"engine_t can only be oneapi::mkl::rng::device::mrg32k3a<1> or "
|
||||
"oneapi::mkl::rng::device::mrg32k3a<4> or "
|
||||
"oneapi::mkl::rng::device::philox4x32x10<1> or "
|
||||
"oneapi::mkl::rng::device::philox4x32x10<4> or "
|
||||
"oneapi::mkl::rng::device::mcg59<1>.");
|
||||
static constexpr bool _is_engine_vec_size_one = std::disjunction_v<
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
|
||||
std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>;
|
||||
static constexpr std::uint64_t default_seed = 0;
|
||||
oneapi::mkl::rng::device::bits<std::uint32_t> _distr_bits;
|
||||
oneapi::mkl::rng::device::uniform_bits<std::uint32_t> _distr_uniform_bits;
|
||||
oneapi::mkl::rng::device::gaussian<float> _distr_gaussian_float;
|
||||
oneapi::mkl::rng::device::gaussian<double> _distr_gaussian_double;
|
||||
oneapi::mkl::rng::device::lognormal<float> _distr_lognormal_float;
|
||||
oneapi::mkl::rng::device::lognormal<double> _distr_lognormal_double;
|
||||
oneapi::mkl::rng::device::poisson<std::uint32_t> _distr_poisson;
|
||||
oneapi::mkl::rng::device::uniform<float> _distr_uniform_float;
|
||||
oneapi::mkl::rng::device::uniform<double> _distr_uniform_double;
|
||||
engine_t _engine;
|
||||
|
||||
public:
|
||||
/// Default constructor of rng_generator
|
||||
rng_generator() { _engine = engine_t(default_seed); }
|
||||
/// Constructor of rng_generator if engine type is not mcg59
|
||||
/// \param [in] seed The seed to initialize the engine state.
|
||||
/// \param [in] num_to_skip Set the number of elements need to be skipped.
|
||||
/// The number is calculated as: num_to_skip[0] + num_to_skip[1] * 2^64 +
|
||||
/// num_to_skip[2] * 2^128 + ... + num_to_skip[n-1] * 2^(64*(n-1))
|
||||
template <typename T = engine_t,
|
||||
typename std::enable_if<!std::is_same_v<
|
||||
T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
|
||||
rng_generator(std::uint64_t seed,
|
||||
std::initializer_list<std::uint64_t> num_to_skip) {
|
||||
_engine = engine_t(seed, num_to_skip);
|
||||
}
|
||||
/// Constructor of rng_generator if engine type is mcg59
|
||||
/// \param [in] seed The seed to initialize the engine state.
|
||||
/// \param [in] num_to_skip Set the number of elements need to be skipped.
|
||||
template <typename T = engine_t,
|
||||
typename std::enable_if<std::is_same_v<
|
||||
T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
|
||||
rng_generator(std::uint64_t seed, std::uint64_t num_to_skip) {
|
||||
_engine = engine_t(seed, num_to_skip);
|
||||
}
|
||||
|
||||
/// Generate random number(s) obeys distribution \tparam distr_t.
|
||||
/// \tparam T The distribution of the random number. It can only be
|
||||
/// oneapi::mkl::rng::device::bits<std::uint32_t>,
|
||||
/// oneapi::mkl::rng::device::uniform_bits<std::uint32_t>,
|
||||
/// oneapi::mkl::rng::device::gaussian<float>,
|
||||
/// oneapi::mkl::rng::device::gaussian<double>,
|
||||
/// oneapi::mkl::rng::device::lognormal<float>,
|
||||
/// oneapi::mkl::rng::device::lognormal<double>,
|
||||
/// oneapi::mkl::rng::device::poisson<std::uint32_t>,
|
||||
/// oneapi::mkl::rng::device::uniform<float> or
|
||||
/// oneapi::mkl::rng::device::uniform<double>
|
||||
/// \tparam vec_size The length of the return vector. It can only be 1, 2
|
||||
/// or 4.
|
||||
/// \param distr_params The parameter(s) for lognormal or poisson
|
||||
/// distribution.
|
||||
/// \return The vector of the random number(s).
|
||||
template <typename distr_t, int vec_size, class... distr_params_t>
|
||||
auto generate(distr_params_t... distr_params) {
|
||||
static_assert(vec_size == 1 || vec_size == 2 || vec_size == 4,
|
||||
"vec_size is not supported.");
|
||||
static_assert(
|
||||
std::disjunction_v<
|
||||
std::is_same<distr_t,
|
||||
oneapi::mkl::rng::device::bits<std::uint32_t>>,
|
||||
std::is_same<distr_t,
|
||||
oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<float>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<double>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<float>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<double>>,
|
||||
std::is_same<distr_t,
|
||||
oneapi::mkl::rng::device::poisson<std::uint32_t>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::uniform<float>>,
|
||||
std::is_same<distr_t, oneapi::mkl::rng::device::uniform<double>>>,
|
||||
"distribution is not supported.");
|
||||
|
||||
if constexpr (std::is_same_v<
|
||||
distr_t, oneapi::mkl::rng::device::bits<std::uint32_t>>) {
|
||||
return generate_vec<vec_size>(_distr_bits);
|
||||
}
|
||||
if constexpr (std::is_same_v<
|
||||
distr_t,
|
||||
oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>) {
|
||||
return generate_vec<vec_size>(_distr_uniform_bits);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::gaussian<float>>) {
|
||||
return generate_vec<vec_size>(_distr_gaussian_float);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::gaussian<double>>) {
|
||||
return generate_vec<vec_size>(_distr_gaussian_double);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::lognormal<float>>) {
|
||||
return generate_vec<vec_size>(_distr_lognormal_float, distr_params...,
|
||||
0.0f, 1.0f);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::lognormal<double>>) {
|
||||
return generate_vec<vec_size>(_distr_lognormal_double, distr_params...,
|
||||
0.0, 1.0);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t, oneapi::mkl::rng::device::poisson<
|
||||
std::uint32_t>>) {
|
||||
return generate_vec<vec_size>(_distr_poisson, distr_params...);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::uniform<float>>) {
|
||||
return generate_vec<vec_size>(_distr_uniform_float);
|
||||
}
|
||||
if constexpr (std::is_same_v<distr_t,
|
||||
oneapi::mkl::rng::device::uniform<double>>) {
|
||||
return generate_vec<vec_size>(_distr_uniform_double);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the random number generator engine.
|
||||
/// \return The reference of the internal random number generator engine.
|
||||
engine_t &get_engine() { return _engine; }
|
||||
|
||||
private:
|
||||
template <int vec_size, typename distr_t, class... distr_params_t>
|
||||
auto generate_vec(distr_t &distr, distr_params_t... distr_params) {
|
||||
if constexpr (sizeof...(distr_params_t)) {
|
||||
typename distr_t::param_type pt(distr_params...);
|
||||
distr.param(pt);
|
||||
}
|
||||
if constexpr (vec_size == 4) {
|
||||
if constexpr (_is_engine_vec_size_one) {
|
||||
sycl::vec<typename distr_t::result_type, 4> res;
|
||||
res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.z() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.w() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
return res;
|
||||
} else {
|
||||
return oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
}
|
||||
} else if constexpr (vec_size == 1) {
|
||||
if constexpr (_is_engine_vec_size_one) {
|
||||
return oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
} else {
|
||||
return oneapi::mkl::rng::device::generate_single(distr, _engine);
|
||||
}
|
||||
} else if constexpr (vec_size == 2) {
|
||||
if constexpr (_is_engine_vec_size_one) {
|
||||
sycl::vec<typename distr_t::result_type, 2> res;
|
||||
res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
|
||||
return res;
|
||||
} else {
|
||||
sycl::vec<typename distr_t::result_type, 2> res;
|
||||
res.x() = oneapi::mkl::rng::device::generate_single(distr, _engine);
|
||||
res.y() = oneapi::mkl::rng::device::generate_single(distr, _engine);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace device
|
||||
#endif
|
||||
|
||||
namespace host {
|
||||
namespace detail {
|
||||
class rng_generator_base {
|
||||
public:
|
||||
/// Set the seed of host rng_generator.
|
||||
/// \param seed The engine seed.
|
||||
virtual void set_seed(const std::uint64_t seed) = 0;
|
||||
|
||||
/// Set the dimensions of host rng_generator.
|
||||
/// \param dimensions The engine dimensions.
|
||||
virtual void set_dimensions(const std::uint32_t dimensions) = 0;
|
||||
|
||||
/// Set the queue of host rng_generator.
|
||||
/// \param queue The engine queue.
|
||||
virtual void set_queue(sycl::queue *queue) = 0;
|
||||
|
||||
/// Generate unsigned int random number(s) with 'uniform_bits' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform_bits(unsigned int *output,
|
||||
std::int64_t n) = 0;
|
||||
|
||||
/// Generate unsigned long long random number(s) with 'uniform_bits'
|
||||
/// distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform_bits(unsigned long long *output,
|
||||
std::int64_t n) = 0;
|
||||
|
||||
/// Generate float random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
virtual inline void generate_lognormal(float *output, std::int64_t n, float m,
|
||||
float s) = 0;
|
||||
|
||||
/// Generate double random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
virtual inline void generate_lognormal(double *output, std::int64_t n,
|
||||
double m, double s) = 0;
|
||||
|
||||
/// Generate float random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
virtual inline void generate_gaussian(float *output, std::int64_t n,
|
||||
float mean, float stddev) = 0;
|
||||
|
||||
/// Generate double random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
virtual inline void generate_gaussian(double *output, std::int64_t n,
|
||||
double mean, double stddev) = 0;
|
||||
|
||||
/// Generate unsigned int random number(s) with 'poisson' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param lambda Lambda for the Poisson distribution.
|
||||
virtual inline void generate_poisson(unsigned int *output, std::int64_t n,
|
||||
double lambda) = 0;
|
||||
|
||||
/// Generate float random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform(float *output, std::int64_t n) = 0;
|
||||
|
||||
/// Generate double random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
virtual inline void generate_uniform(double *output, std::int64_t n) = 0;
|
||||
|
||||
/// Skip ahead several random number(s).
|
||||
/// \param num_to_skip The number of random numbers to be skipped.
|
||||
virtual void skip_ahead(const std::uint64_t num_to_skip) = 0;
|
||||
|
||||
/// Set the direction numbers of host rng_generator. Only Sobol engine
|
||||
/// supports this method.
|
||||
/// \param direction_numbers The engine direction numbers.
|
||||
virtual void set_direction_numbers(
|
||||
const std::vector<std::uint32_t> &direction_numbers) = 0;
|
||||
|
||||
protected:
|
||||
sycl::queue *_queue{&dpct::get_default_queue()};
|
||||
std::uint64_t _seed{0};
|
||||
std::uint32_t _dimensions{1};
|
||||
std::vector<std::uint32_t> _direction_numbers;
|
||||
};
|
||||
|
||||
/// The random number generator on host.
|
||||
template <typename engine_t = oneapi::mkl::rng::philox4x32x10>
|
||||
class rng_generator : public rng_generator_base {
|
||||
public:
|
||||
/// Constructor of rng_generator.
|
||||
rng_generator() : _engine(create_engine(_queue, _seed, _dimensions)) {}
|
||||
|
||||
/// Set the seed of host rng_generator.
|
||||
/// \param seed The engine seed.
|
||||
void set_seed(const std::uint64_t seed) {
|
||||
if (seed == _seed) {
|
||||
return;
|
||||
}
|
||||
_seed = seed;
|
||||
_engine = create_engine(_queue, _seed, _dimensions);
|
||||
}
|
||||
|
||||
/// Set the dimensions of host rng_generator.
|
||||
/// \param dimensions The engine dimensions.
|
||||
void set_dimensions(const std::uint32_t dimensions) {
|
||||
if (dimensions == _dimensions) {
|
||||
return;
|
||||
}
|
||||
_dimensions = dimensions;
|
||||
_engine = create_engine(_queue, _seed, _dimensions);
|
||||
}
|
||||
|
||||
/// Set the queue of host rng_generator.
|
||||
/// \param queue The engine queue.
|
||||
void set_queue(sycl::queue *queue) {
|
||||
if (queue == _queue) {
|
||||
return;
|
||||
}
|
||||
_queue = queue;
|
||||
_engine = create_engine(_queue, _seed, _dimensions);
|
||||
}
|
||||
|
||||
/// Set the direction numbers of Sobol host rng_generator.
|
||||
/// \param direction_numbers The user-defined direction numbers.
|
||||
void
|
||||
set_direction_numbers(const std::vector<std::uint32_t> &direction_numbers) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::sobol>) {
|
||||
if (direction_numbers == _direction_numbers) {
|
||||
return;
|
||||
}
|
||||
_direction_numbers = direction_numbers;
|
||||
_engine = oneapi::mkl::rng::sobol(*_queue, _direction_numbers);
|
||||
} else {
|
||||
throw std::runtime_error("Only Sobol engine supports this method.");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Generate unsigned int random number(s) with 'uniform_bits' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform_bits(unsigned int *output, std::int64_t n) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
static_assert(sizeof(unsigned int) == sizeof(std::uint32_t));
|
||||
generate<oneapi::mkl::rng::uniform_bits<std::uint32_t>>(
|
||||
(std::uint32_t *)output, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Generate unsigned long long random number(s) with 'uniform_bits'
|
||||
/// distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform_bits(unsigned long long *output,
|
||||
std::int64_t n) {
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
static_assert(sizeof(unsigned long long) == sizeof(std::uint64_t));
|
||||
generate<oneapi::mkl::rng::uniform_bits<std::uint64_t>>(
|
||||
(std::uint64_t *)output, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Generate float random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
inline void generate_lognormal(float *output, std::int64_t n, float m,
|
||||
float s) {
|
||||
generate<oneapi::mkl::rng::lognormal<float>>(output, n, m, s);
|
||||
}
|
||||
|
||||
/// Generate double random number(s) with 'lognormal' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param m Mean of associated normal distribution
|
||||
/// \param s Standard deviation of associated normal distribution.
|
||||
inline void generate_lognormal(double *output, std::int64_t n, double m,
|
||||
double s) {
|
||||
generate<oneapi::mkl::rng::lognormal<double>>(output, n, m, s);
|
||||
}
|
||||
|
||||
/// Generate float random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
inline void generate_gaussian(float *output, std::int64_t n, float mean,
|
||||
float stddev) {
|
||||
generate<oneapi::mkl::rng::gaussian<float>>(output, n, mean, stddev);
|
||||
}
|
||||
|
||||
/// Generate double random number(s) with 'gaussian' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param mean Mean of normal distribution
|
||||
/// \param stddev Standard deviation of normal distribution.
|
||||
inline void generate_gaussian(double *output, std::int64_t n, double mean,
|
||||
double stddev) {
|
||||
generate<oneapi::mkl::rng::gaussian<double>>(output, n, mean, stddev);
|
||||
}
|
||||
|
||||
/// Generate unsigned int random number(s) with 'poisson' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
/// \param lambda Lambda for the Poisson distribution.
|
||||
inline void generate_poisson(unsigned int *output, std::int64_t n,
|
||||
double lambda) {
|
||||
generate<oneapi::mkl::rng::poisson<unsigned int>>(output, n, lambda);
|
||||
}
|
||||
|
||||
/// Generate float random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform(float *output, std::int64_t n) {
|
||||
generate<oneapi::mkl::rng::uniform<float>>(output, n);
|
||||
}
|
||||
|
||||
/// Generate double random number(s) with 'uniform' distribution.
|
||||
/// \param output The pointer of the first random number.
|
||||
/// \param n The number of random numbers.
|
||||
inline void generate_uniform(double *output, std::int64_t n) {
|
||||
generate<oneapi::mkl::rng::uniform<double>>(output, n);
|
||||
}
|
||||
|
||||
/// Skip ahead several random number(s).
|
||||
/// \param num_to_skip The number of random numbers to be skipped.
|
||||
void skip_ahead(const std::uint64_t num_to_skip) {
|
||||
#ifndef __INTEL_MKL__
|
||||
oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
|
||||
#else
|
||||
if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::mt2203>)
|
||||
throw std::runtime_error("no skip_ahead method of mt2203 engine.");
|
||||
else
|
||||
oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
static inline engine_t create_engine(sycl::queue *queue,
|
||||
const std::uint64_t seed,
|
||||
const std::uint32_t dimensions) {
|
||||
#ifdef __INTEL_MKL__
|
||||
return std::is_same_v<engine_t, oneapi::mkl::rng::sobol>
|
||||
? engine_t(*queue, dimensions)
|
||||
: engine_t(*queue, seed);
|
||||
#else
|
||||
return engine_t(*queue, seed);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename distr_t, typename buffer_t, class... distr_params_t>
|
||||
void generate(buffer_t *output, const std::int64_t n,
|
||||
const distr_params_t... distr_params) {
|
||||
auto output_buf = dpct::detail::get_memory<buffer_t>(output);
|
||||
oneapi::mkl::rng::generate(distr_t(distr_params...), _engine, n,
|
||||
output_buf);
|
||||
}
|
||||
engine_t _engine{};
|
||||
};
|
||||
} // namespace detail
|
||||
} // namespace host
|
||||
|
||||
enum class random_engine_type {
|
||||
philox4x32x10,
|
||||
mrg32k3a,
|
||||
mt2203,
|
||||
mt19937,
|
||||
sobol,
|
||||
mcg59
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<rng::host::detail::rng_generator_base> host_rng_ptr;
|
||||
|
||||
/// Create a host random number generator.
|
||||
/// \param type The random engine type.
|
||||
/// \return The pointer of random number generator.
|
||||
inline host_rng_ptr create_host_rng(const random_engine_type type) {
|
||||
switch (type) {
|
||||
case random_engine_type::philox4x32x10:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::philox4x32x10>>();
|
||||
case random_engine_type::mrg32k3a:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mrg32k3a>>();
|
||||
#ifndef __INTEL_MKL__
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
|
||||
"Interfaces Project does not support this API.");
|
||||
#else
|
||||
case random_engine_type::mt2203:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mt2203>>();
|
||||
case random_engine_type::mt19937:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mt19937>>();
|
||||
case random_engine_type::sobol:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::sobol>>();
|
||||
case random_engine_type::mcg59:
|
||||
return std::make_shared<
|
||||
rng::host::detail::rng_generator<oneapi::mkl::rng::mcg59>>();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
} // namespace rng
|
||||
} // namespace dpct
|
||||
|
||||
#endif // __DPCT_RNG_UTILS_HPP__
|
File diff suppressed because it is too large
Load diff
1070
dpct/util.hpp
1070
dpct/util.hpp
File diff suppressed because it is too large
Load diff
|
@ -19,9 +19,10 @@
|
|||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
#include <dpct/dpct.hpp>
|
||||
#include <dpct/blas_utils.hpp>
|
||||
#include <dpct/lib_common_utils.hpp>
|
||||
// #include <dpct/dpct.hpp>
|
||||
// #include <dpct/blas_utils.hpp>
|
||||
// #include <dpct/lib_common_utils.hpp>
|
||||
#include "dpct.hpp"
|
||||
#include "ggml-sycl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
|
2
run.sh
2
run.sh
|
@ -14,6 +14,6 @@ echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
|
|||
#export GGML_SYCL_DEBUG=1
|
||||
#export GGML_SYCL_LIST_DEVICE=1
|
||||
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
|
||||
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
|
||||
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue