summary dpct definition in one header file to replace folder:dpct

This commit is contained in:
jianyuzh 2024-01-13 20:33:42 +08:00 committed by Meng, Hengyu
parent 5b5389941e
commit a47f5ec42e
27 changed files with 2836 additions and 25652 deletions

2831
dpct.hpp Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,842 +0,0 @@
//==---- atomic.hpp -------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_ATOMIC_HPP__
#define __DPCT_ATOMIC_HPP__
#include <sycl/sycl.hpp>
namespace dpct {
/// Atomically add the value operand to the value at the addr and assign the
/// result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to add to the value at \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_fetch_add(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_add(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_add(operand);
}
/// Atomically add the value operand to the value at the addr and assign the
/// result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to add to the value at \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_fetch_add(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_fetch_add(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically subtract the value operand from the value at the addr and assign
/// the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to subtract from the value at \p addr
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_fetch_sub(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_sub(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_fetch_sub(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_sub(operand);
}
/// Atomically subtract the value operand from the value at the addr and assign
/// the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to subtract from the value at \p addr
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_fetch_sub(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_sub<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_fetch_sub(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_fetch_sub<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically perform a bitwise AND between the value operand and the value at the addr
/// and assign the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_fetch_and(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_and(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_fetch_and(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_and(operand);
}
/// Atomically perform a bitwise AND between the value operand and the value at the addr
/// and assign the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_fetch_and(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_and<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_and<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_and<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_fetch_and(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_fetch_and<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically or the value at the addr with the value operand, and assign
/// the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_fetch_or(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_or(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_fetch_or(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_or(operand);
}
/// Atomically or the value at the addr with the value operand, and assign
/// the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_fetch_or(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_or<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_or<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_or<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_fetch_or(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_fetch_or<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically xor the value at the addr with the value operand, and assign
/// the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_fetch_xor(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_xor(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_fetch_xor(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_xor(operand);
}
/// Atomically xor the value at the addr with the value operand, and assign
/// the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_fetch_xor(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_xor<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_fetch_xor(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_fetch_xor<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically calculate the minimum of the value at addr and the value operand
/// and assign the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_fetch_min(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_min(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_fetch_min(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_min(operand);
}
/// Atomically calculate the minimum of the value at addr and the value operand
/// and assign the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_fetch_min(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_min<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_min<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_min<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_fetch_min(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_fetch_min<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically calculate the maximum of the value at addr and the value operand
/// and assign the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_fetch_max(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_max(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_fetch_max(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.fetch_max(operand);
}
/// Atomically calculate the maximum of the value at addr and the value operand
/// and assign the result to the value at addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_fetch_max(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_max<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_max<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_max<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_fetch_max(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_fetch_max<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically set \p operand to the value stored in \p addr, if old value stored in
/// \p addr is equal to zero or greater than \p operand, else decrease the value stored
/// in \p addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The threshold value.
/// \param memoryOrder The memory ordering used.
/// \returns The old value stored in \p addr.
template <sycl::access::address_space addressSpace = sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline unsigned int atomic_fetch_compare_dec(unsigned int *addr,
unsigned int operand) {
auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
addressSpace>(addr[0]);
unsigned int old;
while (true) {
old = atm.load();
if (old == 0 || old > operand) {
if (atm.compare_exchange_strong(old, operand))
break;
} else if (atm.compare_exchange_strong(old, old - 1))
break;
}
return old;
}
/// Atomically increment the value stored in \p addr if old value stored in \p
/// addr is less than \p operand, else set 0 to the value stored in \p addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The threshold value.
/// \param memoryOrder The memory ordering used.
/// \returns The old value stored in \p addr.
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline unsigned int atomic_fetch_compare_inc(unsigned int *addr,
unsigned int operand) {
auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
addressSpace>(addr[0]);
unsigned int old;
while (true) {
old = atm.load();
if (old >= operand) {
if (atm.compare_exchange_strong(old, 0))
break;
} else if (atm.compare_exchange_strong(old, old + 1))
break;
}
return old;
}
/// Atomically increment the value stored in \p addr if old value stored in \p
/// addr is less than \p operand, else set 0 to the value stored in \p addr.
/// \param [in, out] addr The pointer to the data.
/// \param operand The threshold value.
/// \param memoryOrder The memory ordering used.
/// \returns The old value stored in \p addr.
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline unsigned int
atomic_fetch_compare_inc(unsigned int *addr, unsigned int operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr,
operand);
case sycl::memory_order::acq_rel:
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr,
operand);
case sycl::memory_order::seq_cst:
return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr,
operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
/// Atomically exchange the value at the address addr with the value operand.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to be exchanged with the value pointed by \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
inline T atomic_exchange(T *addr, T operand) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.exchange(operand);
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2>
inline T1 atomic_exchange(T1 *addr, T2 operand) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
return atm.exchange(operand);
}
/// Atomically exchange the value at the address addr with the value operand.
/// \param [in, out] addr The pointer to the data.
/// \param operand The value to be exchanged with the value pointed by \p addr.
/// \param memoryOrder The memory ordering used.
/// \returns The value at the \p addr before the call.
template <typename T, sycl::access::address_space addressSpace =
sycl::access::address_space::global_space>
inline T atomic_exchange(T *addr, T operand,
sycl::memory_order memoryOrder) {
switch (memoryOrder) {
case sycl::memory_order::relaxed:
return atomic_exchange<T, addressSpace, sycl::memory_order::relaxed,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::acq_rel:
return atomic_exchange<T, addressSpace, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(addr, operand);
case sycl::memory_order::seq_cst:
return atomic_exchange<T, addressSpace, sycl::memory_order::seq_cst,
sycl::memory_scope::device>(addr, operand);
default:
assert(false && "Invalid memory_order for atomics. Valid memory_order for "
"atomics are: sycl::memory_order::relaxed, "
"sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
}
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
typename T1, typename T2>
inline T1 atomic_exchange(T1 *addr, T2 operand,
sycl::memory_order memoryOrder) {
atomic_exchange<T1, addressSpace>(addr, operand, memoryOrder);
}
/// Atomically compare the value at \p addr to the value expected and exchange
/// with the value desired if the value at \p addr is equal to the value expected.
/// Returns the value at the \p addr before the call.
/// \param [in, out] addr Multi_ptr.
/// \param expected The value to compare against the value at \p addr.
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
/// \param success The memory ordering used when comparison succeeds.
/// \param fail The memory ordering used when comparison fails.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
T atomic_compare_exchange_strong(
sycl::multi_ptr<T, addressSpace> addr, T expected, T desired,
sycl::memory_order success = sycl::memory_order::relaxed,
sycl::memory_order fail = sycl::memory_order::relaxed) {
auto atm = sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(*addr);
atm.compare_exchange_strong(expected, desired, success, fail);
return expected;
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2, typename T3>
T1 atomic_compare_exchange_strong(
sycl::multi_ptr<T1, addressSpace> addr, T2 expected, T3 desired,
sycl::memory_order success = sycl::memory_order::relaxed,
sycl::memory_order fail = sycl::memory_order::relaxed) {
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(*addr);
T1 expected_value = expected;
atm.compare_exchange_strong(expected_value, desired, success, fail);
return expected_value;
}
/// Atomically compare the value at \p addr to the value expected and exchange
/// with the value desired if the value at \p addr is equal to the value expected.
/// Returns the value at the \p addr before the call.
/// \param [in] addr The pointer to the data.
/// \param expected The value to compare against the value at \p addr.
/// \param desired The value to assign to \p addr if the value at \p addr is expected.
/// \param success The memory ordering used when comparison succeeds.
/// \param fail The memory ordering used when comparison fails.
/// \returns The value at the \p addr before the call.
template <typename T,
sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device>
T atomic_compare_exchange_strong(
T *addr, T expected, T desired,
sycl::memory_order success = sycl::memory_order::relaxed,
sycl::memory_order fail = sycl::memory_order::relaxed) {
auto atm =
sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
atm.compare_exchange_strong(expected, desired, success, fail);
return expected;
}
template <sycl::access::address_space addressSpace =
sycl::access::address_space::global_space,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
sycl::memory_scope memoryScope = sycl::memory_scope::device,
typename T1, typename T2, typename T3>
T1 atomic_compare_exchange_strong(
T1 *addr, T2 expected, T3 desired,
sycl::memory_order success = sycl::memory_order::relaxed,
sycl::memory_order fail = sycl::memory_order::relaxed) {
T1 expected_value = expected;
auto atm =
sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
atm.compare_exchange_strong(expected_value, desired, success, fail);
return expected_value;
}
/// Atomic extension to implement standard APIs in std::atomic
namespace detail{
template <typename T> struct IsValidAtomicType {
static constexpr bool value =
(std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
std::is_same<T, long>::value || std::is_same<T, unsigned long>::value ||
std::is_same<T, long long>::value ||
std::is_same<T, unsigned long long>::value ||
std::is_same<T, float>::value || std::is_same<T, double>::value ||
std::is_pointer<T>::value);
};
} // namespace detail
template <typename T,
sycl::memory_scope DefaultScope = sycl::memory_scope::system,
sycl::memory_order DefaultOrder = sycl::memory_order::seq_cst,
sycl::access::address_space Space =
sycl::access::address_space::generic_space>
class atomic{
static_assert(
detail::IsValidAtomicType<T>::value,
"Invalid atomic type. Valid types are int, unsigned int, long, "
"unsigned long, long long, unsigned long long, float, double "
"and pointer types");
T __d;
public:
/// default memory synchronization order
static constexpr sycl::memory_order default_read_order =
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_read_order;
static constexpr sycl::memory_order default_write_order =
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_write_order;
static constexpr sycl::memory_scope default_scope = DefaultScope;
static constexpr sycl::memory_order default_read_modify_write_order =
DefaultOrder;
/// Default constructor.
constexpr atomic() noexcept = default;
/// Constructor with initialize value.
constexpr atomic(T d) noexcept : __d(d){};
/// atomically replaces the value of the referenced object with a non-atomic argument
/// \param operand The value to replace the pointed value.
/// \param memoryOrder The memory ordering used.
/// \param memoryScope The memory scope used.
void store(T operand, sycl::memory_order memoryOrder = default_write_order,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
atm.store(operand, memoryOrder, memoryScope);
}
/// atomically obtains the value of the referenced object
/// \param memoryOrder The memory ordering used.
/// \param memoryScope The memory scope used.
/// \returns The value of the referenced object
T load(sycl::memory_order memoryOrder = default_read_order,
sycl::memory_scope memoryScope = default_scope) const noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(
const_cast<T &>(__d));
return atm.load(memoryOrder, memoryScope);
}
/// atomically replaces the value of the referenced object and obtains the value held previously
/// \param operand The value to replace the pointed value.
/// \param memoryOrder The memory ordering used.
/// \param memoryScope The memory scope used.
/// \returns The value of the referenced object before the call.
T exchange(T operand,
sycl::memory_order memoryOrder = default_read_modify_write_order,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
return atm.exchange(operand, memoryOrder, memoryScope);
}
/// atomically compares the value of the referenced object with non-atomic argument
/// and performs atomic exchange if equal or atomic load if not
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
/// \param desired The value to store in the referenced object if it is as expected
/// \param success The memory models for the read-modify-write
/// \param failure The memory models for load operations
/// \param memoryScope The memory scope used.
/// \returns true if the referenced object was successfully changed, false otherwise.
bool compare_exchange_weak(
T &expected, T desired,
sycl::memory_order success, sycl::memory_order failure,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
return atm.compare_exchange_weak(expected, desired, success, failure, memoryScope);
}
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
/// \param desired The value to store in the referenced object if it is as expected
/// \param memoryOrder The memory synchronization ordering for operations
/// \param memoryScope The memory scope used.
/// \returns true if the referenced object was successfully changed, false otherwise.
bool compare_exchange_weak(T &expected, T desired,
sycl::memory_order memoryOrder = default_read_modify_write_order,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
return atm.compare_exchange_weak(expected, desired, memoryOrder, memoryScope);
}
/// atomically compares the value of the referenced object with non-atomic argument
/// and performs atomic exchange if equal or atomic load if not
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
/// \param desired The value to store in the referenced object if it is as expected
/// \param success The memory models for the read-modify-write
/// \param failure The memory models for load operations
/// \param memoryScope The memory scope used.
/// \returns true if the referenced object was successfully changed, false otherwise.
bool compare_exchange_strong(
T &expected, T desired,
sycl::memory_order success, sycl::memory_order failure,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
return atm.compare_exchange_strong(expected, desired, success, failure, memoryScope);
}
/// \param expected The value expected to be found in the object referenced by the atomic_ref object
/// \param desired The value to store in the referenced object if it is as expected
/// \param memoryOrder The memory synchronization ordering for operations
/// \param memoryScope The memory scope used.
/// \returns true if the referenced object was successfully changed, false otherwise.
bool compare_exchange_strong(T &expected, T desired,
sycl::memory_order memoryOrder = default_read_modify_write_order,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
return atm.compare_exchange_strong(expected, desired, memoryOrder, memoryScope);
}
/// atomically adds the argument to the value stored in the atomic object and obtains the value held previously
/// \param operand The other argument of arithmetic addition
/// \param memoryOrder The memory ordering used.
/// \param memoryScope The memory scope used.
/// \returns The value of the referenced object before the call.
T fetch_add(T operand,
sycl::memory_order memoryOrder = default_read_modify_write_order,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
return atm.fetch_add(operand, memoryOrder, memoryScope);
}
/// atomically subtracts the argument from the value stored in the atomic object and obtains the value held previously
/// \param operand The other argument of arithmetic subtraction
/// \param memoryOrder The memory ordering used.
/// \param memoryScope The memory scope used.
/// \returns The value of the referenced object before the call.
T fetch_sub(T operand,
sycl::memory_order memoryOrder = default_read_modify_write_order,
sycl::memory_scope memoryScope = default_scope) noexcept {
sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
return atm.fetch_sub(operand, memoryOrder, memoryScope);
}
};
} // namespace dpct
#endif // __DPCT_ATOMIC_HPP__

File diff suppressed because it is too large Load diff

View file

@ -1,286 +0,0 @@
//==---- ccl_utils.hpp----------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_CCL_UTILS_HPP__
#define __DPCT_CCL_UTILS_HPP__
#include <sycl/sycl.hpp>
#include <oneapi/ccl.hpp>
#include <unordered_map>
#include <memory>
#include "device.hpp"
namespace dpct {
namespace ccl {
namespace detail {
/// Get stored kvs with specified kvs address.
inline std::shared_ptr<oneapi::ccl::kvs> &
get_kvs(const oneapi::ccl::kvs::address_type &addr) {
struct hash {
std::size_t operator()(const oneapi::ccl::kvs::address_type &in) const {
return std::hash<std::string_view>()(std::string_view(in.data(), in.size()));
}
};
static std::unordered_map<oneapi::ccl::kvs::address_type,
std::shared_ptr<oneapi::ccl::kvs>, hash>
kvs_map;
return kvs_map[addr];
}
/// Help class to init ccl environment.
class ccl_init_helper {
public:
ccl_init_helper() { oneapi::ccl::init(); }
};
} // namespace detail
/// Get concatenated library version as an integer.
static inline int get_version() {
oneapi::ccl::init();
auto ver = oneapi::ccl::get_library_version();
return ver.major * 10000 + ver.minor * 100 + ver.update;
}
/// Create main kvs and return its address.
static inline oneapi::ccl::kvs::address_type create_kvs_address() {
oneapi::ccl::init();
auto ptr = oneapi::ccl::create_main_kvs();
auto addr = ptr->get_address();
detail::get_kvs(addr) = ptr;
return addr;
}
/// Get stored kvs with /p addr if exist. Otherwise, create kvs with /p addr.
static inline std::shared_ptr<oneapi::ccl::kvs>
create_kvs(const oneapi::ccl::kvs::address_type &addr) {
oneapi::ccl::init();
auto &ptr = detail::get_kvs(addr);
if (!ptr)
ptr = oneapi::ccl::create_kvs(addr);
return ptr;
}
/// dpct communicator extension
class communicator_wrapper : public dpct::ccl::detail::ccl_init_helper {
public:
communicator_wrapper(
int size, int rank, oneapi::ccl::kvs::address_type id,
const oneapi::ccl::comm_attr &attr = oneapi::ccl::default_comm_attr)
: _device_comm(oneapi::ccl::create_device(
static_cast<sycl::device &>(dpct::get_current_device()))),
_context_comm(oneapi::ccl::create_context(dpct::get_default_context())),
_comm(oneapi::ccl::create_communicator(
size, rank, _device_comm, _context_comm, dpct::ccl::create_kvs(id),
attr)) {
_queue_init = false;
_ccl_stream_ptr = nullptr;
}
~communicator_wrapper() {
delete _ccl_stream_ptr;
};
/// Return the rank in a oneapi::ccl::communicator
/// \returns The rank corresponding to communicator object
int rank() const {
return _comm.rank();
}
/// Retrieves the number of rank in oneapi::ccl::communicator
/// \returns The number of the ranks
int size() const {
return _comm.size();
}
/// Return underlying native device, which was used in oneapi::ccl::communicator
sycl::device get_device() const {
return _comm.get_device().get_native();
}
/// \brief allreduce is a collective communication operation that performs the global reduction operation
/// on values from all ranks of communicator and distributes the result back to all ranks.
/// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
/// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
/// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
/// \param rtype the type of the reduction operation to be applied
/// \param queue_ptr a sycl::queue ptr associated with the operation
/// \return @ref void
void allreduce(const void *sendbuff, void *recvbuff, size_t count,
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
sycl::queue *queue_ptr) {
call_func_wrapper(
[=](const oneapi::ccl::stream &stream) {
return oneapi::ccl::allreduce(sendbuff, recvbuff, count, dtype, rtype,
_comm, stream);
},
queue_ptr);
}
/// \brief reduce is a collective communication operation that performs the
/// global reduction operation on values from all ranks of the communicator
/// and returns the result to the root rank.
/// \param sendbuff the buffer with @c count elements of @c dtype that stores
/// local data to be reduced
/// \param recvbuff [out] the buffer to store reduced result,
/// must have the same dimension as @c sendbuff
/// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
/// \param root the rank that gets the result of reduction
/// \param rtype the type of the reduction operation to be applied
/// \param queue_ptr a sycl::queue ptr associated with the operation
/// \return @ref void
void reduce(const void *sendbuff, void *recvbuff, size_t count,
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
int root, sycl::queue *queue_ptr) {
call_func_wrapper(
[=](const oneapi::ccl::stream &stream) {
return oneapi::ccl::reduce(sendbuff, recvbuff, count, dtype, rtype,
root, _comm, stream);
},
queue_ptr);
}
/// \brief broadcast is a collective communication operation that broadcasts data
/// from one rank of communicator (denoted as root) to all other ranks.
/// Only support in-place operation
/// \param sendbuff the buffer with @c count elements of @c dtype that stores
/// local data to be reduced
/// \param recvbuff [out] the buffer to store reduced result
/// \param count the number of elements of type @c dtype in @c buf
/// \param dtype thedatatype of elements in @c buf
/// \param root the rank that broadcasts @c buf
/// \param queue_ptr a sycl::queue ptr associated with the operation
/// \return @ref void
void broadcast(void *sendbuff, void *recvbuff, size_t count,
oneapi::ccl::datatype dtype, int root,
sycl::queue *queue_ptr) {
if (sendbuff != recvbuff) {
throw std::runtime_error(
"oneCCL broadcast only support in-place operation. "
"sendbuff and recvbuff must be same.");
return;
}
call_func_wrapper(
[=](const oneapi::ccl::stream &stream) {
return oneapi::ccl::broadcast(recvbuff, count, dtype, root, _comm,
stream);
},
queue_ptr);
}
/// \brief reduce_scatter is a collective communication operation that performs the global reduction operation
/// on values from all ranks of the communicator and scatters the result in blocks back to all ranks.
/// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
/// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
/// \param recv_count the number of elements of type @c dtype in receive block
/// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
/// \param rtype the type of the reduction operation to be applied
/// \param queue_ptr a sycl::queue ptr associated with the operation
/// \return @ref void
void reduce_scatter(const void *sendbuff, void *recvbuff, size_t recv_count,
oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
sycl::queue *queue_ptr) {
call_func_wrapper(
[=](const oneapi::ccl::stream &stream) {
return oneapi::ccl::reduce_scatter(sendbuff, recvbuff, recv_count,
dtype, rtype, _comm, stream);
},
queue_ptr);
}
/// \brief send is a pt2pt communication operation that sends data from one rank of communicator.
/// \param sendbuff the buffer with @c count elements of @c dtype serves as send buffer for root
/// \param count the number of elements of type @c dtype in @c sendbuff
/// \param dtype the datatype of elements in @c sendbuff
/// \param peer the rank that receives @c sendbuff
/// \param queue_ptr a sycl::queue ptr associated with the operation
/// \return @ref void
void send(void *sendbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
sycl::queue *queue_ptr) {
call_func_wrapper(
[=](const oneapi::ccl::stream &stream) {
return oneapi::ccl::send(sendbuff, count, dtype, peer, _comm, stream);
},
queue_ptr);
}
/// \brief recv is a pt2pt communication operation that sends data from one rank of communicator.
/// \param recvbuff the buffer with @c count elements of @c dtype serves as receive buffer
/// \param count the number of elements of type @c dtype in @c recvbuff
/// \param dtype the datatype of elements in @c recvbuff
/// \param peer the rank that receives @c recvbuff
/// \param queue_ptr a sycl::queue ptr associated with the operation
/// \return @ref void
void recv(void *recvbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
sycl::queue *queue_ptr) {
call_func_wrapper(
[=](const oneapi::ccl::stream &stream) {
return oneapi::ccl::recv(recvbuff, count, dtype, peer, _comm, stream);
},
queue_ptr);
}
private:
oneapi::ccl::device _device_comm;
oneapi::ccl::context _context_comm;
oneapi::ccl::communicator _comm;
sycl::queue _queue;
bool _queue_init;
oneapi::ccl::stream *_ccl_stream_ptr;
template <class Fn>
void call_func_wrapper(Fn func, sycl::queue *qptr) {
if (_queue_init && *qptr != _queue) {
call_func_async(func, qptr);
} else {
if(!_queue_init) {
_queue = *qptr;
_queue_init = true;
_ccl_stream_ptr = new oneapi::ccl::stream(oneapi::ccl::create_stream(_queue));
}
std::invoke(func, *_ccl_stream_ptr);
}
}
class call_func_async {
sycl::queue *_q_ptr;
struct call_async_impl {
oneapi::ccl::stream _ccl_stream_impl;
oneapi::ccl::event _ccl_event_impl;
template <class Fn>
explicit call_async_impl(Fn func, sycl::queue *qptr)
: _ccl_stream_impl(oneapi::ccl::create_stream(*qptr)),
_ccl_event_impl(std::invoke(func, _ccl_stream_impl)) {}
};
call_async_impl *_imp;
public:
template <class Fn>
explicit call_func_async(Fn func, sycl::queue *qptr)
: _q_ptr(qptr),
_imp(new call_async_impl(func, qptr)) {}
~call_func_async() {
_q_ptr->submit([&](sycl::handler &cgh)
{ cgh.host_task([=]
{
_imp->_ccl_event_impl.wait();
delete _imp; }); });
}
};
};
typedef dpct::ccl::communicator_wrapper *comm_ptr;
} // namespace ccl
} // namespace dpct
#endif // __DPCT_CCL_UTILS_HPP__

View file

@ -1,781 +0,0 @@
//==---- device.hpp -------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_DEVICE_HPP__
#define __DPCT_DEVICE_HPP__
#include <sycl/sycl.hpp>
#include <algorithm>
#include <array>
#include <cstring>
#include <iostream>
#include <mutex>
#include <set>
#include <sstream>
#include <map>
#include <vector>
#include <thread>
#if defined(__linux__)
#include <unistd.h>
#include <sys/syscall.h>
#endif
#if defined(_WIN64)
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif
namespace dpct {
namespace detail {
static void get_version(const sycl::device &dev, int &major, int &minor) {
// Version string has the following format:
// a. OpenCL<space><major.minor><space><vendor-specific-information>
// b. <major.minor>
std::string ver;
ver = dev.get_info<sycl::info::device::version>();
std::string::size_type i = 0;
while (i < ver.size()) {
if (isdigit(ver[i]))
break;
i++;
}
major = std::stoi(&(ver[i]));
while (i < ver.size()) {
if (ver[i] == '.')
break;
i++;
}
i++;
minor = std::stoi(&(ver[i]));
}
} // namespace detail
/// SYCL default exception handler
inline auto exception_handler = [](sycl::exception_list exceptions) {
for (std::exception_ptr const &e : exceptions) {
try {
std::rethrow_exception(e);
} catch (sycl::exception const &e) {
std::cerr << "Caught asynchronous SYCL exception:" << std::endl
<< e.what() << std::endl
<< "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
}
}
};
typedef sycl::event *event_ptr;
typedef sycl::queue *queue_ptr;
typedef char *device_ptr;
/// Destroy \p event pointed memory.
///
/// \param event Pointer to the sycl::event address.
static void destroy_event(event_ptr event) {
delete event;
}
class device_info {
public:
// get interface
const char *get_name() const { return _name; }
char *get_name() { return _name; }
template <typename WorkItemSizesTy = sycl::range<3>,
std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
std::is_same_v<WorkItemSizesTy, int *>,
int> = 0>
auto get_max_work_item_sizes() const {
if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
return sycl::range<3>(_max_work_item_sizes_i[0],
_max_work_item_sizes_i[1],
_max_work_item_sizes_i[2]);
else {
return _max_work_item_sizes_i;
}
}
template <typename WorkItemSizesTy = sycl::range<3>,
std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
std::is_same_v<WorkItemSizesTy, int *>,
int> = 0>
auto get_max_work_item_sizes() {
if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
return sycl::range<3>(_max_work_item_sizes_i[0],
_max_work_item_sizes_i[1],
_max_work_item_sizes_i[2]);
else {
return _max_work_item_sizes_i;
}
}
bool get_host_unified_memory() const { return _host_unified_memory; }
int get_major_version() const { return _major; }
int get_minor_version() const { return _minor; }
int get_integrated() const { return _integrated; }
int get_max_clock_frequency() const { return _frequency; }
int get_max_compute_units() const { return _max_compute_units; }
int get_max_work_group_size() const { return _max_work_group_size; }
int get_max_sub_group_size() const { return _max_sub_group_size; }
int get_max_work_items_per_compute_unit() const {
return _max_work_items_per_compute_unit;
}
int get_max_register_size_per_work_group() const {
return _max_register_size_per_work_group;
}
template <typename NDRangeSizeTy = size_t *,
std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
std::is_same_v<NDRangeSizeTy, int *>,
int> = 0>
auto get_max_nd_range_size() const {
if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
return _max_nd_range_size;
else
return _max_nd_range_size_i;
}
template <typename NDRangeSizeTy = size_t *,
std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
std::is_same_v<NDRangeSizeTy, int *>,
int> = 0>
auto get_max_nd_range_size() {
if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
return _max_nd_range_size;
else
return _max_nd_range_size_i;
}
size_t get_global_mem_size() const { return _global_mem_size; }
size_t get_local_mem_size() const { return _local_mem_size; }
/// Returns the maximum clock rate of device's global memory in kHz. If
/// compiler does not support this API then returns default value 3200000 kHz.
unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
/// Returns the maximum bus width between device and memory in bits. If
/// compiler does not support this API then returns default value 64 bits.
unsigned int get_memory_bus_width() const { return _memory_bus_width; }
uint32_t get_device_id() const { return _device_id; }
std::array<unsigned char, 16> get_uuid() const { return _uuid; }
/// Returns global memory cache size in bytes.
unsigned int get_global_mem_cache_size() const {
return _global_mem_cache_size;
}
// set interface
void set_name(const char* name) {
size_t length = strlen(name);
if (length < 256) {
std::memcpy(_name, name, length + 1);
} else {
std::memcpy(_name, name, 255);
_name[255] = '\0';
}
}
void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) {
for (int i = 0; i < 3; ++i)
_max_work_item_sizes_i[i] = max_work_item_sizes[i];
}
[[deprecated]] void
set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) {
for (int i = 0; i < 3; ++i) {
_max_work_item_sizes_i[i] = max_work_item_sizes[i];
}
}
void set_host_unified_memory(bool host_unified_memory) {
_host_unified_memory = host_unified_memory;
}
void set_major_version(int major) { _major = major; }
void set_minor_version(int minor) { _minor = minor; }
void set_integrated(int integrated) { _integrated = integrated; }
void set_max_clock_frequency(int frequency) { _frequency = frequency; }
void set_max_compute_units(int max_compute_units) {
_max_compute_units = max_compute_units;
}
void set_global_mem_size(size_t global_mem_size) {
_global_mem_size = global_mem_size;
}
void set_local_mem_size(size_t local_mem_size) {
_local_mem_size = local_mem_size;
}
void set_max_work_group_size(int max_work_group_size) {
_max_work_group_size = max_work_group_size;
}
void set_max_sub_group_size(int max_sub_group_size) {
_max_sub_group_size = max_sub_group_size;
}
void
set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) {
_max_work_items_per_compute_unit = max_work_items_per_compute_unit;
}
void set_max_nd_range_size(int max_nd_range_size[]) {
for (int i = 0; i < 3; i++) {
_max_nd_range_size[i] = max_nd_range_size[i];
_max_nd_range_size_i[i] = max_nd_range_size[i];
}
}
void set_memory_clock_rate(unsigned int memory_clock_rate) {
_memory_clock_rate = memory_clock_rate;
}
void set_memory_bus_width(unsigned int memory_bus_width) {
_memory_bus_width = memory_bus_width;
}
void
set_max_register_size_per_work_group(int max_register_size_per_work_group) {
_max_register_size_per_work_group = max_register_size_per_work_group;
}
void set_device_id(uint32_t device_id) {
_device_id = device_id;
}
void set_uuid(std::array<unsigned char, 16> uuid) {
_uuid = std::move(uuid);
}
void set_global_mem_cache_size(unsigned int global_mem_cache_size) {
_global_mem_cache_size = global_mem_cache_size;
}
private:
char _name[256];
int _max_work_item_sizes_i[3];
bool _host_unified_memory = false;
int _major;
int _minor;
int _integrated = 0;
int _frequency;
// Set estimated value 3200000 kHz as default value.
unsigned int _memory_clock_rate = 3200000;
// Set estimated value 64 bits as default value.
unsigned int _memory_bus_width = 64;
unsigned int _global_mem_cache_size;
int _max_compute_units;
int _max_work_group_size;
int _max_sub_group_size;
int _max_work_items_per_compute_unit;
int _max_register_size_per_work_group;
size_t _global_mem_size;
size_t _local_mem_size;
size_t _max_nd_range_size[3];
int _max_nd_range_size_i[3];
uint32_t _device_id;
std::array<unsigned char, 16> _uuid;
};
static int get_major_version(const sycl::device &dev) {
int major, minor;
detail::get_version(dev, major, minor);
return major;
}
static int get_minor_version(const sycl::device &dev) {
int major, minor;
detail::get_version(dev, major, minor);
return minor;
}
static void get_device_info(device_info &out, const sycl::device &dev) {
device_info prop;
prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
int major, minor;
detail::get_version(dev, major, minor);
prop.set_major_version(major);
prop.set_minor_version(minor);
prop.set_max_work_item_sizes(
#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
// oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
// is an enum class element
dev.get_info<sycl::info::device::max_work_item_sizes>());
#else
// SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
// an int
dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
#endif
prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
prop.set_max_clock_frequency(
dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
prop.set_max_compute_units(
dev.get_info<sycl::info::device::max_compute_units>());
prop.set_max_work_group_size(
dev.get_info<sycl::info::device::max_work_group_size>());
prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) {
unsigned int tmp =
dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
if (tmp != 0)
prop.set_memory_clock_rate(1000 * tmp);
}
if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) {
prop.set_memory_bus_width(
dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
}
if (dev.has(sycl::aspect::ext_intel_device_id)) {
prop.set_device_id(
dev.get_info<sycl::ext::intel::info::device::device_id>());
}
if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) {
prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
}
#elif defined(_MSC_VER) && !defined(__clang__)
#pragma message("get_device_info: querying memory_clock_rate and \
memory_bus_width are not supported by the compiler used. \
Use 3200000 kHz as memory_clock_rate default value. \
Use 64 bits as memory_bus_width default value.")
#else
#warning "get_device_info: querying memory_clock_rate and \
memory_bus_width are not supported by the compiler used. \
Use 3200000 kHz as memory_clock_rate default value. \
Use 64 bits as memory_bus_width default value."
#endif
size_t max_sub_group_size = 1;
std::vector<size_t> sub_group_sizes =
dev.get_info<sycl::info::device::sub_group_sizes>();
for (const auto &sub_group_size : sub_group_sizes) {
if (max_sub_group_size < sub_group_size)
max_sub_group_size = sub_group_size;
}
prop.set_max_sub_group_size(max_sub_group_size);
prop.set_max_work_items_per_compute_unit(
dev.get_info<sycl::info::device::max_work_group_size>());
int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
prop.set_max_nd_range_size(max_nd_range_size);
// Estimates max register size per work group, feel free to update the value
// according to device properties.
prop.set_max_register_size_per_work_group(65536);
prop.set_global_mem_cache_size(
dev.get_info<sycl::info::device::global_mem_cache_size>());
out = prop;
}
/// dpct device extension
class device_ext : public sycl::device {
typedef std::mutex mutex_type;
public:
device_ext() : sycl::device(), _ctx(*this) {}
~device_ext() {
std::lock_guard<mutex_type> lock(m_mutex);
clear_queues();
}
device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) {
std::lock_guard<mutex_type> lock(m_mutex);
init_queues();
}
int is_native_atomic_supported() { return 0; }
int get_major_version() const {
return dpct::get_major_version(*this);
}
int get_minor_version() const {
return dpct::get_minor_version(*this);
}
int get_max_compute_units() const {
return get_device_info().get_max_compute_units();
}
/// Return the maximum clock frequency of this device in KHz.
int get_max_clock_frequency() const {
return get_device_info().get_max_clock_frequency();
}
int get_integrated() const { return get_device_info().get_integrated(); }
int get_max_sub_group_size() const {
return get_device_info().get_max_sub_group_size();
}
int get_max_register_size_per_work_group() const {
return get_device_info().get_max_register_size_per_work_group();
}
int get_max_work_group_size() const {
return get_device_info().get_max_work_group_size();
}
int get_mem_base_addr_align() const {
return get_info<sycl::info::device::mem_base_addr_align>();
}
size_t get_global_mem_size() const {
return get_device_info().get_global_mem_size();
}
/// Get the number of bytes of free and total memory on the SYCL device.
/// \param [out] free_memory The number of bytes of free memory on the SYCL device.
/// \param [out] total_memory The number of bytes of total memory on the SYCL device.
void get_memory_info(size_t &free_memory, size_t &total_memory) {
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
if (!has(sycl::aspect::ext_intel_free_memory)) {
std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
free_memory = 0;
} else {
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
}
#else
std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
free_memory = 0;
#if defined(_MSC_VER) && !defined(__clang__)
#pragma message("Querying the number of bytes of free memory is not supported")
#else
#warning "Querying the number of bytes of free memory is not supported"
#endif
#endif
total_memory = get_device_info().get_global_mem_size();
}
void get_device_info(device_info &out) const {
dpct::get_device_info(out, *this);
}
device_info get_device_info() const {
device_info prop;
dpct::get_device_info(prop, *this);
return prop;
}
void reset() {
std::lock_guard<mutex_type> lock(m_mutex);
clear_queues();
init_queues();
}
sycl::queue &in_order_queue() { return *_q_in_order; }
sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
sycl::queue &default_queue() {
#ifdef DPCT_USM_LEVEL_NONE
return out_of_order_queue();
#else
return in_order_queue();
#endif // DPCT_USM_LEVEL_NONE
}
void queues_wait_and_throw() {
std::unique_lock<mutex_type> lock(m_mutex);
std::vector<std::shared_ptr<sycl::queue>> current_queues(
_queues);
lock.unlock();
for (const auto &q : current_queues) {
q->wait_and_throw();
}
// Guard the destruct of current_queues to make sure the ref count is safe.
lock.lock();
}
sycl::queue *create_queue(bool enable_exception_handler = false) {
#ifdef DPCT_USM_LEVEL_NONE
return create_out_of_order_queue(enable_exception_handler);
#else
return create_in_order_queue(enable_exception_handler);
#endif // DPCT_USM_LEVEL_NONE
}
sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
std::lock_guard<mutex_type> lock(m_mutex);
return create_queue_impl(enable_exception_handler,
sycl::property::queue::in_order());
}
sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
std::lock_guard<mutex_type> lock(m_mutex);
return create_queue_impl(enable_exception_handler);
}
void destroy_queue(sycl::queue *&queue) {
std::lock_guard<mutex_type> lock(m_mutex);
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
[=](const std::shared_ptr<sycl::queue> &q) -> bool {
return q.get() == queue;
}),
_queues.end());
queue = nullptr;
}
void set_saved_queue(sycl::queue* q) {
std::lock_guard<mutex_type> lock(m_mutex);
_saved_queue = q;
}
sycl::queue *get_saved_queue() const {
std::lock_guard<mutex_type> lock(m_mutex);
return _saved_queue;
}
sycl::context get_context() const { return _ctx; }
private:
void clear_queues() {
_queues.clear();
_q_in_order = _q_out_of_order = _saved_queue = nullptr;
}
void init_queues() {
_q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
_q_out_of_order = create_queue_impl(true);
_saved_queue = &default_queue();
}
/// Caller should acquire resource \p m_mutex before calling this function.
template <class... Properties>
sycl::queue *create_queue_impl(bool enable_exception_handler,
Properties... properties) {
sycl::async_handler eh = {};
if (enable_exception_handler) {
eh = exception_handler;
}
_queues.push_back(std::make_shared<sycl::queue>(
_ctx, *this, eh,
sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
sycl::property::queue::enable_profiling(),
#endif
properties...)));
return _queues.back().get();
}
void get_version(int &major, int &minor) const {
detail::get_version(*this, major, minor);
}
sycl::queue *_q_in_order, *_q_out_of_order;
sycl::queue *_saved_queue;
sycl::context _ctx;
std::vector<std::shared_ptr<sycl::queue>> _queues;
mutable mutex_type m_mutex;
};
static inline unsigned int get_tid() {
#if defined(__linux__)
return syscall(SYS_gettid);
#elif defined(_WIN64)
return GetCurrentThreadId();
#else
#error "Only support Windows and Linux."
#endif
}
/// device manager
class dev_mgr {
public:
device_ext &current_device() {
unsigned int dev_id=current_device_id();
check_id(dev_id);
return *_devs[dev_id];
}
device_ext &cpu_device() const {
std::lock_guard<std::recursive_mutex> lock(m_mutex);
if (_cpu_device == -1) {
throw std::runtime_error("no valid cpu device");
} else {
return *_devs[_cpu_device];
}
}
device_ext &get_device(unsigned int id) const {
std::lock_guard<std::recursive_mutex> lock(m_mutex);
check_id(id);
return *_devs[id];
}
unsigned int current_device_id() const {
std::lock_guard<std::recursive_mutex> lock(m_mutex);
auto it=_thread2dev_map.find(get_tid());
if(it != _thread2dev_map.end())
return it->second;
return DEFAULT_DEVICE_ID;
}
/// Select device with a device ID.
/// \param [in] id The id of the device which can
/// be obtained through get_device_id(const sycl::device).
void select_device(unsigned int id) {
std::lock_guard<std::recursive_mutex> lock(m_mutex);
check_id(id);
_thread2dev_map[get_tid()]=id;
}
unsigned int device_count() { return _devs.size(); }
unsigned int get_device_id(const sycl::device &dev) {
unsigned int id = 0;
for(auto dev_item : _devs) {
if (*dev_item == dev) {
break;
}
id++;
}
return id;
}
template <class DeviceSelector>
std::enable_if_t<
std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
sycl::device selected_device = sycl::device(selector);
unsigned int selected_device_id = get_device_id(selected_device);
select_device(selected_device_id);
}
/// Returns the instance of device manager singleton.
static dev_mgr &instance() {
static dev_mgr d_m;
return d_m;
}
dev_mgr(const dev_mgr &) = delete;
dev_mgr &operator=(const dev_mgr &) = delete;
dev_mgr(dev_mgr &&) = delete;
dev_mgr &operator=(dev_mgr &&) = delete;
private:
mutable std::recursive_mutex m_mutex;
dev_mgr() {
sycl::device default_device =
sycl::device(sycl::default_selector_v);
_devs.push_back(std::make_shared<device_ext>(default_device));
std::vector<sycl::device> sycl_all_devs =
sycl::device::get_devices(sycl::info::device_type::all);
// Collect other devices except for the default device.
if (default_device.is_cpu())
_cpu_device = 0;
for (auto &dev : sycl_all_devs) {
if (dev == default_device) {
continue;
}
_devs.push_back(std::make_shared<device_ext>(dev));
if (_cpu_device == -1 && dev.is_cpu()) {
_cpu_device = _devs.size() - 1;
}
}
}
void check_id(unsigned int id) const {
if (id >= _devs.size()) {
throw std::runtime_error("invalid device id");
}
}
std::vector<std::shared_ptr<device_ext>> _devs;
/// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
/// thread id in _thread2dev_map, which means default device should be used
/// for the current thread.
const unsigned int DEFAULT_DEVICE_ID = 0;
/// thread-id to device-id map.
std::map<unsigned int, unsigned int> _thread2dev_map;
int _cpu_device = -1;
};
/// Util function to get the default queue of current selected device depends on
/// the USM config. Return the default out-of-ordered queue when USM-none is
/// enabled, otherwise return the default in-ordered queue.
static inline sycl::queue &get_default_queue() {
return dev_mgr::instance().current_device().default_queue();
}
/// Util function to get the default in-ordered queue of current device in
/// dpct device manager.
static inline sycl::queue &get_in_order_queue() {
return dev_mgr::instance().current_device().in_order_queue();
}
/// Util function to get the default out-of-ordered queue of current device in
/// dpct device manager.
static inline sycl::queue &get_out_of_order_queue() {
return dev_mgr::instance().current_device().out_of_order_queue();
}
/// Util function to get the id of current device in
/// dpct device manager.
static inline unsigned int get_current_device_id() {
return dev_mgr::instance().current_device_id();
}
/// Util function to get the current device.
static inline device_ext &get_current_device() {
return dev_mgr::instance().current_device();
}
/// Util function to get a device by id.
static inline device_ext &get_device(unsigned int id) {
return dev_mgr::instance().get_device(id);
}
/// Util function to get the context of the default queue of current
/// device in dpct device manager.
static inline sycl::context get_default_context() {
return dpct::get_current_device().get_context();
}
/// Util function to get a CPU device.
static inline device_ext &cpu_device() {
return dev_mgr::instance().cpu_device();
}
static inline unsigned int select_device(unsigned int id) {
dev_mgr::instance().select_device(id);
return id;
}
template <class DeviceSelector>
static inline std::enable_if_t<
std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
dev_mgr::instance().select_device(selector);
}
static inline unsigned int get_device_id(const sycl::device &dev){
return dev_mgr::instance().get_device_id(dev);
}
/// Util function to check whether a device supports some kinds of sycl::aspect.
inline void
has_capability_or_fail(const sycl::device &dev,
const std::initializer_list<sycl::aspect> &props) {
for (const auto &it : props) {
if (dev.has(it))
continue;
switch (it) {
case sycl::aspect::fp64:
throw std::runtime_error("'double' is not supported in '" +
dev.get_info<sycl::info::device::name>() +
"' device");
break;
case sycl::aspect::fp16:
throw std::runtime_error("'half' is not supported in '" +
dev.get_info<sycl::info::device::name>() +
"' device");
break;
default:
#define __SYCL_ASPECT(ASPECT, ID) \
case sycl::aspect::ASPECT: \
return #ASPECT;
#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string {
switch (AspectNum) {
#include <sycl/info/aspects.def>
#include <sycl/info/aspects_deprecated.def>
default:
return "unknown aspect";
}
};
#undef __SYCL_ASPECT_DEPRECATED_ALIAS
#undef __SYCL_ASPECT_DEPRECATED
#undef __SYCL_ASPECT
throw std::runtime_error(
"'" + getAspectNameStr(it) + "' is not supported in '" +
dev.get_info<sycl::info::device::name>() + "' device");
}
break;
}
}
} // namespace dpct
#endif // __DPCT_DEVICE_HPP__

File diff suppressed because it is too large Load diff

View file

@ -1,62 +0,0 @@
//==---- dpct.hpp ---------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_HPP__
#define __DPCT_HPP__
#include <sycl/sycl.hpp>
#include <iostream>
#include <limits.h>
#include <math.h>
template <class... Args> class dpct_kernel_name;
template <int Arg> class dpct_kernel_scalar;
#include "atomic.hpp"
#include "device.hpp"
#include "image.hpp"
#include "kernel.hpp"
#include "math.hpp"
#include "memory.hpp"
#include "util.hpp"
#if defined(_MSC_VER)
#define __dpct_align__(n) __declspec(align(n))
#define __dpct_inline__ __forceinline
#else
#define __dpct_align__(n) __attribute__((aligned(n)))
#define __dpct_inline__ __inline__ __attribute__((always_inline))
#endif
#if defined(_MSC_VER)
#define __dpct_noinline__ __declspec(noinline)
#else
#define __dpct_noinline__ __attribute__((noinline))
#endif
#define DPCT_COMPATIBILITY_TEMP (900)
namespace dpct{
enum error_code { success = 0, default_error = 999 };
}
#define DPCT_CHECK_ERROR(expr) \
[&]() { \
try { \
expr; \
return dpct::success; \
} catch (std::exception const &e) { \
std::cerr << e.what() << std::endl; \
return dpct::default_error; \
} \
}()
#define DPCT_PI_F (3.14159274101257f)
#define DPCT_PI (3.141592653589793115998)
#endif // __DPCT_HPP__

File diff suppressed because it is too large Load diff

View file

@ -1,747 +0,0 @@
//==---- dpcpp_extensions.h ------------------*- C++ -*---------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------===//
#ifndef __DPCT_DPCPP_EXTENSIONS_H__
#define __DPCT_DPCPP_EXTENSIONS_H__
#include <sycl/sycl.hpp>
#include <stdexcept>
#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
#include <sycl/ext/oneapi/experimental/user_defined_reductions.hpp>
#endif
#include "../dpct.hpp"
#include "functional.h"
namespace dpct {
namespace group {
namespace detail {
template <typename... _Args>
constexpr auto __reduce_over_group(_Args... __args) {
return sycl::reduce_over_group(__args...);
}
template <typename... _Args> constexpr auto __group_broadcast(_Args... __args) {
return sycl::group_broadcast(__args...);
}
template <typename... _Args>
constexpr auto __exclusive_scan_over_group(_Args... __args) {
return sycl::exclusive_scan_over_group(__args...);
}
template <typename... _Args>
constexpr auto __inclusive_scan_over_group(_Args... __args) {
return sycl::inclusive_scan_over_group(__args...);
}
} // end namespace detail
/// Perform an exclusive scan over the values of inputs from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param inputs Pointer to the input data for the scan operation.
/// \param outputs Pointer to the location where scan results will be stored.
/// \param init initial value of the scan result.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan.
template <typename Item, typename T, class BinaryOperation,
int VALUES_PER_THREAD>
__dpct_inline__ void
exclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
T (&outputs)[VALUES_PER_THREAD], T init,
BinaryOperation binary_op) {
T result = inputs[0];
#pragma unroll
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
result = binary_op(result, inputs[i]);
}
T exclusive_result =
detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
T input = inputs[0];
if (item.get_local_linear_id() == 0) {
outputs[0] = init;
} else {
outputs[0] = exclusive_result;
}
#pragma unroll
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
T output = binary_op(input, outputs[i - 1]);
input = inputs[i];
outputs[i] = output;
}
}
/// Perform an exclusive scan over the values of input from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Input data for the scan operation.
/// \param init initial value of the scan result.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
/// in the work-items of the group. \returns exclusive scan of the first i
/// work-items where item is the i-th work item.
template <typename Item, typename T, class BinaryOperation>
__dpct_inline__ T
exclusive_scan(const Item &item, T input, T init, BinaryOperation binary_op,
T &group_aggregate) {
T output = detail::__exclusive_scan_over_group(item.get_group(), input, init,
binary_op);
if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
group_aggregate = binary_op(output, input);
}
group_aggregate = detail::__group_broadcast(
item.get_group(), group_aggregate, item.get_local_range().size() - 1);
return output;
}
/// Perform an exclusive scan over the values of input from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Input data for the scan operation.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param prefix_callback_op functor invoked by the first
/// work-item in the group that returns the
/// initial value in the resulting scan of the work-items in the group.
/// \returns exclusive scan of the input elements assigned to work-items in the
/// group.
template <typename Item, typename T, class BinaryOperation,
class GroupPrefixCallbackOperation>
__dpct_inline__ T
exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
GroupPrefixCallbackOperation &prefix_callback_op) {
T group_aggregate;
T output =
detail::__exclusive_scan_over_group(item.get_group(), input, binary_op);
if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
group_aggregate = binary_op(output, input);
}
group_aggregate = detail::__group_broadcast(
item.get_group(), group_aggregate, item.get_local_range().size() - 1);
T group_prefix = prefix_callback_op(group_aggregate);
if (item.get_local_linear_id() == 0) {
output = group_prefix;
} else {
output = binary_op(group_prefix, output);
}
return output;
}
namespace detail {
typedef uint16_t digit_counter_type;
typedef uint32_t packed_counter_type;
template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
};
template <int N, int COUNT> struct log2<N, 0, COUNT> {
enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
};
template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
public:
static size_t get_local_memory_size(size_t group_threads) {
return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
}
radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
template <typename Item, int VALUES_PER_THREAD>
__dpct_inline__ void
rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
digit_counter_type thread_prefixes[VALUES_PER_THREAD];
digit_counter_type *digit_counters[VALUES_PER_THREAD];
digit_counter_type *buffer =
reinterpret_cast<digit_counter_type *>(_local_memory);
reset_local_memory(item);
item.barrier(sycl::access::fence_space::local_space);
#pragma unroll
for (int i = 0; i < VALUES_PER_THREAD; ++i) {
uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
uint32_t counter_lane = digit & (COUNTER_LANES - 1);
if (DESCENDING) {
sub_counter = PACKING_RATIO - 1 - sub_counter;
counter_lane = COUNTER_LANES - 1 - counter_lane;
}
digit_counters[i] =
&buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
item.get_local_linear_id() * PACKING_RATIO + sub_counter];
thread_prefixes[i] = *digit_counters[i];
*digit_counters[i] = thread_prefixes[i] + 1;
}
item.barrier(sycl::access::fence_space::local_space);
scan_counters(item);
item.barrier(sycl::access::fence_space::local_space);
for (int i = 0; i < VALUES_PER_THREAD; ++i) {
ranks[i] = thread_prefixes[i] + *digit_counters[i];
}
}
private:
template <typename Item>
__dpct_inline__ void reset_local_memory(const Item &item) {
packed_counter_type *ptr =
reinterpret_cast<packed_counter_type *>(_local_memory);
#pragma unroll
for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
}
}
template <typename Item>
__dpct_inline__ packed_counter_type upsweep(const Item &item) {
packed_counter_type sum = 0;
packed_counter_type *ptr =
reinterpret_cast<packed_counter_type *>(_local_memory);
#pragma unroll
for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
cached_segment[i] =
ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
}
#pragma unroll
for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
sum += cached_segment[i];
}
return sum;
}
template <typename Item>
__dpct_inline__ void
exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
packed_counter_type *ptr =
reinterpret_cast<packed_counter_type *>(_local_memory);
packed_counter_type sum = raking_partial;
#pragma unroll
for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
packed_counter_type value = cached_segment[i];
cached_segment[i] = sum;
sum += value;
}
#pragma unroll
for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
cached_segment[i];
}
}
struct prefix_callback {
__dpct_inline__ packed_counter_type
operator()(packed_counter_type block_aggregate) {
packed_counter_type block_prefix = 0;
#pragma unroll
for (int packed = 1; packed < PACKING_RATIO; packed++) {
block_prefix += block_aggregate
<< (sizeof(digit_counter_type) * 8 * packed);
}
return block_prefix;
}
};
template <typename Item>
__dpct_inline__ void scan_counters(const Item &item) {
packed_counter_type raking_partial = upsweep(item);
prefix_callback callback;
packed_counter_type exclusive_partial = exclusive_scan(
item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
callback);
exclusive_downsweep(item, exclusive_partial);
}
private:
static constexpr int PACKING_RATIO =
sizeof(packed_counter_type) / sizeof(digit_counter_type);
static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
packed_counter_type cached_segment[PADDED_COUNTER_LANES];
uint8_t *_local_memory;
};
template <typename T, typename U> struct base_traits {
static __dpct_inline__ U twiddle_in(U key) {
throw std::runtime_error("Not implemented");
}
static __dpct_inline__ U twiddle_out(U key) {
throw std::runtime_error("Not implemented");
}
};
template <typename U> struct base_traits<uint32_t, U> {
static __dpct_inline__ U twiddle_in(U key) { return key; }
static __dpct_inline__ U twiddle_out(U key) { return key; }
};
template <typename U> struct base_traits<int, U> {
static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
};
template <typename U> struct base_traits<float, U> {
static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
static __dpct_inline__ U twiddle_in(U key) {
U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
return key ^ mask;
}
static __dpct_inline__ U twiddle_out(U key) {
U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
return key ^ mask;
}
};
template <typename T> struct traits : base_traits<T, T> {};
template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
template <> struct traits<int> : base_traits<int, uint32_t> {};
template <> struct traits<float> : base_traits<float, uint32_t> {};
} // namespace detail
namespace detail {
template <int N> struct power_of_two {
enum { VALUE = ((N & (N - 1)) == 0) };
};
__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
return (x >> shift) + addend;
}
} // namespace detail
/// Implements scatter to blocked exchange pattern used in radix sort algorithm.
///
/// \tparam T type of the data elements exchanges
/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
template <typename T, int VALUES_PER_THREAD> class exchange {
public:
static size_t get_local_memory_size(size_t group_threads) {
size_t padding_values =
(INSERT_PADDING)
? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
: 0;
return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
}
exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
/// Rearrange elements from rank order to blocked order
template <typename Item>
__dpct_inline__ void
scatter_to_blocked(Item item, T (&keys)[VALUES_PER_THREAD],
int (&ranks)[VALUES_PER_THREAD]) {
T *buffer = reinterpret_cast<T *>(_local_memory);
#pragma unroll
for (int i = 0; i < VALUES_PER_THREAD; i++) {
int offset = ranks[i];
if (INSERT_PADDING)
offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
buffer[offset] = keys[i];
}
item.barrier(sycl::access::fence_space::local_space);
#pragma unroll
for (int i = 0; i < VALUES_PER_THREAD; i++) {
int offset = (item.get_local_id(0) * VALUES_PER_THREAD) + i;
if (INSERT_PADDING)
offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
keys[i] = buffer[offset];
}
}
private:
static constexpr int LOG_LOCAL_MEMORY_BANKS = 5;
static constexpr bool INSERT_PADDING =
(VALUES_PER_THREAD > 4) &&
(detail::power_of_two<VALUES_PER_THREAD>::VALUE);
uint8_t *_local_memory;
};
/// Implements radix sort to sort integer data elements assigned to all threads
/// in the group.
///
/// \tparam T type of the data elements exchanges
/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
/// \tparam DECENDING boolean value indicating if data elements are sorted in
/// decending order.
template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
class radix_sort {
public:
static size_t get_local_memory_size(size_t group_threads) {
size_t ranks_size =
detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
size_t exchange_size =
exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
return sycl::max(ranks_size, exchange_size);
}
radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
template <typename Item>
__dpct_inline__ void
sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
int end_bit = 8 * sizeof(T)) {
uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);
#pragma unroll
for (int i = 0; i < VALUES_PER_THREAD; ++i) {
unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
}
while (true) {
int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
int ranks[VALUES_PER_THREAD];
detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
.template rank_keys(item, unsigned_keys, ranks, begin_bit, pass_bits);
begin_bit += RADIX_BITS;
item.barrier(sycl::access::fence_space::local_space);
exchange<T, VALUES_PER_THREAD>(_local_memory)
.scatter_to_blocked(item, keys, ranks);
item.barrier(sycl::access::fence_space::local_space);
if (begin_bit >= end_bit)
break;
}
#pragma unroll
for (int i = 0; i < VALUES_PER_THREAD; ++i) {
unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
}
}
private:
static constexpr int RADIX_BITS = 4;
uint8_t *_local_memory;
};
/// Perform a reduction of the data elements assigned to all threads in the
/// group.
///
/// \param item A work-item in a group.
/// \param inputs Pointer to the input data for the reduce operation.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \returns value of the reduction using binary_op
template <typename Item, typename T, class BinaryOperation,
int VALUES_PER_THREAD>
__dpct_inline__ T
reduce(Item item, T (&inputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
T result = inputs[0];
#pragma unroll
for (int i = 1; i < VALUES_PER_THREAD; i++) {
result = binary_op(result, inputs[i]);
}
return detail::__reduce_over_group(item.get_group(), result, binary_op);
}
/// Perform a reduction on a limited number of the work items in a subgroup
///
/// \param item A work-item in a group.
/// \param value value per work item which is to be reduced
/// \param items_to_reduce num work items at the start of the subgroup to reduce
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \returns value of the reduction using binary_op
template <typename Item, typename T, class BinaryOperation>
__dpct_inline__
typename ::std::enable_if_t<sycl::has_known_identity_v<BinaryOperation, T>, T>
reduce_over_partial_group(const Item &item, const T &value,
const ::std::uint16_t &items_to_reduce,
BinaryOperation binary_op) {
T value_temp = (item.get_local_linear_id() < items_to_reduce)
? value
: sycl::known_identity_v<BinaryOperation, T>;
return detail::__reduce_over_group(item.get_sub_group(), value_temp,
binary_op);
}
/// Perform an inclusive scan over the values of inputs from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param inputs Pointer to the input data for the scan operation.
/// \param outputs Pointer to the location where scan results will be stored.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \returns inclusive scan of the input elements assigned to
/// work-items in the group.
template <typename Item, typename T, class BinaryOperation,
int VALUES_PER_THREAD>
__dpct_inline__ void
inclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
T (&outputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
T result = inputs[0];
#pragma unroll
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
result = binary_op(result, inputs[i]);
}
T exclusive_result =
detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
if (item.get_local_linear_id() == 0) {
outputs[0] = inputs[0];
} else {
outputs[0] = binary_op(inputs[0], exclusive_result);
}
#pragma unroll
for (int i = 1; i < VALUES_PER_THREAD; ++i) {
outputs[i] = binary_op(inputs[i], outputs[i - 1]);
}
}
/// Perform an inclusive scan over the values of inputs from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Pointer to the input data for the scan operation.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
/// in the work-items of the group. \returns inclusive scan of the input
/// elements assigned to work-items in the group.
template <typename Item, typename T, class BinaryOperation>
__dpct_inline__ T inclusive_scan(const Item &item, T input,
BinaryOperation binary_op,
T &group_aggregate) {
T output =
detail::__inclusive_scan_over_group(item.get_group(), input, binary_op);
if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
group_aggregate = output;
}
group_aggregate = detail::__group_broadcast(
item.get_group(), group_aggregate, item.get_local_range().size() - 1);
return output;
}
/// Perform an inclusive scan over the values of input from all work-items in
/// the group using the operator binary_op, which must be one of the SYCL 2020
/// group algorithms library function objects.
///
/// \param item A work-item in a group.
/// \param input Input data for the scan operation.
/// \param binary_op functor that implements the binary operation used to
/// perform the scan. \param prefix_callback_op functor invoked by the first
/// work-item in the group that returns the
/// initial value in the resulting scan of the work-items in the group.
/// \returns inclusive scan of the input elements assigned to work-items in the
/// group.
template <typename Item, typename T, class BinaryOperation,
class GroupPrefixCallbackOperation>
__dpct_inline__ T
inclusive_scan(const Item &item, T input, BinaryOperation binary_op,
GroupPrefixCallbackOperation &prefix_callback_op) {
T group_aggregate;
T output = inclusive_scan(item, input, binary_op, group_aggregate);
T group_prefix = prefix_callback_op(group_aggregate);
return binary_op(group_prefix, output);
}
} // namespace group
namespace device {
namespace detail {
template <typename... _Args> constexpr auto __joint_reduce(_Args... __args) {
return sycl::joint_reduce(__args...);
}
} // namespace detail
/// Perform a reduce on each of the segments specified within data stored on
/// the device.
///
/// \param queue Command queue used to access device used for reduction
/// \param inputs Pointer to the data elements on the device to be reduced
/// \param outputs Pointer to the storage where the reduced value for each
/// segment will be stored \param segment_count number of segments to be reduced
/// \param begin_offsets Pointer to the set of indices that are the first
/// element in each segment \param end_offsets Pointer to the set of indices
/// that are one past the last element in each segment \param binary_op functor
/// that implements the binary operation used to perform the scan. \param init
/// initial value of the reduction for each segment.
template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
size_t segment_count, OffsetT *begin_offsets,
OffsetT *end_offsets, BinaryOperation binary_op, T init) {
sycl::range<1> global_size(segment_count * GROUP_SIZE);
sycl::range<1> local_size(GROUP_SIZE);
queue.submit([&](sycl::handler &cgh) {
cgh.parallel_for(
sycl::nd_range<1>(global_size, local_size), [=](sycl::nd_item<1> item) {
OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
OffsetT segment_end = end_offsets[item.get_group_linear_id()];
if (segment_begin == segment_end) {
if (item.get_local_linear_id() == 0) {
outputs[item.get_group_linear_id()] = init;
}
return;
}
sycl::multi_ptr<T, sycl::access::address_space::global_space>
input_ptr = inputs;
T group_aggregate = detail::__joint_reduce(
item.get_group(), input_ptr + segment_begin,
input_ptr + segment_end, init, binary_op);
if (item.get_local_linear_id() == 0) {
outputs[item.get_group_linear_id()] = group_aggregate;
}
});
});
}
#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
namespace experimental {
namespace detail {
template <typename _Tp, typename... _Ts> struct __is_any {
constexpr static bool value = std::disjunction_v<
std::is_same<std::remove_cv_t<_Tp>, std::remove_cv_t<_Ts>>...>;
};
template <typename _Tp, typename _Bp> struct __in_native_op_list {
constexpr static bool value =
__is_any<_Bp, sycl::plus<_Tp>, sycl::bit_or<_Tp>, sycl::bit_xor<_Tp>,
sycl::bit_and<_Tp>, sycl::maximum<_Tp>, sycl::minimum<_Tp>,
sycl::multiplies<_Tp>>::value;
};
template <typename _Tp, typename _Bp> struct __is_native_op {
constexpr static bool value = __in_native_op_list<_Tp, _Bp>::value ||
__in_native_op_list<void, _Bp>::value;
};
} // namespace detail
/// Perform a reduce on each of the segments specified within data stored on
/// the device. Compared with dpct::device::segmented_reduce, this experimental
/// feature support user define reductions.
///
/// \param queue Command queue used to access device used for reduction
/// \param inputs Pointer to the data elements on the device to be reduced
/// \param outputs Pointer to the storage where the reduced value for each
/// segment will be stored \param segment_count number of segments to be reduced
/// \param begin_offsets Pointer to the set of indices that are the first
/// element in each segment \param end_offsets Pointer to the set of indices
/// that are one past the last element in each segment \param binary_op functor
/// that implements the binary operation used to perform the scan. \param init
/// initial value of the reduction for each segment.
template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
size_t segment_count, OffsetT *begin_offsets,
OffsetT *end_offsets, BinaryOperation binary_op, T init) {
sycl::range<1> global_size(segment_count * GROUP_SIZE);
sycl::range<1> local_size(GROUP_SIZE);
if constexpr (!detail::__is_native_op<T, BinaryOperation>::value) {
queue.submit([&](sycl::handler &cgh) {
size_t temp_memory_size = GROUP_SIZE * sizeof(T);
auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, cgh);
cgh.parallel_for(
sycl::nd_range<1>(global_size, local_size),
[=](sycl::nd_item<1> item) {
OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
OffsetT segment_end = end_offsets[item.get_group_linear_id()];
if (segment_begin == segment_end) {
if (item.get_local_linear_id() == 0) {
outputs[item.get_group_linear_id()] = init;
}
return;
}
// Create a handle that associates the group with an allocation it
// can use
auto handle =
sycl::ext::oneapi::experimental::group_with_scratchpad(
item.get_group(),
sycl::span(&scratch[0], temp_memory_size));
T group_aggregate = sycl::ext::oneapi::experimental::joint_reduce(
handle, inputs + segment_begin, inputs + segment_end, init,
binary_op);
if (item.get_local_linear_id() == 0) {
outputs[item.get_group_linear_id()] = group_aggregate;
}
});
});
} else {
dpct::device::segmented_reduce<GROUP_SIZE>(queue, inputs, outputs,
segment_count, begin_offsets,
end_offsets, binary_op, init);
}
}
} // namespace experimental
#endif // SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
} // namespace device
} // namespace dpct
#endif

View file

@ -1,453 +0,0 @@
//==---- functional.h -----------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_FUNCTIONAL_H__
#define __DPCT_FUNCTIONAL_H__
#include <functional>
#include <oneapi/dpl/functional>
#include <oneapi/dpl/iterator>
#if ONEDPL_USE_DPCPP_BACKEND
#include <oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h>
#endif
#include <tuple>
#include <utility>
#include "../dpct.hpp"
#define _DPCT_GCC_VERSION \
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
// Portability "#pragma" definition
#ifdef _MSC_VER
#define _DPCT_PRAGMA(x) __pragma(x)
#else
#define _DPCT_PRAGMA(x) _Pragma(#x)
#endif
// Enable loop unrolling pragmas where supported
#if (__INTEL_COMPILER || \
(!defined(__INTEL_COMPILER) && _DPCT_GCC_VERSION >= 80000))
#define _DPCT_PRAGMA_UNROLL _DPCT_PRAGMA(unroll)
#else // no pragma unroll
#define _DPCT_PRAGMA_UNROLL
#endif
namespace dpct {
struct null_type {};
// Function object to wrap user defined functors to provide compile time "const"
// workaround for user function objects.
// The SYCL spec (4.12) states that writing to a function object during a SYCL
// kernel is undefined behavior. This wrapper is provided as a compile-time
// work around, but functors used in SYCL kernels must be `const` in practice.
template <typename _Op> struct mark_functor_const {
mutable _Op op;
mark_functor_const() : op() {}
mark_functor_const(const _Op &__op) : op(__op) {}
mark_functor_const(_Op &&__op) : op(::std::move(__op)) {}
template <typename... _T> auto operator()(_T &&...x) const {
return op(std::forward<_T>(x)...);
}
};
namespace internal {
template <class _ExecPolicy, class _T>
using enable_if_execution_policy =
typename std::enable_if<oneapi::dpl::execution::is_execution_policy<
typename std::decay<_ExecPolicy>::type>::value,
_T>::type;
template <typename _T> struct is_hetero_execution_policy : ::std::false_type {};
template <typename... PolicyParams>
struct is_hetero_execution_policy<
oneapi::dpl::execution::device_policy<PolicyParams...>> : ::std::true_type {
};
template <typename _T> struct is_fpga_execution_policy : ::std::false_type {};
#if _ONEDPL_FPGA_DEVICE
template <unsigned int unroll_factor, typename... PolicyParams>
struct is_hetero_execution_policy<
execution::fpga_policy<unroll_factor, PolicyParams...>> : ::std::true_type {
};
#endif
template <class _ExecPolicy, class _T>
using enable_if_hetero_execution_policy = typename std::enable_if<
is_hetero_execution_policy<typename std::decay<_ExecPolicy>::type>::value,
_T>::type;
#if _ONEDPL_CPP14_INTEGER_SEQUENCE_PRESENT
template <std::size_t... _Sp>
using index_sequence = std::index_sequence<_Sp...>;
template <std::size_t _Np>
using make_index_sequence = std::make_index_sequence<_Np>;
#else
template <std::size_t... _Sp> class index_sequence {};
template <std::size_t _Np, std::size_t... _Sp>
struct make_index_sequence_impl
: make_index_sequence_impl<_Np - 1, _Np - 1, _Sp...> {};
template <std::size_t... _Sp> struct make_index_sequence_impl<0, _Sp...> {
using type = index_sequence<_Sp...>;
};
template <std::size_t _Np>
using make_index_sequence = typename make_index_sequence_impl<_Np>::type;
#endif
// Minimal buffer implementations for temporary storage in mapping rules
// Some of our algorithms need to start with raw memory buffer,
// not an initialized array, because initialization/destruction
// would make the span be at least O(N).
#if ONEDPL_USE_DPCPP_BACKEND
template <typename _Tp> class __buffer {
sycl::buffer<_Tp, 1> __buf;
__buffer(const __buffer &) = delete;
void operator=(const __buffer &) = delete;
public:
// Try to obtain buffer of given size to store objects of _Tp type
__buffer(std::size_t __n) : __buf(sycl::range<1>(__n)) {}
// Return pointer to buffer, or NULL if buffer could not be obtained.
auto get() -> decltype(oneapi::dpl::begin(__buf)) const {
return oneapi::dpl::begin(__buf);
}
};
#else
template <typename _Tp> class __buffer {
std::unique_ptr<_Tp> _M_ptr;
__buffer(const __buffer &) = delete;
void operator=(const __buffer &) = delete;
public:
// Try to obtain buffer of given size to store objects of _Tp type
__buffer(const std::size_t __n) : _M_ptr(new _Tp[__n]) {}
// Return pointer to buffer, or NULL if buffer could not be obtained.
_Tp *get() const { return _M_ptr.get(); }
};
#endif
// Implements C++14 std::less<void> specialization to allow parameter type
// deduction.
class __less {
public:
template <typename _Xp, typename _Yp>
bool operator()(_Xp &&__x, _Yp &&__y) const {
return std::forward<_Xp>(__x) < std::forward<_Yp>(__y);
}
};
template <typename Policy, typename NewName> struct rebind_policy {
using type = Policy;
};
template <typename KernelName, typename NewName>
struct rebind_policy<oneapi::dpl::execution::device_policy<KernelName>,
NewName> {
using type = oneapi::dpl::execution::device_policy<NewName>;
};
#if _ONEDPL_FPGA_DEVICE
template <unsigned int factor, typename KernelName, typename NewName>
struct rebind_policy<oneapi::dpl::execution::fpga_policy<factor, KernelName>,
NewName> {
using type = oneapi::dpl::execution::fpga_policy<factor, NewName>;
};
#endif
template <typename T1, typename T2,
typename R1 = typename std::iterator_traits<T1>::reference,
typename R2 = typename std::iterator_traits<T2>::reference>
struct perm_fun {
typedef R2 result_of;
perm_fun(T1 input) : source(input) {}
R2 operator()(R1 x) const { return *(source + x); }
private:
T1 source;
};
// Functor compares first element (key) from tied sequence.
template <typename Compare = class internal::__less> struct compare_key_fun {
typedef bool result_of;
compare_key_fun(Compare _comp = internal::__less()) : comp(_comp) {}
template <typename _T1, typename _T2>
result_of operator()(_T1 &&a, _T2 &&b) const {
using std::get;
return comp(get<0>(a), get<0>(b));
}
private:
mutable Compare comp;
};
// Functor evaluates second element of tied sequence with predicate.
// Used by: copy_if, remove_copy_if, stable_partition_copy
// Lambda:
template <typename Predicate> struct predicate_key_fun {
typedef bool result_of;
predicate_key_fun(Predicate _pred) : pred(_pred) {}
template <typename _T1> result_of operator()(_T1 &&a) const {
using std::get;
return pred(get<1>(a));
}
private:
mutable Predicate pred;
};
// Used by: remove_if
template <typename Predicate> struct negate_predicate_key_fun {
typedef bool result_of;
negate_predicate_key_fun(Predicate _pred) : pred(_pred) {}
template <typename _T1> result_of operator()(_T1 &&a) const {
using std::get;
return !pred(get<1>(a));
}
private:
mutable Predicate pred;
};
template <typename T> struct sequence_fun {
using result_type = T;
sequence_fun(T _init, T _step) : init(_init), step(_step) {}
template <typename _T> result_type operator()(_T &&i) const {
return static_cast<T>(init + step * i);
}
private:
const T init;
const T step;
};
//[binary_pred](Ref a, Ref b){ return(binary_pred(get<0>(a),get<0>(b)));
template <typename Predicate> struct unique_fun {
typedef bool result_of;
unique_fun(Predicate _pred) : pred(_pred) {}
template <typename _T> result_of operator()(_T &&a, _T &&b) const {
using std::get;
return pred(get<0>(a), get<0>(b));
}
private:
mutable Predicate pred;
};
// Lambda: [pred, &new_value](Ref1 a, Ref2 s) {return pred(s) ? new_value : a;
// });
template <typename T, typename Predicate> struct replace_if_fun {
public:
typedef T result_of;
replace_if_fun(Predicate _pred, T _new_value)
: pred(_pred), new_value(_new_value) {}
template <typename _T1, typename _T2> T operator()(_T1 &&a, _T2 &&s) const {
return pred(s) ? new_value : a;
}
private:
mutable Predicate pred;
const T new_value;
};
//[pred,op](Ref a){return pred(a) ? op(a) : a; }
template <typename T, typename Predicate, typename Operator>
struct transform_if_fun {
transform_if_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
template <typename _T>
void operator()(_T&& t) const {
using std::get;
if (pred(get<0>(t)))
get<1>(t) = op(get<0>(t));
}
private:
mutable Predicate pred;
mutable Operator op;
};
//[pred, op](Ref1 a, Ref2 s) { return pred(s) ? op(a) : a; });
template <typename T, typename Predicate, typename Operator>
struct transform_if_unary_zip_mask_fun {
transform_if_unary_zip_mask_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
template <typename _T>
void operator()(_T&& t) const {
using std::get;
if (pred(get<1>(t)))
get<2>(t) = op(get<0>(t));
}
private:
mutable Predicate pred;
mutable Operator op;
};
template <typename T, typename Predicate, typename BinaryOperation>
class transform_if_zip_mask_fun {
public:
transform_if_zip_mask_fun(Predicate _pred = oneapi::dpl::identity(),
BinaryOperation _op = oneapi::dpl::identity())
: pred(_pred), op(_op) {}
template <typename _T> void operator()(_T &&t) const {
using std::get;
if (pred(get<2>(t)))
get<3>(t) = op(get<0>(t), get<1>(t));
}
private:
mutable Predicate pred;
mutable BinaryOperation op;
};
// This following code is similar to a section of code in
// oneDPL/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
// It has a similar approach, and could be consolidated.
// Outside of some differences in approach, there are two significant
// differences in function.
//
// 1) This code allows the output type of the bit range translation to be fit
// into to the minimal type required to provide that many bits. The code in
// oneDPL to calculate the bucket for the radix is similar but its output is
// always std::uint32_t. The assumption that the bit range desired will fit in
// 32 bits is not true for this code.
//
// 2) This code ensures that for floating point type, -0.0f and 0.0f map to the
// same value. This allows the output of this translation to be used to provide
// a sort which ensures the stability of these values for floating point types.
template <int N> struct uint_byte_map {};
template <> struct uint_byte_map<1> { using type = uint8_t; };
template <> struct uint_byte_map<2> { using type = uint16_t; };
template <> struct uint_byte_map<4> { using type = uint32_t; };
template <> struct uint_byte_map<8> { using type = uint64_t; };
template <typename T> struct uint_map {
using type = typename uint_byte_map<sizeof(T)>::type;
};
template <typename T, typename OutKeyT> class translate_key {
using uint_type_t = typename uint_map<T>::type;
public:
translate_key(int begin_bit, int end_bit) {
shift = begin_bit;
mask = ~OutKeyT(0); // all ones
mask = mask >> (sizeof(OutKeyT) * 8 -
(end_bit - begin_bit)); // setup appropriate mask
flip_sign = uint_type_t(1) << (sizeof(uint_type_t) * 8 - 1); // sign bit
flip_key = ~uint_type_t(0); // 0xF...F
}
inline OutKeyT operator()(const T &key) const {
uint_type_t intermediate;
if constexpr (std::is_floating_point<T>::value) {
// normal case (both -0.0f and 0.0f equal -0.0f)
if (key != T(-0.0f)) {
uint_type_t is_negative = reinterpret_cast<const uint_type_t &>(key) >>
(sizeof(uint_type_t) * 8 - 1);
intermediate = reinterpret_cast<const uint_type_t &>(key) ^
((is_negative * flip_key) | flip_sign);
} else // special case for -0.0f to keep stability with 0.0f
{
T negzero = T(-0.0f);
intermediate = reinterpret_cast<const uint_type_t &>(negzero);
}
} else if constexpr (std::is_signed<T>::value) {
intermediate = reinterpret_cast<const uint_type_t &>(key) ^ flip_sign;
} else {
intermediate = key;
}
return static_cast<OutKeyT>(intermediate >> shift) &
mask; // shift, cast, and mask
}
private:
uint8_t shift;
OutKeyT mask;
uint_type_t flip_sign;
uint_type_t flip_key;
};
// Unary operator that returns reference to its argument. Ported from
// oneDPL: oneapi/dpl/pstl/utils.h
struct no_op_fun {
template <typename Tp> Tp &&operator()(Tp &&a) const {
return ::std::forward<Tp>(a);
}
};
// Unary functor which composes a pair of functors by calling them in succession
// on an input
template <typename FunctorInner, typename FunctorOuter>
struct __composition_functor {
__composition_functor(FunctorInner in, FunctorOuter out)
: _in(in), _out(out) {}
template <typename T> T operator()(const T &i) const {
return _out(_in(i));
}
FunctorInner _in;
FunctorOuter _out;
};
// Unary functor which maps an index of a ROI into a 2D flattened array
template <typename OffsetT> struct __roi_2d_index_functor {
__roi_2d_index_functor(const OffsetT &num_cols,
const ::std::size_t &row_stride)
: _num_cols(num_cols), _row_stride(row_stride) {}
template <typename Index> Index operator()(const Index &i) const {
return _row_stride * (i / _num_cols) + (i % _num_cols);
}
OffsetT _num_cols;
::std::size_t _row_stride;
};
// Unary functor which maps and index into an interleaved array by its active
// channel
template <typename OffsetT> struct __interleaved_index_functor {
__interleaved_index_functor(const OffsetT &total_channels,
const OffsetT &active_channel)
: _total_channels(total_channels), _active_channel(active_channel) {}
template <typename Index> Index operator()(const Index &i) const {
return i * _total_channels + _active_channel;
}
OffsetT _total_channels;
OffsetT _active_channel;
};
} // end namespace internal
} // end namespace dpct
#endif

View file

@ -1,347 +0,0 @@
//==---- iterators.h ------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_ITERATORS_H__
#define __DPCT_ITERATORS_H__
#include <oneapi/dpl/iterator>
#include "functional.h"
namespace dpct {
namespace internal {
// Wrapper class returned from a dereferenced transform_iterator which was
// created using
// make_transform_output_iterator(). Used to apply the supplied transform
// function when writing into an object of this class.
//
// Example:
// int a[] = {0, 1, 2, 3, 4};
// int* p = a;
// auto f = [](auto v) {return v*v;};
// auto tr_out = dpct::make_transform_output_iterator(p+1, f);
// auto wrap = *tr_out; // wrap is a transform_output_ref_wrapper
// std::cout<<*(p+1)<<std::endl; // '1'
// wrap = 2; // apply function, store 2*2=4
// std::cout<<*(p+1)<<std::endl; // '4'
template <typename T, typename _UnaryFunc> class transform_output_ref_wrapper {
private:
T __my_reference_;
_UnaryFunc __my_unary_func_;
public:
template <typename U>
transform_output_ref_wrapper(U &&__reference, _UnaryFunc __unary_func)
: __my_reference_(std::forward<U>(__reference)),
__my_unary_func_(__unary_func) {}
// When writing to an object of this type, apply the supplied unary function,
// then write to the wrapped reference
template <typename UnaryInputType>
transform_output_ref_wrapper &operator=(const UnaryInputType &e) {
__my_reference_ = __my_unary_func_(e);
return *this;
}
};
// Unary functor to create a transform_output_reference_wrapper when a
// transform_iterator is dereferenced, so that a
// the supplied unary function may be applied on write, resulting in a
// transform_output_iterator
template <typename _UnaryFunc> struct _Unary_Out {
_Unary_Out(_UnaryFunc __f_) : __f(__f_) {}
_UnaryFunc __f;
template <typename T> auto operator()(T &&val) const {
return transform_output_ref_wrapper<T, _UnaryFunc>(std::forward<T>(val),
__f);
}
};
} // end namespace internal
using std::advance;
using std::distance;
template <typename T>
oneapi::dpl::counting_iterator<T> make_counting_iterator(const T &input) {
return oneapi::dpl::counting_iterator<T>(input);
}
template <typename _Tp> class constant_iterator {
public:
typedef std::false_type is_hetero;
typedef std::true_type is_passed_directly;
typedef std::ptrdiff_t difference_type;
typedef _Tp value_type;
typedef _Tp *pointer;
// There is no storage behind the iterator, so we return a value instead of
// reference.
typedef const _Tp reference;
typedef const _Tp const_reference;
typedef std::random_access_iterator_tag iterator_category;
explicit constant_iterator(_Tp __init)
: __my_value_(__init), __my_counter_(0) {}
private:
// used to construct iterator instances with different counter values required
// by arithmetic operators
constant_iterator(const _Tp &__value, const difference_type &__offset)
: __my_value_(__value), __my_counter_(__offset) {}
public:
// non-const variants of access operators are not provided so unintended
// writes are caught at compile time.
const_reference operator*() const { return __my_value_; }
const_reference operator[](difference_type) const { return __my_value_; }
difference_type operator-(const constant_iterator &__it) const {
return __my_counter_ - __it.__my_counter_;
}
constant_iterator &operator+=(difference_type __forward) {
__my_counter_ += __forward;
return *this;
}
constant_iterator &operator-=(difference_type __backward) {
return *this += -__backward;
}
constant_iterator &operator++() { return *this += 1; }
constant_iterator &operator--() { return *this -= 1; }
constant_iterator operator++(int) {
constant_iterator __it(*this);
++(*this);
return __it;
}
constant_iterator operator--(int) {
constant_iterator __it(*this);
--(*this);
return __it;
}
constant_iterator operator-(difference_type __backward) const {
return constant_iterator(__my_value_, __my_counter_ - __backward);
}
constant_iterator operator+(difference_type __forward) const {
return constant_iterator(__my_value_, __my_counter_ + __forward);
}
friend constant_iterator operator+(difference_type __forward,
const constant_iterator __it) {
return __it + __forward;
}
bool operator==(const constant_iterator &__it) const {
return __my_value_ == __it.__my_value_ &&
this->__my_counter_ == __it.__my_counter_;
}
bool operator!=(const constant_iterator &__it) const {
return !(*this == __it);
}
bool operator<(const constant_iterator &__it) const {
return *this - __it < 0;
}
bool operator>(const constant_iterator &__it) const { return __it < *this; }
bool operator<=(const constant_iterator &__it) const {
return !(*this > __it);
}
bool operator>=(const constant_iterator &__it) const {
return !(*this < __it);
}
private:
_Tp __my_value_;
uint64_t __my_counter_;
};
template <typename _Tp>
constant_iterator<_Tp> make_constant_iterator(_Tp __value) {
return constant_iterator<_Tp>(__value);
}
// key_value_pair class to represent a key and value, specifically a
// dereferenced arg_index_input_iterator
template <typename _KeyTp, typename _ValueTp> class key_value_pair {
public:
key_value_pair() = default;
key_value_pair(const _KeyTp &_key, const _ValueTp &_value)
: key(_key), value(_value) {}
bool operator==(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
return (key == _kvp.key) && (value == _kvp.value);
}
bool operator!=(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
return (key != _kvp.key) || (value != _kvp.value);
}
_KeyTp key;
_ValueTp value;
};
namespace detail {
template <typename KeyTp, typename _ValueTp> struct make_key_value_pair {
template <typename ValRefTp>
key_value_pair<KeyTp, _ValueTp>
operator()(const oneapi::dpl::__internal::tuple<KeyTp, ValRefTp> &tup) const {
return ::dpct::key_value_pair<KeyTp, _ValueTp>(::std::get<0>(tup),
::std::get<1>(tup));
}
};
template <class T> struct __zip_iterator_impl;
template <class... Ts> struct __zip_iterator_impl<std::tuple<Ts...>> {
using type = oneapi::dpl::zip_iterator<Ts...>;
};
} // end namespace detail
// dpct::zip_iterator can only accept std::tuple type as template argument for
// compatibility purpose. Please use oneapi::dpl::zip_iterator if you want to
// pass iterator's types directly.
template <typename... Ts>
using zip_iterator = typename detail::__zip_iterator_impl<Ts...>::type;
// arg_index_input_iterator is an iterator over a input iterator, with a index.
// When dereferenced, it returns a key_value_pair, which can be interrogated for
// the index key or the value from the input iterator
template <typename InputIteratorT, typename OffsetT = ptrdiff_t,
typename OutputValueT =
typename ::std::iterator_traits<InputIteratorT>::value_type>
class arg_index_input_iterator
: public oneapi::dpl::transform_iterator<
oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
InputIteratorT>,
detail::make_key_value_pair<OffsetT, OutputValueT>> {
using arg_index_input_iterator_wrap = oneapi::dpl::transform_iterator<
oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
InputIteratorT>,
detail::make_key_value_pair<OffsetT, OutputValueT>>;
public:
typedef OffsetT difference_type;
// signal to __get_sycl_range that this iterator is as a direct pass iterator
using is_zip = ::std::true_type;
arg_index_input_iterator(const arg_index_input_iterator_wrap &__arg_wrap)
: arg_index_input_iterator_wrap(__arg_wrap) {}
arg_index_input_iterator(InputIteratorT __iter)
: arg_index_input_iterator_wrap(
oneapi::dpl::make_zip_iterator(
oneapi::dpl::counting_iterator(OffsetT(0)), __iter),
detail::make_key_value_pair<OffsetT, OutputValueT>()) {}
arg_index_input_iterator &operator=(const arg_index_input_iterator &__input) {
arg_index_input_iterator_wrap::operator=(__input);
return *this;
}
arg_index_input_iterator &operator++() {
arg_index_input_iterator_wrap::operator++();
return *this;
}
arg_index_input_iterator &operator--() {
arg_index_input_iterator_wrap::operator--();
return *this;
}
arg_index_input_iterator operator++(int) {
arg_index_input_iterator __it(*this);
++(*this);
return __it;
}
arg_index_input_iterator operator--(int) {
arg_index_input_iterator __it(*this);
--(*this);
return __it;
}
arg_index_input_iterator operator+(difference_type __forward) const {
return arg_index_input_iterator(
arg_index_input_iterator_wrap::operator+(__forward));
}
arg_index_input_iterator operator-(difference_type __backward) const {
return arg_index_input_iterator(
arg_index_input_iterator_wrap::operator-(__backward));
}
arg_index_input_iterator &operator+=(difference_type __forward) {
arg_index_input_iterator_wrap::operator+=(__forward);
return *this;
}
arg_index_input_iterator &operator-=(difference_type __backward) {
arg_index_input_iterator_wrap::operator-=(__backward);
return *this;
}
friend arg_index_input_iterator
operator+(difference_type __forward, const arg_index_input_iterator &__it) {
return __it + __forward;
}
difference_type operator-(const arg_index_input_iterator &__it) const {
return arg_index_input_iterator_wrap::operator-(__it);
}
bool operator==(const arg_index_input_iterator &__it) const {
return arg_index_input_iterator_wrap::operator==(__it);
}
bool operator!=(const arg_index_input_iterator &__it) const {
return !(*this == __it);
}
bool operator<(const arg_index_input_iterator &__it) const {
return *this - __it < 0;
}
bool operator>(const arg_index_input_iterator &__it) const {
return __it < *this;
}
bool operator<=(const arg_index_input_iterator &__it) const {
return !(*this > __it);
}
bool operator>=(const arg_index_input_iterator &__it) const {
return !(*this < __it);
}
// returns an arg_index_input_iterator with the same iter position, but a
// count reset to 0
arg_index_input_iterator create_normalized() {
return arg_index_input_iterator(
::std::get<1>(arg_index_input_iterator_wrap::base().base()));
}
};
template <typename IterT> struct io_iterator_pair {
inline io_iterator_pair() : selector(false) {}
inline io_iterator_pair(const IterT &first, const IterT &second)
: selector(false) {
iter[0] = first;
iter[1] = second;
}
inline IterT first() const { return selector ? iter[1] : iter[0]; }
inline IterT second() const { return selector ? iter[0] : iter[1]; }
inline void swap() { selector = !selector; }
bool selector;
IterT iter[2];
};
template <typename _Iter, typename _UnaryFunc>
auto make_transform_output_iterator(_Iter __it, _UnaryFunc __unary_func) {
return oneapi::dpl::transform_iterator(
__it, internal::_Unary_Out<_UnaryFunc>(__unary_func));
}
} // end namespace dpct
#endif

File diff suppressed because it is too large Load diff

View file

@ -1,32 +0,0 @@
//==---- numeric.h --------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_NUMERIC_H__
#define __DPCT_NUMERIC_H__
namespace dpct {
template <typename Policy, typename InputIt1, typename InputIt2, typename T>
T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
InputIt2 first2, T init) {
return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
first2, init);
}
template <typename Policy, typename InputIt1, typename InputIt2, typename T,
typename BinaryOperation1, typename BinaryOperation2>
T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
InputIt2 first2, T init, BinaryOperation1 op1,
BinaryOperation2 op2) {
return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
first2, init, op1, op2);
}
} // end namespace dpct
#endif

View file

@ -1,752 +0,0 @@
//==---- vector.h ---------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_VECTOR_H__
#define __DPCT_VECTOR_H__
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/execution>
#include <sycl/sycl.hpp>
#include "memory.h"
#include <algorithm>
#include <iterator>
#include <vector>
#include "../device.hpp"
namespace dpct {
namespace internal {
template <typename Iter, typename Void = void> // for non-iterators
struct is_iterator : std::false_type {};
template <typename Iter> // For iterators
struct is_iterator<
Iter,
typename std::enable_if<
!std::is_void<typename Iter::iterator_category>::value, void>::type>
: std::true_type {};
template <typename T> // For pointers
struct is_iterator<T *> : std::true_type {};
} // end namespace internal
#ifndef DPCT_USM_LEVEL_NONE
template <typename T,
typename Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>>
class device_vector {
public:
using iterator = device_iterator<T>;
using const_iterator = const iterator;
using reference = device_reference<T>;
using const_reference = const reference;
using value_type = T;
using pointer = T *;
using const_pointer = const T *;
using difference_type =
typename ::std::iterator_traits<iterator>::difference_type;
using size_type = ::std::size_t;
private:
Allocator _alloc;
size_type _size;
size_type _capacity;
pointer _storage;
size_type _min_capacity() const { return size_type(1); }
void _set_capacity_and_alloc() {
_capacity = ::std::max(_size * 2, _min_capacity());
_storage = _alloc.allocate(_capacity);
}
public:
template <typename OtherA> operator ::std::vector<T, OtherA>() const {
auto __tmp = ::std::vector<T, OtherA>(this->size());
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
this->begin(), this->end(), __tmp.begin());
return __tmp;
}
device_vector()
: _alloc(get_default_queue()), _size(0), _capacity(_min_capacity()) {
_set_capacity_and_alloc();
}
~device_vector() /*= default*/ { _alloc.deallocate(_storage, _capacity); };
explicit device_vector(size_type n) : device_vector(n, T()) {}
explicit device_vector(size_type n, const T &value)
: _alloc(get_default_queue()), _size(n) {
_set_capacity_and_alloc();
if (_size > 0) {
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
begin(), end(), T(value));
}
}
device_vector(const device_vector &other) : _alloc(get_default_queue()) {
_size = other.size();
_capacity = other.capacity();
_storage = _alloc.allocate(_capacity);
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
other.begin(), other.end(), begin());
}
device_vector(device_vector &&other)
: _alloc(get_default_queue()), _size(other.size()),
_capacity(other.capacity()), _storage(other._storage) {
other._size = 0;
other._capacity = 0;
other._storage = nullptr;
}
template <typename InputIterator>
device_vector(InputIterator first,
typename ::std::enable_if<
internal::is_iterator<InputIterator>::value &&
!::std::is_pointer<InputIterator>::value &&
::std::is_same<typename ::std::iterator_traits<
InputIterator>::iterator_category,
::std::random_access_iterator_tag>::value,
InputIterator>::type last)
: _alloc(get_default_queue()) {
_size = ::std::distance(first, last);
_set_capacity_and_alloc();
if (_size > 0) {
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
first, last, begin());
}
}
template <typename InputIterator>
device_vector(InputIterator first,
typename ::std::enable_if<::std::is_pointer<InputIterator>::value,
InputIterator>::type last)
: _alloc(get_default_queue()) {
_size = ::std::distance(first, last);
_set_capacity_and_alloc();
if (_size > 0) {
auto ptr_type = sycl::get_pointer_type(first, get_default_context());
if (ptr_type != sycl::usm::alloc::host &&
ptr_type != sycl::usm::alloc::unknown) {
::std::copy(
oneapi::dpl::execution::make_device_policy(get_default_queue()),
first, last, begin());
} else {
sycl::buffer<typename ::std::iterator_traits<InputIterator>::value_type,
1>
buf(first, last);
auto buf_first = oneapi::dpl::begin(buf);
auto buf_last = oneapi::dpl::end(buf);
::std::copy(
oneapi::dpl::execution::make_device_policy(get_default_queue()),
buf_first, buf_last, begin());
}
}
}
template <typename InputIterator>
device_vector(InputIterator first,
typename ::std::enable_if<
internal::is_iterator<InputIterator>::value &&
!::std::is_pointer<InputIterator>::value &&
!::std::is_same<typename ::std::iterator_traits<
InputIterator>::iterator_category,
::std::random_access_iterator_tag>::value,
InputIterator>::type last)
: _alloc(get_default_queue()), _size(::std::distance(first, last)) {
_set_capacity_and_alloc();
::std::vector<T> _tmp(first, last);
if (_size > 0) {
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
_tmp.begin(), _tmp.end(), this->begin());
}
}
template <typename OtherAllocator>
device_vector(const device_vector<T, OtherAllocator> &v)
: _alloc(get_default_queue()), _storage(v.real_begin()), _size(v.size()),
_capacity(v.capacity()) {}
template <typename OtherAllocator>
device_vector(::std::vector<T, OtherAllocator> &v)
: _alloc(get_default_queue()), _size(v.size()) {
_set_capacity_and_alloc();
if (_size > 0) {
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
v.begin(), v.end(), this->begin());
}
}
template <typename OtherAllocator>
device_vector &operator=(const ::std::vector<T, OtherAllocator> &v) {
resize(v.size());
if (_size > 0) {
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
v.begin(), v.end(), begin());
}
return *this;
}
device_vector &operator=(const device_vector &other) {
// Copy assignment operator:
resize(other.size());
if (_size > 0) {
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
other.begin(), other.end(), begin());
}
return *this;
}
device_vector &operator=(device_vector &&other) {
// Move assignment operator:
device_vector dummy(::std::move(other));
this->swap(dummy);
return *this;
}
size_type size() const { return _size; }
iterator begin() noexcept { return device_iterator<T>(_storage, 0); }
iterator end() { return device_iterator<T>(_storage, size()); }
const_iterator begin() const noexcept {
return device_iterator<T>(_storage, 0);
}
const_iterator cbegin() const noexcept { return begin(); }
const_iterator end() const { return device_iterator<T>(_storage, size()); }
const_iterator cend() const { return end(); }
T *real_begin() { return _storage; }
const T *real_begin() const { return _storage; }
void swap(device_vector &v) {
::std::swap(_size, v._size);
::std::swap(_capacity, v._capacity);
::std::swap(_storage, v._storage);
::std::swap(_alloc, v._alloc);
}
reference operator[](size_type n) { return _storage[n]; }
const_reference operator[](size_type n) const { return _storage[n]; }
void reserve(size_type n) {
if (n > capacity()) {
// allocate buffer for new size
auto tmp = _alloc.allocate(2 * n);
// copy content (old buffer to new buffer)
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
begin(), end(), tmp);
// deallocate old memory
_alloc.deallocate(_storage, _capacity);
_storage = tmp;
_capacity = 2 * n;
}
}
void resize(size_type new_size, const T &x = T()) {
reserve(new_size);
if (_size < new_size) {
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
begin() + _size, begin() + new_size, x);
}
_size = new_size;
}
size_type max_size(void) const {
return ::std::numeric_limits<size_type>::max() / sizeof(T);
}
size_type capacity() const { return _capacity; }
const_reference front() const { return *begin(); }
reference front() { return *begin(); }
const_reference back(void) const { return *(end() - 1); }
reference back(void) { return *(end() - 1); }
pointer data(void) { return _storage; }
const_pointer data(void) const { return _storage; }
void shrink_to_fit(void) {
if (_size != capacity()) {
size_type tmp_capacity = ::std::max(_size, _min_capacity());
auto tmp = _alloc.allocate(tmp_capacity);
if (_size > 0) {
::std::copy(
oneapi::dpl::execution::make_device_policy(get_default_queue()),
begin(), end(), tmp);
}
_alloc.deallocate(_storage, _capacity);
_storage = tmp;
_capacity = tmp_capacity;
}
}
void assign(size_type n, const T &x) {
resize(n);
if (_size > 0) {
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
begin(), begin() + n, x);
}
}
template <typename InputIterator>
void
assign(InputIterator first,
typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
InputIterator>::type last) {
auto n = ::std::distance(first, last);
resize(n);
if (_size > 0) {
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
first, last, begin());
}
}
void clear(void) { _size = 0; }
bool empty(void) const { return (size() == 0); }
void push_back(const T &x) { insert(end(), size_type(1), x); }
void pop_back(void) {
if (_size > 0)
--_size;
}
iterator erase(iterator first, iterator last) {
auto n = ::std::distance(first, last);
if (last == end()) {
_size = _size - n;
return end();
}
auto m = ::std::distance(last, end());
if (m <= 0) {
return end();
}
auto tmp = _alloc.allocate(m);
// copy remainder to temporary buffer.
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
last, end(), tmp);
// override (erase) subsequence in storage.
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
tmp, tmp + m, first);
_alloc.deallocate(tmp, m);
_size -= n;
return begin() + first.get_idx() + n;
}
iterator erase(iterator pos) { return erase(pos, pos + 1); }
iterator insert(iterator position, const T &x) {
auto n = ::std::distance(begin(), position);
insert(position, size_type(1), x);
return begin() + n;
}
void insert(iterator position, size_type n, const T &x) {
if (position == end()) {
resize(size() + n);
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
end() - n, end(), x);
} else {
auto i_n = ::std::distance(begin(), position);
// allocate temporary storage
auto m = ::std::distance(position, end());
// will throw if position is not inside active vector
auto tmp = _alloc.allocate(m);
// copy remainder
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
position, end(), tmp);
resize(size() + n);
// resizing might invalidate position
position = begin() + position.get_idx();
::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
position, position + n, x);
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
tmp, tmp + m, position + n);
_alloc.deallocate(tmp, m);
}
}
template <typename InputIterator>
void
insert(iterator position, InputIterator first,
typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
InputIterator>::type last) {
auto n = ::std::distance(first, last);
if (position == end()) {
resize(size() + n);
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
first, last, end());
} else {
auto m = ::std::distance(position, end());
// will throw if position is not inside active vector
auto tmp = _alloc.allocate(m);
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
position, end(), tmp);
resize(size() + n);
// resizing might invalidate position
position = begin() + position.get_idx();
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
first, last, position);
::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
tmp, tmp + m, position + n);
_alloc.deallocate(tmp, m);
}
}
Allocator get_allocator() const { return _alloc; }
};
#else
template <typename T, typename Allocator = detail::__buffer_allocator<T>>
class device_vector {
static_assert(
std::is_same<Allocator, detail::__buffer_allocator<T>>::value,
"device_vector doesn't support custom allocator when USM is not used.");
public:
using iterator = device_iterator<T>;
using const_iterator = const iterator;
using reference = device_reference<T>;
using const_reference = const reference;
using value_type = T;
using pointer = T *;
using const_pointer = const T *;
using difference_type =
typename std::iterator_traits<iterator>::difference_type;
using size_type = std::size_t;
private:
using Buffer = sycl::buffer<T, 1>;
using Range = sycl::range<1>;
// Using mem_mgr to handle memory allocation
void *_storage;
size_type _size;
size_type _min_capacity() const { return size_type(1); }
void *alloc_store(size_type num_bytes) {
return detail::mem_mgr::instance().mem_alloc(num_bytes);
}
public:
template <typename OtherA> operator std::vector<T, OtherA>() const {
auto __tmp = std::vector<T, OtherA>(this->size());
std::copy(oneapi::dpl::execution::dpcpp_default, this->begin(), this->end(),
__tmp.begin());
return __tmp;
}
device_vector()
: _storage(alloc_store(_min_capacity() * sizeof(T))), _size(0) {}
~device_vector() = default;
explicit device_vector(size_type n) : device_vector(n, T()) {}
explicit device_vector(size_type n, const T &value)
: _storage(alloc_store(std::max(n, _min_capacity()) * sizeof(T))),
_size(n) {
auto buf = get_buffer();
std::fill(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(buf),
oneapi::dpl::begin(buf) + n, T(value));
}
device_vector(const device_vector &other)
: _storage(other._storage), _size(other.size()) {}
device_vector(device_vector &&other)
: _storage(std::move(other._storage)), _size(other.size()) {}
template <typename InputIterator>
device_vector(InputIterator first,
typename std::enable_if<
internal::is_iterator<InputIterator>::value &&
!std::is_pointer<InputIterator>::value &&
std::is_same<typename std::iterator_traits<
InputIterator>::iterator_category,
std::random_access_iterator_tag>::value,
InputIterator>::type last)
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
_size(std::distance(first, last)) {
auto buf = get_buffer();
auto dst = oneapi::dpl::begin(buf);
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
first, last, dst);
}
template <typename InputIterator>
device_vector(InputIterator first,
typename std::enable_if<std::is_pointer<InputIterator>::value,
InputIterator>::type last)
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
_size(std::distance(first, last)) {
auto buf = get_buffer();
Buffer tmp_buf(first, last);
auto start = oneapi::dpl::begin(tmp_buf);
auto end = oneapi::dpl::end(tmp_buf);
auto dst = oneapi::dpl::begin(buf);
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
start, end, dst);
}
template <typename InputIterator>
device_vector(InputIterator first,
typename std::enable_if<
internal::is_iterator<InputIterator>::value &&
!std::is_same<typename std::iterator_traits<
InputIterator>::iterator_category,
std::random_access_iterator_tag>::value,
InputIterator>::type last)
: _storage(alloc_store(std::distance(first, last) * sizeof(T))),
_size(std::distance(first, last)) {
auto buf = get_buffer();
std::vector<T> tmp(first, last);
Buffer tmp_buf(tmp);
auto start = oneapi::dpl::begin(tmp_buf);
auto end = oneapi::dpl::end(tmp_buf);
auto dst = oneapi::dpl::begin(buf);
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
start, end, dst);
}
template <typename OtherAllocator>
device_vector(const device_vector<T, OtherAllocator> &v)
: _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
auto buf = get_buffer();
auto dst = oneapi::dpl::begin(buf);
std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
v.real_begin(), v.real_begin() + v.size(), dst);
}
template <typename OtherAllocator>
device_vector(std::vector<T, OtherAllocator> &v)
: _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
std::copy(oneapi::dpl::execution::dpcpp_default, v.begin(), v.end(),
oneapi::dpl::begin(get_buffer()));
}
device_vector &operator=(const device_vector &other) {
// Copy assignment operator:
_size = other.size();
void *tmp = alloc_store(_size * sizeof(T));
auto tmp_buf =
detail::mem_mgr::instance()
.translate_ptr(tmp)
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
std::copy(oneapi::dpl::execution::dpcpp_default,
oneapi::dpl::begin(other.get_buffer()),
oneapi::dpl::end(other.get_buffer()),
oneapi::dpl::begin(tmp_buf));
detail::mem_mgr::instance().mem_free(_storage);
_storage = tmp;
return *this;
}
device_vector &operator=(device_vector &&other) {
// Move assignment operator:
_size = other.size();
this->_storage = std::move(other._storage);
return *this;
}
template <typename OtherAllocator>
device_vector &operator=(const std::vector<T, OtherAllocator> &v) {
Buffer data(v.begin(), v.end());
_size = v.size();
void *tmp = alloc_store(_size * sizeof(T));
auto tmp_buf =
detail::mem_mgr::instance()
.translate_ptr(tmp)
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(data),
oneapi::dpl::end(data), oneapi::dpl::begin(tmp_buf));
detail::mem_mgr::instance().mem_free(_storage);
_storage = tmp;
return *this;
}
Buffer get_buffer() const {
return detail::mem_mgr::instance()
.translate_ptr(_storage)
.buffer.template reinterpret<T, 1>(sycl::range<1>(capacity()));
}
size_type size() const { return _size; }
iterator begin() noexcept { return device_iterator<T>(get_buffer(), 0); }
iterator end() { return device_iterator<T>(get_buffer(), _size); }
const_iterator begin() const noexcept {
return device_iterator<T>(get_buffer(), 0);
}
const_iterator cbegin() const noexcept { return begin(); }
const_iterator end() const { return device_iterator<T>(get_buffer(), _size); }
const_iterator cend() const { return end(); }
T *real_begin() {
return (detail::mem_mgr::instance()
.translate_ptr(_storage)
.buffer.template get_access<sycl::access_mode::read_write>())
.get_pointer();
}
const T *real_begin() const {
return const_cast<device_vector *>(this)
->detail::mem_mgr::instance()
.translate_ptr(_storage)
.buffer.template get_access<sycl::access_mode::read_write>()
.get_pointer();
}
void swap(device_vector &v) {
void *temp = v._storage;
v._storage = this->_storage;
this->_storage = temp;
std::swap(_size, v._size);
}
reference operator[](size_type n) { return *(begin() + n); }
const_reference operator[](size_type n) const { return *(begin() + n); }
void reserve(size_type n) {
if (n > capacity()) {
// create new buffer (allocate for new size)
void *a = alloc_store(n * sizeof(T));
// copy content (old buffer to new buffer)
if (_storage != nullptr) {
auto tmp = detail::mem_mgr::instance()
.translate_ptr(a)
.buffer.template reinterpret<T, 1>(sycl::range<1>(n));
auto src_buf = get_buffer();
std::copy(oneapi::dpl::execution::dpcpp_default,
oneapi::dpl::begin(src_buf), oneapi::dpl::end(src_buf),
oneapi::dpl::begin(tmp));
// deallocate old memory
detail::mem_mgr::instance().mem_free(_storage);
}
_storage = a;
}
}
void resize(size_type new_size, const T &x = T()) {
reserve(new_size);
if (_size < new_size) {
auto src_buf = get_buffer();
std::fill(oneapi::dpl::execution::dpcpp_default,
oneapi::dpl::begin(src_buf) + _size,
oneapi::dpl::begin(src_buf) + new_size, x);
}
_size = new_size;
}
size_type max_size(void) const {
return std::numeric_limits<size_type>::max() / sizeof(T);
}
size_type capacity() const {
return _storage != nullptr ? detail::mem_mgr::instance()
.translate_ptr(_storage)
.buffer.size() /
sizeof(T)
: 0;
}
const_reference front() const { return *begin(); }
reference front() { return *begin(); }
const_reference back(void) const { return *(end() - 1); }
reference back(void) { return *(end() - 1); }
pointer data(void) { return reinterpret_cast<pointer>(_storage); }
const_pointer data(void) const {
return reinterpret_cast<const_pointer>(_storage);
}
void shrink_to_fit(void) {
if (_size != capacity()) {
void *a = alloc_store(_size * sizeof(T));
auto tmp = detail::mem_mgr::instance()
.translate_ptr(a)
.buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
std::copy(oneapi::dpl::execution::dpcpp_default,
oneapi::dpl::begin(get_buffer()),
oneapi::dpl::begin(get_buffer()) + _size,
oneapi::dpl::begin(tmp));
detail::mem_mgr::instance().mem_free(_storage);
_storage = a;
}
}
void assign(size_type n, const T &x) {
resize(n);
std::fill(oneapi::dpl::execution::dpcpp_default, begin(), begin() + n, x);
}
template <typename InputIterator>
void
assign(InputIterator first,
typename std::enable_if<internal::is_iterator<InputIterator>::value,
InputIterator>::type last) {
auto n = std::distance(first, last);
resize(n);
if (internal::is_iterator<InputIterator>::value &&
!std::is_pointer<InputIterator>::value)
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, begin());
else {
Buffer tmp(first, last);
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
oneapi::dpl::end(tmp), begin());
}
}
void clear(void) {
_size = 0;
detail::mem_mgr::instance().mem_free(_storage);
_storage = nullptr;
}
bool empty(void) const { return (size() == 0); }
void push_back(const T &x) { insert(end(), size_type(1), x); }
void pop_back(void) {
if (_size > 0)
--_size;
}
iterator erase(iterator first, iterator last) {
auto n = std::distance(first, last);
if (last == end()) {
_size = _size - n;
return end();
}
Buffer tmp{Range(std::distance(last, end()))};
// copy remainder to temporary buffer.
std::copy(oneapi::dpl::execution::dpcpp_default, last, end(),
oneapi::dpl::begin(tmp));
// override (erase) subsequence in storage.
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
oneapi::dpl::end(tmp), first);
resize(_size - n);
return begin() + first.get_idx() + n;
}
iterator erase(iterator pos) { return erase(pos, pos + 1); }
iterator insert(iterator position, const T &x) {
auto n = std::distance(begin(), position);
insert(position, size_type(1), x);
return begin() + n;
}
void insert(iterator position, size_type n, const T &x) {
if (position == end()) {
resize(size() + n);
std::fill(oneapi::dpl::execution::dpcpp_default, end() - n, end(), x);
} else {
auto i_n = std::distance(begin(), position);
// allocate temporary storage
Buffer tmp{Range(std::distance(position, end()))};
// copy remainder
std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
oneapi::dpl::begin(tmp));
resize(size() + n);
// resizing might invalidate position
position = begin() + position.get_idx();
std::fill(oneapi::dpl::execution::dpcpp_default, position, position + n,
x);
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
oneapi::dpl::end(tmp), position + n);
}
}
template <typename InputIterator>
void
insert(iterator position, InputIterator first,
typename std::enable_if<internal::is_iterator<InputIterator>::value,
InputIterator>::type last) {
auto n = std::distance(first, last);
if (position == end()) {
resize(size() + n);
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, end());
} else {
Buffer tmp{Range(std::distance(position, end()))};
std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
oneapi::dpl::begin(tmp));
resize(size() + n);
// resizing might invalidate position
position = begin() + position.get_idx();
std::copy(oneapi::dpl::execution::dpcpp_default, first, last, position);
std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
oneapi::dpl::end(tmp), position + n);
}
}
};
#endif
} // end namespace dpct
#endif

View file

@ -1,26 +0,0 @@
//==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_DPL_UTILS_HPP__
#define __DPCT_DPL_UTILS_HPP__
#define ONEDPL_USE_DPCPP_BACKEND 1
#define __USE_DPCT 1
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/numeric>
#include "dpl_extras/memory.h"
#include "dpl_extras/algorithm.h"
#include "dpl_extras/numeric.h"
#include "dpl_extras/iterators.h"
#include "dpl_extras/vector.h"
#include "dpl_extras/dpcpp_extensions.h"
#endif // __DPCT_DPL_UTILS_HPP__

File diff suppressed because it is too large Load diff

View file

@ -1,901 +0,0 @@
//==---- image.hpp --------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_IMAGE_HPP__
#define __DPCT_IMAGE_HPP__
#include <sycl/sycl.hpp>
#include "memory.hpp"
#include "util.hpp"
namespace dpct {
enum class image_channel_data_type {
signed_int,
unsigned_int,
fp,
};
class image_channel;
class image_wrapper_base;
namespace detail {
/// Image object type traits, with accessor type and sampled data type defined.
/// The data type of an image accessor must be one of sycl::int4, sycl::uint4,
/// sycl::float4 and sycl::half4. The data type of accessors with 8bits/16bits
/// channel width will be 32 bits. sycl::half is an exception.
template <class T> struct image_trait {
using acc_data_t = sycl::vec<T, 4>;
template <int dimensions>
using accessor_t =
sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
sycl::access::target::image>;
template <int dimensions>
using array_accessor_t =
sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
sycl::access::target::image_array>;
using data_t = T;
using elem_t = T;
static constexpr image_channel_data_type data_type =
std::is_integral<T>::value
? (std::is_signed<T>::value ? image_channel_data_type::signed_int
: image_channel_data_type::unsigned_int)
: image_channel_data_type::fp;
static constexpr int channel_num = 1;
};
template <>
struct image_trait<std::uint8_t> : public image_trait<std::uint32_t> {
using data_t = std::uint8_t;
using elem_t = data_t;
};
template <>
struct image_trait<std::uint16_t>
: public image_trait<std::uint32_t> {
using data_t = std::uint16_t;
using elem_t = data_t;
};
template <>
struct image_trait<std::int8_t> : public image_trait<std::int32_t> {
using data_t = std::int8_t;
using elem_t = data_t;
};
template <>
struct image_trait<std::int16_t> : public image_trait<std::int32_t> {
using data_t = std::int16_t;
using elem_t = data_t;
};
template <>
struct image_trait<char>
: public image_trait<typename std::conditional<
std::is_signed<char>::value, signed char, unsigned char>::type> {};
template <class T>
struct image_trait<sycl::vec<T, 1>> : public image_trait<T> {};
template <class T>
struct image_trait<sycl::vec<T, 2>> : public image_trait<T> {
using data_t = sycl::vec<T, 2>;
static constexpr int channel_num = 2;
};
template <class T>
struct image_trait<sycl::vec<T, 3>>
: public image_trait<sycl::vec<T, 4>> {
static constexpr int channel_num = 3;
};
template <class T>
struct image_trait<sycl::vec<T, 4>> : public image_trait<T> {
using data_t = sycl::vec<T, 4>;
static constexpr int channel_num = 4;
};
/// Functor to fetch data from read result of an image accessor.
template <class T> struct fetch_data {
using return_t = typename image_trait<T>::data_t;
using acc_data_t = typename image_trait<T>::acc_data_t;
return_t operator()(acc_data_t &&original_data) {
return (return_t)original_data.r();
}
};
template <class T>
struct fetch_data<sycl::vec<T, 1>> : public fetch_data<T> {};
template <class T> struct fetch_data<sycl::vec<T, 2>> {
using return_t = typename image_trait<sycl::vec<T, 2>>::data_t;
using acc_data_t = typename image_trait<sycl::vec<T, 2>>::acc_data_t;
return_t operator()(acc_data_t &&origin_data) {
return return_t(origin_data.r(), origin_data.g());
}
};
template <class T>
struct fetch_data<sycl::vec<T, 3>>
: public fetch_data<sycl::vec<T, 4>> {};
template <class T> struct fetch_data<sycl::vec<T, 4>> {
using return_t = typename image_trait<sycl::vec<T, 4>>::data_t;
using acc_data_t = typename image_trait<sycl::vec<T, 4>>::acc_data_t;
return_t operator()(acc_data_t &&origin_data) {
return return_t(origin_data.r(), origin_data.g(), origin_data.b(),
origin_data.a());
}
};
/// Create image according with given type \p T and \p dims.
template <class T> static image_wrapper_base *create_image_wrapper(int dims);
/// Create image with given data type \p T, channel order and dims
template <class T>
static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims);
/// Create image with channel info and specified dimensions.
static image_wrapper_base *create_image_wrapper(image_channel channel, int dims);
} // namespace detail
/// Image channel info, include channel number, order, data width and type
class image_channel {
image_channel_data_type _type = image_channel_data_type::signed_int;
/// Number of channels.
unsigned _channel_num = 0;
/// Total size of all channels in bytes.
unsigned _total_size = 0;
/// Size of each channel in bytes.
unsigned _channel_size = 0;
public:
/// Create image channel info according to template argument \p T.
template <class T> static image_channel create() {
image_channel channel;
channel.set_channel_size(detail::image_trait<T>::channel_num,
sizeof(typename detail::image_trait<T>::elem_t) *
8);
channel.set_channel_data_type(detail::image_trait<T>::data_type);
return channel;
}
image_channel() = default;
image_channel_data_type get_channel_data_type() { return _type; }
void set_channel_data_type(image_channel_data_type type) { _type = type; }
unsigned get_total_size() { return _total_size; }
unsigned get_channel_num() { return _channel_num; }
void set_channel_num(unsigned channel_num) {
_channel_num = channel_num;
_total_size = _channel_size * _channel_num;
}
/// image_channel constructor.
/// \param r Channel r width in bits.
/// \param g Channel g width in bits. Should be same with \p r, or zero.
/// \param b Channel b width in bits. Should be same with \p g, or zero.
/// \param a Channel a width in bits. Should be same with \p b, or zero.
/// \param data_type Image channel data type: signed_nt, unsigned_int or fp.
image_channel(int r, int g, int b, int a, image_channel_data_type data_type) {
_type = data_type;
if (a) {
assert(r == a && "SYCL doesn't support different channel size");
assert(r == b && "SYCL doesn't support different channel size");
assert(r == g && "SYCL doesn't support different channel size");
set_channel_size(4, a);
} else if (b) {
assert(r == b && "SYCL doesn't support different channel size");
assert(r == g && "SYCL doesn't support different channel size");
set_channel_size(3, b);
} else if (g) {
assert(r == g && "SYCL doesn't support different channel size");
set_channel_size(2, g);
} else {
set_channel_size(1, r);
}
}
sycl::image_channel_type get_channel_type() const {
if (_channel_size == 4) {
if (_type == image_channel_data_type::signed_int)
return sycl::image_channel_type::signed_int32;
else if (_type == image_channel_data_type::unsigned_int)
return sycl::image_channel_type::unsigned_int32;
else if (_type == image_channel_data_type::fp)
return sycl::image_channel_type::fp32;
} else if (_channel_size == 2) {
if (_type == image_channel_data_type::signed_int)
return sycl::image_channel_type::signed_int16;
else if (_type == image_channel_data_type::unsigned_int)
return sycl::image_channel_type::unsigned_int16;
else if (_type == image_channel_data_type::fp)
return sycl::image_channel_type::fp16;
} else {
if (_type == image_channel_data_type::signed_int)
return sycl::image_channel_type::signed_int8;
else if (_type == image_channel_data_type::unsigned_int)
return sycl::image_channel_type::unsigned_int8;
}
assert(false && "unexpected channel data kind and channel size");
return sycl::image_channel_type::signed_int32;
}
void set_channel_type(sycl::image_channel_type type) {
switch (type) {
case sycl::image_channel_type::unsigned_int8:
_type = image_channel_data_type::unsigned_int;
_channel_size = 1;
break;
case sycl::image_channel_type::unsigned_int16:
_type = image_channel_data_type::unsigned_int;
_channel_size = 2;
break;
case sycl::image_channel_type::unsigned_int32:
_type = image_channel_data_type::unsigned_int;
_channel_size = 4;
break;
case sycl::image_channel_type::signed_int8:
_type = image_channel_data_type::signed_int;
_channel_size = 1;
break;
case sycl::image_channel_type::signed_int16:
_type = image_channel_data_type::signed_int;
_channel_size = 2;
break;
case sycl::image_channel_type::signed_int32:
_type = image_channel_data_type::signed_int;
_channel_size = 4;
break;
case sycl::image_channel_type::fp16:
_type = image_channel_data_type::fp;
_channel_size = 2;
break;
case sycl::image_channel_type::fp32:
_type = image_channel_data_type::fp;
_channel_size = 4;
break;
default:
break;
}
_total_size = _channel_size * _channel_num;
}
sycl::image_channel_order get_channel_order() const {
switch (_channel_num) {
case 1:
return sycl::image_channel_order::r;
case 2:
return sycl::image_channel_order::rg;
case 3:
return sycl::image_channel_order::rgb;
case 4:
return sycl::image_channel_order::rgba;
default:
return sycl::image_channel_order::r;
}
}
/// Get the size for each channel in bits.
unsigned get_channel_size() const { return _channel_size * 8; }
/// Set channel size.
/// \param in_channel_num Channels number to set.
/// \param channel_size Size for each channel in bits.
void set_channel_size(unsigned in_channel_num,
unsigned channel_size) {
if (in_channel_num < _channel_num)
return;
_channel_num = in_channel_num;
_channel_size = channel_size / 8;
_total_size = _channel_size * _channel_num;
}
};
/// 2D or 3D matrix data for image.
class image_matrix {
image_channel _channel;
int _range[3] = {1, 1, 1};
int _dims = 0;
void *_host_data = nullptr;
/// Set range of each dimension.
template <int dimensions> void set_range(sycl::range<dimensions> range) {
for (int i = 0; i < dimensions; ++i)
_range[i] = range[i];
_dims = dimensions;
}
template <int... DimIdx>
sycl::range<sizeof...(DimIdx)> get_range(integer_sequence<DimIdx...>) {
return sycl::range<sizeof...(DimIdx)>(_range[DimIdx]...);
}
public:
/// Constructor with channel info and dimension size info.
template <int dimensions>
image_matrix(image_channel channel, sycl::range<dimensions> range)
: _channel(channel) {
set_range(range);
_host_data = std::malloc(range.size() * _channel.get_total_size());
}
image_matrix(sycl::image_channel_type channel_type, unsigned channel_num,
size_t x, size_t y) {
_channel.set_channel_type(channel_type);
_channel.set_channel_num(channel_num);
_dims = 1;
_range[0] = x;
if (y) {
_dims = 2;
_range[1] = y;
}
_host_data = std::malloc(_range[0] * _range[1] * _channel.get_total_size());
}
/// Construct a new image class with the matrix data.
template <int dimensions> sycl::image<dimensions> *create_image() {
return create_image<dimensions>(_channel);
}
/// Construct a new image class with the matrix data.
template <int dimensions>
sycl::image<dimensions> *create_image(image_channel channel) {
return new sycl::image<dimensions>(
_host_data, channel.get_channel_order(), channel.get_channel_type(),
get_range(make_index_sequence<dimensions>()),
sycl::property::image::use_host_ptr());
}
/// Get channel info.
inline image_channel get_channel() { return _channel; }
/// Get range of the image.
sycl::range<3> get_range() {
return sycl::range<3>(_range[0], _range[1], _range[2]);
}
/// Get matrix dims.
inline int get_dims() { return _dims; }
/// Convert to pitched data.
pitched_data to_pitched_data() {
return pitched_data(_host_data, _range[0] * _channel.get_total_size(),
_range[0], _range[1]);
}
~image_matrix() {
if (_host_data)
std::free(_host_data);
_host_data = nullptr;
}
};
using image_matrix_p = image_matrix *;
enum class image_data_type { matrix, linear, pitch, unsupport };
/// Image data info.
class image_data {
public:
image_data() { _type = image_data_type::unsupport; }
image_data(image_matrix_p matrix_data) { set_data(matrix_data); }
image_data(void *data_ptr, size_t x_size, image_channel channel) {
set_data(data_ptr, x_size, channel);
}
image_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
image_channel channel) {
set_data(data_ptr, x_size, y_size, pitch_size, channel);
}
void set_data(image_matrix_p matrix_data) {
_type = image_data_type::matrix;
_data = matrix_data;
_channel = matrix_data->get_channel();
}
void set_data(void *data_ptr, size_t x_size, image_channel channel) {
_type = image_data_type::linear;
_data = data_ptr;
_x = x_size;
_channel = channel;
}
void set_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
image_channel channel) {
_type = image_data_type::pitch;
_data = data_ptr;
_x = x_size;
_y = y_size;
_pitch = pitch_size;
_channel = channel;
}
image_data_type get_data_type() const { return _type; }
void set_data_type(image_data_type type) { _type = type; }
void *get_data_ptr() const { return _data; }
void set_data_ptr(void *data) { _data = data; }
size_t get_x() const { return _x; }
void set_x(size_t x) { _x = x; }
size_t get_y() const { return _y; }
void set_y(size_t y) { _y = y; }
size_t get_pitch() const { return _pitch; }
void set_pitch(size_t pitch) { _pitch = pitch; }
image_channel get_channel() const { return _channel; }
void set_channel(image_channel channel) { _channel = channel; }
image_channel_data_type get_channel_data_type() {
return _channel.get_channel_data_type();
}
void set_channel_data_type(image_channel_data_type type) {
_channel.set_channel_data_type(type);
}
unsigned get_channel_size() { return _channel.get_channel_size(); }
void set_channel_size(unsigned channel_num, unsigned channel_size) {
return _channel.set_channel_size(channel_num, channel_size);
}
unsigned get_channel_num() { return _channel.get_channel_num(); }
void set_channel_num(unsigned num) {
return _channel.set_channel_num(num);
}
sycl::image_channel_type get_channel_type() {
return _channel.get_channel_type();
}
void set_channel_type(sycl::image_channel_type type) {
return _channel.set_channel_type(type);
}
private:
image_data_type _type;
void *_data = nullptr;
size_t _x, _y, _pitch;
image_channel _channel;
};
/// Image sampling info, include addressing mode, filtering mode and
/// normalization info.
class sampling_info {
sycl::addressing_mode _addressing_mode =
sycl::addressing_mode::clamp_to_edge;
sycl::filtering_mode _filtering_mode = sycl::filtering_mode::nearest;
sycl::coordinate_normalization_mode _coordinate_normalization_mode =
sycl::coordinate_normalization_mode::unnormalized;
public:
sycl::addressing_mode get_addressing_mode() { return _addressing_mode; }
void set(sycl::addressing_mode addressing_mode) { _addressing_mode = addressing_mode; }
sycl::filtering_mode get_filtering_mode() { return _filtering_mode; }
void set(sycl::filtering_mode filtering_mode) { _filtering_mode = filtering_mode; }
sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
return _coordinate_normalization_mode;
}
void set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
_coordinate_normalization_mode = coordinate_normalization_mode;
}
bool is_coordinate_normalized() {
return _coordinate_normalization_mode ==
sycl::coordinate_normalization_mode::normalized;
}
void set_coordinate_normalization_mode(int is_normalized) {
_coordinate_normalization_mode =
is_normalized ? sycl::coordinate_normalization_mode::normalized
: sycl::coordinate_normalization_mode::unnormalized;
}
void
set(sycl::addressing_mode addressing_mode,
sycl::filtering_mode filtering_mode,
sycl::coordinate_normalization_mode coordinate_normalization_mode) {
set(addressing_mode);
set(filtering_mode);
set(coordinate_normalization_mode);
}
void set(sycl::addressing_mode addressing_mode,
sycl::filtering_mode filtering_mode, int is_normalized) {
set(addressing_mode);
set(filtering_mode);
set_coordinate_normalization_mode(is_normalized);
}
sycl::sampler get_sampler() {
return sycl::sampler(_coordinate_normalization_mode, _addressing_mode,
_filtering_mode);
}
};
/// Image base class.
class image_wrapper_base {
sampling_info _sampling_info;
image_data _data;
public:
virtual ~image_wrapper_base() = 0;
void attach(image_data data) { set_data(data); }
/// Attach matrix data to this class.
void attach(image_matrix *matrix) {
detach();
image_wrapper_base::set_data(image_data(matrix));
}
/// Attach matrix data to this class.
void attach(image_matrix *matrix, image_channel channel) {
attach(matrix);
image_wrapper_base::set_channel(channel);
}
/// Attach linear data to this class.
void attach(const void *ptr, size_t count) {
attach(ptr, count, get_channel());
}
/// Attach linear data to this class.
void attach(const void *ptr, size_t count, image_channel channel) {
detach();
image_wrapper_base::set_data(image_data(const_cast<void *>(ptr), count, channel));
}
/// Attach 2D data to this class.
void attach(const void *data, size_t x, size_t y, size_t pitch) {
attach(data, x, y, pitch, get_channel());
}
/// Attach 2D data to this class.
void attach(const void *data, size_t x, size_t y, size_t pitch,
image_channel channel) {
detach();
image_wrapper_base::set_data(
image_data(const_cast<void *>(data), x, y, pitch, channel));
}
/// Detach data.
virtual void detach() {}
sampling_info get_sampling_info() { return _sampling_info; }
void set_sampling_info(sampling_info info) {
_sampling_info = info;
}
const image_data &get_data() { return _data; }
void set_data(image_data data) { _data = data; }
image_channel get_channel() { return _data.get_channel(); }
void set_channel(image_channel channel) { _data.set_channel(channel); }
image_channel_data_type get_channel_data_type() {
return _data.get_channel_data_type();
}
void set_channel_data_type(image_channel_data_type type) {
_data.set_channel_data_type(type);
}
unsigned get_channel_size() { return _data.get_channel_size(); }
void set_channel_size(unsigned channel_num, unsigned channel_size) {
return _data.set_channel_size(channel_num, channel_size);
}
sycl::addressing_mode get_addressing_mode() {
return _sampling_info.get_addressing_mode();
}
void set(sycl::addressing_mode addressing_mode) {
_sampling_info.set(addressing_mode);
}
sycl::filtering_mode get_filtering_mode() {
return _sampling_info.get_filtering_mode();
}
void set(sycl::filtering_mode filtering_mode) {
_sampling_info.set(filtering_mode);
}
sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
return _sampling_info.get_coordinate_normalization_mode();
}
void
set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
_sampling_info.set(coordinate_normalization_mode);
}
bool is_coordinate_normalized() {
return _sampling_info.is_coordinate_normalized();
}
void set_coordinate_normalization_mode(int is_normalized) {
_sampling_info.set_coordinate_normalization_mode(is_normalized);
}
void
set(sycl::addressing_mode addressing_mode,
sycl::filtering_mode filtering_mode,
sycl::coordinate_normalization_mode coordinate_normalization_mode) {
set(addressing_mode);
set(filtering_mode);
set(coordinate_normalization_mode);
}
void set(sycl::addressing_mode addressing_mode,
sycl::filtering_mode filtering_mode, int is_normalized) {
set(addressing_mode);
set(filtering_mode);
set_coordinate_normalization_mode(is_normalized);
}
unsigned get_channel_num() { return _data.get_channel_num(); }
void set_channel_num(unsigned num) {
return _data.set_channel_num(num);
}
sycl::image_channel_type get_channel_type() {
return _data.get_channel_type();
}
void set_channel_type(sycl::image_channel_type type) {
return _data.set_channel_type(type);
}
sycl::sampler get_sampler() {
sycl::sampler smp = _sampling_info.get_sampler();
/// linear memory only used for sycl::filtering_mode::nearest.
if (_data.get_data_type() == image_data_type::linear) {
smp = sycl::sampler(smp.get_coordinate_normalization_mode(),
smp.get_addressing_mode(),
sycl::filtering_mode::nearest);
}
return smp;
}
};
inline image_wrapper_base::~image_wrapper_base() {}
using image_wrapper_base_p = image_wrapper_base *;
template <class T, int dimensions, bool IsImageArray> class image_accessor_ext;
/// Image class, wrapper of sycl::image.
template <class T, int dimensions, bool IsImageArray = false> class image_wrapper : public image_wrapper_base {
sycl::image<dimensions> *_image = nullptr;
#ifndef DPCT_USM_LEVEL_NONE
std::vector<char> _host_buffer;
#endif
void create_image(sycl::queue q) {
auto &data = get_data();
if (data.get_data_type() == image_data_type::matrix) {
_image = static_cast<image_matrix_p>(data.get_data_ptr())
->create_image<dimensions>(data.get_channel());
return;
}
auto ptr = data.get_data_ptr();
auto channel = data.get_channel();
if (detail::get_pointer_attribute(q, ptr) == detail::pointer_access_attribute::device_only) {
#ifdef DPCT_USM_LEVEL_NONE
ptr = get_buffer(ptr)
.template get_access<sycl::access_mode::read_write>()
.get_pointer();
#else
auto sz = data.get_x();
if (data.get_data_type() == image_data_type::pitch)
sz *= channel.get_total_size() * data.get_y();
_host_buffer.resize(sz);
q.memcpy(_host_buffer.data(), ptr, sz).wait();
ptr = _host_buffer.data();
#endif
}
if constexpr (dimensions == 1) {
assert(data.get_data_type() == image_data_type::linear);
_image = new sycl::image<1>(
ptr, channel.get_channel_order(), channel.get_channel_type(),
sycl::range<1>(data.get_x() / channel.get_total_size()));
} else if constexpr (dimensions == 2) {
assert(data.get_data_type() == image_data_type::pitch);
_image = new sycl::image<2>(ptr, channel.get_channel_order(),
channel.get_channel_type(),
sycl::range<2>(data.get_x(), data.get_y()),
sycl::range<1>(data.get_pitch()));
} else {
throw std::runtime_error("3D image only support matrix data");
}
return;
}
public:
using acc_data_t = typename detail::image_trait<T>::acc_data_t;
using accessor_t =
typename image_accessor_ext<T, IsImageArray ? (dimensions - 1) : dimensions,
IsImageArray>::accessor_t;
image_wrapper() { set_channel(image_channel::create<T>()); }
~image_wrapper() { detach(); }
/// Get image accessor.
accessor_t get_access(sycl::handler &cgh, sycl::queue &q = get_default_queue()) {
if (!_image)
create_image(q);
return accessor_t(*_image, cgh);
}
/// Detach data.
void detach() override {
if (_image)
delete _image;
_image = nullptr;
}
};
/// Wrap sampler and image accessor together.
template <class T, int dimensions, bool IsImageArray = false>
class image_accessor_ext {
public:
using accessor_t =
typename detail::image_trait<T>::template accessor_t<dimensions>;
using data_t = typename detail::image_trait<T>::data_t;
sycl::sampler _sampler;
accessor_t _img_acc;
public:
image_accessor_ext(sycl::sampler sampler, accessor_t acc)
: _sampler(sampler), _img_acc(acc) {}
/// Read data from accessor.
template <bool Available = dimensions == 3>
typename std::enable_if<Available, data_t>::type read(float x, float y,
float z) {
return detail::fetch_data<T>()(
_img_acc.read(sycl::float4(x, y, z, 0), _sampler));
}
/// Read data from accessor.
template <class Coord0, class Coord1, class Coord2,
bool Available = dimensions == 3 &&
std::is_integral<Coord0>::value
&&std::is_integral<Coord1>::value
&&std::is_integral<Coord2>::value>
typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y,
Coord2 z) {
return detail::fetch_data<T>()(
_img_acc.read(sycl::int4(x, y, z, 0), _sampler));
}
/// Read data from accessor.
template <bool Available = dimensions == 2>
typename std::enable_if<Available, data_t>::type read(float x, float y) {
return detail::fetch_data<T>()(
_img_acc.read(sycl::float2(x, y), _sampler));
}
/// Read data from accessor.
template <class Coord0, class Coord1,
bool Available = dimensions == 2 &&
std::is_integral<Coord0>::value
&&std::is_integral<Coord1>::value>
typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y) {
return detail::fetch_data<T>()(
_img_acc.read(sycl::int2(x, y), _sampler));
}
/// Read data from accessor.
template <bool Available = dimensions == 1>
typename std::enable_if<Available, data_t>::type read(float x) {
return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
}
/// Read data from accessor.
template <class CoordT,
bool Available = dimensions == 1 && std::is_integral<CoordT>::value>
typename std::enable_if<Available, data_t>::type read(CoordT x) {
return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
}
};
template <class T, int dimensions> class image_accessor_ext<T, dimensions, true> {
public:
using accessor_t =
typename detail::image_trait<T>::template array_accessor_t<dimensions>;
using data_t = typename detail::image_trait<T>::data_t;
sycl::sampler _sampler;
accessor_t _img_acc;
public:
image_accessor_ext(sycl::sampler sampler, accessor_t acc)
: _sampler(sampler), _img_acc(acc) {}
/// Read data from accessor.
template <bool Available = dimensions == 2>
typename std::enable_if<Available, data_t>::type read(int index, float x,
float y) {
return detail::fetch_data<T>()(
_img_acc[index].read(sycl::float2(x, y), _sampler));
}
/// Read data from accessor.
template <bool Available = dimensions == 2>
typename std::enable_if<Available, data_t>::type read(int index, int x, int y) {
return detail::fetch_data<T>()(
_img_acc[index].read(sycl::int2(x, y), _sampler));
}
/// Read data from accessor.
template <bool Available = dimensions == 1>
typename std::enable_if<Available, data_t>::type read(int index, float x) {
return detail::fetch_data<T>()(
_img_acc[index].read(x, _sampler));
}
/// Read data from accessor.
template <bool Available = dimensions == 1>
typename std::enable_if<Available, data_t>::type read(int index, int x) {
return detail::fetch_data<T>()(
_img_acc[index].read(x, _sampler));
}
};
/// Create image wrapper according to image data and sampling info.
/// \return Pointer to image wrapper base class.
/// \param data Image data used to create image wrapper.
/// \param info Image sampling info used to create image wrapper.
/// \returns Pointer to base class of created image wrapper object.
static inline image_wrapper_base *create_image_wrapper(image_data data,
sampling_info info) {
image_channel channel;
int dims = 1;
if (data.get_data_type() == image_data_type::matrix) {
auto matrix = (image_matrix_p)data.get_data_ptr();
channel = matrix->get_channel();
dims = matrix->get_dims();
} else {
if (data.get_data_type() == image_data_type::pitch) {
dims = 2;
}
channel = data.get_channel();
}
if (auto ret = detail::create_image_wrapper(channel, dims)) {
ret->set_sampling_info(info);
ret->set_data(data);
return ret;
}
return nullptr;
}
namespace detail {
/// Create image according with given type \p T and \p dims.
template <class T> static image_wrapper_base *create_image_wrapper(int dims) {
switch (dims) {
case 1:
return new image_wrapper<T, 1>();
case 2:
return new image_wrapper<T, 2>();
case 3:
return new image_wrapper<T, 3>();
default:
return nullptr;
}
}
/// Create image with given data type \p T, channel order and dims
template <class T>
static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims) {
switch (channel_num) {
case 1:
return create_image_wrapper<T>(dims);
case 2:
return create_image_wrapper<sycl::vec<T, 2>>(dims);
case 3:
return create_image_wrapper<sycl::vec<T, 3>>(dims);
case 4:
return create_image_wrapper<sycl::vec<T, 4>>(dims);
default:
return nullptr;
}
}
/// Create image with channel info and specified dimensions.
static image_wrapper_base *create_image_wrapper(image_channel channel, int dims) {
switch (channel.get_channel_type()) {
case sycl::image_channel_type::fp16:
return create_image_wrapper<sycl::half>(channel.get_channel_num(), dims);
case sycl::image_channel_type::fp32:
return create_image_wrapper<float>(channel.get_channel_num(), dims);
case sycl::image_channel_type::signed_int8:
return create_image_wrapper<std::int8_t>(channel.get_channel_num(), dims);
case sycl::image_channel_type::signed_int16:
return create_image_wrapper<std::int16_t>(channel.get_channel_num(), dims);
case sycl::image_channel_type::signed_int32:
return create_image_wrapper<std::int32_t>(channel.get_channel_num(), dims);
case sycl::image_channel_type::unsigned_int8:
return create_image_wrapper<std::uint8_t>(channel.get_channel_num(), dims);
case sycl::image_channel_type::unsigned_int16:
return create_image_wrapper<std::uint16_t>(channel.get_channel_num(), dims);
case sycl::image_channel_type::unsigned_int32:
return create_image_wrapper<std::uint32_t>(channel.get_channel_num(), dims);
default:
return nullptr;
}
}
} // namespace detail
} // namespace dpct
#endif // !__DPCT_IMAGE_HPP__

View file

@ -1,459 +0,0 @@
//==---- kernel.hpp -------------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_KERNEL_HPP__
#define __DPCT_KERNEL_HPP__
#include <sycl/sycl.hpp>
#ifdef _WIN32
#include <unordered_set>
#include <windows.h>
#else
#include <dlfcn.h>
#endif
#if defined(__has_include) && __has_include(<filesystem>)
#include <filesystem>
#elif defined(__has_include) && __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
#else
#error "SYCLomatic runtime requires C++ filesystem support"
#endif
#include <fstream>
#include <image.hpp>
#include <random>
namespace dpct {
typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &,
unsigned int, void **, void **);
struct kernel_function_info {
int max_work_group_size = 0;
};
static inline void get_kernel_function_info(kernel_function_info *kernel_info,
const void *function) {
kernel_info->max_work_group_size =
dpct::dev_mgr::instance()
.current_device()
.get_info<sycl::info::device::max_work_group_size>();
}
static inline kernel_function_info
get_kernel_function_info(const void *function) {
kernel_function_info kernel_info;
kernel_info.max_work_group_size =
dpct::dev_mgr::instance()
.current_device()
.get_info<sycl::info::device::max_work_group_size>();
return kernel_info;
}
namespace detail {
#if defined(__has_include) && __has_include(<filesystem>)
namespace fs = std::filesystem;
#else
namespace fs = std::experimental::filesystem;
#endif
/// Write data to temporary file and return absolute path to temporary file.
/// Temporary file is created in a temporary directory both of which have random
/// names with only the user having access permissions. Only one temporary file
/// will be created in the temporary directory.
static inline fs::path write_data_to_file(char const *const data, size_t size) {
std::error_code ec;
if (sizeof(size_t) >= sizeof(std::streamsize) &&
size > (std::numeric_limits<std::streamsize>::max)())
throw std::runtime_error("data file too large");
// random number generator
std::random_device dev;
std::mt19937 prng(dev());
std::uniform_int_distribution<uint64_t> rand(0);
// find temporary directory
auto tmp_dir = fs::temp_directory_path(ec);
if (ec)
throw std::runtime_error("could not find temporary directory");
// create private directory
std::stringstream directory;
fs::path directory_path;
constexpr int max_attempts = 5;
int i;
for (i = 0; i < max_attempts; i++) {
directory << std::hex << rand(prng);
directory_path = tmp_dir / directory.str();
if (fs::create_directory(directory_path)) {
break;
}
}
if (i == max_attempts)
throw std::runtime_error("could not create directory");
// only allow owner permissions to private directory
fs::permissions(directory_path, fs::perms::owner_all, ec);
if (ec)
throw std::runtime_error("could not set directory permissions");
// random filename in private directory
std::stringstream filename;
filename << std::hex << rand(prng);
#ifdef _WIN32
auto filepath = directory_path / (filename.str() + ".dll");
#else
auto filepath = directory_path / filename.str();
#endif
// write data to temporary file
auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary);
if (outfile) {
// only allow program to write file
fs::permissions(filepath, fs::perms::owner_write, ec);
if (ec)
throw std::runtime_error("could not set permissions");
outfile.write(data, size);
if (!outfile.good())
throw std::runtime_error("could not write data");
outfile.close();
// only allow program to read/execute file
fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec,
ec);
if (ec)
throw std::runtime_error("could not set permissions");
} else
throw std::runtime_error("could not write data");
// check temporary file contents
auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary);
if (infile) {
bool mismatch = false;
size_t cnt = 0;
while (1) {
char c;
infile.get(c);
if (infile.eof())
break;
if (c != data[cnt++])
mismatch = true;
}
if (cnt != size || mismatch)
throw std::runtime_error("file contents not written correctly");
} else
throw std::runtime_error("could not validate file");
if (!filepath.is_absolute())
throw std::runtime_error("temporary filepath is not absolute");
return filepath;
}
static inline uint16_t extract16(unsigned char const *const ptr) {
uint16_t ret = 0;
ret |= static_cast<uint16_t>(ptr[0]) << 0;
ret |= static_cast<uint16_t>(ptr[1]) << 8;
return (ret);
}
static inline uint32_t extract32(unsigned char const *const ptr) {
uint32_t ret = 0;
ret |= static_cast<uint32_t>(ptr[0]) << 0;
ret |= static_cast<uint32_t>(ptr[1]) << 8;
ret |= static_cast<uint32_t>(ptr[2]) << 16;
ret |= static_cast<uint32_t>(ptr[3]) << 24;
return (ret);
}
static inline uint64_t extract64(unsigned char const *const ptr) {
uint64_t ret = 0;
ret |= static_cast<uint64_t>(ptr[0]) << 0;
ret |= static_cast<uint64_t>(ptr[1]) << 8;
ret |= static_cast<uint64_t>(ptr[2]) << 16;
ret |= static_cast<uint64_t>(ptr[3]) << 24;
ret |= static_cast<uint64_t>(ptr[4]) << 32;
ret |= static_cast<uint64_t>(ptr[5]) << 40;
ret |= static_cast<uint64_t>(ptr[6]) << 48;
ret |= static_cast<uint64_t>(ptr[7]) << 56;
return (ret);
}
static inline uint64_t get_lib_size(char const *const blob) {
#ifdef _WIN32
///////////////////////////////////////////////////////////////////////
// Analyze DOS stub
unsigned char const *const ublob =
reinterpret_cast<unsigned char const *const>(blob);
if (ublob[0] != 0x4d || ublob[1] != 0x5a) {
throw std::runtime_error("Blob is not a Windows DLL.");
}
uint32_t pe_header_offset = extract32(ublob + 0x3c);
///////////////////////////////////////////////////////////////////////
// Ananlyze PE-header
unsigned char const *const pe_header = ublob + pe_header_offset;
// signature
uint32_t pe_signature = extract32(pe_header + 0);
if (pe_signature != 0x00004550) {
throw std::runtime_error("PE-header signature is not 0x00004550");
}
// machine
uint16_t machine = extract16(pe_header + 4);
if (machine != 0x8664) {
throw std::runtime_error("Only DLLs for x64 supported");
}
// number of sections
uint16_t number_of_sections = extract16(pe_header + 6);
// sizeof optional header
uint16_t sizeof_optional_header = extract16(pe_header + 20);
// magic
uint16_t magic = extract16(pe_header + 24);
if (magic != 0x10b && magic != 0x20b) {
throw std::runtime_error("MAGIC is not 0x010b or 0x020b");
}
///////////////////////////////////////////////////////////////////////
// Analyze tail of optional header
constexpr int coff_header_size = 24;
unsigned char const *const tail_of_optional_header =
pe_header + coff_header_size + sizeof_optional_header;
if (extract64(tail_of_optional_header - 8) != 0) {
throw std::runtime_error("Optional header not zero-padded");
}
///////////////////////////////////////////////////////////////////////
// Analyze last section header
constexpr int section_header_size = 40;
unsigned char const *const last_section_header =
tail_of_optional_header + section_header_size * (number_of_sections - 1);
uint32_t sizeof_raw_data = extract32(last_section_header + 16);
uint32_t pointer_to_raw_data = extract32(last_section_header + 20);
return sizeof_raw_data + pointer_to_raw_data;
#else
if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F')
throw std::runtime_error("Blob is not in ELF format");
if (blob[4] != 0x02)
throw std::runtime_error("Only 64-bit headers are supported");
if (blob[5] != 0x01)
throw std::runtime_error("Only little-endian headers are supported");
unsigned char const *const ublob =
reinterpret_cast<unsigned char const *const>(blob);
uint64_t e_shoff = extract64(ublob + 0x28);
uint16_t e_shentsize = extract16(ublob + 0x3A);
uint16_t e_shnum = extract16(ublob + 0x3C);
return e_shoff + (e_shentsize * e_shnum);
#endif
}
#ifdef _WIN32
class path_lib_record {
public:
void operator=(const path_lib_record &) = delete;
~path_lib_record() {
for (auto entry : lib_to_path) {
FreeLibrary(static_cast<HMODULE>(entry.first));
fs::permissions(entry.second, fs::perms::owner_all);
fs::remove_all(entry.second.remove_filename());
}
}
static void record_lib_path(fs::path path, void *library) {
lib_to_path[library] = path;
}
static void remove_lib(void *library) {
auto path = lib_to_path[library];
std::error_code ec;
FreeLibrary(static_cast<HMODULE>(library));
fs::permissions(path, fs::perms::owner_all);
if (fs::remove_all(path.remove_filename(), ec) != 2 || ec)
// one directory and one temporary file should have been deleted
throw std::runtime_error("Directory delete failed");
lib_to_path.erase(library);
}
private:
static inline std::unordered_map<void *, fs::path> lib_to_path;
};
#endif
} // namespace detail
class kernel_library {
public:
kernel_library() : ptr{nullptr} {}
kernel_library(void *ptr) : ptr{ptr} {}
operator void *() const { return ptr; }
private:
void *ptr;
#ifdef _WIN32
static inline detail::path_lib_record single_instance_to_trigger_destructor;
#endif
};
namespace detail {
static inline kernel_library load_dl_from_data(char const *const data,
size_t size) {
fs::path filename = write_data_to_file(data, size);
#ifdef _WIN32
void *so = LoadLibraryW(filename.wstring().c_str());
#else
void *so = dlopen(filename.c_str(), RTLD_LAZY);
#endif
if (so == nullptr)
throw std::runtime_error("Failed to load kernel library");
#ifdef _WIN32
detail::path_lib_record::record_lib_path(filename, so);
#else
std::error_code ec;
// Windows DLL cannot be deleted while in use
if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec)
// one directory and one temporary file should have been deleted
throw std::runtime_error("Directory delete failed");
#endif
return so;
}
} // namespace detail
/// Load kernel library and return a handle to use the library.
/// \param [in] name The name of the library.
static inline kernel_library load_kernel_library(const std::string &name) {
std::ifstream ifs;
ifs.open(name, std::ios::in | std::ios::binary);
std::stringstream buffer;
buffer << ifs.rdbuf();
const std::string buffer_string = buffer.str();
return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size());
}
/// Load kernel library whose image is alreay in memory and return a handle to
/// use the library.
/// \param [in] image A pointer to the image in memory.
static inline kernel_library load_kernel_library_mem(char const *const image) {
const size_t size = detail::get_lib_size(image);
return detail::load_dl_from_data(image, size);
}
/// Unload kernel library.
/// \param [in,out] library Handle to the library to be closed.
static inline void unload_kernel_library(const kernel_library &library) {
#ifdef _WIN32
detail::path_lib_record::remove_lib(library);
#else
dlclose(library);
#endif
}
class kernel_function {
public:
kernel_function() : ptr{nullptr} {}
kernel_function(dpct::kernel_functor ptr) : ptr{ptr} {}
operator void *() const { return ((void *)ptr); }
void operator()(sycl::queue &q, const sycl::nd_range<3> &range,
unsigned int a, void **args, void **extra) {
ptr(q, range, a, args, extra);
}
private:
dpct::kernel_functor ptr;
};
/// Find kernel function in a kernel library and return its address.
/// \param [in] library Handle to the kernel library.
/// \param [in] name Name of the kernel function.
static inline dpct::kernel_function
get_kernel_function(kernel_library &library, const std::string &name) {
#ifdef _WIN32
dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
GetProcAddress(static_cast<HMODULE>(static_cast<void *>(library)),
(name + std::string("_wrapper")).c_str()));
#else
dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
dlsym(library, (name + std::string("_wrapper")).c_str()));
#endif
if (fn == nullptr)
throw std::runtime_error("Failed to get function");
return fn;
}
/// Invoke a kernel function.
/// \param [in] function kernel function.
/// \param [in] queue SYCL queue used to execute kernel
/// \param [in] groupRange SYCL group range
/// \param [in] localRange SYCL local range
/// \param [in] localMemSize The size of local memory required by the kernel
/// function.
/// \param [in] kernelParams Array of pointers to kernel arguments.
/// \param [in] extra Extra arguments.
static inline void invoke_kernel_function(dpct::kernel_function &function,
sycl::queue &queue,
sycl::range<3> groupRange,
sycl::range<3> localRange,
unsigned int localMemSize,
void **kernelParams, void **extra) {
function(queue, sycl::nd_range<3>(groupRange * localRange, localRange),
localMemSize, kernelParams, extra);
}
/// Find image wrapper in a kernel library and return its address.
/// \param [in] library Handle to the kernel library.
/// \param [in] name Name of the target image wrapper.
static inline dpct::image_wrapper_base_p
get_image_wrapper(dpct::kernel_library &library, const std::string &name) {
#ifdef _WIN32
dpct::image_wrapper_base_p fn =
reinterpret_cast<dpct::image_wrapper_base_p>(GetProcAddress(
static_cast<HMODULE>(static_cast<void *>(library)), name.c_str()));
#else
dpct::image_wrapper_base_p fn = reinterpret_cast<dpct::image_wrapper_base_p>(
dlsym(library, name.c_str()));
#endif
if (fn == nullptr)
throw std::runtime_error("Failed to get image");
return fn;
}
} // namespace dpct
#endif // __DPCT_KERNEL_HPP__

File diff suppressed because it is too large Load diff

View file

@ -1,174 +0,0 @@
//==---- lib_common_utils.hpp ---------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_LIB_COMMON_UTILS_HPP__
#define __DPCT_LIB_COMMON_UTILS_HPP__
#include <sycl/sycl.hpp>
#include <oneapi/mkl.hpp>
#include "memory.hpp"
#include "util.hpp"
namespace dpct {
namespace detail {
template <typename T> inline auto get_memory(const void *x) {
T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
#ifdef DPCT_USM_LEVEL_NONE
return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
#else
return new_x;
#endif
}
template <typename T>
inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q) {
using Ty = typename DataType<T>::T2;
Ty s_h;
if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
.wait();
else
s_h = *reinterpret_cast<const Ty *>(s);
return s_h;
}
} // namespace detail
enum class version_field : int { major, minor, update, patch };
/// Returns the requested field of Intel(R) oneAPI Math Kernel Library version.
/// \param field The version information field (major, minor, update or patch).
/// \param result The result value.
inline void mkl_get_version(version_field field, int *result) {
#ifndef __INTEL_MKL__
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
"Project does not support this API.");
#else
MKLVersion version;
mkl_get_version(&version);
if (version_field::major == field) {
*result = version.MajorVersion;
} else if (version_field::minor == field) {
*result = version.MinorVersion;
} else if (version_field::update == field) {
*result = version.UpdateVersion;
} else if (version_field::patch == field) {
*result = 0;
} else {
throw std::runtime_error("unknown field");
}
#endif
}
enum class library_data_t : unsigned char {
real_float = 0,
complex_float,
real_double,
complex_double,
real_half,
complex_half,
real_bfloat16,
complex_bfloat16,
real_int4,
complex_int4,
real_uint4,
complex_uint4,
real_int8,
complex_int8,
real_uint8,
complex_uint8,
real_int16,
complex_int16,
real_uint16,
complex_uint16,
real_int32,
complex_int32,
real_uint32,
complex_uint32,
real_int64,
complex_int64,
real_uint64,
complex_uint64,
real_int8_4,
real_int8_32,
real_uint8_4,
library_data_t_size
};
namespace detail {
template <typename ArgT>
inline constexpr std::uint64_t get_type_combination_id(ArgT Val) {
static_assert((unsigned char)library_data_t::library_data_t_size <=
std::numeric_limits<unsigned char>::max() &&
"library_data_t size exceeds limit.");
static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
return (std::uint64_t)Val;
}
template <typename FirstT, typename... RestT>
inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
RestT... RestVal) {
static_assert((std::uint8_t)library_data_t::library_data_t_size <=
std::numeric_limits<unsigned char>::max() &&
"library_data_t size exceeds limit.");
static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
}
inline constexpr std::size_t library_data_size[] = {
8 * sizeof(float), // real_float
8 * sizeof(std::complex<float>), // complex_float
8 * sizeof(double), // real_double
8 * sizeof(std::complex<double>), // complex_double
8 * sizeof(sycl::half), // real_half
8 * sizeof(std::complex<sycl::half>), // complex_half
16, // real_bfloat16
16 * 2, // complex_bfloat16
4, // real_int4
4 * 2, // complex_int4
4, // real_uint4
4 * 2, // complex_uint4
8, // real_int8
8 * 2, // complex_int8
8, // real_uint8
8 * 2, // complex_uint8
16, // real_int16
16 * 2, // complex_int16
16, // real_uint16
16 * 2, // complex_uint16
32, // real_int32
32 * 2, // complex_int32
32, // real_uint32
32 * 2, // complex_uint32
64, // real_int64
64 * 2, // complex_int64
64, // real_uint64
64 * 2, // complex_uint64
8, // real_int8_4
8, // real_int8_32
8 // real_uint8_4
};
} // namespace detail
#ifdef DPCT_USM_LEVEL_NONE
/// Cast a "rvalue reference to a temporary object" to an "lvalue reference to
/// that temporary object".
/// CAUTION:
/// The returned lvalue reference is available only before the last step in
/// evaluating the full-expression that contains this function call.
/// \param [in] temporary_object The rvalue reference to a temporary object.
/// \returns The lvalue reference to that temporary object.
template <typename T>
inline typename std::enable_if_t<std::is_rvalue_reference_v<T &&>, T &>
rvalue_ref_to_lvalue_ref(T &&temporary_object) {
return temporary_object;
}
#endif
} // namespace dpct
#endif // __DPCT_LIB_COMMON_UTILS_HPP__

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,535 +0,0 @@
//==---- rng_utils.hpp ----------------------------*- C++ -*----------------==//
//
// Copyright (C) Intel Corporation
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef __DPCT_RNG_UTILS_HPP__
#define __DPCT_RNG_UTILS_HPP__
#include <sycl/sycl.hpp>
#include <oneapi/mkl.hpp>
#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
#include <oneapi/mkl/rng/device.hpp>
#endif
#include "device.hpp"
#include "lib_common_utils.hpp"
namespace dpct {
namespace rng {
#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
namespace device {
/// The random number generator on device.
/// \tparam engine_t The device random number generator engine. It can only be
/// oneapi::mkl::rng::device::mrg32k3a<1> or
/// oneapi::mkl::rng::device::mrg32k3a<4> or
/// oneapi::mkl::rng::device::philox4x32x10<1> or
/// oneapi::mkl::rng::device::philox4x32x10<4>.
template <typename engine_t> class rng_generator {
static_assert(
std::disjunction_v<
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<4>>,
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<4>>,
std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>,
"engine_t can only be oneapi::mkl::rng::device::mrg32k3a<1> or "
"oneapi::mkl::rng::device::mrg32k3a<4> or "
"oneapi::mkl::rng::device::philox4x32x10<1> or "
"oneapi::mkl::rng::device::philox4x32x10<4> or "
"oneapi::mkl::rng::device::mcg59<1>.");
static constexpr bool _is_engine_vec_size_one = std::disjunction_v<
std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>;
static constexpr std::uint64_t default_seed = 0;
oneapi::mkl::rng::device::bits<std::uint32_t> _distr_bits;
oneapi::mkl::rng::device::uniform_bits<std::uint32_t> _distr_uniform_bits;
oneapi::mkl::rng::device::gaussian<float> _distr_gaussian_float;
oneapi::mkl::rng::device::gaussian<double> _distr_gaussian_double;
oneapi::mkl::rng::device::lognormal<float> _distr_lognormal_float;
oneapi::mkl::rng::device::lognormal<double> _distr_lognormal_double;
oneapi::mkl::rng::device::poisson<std::uint32_t> _distr_poisson;
oneapi::mkl::rng::device::uniform<float> _distr_uniform_float;
oneapi::mkl::rng::device::uniform<double> _distr_uniform_double;
engine_t _engine;
public:
/// Default constructor of rng_generator
rng_generator() { _engine = engine_t(default_seed); }
/// Constructor of rng_generator if engine type is not mcg59
/// \param [in] seed The seed to initialize the engine state.
/// \param [in] num_to_skip Set the number of elements need to be skipped.
/// The number is calculated as: num_to_skip[0] + num_to_skip[1] * 2^64 +
/// num_to_skip[2] * 2^128 + ... + num_to_skip[n-1] * 2^(64*(n-1))
template <typename T = engine_t,
typename std::enable_if<!std::is_same_v<
T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
rng_generator(std::uint64_t seed,
std::initializer_list<std::uint64_t> num_to_skip) {
_engine = engine_t(seed, num_to_skip);
}
/// Constructor of rng_generator if engine type is mcg59
/// \param [in] seed The seed to initialize the engine state.
/// \param [in] num_to_skip Set the number of elements need to be skipped.
template <typename T = engine_t,
typename std::enable_if<std::is_same_v<
T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
rng_generator(std::uint64_t seed, std::uint64_t num_to_skip) {
_engine = engine_t(seed, num_to_skip);
}
/// Generate random number(s) obeys distribution \tparam distr_t.
/// \tparam T The distribution of the random number. It can only be
/// oneapi::mkl::rng::device::bits<std::uint32_t>,
/// oneapi::mkl::rng::device::uniform_bits<std::uint32_t>,
/// oneapi::mkl::rng::device::gaussian<float>,
/// oneapi::mkl::rng::device::gaussian<double>,
/// oneapi::mkl::rng::device::lognormal<float>,
/// oneapi::mkl::rng::device::lognormal<double>,
/// oneapi::mkl::rng::device::poisson<std::uint32_t>,
/// oneapi::mkl::rng::device::uniform<float> or
/// oneapi::mkl::rng::device::uniform<double>
/// \tparam vec_size The length of the return vector. It can only be 1, 2
/// or 4.
/// \param distr_params The parameter(s) for lognormal or poisson
/// distribution.
/// \return The vector of the random number(s).
template <typename distr_t, int vec_size, class... distr_params_t>
auto generate(distr_params_t... distr_params) {
static_assert(vec_size == 1 || vec_size == 2 || vec_size == 4,
"vec_size is not supported.");
static_assert(
std::disjunction_v<
std::is_same<distr_t,
oneapi::mkl::rng::device::bits<std::uint32_t>>,
std::is_same<distr_t,
oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>,
std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<float>>,
std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<double>>,
std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<float>>,
std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<double>>,
std::is_same<distr_t,
oneapi::mkl::rng::device::poisson<std::uint32_t>>,
std::is_same<distr_t, oneapi::mkl::rng::device::uniform<float>>,
std::is_same<distr_t, oneapi::mkl::rng::device::uniform<double>>>,
"distribution is not supported.");
if constexpr (std::is_same_v<
distr_t, oneapi::mkl::rng::device::bits<std::uint32_t>>) {
return generate_vec<vec_size>(_distr_bits);
}
if constexpr (std::is_same_v<
distr_t,
oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>) {
return generate_vec<vec_size>(_distr_uniform_bits);
}
if constexpr (std::is_same_v<distr_t,
oneapi::mkl::rng::device::gaussian<float>>) {
return generate_vec<vec_size>(_distr_gaussian_float);
}
if constexpr (std::is_same_v<distr_t,
oneapi::mkl::rng::device::gaussian<double>>) {
return generate_vec<vec_size>(_distr_gaussian_double);
}
if constexpr (std::is_same_v<distr_t,
oneapi::mkl::rng::device::lognormal<float>>) {
return generate_vec<vec_size>(_distr_lognormal_float, distr_params...,
0.0f, 1.0f);
}
if constexpr (std::is_same_v<distr_t,
oneapi::mkl::rng::device::lognormal<double>>) {
return generate_vec<vec_size>(_distr_lognormal_double, distr_params...,
0.0, 1.0);
}
if constexpr (std::is_same_v<distr_t, oneapi::mkl::rng::device::poisson<
std::uint32_t>>) {
return generate_vec<vec_size>(_distr_poisson, distr_params...);
}
if constexpr (std::is_same_v<distr_t,
oneapi::mkl::rng::device::uniform<float>>) {
return generate_vec<vec_size>(_distr_uniform_float);
}
if constexpr (std::is_same_v<distr_t,
oneapi::mkl::rng::device::uniform<double>>) {
return generate_vec<vec_size>(_distr_uniform_double);
}
}
/// Get the random number generator engine.
/// \return The reference of the internal random number generator engine.
engine_t &get_engine() { return _engine; }
private:
template <int vec_size, typename distr_t, class... distr_params_t>
auto generate_vec(distr_t &distr, distr_params_t... distr_params) {
if constexpr (sizeof...(distr_params_t)) {
typename distr_t::param_type pt(distr_params...);
distr.param(pt);
}
if constexpr (vec_size == 4) {
if constexpr (_is_engine_vec_size_one) {
sycl::vec<typename distr_t::result_type, 4> res;
res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
res.z() = oneapi::mkl::rng::device::generate(distr, _engine);
res.w() = oneapi::mkl::rng::device::generate(distr, _engine);
return res;
} else {
return oneapi::mkl::rng::device::generate(distr, _engine);
}
} else if constexpr (vec_size == 1) {
if constexpr (_is_engine_vec_size_one) {
return oneapi::mkl::rng::device::generate(distr, _engine);
} else {
return oneapi::mkl::rng::device::generate_single(distr, _engine);
}
} else if constexpr (vec_size == 2) {
if constexpr (_is_engine_vec_size_one) {
sycl::vec<typename distr_t::result_type, 2> res;
res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
return res;
} else {
sycl::vec<typename distr_t::result_type, 2> res;
res.x() = oneapi::mkl::rng::device::generate_single(distr, _engine);
res.y() = oneapi::mkl::rng::device::generate_single(distr, _engine);
return res;
}
}
}
};
} // namespace device
#endif
namespace host {
namespace detail {
class rng_generator_base {
public:
/// Set the seed of host rng_generator.
/// \param seed The engine seed.
virtual void set_seed(const std::uint64_t seed) = 0;
/// Set the dimensions of host rng_generator.
/// \param dimensions The engine dimensions.
virtual void set_dimensions(const std::uint32_t dimensions) = 0;
/// Set the queue of host rng_generator.
/// \param queue The engine queue.
virtual void set_queue(sycl::queue *queue) = 0;
/// Generate unsigned int random number(s) with 'uniform_bits' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
virtual inline void generate_uniform_bits(unsigned int *output,
std::int64_t n) = 0;
/// Generate unsigned long long random number(s) with 'uniform_bits'
/// distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
virtual inline void generate_uniform_bits(unsigned long long *output,
std::int64_t n) = 0;
/// Generate float random number(s) with 'lognormal' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param m Mean of associated normal distribution
/// \param s Standard deviation of associated normal distribution.
virtual inline void generate_lognormal(float *output, std::int64_t n, float m,
float s) = 0;
/// Generate double random number(s) with 'lognormal' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param m Mean of associated normal distribution
/// \param s Standard deviation of associated normal distribution.
virtual inline void generate_lognormal(double *output, std::int64_t n,
double m, double s) = 0;
/// Generate float random number(s) with 'gaussian' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param mean Mean of normal distribution
/// \param stddev Standard deviation of normal distribution.
virtual inline void generate_gaussian(float *output, std::int64_t n,
float mean, float stddev) = 0;
/// Generate double random number(s) with 'gaussian' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param mean Mean of normal distribution
/// \param stddev Standard deviation of normal distribution.
virtual inline void generate_gaussian(double *output, std::int64_t n,
double mean, double stddev) = 0;
/// Generate unsigned int random number(s) with 'poisson' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param lambda Lambda for the Poisson distribution.
virtual inline void generate_poisson(unsigned int *output, std::int64_t n,
double lambda) = 0;
/// Generate float random number(s) with 'uniform' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
virtual inline void generate_uniform(float *output, std::int64_t n) = 0;
/// Generate double random number(s) with 'uniform' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
virtual inline void generate_uniform(double *output, std::int64_t n) = 0;
/// Skip ahead several random number(s).
/// \param num_to_skip The number of random numbers to be skipped.
virtual void skip_ahead(const std::uint64_t num_to_skip) = 0;
/// Set the direction numbers of host rng_generator. Only Sobol engine
/// supports this method.
/// \param direction_numbers The engine direction numbers.
virtual void set_direction_numbers(
const std::vector<std::uint32_t> &direction_numbers) = 0;
protected:
sycl::queue *_queue{&dpct::get_default_queue()};
std::uint64_t _seed{0};
std::uint32_t _dimensions{1};
std::vector<std::uint32_t> _direction_numbers;
};
/// The random number generator on host.
template <typename engine_t = oneapi::mkl::rng::philox4x32x10>
class rng_generator : public rng_generator_base {
public:
/// Constructor of rng_generator.
rng_generator() : _engine(create_engine(_queue, _seed, _dimensions)) {}
/// Set the seed of host rng_generator.
/// \param seed The engine seed.
void set_seed(const std::uint64_t seed) {
if (seed == _seed) {
return;
}
_seed = seed;
_engine = create_engine(_queue, _seed, _dimensions);
}
/// Set the dimensions of host rng_generator.
/// \param dimensions The engine dimensions.
void set_dimensions(const std::uint32_t dimensions) {
if (dimensions == _dimensions) {
return;
}
_dimensions = dimensions;
_engine = create_engine(_queue, _seed, _dimensions);
}
/// Set the queue of host rng_generator.
/// \param queue The engine queue.
void set_queue(sycl::queue *queue) {
if (queue == _queue) {
return;
}
_queue = queue;
_engine = create_engine(_queue, _seed, _dimensions);
}
/// Set the direction numbers of Sobol host rng_generator.
/// \param direction_numbers The user-defined direction numbers.
void
set_direction_numbers(const std::vector<std::uint32_t> &direction_numbers) {
#ifndef __INTEL_MKL__
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
"Interfaces Project does not support this API.");
#else
if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::sobol>) {
if (direction_numbers == _direction_numbers) {
return;
}
_direction_numbers = direction_numbers;
_engine = oneapi::mkl::rng::sobol(*_queue, _direction_numbers);
} else {
throw std::runtime_error("Only Sobol engine supports this method.");
}
#endif
}
/// Generate unsigned int random number(s) with 'uniform_bits' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
inline void generate_uniform_bits(unsigned int *output, std::int64_t n) {
#ifndef __INTEL_MKL__
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
"Interfaces Project does not support this API.");
#else
static_assert(sizeof(unsigned int) == sizeof(std::uint32_t));
generate<oneapi::mkl::rng::uniform_bits<std::uint32_t>>(
(std::uint32_t *)output, n);
#endif
}
/// Generate unsigned long long random number(s) with 'uniform_bits'
/// distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
inline void generate_uniform_bits(unsigned long long *output,
std::int64_t n) {
#ifndef __INTEL_MKL__
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
"Interfaces Project does not support this API.");
#else
static_assert(sizeof(unsigned long long) == sizeof(std::uint64_t));
generate<oneapi::mkl::rng::uniform_bits<std::uint64_t>>(
(std::uint64_t *)output, n);
#endif
}
/// Generate float random number(s) with 'lognormal' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param m Mean of associated normal distribution
/// \param s Standard deviation of associated normal distribution.
inline void generate_lognormal(float *output, std::int64_t n, float m,
float s) {
generate<oneapi::mkl::rng::lognormal<float>>(output, n, m, s);
}
/// Generate double random number(s) with 'lognormal' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param m Mean of associated normal distribution
/// \param s Standard deviation of associated normal distribution.
inline void generate_lognormal(double *output, std::int64_t n, double m,
double s) {
generate<oneapi::mkl::rng::lognormal<double>>(output, n, m, s);
}
/// Generate float random number(s) with 'gaussian' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param mean Mean of normal distribution
/// \param stddev Standard deviation of normal distribution.
inline void generate_gaussian(float *output, std::int64_t n, float mean,
float stddev) {
generate<oneapi::mkl::rng::gaussian<float>>(output, n, mean, stddev);
}
/// Generate double random number(s) with 'gaussian' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param mean Mean of normal distribution
/// \param stddev Standard deviation of normal distribution.
inline void generate_gaussian(double *output, std::int64_t n, double mean,
double stddev) {
generate<oneapi::mkl::rng::gaussian<double>>(output, n, mean, stddev);
}
/// Generate unsigned int random number(s) with 'poisson' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
/// \param lambda Lambda for the Poisson distribution.
inline void generate_poisson(unsigned int *output, std::int64_t n,
double lambda) {
generate<oneapi::mkl::rng::poisson<unsigned int>>(output, n, lambda);
}
/// Generate float random number(s) with 'uniform' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
inline void generate_uniform(float *output, std::int64_t n) {
generate<oneapi::mkl::rng::uniform<float>>(output, n);
}
/// Generate double random number(s) with 'uniform' distribution.
/// \param output The pointer of the first random number.
/// \param n The number of random numbers.
inline void generate_uniform(double *output, std::int64_t n) {
generate<oneapi::mkl::rng::uniform<double>>(output, n);
}
/// Skip ahead several random number(s).
/// \param num_to_skip The number of random numbers to be skipped.
void skip_ahead(const std::uint64_t num_to_skip) {
#ifndef __INTEL_MKL__
oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
#else
if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::mt2203>)
throw std::runtime_error("no skip_ahead method of mt2203 engine.");
else
oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
#endif
}
private:
static inline engine_t create_engine(sycl::queue *queue,
const std::uint64_t seed,
const std::uint32_t dimensions) {
#ifdef __INTEL_MKL__
return std::is_same_v<engine_t, oneapi::mkl::rng::sobol>
? engine_t(*queue, dimensions)
: engine_t(*queue, seed);
#else
return engine_t(*queue, seed);
#endif
}
template <typename distr_t, typename buffer_t, class... distr_params_t>
void generate(buffer_t *output, const std::int64_t n,
const distr_params_t... distr_params) {
auto output_buf = dpct::detail::get_memory<buffer_t>(output);
oneapi::mkl::rng::generate(distr_t(distr_params...), _engine, n,
output_buf);
}
engine_t _engine{};
};
} // namespace detail
} // namespace host
enum class random_engine_type {
philox4x32x10,
mrg32k3a,
mt2203,
mt19937,
sobol,
mcg59
};
typedef std::shared_ptr<rng::host::detail::rng_generator_base> host_rng_ptr;
/// Create a host random number generator.
/// \param type The random engine type.
/// \return The pointer of random number generator.
inline host_rng_ptr create_host_rng(const random_engine_type type) {
switch (type) {
case random_engine_type::philox4x32x10:
return std::make_shared<
rng::host::detail::rng_generator<oneapi::mkl::rng::philox4x32x10>>();
case random_engine_type::mrg32k3a:
return std::make_shared<
rng::host::detail::rng_generator<oneapi::mkl::rng::mrg32k3a>>();
#ifndef __INTEL_MKL__
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
"Interfaces Project does not support this API.");
#else
case random_engine_type::mt2203:
return std::make_shared<
rng::host::detail::rng_generator<oneapi::mkl::rng::mt2203>>();
case random_engine_type::mt19937:
return std::make_shared<
rng::host::detail::rng_generator<oneapi::mkl::rng::mt19937>>();
case random_engine_type::sobol:
return std::make_shared<
rng::host::detail::rng_generator<oneapi::mkl::rng::sobol>>();
case random_engine_type::mcg59:
return std::make_shared<
rng::host::detail::rng_generator<oneapi::mkl::rng::mcg59>>();
#endif
}
}
} // namespace rng
} // namespace dpct
#endif // __DPCT_RNG_UTILS_HPP__

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -19,9 +19,10 @@
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <dpct/dpct.hpp>
#include <dpct/blas_utils.hpp>
#include <dpct/lib_common_utils.hpp>
// #include <dpct/dpct.hpp>
// #include <dpct/blas_utils.hpp>
// #include <dpct/lib_common_utils.hpp>
#include "dpct.hpp"
#include "ggml-sycl.h"
#include "ggml.h"
#include "ggml-backend-impl.h"

2
run.sh
View file

@ -14,6 +14,6 @@ echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
#export GGML_SYCL_LIST_DEVICE=1
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0