/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured, though).
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  UT pchunks; // total number of chunks for threads with p-core
  UT num_procs_with_pcore; // number of threads with p-core
  T first_thread_with_ecore;
#endif
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate the alignment here; otherwise the structure size is not computed
  // correctly in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};
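
// Usage sketch (illustrative only): these templated structs are overlaid onto
// the runtime's type-erased per-thread dispatch buffers and reinterpreted at
// the loop's index width, e.g.
//   auto *pr = reinterpret_cast<dispatch_private_info_template<kmp_uint32> *>(
//       th->th.th_dispatch->th_dispatch_pr_current);
// which is exactly what __kmp_dispatch_deo()/__kmp_dispatch_dxo() below do
// with their UT template parameter.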

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // dummy entries to retain the structure size after making ordered_iteration
  // scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}
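
// Usage sketch (illustrative only, not part of the runtime API): these
// wrappers return the value *before* the update, so a thread can atomically
// claim the next chunk index from a shared counter, e.g.
//   volatile kmp_int32 next_chunk = 0; // hypothetical shared counter
//   kmp_int32 mine = test_then_add<kmp_int32>(&next_chunk, 1);
//   // mine == 0 for the first claimant, 1 for the second, and so on;
//   // test_then_inc<kmp_int32>() below is the d == 1 special case.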

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
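
// Usage sketch (illustrative only): compare_and_swap returns nonzero iff *p
// still held c and was atomically replaced with s, so callers can guard
// one-time state transitions, e.g.
//   if (compare_and_swap<kmp_int32>(&flag, 0 /*idle*/, 1 /*claimed*/)) {
//     // this thread won the race and owns the transition
//   }
// (flag here is a hypothetical volatile kmp_int32.)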

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
    Arguments:
        UT is unsigned 4- or 8-byte type
        spinner - memory location to check value
        checker - value which spinner is >, <, ==, etc.
        pred - predicate function to perform binary comparison of some sort
#if USE_ITT_BUILD
        obj -- the higher-level synchronization object to report to ittnotify.
            It is used to report locks consistently. For example, if a lock is
            acquired immediately, its address is reported to ittnotify via
            KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT(), the latter
            should report the same address, not the address of the low-level
            spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
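
// Usage sketch (illustrative only): the dispatcher spins with yield/backoff
// but never sleeps. For example, __kmp_dispatch_deo() below waits until the
// shared ordered-iteration counter reaches a thread's lower bound with
//   __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
//                  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
// The predicate is called as pred(*spinner, checker), so __kmp_ge<UT> makes
// the wait return once *spinner >= checker.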

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
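
// Worked example: __kmp_pow(0.5L, 5) performs exponentiation by squaring,
// consuming one bit of y per iteration (5 = 101b):
//   y=5: s = 1 * 0.5 = 0.5,    x = 0.25
//   y=2: s unchanged,          x = 0.0625
//   y=1: s = 0.5 * 0.0625 = 0.03125
// giving 0.5^5 = 0.03125 in O(log y) multiplications instead of y - 1.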

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is __forceinline'd,
   the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we workaround that in the caller code
     by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
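
// Worked example (values illustrative): with tc = 1000 iterations and
// base = 0.5, the unassigned count after idx chunks is ceil(1000 * 0.5^idx):
// 1000, 500, 250, 125, 63, ... -- each chunk takes a fixed fraction of
// whatever remains, which is the defining property of guided scheduling.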

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution is flatter.
// With n = 1 the first chunk is the same as for a static schedule,
// i.e. trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
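
// Worked example from the formulas above: with the default n = 2, nproc = 4
// and chunk = 1, the switch point is p2 = 2 * 4 * (1 + 1) = 16 remaining
// iterations, and before that point each chunk is roughly
// remaining * p3 = remaining / 8.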
#endif // KMP_DISPATCH_H