diff --git a/Makefile b/Makefile
index 6e6453a4d..63bce3f8f 100644
--- a/Makefile
+++ b/Makefile
@@ -286,6 +286,7 @@ include third_party/readline/BUILD.mk           # │
 include third_party/libunwind/BUILD.mk          # |
 include third_party/libcxxabi/BUILD.mk          # |
 include third_party/libcxx/BUILD.mk             # │
+include third_party/openmp/BUILD.mk             # │
 include third_party/double-conversion/BUILD.mk  # │
 include third_party/pcre/BUILD.mk               # │
 include third_party/less/BUILD.mk               # │
@@ -441,6 +442,7 @@ COSMOPOLITAN_OBJECTS = \
 	THIRD_PARTY_GETOPT \
 	LIBC_LOG \
 	LIBC_TIME \
+	THIRD_PARTY_OPENMP \
 	THIRD_PARTY_MUSL \
 	THIRD_PARTY_ZLIB_GZ \
 	THIRD_PARTY_LIBCXXABI \
@@ -522,6 +524,7 @@ COSMOCC_PKGS = \
 	THIRD_PARTY_AARCH64 \
 	THIRD_PARTY_LIBCXX \
 	THIRD_PARTY_LIBCXXABI \
+	THIRD_PARTY_OPENMP \
 	THIRD_PARTY_INTEL

 o/$(MODE)/cosmopolitan.a: \
diff --git a/build/rules.mk b/build/rules.mk
index a54591714..728df92fd 100644
--- a/build/rules.mk
+++ b/build/rules.mk
@@ -40,6 +40,9 @@ o/$(MODE)/%.h: %.c
 o/$(MODE)/%.o: %.cc
 	@$(COMPILE) -AOBJECTIFY.cxx $(OBJECTIFY.cxx) $(OUTPUT_OPTION) $<

+o/$(MODE)/%.o: %.cpp
+	@$(COMPILE) -AOBJECTIFY.cxx $(OBJECTIFY.cxx) $(OUTPUT_OPTION) $<
+
 o/$(MODE)/%.lds: %.lds
 	@$(COMPILE) -APREPROCESS $(PREPROCESS.lds) $(OUTPUT_OPTION) $<

diff --git a/libc/calls/sched_getcpu.c b/libc/calls/sched_getcpu.c
new file mode 100644
index 000000000..761bba995
--- /dev/null
+++ b/libc/calls/sched_getcpu.c
@@ -0,0 +1,47 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/cpuset.h"
+#include "libc/dce.h"
+#include "libc/nexgen32e/rdtscp.h"
+#include "libc/nexgen32e/x86feature.h"
+#include "libc/nt/struct/processornumber.h"
+#include "libc/nt/synchronization.h"
+
+int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
+
+/**
+ * Returns ID of CPU on which thread is currently scheduled.
+ */
+int sched_getcpu(void) {
+  if (X86_HAVE(RDTSCP)) {
+    unsigned tsc_aux;
+    rdtscp(&tsc_aux);
+    return TSC_AUX_CORE(tsc_aux);
+  } else if (IsWindows()) {
+    struct NtProcessorNumber pn;
+    GetCurrentProcessorNumberEx(&pn);
+    return 64 * pn.Group + pn.Number;
+  } else {
+    unsigned cpu = 0;
+    int rc = sys_getcpu(&cpu, 0, 0);
+    if (rc == -1) return -1;
+    return cpu;
+  }
+}
diff --git a/libc/calls/struct/cpuset.h b/libc/calls/struct/cpuset.h
index e7f3b8dd0..ddae4de88 100644
--- a/libc/calls/struct/cpuset.h
+++ b/libc/calls/struct/cpuset.h
@@ -53,5 +53,7 @@ int CPU_COUNT_S(size_t, const cpu_set_t *) libcesque;
 #define CPU_CLR_S(i, size, set)   _CPU_S(i, size, set, &= ~)
 #define CPU_ISSET_S(i, size, set) _CPU_S(i, size, set, &)

+typedef cpu_set_t cpuset_t; /* for freebsd compatibility */
+
 COSMOPOLITAN_C_END_
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_CPUSET_H_ */
diff --git a/libc/intrin/gettid.c b/libc/intrin/gettid.c
index 2eb0354df..91b37737f 100644
--- a/libc/intrin/gettid.c
+++ b/libc/intrin/gettid.c
@@ -45,7 +45,7 @@ int gettid(void) {
     }
   }
   if (IsXnuSilicon()) {
-    return enosys();
+    return enosys();  // can only happen if we can't access thread local storage
   } else {
     return sys_gettid();
   }
diff --git a/libc/nexgen32e/rdtscp.h b/libc/nexgen32e/rdtscp.h
index 7e8bfe779..9c3502faf 100644
--- a/libc/nexgen32e/rdtscp.h
+++ b/libc/nexgen32e/rdtscp.h
@@ -18,8 +18,8 @@ COSMOPOLITAN_C_START_
  */
 #define rdtscp(OPT_OUT_IA32_TSC_AUX)               \
   ({                                               \
+    uint64_t Rax, Rdx;                             \
     uint32_t Ecx, *EcxOut;                         \
-    uint64_t Rax, Rcx, Rdx;                        \
     asm volatile("rdtscp"                          \
                  : "=a"(Rax), "=c"(Ecx), "=d"(Rdx) \
                  : /* no inputs */                 \
diff --git a/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WaitOnAddress.S b/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WaitOnAddress.S
index a553304de..d243cb370 100644
--- a/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WaitOnAddress.S
+++ b/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WaitOnAddress.S
@@ -2,9 +2,9 @@
 .imp	API-MS-Win-Core-Synch-l1-2-0,__imp_WaitOnAddress,WaitOnAddress

 	.text.windows
-	.ftrace1
+	.ftrace1
 WaitOnAddress:
-	.ftrace2
+	.ftrace2
 #ifdef __x86_64__
 	push	%rbp
 	mov	%rsp,%rbp
diff --git a/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressAll.S b/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressAll.S
index e4322b1e7..eba2639ee 100644
--- a/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressAll.S
+++ b/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressAll.S
@@ -2,9 +2,9 @@
 .imp	API-MS-Win-Core-Synch-l1-2-0,__imp_WakeByAddressAll,WakeByAddressAll

 	.text.windows
-	.ftrace1
+	.ftrace1
 WakeByAddressAll:
-	.ftrace2
+	.ftrace2
 #ifdef __x86_64__
 	push	%rbp
 	mov	%rsp,%rbp
diff --git a/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressSingle.S b/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressSingle.S
index 32f0d572c..9dcaa68dd 100644
--- a/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressSingle.S
+++ b/libc/nt/API-MS-Win-Core-Synch-l1-2-0/WakeByAddressSingle.S
@@ -2,9 +2,9 @@
 .imp	API-MS-Win-Core-Synch-l1-2-0,__imp_WakeByAddressSingle,WakeByAddressSingle

 	.text.windows
-	.ftrace1
+	.ftrace1
 WakeByAddressSingle:
-	.ftrace2
+	.ftrace2
 #ifdef __x86_64__
 	push	%rbp
 	mov	%rsp,%rbp
diff --git a/libc/nt/PowrProf/SetSuspendState.S b/libc/nt/PowrProf/SetSuspendState.S
index 8c491015f..2f4a337e1 100644
--- a/libc/nt/PowrProf/SetSuspendState.S
+++ b/libc/nt/PowrProf/SetSuspendState.S
@@ -2,9 +2,9 @@
 .imp	PowrProf,__imp_SetSuspendState,SetSuspendState

 	.text.windows
-	.ftrace1
+	.ftrace1
 SetSuspendState:
-	.ftrace2
+	.ftrace2
 #ifdef __x86_64__
 	push	%rbp
 	mov	%rsp,%rbp
diff --git a/libc/nt/advapi32/AccessCheck.S b/libc/nt/advapi32/AccessCheck.S index
162dcb215..9819ecb17 100644 --- a/libc/nt/advapi32/AccessCheck.S +++ b/libc/nt/advapi32/AccessCheck.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_AccessCheck,AccessCheck .text.windows - .ftrace1 + .ftrace1 AccessCheck: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/AdjustTokenPrivileges.S b/libc/nt/advapi32/AdjustTokenPrivileges.S index b6b7d4d2c..f689f9ead 100644 --- a/libc/nt/advapi32/AdjustTokenPrivileges.S +++ b/libc/nt/advapi32/AdjustTokenPrivileges.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_AdjustTokenPrivileges,AdjustTokenPrivileges .text.windows - .ftrace1 + .ftrace1 AdjustTokenPrivileges: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/CreateProcessAsUserW.S b/libc/nt/advapi32/CreateProcessAsUserW.S index 2d33f18c7..e63d98ec8 100644 --- a/libc/nt/advapi32/CreateProcessAsUserW.S +++ b/libc/nt/advapi32/CreateProcessAsUserW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_CreateProcessAsUserW,CreateProcessAsUserW .text.windows - .ftrace1 + .ftrace1 CreateProcessAsUser: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/DeregisterEventSource.S b/libc/nt/advapi32/DeregisterEventSource.S index 4130882a1..65039a6c4 100644 --- a/libc/nt/advapi32/DeregisterEventSource.S +++ b/libc/nt/advapi32/DeregisterEventSource.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_DeregisterEventSource,DeregisterEventSource .text.windows - .ftrace1 + .ftrace1 DeregisterEventSource: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/DuplicateToken.S b/libc/nt/advapi32/DuplicateToken.S index 422d8e113..6e6f7a470 100644 --- a/libc/nt/advapi32/DuplicateToken.S +++ b/libc/nt/advapi32/DuplicateToken.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_DuplicateToken,DuplicateToken .text.windows - .ftrace1 + .ftrace1 DuplicateToken: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/DuplicateTokenEx.S b/libc/nt/advapi32/DuplicateTokenEx.S index 6517ef2ae..d087b3309 100644 --- a/libc/nt/advapi32/DuplicateTokenEx.S +++ b/libc/nt/advapi32/DuplicateTokenEx.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_DuplicateTokenEx,DuplicateTokenEx .text.windows - .ftrace1 + .ftrace1 DuplicateTokenEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/GetFileSecurityW.S b/libc/nt/advapi32/GetFileSecurityW.S index 417073aa1..318d92139 100644 --- a/libc/nt/advapi32/GetFileSecurityW.S +++ b/libc/nt/advapi32/GetFileSecurityW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_GetFileSecurityW,GetFileSecurityW .text.windows - .ftrace1 + .ftrace1 GetFileSecurity: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/GetUserNameW.S b/libc/nt/advapi32/GetUserNameW.S index e50fac47e..656ea8ea4 100644 --- a/libc/nt/advapi32/GetUserNameW.S +++ b/libc/nt/advapi32/GetUserNameW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_GetUserNameW,GetUserNameW .text.windows - .ftrace1 + .ftrace1 GetUserName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/ImpersonateSelf.S b/libc/nt/advapi32/ImpersonateSelf.S index 7cbbf1480..db383d771 100644 --- a/libc/nt/advapi32/ImpersonateSelf.S +++ b/libc/nt/advapi32/ImpersonateSelf.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_ImpersonateSelf,ImpersonateSelf .text.windows - .ftrace1 + .ftrace1 ImpersonateSelf: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/InitiateShutdownW.S b/libc/nt/advapi32/InitiateShutdownW.S 
index 29bca0ac6..c13eb4117 100644 --- a/libc/nt/advapi32/InitiateShutdownW.S +++ b/libc/nt/advapi32/InitiateShutdownW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_InitiateShutdownW,InitiateShutdownW .text.windows - .ftrace1 + .ftrace1 InitiateShutdown: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/LookupPrivilegeValueW.S b/libc/nt/advapi32/LookupPrivilegeValueW.S index e1df0b364..21b845b4f 100644 --- a/libc/nt/advapi32/LookupPrivilegeValueW.S +++ b/libc/nt/advapi32/LookupPrivilegeValueW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_LookupPrivilegeValueW,LookupPrivilegeValueW .text.windows - .ftrace1 + .ftrace1 LookupPrivilegeValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/MapGenericMask.S b/libc/nt/advapi32/MapGenericMask.S index 4bb072a1a..d88e44df9 100644 --- a/libc/nt/advapi32/MapGenericMask.S +++ b/libc/nt/advapi32/MapGenericMask.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_MapGenericMask,MapGenericMask .text.windows - .ftrace1 + .ftrace1 MapGenericMask: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/OpenProcessToken.S b/libc/nt/advapi32/OpenProcessToken.S index 704b22ad1..ae930b3f2 100644 --- a/libc/nt/advapi32/OpenProcessToken.S +++ b/libc/nt/advapi32/OpenProcessToken.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_OpenProcessToken,OpenProcessToken .text.windows - .ftrace1 + .ftrace1 OpenProcessToken: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/OpenThreadToken.S b/libc/nt/advapi32/OpenThreadToken.S index b4582ccb2..42a461a43 100644 --- a/libc/nt/advapi32/OpenThreadToken.S +++ b/libc/nt/advapi32/OpenThreadToken.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_OpenThreadToken,OpenThreadToken .text.windows - .ftrace1 + .ftrace1 OpenThreadToken: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegCloseKey.S b/libc/nt/advapi32/RegCloseKey.S index 45d32d8c7..3592b4509 100644 --- a/libc/nt/advapi32/RegCloseKey.S +++ b/libc/nt/advapi32/RegCloseKey.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegCloseKey,RegCloseKey .text.windows - .ftrace1 + .ftrace1 RegCloseKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegConnectRegistryW.S b/libc/nt/advapi32/RegConnectRegistryW.S index ea268f64d..0f0940158 100644 --- a/libc/nt/advapi32/RegConnectRegistryW.S +++ b/libc/nt/advapi32/RegConnectRegistryW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegConnectRegistryW,RegConnectRegistryW .text.windows - .ftrace1 + .ftrace1 RegConnectRegistry: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegCreateKeyExW.S b/libc/nt/advapi32/RegCreateKeyExW.S index 78f90b845..3d7c044e5 100644 --- a/libc/nt/advapi32/RegCreateKeyExW.S +++ b/libc/nt/advapi32/RegCreateKeyExW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegCreateKeyExW,RegCreateKeyExW .text.windows - .ftrace1 + .ftrace1 RegCreateKeyEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegCreateKeyW.S b/libc/nt/advapi32/RegCreateKeyW.S index c83abb4f8..2fb969324 100644 --- a/libc/nt/advapi32/RegCreateKeyW.S +++ b/libc/nt/advapi32/RegCreateKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegCreateKeyW,RegCreateKeyW .text.windows - .ftrace1 + .ftrace1 RegCreateKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegDeleteKeyExW.S b/libc/nt/advapi32/RegDeleteKeyExW.S index 459ee65fa..4f05588bc 100644 --- 
a/libc/nt/advapi32/RegDeleteKeyExW.S +++ b/libc/nt/advapi32/RegDeleteKeyExW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegDeleteKeyExW,RegDeleteKeyExW .text.windows - .ftrace1 + .ftrace1 RegDeleteKeyEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegDeleteKeyW.S b/libc/nt/advapi32/RegDeleteKeyW.S index 298f0d99a..00c8963a8 100644 --- a/libc/nt/advapi32/RegDeleteKeyW.S +++ b/libc/nt/advapi32/RegDeleteKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegDeleteKeyW,RegDeleteKeyW .text.windows - .ftrace1 + .ftrace1 RegDeleteKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegDeleteTreeW.S b/libc/nt/advapi32/RegDeleteTreeW.S index 26684967f..fcd4e652a 100644 --- a/libc/nt/advapi32/RegDeleteTreeW.S +++ b/libc/nt/advapi32/RegDeleteTreeW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegDeleteTreeW,RegDeleteTreeW .text.windows - .ftrace1 + .ftrace1 RegDeleteTree: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegDeleteValueW.S b/libc/nt/advapi32/RegDeleteValueW.S index dd851108e..2efec2697 100644 --- a/libc/nt/advapi32/RegDeleteValueW.S +++ b/libc/nt/advapi32/RegDeleteValueW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegDeleteValueW,RegDeleteValueW .text.windows - .ftrace1 + .ftrace1 RegDeleteValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegDisablePredefinedCache.S b/libc/nt/advapi32/RegDisablePredefinedCache.S index e82ede9cc..33cc5aa66 100644 --- a/libc/nt/advapi32/RegDisablePredefinedCache.S +++ b/libc/nt/advapi32/RegDisablePredefinedCache.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegDisablePredefinedCache,RegDisablePredefinedCache .text.windows - .ftrace1 + .ftrace1 RegDisablePredefinedCache: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegDisableReflectionKey.S b/libc/nt/advapi32/RegDisableReflectionKey.S index c4c70f000..46c078dbd 100644 --- a/libc/nt/advapi32/RegDisableReflectionKey.S +++ b/libc/nt/advapi32/RegDisableReflectionKey.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegDisableReflectionKey,RegDisableReflectionKey .text.windows - .ftrace1 + .ftrace1 RegDisableReflectionKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegEnableReflectionKey.S b/libc/nt/advapi32/RegEnableReflectionKey.S index 97d5fa5b0..12ebc3127 100644 --- a/libc/nt/advapi32/RegEnableReflectionKey.S +++ b/libc/nt/advapi32/RegEnableReflectionKey.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegEnableReflectionKey,RegEnableReflectionKey .text.windows - .ftrace1 + .ftrace1 RegEnableReflectionKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegEnumKeyExW.S b/libc/nt/advapi32/RegEnumKeyExW.S index eefd94a6e..63b2866f6 100644 --- a/libc/nt/advapi32/RegEnumKeyExW.S +++ b/libc/nt/advapi32/RegEnumKeyExW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegEnumKeyExW,RegEnumKeyExW .text.windows - .ftrace1 + .ftrace1 RegEnumKeyEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegEnumKeyW.S b/libc/nt/advapi32/RegEnumKeyW.S index 723cb6ff8..08cd8466e 100644 --- a/libc/nt/advapi32/RegEnumKeyW.S +++ b/libc/nt/advapi32/RegEnumKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegEnumKeyW,RegEnumKeyW .text.windows - .ftrace1 + .ftrace1 RegEnumKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegEnumValueW.S b/libc/nt/advapi32/RegEnumValueW.S index 
1b98ddf7a..e58d424ad 100644 --- a/libc/nt/advapi32/RegEnumValueW.S +++ b/libc/nt/advapi32/RegEnumValueW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegEnumValueW,RegEnumValueW .text.windows - .ftrace1 + .ftrace1 RegEnumValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegFlushKey.S b/libc/nt/advapi32/RegFlushKey.S index 94de55ede..129ed4f31 100644 --- a/libc/nt/advapi32/RegFlushKey.S +++ b/libc/nt/advapi32/RegFlushKey.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegFlushKey,RegFlushKey .text.windows - .ftrace1 + .ftrace1 RegFlushKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegGetKeySecurity.S b/libc/nt/advapi32/RegGetKeySecurity.S index 454d16356..aaa0d6b4f 100644 --- a/libc/nt/advapi32/RegGetKeySecurity.S +++ b/libc/nt/advapi32/RegGetKeySecurity.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegGetKeySecurity,RegGetKeySecurity .text.windows - .ftrace1 + .ftrace1 RegGetKeySecurity: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegGetValueW.S b/libc/nt/advapi32/RegGetValueW.S index 17a4c3b4a..d5761c3ee 100644 --- a/libc/nt/advapi32/RegGetValueW.S +++ b/libc/nt/advapi32/RegGetValueW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegGetValueW,RegGetValueW .text.windows - .ftrace1 + .ftrace1 RegGetValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegLoadKeyW.S b/libc/nt/advapi32/RegLoadKeyW.S index d6d4cad08..2b1b2932b 100644 --- a/libc/nt/advapi32/RegLoadKeyW.S +++ b/libc/nt/advapi32/RegLoadKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegLoadKeyW,RegLoadKeyW .text.windows - .ftrace1 + .ftrace1 RegLoadKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegNotifyChangeKeyValue.S b/libc/nt/advapi32/RegNotifyChangeKeyValue.S index 49bb7c139..5eea44a1c 100644 --- a/libc/nt/advapi32/RegNotifyChangeKeyValue.S +++ b/libc/nt/advapi32/RegNotifyChangeKeyValue.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegNotifyChangeKeyValue,RegNotifyChangeKeyValue .text.windows - .ftrace1 + .ftrace1 RegNotifyChangeKeyValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegOpenCurrentUser.S b/libc/nt/advapi32/RegOpenCurrentUser.S index 14e14fb52..2a9827015 100644 --- a/libc/nt/advapi32/RegOpenCurrentUser.S +++ b/libc/nt/advapi32/RegOpenCurrentUser.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegOpenCurrentUser,RegOpenCurrentUser .text.windows - .ftrace1 + .ftrace1 RegOpenCurrentUser: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegOpenKeyExW.S b/libc/nt/advapi32/RegOpenKeyExW.S index 88a056832..1b42ea8fd 100644 --- a/libc/nt/advapi32/RegOpenKeyExW.S +++ b/libc/nt/advapi32/RegOpenKeyExW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegOpenKeyExW,RegOpenKeyExW .text.windows - .ftrace1 + .ftrace1 RegOpenKeyEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegOpenUserClassesRoot.S b/libc/nt/advapi32/RegOpenUserClassesRoot.S index 1d347f946..cda594360 100644 --- a/libc/nt/advapi32/RegOpenUserClassesRoot.S +++ b/libc/nt/advapi32/RegOpenUserClassesRoot.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegOpenUserClassesRoot,RegOpenUserClassesRoot .text.windows - .ftrace1 + .ftrace1 RegOpenUserClassesRoot: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegOverridePredefKey.S b/libc/nt/advapi32/RegOverridePredefKey.S index bfd4e308c..b7b15dcda 100644 --- 
a/libc/nt/advapi32/RegOverridePredefKey.S +++ b/libc/nt/advapi32/RegOverridePredefKey.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegOverridePredefKey,RegOverridePredefKey .text.windows - .ftrace1 + .ftrace1 RegOverridePredefKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegQueryInfoKeyW.S b/libc/nt/advapi32/RegQueryInfoKeyW.S index ea8bb226d..cc7630057 100644 --- a/libc/nt/advapi32/RegQueryInfoKeyW.S +++ b/libc/nt/advapi32/RegQueryInfoKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegQueryInfoKeyW,RegQueryInfoKeyW .text.windows - .ftrace1 + .ftrace1 RegQueryInfoKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegQueryMultipleValuesW.S b/libc/nt/advapi32/RegQueryMultipleValuesW.S index 0bd9a2d73..47a7eff97 100644 --- a/libc/nt/advapi32/RegQueryMultipleValuesW.S +++ b/libc/nt/advapi32/RegQueryMultipleValuesW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegQueryMultipleValuesW,RegQueryMultipleValuesW .text.windows - .ftrace1 + .ftrace1 RegQueryMultipleValues: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegQueryReflectionKey.S b/libc/nt/advapi32/RegQueryReflectionKey.S index 3d1d27fc9..29832e010 100644 --- a/libc/nt/advapi32/RegQueryReflectionKey.S +++ b/libc/nt/advapi32/RegQueryReflectionKey.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegQueryReflectionKey,RegQueryReflectionKey .text.windows - .ftrace1 + .ftrace1 RegQueryReflectionKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegQueryValueExW.S b/libc/nt/advapi32/RegQueryValueExW.S index 92bbde441..7cd813de5 100644 --- a/libc/nt/advapi32/RegQueryValueExW.S +++ b/libc/nt/advapi32/RegQueryValueExW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegQueryValueExW,RegQueryValueExW .text.windows - .ftrace1 + .ftrace1 RegQueryValueEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegQueryValueW.S b/libc/nt/advapi32/RegQueryValueW.S index c02b35efd..7c68c0dac 100644 --- a/libc/nt/advapi32/RegQueryValueW.S +++ b/libc/nt/advapi32/RegQueryValueW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegQueryValueW,RegQueryValueW .text.windows - .ftrace1 + .ftrace1 RegQueryValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegReplaceKeyW.S b/libc/nt/advapi32/RegReplaceKeyW.S index ba2b42108..774ffbd37 100644 --- a/libc/nt/advapi32/RegReplaceKeyW.S +++ b/libc/nt/advapi32/RegReplaceKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegReplaceKeyW,RegReplaceKeyW .text.windows - .ftrace1 + .ftrace1 RegReplaceKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegRestoreKeyW.S b/libc/nt/advapi32/RegRestoreKeyW.S index 133da1ea8..00916567b 100644 --- a/libc/nt/advapi32/RegRestoreKeyW.S +++ b/libc/nt/advapi32/RegRestoreKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegRestoreKeyW,RegRestoreKeyW .text.windows - .ftrace1 + .ftrace1 RegRestoreKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegSaveKeyW.S b/libc/nt/advapi32/RegSaveKeyW.S index 682946553..72a87ff37 100644 --- a/libc/nt/advapi32/RegSaveKeyW.S +++ b/libc/nt/advapi32/RegSaveKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegSaveKeyW,RegSaveKeyW .text.windows - .ftrace1 + .ftrace1 RegSaveKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegSetKeySecurity.S b/libc/nt/advapi32/RegSetKeySecurity.S index 49a0a5879..21963175c 100644 
--- a/libc/nt/advapi32/RegSetKeySecurity.S +++ b/libc/nt/advapi32/RegSetKeySecurity.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegSetKeySecurity,RegSetKeySecurity .text.windows - .ftrace1 + .ftrace1 RegSetKeySecurity: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegSetValueExW.S b/libc/nt/advapi32/RegSetValueExW.S index f4f213cd0..c6c52f51e 100644 --- a/libc/nt/advapi32/RegSetValueExW.S +++ b/libc/nt/advapi32/RegSetValueExW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegSetValueExW,RegSetValueExW .text.windows - .ftrace1 + .ftrace1 RegSetValueEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegSetValueW.S b/libc/nt/advapi32/RegSetValueW.S index ea8bbfab9..1793bede9 100644 --- a/libc/nt/advapi32/RegSetValueW.S +++ b/libc/nt/advapi32/RegSetValueW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegSetValueW,RegSetValueW .text.windows - .ftrace1 + .ftrace1 RegSetValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegUnLoadKeyW.S b/libc/nt/advapi32/RegUnLoadKeyW.S index b938db468..844260b53 100644 --- a/libc/nt/advapi32/RegUnLoadKeyW.S +++ b/libc/nt/advapi32/RegUnLoadKeyW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegUnLoadKeyW,RegUnLoadKeyW .text.windows - .ftrace1 + .ftrace1 RegUnLoadKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RegisterEventSourceW.S b/libc/nt/advapi32/RegisterEventSourceW.S index 83a05098b..a1c525cca 100644 --- a/libc/nt/advapi32/RegisterEventSourceW.S +++ b/libc/nt/advapi32/RegisterEventSourceW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RegisterEventSourceW,RegisterEventSourceW .text.windows - .ftrace1 + .ftrace1 RegisterEventSource: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/ReportEventA.S b/libc/nt/advapi32/ReportEventA.S index 37ab47236..eddb3ffa7 100644 --- a/libc/nt/advapi32/ReportEventA.S +++ b/libc/nt/advapi32/ReportEventA.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_ReportEventA,ReportEventA .text.windows - .ftrace1 + .ftrace1 ReportEventA: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/ReportEventW.S b/libc/nt/advapi32/ReportEventW.S index 794170e2c..b11d9230b 100644 --- a/libc/nt/advapi32/ReportEventW.S +++ b/libc/nt/advapi32/ReportEventW.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_ReportEventW,ReportEventW .text.windows - .ftrace1 + .ftrace1 ReportEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/RevertToSelf.S b/libc/nt/advapi32/RevertToSelf.S index 4ecb5e741..f7620c03b 100644 --- a/libc/nt/advapi32/RevertToSelf.S +++ b/libc/nt/advapi32/RevertToSelf.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_RevertToSelf,RevertToSelf .text.windows - .ftrace1 + .ftrace1 RevertToSelf: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/advapi32/SystemFunction036.S b/libc/nt/advapi32/SystemFunction036.S index 332f6038f..4380c47e3 100644 --- a/libc/nt/advapi32/SystemFunction036.S +++ b/libc/nt/advapi32/SystemFunction036.S @@ -2,9 +2,9 @@ .imp advapi32,__imp_SystemFunction036,SystemFunction036 .text.windows - .ftrace1 + .ftrace1 RtlGenRandom: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/comdlg32/ChooseColorW.S b/libc/nt/comdlg32/ChooseColorW.S index e4c37f733..b743f8a3d 100644 --- a/libc/nt/comdlg32/ChooseColorW.S +++ b/libc/nt/comdlg32/ChooseColorW.S @@ -2,9 +2,9 @@ .imp comdlg32,__imp_ChooseColorW,ChooseColorW 
.text.windows - .ftrace1 + .ftrace1 ChooseColor: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/comdlg32/ChooseFontW.S b/libc/nt/comdlg32/ChooseFontW.S index a4782e7a4..5d7c4da75 100644 --- a/libc/nt/comdlg32/ChooseFontW.S +++ b/libc/nt/comdlg32/ChooseFontW.S @@ -2,9 +2,9 @@ .imp comdlg32,__imp_ChooseFontW,ChooseFontW .text.windows - .ftrace1 + .ftrace1 ChooseFont: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/comdlg32/GetFileTitleW.S b/libc/nt/comdlg32/GetFileTitleW.S index 9748cbefd..9d82001e3 100644 --- a/libc/nt/comdlg32/GetFileTitleW.S +++ b/libc/nt/comdlg32/GetFileTitleW.S @@ -2,9 +2,9 @@ .imp comdlg32,__imp_GetFileTitleW,GetFileTitleW .text.windows - .ftrace1 + .ftrace1 GetFileTitle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/comdlg32/GetOpenFileNameW.S b/libc/nt/comdlg32/GetOpenFileNameW.S index c01881c15..f011fa3f1 100644 --- a/libc/nt/comdlg32/GetOpenFileNameW.S +++ b/libc/nt/comdlg32/GetOpenFileNameW.S @@ -2,9 +2,9 @@ .imp comdlg32,__imp_GetOpenFileNameW,GetOpenFileNameW .text.windows - .ftrace1 + .ftrace1 GetOpenFileName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/comdlg32/GetSaveFileNameW.S b/libc/nt/comdlg32/GetSaveFileNameW.S index 8c17100d3..3724601db 100644 --- a/libc/nt/comdlg32/GetSaveFileNameW.S +++ b/libc/nt/comdlg32/GetSaveFileNameW.S @@ -2,9 +2,9 @@ .imp comdlg32,__imp_GetSaveFileNameW,GetSaveFileNameW .text.windows - .ftrace1 + .ftrace1 GetSaveFileName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/comdlg32/PrintDlgW.S b/libc/nt/comdlg32/PrintDlgW.S index 660609ef8..e1d4b029b 100644 --- a/libc/nt/comdlg32/PrintDlgW.S +++ b/libc/nt/comdlg32/PrintDlgW.S @@ -2,9 +2,9 @@ .imp comdlg32,__imp_PrintDlgW,PrintDlgW .text.windows - .ftrace1 + .ftrace1 PrintDlg: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/comdlg32/ReplaceTextW.S b/libc/nt/comdlg32/ReplaceTextW.S index 61762b907..4d38dd9fe 100644 --- a/libc/nt/comdlg32/ReplaceTextW.S +++ b/libc/nt/comdlg32/ReplaceTextW.S @@ -2,9 +2,9 @@ .imp comdlg32,__imp_ReplaceTextW,ReplaceTextW .text.windows - .ftrace1 + .ftrace1 ReplaceText: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/BitBlt.S b/libc/nt/gdi32/BitBlt.S index 6ae449526..b00ce43b5 100644 --- a/libc/nt/gdi32/BitBlt.S +++ b/libc/nt/gdi32/BitBlt.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_BitBlt,BitBlt .text.windows - .ftrace1 + .ftrace1 BitBlt: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/ChoosePixelFormat.S b/libc/nt/gdi32/ChoosePixelFormat.S index 20fb4699b..0efc84c25 100644 --- a/libc/nt/gdi32/ChoosePixelFormat.S +++ b/libc/nt/gdi32/ChoosePixelFormat.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_ChoosePixelFormat,ChoosePixelFormat .text.windows - .ftrace1 + .ftrace1 ChoosePixelFormat: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/CreateBitmap.S b/libc/nt/gdi32/CreateBitmap.S index d747b7a55..c5b68e326 100644 --- a/libc/nt/gdi32/CreateBitmap.S +++ b/libc/nt/gdi32/CreateBitmap.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_CreateBitmap,CreateBitmap .text.windows - .ftrace1 + .ftrace1 CreateBitmap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/CreateCompatibleBitmap.S b/libc/nt/gdi32/CreateCompatibleBitmap.S index c67ece9ff..c60770558 100644 --- a/libc/nt/gdi32/CreateCompatibleBitmap.S +++ 
b/libc/nt/gdi32/CreateCompatibleBitmap.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_CreateCompatibleBitmap,CreateCompatibleBitmap .text.windows - .ftrace1 + .ftrace1 CreateCompatibleBitmap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/CreateCompatibleDC.S b/libc/nt/gdi32/CreateCompatibleDC.S index 545aa271a..e59fe5293 100644 --- a/libc/nt/gdi32/CreateCompatibleDC.S +++ b/libc/nt/gdi32/CreateCompatibleDC.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_CreateCompatibleDC,CreateCompatibleDC .text.windows - .ftrace1 + .ftrace1 CreateCompatibleDC: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/CreateDIBSection.S b/libc/nt/gdi32/CreateDIBSection.S index 8bd327fbd..9d415a60d 100644 --- a/libc/nt/gdi32/CreateDIBSection.S +++ b/libc/nt/gdi32/CreateDIBSection.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_CreateDIBSection,CreateDIBSection .text.windows - .ftrace1 + .ftrace1 CreateDIBSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/CreateRectRgn.S b/libc/nt/gdi32/CreateRectRgn.S index 28fafc1c5..4a7d95df5 100644 --- a/libc/nt/gdi32/CreateRectRgn.S +++ b/libc/nt/gdi32/CreateRectRgn.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_CreateRectRgn,CreateRectRgn .text.windows - .ftrace1 + .ftrace1 CreateRectRgn: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/DeleteDC.S b/libc/nt/gdi32/DeleteDC.S index 6789a4595..128ea9875 100644 --- a/libc/nt/gdi32/DeleteDC.S +++ b/libc/nt/gdi32/DeleteDC.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_DeleteDC,DeleteDC .text.windows - .ftrace1 + .ftrace1 DeleteDC: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/DeleteObject.S b/libc/nt/gdi32/DeleteObject.S index 7b59322a4..acf09e0ae 100644 --- a/libc/nt/gdi32/DeleteObject.S +++ b/libc/nt/gdi32/DeleteObject.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_DeleteObject,DeleteObject .text.windows - .ftrace1 + .ftrace1 DeleteObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/GetPixel.S b/libc/nt/gdi32/GetPixel.S index 64be4d296..8a5a9f20e 100644 --- a/libc/nt/gdi32/GetPixel.S +++ b/libc/nt/gdi32/GetPixel.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_GetPixel,GetPixel .text.windows - .ftrace1 + .ftrace1 GetPixel: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/RestoreDC.S b/libc/nt/gdi32/RestoreDC.S index 0bc13138c..a3dd7af2d 100644 --- a/libc/nt/gdi32/RestoreDC.S +++ b/libc/nt/gdi32/RestoreDC.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_RestoreDC,RestoreDC .text.windows - .ftrace1 + .ftrace1 RestoreDC: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SaveDC.S b/libc/nt/gdi32/SaveDC.S index 1b6ed114c..804cfdce2 100644 --- a/libc/nt/gdi32/SaveDC.S +++ b/libc/nt/gdi32/SaveDC.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SaveDC,SaveDC .text.windows - .ftrace1 + .ftrace1 SaveDC: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SelectObject.S b/libc/nt/gdi32/SelectObject.S index 02480341b..1d87a6409 100644 --- a/libc/nt/gdi32/SelectObject.S +++ b/libc/nt/gdi32/SelectObject.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SelectObject,SelectObject .text.windows - .ftrace1 + .ftrace1 SelectObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SetBkMode.S b/libc/nt/gdi32/SetBkMode.S index 6f3afad43..d3b2b6bb6 100644 --- a/libc/nt/gdi32/SetBkMode.S +++ b/libc/nt/gdi32/SetBkMode.S @@ -2,9 +2,9 @@ .imp 
gdi32,__imp_SetBkMode,SetBkMode .text.windows - .ftrace1 + .ftrace1 SetBkMode: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SetPixel.S b/libc/nt/gdi32/SetPixel.S index 91df1fb3b..830ed3e6a 100644 --- a/libc/nt/gdi32/SetPixel.S +++ b/libc/nt/gdi32/SetPixel.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SetPixel,SetPixel .text.windows - .ftrace1 + .ftrace1 SetPixel: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SetPixelFormat.S b/libc/nt/gdi32/SetPixelFormat.S index b5c6155d2..f56d90bb6 100644 --- a/libc/nt/gdi32/SetPixelFormat.S +++ b/libc/nt/gdi32/SetPixelFormat.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SetPixelFormat,SetPixelFormat .text.windows - .ftrace1 + .ftrace1 SetPixelFormat: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SetTextAlign.S b/libc/nt/gdi32/SetTextAlign.S index a5ce6ae5b..b535efb83 100644 --- a/libc/nt/gdi32/SetTextAlign.S +++ b/libc/nt/gdi32/SetTextAlign.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SetTextAlign,SetTextAlign .text.windows - .ftrace1 + .ftrace1 SetTextAlign: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SetTextColor.S b/libc/nt/gdi32/SetTextColor.S index 03a6e8227..7edcf2408 100644 --- a/libc/nt/gdi32/SetTextColor.S +++ b/libc/nt/gdi32/SetTextColor.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SetTextColor,SetTextColor .text.windows - .ftrace1 + .ftrace1 SetTextColor: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SetTextJustification.S b/libc/nt/gdi32/SetTextJustification.S index b2ca9e1cb..b26f82630 100644 --- a/libc/nt/gdi32/SetTextJustification.S +++ b/libc/nt/gdi32/SetTextJustification.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SetTextJustification,SetTextJustification .text.windows - .ftrace1 + .ftrace1 SetTextJustification: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/gdi32/SwapBuffers.S b/libc/nt/gdi32/SwapBuffers.S index f2b12ab95..3a50e12ef 100644 --- a/libc/nt/gdi32/SwapBuffers.S +++ b/libc/nt/gdi32/SwapBuffers.S @@ -2,9 +2,9 @@ .imp gdi32,__imp_SwapBuffers,SwapBuffers .text.windows - .ftrace1 + .ftrace1 SwapBuffers: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/AddIPAddress.S b/libc/nt/iphlpapi/AddIPAddress.S index a209b5aca..562142345 100644 --- a/libc/nt/iphlpapi/AddIPAddress.S +++ b/libc/nt/iphlpapi/AddIPAddress.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_AddIPAddress,AddIPAddress .text.windows - .ftrace1 + .ftrace1 AddIPAddress: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/AllocateAndGetTcpExTableFromStack.S b/libc/nt/iphlpapi/AllocateAndGetTcpExTableFromStack.S index d30b90fd9..e623d8b2f 100644 --- a/libc/nt/iphlpapi/AllocateAndGetTcpExTableFromStack.S +++ b/libc/nt/iphlpapi/AllocateAndGetTcpExTableFromStack.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_AllocateAndGetTcpExTableFromStack,AllocateAndGetTcpExTableFromStack .text.windows - .ftrace1 + .ftrace1 AllocateAndGetTcpExTableFromStack: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/AllocateAndGetUdpExTableFromStack.S b/libc/nt/iphlpapi/AllocateAndGetUdpExTableFromStack.S index 1bca2335e..409d48b75 100644 --- a/libc/nt/iphlpapi/AllocateAndGetUdpExTableFromStack.S +++ b/libc/nt/iphlpapi/AllocateAndGetUdpExTableFromStack.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_AllocateAndGetUdpExTableFromStack,AllocateAndGetUdpExTableFromStack .text.windows - 
.ftrace1 + .ftrace1 AllocateAndGetUdpExTableFromStack: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/CancelIPChangeNotify.S b/libc/nt/iphlpapi/CancelIPChangeNotify.S index a8c9413cb..bedead0cb 100644 --- a/libc/nt/iphlpapi/CancelIPChangeNotify.S +++ b/libc/nt/iphlpapi/CancelIPChangeNotify.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_CancelIPChangeNotify,CancelIPChangeNotify .text.windows - .ftrace1 + .ftrace1 CancelIPChangeNotify: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/CaptureInterfaceHardwareCrossTimestamp.S b/libc/nt/iphlpapi/CaptureInterfaceHardwareCrossTimestamp.S index 32c7902e6..a2fce74ce 100644 --- a/libc/nt/iphlpapi/CaptureInterfaceHardwareCrossTimestamp.S +++ b/libc/nt/iphlpapi/CaptureInterfaceHardwareCrossTimestamp.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_CaptureInterfaceHardwareCrossTimestamp,CaptureInterfaceHardwareCrossTimestamp .text.windows - .ftrace1 + .ftrace1 CaptureInterfaceHardwareCrossTimestamp: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/CreateIpForwardEntry.S b/libc/nt/iphlpapi/CreateIpForwardEntry.S index 035e35198..5722b50f8 100644 --- a/libc/nt/iphlpapi/CreateIpForwardEntry.S +++ b/libc/nt/iphlpapi/CreateIpForwardEntry.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_CreateIpForwardEntry,CreateIpForwardEntry .text.windows - .ftrace1 + .ftrace1 CreateIpForwardEntry: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/FlushIpNetTable.S b/libc/nt/iphlpapi/FlushIpNetTable.S index 9c5d85a7f..7de380fc9 100644 --- a/libc/nt/iphlpapi/FlushIpNetTable.S +++ b/libc/nt/iphlpapi/FlushIpNetTable.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_FlushIpNetTable,FlushIpNetTable .text.windows - .ftrace1 + .ftrace1 FlushIpNetTable: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetAdapterIndex.S b/libc/nt/iphlpapi/GetAdapterIndex.S index 82a332225..2fa893bdb 100644 --- a/libc/nt/iphlpapi/GetAdapterIndex.S +++ b/libc/nt/iphlpapi/GetAdapterIndex.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetAdapterIndex,GetAdapterIndex .text.windows - .ftrace1 + .ftrace1 GetAdapterIndex: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetAdapterOrderMap.S b/libc/nt/iphlpapi/GetAdapterOrderMap.S index afb1397de..b5ef57b9c 100644 --- a/libc/nt/iphlpapi/GetAdapterOrderMap.S +++ b/libc/nt/iphlpapi/GetAdapterOrderMap.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetAdapterOrderMap,GetAdapterOrderMap .text.windows - .ftrace1 + .ftrace1 GetAdapterOrderMap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetAdaptersAddresses.S b/libc/nt/iphlpapi/GetAdaptersAddresses.S index ad6f07db8..0ba3862ea 100644 --- a/libc/nt/iphlpapi/GetAdaptersAddresses.S +++ b/libc/nt/iphlpapi/GetAdaptersAddresses.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetAdaptersAddresses,GetAdaptersAddresses .text.windows - .ftrace1 + .ftrace1 GetAdaptersAddresses: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetAdaptersInfo.S b/libc/nt/iphlpapi/GetAdaptersInfo.S index fcb78d164..dd39115d1 100644 --- a/libc/nt/iphlpapi/GetAdaptersInfo.S +++ b/libc/nt/iphlpapi/GetAdaptersInfo.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetAdaptersInfo,GetAdaptersInfo .text.windows - .ftrace1 + .ftrace1 GetAdaptersInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetBestInterface.S 
b/libc/nt/iphlpapi/GetBestInterface.S index be6c4c0f6..806718f04 100644 --- a/libc/nt/iphlpapi/GetBestInterface.S +++ b/libc/nt/iphlpapi/GetBestInterface.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetBestInterface,GetBestInterface .text.windows - .ftrace1 + .ftrace1 GetBestInterface: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetBestInterfaceEx.S b/libc/nt/iphlpapi/GetBestInterfaceEx.S index 79143b113..0730f8c0c 100644 --- a/libc/nt/iphlpapi/GetBestInterfaceEx.S +++ b/libc/nt/iphlpapi/GetBestInterfaceEx.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetBestInterfaceEx,GetBestInterfaceEx .text.windows - .ftrace1 + .ftrace1 GetBestInterfaceEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetBestRoute.S b/libc/nt/iphlpapi/GetBestRoute.S index 4bfccc2ef..e696d3aaf 100644 --- a/libc/nt/iphlpapi/GetBestRoute.S +++ b/libc/nt/iphlpapi/GetBestRoute.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetBestRoute,GetBestRoute .text.windows - .ftrace1 + .ftrace1 GetBestRoute: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetNumberOfInterfaces.S b/libc/nt/iphlpapi/GetNumberOfInterfaces.S index 8a47ce528..a75864f56 100644 --- a/libc/nt/iphlpapi/GetNumberOfInterfaces.S +++ b/libc/nt/iphlpapi/GetNumberOfInterfaces.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetNumberOfInterfaces,GetNumberOfInterfaces .text.windows - .ftrace1 + .ftrace1 GetNumberOfInterfaces: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetTcpTable.S b/libc/nt/iphlpapi/GetTcpTable.S index 0e1960eb4..7ff67ecf2 100644 --- a/libc/nt/iphlpapi/GetTcpTable.S +++ b/libc/nt/iphlpapi/GetTcpTable.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetTcpTable,GetTcpTable .text.windows - .ftrace1 + .ftrace1 GetTcpTable: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/iphlpapi/GetTcpTable2.S b/libc/nt/iphlpapi/GetTcpTable2.S index afe1aaa74..90419fdc1 100644 --- a/libc/nt/iphlpapi/GetTcpTable2.S +++ b/libc/nt/iphlpapi/GetTcpTable2.S @@ -2,9 +2,9 @@ .imp iphlpapi,__imp_GetTcpTable2,GetTcpTable2 .text.windows - .ftrace1 + .ftrace1 GetTcpTable2: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/AcquireSRWLockExclusive.S b/libc/nt/kernel32/AcquireSRWLockExclusive.S index 9aee77c35..cd610ea8b 100644 --- a/libc/nt/kernel32/AcquireSRWLockExclusive.S +++ b/libc/nt/kernel32/AcquireSRWLockExclusive.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_AcquireSRWLockExclusive,AcquireSRWLockExclusive .text.windows - .ftrace1 + .ftrace1 AcquireSRWLockExclusive: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/AcquireSRWLockShared.S b/libc/nt/kernel32/AcquireSRWLockShared.S index 77aca4beb..e36146bcc 100644 --- a/libc/nt/kernel32/AcquireSRWLockShared.S +++ b/libc/nt/kernel32/AcquireSRWLockShared.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_AcquireSRWLockShared,AcquireSRWLockShared .text.windows - .ftrace1 + .ftrace1 AcquireSRWLockShared: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/AddDllDirectory.S b/libc/nt/kernel32/AddDllDirectory.S index 03b0aa1b3..efe211ee1 100644 --- a/libc/nt/kernel32/AddDllDirectory.S +++ b/libc/nt/kernel32/AddDllDirectory.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_AddDllDirectory,AddDllDirectory .text.windows - .ftrace1 + .ftrace1 AddDllDirectory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/kernel32/AddVectoredContinueHandler.S b/libc/nt/kernel32/AddVectoredContinueHandler.S index 123967910..cefad106c 100644 --- a/libc/nt/kernel32/AddVectoredContinueHandler.S +++ b/libc/nt/kernel32/AddVectoredContinueHandler.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_AddVectoredContinueHandler,AddVectoredContinueHandler .text.windows - .ftrace1 + .ftrace1 AddVectoredContinueHandler: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/AddVectoredExceptionHandler.S b/libc/nt/kernel32/AddVectoredExceptionHandler.S index 20829c62d..9eb2c9265 100644 --- a/libc/nt/kernel32/AddVectoredExceptionHandler.S +++ b/libc/nt/kernel32/AddVectoredExceptionHandler.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_AddVectoredExceptionHandler,AddVectoredExceptionHandler .text.windows - .ftrace1 + .ftrace1 AddVectoredExceptionHandler: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/AllocConsole.S b/libc/nt/kernel32/AllocConsole.S index 4c9c8e918..7b03930e3 100644 --- a/libc/nt/kernel32/AllocConsole.S +++ b/libc/nt/kernel32/AllocConsole.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_AllocConsole,AllocConsole .text.windows - .ftrace1 + .ftrace1 AllocConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/AttachConsole.S b/libc/nt/kernel32/AttachConsole.S index f98ce7a15..f0a6594d2 100644 --- a/libc/nt/kernel32/AttachConsole.S +++ b/libc/nt/kernel32/AttachConsole.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_AttachConsole,AttachConsole .text.windows - .ftrace1 + .ftrace1 AttachConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CallNamedPipeA.S b/libc/nt/kernel32/CallNamedPipeA.S index e140ef910..71b828e8e 100644 --- a/libc/nt/kernel32/CallNamedPipeA.S +++ b/libc/nt/kernel32/CallNamedPipeA.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CallNamedPipeA,CallNamedPipeA .text.windows - .ftrace1 + .ftrace1 CallNamedPipeA: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CallNamedPipeW.S b/libc/nt/kernel32/CallNamedPipeW.S index fa699c00c..a63e114e7 100644 --- a/libc/nt/kernel32/CallNamedPipeW.S +++ b/libc/nt/kernel32/CallNamedPipeW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CallNamedPipeW,CallNamedPipeW .text.windows - .ftrace1 + .ftrace1 CallNamedPipe: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CancelIo.S b/libc/nt/kernel32/CancelIo.S index c930034bd..62bab1ec4 100644 --- a/libc/nt/kernel32/CancelIo.S +++ b/libc/nt/kernel32/CancelIo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CancelIo,CancelIo .text.windows - .ftrace1 + .ftrace1 CancelIo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CancelIoEx.S b/libc/nt/kernel32/CancelIoEx.S index 2ef5fc09d..3a0fe8f9b 100644 --- a/libc/nt/kernel32/CancelIoEx.S +++ b/libc/nt/kernel32/CancelIoEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CancelIoEx,CancelIoEx .text.windows - .ftrace1 + .ftrace1 CancelIoEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CancelSynchronousIo.S b/libc/nt/kernel32/CancelSynchronousIo.S index 33d3486ca..ee7434b85 100644 --- a/libc/nt/kernel32/CancelSynchronousIo.S +++ b/libc/nt/kernel32/CancelSynchronousIo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CancelSynchronousIo,CancelSynchronousIo .text.windows - .ftrace1 + .ftrace1 CancelSynchronousIo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/kernel32/CheckRemoteDebuggerPresent.S b/libc/nt/kernel32/CheckRemoteDebuggerPresent.S index 93cfb58ea..9da745755 100644 --- a/libc/nt/kernel32/CheckRemoteDebuggerPresent.S +++ b/libc/nt/kernel32/CheckRemoteDebuggerPresent.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CheckRemoteDebuggerPresent,CheckRemoteDebuggerPresent .text.windows - .ftrace1 + .ftrace1 CheckRemoteDebuggerPresent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ClearCommBreak.S b/libc/nt/kernel32/ClearCommBreak.S index 22638d50c..4d330ca1f 100644 --- a/libc/nt/kernel32/ClearCommBreak.S +++ b/libc/nt/kernel32/ClearCommBreak.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ClearCommBreak,ClearCommBreak .text.windows - .ftrace1 + .ftrace1 ClearCommBreak: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CloseHandle.S b/libc/nt/kernel32/CloseHandle.S index d0bf6080e..5a3b2224f 100644 --- a/libc/nt/kernel32/CloseHandle.S +++ b/libc/nt/kernel32/CloseHandle.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CloseHandle,CloseHandle .text.windows - .ftrace1 + .ftrace1 CloseHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ClosePseudoConsole.S b/libc/nt/kernel32/ClosePseudoConsole.S index 66ae2d527..9c33c81b5 100644 --- a/libc/nt/kernel32/ClosePseudoConsole.S +++ b/libc/nt/kernel32/ClosePseudoConsole.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ClosePseudoConsole,ClosePseudoConsole .text.windows - .ftrace1 + .ftrace1 ClosePseudoConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ConnectNamedPipe.S b/libc/nt/kernel32/ConnectNamedPipe.S index bbaf6a2bc..854c36c56 100644 --- a/libc/nt/kernel32/ConnectNamedPipe.S +++ b/libc/nt/kernel32/ConnectNamedPipe.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ConnectNamedPipe,ConnectNamedPipe .text.windows - .ftrace1 + .ftrace1 ConnectNamedPipe: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ContinueDebugEvent.S b/libc/nt/kernel32/ContinueDebugEvent.S index c77e9f214..621796bde 100644 --- a/libc/nt/kernel32/ContinueDebugEvent.S +++ b/libc/nt/kernel32/ContinueDebugEvent.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ContinueDebugEvent,ContinueDebugEvent .text.windows - .ftrace1 + .ftrace1 ContinueDebugEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CopyFileW.S b/libc/nt/kernel32/CopyFileW.S index c5b9dc8f8..ccaa5377d 100644 --- a/libc/nt/kernel32/CopyFileW.S +++ b/libc/nt/kernel32/CopyFileW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CopyFileW,CopyFileW .text.windows - .ftrace1 + .ftrace1 CopyFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateEventExW.S b/libc/nt/kernel32/CreateEventExW.S index 00ea2f970..4d1614d8a 100644 --- a/libc/nt/kernel32/CreateEventExW.S +++ b/libc/nt/kernel32/CreateEventExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateEventExW,CreateEventExW .text.windows - .ftrace1 + .ftrace1 CreateEventEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateEventW.S b/libc/nt/kernel32/CreateEventW.S index eb80f4c4b..a5aa6cda5 100644 --- a/libc/nt/kernel32/CreateEventW.S +++ b/libc/nt/kernel32/CreateEventW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateEventW,CreateEventW .text.windows - .ftrace1 + .ftrace1 CreateEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateHardLinkW.S 
b/libc/nt/kernel32/CreateHardLinkW.S index 57ec858c6..b0ea184de 100644 --- a/libc/nt/kernel32/CreateHardLinkW.S +++ b/libc/nt/kernel32/CreateHardLinkW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateHardLinkW,CreateHardLinkW .text.windows - .ftrace1 + .ftrace1 CreateHardLink: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateIoCompletionPort.S b/libc/nt/kernel32/CreateIoCompletionPort.S index 53025b835..dad978127 100644 --- a/libc/nt/kernel32/CreateIoCompletionPort.S +++ b/libc/nt/kernel32/CreateIoCompletionPort.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateIoCompletionPort,CreateIoCompletionPort .text.windows - .ftrace1 + .ftrace1 CreateIoCompletionPort: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreatePseudoConsole.S b/libc/nt/kernel32/CreatePseudoConsole.S index 1de6313ae..0d326315d 100644 --- a/libc/nt/kernel32/CreatePseudoConsole.S +++ b/libc/nt/kernel32/CreatePseudoConsole.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreatePseudoConsole,CreatePseudoConsole .text.windows - .ftrace1 + .ftrace1 CreatePseudoConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateSemaphoreW.S b/libc/nt/kernel32/CreateSemaphoreW.S index 7a2c07569..02be9b198 100644 --- a/libc/nt/kernel32/CreateSemaphoreW.S +++ b/libc/nt/kernel32/CreateSemaphoreW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateSemaphoreW,CreateSemaphoreW .text.windows - .ftrace1 + .ftrace1 CreateSemaphore: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateToolhelp32Snapshot.S b/libc/nt/kernel32/CreateToolhelp32Snapshot.S index c3907dbf1..8a9dc0217 100644 --- a/libc/nt/kernel32/CreateToolhelp32Snapshot.S +++ b/libc/nt/kernel32/CreateToolhelp32Snapshot.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateToolhelp32Snapshot,CreateToolhelp32Snapshot .text.windows - .ftrace1 + .ftrace1 CreateToolhelp32Snapshot: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateWaitableTimerExW.S b/libc/nt/kernel32/CreateWaitableTimerExW.S index 0bd970a48..dccd6148f 100644 --- a/libc/nt/kernel32/CreateWaitableTimerExW.S +++ b/libc/nt/kernel32/CreateWaitableTimerExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateWaitableTimerExW,CreateWaitableTimerExW .text.windows - .ftrace1 + .ftrace1 CreateWaitableTimerEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/CreateWaitableTimerW.S b/libc/nt/kernel32/CreateWaitableTimerW.S index c03fdfe06..b2353e579 100644 --- a/libc/nt/kernel32/CreateWaitableTimerW.S +++ b/libc/nt/kernel32/CreateWaitableTimerW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_CreateWaitableTimerW,CreateWaitableTimerW .text.windows - .ftrace1 + .ftrace1 CreateWaitableTimer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/DebugActiveProcess.S b/libc/nt/kernel32/DebugActiveProcess.S index 005b37cce..76910b7b4 100644 --- a/libc/nt/kernel32/DebugActiveProcess.S +++ b/libc/nt/kernel32/DebugActiveProcess.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_DebugActiveProcess,DebugActiveProcess .text.windows - .ftrace1 + .ftrace1 DebugActiveProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/DebugActiveProcessStop.S b/libc/nt/kernel32/DebugActiveProcessStop.S index c0aaf8ec7..d8af4880a 100644 --- a/libc/nt/kernel32/DebugActiveProcessStop.S +++ b/libc/nt/kernel32/DebugActiveProcessStop.S @@ -2,9 +2,9 @@ .imp 
kernel32,__imp_DebugActiveProcessStop,DebugActiveProcessStop .text.windows - .ftrace1 + .ftrace1 DebugActiveProcessStop: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/DebugBreakProcess.S b/libc/nt/kernel32/DebugBreakProcess.S index 7841a4a9b..7053588bd 100644 --- a/libc/nt/kernel32/DebugBreakProcess.S +++ b/libc/nt/kernel32/DebugBreakProcess.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_DebugBreakProcess,DebugBreakProcess .text.windows - .ftrace1 + .ftrace1 DebugBreakProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/DeleteCriticalSection.S b/libc/nt/kernel32/DeleteCriticalSection.S index 6bfb8e1b7..c73f3fa21 100644 --- a/libc/nt/kernel32/DeleteCriticalSection.S +++ b/libc/nt/kernel32/DeleteCriticalSection.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_DeleteCriticalSection,DeleteCriticalSection .text.windows - .ftrace1 + .ftrace1 DeleteCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/DeleteProcThreadAttributeList.S b/libc/nt/kernel32/DeleteProcThreadAttributeList.S index 113cdfc8e..e8d490a58 100644 --- a/libc/nt/kernel32/DeleteProcThreadAttributeList.S +++ b/libc/nt/kernel32/DeleteProcThreadAttributeList.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_DeleteProcThreadAttributeList,DeleteProcThreadAttributeList .text.windows - .ftrace1 + .ftrace1 DeleteProcThreadAttributeList: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/DisconnectNamedPipe.S b/libc/nt/kernel32/DisconnectNamedPipe.S index d61b2340f..5a10befc5 100644 --- a/libc/nt/kernel32/DisconnectNamedPipe.S +++ b/libc/nt/kernel32/DisconnectNamedPipe.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_DisconnectNamedPipe,DisconnectNamedPipe .text.windows - .ftrace1 + .ftrace1 DisconnectNamedPipe: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/DuplicateHandle.S b/libc/nt/kernel32/DuplicateHandle.S index d054ec5d6..6e3d89738 100644 --- a/libc/nt/kernel32/DuplicateHandle.S +++ b/libc/nt/kernel32/DuplicateHandle.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_DuplicateHandle,DuplicateHandle .text.windows - .ftrace1 + .ftrace1 DuplicateHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/EnterCriticalSection.S b/libc/nt/kernel32/EnterCriticalSection.S index 289060ebc..d27279be1 100644 --- a/libc/nt/kernel32/EnterCriticalSection.S +++ b/libc/nt/kernel32/EnterCriticalSection.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_EnterCriticalSection,EnterCriticalSection .text.windows - .ftrace1 + .ftrace1 EnterCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ExitProcess.S b/libc/nt/kernel32/ExitProcess.S index 8178a3d9a..7f9b3151d 100644 --- a/libc/nt/kernel32/ExitProcess.S +++ b/libc/nt/kernel32/ExitProcess.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ExitProcess,ExitProcess .text.windows - .ftrace1 + .ftrace1 ExitProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ExitThread.S b/libc/nt/kernel32/ExitThread.S index 010e481b5..912619c5a 100644 --- a/libc/nt/kernel32/ExitThread.S +++ b/libc/nt/kernel32/ExitThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ExitThread,ExitThread .text.windows - .ftrace1 + .ftrace1 ExitThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FatalExit.S b/libc/nt/kernel32/FatalExit.S index a6447503e..3d7ec38a1 100644 --- 
a/libc/nt/kernel32/FatalExit.S +++ b/libc/nt/kernel32/FatalExit.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FatalExit,FatalExit .text.windows - .ftrace1 + .ftrace1 FatalExit: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FillConsoleOutputAttribute.S b/libc/nt/kernel32/FillConsoleOutputAttribute.S index c203d953b..207966997 100644 --- a/libc/nt/kernel32/FillConsoleOutputAttribute.S +++ b/libc/nt/kernel32/FillConsoleOutputAttribute.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FillConsoleOutputAttribute,FillConsoleOutputAttribute .text.windows - .ftrace1 + .ftrace1 FillConsoleOutputAttribute: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FillConsoleOutputCharacterW.S b/libc/nt/kernel32/FillConsoleOutputCharacterW.S index fd6d0506a..48c05a4cf 100644 --- a/libc/nt/kernel32/FillConsoleOutputCharacterW.S +++ b/libc/nt/kernel32/FillConsoleOutputCharacterW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FillConsoleOutputCharacterW,FillConsoleOutputCharacterW .text.windows - .ftrace1 + .ftrace1 FillConsoleOutputCharacter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FindFirstFileExW.S b/libc/nt/kernel32/FindFirstFileExW.S index 1f0034069..4fbcc1d19 100644 --- a/libc/nt/kernel32/FindFirstFileExW.S +++ b/libc/nt/kernel32/FindFirstFileExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FindFirstFileExW,FindFirstFileExW .text.windows - .ftrace1 + .ftrace1 FindFirstFileEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FindFirstVolumeW.S b/libc/nt/kernel32/FindFirstVolumeW.S index 159805bb3..5336621ca 100644 --- a/libc/nt/kernel32/FindFirstVolumeW.S +++ b/libc/nt/kernel32/FindFirstVolumeW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FindFirstVolumeW,FindFirstVolumeW .text.windows - .ftrace1 + .ftrace1 FindFirstVolume: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FindNextVolumeW.S b/libc/nt/kernel32/FindNextVolumeW.S index 953e8f48a..8b92f61b4 100644 --- a/libc/nt/kernel32/FindNextVolumeW.S +++ b/libc/nt/kernel32/FindNextVolumeW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FindNextVolumeW,FindNextVolumeW .text.windows - .ftrace1 + .ftrace1 FindNextVolume: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FindVolumeClose.S b/libc/nt/kernel32/FindVolumeClose.S index 1ed2789ac..48e40366f 100644 --- a/libc/nt/kernel32/FindVolumeClose.S +++ b/libc/nt/kernel32/FindVolumeClose.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FindVolumeClose,FindVolumeClose .text.windows - .ftrace1 + .ftrace1 FindVolumeClose: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FlushConsoleInputBuffer.S b/libc/nt/kernel32/FlushConsoleInputBuffer.S index e540831e3..974133261 100644 --- a/libc/nt/kernel32/FlushConsoleInputBuffer.S +++ b/libc/nt/kernel32/FlushConsoleInputBuffer.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FlushConsoleInputBuffer,FlushConsoleInputBuffer .text.windows - .ftrace1 + .ftrace1 FlushConsoleInputBuffer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FormatMessageA.S b/libc/nt/kernel32/FormatMessageA.S index 606d454e1..fbb60488f 100644 --- a/libc/nt/kernel32/FormatMessageA.S +++ b/libc/nt/kernel32/FormatMessageA.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FormatMessageA,FormatMessageA .text.windows - .ftrace1 + .ftrace1 FormatMessageA: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/kernel32/FormatMessageW.S b/libc/nt/kernel32/FormatMessageW.S index acf297a03..624178e99 100644 --- a/libc/nt/kernel32/FormatMessageW.S +++ b/libc/nt/kernel32/FormatMessageW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FormatMessageW,FormatMessageW .text.windows - .ftrace1 + .ftrace1 FormatMessage: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FreeConsole.S b/libc/nt/kernel32/FreeConsole.S index 065558a66..156a23bff 100644 --- a/libc/nt/kernel32/FreeConsole.S +++ b/libc/nt/kernel32/FreeConsole.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FreeConsole,FreeConsole .text.windows - .ftrace1 + .ftrace1 FreeConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FreeEnvironmentStringsW.S b/libc/nt/kernel32/FreeEnvironmentStringsW.S index 1530ba325..b023c8c32 100644 --- a/libc/nt/kernel32/FreeEnvironmentStringsW.S +++ b/libc/nt/kernel32/FreeEnvironmentStringsW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FreeEnvironmentStringsW,FreeEnvironmentStringsW .text.windows - .ftrace1 + .ftrace1 FreeEnvironmentStrings: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FreeLibrary.S b/libc/nt/kernel32/FreeLibrary.S index 9a51b3d77..4ed923c2d 100644 --- a/libc/nt/kernel32/FreeLibrary.S +++ b/libc/nt/kernel32/FreeLibrary.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FreeLibrary,FreeLibrary .text.windows - .ftrace1 + .ftrace1 FreeLibrary: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/FreeResource.S b/libc/nt/kernel32/FreeResource.S index 92d1d8a54..ea4ea8fd6 100644 --- a/libc/nt/kernel32/FreeResource.S +++ b/libc/nt/kernel32/FreeResource.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_FreeResource,FreeResource .text.windows - .ftrace1 + .ftrace1 FreeResource: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetCommandLineW.S b/libc/nt/kernel32/GetCommandLineW.S index fba9cf257..d57917bcb 100644 --- a/libc/nt/kernel32/GetCommandLineW.S +++ b/libc/nt/kernel32/GetCommandLineW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetCommandLineW,GetCommandLineW .text.windows - .ftrace1 + .ftrace1 GetCommandLine: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetCompressedFileSizeW.S b/libc/nt/kernel32/GetCompressedFileSizeW.S index f2e936850..875d31074 100644 --- a/libc/nt/kernel32/GetCompressedFileSizeW.S +++ b/libc/nt/kernel32/GetCompressedFileSizeW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetCompressedFileSizeW,GetCompressedFileSizeW .text.windows - .ftrace1 + .ftrace1 GetCompressedFileSize: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetComputerNameExW.S b/libc/nt/kernel32/GetComputerNameExW.S index c2a595bc7..5fcfbb4cf 100644 --- a/libc/nt/kernel32/GetComputerNameExW.S +++ b/libc/nt/kernel32/GetComputerNameExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetComputerNameExW,GetComputerNameExW .text.windows - .ftrace1 + .ftrace1 GetComputerNameEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleCP.S b/libc/nt/kernel32/GetConsoleCP.S index bc913bc1d..47701af68 100644 --- a/libc/nt/kernel32/GetConsoleCP.S +++ b/libc/nt/kernel32/GetConsoleCP.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleCP,GetConsoleCP .text.windows - .ftrace1 + .ftrace1 GetConsoleCP: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleCursorInfo.S 
b/libc/nt/kernel32/GetConsoleCursorInfo.S index bd69dc160..156c08d77 100644 --- a/libc/nt/kernel32/GetConsoleCursorInfo.S +++ b/libc/nt/kernel32/GetConsoleCursorInfo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleCursorInfo,GetConsoleCursorInfo .text.windows - .ftrace1 + .ftrace1 GetConsoleCursorInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleMode.S b/libc/nt/kernel32/GetConsoleMode.S index 0ea264b22..118be8689 100644 --- a/libc/nt/kernel32/GetConsoleMode.S +++ b/libc/nt/kernel32/GetConsoleMode.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleMode,GetConsoleMode .text.windows - .ftrace1 + .ftrace1 GetConsoleMode: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleOutputCP.S b/libc/nt/kernel32/GetConsoleOutputCP.S index 8f5f15873..f24ac9ef8 100644 --- a/libc/nt/kernel32/GetConsoleOutputCP.S +++ b/libc/nt/kernel32/GetConsoleOutputCP.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleOutputCP,GetConsoleOutputCP .text.windows - .ftrace1 + .ftrace1 GetConsoleOutputCP: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleScreenBufferInfo.S b/libc/nt/kernel32/GetConsoleScreenBufferInfo.S index d9bd67a7e..1c1877e25 100644 --- a/libc/nt/kernel32/GetConsoleScreenBufferInfo.S +++ b/libc/nt/kernel32/GetConsoleScreenBufferInfo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleScreenBufferInfo,GetConsoleScreenBufferInfo .text.windows - .ftrace1 + .ftrace1 GetConsoleScreenBufferInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleScreenBufferInfoEx.S b/libc/nt/kernel32/GetConsoleScreenBufferInfoEx.S index 84caa6220..a8e91ee68 100644 --- a/libc/nt/kernel32/GetConsoleScreenBufferInfoEx.S +++ b/libc/nt/kernel32/GetConsoleScreenBufferInfoEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleScreenBufferInfoEx,GetConsoleScreenBufferInfoEx .text.windows - .ftrace1 + .ftrace1 GetConsoleScreenBufferInfoEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleSelectionInfo.S b/libc/nt/kernel32/GetConsoleSelectionInfo.S index 7cbd2092f..69489bf8d 100644 --- a/libc/nt/kernel32/GetConsoleSelectionInfo.S +++ b/libc/nt/kernel32/GetConsoleSelectionInfo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleSelectionInfo,GetConsoleSelectionInfo .text.windows - .ftrace1 + .ftrace1 GetConsoleSelectionInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleTitleW.S b/libc/nt/kernel32/GetConsoleTitleW.S index a05f7482a..3767aedfc 100644 --- a/libc/nt/kernel32/GetConsoleTitleW.S +++ b/libc/nt/kernel32/GetConsoleTitleW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleTitleW,GetConsoleTitleW .text.windows - .ftrace1 + .ftrace1 GetConsoleTitle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetConsoleWindow.S b/libc/nt/kernel32/GetConsoleWindow.S index 6dcad7b82..ef67d908d 100644 --- a/libc/nt/kernel32/GetConsoleWindow.S +++ b/libc/nt/kernel32/GetConsoleWindow.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetConsoleWindow,GetConsoleWindow .text.windows - .ftrace1 + .ftrace1 GetConsoleWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetCurrentDirectoryW.S b/libc/nt/kernel32/GetCurrentDirectoryW.S index ded8786d7..454f47630 100644 --- a/libc/nt/kernel32/GetCurrentDirectoryW.S +++ b/libc/nt/kernel32/GetCurrentDirectoryW.S @@ -2,9 +2,9 
@@ .imp kernel32,__imp_GetCurrentDirectoryW,GetCurrentDirectoryW .text.windows - .ftrace1 + .ftrace1 GetCurrentDirectory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetCurrentProcessId.S b/libc/nt/kernel32/GetCurrentProcessId.S index 805231fa8..330f9865f 100644 --- a/libc/nt/kernel32/GetCurrentProcessId.S +++ b/libc/nt/kernel32/GetCurrentProcessId.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetCurrentProcessId,GetCurrentProcessId .text.windows - .ftrace1 + .ftrace1 GetCurrentProcessId: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp
diff --git a/libc/nt/kernel32/GetCurrentProcessorNumberEx.S b/libc/nt/kernel32/GetCurrentProcessorNumberEx.S
new file mode 100644
index 000000000..c74111ad2
--- /dev/null
+++ b/libc/nt/kernel32/GetCurrentProcessorNumberEx.S
@@ -0,0 +1,20 @@
+#include "libc/nt/codegen.h"
+.imp kernel32,__imp_GetCurrentProcessorNumberEx,GetCurrentProcessorNumberEx
+
+ .text.windows
+ .ftrace1
+GetCurrentProcessorNumberEx:
+ .ftrace2
+#ifdef __x86_64__
+ push %rbp
+ mov %rsp,%rbp
+ mov %rdi,%rcx
+ sub $32,%rsp
+ call *__imp_GetCurrentProcessorNumberEx(%rip)
+ leave
+#elif defined(__aarch64__)
+ mov x0,#0
+#endif
+ ret
+ .endfn GetCurrentProcessorNumberEx,globl
+ .previous
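The new GetCurrentProcessorNumberEx.S stub above follows the same thunk pattern as every other libc/nt import touched by this diff: on x86-64 it adapts the System V calling convention used by Cosmopolitan code to the Microsoft x64 convention expected by KERNEL32, moving the first argument from %rdi into %rcx and reserving the 32 bytes of shadow space the callee may spill its register arguments into, then calling through the __imp_* import-address-table slot; on aarch64, where the Windows code path is never taken, it simply returns zero. Dropping a trailing W from the exported label matches the other wide-character imports in this directory. As a rough sketch only, the same pattern extended to a hypothetical two-argument import would look like the following; SomeApiW and SomeApi are made-up names for illustration and are not part of this change:

#include "libc/nt/codegen.h"
.imp kernel32,__imp_SomeApiW,SomeApiW

 .text.windows
 .ftrace1
SomeApi:
 .ftrace2
#ifdef __x86_64__
 push %rbp                  // set up a conventional frame pointer
 mov %rsp,%rbp
 mov %rdi,%rcx              // System V arg1 -> Microsoft x64 arg1
 mov %rsi,%rdx              // System V arg2 -> Microsoft x64 arg2
 sub $32,%rsp               // 32-byte shadow space required by the Win64 ABI
 call *__imp_SomeApiW(%rip) // indirect call through the import address table
 leave
#elif defined(__aarch64__)
 mov x0,#0                  // Windows path is only reachable on x86-64; return 0
#endif
 ret
 .endfn SomeApi,globl
 .previous

A third or fourth argument would continue the mapping (%rdx into %r8, %rcx into %r9), with the moves done in reverse order so %rcx and %rdx are read before being overwritten.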
diff --git a/libc/nt/kernel32/GetCurrentThread.S b/libc/nt/kernel32/GetCurrentThread.S index e4916d249..18e8e6b7f 100644 --- a/libc/nt/kernel32/GetCurrentThread.S +++ b/libc/nt/kernel32/GetCurrentThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetCurrentThread,GetCurrentThread .text.windows - .ftrace1 + .ftrace1 GetCurrentThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetCurrentThreadId.S b/libc/nt/kernel32/GetCurrentThreadId.S index 7053df5f8..53ba52dbb 100644 --- a/libc/nt/kernel32/GetCurrentThreadId.S +++ b/libc/nt/kernel32/GetCurrentThreadId.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetCurrentThreadId,GetCurrentThreadId .text.windows - .ftrace1 + .ftrace1 GetCurrentThreadId: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetEnvironmentStringsW.S b/libc/nt/kernel32/GetEnvironmentStringsW.S index 7755b6829..b63709b9f 100644 --- a/libc/nt/kernel32/GetEnvironmentStringsW.S +++ b/libc/nt/kernel32/GetEnvironmentStringsW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetEnvironmentStringsW,GetEnvironmentStringsW .text.windows - .ftrace1 + .ftrace1 GetEnvironmentStrings: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetEnvironmentVariableW.S b/libc/nt/kernel32/GetEnvironmentVariableW.S index 45909f189..e87f878b1 100644 --- a/libc/nt/kernel32/GetEnvironmentVariableW.S +++ b/libc/nt/kernel32/GetEnvironmentVariableW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetEnvironmentVariableW,GetEnvironmentVariableW .text.windows - .ftrace1 + .ftrace1 GetEnvironmentVariable: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetExitCodeThread.S b/libc/nt/kernel32/GetExitCodeThread.S index 6eb4bd7e7..871084245 100644 --- a/libc/nt/kernel32/GetExitCodeThread.S +++ b/libc/nt/kernel32/GetExitCodeThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetExitCodeThread,GetExitCodeThread .text.windows - .ftrace1 + .ftrace1 GetExitCodeThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFileAttributesExW.S b/libc/nt/kernel32/GetFileAttributesExW.S index 45f718e41..fbd1f32c3 100644 --- a/libc/nt/kernel32/GetFileAttributesExW.S +++
b/libc/nt/kernel32/GetFileAttributesExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFileAttributesExW,GetFileAttributesExW .text.windows - .ftrace1 + .ftrace1 GetFileAttributesEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFileInformationByHandle.S b/libc/nt/kernel32/GetFileInformationByHandle.S index b819f51e1..49ed3e7d9 100644 --- a/libc/nt/kernel32/GetFileInformationByHandle.S +++ b/libc/nt/kernel32/GetFileInformationByHandle.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFileInformationByHandle,GetFileInformationByHandle .text.windows - .ftrace1 + .ftrace1 GetFileInformationByHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFileInformationByHandleEx.S b/libc/nt/kernel32/GetFileInformationByHandleEx.S index 01862e594..7df36849b 100644 --- a/libc/nt/kernel32/GetFileInformationByHandleEx.S +++ b/libc/nt/kernel32/GetFileInformationByHandleEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFileInformationByHandleEx,GetFileInformationByHandleEx .text.windows - .ftrace1 + .ftrace1 GetFileInformationByHandleEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFileSize.S b/libc/nt/kernel32/GetFileSize.S index 2dfa1b3d0..3f628cf24 100644 --- a/libc/nt/kernel32/GetFileSize.S +++ b/libc/nt/kernel32/GetFileSize.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFileSize,GetFileSize .text.windows - .ftrace1 + .ftrace1 GetFileSize: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFileSizeEx.S b/libc/nt/kernel32/GetFileSizeEx.S index 49749b315..0558a1953 100644 --- a/libc/nt/kernel32/GetFileSizeEx.S +++ b/libc/nt/kernel32/GetFileSizeEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFileSizeEx,GetFileSizeEx .text.windows - .ftrace1 + .ftrace1 GetFileSizeEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFileTime.S b/libc/nt/kernel32/GetFileTime.S index 54c6a88e8..6f7ad4927 100644 --- a/libc/nt/kernel32/GetFileTime.S +++ b/libc/nt/kernel32/GetFileTime.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFileTime,GetFileTime .text.windows - .ftrace1 + .ftrace1 GetFileTime: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFileType.S b/libc/nt/kernel32/GetFileType.S index 00ba964bb..618f9ed01 100644 --- a/libc/nt/kernel32/GetFileType.S +++ b/libc/nt/kernel32/GetFileType.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFileType,GetFileType .text.windows - .ftrace1 + .ftrace1 GetFileType: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFinalPathNameByHandleW.S b/libc/nt/kernel32/GetFinalPathNameByHandleW.S index b1bb54f8b..fa001492c 100644 --- a/libc/nt/kernel32/GetFinalPathNameByHandleW.S +++ b/libc/nt/kernel32/GetFinalPathNameByHandleW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFinalPathNameByHandleW,GetFinalPathNameByHandleW .text.windows - .ftrace1 + .ftrace1 GetFinalPathNameByHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetFullPathNameW.S b/libc/nt/kernel32/GetFullPathNameW.S index 527328019..bb660e7e5 100644 --- a/libc/nt/kernel32/GetFullPathNameW.S +++ b/libc/nt/kernel32/GetFullPathNameW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetFullPathNameW,GetFullPathNameW .text.windows - .ftrace1 + .ftrace1 GetFullPathName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetHandleInformation.S
b/libc/nt/kernel32/GetHandleInformation.S index af7515e39..3210433a0 100644 --- a/libc/nt/kernel32/GetHandleInformation.S +++ b/libc/nt/kernel32/GetHandleInformation.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetHandleInformation,GetHandleInformation .text.windows - .ftrace1 + .ftrace1 GetHandleInformation: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetLargestConsoleWindowSize.S b/libc/nt/kernel32/GetLargestConsoleWindowSize.S index 823facd4a..6aebeb93d 100644 --- a/libc/nt/kernel32/GetLargestConsoleWindowSize.S +++ b/libc/nt/kernel32/GetLargestConsoleWindowSize.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetLargestConsoleWindowSize,GetLargestConsoleWindowSize .text.windows - .ftrace1 + .ftrace1 GetLargestConsoleWindowSize: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetLastError.S b/libc/nt/kernel32/GetLastError.S index 3d2bc4b68..cae8fbb7b 100644 --- a/libc/nt/kernel32/GetLastError.S +++ b/libc/nt/kernel32/GetLastError.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetLastError,GetLastError .text.windows - .ftrace1 + .ftrace1 GetLastError: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetLogicalDrives.S b/libc/nt/kernel32/GetLogicalDrives.S index 0e9dee042..dcbb6ec6d 100644 --- a/libc/nt/kernel32/GetLogicalDrives.S +++ b/libc/nt/kernel32/GetLogicalDrives.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetLogicalDrives,GetLogicalDrives .text.windows - .ftrace1 + .ftrace1 GetLogicalDrives: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetMaximumProcessorCount.S b/libc/nt/kernel32/GetMaximumProcessorCount.S index d031cf453..9a5ebd8cf 100644 --- a/libc/nt/kernel32/GetMaximumProcessorCount.S +++ b/libc/nt/kernel32/GetMaximumProcessorCount.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetMaximumProcessorCount,GetMaximumProcessorCount .text.windows - .ftrace1 + .ftrace1 GetMaximumProcessorCount: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetModuleFileNameW.S b/libc/nt/kernel32/GetModuleFileNameW.S index 6d06660ca..97590b348 100644 --- a/libc/nt/kernel32/GetModuleFileNameW.S +++ b/libc/nt/kernel32/GetModuleFileNameW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetModuleFileNameW,GetModuleFileNameW .text.windows - .ftrace1 + .ftrace1 GetModuleFileName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetModuleHandleA.S b/libc/nt/kernel32/GetModuleHandleA.S index 79c6e3972..305e0c31a 100644 --- a/libc/nt/kernel32/GetModuleHandleA.S +++ b/libc/nt/kernel32/GetModuleHandleA.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetModuleHandleA,GetModuleHandleA .text.windows - .ftrace1 + .ftrace1 GetModuleHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetModuleHandleExW.S b/libc/nt/kernel32/GetModuleHandleExW.S index 6b4ad847f..a351a2cf4 100644 --- a/libc/nt/kernel32/GetModuleHandleExW.S +++ b/libc/nt/kernel32/GetModuleHandleExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetModuleHandleExW,GetModuleHandleExW .text.windows - .ftrace1 + .ftrace1 GetModuleHandleEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetModuleHandleW.S b/libc/nt/kernel32/GetModuleHandleW.S index 98f7dd750..8b207b348 100644 --- a/libc/nt/kernel32/GetModuleHandleW.S +++ b/libc/nt/kernel32/GetModuleHandleW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetModuleHandleW,GetModuleHandleW .text.windows - .ftrace1 + 
.ftrace1 GetModuleHandleW: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetNamedPipeInfo.S b/libc/nt/kernel32/GetNamedPipeInfo.S index 275043c27..c7cc3444d 100644 --- a/libc/nt/kernel32/GetNamedPipeInfo.S +++ b/libc/nt/kernel32/GetNamedPipeInfo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetNamedPipeInfo,GetNamedPipeInfo .text.windows - .ftrace1 + .ftrace1 GetNamedPipeInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetNumberOfConsoleInputEvents.S b/libc/nt/kernel32/GetNumberOfConsoleInputEvents.S index 1299b4dc0..4c18f3a49 100644 --- a/libc/nt/kernel32/GetNumberOfConsoleInputEvents.S +++ b/libc/nt/kernel32/GetNumberOfConsoleInputEvents.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetNumberOfConsoleInputEvents,GetNumberOfConsoleInputEvents .text.windows - .ftrace1 + .ftrace1 GetNumberOfConsoleInputEvents: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetNumberOfConsoleMouseButtons.S b/libc/nt/kernel32/GetNumberOfConsoleMouseButtons.S index 113a2c37e..51cfdad96 100644 --- a/libc/nt/kernel32/GetNumberOfConsoleMouseButtons.S +++ b/libc/nt/kernel32/GetNumberOfConsoleMouseButtons.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetNumberOfConsoleMouseButtons,GetNumberOfConsoleMouseButtons .text.windows - .ftrace1 + .ftrace1 GetNumberOfConsoleMouseButtons: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetOverlappedResult.S b/libc/nt/kernel32/GetOverlappedResult.S index 191241fc5..d8ed51733 100644 --- a/libc/nt/kernel32/GetOverlappedResult.S +++ b/libc/nt/kernel32/GetOverlappedResult.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetOverlappedResult,GetOverlappedResult .text.windows - .ftrace1 + .ftrace1 GetOverlappedResult: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetOverlappedResultEx.S b/libc/nt/kernel32/GetOverlappedResultEx.S index e6e9b6820..53888f243 100644 --- a/libc/nt/kernel32/GetOverlappedResultEx.S +++ b/libc/nt/kernel32/GetOverlappedResultEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetOverlappedResultEx,GetOverlappedResultEx .text.windows - .ftrace1 + .ftrace1 GetOverlappedResultEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetPriorityClass.S b/libc/nt/kernel32/GetPriorityClass.S index 23ce72413..2eab73485 100644 --- a/libc/nt/kernel32/GetPriorityClass.S +++ b/libc/nt/kernel32/GetPriorityClass.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetPriorityClass,GetPriorityClass .text.windows - .ftrace1 + .ftrace1 GetPriorityClass: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcAddress.S b/libc/nt/kernel32/GetProcAddress.S index 8483e3dad..c792fd56c 100644 --- a/libc/nt/kernel32/GetProcAddress.S +++ b/libc/nt/kernel32/GetProcAddress.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcAddress,GetProcAddress .text.windows - .ftrace1 + .ftrace1 GetProcAddress: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessAffinityMask.S b/libc/nt/kernel32/GetProcessAffinityMask.S index b417e6b5e..3bfeaf936 100644 --- a/libc/nt/kernel32/GetProcessAffinityMask.S +++ b/libc/nt/kernel32/GetProcessAffinityMask.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessAffinityMask,GetProcessAffinityMask .text.windows - .ftrace1 + .ftrace1 GetProcessAffinityMask: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/kernel32/GetProcessHandleCount.S b/libc/nt/kernel32/GetProcessHandleCount.S index 85f55877d..2268f19ac 100644 --- a/libc/nt/kernel32/GetProcessHandleCount.S +++ b/libc/nt/kernel32/GetProcessHandleCount.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessHandleCount,GetProcessHandleCount .text.windows - .ftrace1 + .ftrace1 GetProcessHandleCount: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessHeap.S b/libc/nt/kernel32/GetProcessHeap.S index 837bd96a4..df246015a 100644 --- a/libc/nt/kernel32/GetProcessHeap.S +++ b/libc/nt/kernel32/GetProcessHeap.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessHeap,GetProcessHeap .text.windows - .ftrace1 + .ftrace1 GetProcessHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessHeaps.S b/libc/nt/kernel32/GetProcessHeaps.S index 17edbc96d..e68dd8728 100644 --- a/libc/nt/kernel32/GetProcessHeaps.S +++ b/libc/nt/kernel32/GetProcessHeaps.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessHeaps,GetProcessHeaps .text.windows - .ftrace1 + .ftrace1 GetProcessHeaps: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessId.S b/libc/nt/kernel32/GetProcessId.S index ca20798d8..6190236e4 100644 --- a/libc/nt/kernel32/GetProcessId.S +++ b/libc/nt/kernel32/GetProcessId.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessId,GetProcessId .text.windows - .ftrace1 + .ftrace1 GetProcessId: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessIdOfThread.S b/libc/nt/kernel32/GetProcessIdOfThread.S index 548fd2010..d79643c87 100644 --- a/libc/nt/kernel32/GetProcessIdOfThread.S +++ b/libc/nt/kernel32/GetProcessIdOfThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessIdOfThread,GetProcessIdOfThread .text.windows - .ftrace1 + .ftrace1 GetProcessIdOfThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessInformation.S b/libc/nt/kernel32/GetProcessInformation.S index d583f042c..b3f97877a 100644 --- a/libc/nt/kernel32/GetProcessInformation.S +++ b/libc/nt/kernel32/GetProcessInformation.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessInformation,GetProcessInformation .text.windows - .ftrace1 + .ftrace1 GetProcessInformation: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessIoCounters.S b/libc/nt/kernel32/GetProcessIoCounters.S index 11897a883..afb68db00 100644 --- a/libc/nt/kernel32/GetProcessIoCounters.S +++ b/libc/nt/kernel32/GetProcessIoCounters.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessIoCounters,GetProcessIoCounters .text.windows - .ftrace1 + .ftrace1 GetProcessIoCounters: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessPriorityBoost.S b/libc/nt/kernel32/GetProcessPriorityBoost.S index 3a077d873..a6c261989 100644 --- a/libc/nt/kernel32/GetProcessPriorityBoost.S +++ b/libc/nt/kernel32/GetProcessPriorityBoost.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessPriorityBoost,GetProcessPriorityBoost .text.windows - .ftrace1 + .ftrace1 GetProcessPriorityBoost: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessTimes.S b/libc/nt/kernel32/GetProcessTimes.S index 8dcaf1d76..38d7a3e6f 100644 --- a/libc/nt/kernel32/GetProcessTimes.S +++ b/libc/nt/kernel32/GetProcessTimes.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessTimes,GetProcessTimes .text.windows - .ftrace1 + 
.ftrace1 GetProcessTimes: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessWorkingSetSize.S b/libc/nt/kernel32/GetProcessWorkingSetSize.S index ef14007c1..bf3c93edf 100644 --- a/libc/nt/kernel32/GetProcessWorkingSetSize.S +++ b/libc/nt/kernel32/GetProcessWorkingSetSize.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessWorkingSetSize,GetProcessWorkingSetSize .text.windows - .ftrace1 + .ftrace1 GetProcessWorkingSetSize: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetProcessWorkingSetSizeEx.S b/libc/nt/kernel32/GetProcessWorkingSetSizeEx.S index 0321193be..90bbd7ed0 100644 --- a/libc/nt/kernel32/GetProcessWorkingSetSizeEx.S +++ b/libc/nt/kernel32/GetProcessWorkingSetSizeEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetProcessWorkingSetSizeEx,GetProcessWorkingSetSizeEx .text.windows - .ftrace1 + .ftrace1 GetProcessWorkingSetSizeEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetQueuedCompletionStatus.S b/libc/nt/kernel32/GetQueuedCompletionStatus.S index 99f51ae76..26d6b9c1e 100644 --- a/libc/nt/kernel32/GetQueuedCompletionStatus.S +++ b/libc/nt/kernel32/GetQueuedCompletionStatus.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetQueuedCompletionStatus,GetQueuedCompletionStatus .text.windows - .ftrace1 + .ftrace1 GetQueuedCompletionStatus: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetQueuedCompletionStatusEx.S b/libc/nt/kernel32/GetQueuedCompletionStatusEx.S index b818443b9..f9d1f5399 100644 --- a/libc/nt/kernel32/GetQueuedCompletionStatusEx.S +++ b/libc/nt/kernel32/GetQueuedCompletionStatusEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetQueuedCompletionStatusEx,GetQueuedCompletionStatusEx .text.windows - .ftrace1 + .ftrace1 GetQueuedCompletionStatusEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetStartupInfoW.S b/libc/nt/kernel32/GetStartupInfoW.S index 45ffea82e..1a7081ff8 100644 --- a/libc/nt/kernel32/GetStartupInfoW.S +++ b/libc/nt/kernel32/GetStartupInfoW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetStartupInfoW,GetStartupInfoW .text.windows - .ftrace1 + .ftrace1 GetStartupInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetStdHandle.S b/libc/nt/kernel32/GetStdHandle.S index aeb258dab..a09027a25 100644 --- a/libc/nt/kernel32/GetStdHandle.S +++ b/libc/nt/kernel32/GetStdHandle.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetStdHandle,GetStdHandle .text.windows - .ftrace1 + .ftrace1 GetStdHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemDirectoryA.S b/libc/nt/kernel32/GetSystemDirectoryA.S index f4e57e8dc..3fbe9e7c9 100644 --- a/libc/nt/kernel32/GetSystemDirectoryA.S +++ b/libc/nt/kernel32/GetSystemDirectoryA.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemDirectoryA,GetSystemDirectoryA .text.windows - .ftrace1 + .ftrace1 GetSystemDirectoryA: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemDirectoryW.S b/libc/nt/kernel32/GetSystemDirectoryW.S index 556391d51..86301b3cf 100644 --- a/libc/nt/kernel32/GetSystemDirectoryW.S +++ b/libc/nt/kernel32/GetSystemDirectoryW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemDirectoryW,GetSystemDirectoryW .text.windows - .ftrace1 + .ftrace1 GetSystemDirectory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemInfo.S 
b/libc/nt/kernel32/GetSystemInfo.S index 872c7a25f..e4d8e5234 100644 --- a/libc/nt/kernel32/GetSystemInfo.S +++ b/libc/nt/kernel32/GetSystemInfo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemInfo,GetSystemInfo .text.windows - .ftrace1 + .ftrace1 GetSystemInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemTime.S b/libc/nt/kernel32/GetSystemTime.S index 32c8adaa4..e9f52309d 100644 --- a/libc/nt/kernel32/GetSystemTime.S +++ b/libc/nt/kernel32/GetSystemTime.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemTime,GetSystemTime .text.windows - .ftrace1 + .ftrace1 GetSystemTime: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemTimeAdjustment.S b/libc/nt/kernel32/GetSystemTimeAdjustment.S index fde412982..7b96f22b6 100644 --- a/libc/nt/kernel32/GetSystemTimeAdjustment.S +++ b/libc/nt/kernel32/GetSystemTimeAdjustment.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemTimeAdjustment,GetSystemTimeAdjustment .text.windows - .ftrace1 + .ftrace1 GetSystemTimeAdjustment: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemTimeAsFileTime.S b/libc/nt/kernel32/GetSystemTimeAsFileTime.S index 35948f348..6a534c96b 100644 --- a/libc/nt/kernel32/GetSystemTimeAsFileTime.S +++ b/libc/nt/kernel32/GetSystemTimeAsFileTime.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemTimeAsFileTime,GetSystemTimeAsFileTime .text.windows - .ftrace1 + .ftrace1 GetSystemTimeAsFileTime: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemTimePreciseAsFileTime.S b/libc/nt/kernel32/GetSystemTimePreciseAsFileTime.S index a1dd973b8..048390f4b 100644 --- a/libc/nt/kernel32/GetSystemTimePreciseAsFileTime.S +++ b/libc/nt/kernel32/GetSystemTimePreciseAsFileTime.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemTimePreciseAsFileTime,GetSystemTimePreciseAsFileTime .text.windows - .ftrace1 + .ftrace1 GetSystemTimePreciseAsFileTime: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetSystemTimes.S b/libc/nt/kernel32/GetSystemTimes.S index eb506aadc..14980f77c 100644 --- a/libc/nt/kernel32/GetSystemTimes.S +++ b/libc/nt/kernel32/GetSystemTimes.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetSystemTimes,GetSystemTimes .text.windows - .ftrace1 + .ftrace1 GetSystemTimes: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetTempPathA.S b/libc/nt/kernel32/GetTempPathA.S index bb6b9ef0d..f534edbbc 100644 --- a/libc/nt/kernel32/GetTempPathA.S +++ b/libc/nt/kernel32/GetTempPathA.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetTempPathA,GetTempPathA .text.windows - .ftrace1 + .ftrace1 GetTempPathA: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetTempPathW.S b/libc/nt/kernel32/GetTempPathW.S index 5f6e485f9..2eec121d2 100644 --- a/libc/nt/kernel32/GetTempPathW.S +++ b/libc/nt/kernel32/GetTempPathW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetTempPathW,GetTempPathW .text.windows - .ftrace1 + .ftrace1 GetTempPath: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetThreadContext.S b/libc/nt/kernel32/GetThreadContext.S index ce82f481f..de121fcb1 100644 --- a/libc/nt/kernel32/GetThreadContext.S +++ b/libc/nt/kernel32/GetThreadContext.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetThreadContext,GetThreadContext .text.windows - .ftrace1 + .ftrace1 GetThreadContext: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push 
%rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetThreadDescription.S b/libc/nt/kernel32/GetThreadDescription.S index fb3e3b7fc..257d1e89d 100644 --- a/libc/nt/kernel32/GetThreadDescription.S +++ b/libc/nt/kernel32/GetThreadDescription.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetThreadDescription,GetThreadDescription .text.windows - .ftrace1 + .ftrace1 GetThreadDescription: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetThreadIOPendingFlag.S b/libc/nt/kernel32/GetThreadIOPendingFlag.S index 2ba5bac5d..bea685949 100644 --- a/libc/nt/kernel32/GetThreadIOPendingFlag.S +++ b/libc/nt/kernel32/GetThreadIOPendingFlag.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetThreadIOPendingFlag,GetThreadIOPendingFlag .text.windows - .ftrace1 + .ftrace1 GetThreadIOPendingFlag: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetThreadId.S b/libc/nt/kernel32/GetThreadId.S index 314c434a4..8f7b2f11a 100644 --- a/libc/nt/kernel32/GetThreadId.S +++ b/libc/nt/kernel32/GetThreadId.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetThreadId,GetThreadId .text.windows - .ftrace1 + .ftrace1 GetThreadId: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetThreadPriority.S b/libc/nt/kernel32/GetThreadPriority.S index ef7c26182..8a67e00ff 100644 --- a/libc/nt/kernel32/GetThreadPriority.S +++ b/libc/nt/kernel32/GetThreadPriority.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetThreadPriority,GetThreadPriority .text.windows - .ftrace1 + .ftrace1 GetThreadPriority: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetThreadPriorityBoost.S b/libc/nt/kernel32/GetThreadPriorityBoost.S index a3a380b53..575ba85c4 100644 --- a/libc/nt/kernel32/GetThreadPriorityBoost.S +++ b/libc/nt/kernel32/GetThreadPriorityBoost.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetThreadPriorityBoost,GetThreadPriorityBoost .text.windows - .ftrace1 + .ftrace1 GetThreadPriorityBoost: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetThreadTimes.S b/libc/nt/kernel32/GetThreadTimes.S index 32beb39b6..4f7e629a3 100644 --- a/libc/nt/kernel32/GetThreadTimes.S +++ b/libc/nt/kernel32/GetThreadTimes.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetThreadTimes,GetThreadTimes .text.windows - .ftrace1 + .ftrace1 GetThreadTimes: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetTickCount64.S b/libc/nt/kernel32/GetTickCount64.S index 74f5f0aa3..51bd43d7e 100644 --- a/libc/nt/kernel32/GetTickCount64.S +++ b/libc/nt/kernel32/GetTickCount64.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetTickCount64,GetTickCount64 .text.windows - .ftrace1 + .ftrace1 GetTickCount64: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetVersionExW.S b/libc/nt/kernel32/GetVersionExW.S index f3b3e9ec1..1de5915b1 100644 --- a/libc/nt/kernel32/GetVersionExW.S +++ b/libc/nt/kernel32/GetVersionExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetVersionExW,GetVersionExW .text.windows - .ftrace1 + .ftrace1 GetVersionEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetVolumeInformationByHandleW.S b/libc/nt/kernel32/GetVolumeInformationByHandleW.S index a5225aaa6..f67a6c853 100644 --- a/libc/nt/kernel32/GetVolumeInformationByHandleW.S +++ b/libc/nt/kernel32/GetVolumeInformationByHandleW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetVolumeInformationByHandleW,GetVolumeInformationByHandleW 
.text.windows - .ftrace1 + .ftrace1 GetVolumeInformationByHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetVolumePathNameW.S b/libc/nt/kernel32/GetVolumePathNameW.S index a01e20290..ebef7f971 100644 --- a/libc/nt/kernel32/GetVolumePathNameW.S +++ b/libc/nt/kernel32/GetVolumePathNameW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetVolumePathNameW,GetVolumePathNameW .text.windows - .ftrace1 + .ftrace1 GetVolumePathName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetWindowsDirectoryA.S b/libc/nt/kernel32/GetWindowsDirectoryA.S index 1769878b3..6c3d716ae 100644 --- a/libc/nt/kernel32/GetWindowsDirectoryA.S +++ b/libc/nt/kernel32/GetWindowsDirectoryA.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetWindowsDirectoryA,GetWindowsDirectoryA .text.windows - .ftrace1 + .ftrace1 GetWindowsDirectoryA: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GetWindowsDirectoryW.S b/libc/nt/kernel32/GetWindowsDirectoryW.S index 90d47b544..9812360b6 100644 --- a/libc/nt/kernel32/GetWindowsDirectoryW.S +++ b/libc/nt/kernel32/GetWindowsDirectoryW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GetWindowsDirectoryW,GetWindowsDirectoryW .text.windows - .ftrace1 + .ftrace1 GetWindowsDirectory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GlobalAlloc.S b/libc/nt/kernel32/GlobalAlloc.S index 7d86847e2..d75f749bb 100644 --- a/libc/nt/kernel32/GlobalAlloc.S +++ b/libc/nt/kernel32/GlobalAlloc.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GlobalAlloc,GlobalAlloc .text.windows - .ftrace1 + .ftrace1 GlobalAlloc: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GlobalFree.S b/libc/nt/kernel32/GlobalFree.S index 053a835f0..f59f7ab32 100644 --- a/libc/nt/kernel32/GlobalFree.S +++ b/libc/nt/kernel32/GlobalFree.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GlobalFree,GlobalFree .text.windows - .ftrace1 + .ftrace1 GlobalFree: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/GlobalMemoryStatusEx.S b/libc/nt/kernel32/GlobalMemoryStatusEx.S index ea561086e..d9d243531 100644 --- a/libc/nt/kernel32/GlobalMemoryStatusEx.S +++ b/libc/nt/kernel32/GlobalMemoryStatusEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_GlobalMemoryStatusEx,GlobalMemoryStatusEx .text.windows - .ftrace1 + .ftrace1 GlobalMemoryStatusEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/HeapAlloc.S b/libc/nt/kernel32/HeapAlloc.S index 06852b07f..600b5b9f4 100644 --- a/libc/nt/kernel32/HeapAlloc.S +++ b/libc/nt/kernel32/HeapAlloc.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_HeapAlloc,HeapAlloc .text.windows - .ftrace1 + .ftrace1 HeapAlloc: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/HeapCompact.S b/libc/nt/kernel32/HeapCompact.S index a2b217189..ec3109670 100644 --- a/libc/nt/kernel32/HeapCompact.S +++ b/libc/nt/kernel32/HeapCompact.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_HeapCompact,HeapCompact .text.windows - .ftrace1 + .ftrace1 HeapCompact: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/HeapCreate.S b/libc/nt/kernel32/HeapCreate.S index 3cd3684bd..acd5a5165 100644 --- a/libc/nt/kernel32/HeapCreate.S +++ b/libc/nt/kernel32/HeapCreate.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_HeapCreate,HeapCreate .text.windows - .ftrace1 + .ftrace1 HeapCreate: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov 
%rsp,%rbp diff --git a/libc/nt/kernel32/HeapDestroy.S b/libc/nt/kernel32/HeapDestroy.S index 1b04d3054..ca2a8fa26 100644 --- a/libc/nt/kernel32/HeapDestroy.S +++ b/libc/nt/kernel32/HeapDestroy.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_HeapDestroy,HeapDestroy .text.windows - .ftrace1 + .ftrace1 HeapDestroy: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/HeapFree.S b/libc/nt/kernel32/HeapFree.S index a12281c98..b46422031 100644 --- a/libc/nt/kernel32/HeapFree.S +++ b/libc/nt/kernel32/HeapFree.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_HeapFree,HeapFree .text.windows - .ftrace1 + .ftrace1 HeapFree: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/HeapReAlloc.S b/libc/nt/kernel32/HeapReAlloc.S index 90279161b..b65a06791 100644 --- a/libc/nt/kernel32/HeapReAlloc.S +++ b/libc/nt/kernel32/HeapReAlloc.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_HeapReAlloc,HeapReAlloc .text.windows - .ftrace1 + .ftrace1 HeapReAlloc: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/InitializeContext.S b/libc/nt/kernel32/InitializeContext.S index 9572b9c9b..940c789db 100644 --- a/libc/nt/kernel32/InitializeContext.S +++ b/libc/nt/kernel32/InitializeContext.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_InitializeContext,InitializeContext .text.windows - .ftrace1 + .ftrace1 InitializeContext: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/InitializeCriticalSection.S b/libc/nt/kernel32/InitializeCriticalSection.S index d63020993..20f259b71 100644 --- a/libc/nt/kernel32/InitializeCriticalSection.S +++ b/libc/nt/kernel32/InitializeCriticalSection.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_InitializeCriticalSection,InitializeCriticalSection .text.windows - .ftrace1 + .ftrace1 InitializeCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/InitializeCriticalSectionAndSpinCount.S b/libc/nt/kernel32/InitializeCriticalSectionAndSpinCount.S index 9eb08109c..589228b4f 100644 --- a/libc/nt/kernel32/InitializeCriticalSectionAndSpinCount.S +++ b/libc/nt/kernel32/InitializeCriticalSectionAndSpinCount.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_InitializeCriticalSectionAndSpinCount,InitializeCriticalSectionAndSpinCount .text.windows - .ftrace1 + .ftrace1 InitializeCriticalSectionAndSpinCount: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/InitializeProcThreadAttributeList.S b/libc/nt/kernel32/InitializeProcThreadAttributeList.S index 9955b5a2f..242b32f6e 100644 --- a/libc/nt/kernel32/InitializeProcThreadAttributeList.S +++ b/libc/nt/kernel32/InitializeProcThreadAttributeList.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_InitializeProcThreadAttributeList,InitializeProcThreadAttributeList .text.windows - .ftrace1 + .ftrace1 InitializeProcThreadAttributeList: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/InitializeSRWLock.S b/libc/nt/kernel32/InitializeSRWLock.S index 945ef21c8..b210f893a 100644 --- a/libc/nt/kernel32/InitializeSRWLock.S +++ b/libc/nt/kernel32/InitializeSRWLock.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_InitializeSRWLock,InitializeSRWLock .text.windows - .ftrace1 + .ftrace1 InitializeSRWLock: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/LeaveCriticalSection.S b/libc/nt/kernel32/LeaveCriticalSection.S index f53990f7f..32ddd03cc 100644 --- a/libc/nt/kernel32/LeaveCriticalSection.S +++ 
b/libc/nt/kernel32/LeaveCriticalSection.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_LeaveCriticalSection,LeaveCriticalSection .text.windows - .ftrace1 + .ftrace1 LeaveCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/LoadLibraryExW.S b/libc/nt/kernel32/LoadLibraryExW.S index 70bada6ec..40dbd0f9d 100644 --- a/libc/nt/kernel32/LoadLibraryExW.S +++ b/libc/nt/kernel32/LoadLibraryExW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_LoadLibraryExW,LoadLibraryExW .text.windows - .ftrace1 + .ftrace1 LoadLibraryEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/LoadLibraryW.S b/libc/nt/kernel32/LoadLibraryW.S index 34e2bb653..49532cdf7 100644 --- a/libc/nt/kernel32/LoadLibraryW.S +++ b/libc/nt/kernel32/LoadLibraryW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_LoadLibraryW,LoadLibraryW .text.windows - .ftrace1 + .ftrace1 LoadLibrary: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/LoadResource.S b/libc/nt/kernel32/LoadResource.S index 9622a8794..066a61232 100644 --- a/libc/nt/kernel32/LoadResource.S +++ b/libc/nt/kernel32/LoadResource.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_LoadResource,LoadResource .text.windows - .ftrace1 + .ftrace1 LoadResource: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/LocalFree.S b/libc/nt/kernel32/LocalFree.S index cce25cbfc..f2840207e 100644 --- a/libc/nt/kernel32/LocalFree.S +++ b/libc/nt/kernel32/LocalFree.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_LocalFree,LocalFree .text.windows - .ftrace1 + .ftrace1 LocalFree: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/LockFile.S b/libc/nt/kernel32/LockFile.S index 68a07c2cd..3cbeb13b6 100644 --- a/libc/nt/kernel32/LockFile.S +++ b/libc/nt/kernel32/LockFile.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_LockFile,LockFile .text.windows - .ftrace1 + .ftrace1 LockFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/LockResource.S b/libc/nt/kernel32/LockResource.S index 17a59073d..3fc297ecd 100644 --- a/libc/nt/kernel32/LockResource.S +++ b/libc/nt/kernel32/LockResource.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_LockResource,LockResource .text.windows - .ftrace1 + .ftrace1 LockResource: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/MoveFileW.S b/libc/nt/kernel32/MoveFileW.S index b2d2d097b..7b6bfd0de 100644 --- a/libc/nt/kernel32/MoveFileW.S +++ b/libc/nt/kernel32/MoveFileW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_MoveFileW,MoveFileW .text.windows - .ftrace1 + .ftrace1 MoveFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/MultiByteToWideChar.S b/libc/nt/kernel32/MultiByteToWideChar.S index b0a55dbd1..ebf45c0ee 100644 --- a/libc/nt/kernel32/MultiByteToWideChar.S +++ b/libc/nt/kernel32/MultiByteToWideChar.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_MultiByteToWideChar,MultiByteToWideChar .text.windows - .ftrace1 + .ftrace1 MultiByteToWideChar: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/OfferVirtualMemory.S b/libc/nt/kernel32/OfferVirtualMemory.S index 135d5602f..587c18399 100644 --- a/libc/nt/kernel32/OfferVirtualMemory.S +++ b/libc/nt/kernel32/OfferVirtualMemory.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_OfferVirtualMemory,OfferVirtualMemory .text.windows - .ftrace1 + .ftrace1 OfferVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov 
%rsp,%rbp diff --git a/libc/nt/kernel32/OpenThread.S b/libc/nt/kernel32/OpenThread.S index c3e91e37f..b888eecb8 100644 --- a/libc/nt/kernel32/OpenThread.S +++ b/libc/nt/kernel32/OpenThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_OpenThread,OpenThread .text.windows - .ftrace1 + .ftrace1 OpenThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/PeekConsoleInputW.S b/libc/nt/kernel32/PeekConsoleInputW.S index 7f9c8bfad..ce0634390 100644 --- a/libc/nt/kernel32/PeekConsoleInputW.S +++ b/libc/nt/kernel32/PeekConsoleInputW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_PeekConsoleInputW,PeekConsoleInputW .text.windows - .ftrace1 + .ftrace1 PeekConsoleInput: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/PeekNamedPipe.S b/libc/nt/kernel32/PeekNamedPipe.S index fddf83325..5fac0b263 100644 --- a/libc/nt/kernel32/PeekNamedPipe.S +++ b/libc/nt/kernel32/PeekNamedPipe.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_PeekNamedPipe,PeekNamedPipe .text.windows - .ftrace1 + .ftrace1 PeekNamedPipe: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/PostQueuedCompletionStatus.S b/libc/nt/kernel32/PostQueuedCompletionStatus.S index 510e29893..cd33720be 100644 --- a/libc/nt/kernel32/PostQueuedCompletionStatus.S +++ b/libc/nt/kernel32/PostQueuedCompletionStatus.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_PostQueuedCompletionStatus,PostQueuedCompletionStatus .text.windows - .ftrace1 + .ftrace1 PostQueuedCompletionStatus: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/PrefetchVirtualMemory.S b/libc/nt/kernel32/PrefetchVirtualMemory.S index 7b4b51469..06a47589b 100644 --- a/libc/nt/kernel32/PrefetchVirtualMemory.S +++ b/libc/nt/kernel32/PrefetchVirtualMemory.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_PrefetchVirtualMemory,PrefetchVirtualMemory .text.windows - .ftrace1 + .ftrace1 PrefetchVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/Process32FirstW.S b/libc/nt/kernel32/Process32FirstW.S index e14adf67d..a69cf479a 100644 --- a/libc/nt/kernel32/Process32FirstW.S +++ b/libc/nt/kernel32/Process32FirstW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_Process32FirstW,Process32FirstW .text.windows - .ftrace1 + .ftrace1 Process32First: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/Process32NextW.S b/libc/nt/kernel32/Process32NextW.S index 4b48edc08..ad8f287c0 100644 --- a/libc/nt/kernel32/Process32NextW.S +++ b/libc/nt/kernel32/Process32NextW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_Process32NextW,Process32NextW .text.windows - .ftrace1 + .ftrace1 Process32Next: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/PulseEvent.S b/libc/nt/kernel32/PulseEvent.S index 0e64e6c79..c62d6cab6 100644 --- a/libc/nt/kernel32/PulseEvent.S +++ b/libc/nt/kernel32/PulseEvent.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_PulseEvent,PulseEvent .text.windows - .ftrace1 + .ftrace1 PulseEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/PurgeComm.S b/libc/nt/kernel32/PurgeComm.S index f7372cf6b..9c364634e 100644 --- a/libc/nt/kernel32/PurgeComm.S +++ b/libc/nt/kernel32/PurgeComm.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_PurgeComm,PurgeComm .text.windows - .ftrace1 + .ftrace1 PurgeComm: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/QueryPerformanceCounter.S 
b/libc/nt/kernel32/QueryPerformanceCounter.S index bb589ec47..989bc1cbb 100644 --- a/libc/nt/kernel32/QueryPerformanceCounter.S +++ b/libc/nt/kernel32/QueryPerformanceCounter.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_QueryPerformanceCounter,QueryPerformanceCounter .text.windows - .ftrace1 + .ftrace1 QueryPerformanceCounter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/QueryPerformanceFrequency.S b/libc/nt/kernel32/QueryPerformanceFrequency.S index d462c74f2..7d8b034b1 100644 --- a/libc/nt/kernel32/QueryPerformanceFrequency.S +++ b/libc/nt/kernel32/QueryPerformanceFrequency.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_QueryPerformanceFrequency,QueryPerformanceFrequency .text.windows - .ftrace1 + .ftrace1 QueryPerformanceFrequency: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadConsoleInputW.S b/libc/nt/kernel32/ReadConsoleInputW.S index 33d47ffaa..c2a09efc6 100644 --- a/libc/nt/kernel32/ReadConsoleInputW.S +++ b/libc/nt/kernel32/ReadConsoleInputW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadConsoleInputW,ReadConsoleInputW .text.windows - .ftrace1 + .ftrace1 ReadConsoleInput: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadConsoleOutputAttribute.S b/libc/nt/kernel32/ReadConsoleOutputAttribute.S index b7bf491a4..dc07c64db 100644 --- a/libc/nt/kernel32/ReadConsoleOutputAttribute.S +++ b/libc/nt/kernel32/ReadConsoleOutputAttribute.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadConsoleOutputAttribute,ReadConsoleOutputAttribute .text.windows - .ftrace1 + .ftrace1 ReadConsoleOutputAttribute: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadConsoleOutputCharacterW.S b/libc/nt/kernel32/ReadConsoleOutputCharacterW.S index 644f48f99..12a3bbe3e 100644 --- a/libc/nt/kernel32/ReadConsoleOutputCharacterW.S +++ b/libc/nt/kernel32/ReadConsoleOutputCharacterW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadConsoleOutputCharacterW,ReadConsoleOutputCharacterW .text.windows - .ftrace1 + .ftrace1 ReadConsoleOutputCharacter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadConsoleOutputW.S b/libc/nt/kernel32/ReadConsoleOutputW.S index 660fe46c5..8229d6fbe 100644 --- a/libc/nt/kernel32/ReadConsoleOutputW.S +++ b/libc/nt/kernel32/ReadConsoleOutputW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadConsoleOutputW,ReadConsoleOutputW .text.windows - .ftrace1 + .ftrace1 ReadConsoleOutput: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadConsoleW.S b/libc/nt/kernel32/ReadConsoleW.S index 3c4d37f5b..da80b54d7 100644 --- a/libc/nt/kernel32/ReadConsoleW.S +++ b/libc/nt/kernel32/ReadConsoleW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadConsoleW,ReadConsoleW .text.windows - .ftrace1 + .ftrace1 ReadConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadFile.S b/libc/nt/kernel32/ReadFile.S index 7545e86d7..0ea807703 100644 --- a/libc/nt/kernel32/ReadFile.S +++ b/libc/nt/kernel32/ReadFile.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadFile,ReadFile .text.windows - .ftrace1 + .ftrace1 ReadFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadFileEx.S b/libc/nt/kernel32/ReadFileEx.S index 20ea1407d..71e7a522c 100644 --- a/libc/nt/kernel32/ReadFileEx.S +++ b/libc/nt/kernel32/ReadFileEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadFileEx,ReadFileEx .text.windows - .ftrace1 + .ftrace1 
ReadFileEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReadFileScatter.S b/libc/nt/kernel32/ReadFileScatter.S index 3a8c446f9..66d1ca4c5 100644 --- a/libc/nt/kernel32/ReadFileScatter.S +++ b/libc/nt/kernel32/ReadFileScatter.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReadFileScatter,ReadFileScatter .text.windows - .ftrace1 + .ftrace1 ReadFileScatter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/RegisterWaitForSingleObject.S b/libc/nt/kernel32/RegisterWaitForSingleObject.S index cd35863a7..982d302b3 100644 --- a/libc/nt/kernel32/RegisterWaitForSingleObject.S +++ b/libc/nt/kernel32/RegisterWaitForSingleObject.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_RegisterWaitForSingleObject,RegisterWaitForSingleObject .text.windows - .ftrace1 + .ftrace1 RegisterWaitForSingleObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReleaseMutex.S b/libc/nt/kernel32/ReleaseMutex.S index 1885b68b2..6be8f53d5 100644 --- a/libc/nt/kernel32/ReleaseMutex.S +++ b/libc/nt/kernel32/ReleaseMutex.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReleaseMutex,ReleaseMutex .text.windows - .ftrace1 + .ftrace1 ReleaseMutex: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReleaseSRWLockExclusive.S b/libc/nt/kernel32/ReleaseSRWLockExclusive.S index 8dfe5c8f1..b321d4979 100644 --- a/libc/nt/kernel32/ReleaseSRWLockExclusive.S +++ b/libc/nt/kernel32/ReleaseSRWLockExclusive.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReleaseSRWLockExclusive,ReleaseSRWLockExclusive .text.windows - .ftrace1 + .ftrace1 ReleaseSRWLockExclusive: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReleaseSRWLockShared.S b/libc/nt/kernel32/ReleaseSRWLockShared.S index 017510c65..3b227d6a3 100644 --- a/libc/nt/kernel32/ReleaseSRWLockShared.S +++ b/libc/nt/kernel32/ReleaseSRWLockShared.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReleaseSRWLockShared,ReleaseSRWLockShared .text.windows - .ftrace1 + .ftrace1 ReleaseSRWLockShared: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ReleaseSemaphore.S b/libc/nt/kernel32/ReleaseSemaphore.S index a85596760..b93c4a7e3 100644 --- a/libc/nt/kernel32/ReleaseSemaphore.S +++ b/libc/nt/kernel32/ReleaseSemaphore.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ReleaseSemaphore,ReleaseSemaphore .text.windows - .ftrace1 + .ftrace1 ReleaseSemaphore: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/RemoveVectoredContinueHandler.S b/libc/nt/kernel32/RemoveVectoredContinueHandler.S index 57e1f519c..e9d243abc 100644 --- a/libc/nt/kernel32/RemoveVectoredContinueHandler.S +++ b/libc/nt/kernel32/RemoveVectoredContinueHandler.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_RemoveVectoredContinueHandler,RemoveVectoredContinueHandler .text.windows - .ftrace1 + .ftrace1 RemoveVectoredContinueHandler: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/RemoveVectoredExceptionHandler.S b/libc/nt/kernel32/RemoveVectoredExceptionHandler.S index 98d2f4508..120a946a6 100644 --- a/libc/nt/kernel32/RemoveVectoredExceptionHandler.S +++ b/libc/nt/kernel32/RemoveVectoredExceptionHandler.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_RemoveVectoredExceptionHandler,RemoveVectoredExceptionHandler .text.windows - .ftrace1 + .ftrace1 RemoveVectoredExceptionHandler: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/kernel32/ResetEvent.S b/libc/nt/kernel32/ResetEvent.S index 3f1aac3b0..1c5c3b743 100644 --- a/libc/nt/kernel32/ResetEvent.S +++ b/libc/nt/kernel32/ResetEvent.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ResetEvent,ResetEvent .text.windows - .ftrace1 + .ftrace1 ResetEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ResizePseudoConsole.S b/libc/nt/kernel32/ResizePseudoConsole.S index b10b73d47..d0124326e 100644 --- a/libc/nt/kernel32/ResizePseudoConsole.S +++ b/libc/nt/kernel32/ResizePseudoConsole.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ResizePseudoConsole,ResizePseudoConsole .text.windows - .ftrace1 + .ftrace1 ResizePseudoConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/ResumeThread.S b/libc/nt/kernel32/ResumeThread.S index 1f8a3304a..482221e2b 100644 --- a/libc/nt/kernel32/ResumeThread.S +++ b/libc/nt/kernel32/ResumeThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_ResumeThread,ResumeThread .text.windows - .ftrace1 + .ftrace1 ResumeThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleActiveScreenBuffer.S b/libc/nt/kernel32/SetConsoleActiveScreenBuffer.S index 9abeba181..237d1d10b 100644 --- a/libc/nt/kernel32/SetConsoleActiveScreenBuffer.S +++ b/libc/nt/kernel32/SetConsoleActiveScreenBuffer.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleActiveScreenBuffer,SetConsoleActiveScreenBuffer .text.windows - .ftrace1 + .ftrace1 SetConsoleActiveScreenBuffer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleCP.S b/libc/nt/kernel32/SetConsoleCP.S index f93b57b9e..5aebfd378 100644 --- a/libc/nt/kernel32/SetConsoleCP.S +++ b/libc/nt/kernel32/SetConsoleCP.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleCP,SetConsoleCP .text.windows - .ftrace1 + .ftrace1 SetConsoleCP: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleCtrlHandler.S b/libc/nt/kernel32/SetConsoleCtrlHandler.S index 32474be79..df3c779fd 100644 --- a/libc/nt/kernel32/SetConsoleCtrlHandler.S +++ b/libc/nt/kernel32/SetConsoleCtrlHandler.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleCtrlHandler,SetConsoleCtrlHandler .text.windows - .ftrace1 + .ftrace1 SetConsoleCtrlHandler: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleCursorInfo.S b/libc/nt/kernel32/SetConsoleCursorInfo.S index 4d3c8bba4..8b920f069 100644 --- a/libc/nt/kernel32/SetConsoleCursorInfo.S +++ b/libc/nt/kernel32/SetConsoleCursorInfo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleCursorInfo,SetConsoleCursorInfo .text.windows - .ftrace1 + .ftrace1 SetConsoleCursorInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleCursorPosition.S b/libc/nt/kernel32/SetConsoleCursorPosition.S index 5f7cccdf8..13ae99d28 100644 --- a/libc/nt/kernel32/SetConsoleCursorPosition.S +++ b/libc/nt/kernel32/SetConsoleCursorPosition.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleCursorPosition,SetConsoleCursorPosition .text.windows - .ftrace1 + .ftrace1 SetConsoleCursorPosition: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleMode.S b/libc/nt/kernel32/SetConsoleMode.S index b51311cf4..326caced1 100644 --- a/libc/nt/kernel32/SetConsoleMode.S +++ b/libc/nt/kernel32/SetConsoleMode.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleMode,SetConsoleMode .text.windows - .ftrace1 + 
.ftrace1 SetConsoleMode: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleOutputCP.S b/libc/nt/kernel32/SetConsoleOutputCP.S index 8fda814bc..d3440ae7c 100644 --- a/libc/nt/kernel32/SetConsoleOutputCP.S +++ b/libc/nt/kernel32/SetConsoleOutputCP.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleOutputCP,SetConsoleOutputCP .text.windows - .ftrace1 + .ftrace1 SetConsoleOutputCP: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleScreenBufferInfoEx.S b/libc/nt/kernel32/SetConsoleScreenBufferInfoEx.S index 6e80e657e..047038853 100644 --- a/libc/nt/kernel32/SetConsoleScreenBufferInfoEx.S +++ b/libc/nt/kernel32/SetConsoleScreenBufferInfoEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleScreenBufferInfoEx,SetConsoleScreenBufferInfoEx .text.windows - .ftrace1 + .ftrace1 SetConsoleScreenBufferInfoEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleScreenBufferSize.S b/libc/nt/kernel32/SetConsoleScreenBufferSize.S index 059df3c98..915ead0c4 100644 --- a/libc/nt/kernel32/SetConsoleScreenBufferSize.S +++ b/libc/nt/kernel32/SetConsoleScreenBufferSize.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleScreenBufferSize,SetConsoleScreenBufferSize .text.windows - .ftrace1 + .ftrace1 SetConsoleScreenBufferSize: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleTitleW.S b/libc/nt/kernel32/SetConsoleTitleW.S index 8a29ae7a6..72969928b 100644 --- a/libc/nt/kernel32/SetConsoleTitleW.S +++ b/libc/nt/kernel32/SetConsoleTitleW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleTitleW,SetConsoleTitleW .text.windows - .ftrace1 + .ftrace1 SetConsoleTitle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetConsoleWindowInfo.S b/libc/nt/kernel32/SetConsoleWindowInfo.S index 4c143788b..7c37d08f7 100644 --- a/libc/nt/kernel32/SetConsoleWindowInfo.S +++ b/libc/nt/kernel32/SetConsoleWindowInfo.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetConsoleWindowInfo,SetConsoleWindowInfo .text.windows - .ftrace1 + .ftrace1 SetConsoleWindowInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetCriticalSectionSpinCount.S b/libc/nt/kernel32/SetCriticalSectionSpinCount.S index c57b93ac7..ab6324254 100644 --- a/libc/nt/kernel32/SetCriticalSectionSpinCount.S +++ b/libc/nt/kernel32/SetCriticalSectionSpinCount.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetCriticalSectionSpinCount,SetCriticalSectionSpinCount .text.windows - .ftrace1 + .ftrace1 SetCriticalSectionSpinCount: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetDefaultDllDirectories.S b/libc/nt/kernel32/SetDefaultDllDirectories.S index 29c14a04e..ff9458813 100644 --- a/libc/nt/kernel32/SetDefaultDllDirectories.S +++ b/libc/nt/kernel32/SetDefaultDllDirectories.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetDefaultDllDirectories,SetDefaultDllDirectories .text.windows - .ftrace1 + .ftrace1 SetDefaultDllDirectories: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetEndOfFile.S b/libc/nt/kernel32/SetEndOfFile.S index 46d6fb951..15b28001b 100644 --- a/libc/nt/kernel32/SetEndOfFile.S +++ b/libc/nt/kernel32/SetEndOfFile.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetEndOfFile,SetEndOfFile .text.windows - .ftrace1 + .ftrace1 SetEndOfFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/kernel32/SetEnvironmentVariableW.S b/libc/nt/kernel32/SetEnvironmentVariableW.S index ba7418c30..2aac41c69 100644 --- a/libc/nt/kernel32/SetEnvironmentVariableW.S +++ b/libc/nt/kernel32/SetEnvironmentVariableW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetEnvironmentVariableW,SetEnvironmentVariableW .text.windows - .ftrace1 + .ftrace1 SetEnvironmentVariable: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetErrorMode.S b/libc/nt/kernel32/SetErrorMode.S index e0f59392b..b58769b22 100644 --- a/libc/nt/kernel32/SetErrorMode.S +++ b/libc/nt/kernel32/SetErrorMode.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetErrorMode,SetErrorMode .text.windows - .ftrace1 + .ftrace1 SetErrorMode: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetEvent.S b/libc/nt/kernel32/SetEvent.S index aba6fe17b..62d86495b 100644 --- a/libc/nt/kernel32/SetEvent.S +++ b/libc/nt/kernel32/SetEvent.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetEvent,SetEvent .text.windows - .ftrace1 + .ftrace1 SetEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetFileAttributesW.S b/libc/nt/kernel32/SetFileAttributesW.S index 08962e132..97f3457ff 100644 --- a/libc/nt/kernel32/SetFileAttributesW.S +++ b/libc/nt/kernel32/SetFileAttributesW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetFileAttributesW,SetFileAttributesW .text.windows - .ftrace1 + .ftrace1 SetFileAttributes: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetFileCompletionNotificationModes.S b/libc/nt/kernel32/SetFileCompletionNotificationModes.S index 6fd0025e1..6e964fa9e 100644 --- a/libc/nt/kernel32/SetFileCompletionNotificationModes.S +++ b/libc/nt/kernel32/SetFileCompletionNotificationModes.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetFileCompletionNotificationModes,SetFileCompletionNotificationModes .text.windows - .ftrace1 + .ftrace1 SetFileCompletionNotificationModes: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetFileInformationByHandle.S b/libc/nt/kernel32/SetFileInformationByHandle.S index f18905dc8..9275f4199 100644 --- a/libc/nt/kernel32/SetFileInformationByHandle.S +++ b/libc/nt/kernel32/SetFileInformationByHandle.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetFileInformationByHandle,SetFileInformationByHandle .text.windows - .ftrace1 + .ftrace1 SetFileInformationByHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetFileTime.S b/libc/nt/kernel32/SetFileTime.S index 3ab65e142..eb382f3e3 100644 --- a/libc/nt/kernel32/SetFileTime.S +++ b/libc/nt/kernel32/SetFileTime.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetFileTime,SetFileTime .text.windows - .ftrace1 + .ftrace1 SetFileTime: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetFileValidData.S b/libc/nt/kernel32/SetFileValidData.S index 2de257d47..d1143f4af 100644 --- a/libc/nt/kernel32/SetFileValidData.S +++ b/libc/nt/kernel32/SetFileValidData.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetFileValidData,SetFileValidData .text.windows - .ftrace1 + .ftrace1 SetFileValidData: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetHandleCount.S b/libc/nt/kernel32/SetHandleCount.S index 23fe29c3a..d62cd872b 100644 --- a/libc/nt/kernel32/SetHandleCount.S +++ b/libc/nt/kernel32/SetHandleCount.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetHandleCount,SetHandleCount .text.windows - 
.ftrace1 + .ftrace1 SetHandleCount: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetHandleInformation.S b/libc/nt/kernel32/SetHandleInformation.S index 3acaf4676..763806e90 100644 --- a/libc/nt/kernel32/SetHandleInformation.S +++ b/libc/nt/kernel32/SetHandleInformation.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetHandleInformation,SetHandleInformation .text.windows - .ftrace1 + .ftrace1 SetHandleInformation: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetLastError.S b/libc/nt/kernel32/SetLastError.S index 7795f0025..7eca74f1a 100644 --- a/libc/nt/kernel32/SetLastError.S +++ b/libc/nt/kernel32/SetLastError.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetLastError,SetLastError .text.windows - .ftrace1 + .ftrace1 SetLastError: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetNamedPipeHandleState.S b/libc/nt/kernel32/SetNamedPipeHandleState.S index a391c8aba..0548bd2ab 100644 --- a/libc/nt/kernel32/SetNamedPipeHandleState.S +++ b/libc/nt/kernel32/SetNamedPipeHandleState.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetNamedPipeHandleState,SetNamedPipeHandleState .text.windows - .ftrace1 + .ftrace1 SetNamedPipeHandleState: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetPriorityClass.S b/libc/nt/kernel32/SetPriorityClass.S index 5a22616f4..1e0e8b849 100644 --- a/libc/nt/kernel32/SetPriorityClass.S +++ b/libc/nt/kernel32/SetPriorityClass.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetPriorityClass,SetPriorityClass .text.windows - .ftrace1 + .ftrace1 SetPriorityClass: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetProcessAffinityMask.S b/libc/nt/kernel32/SetProcessAffinityMask.S index d633041b6..f1aa4b735 100644 --- a/libc/nt/kernel32/SetProcessAffinityMask.S +++ b/libc/nt/kernel32/SetProcessAffinityMask.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetProcessAffinityMask,SetProcessAffinityMask .text.windows - .ftrace1 + .ftrace1 SetProcessAffinityMask: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetProcessPriorityBoost.S b/libc/nt/kernel32/SetProcessPriorityBoost.S index 236dfbf56..df0412c91 100644 --- a/libc/nt/kernel32/SetProcessPriorityBoost.S +++ b/libc/nt/kernel32/SetProcessPriorityBoost.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetProcessPriorityBoost,SetProcessPriorityBoost .text.windows - .ftrace1 + .ftrace1 SetProcessPriorityBoost: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetProcessWorkingSetSize.S b/libc/nt/kernel32/SetProcessWorkingSetSize.S index 1e395d4ea..df69b9248 100644 --- a/libc/nt/kernel32/SetProcessWorkingSetSize.S +++ b/libc/nt/kernel32/SetProcessWorkingSetSize.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetProcessWorkingSetSize,SetProcessWorkingSetSize .text.windows - .ftrace1 + .ftrace1 SetProcessWorkingSetSize: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetProcessWorkingSetSizeEx.S b/libc/nt/kernel32/SetProcessWorkingSetSizeEx.S index af63d2467..cd2e72886 100644 --- a/libc/nt/kernel32/SetProcessWorkingSetSizeEx.S +++ b/libc/nt/kernel32/SetProcessWorkingSetSizeEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetProcessWorkingSetSizeEx,SetProcessWorkingSetSizeEx .text.windows - .ftrace1 + .ftrace1 SetProcessWorkingSetSizeEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetStdHandle.S 
b/libc/nt/kernel32/SetStdHandle.S index 491acd189..77080b71e 100644 --- a/libc/nt/kernel32/SetStdHandle.S +++ b/libc/nt/kernel32/SetStdHandle.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetStdHandle,SetStdHandle .text.windows - .ftrace1 + .ftrace1 SetStdHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetThreadAffinityMask.S b/libc/nt/kernel32/SetThreadAffinityMask.S index ef79ef00c..26d456ea2 100644 --- a/libc/nt/kernel32/SetThreadAffinityMask.S +++ b/libc/nt/kernel32/SetThreadAffinityMask.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetThreadAffinityMask,SetThreadAffinityMask .text.windows - .ftrace1 + .ftrace1 SetThreadAffinityMask: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetThreadContext.S b/libc/nt/kernel32/SetThreadContext.S index c6a0582be..ef80f0f58 100644 --- a/libc/nt/kernel32/SetThreadContext.S +++ b/libc/nt/kernel32/SetThreadContext.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetThreadContext,SetThreadContext .text.windows - .ftrace1 + .ftrace1 SetThreadContext: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetThreadDescription.S b/libc/nt/kernel32/SetThreadDescription.S index 882e14d40..260240d1c 100644 --- a/libc/nt/kernel32/SetThreadDescription.S +++ b/libc/nt/kernel32/SetThreadDescription.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetThreadDescription,SetThreadDescription .text.windows - .ftrace1 + .ftrace1 SetThreadDescription: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetThreadPriority.S b/libc/nt/kernel32/SetThreadPriority.S index be988c61a..f2349b4ad 100644 --- a/libc/nt/kernel32/SetThreadPriority.S +++ b/libc/nt/kernel32/SetThreadPriority.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetThreadPriority,SetThreadPriority .text.windows - .ftrace1 + .ftrace1 SetThreadPriority: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetThreadPriorityBoost.S b/libc/nt/kernel32/SetThreadPriorityBoost.S index 05de3cfba..5c9cda446 100644 --- a/libc/nt/kernel32/SetThreadPriorityBoost.S +++ b/libc/nt/kernel32/SetThreadPriorityBoost.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetThreadPriorityBoost,SetThreadPriorityBoost .text.windows - .ftrace1 + .ftrace1 SetThreadPriorityBoost: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetUnhandledExceptionFilter.S b/libc/nt/kernel32/SetUnhandledExceptionFilter.S index 88f92d927..cb55e3b93 100644 --- a/libc/nt/kernel32/SetUnhandledExceptionFilter.S +++ b/libc/nt/kernel32/SetUnhandledExceptionFilter.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetUnhandledExceptionFilter,SetUnhandledExceptionFilter .text.windows - .ftrace1 + .ftrace1 SetUnhandledExceptionFilter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SetWaitableTimer.S b/libc/nt/kernel32/SetWaitableTimer.S index d9aec1545..e3a6e7f45 100644 --- a/libc/nt/kernel32/SetWaitableTimer.S +++ b/libc/nt/kernel32/SetWaitableTimer.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SetWaitableTimer,SetWaitableTimer .text.windows - .ftrace1 + .ftrace1 SetWaitableTimer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/Sleep.S b/libc/nt/kernel32/Sleep.S index 834758f76..4df691484 100644 --- a/libc/nt/kernel32/Sleep.S +++ b/libc/nt/kernel32/Sleep.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_Sleep,Sleep .text.windows - .ftrace1 + .ftrace1 Sleep: - .ftrace2 + .ftrace2 #ifdef __x86_64__ 
push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SleepEx.S b/libc/nt/kernel32/SleepEx.S index 95c8e88ae..273503d02 100644 --- a/libc/nt/kernel32/SleepEx.S +++ b/libc/nt/kernel32/SleepEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SleepEx,SleepEx .text.windows - .ftrace1 + .ftrace1 SleepEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SuspendThread.S b/libc/nt/kernel32/SuspendThread.S index 9d1a16392..2b9ed8af5 100644 --- a/libc/nt/kernel32/SuspendThread.S +++ b/libc/nt/kernel32/SuspendThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SuspendThread,SuspendThread .text.windows - .ftrace1 + .ftrace1 SuspendThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/SystemTimeToFileTime.S b/libc/nt/kernel32/SystemTimeToFileTime.S index 8c2d563e9..01329fdcc 100644 --- a/libc/nt/kernel32/SystemTimeToFileTime.S +++ b/libc/nt/kernel32/SystemTimeToFileTime.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_SystemTimeToFileTime,SystemTimeToFileTime .text.windows - .ftrace1 + .ftrace1 SystemTimeToFileTime: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TerminateThread.S b/libc/nt/kernel32/TerminateThread.S index 860e51cad..05b0449f6 100644 --- a/libc/nt/kernel32/TerminateThread.S +++ b/libc/nt/kernel32/TerminateThread.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TerminateThread,TerminateThread .text.windows - .ftrace1 + .ftrace1 TerminateThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TlsAlloc.S b/libc/nt/kernel32/TlsAlloc.S index 6a102b0e1..bf7ddfd7e 100644 --- a/libc/nt/kernel32/TlsAlloc.S +++ b/libc/nt/kernel32/TlsAlloc.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TlsAlloc,TlsAlloc .text.windows - .ftrace1 + .ftrace1 TlsAlloc: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TlsFree.S b/libc/nt/kernel32/TlsFree.S index 0bd15a28b..251a88ae0 100644 --- a/libc/nt/kernel32/TlsFree.S +++ b/libc/nt/kernel32/TlsFree.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TlsFree,TlsFree .text.windows - .ftrace1 + .ftrace1 TlsFree: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TlsGetValue.S b/libc/nt/kernel32/TlsGetValue.S index bc7ed14e0..98287c9e6 100644 --- a/libc/nt/kernel32/TlsGetValue.S +++ b/libc/nt/kernel32/TlsGetValue.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TlsGetValue,TlsGetValue .text.windows - .ftrace1 + .ftrace1 TlsGetValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TlsSetValue.S b/libc/nt/kernel32/TlsSetValue.S index f2a8dfc9b..1fedd6e02 100644 --- a/libc/nt/kernel32/TlsSetValue.S +++ b/libc/nt/kernel32/TlsSetValue.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TlsSetValue,TlsSetValue .text.windows - .ftrace1 + .ftrace1 TlsSetValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TransactNamedPipe.S b/libc/nt/kernel32/TransactNamedPipe.S index 4ec2f8d77..826528a32 100644 --- a/libc/nt/kernel32/TransactNamedPipe.S +++ b/libc/nt/kernel32/TransactNamedPipe.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TransactNamedPipe,TransactNamedPipe .text.windows - .ftrace1 + .ftrace1 TransactNamedPipe: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TransmitCommChar.S b/libc/nt/kernel32/TransmitCommChar.S index 29d611362..3f676b5c0 100644 --- a/libc/nt/kernel32/TransmitCommChar.S +++ b/libc/nt/kernel32/TransmitCommChar.S @@ -2,9 +2,9 @@ .imp 
kernel32,__imp_TransmitCommChar,TransmitCommChar .text.windows - .ftrace1 + .ftrace1 TransmitCommChar: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TryAcquireSRWLockExclusive.S b/libc/nt/kernel32/TryAcquireSRWLockExclusive.S index 954526f87..887cd90b1 100644 --- a/libc/nt/kernel32/TryAcquireSRWLockExclusive.S +++ b/libc/nt/kernel32/TryAcquireSRWLockExclusive.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TryAcquireSRWLockExclusive,TryAcquireSRWLockExclusive .text.windows - .ftrace1 + .ftrace1 TryAcquireSRWLockExclusive: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TryAcquireSRWLockShared.S b/libc/nt/kernel32/TryAcquireSRWLockShared.S index 88996c799..9d54d9e36 100644 --- a/libc/nt/kernel32/TryAcquireSRWLockShared.S +++ b/libc/nt/kernel32/TryAcquireSRWLockShared.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TryAcquireSRWLockShared,TryAcquireSRWLockShared .text.windows - .ftrace1 + .ftrace1 TryAcquireSRWLockShared: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/TryEnterCriticalSection.S b/libc/nt/kernel32/TryEnterCriticalSection.S index beb6c91c6..04adab81e 100644 --- a/libc/nt/kernel32/TryEnterCriticalSection.S +++ b/libc/nt/kernel32/TryEnterCriticalSection.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_TryEnterCriticalSection,TryEnterCriticalSection .text.windows - .ftrace1 + .ftrace1 TryEnterCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/UnlockFile.S b/libc/nt/kernel32/UnlockFile.S index fdad93773..0413691c9 100644 --- a/libc/nt/kernel32/UnlockFile.S +++ b/libc/nt/kernel32/UnlockFile.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_UnlockFile,UnlockFile .text.windows - .ftrace1 + .ftrace1 UnlockFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/UnmapViewOfFile2.S b/libc/nt/kernel32/UnmapViewOfFile2.S index 975c07ac4..a407fe3a1 100644 --- a/libc/nt/kernel32/UnmapViewOfFile2.S +++ b/libc/nt/kernel32/UnmapViewOfFile2.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_UnmapViewOfFile2,UnmapViewOfFile2 .text.windows - .ftrace1 + .ftrace1 UnmapViewOfFile2: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/UnmapViewOfFileEx.S b/libc/nt/kernel32/UnmapViewOfFileEx.S index e4b331eea..edb3c93ff 100644 --- a/libc/nt/kernel32/UnmapViewOfFileEx.S +++ b/libc/nt/kernel32/UnmapViewOfFileEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_UnmapViewOfFileEx,UnmapViewOfFileEx .text.windows - .ftrace1 + .ftrace1 UnmapViewOfFileEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/UpdateProcThreadAttribute.S b/libc/nt/kernel32/UpdateProcThreadAttribute.S index 3dc818403..53a4d3855 100644 --- a/libc/nt/kernel32/UpdateProcThreadAttribute.S +++ b/libc/nt/kernel32/UpdateProcThreadAttribute.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_UpdateProcThreadAttribute,UpdateProcThreadAttribute .text.windows - .ftrace1 + .ftrace1 UpdateProcThreadAttribute: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/VirtualAlloc.S b/libc/nt/kernel32/VirtualAlloc.S index 5804c218c..f8e5f815a 100644 --- a/libc/nt/kernel32/VirtualAlloc.S +++ b/libc/nt/kernel32/VirtualAlloc.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_VirtualAlloc,VirtualAlloc .text.windows - .ftrace1 + .ftrace1 VirtualAlloc: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/VirtualAllocEx.S 
b/libc/nt/kernel32/VirtualAllocEx.S index 7facb7203..bdf00950b 100644 --- a/libc/nt/kernel32/VirtualAllocEx.S +++ b/libc/nt/kernel32/VirtualAllocEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_VirtualAllocEx,VirtualAllocEx .text.windows - .ftrace1 + .ftrace1 VirtualAllocEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/VirtualFree.S b/libc/nt/kernel32/VirtualFree.S index 3fa3e6b40..aa6bdcff6 100644 --- a/libc/nt/kernel32/VirtualFree.S +++ b/libc/nt/kernel32/VirtualFree.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_VirtualFree,VirtualFree .text.windows - .ftrace1 + .ftrace1 VirtualFree: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/VirtualLock.S b/libc/nt/kernel32/VirtualLock.S index c364a7e7b..8ab9b8a55 100644 --- a/libc/nt/kernel32/VirtualLock.S +++ b/libc/nt/kernel32/VirtualLock.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_VirtualLock,VirtualLock .text.windows - .ftrace1 + .ftrace1 VirtualLock: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/VirtualQuery.S b/libc/nt/kernel32/VirtualQuery.S index 1c2afee63..83b6be896 100644 --- a/libc/nt/kernel32/VirtualQuery.S +++ b/libc/nt/kernel32/VirtualQuery.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_VirtualQuery,VirtualQuery .text.windows - .ftrace1 + .ftrace1 VirtualQuery: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/VirtualUnlock.S b/libc/nt/kernel32/VirtualUnlock.S index 7b1d0eb5d..6524d114a 100644 --- a/libc/nt/kernel32/VirtualUnlock.S +++ b/libc/nt/kernel32/VirtualUnlock.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_VirtualUnlock,VirtualUnlock .text.windows - .ftrace1 + .ftrace1 VirtualUnlock: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WaitForMultipleObjectsEx.S b/libc/nt/kernel32/WaitForMultipleObjectsEx.S index 57b8440d4..649cd5498 100644 --- a/libc/nt/kernel32/WaitForMultipleObjectsEx.S +++ b/libc/nt/kernel32/WaitForMultipleObjectsEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WaitForMultipleObjectsEx,WaitForMultipleObjectsEx .text.windows - .ftrace1 + .ftrace1 WaitForMultipleObjectsEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WaitForSingleObjectEx.S b/libc/nt/kernel32/WaitForSingleObjectEx.S index fed788b60..185904048 100644 --- a/libc/nt/kernel32/WaitForSingleObjectEx.S +++ b/libc/nt/kernel32/WaitForSingleObjectEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WaitForSingleObjectEx,WaitForSingleObjectEx .text.windows - .ftrace1 + .ftrace1 WaitForSingleObjectEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WideCharToMultiByte.S b/libc/nt/kernel32/WideCharToMultiByte.S index 0584d99fd..e78202569 100644 --- a/libc/nt/kernel32/WideCharToMultiByte.S +++ b/libc/nt/kernel32/WideCharToMultiByte.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WideCharToMultiByte,WideCharToMultiByte .text.windows - .ftrace1 + .ftrace1 WideCharToMultiByte: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WriteConsoleInputW.S b/libc/nt/kernel32/WriteConsoleInputW.S index 0ebeea059..bd60a478d 100644 --- a/libc/nt/kernel32/WriteConsoleInputW.S +++ b/libc/nt/kernel32/WriteConsoleInputW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WriteConsoleInputW,WriteConsoleInputW .text.windows - .ftrace1 + .ftrace1 WriteConsoleInput: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WriteConsoleOutputAttribute.S 
b/libc/nt/kernel32/WriteConsoleOutputAttribute.S index 1c0708bcb..a38b50279 100644 --- a/libc/nt/kernel32/WriteConsoleOutputAttribute.S +++ b/libc/nt/kernel32/WriteConsoleOutputAttribute.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WriteConsoleOutputAttribute,WriteConsoleOutputAttribute .text.windows - .ftrace1 + .ftrace1 WriteConsoleOutputAttribute: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WriteConsoleOutputCharacterW.S b/libc/nt/kernel32/WriteConsoleOutputCharacterW.S index 5d902b977..27390e116 100644 --- a/libc/nt/kernel32/WriteConsoleOutputCharacterW.S +++ b/libc/nt/kernel32/WriteConsoleOutputCharacterW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WriteConsoleOutputCharacterW,WriteConsoleOutputCharacterW .text.windows - .ftrace1 + .ftrace1 WriteConsoleOutputCharacter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WriteConsoleW.S b/libc/nt/kernel32/WriteConsoleW.S index 2733d8b51..21a0a533a 100644 --- a/libc/nt/kernel32/WriteConsoleW.S +++ b/libc/nt/kernel32/WriteConsoleW.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WriteConsoleW,WriteConsoleW .text.windows - .ftrace1 + .ftrace1 WriteConsole: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WriteFile.S b/libc/nt/kernel32/WriteFile.S index 424cf1e93..2f2f79f7b 100644 --- a/libc/nt/kernel32/WriteFile.S +++ b/libc/nt/kernel32/WriteFile.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WriteFile,WriteFile .text.windows - .ftrace1 + .ftrace1 WriteFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WriteFileEx.S b/libc/nt/kernel32/WriteFileEx.S index 9e04c9efa..357dea7cc 100644 --- a/libc/nt/kernel32/WriteFileEx.S +++ b/libc/nt/kernel32/WriteFileEx.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WriteFileEx,WriteFileEx .text.windows - .ftrace1 + .ftrace1 WriteFileEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/kernel32/WriteFileGather.S b/libc/nt/kernel32/WriteFileGather.S index 5cdab48aa..e9c101e91 100644 --- a/libc/nt/kernel32/WriteFileGather.S +++ b/libc/nt/kernel32/WriteFileGather.S @@ -2,9 +2,9 @@ .imp kernel32,__imp_WriteFileGather,WriteFileGather .text.windows - .ftrace1 + .ftrace1 WriteFileGather: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/master.sh b/libc/nt/master.sh index daee92599..11d9e4528 100755 --- a/libc/nt/master.sh +++ b/libc/nt/master.sh @@ -109,6 +109,7 @@ imp 'GetConsoleTitle' GetConsoleTitleW kernel32 2 imp 'GetConsoleWindow' GetConsoleWindow kernel32 0 imp 'GetCurrentDirectory' GetCurrentDirectoryW kernel32 2 imp 'GetCurrentProcessId' GetCurrentProcessId kernel32 0 +imp 'GetCurrentProcessorNumberEx' GetCurrentProcessorNumberEx kernel32 1 imp 'GetCurrentThread' GetCurrentThread kernel32 0 imp 'GetCurrentThreadId' GetCurrentThreadId kernel32 0 imp 'GetEnvironmentStrings' GetEnvironmentStringsW kernel32 1 diff --git a/libc/nt/ntdll/CsrClientCallServer.S b/libc/nt/ntdll/CsrClientCallServer.S index b473dc813..14cf2ea24 100644 --- a/libc/nt/ntdll/CsrClientCallServer.S +++ b/libc/nt/ntdll/CsrClientCallServer.S @@ -2,9 +2,9 @@ .ntimp CsrClientCallServer,CsrClientCallServer .text.windows - .ftrace1 + .ftrace1 CsrClientCallServer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/LdrGetDllHandle.S b/libc/nt/ntdll/LdrGetDllHandle.S index e33fe507f..19023300e 100644 --- a/libc/nt/ntdll/LdrGetDllHandle.S +++ b/libc/nt/ntdll/LdrGetDllHandle.S @@ -2,9 +2,9 @@ 
.ntimp LdrGetDllHandle,LdrGetDllHandle .text.windows - .ftrace1 + .ftrace1 LdrGetDllHandle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/LdrGetProcedureAddress.S b/libc/nt/ntdll/LdrGetProcedureAddress.S index 31b91a3dd..6cae0e056 100644 --- a/libc/nt/ntdll/LdrGetProcedureAddress.S +++ b/libc/nt/ntdll/LdrGetProcedureAddress.S @@ -2,9 +2,9 @@ .ntimp LdrGetProcedureAddress,LdrGetProcedureAddress .text.windows - .ftrace1 + .ftrace1 LdrGetProcedureAddress: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/LdrLoadDll.S b/libc/nt/ntdll/LdrLoadDll.S index 3db73813f..0993d8dda 100644 --- a/libc/nt/ntdll/LdrLoadDll.S +++ b/libc/nt/ntdll/LdrLoadDll.S @@ -2,9 +2,9 @@ .ntimp LdrLoadDll,LdrLoadDll .text.windows - .ftrace1 + .ftrace1 LdrLoadDll: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/LdrUnloadDll.S b/libc/nt/ntdll/LdrUnloadDll.S index 0594710da..1e37eaba7 100644 --- a/libc/nt/ntdll/LdrUnloadDll.S +++ b/libc/nt/ntdll/LdrUnloadDll.S @@ -2,9 +2,9 @@ .ntimp LdrUnloadDll,LdrUnloadDll .text.windows - .ftrace1 + .ftrace1 LdrUnloadDll: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtAllocateVirtualMemory.S b/libc/nt/ntdll/NtAllocateVirtualMemory.S index 3fafe36a7..20c853279 100644 --- a/libc/nt/ntdll/NtAllocateVirtualMemory.S +++ b/libc/nt/ntdll/NtAllocateVirtualMemory.S @@ -2,9 +2,9 @@ .ntimp NtAllocateVirtualMemory,NtAllocateVirtualMemory .text.windows - .ftrace1 + .ftrace1 NtAllocateVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCallbackReturn.S b/libc/nt/ntdll/NtCallbackReturn.S index b314b30df..bbb5853b0 100644 --- a/libc/nt/ntdll/NtCallbackReturn.S +++ b/libc/nt/ntdll/NtCallbackReturn.S @@ -2,9 +2,9 @@ .ntimp NtCallbackReturn,NtCallbackReturn .text.windows - .ftrace1 + .ftrace1 NtCallbackReturn: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCancelIoFile.S b/libc/nt/ntdll/NtCancelIoFile.S index 32e79cb4c..506e09427 100644 --- a/libc/nt/ntdll/NtCancelIoFile.S +++ b/libc/nt/ntdll/NtCancelIoFile.S @@ -2,9 +2,9 @@ .ntimp NtCancelIoFile,NtCancelIoFile .text.windows - .ftrace1 + .ftrace1 NtCancelIoFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCancelIoFileEx.S b/libc/nt/ntdll/NtCancelIoFileEx.S index 79c343d6d..8ebba92d0 100644 --- a/libc/nt/ntdll/NtCancelIoFileEx.S +++ b/libc/nt/ntdll/NtCancelIoFileEx.S @@ -2,9 +2,9 @@ .ntimp NtCancelIoFileEx,NtCancelIoFileEx .text.windows - .ftrace1 + .ftrace1 NtCancelIoFileEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtClearEvent.S b/libc/nt/ntdll/NtClearEvent.S index 223b5cd01..a03633ea9 100644 --- a/libc/nt/ntdll/NtClearEvent.S +++ b/libc/nt/ntdll/NtClearEvent.S @@ -2,9 +2,9 @@ .ntimp NtClearEvent,NtClearEvent .text.windows - .ftrace1 + .ftrace1 NtClearEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtClose.S b/libc/nt/ntdll/NtClose.S index 022f5f4bb..7481f1ea4 100644 --- a/libc/nt/ntdll/NtClose.S +++ b/libc/nt/ntdll/NtClose.S @@ -2,9 +2,9 @@ .ntimp NtClose,NtClose .text.windows - .ftrace1 + .ftrace1 NtClose: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtContinue.S b/libc/nt/ntdll/NtContinue.S index 547962034..c68c394b0 100644 --- a/libc/nt/ntdll/NtContinue.S +++ b/libc/nt/ntdll/NtContinue.S 
@@ -2,9 +2,9 @@ .ntimp NtContinue,NtContinue .text.windows - .ftrace1 + .ftrace1 NtContinue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateDirectoryObject.S b/libc/nt/ntdll/NtCreateDirectoryObject.S index 763434e47..d20e663d0 100644 --- a/libc/nt/ntdll/NtCreateDirectoryObject.S +++ b/libc/nt/ntdll/NtCreateDirectoryObject.S @@ -2,9 +2,9 @@ .ntimp NtCreateDirectoryObject,NtCreateDirectoryObject .text.windows - .ftrace1 + .ftrace1 NtCreateDirectoryObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateEvent.S b/libc/nt/ntdll/NtCreateEvent.S index 0226a39b8..1fdb5c6ec 100644 --- a/libc/nt/ntdll/NtCreateEvent.S +++ b/libc/nt/ntdll/NtCreateEvent.S @@ -2,9 +2,9 @@ .ntimp NtCreateEvent,NtCreateEvent .text.windows - .ftrace1 + .ftrace1 NtCreateEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateFile.S b/libc/nt/ntdll/NtCreateFile.S index 668c4e789..a2c513539 100644 --- a/libc/nt/ntdll/NtCreateFile.S +++ b/libc/nt/ntdll/NtCreateFile.S @@ -2,9 +2,9 @@ .ntimp NtCreateFile,NtCreateFile .text.windows - .ftrace1 + .ftrace1 NtCreateFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateIoCompletion.S b/libc/nt/ntdll/NtCreateIoCompletion.S index f182398b0..d4dd2fafd 100644 --- a/libc/nt/ntdll/NtCreateIoCompletion.S +++ b/libc/nt/ntdll/NtCreateIoCompletion.S @@ -2,9 +2,9 @@ .ntimp NtCreateIoCompletion,NtCreateIoCompletion .text.windows - .ftrace1 + .ftrace1 NtCreateIoCompletion: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateKey.S b/libc/nt/ntdll/NtCreateKey.S index e9c1c7775..f95c7dacc 100644 --- a/libc/nt/ntdll/NtCreateKey.S +++ b/libc/nt/ntdll/NtCreateKey.S @@ -2,9 +2,9 @@ .ntimp NtCreateKey,NtCreateKey .text.windows - .ftrace1 + .ftrace1 NtCreateKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateKeyedEvent.S b/libc/nt/ntdll/NtCreateKeyedEvent.S index f9e89cf10..ff26e15e1 100644 --- a/libc/nt/ntdll/NtCreateKeyedEvent.S +++ b/libc/nt/ntdll/NtCreateKeyedEvent.S @@ -2,9 +2,9 @@ .ntimp NtCreateKeyedEvent,NtCreateKeyedEvent .text.windows - .ftrace1 + .ftrace1 NtCreateKeyedEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateNamedPipeFile.S b/libc/nt/ntdll/NtCreateNamedPipeFile.S index 29f17c9b0..b2e065805 100644 --- a/libc/nt/ntdll/NtCreateNamedPipeFile.S +++ b/libc/nt/ntdll/NtCreateNamedPipeFile.S @@ -2,9 +2,9 @@ .ntimp NtCreateNamedPipeFile,NtCreateNamedPipeFile .text.windows - .ftrace1 + .ftrace1 NtCreateNamedPipeFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateProcess.S b/libc/nt/ntdll/NtCreateProcess.S index aa3e19157..e4f58b853 100644 --- a/libc/nt/ntdll/NtCreateProcess.S +++ b/libc/nt/ntdll/NtCreateProcess.S @@ -2,9 +2,9 @@ .ntimp NtCreateProcess,NtCreateProcess .text.windows - .ftrace1 + .ftrace1 NtCreateProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateProfile.S b/libc/nt/ntdll/NtCreateProfile.S index 524176af2..81dbe5383 100644 --- a/libc/nt/ntdll/NtCreateProfile.S +++ b/libc/nt/ntdll/NtCreateProfile.S @@ -2,9 +2,9 @@ .ntimp NtCreateProfile,NtCreateProfile .text.windows - .ftrace1 + .ftrace1 NtCreateProfile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateSection.S 
b/libc/nt/ntdll/NtCreateSection.S index df56c3194..2b41c4b10 100644 --- a/libc/nt/ntdll/NtCreateSection.S +++ b/libc/nt/ntdll/NtCreateSection.S @@ -2,9 +2,9 @@ .ntimp NtCreateSection,NtCreateSection .text.windows - .ftrace1 + .ftrace1 NtCreateSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateThread.S b/libc/nt/ntdll/NtCreateThread.S index 3129a0791..6f893567a 100644 --- a/libc/nt/ntdll/NtCreateThread.S +++ b/libc/nt/ntdll/NtCreateThread.S @@ -2,9 +2,9 @@ .ntimp NtCreateThread,NtCreateThread .text.windows - .ftrace1 + .ftrace1 NtCreateThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtCreateTimer.S b/libc/nt/ntdll/NtCreateTimer.S index c304211fc..caa7d06fa 100644 --- a/libc/nt/ntdll/NtCreateTimer.S +++ b/libc/nt/ntdll/NtCreateTimer.S @@ -2,9 +2,9 @@ .ntimp NtCreateTimer,NtCreateTimer .text.windows - .ftrace1 + .ftrace1 NtCreateTimer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtDelayExecution.S b/libc/nt/ntdll/NtDelayExecution.S index 8667a26d9..f25ae8418 100644 --- a/libc/nt/ntdll/NtDelayExecution.S +++ b/libc/nt/ntdll/NtDelayExecution.S @@ -2,9 +2,9 @@ .ntimp NtDelayExecution,NtDelayExecution .text.windows - .ftrace1 + .ftrace1 NtDelayExecution: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtDeleteFile.S b/libc/nt/ntdll/NtDeleteFile.S index bb5ac189b..7b1a804b1 100644 --- a/libc/nt/ntdll/NtDeleteFile.S +++ b/libc/nt/ntdll/NtDeleteFile.S @@ -2,9 +2,9 @@ .ntimp NtDeleteFile,NtDeleteFile .text.windows - .ftrace1 + .ftrace1 NtDeleteFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtDeleteKey.S b/libc/nt/ntdll/NtDeleteKey.S index 3702086a2..031a2ef25 100644 --- a/libc/nt/ntdll/NtDeleteKey.S +++ b/libc/nt/ntdll/NtDeleteKey.S @@ -2,9 +2,9 @@ .ntimp NtDeleteKey,NtDeleteKey .text.windows - .ftrace1 + .ftrace1 NtDeleteKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtDeviceIoControlFile.S b/libc/nt/ntdll/NtDeviceIoControlFile.S index 1958f7507..5a3869850 100644 --- a/libc/nt/ntdll/NtDeviceIoControlFile.S +++ b/libc/nt/ntdll/NtDeviceIoControlFile.S @@ -2,9 +2,9 @@ .ntimp NtDeviceIoControlFile,NtDeviceIoControlFile .text.windows - .ftrace1 + .ftrace1 NtDeviceIoControlFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtDuplicateObject.S b/libc/nt/ntdll/NtDuplicateObject.S index 6e286f1eb..681dc880c 100644 --- a/libc/nt/ntdll/NtDuplicateObject.S +++ b/libc/nt/ntdll/NtDuplicateObject.S @@ -2,9 +2,9 @@ .ntimp NtDuplicateObject,NtDuplicateObject .text.windows - .ftrace1 + .ftrace1 NtDuplicateObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtEnumerateKey.S b/libc/nt/ntdll/NtEnumerateKey.S index 04532e705..19f84dc7b 100644 --- a/libc/nt/ntdll/NtEnumerateKey.S +++ b/libc/nt/ntdll/NtEnumerateKey.S @@ -2,9 +2,9 @@ .ntimp NtEnumerateKey,NtEnumerateKey .text.windows - .ftrace1 + .ftrace1 NtEnumerateKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtEnumerateValueKey.S b/libc/nt/ntdll/NtEnumerateValueKey.S index 521db2b8c..00832f40a 100644 --- a/libc/nt/ntdll/NtEnumerateValueKey.S +++ b/libc/nt/ntdll/NtEnumerateValueKey.S @@ -2,9 +2,9 @@ .ntimp NtEnumerateValueKey,NtEnumerateValueKey .text.windows - .ftrace1 + .ftrace1 NtEnumerateValueKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ 
push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtFlushBuffersFile.S b/libc/nt/ntdll/NtFlushBuffersFile.S index abe9e4dcd..37e58ccc3 100644 --- a/libc/nt/ntdll/NtFlushBuffersFile.S +++ b/libc/nt/ntdll/NtFlushBuffersFile.S @@ -2,9 +2,9 @@ .ntimp NtFlushBuffersFile,NtFlushBuffersFile .text.windows - .ftrace1 + .ftrace1 NtFlushBuffersFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtFlushInstructionCache.S b/libc/nt/ntdll/NtFlushInstructionCache.S index fe7dcf48c..65dd0e152 100644 --- a/libc/nt/ntdll/NtFlushInstructionCache.S +++ b/libc/nt/ntdll/NtFlushInstructionCache.S @@ -2,9 +2,9 @@ .ntimp NtFlushInstructionCache,NtFlushInstructionCache .text.windows - .ftrace1 + .ftrace1 NtFlushInstructionCache: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtFlushKey.S b/libc/nt/ntdll/NtFlushKey.S index c990a309a..f23081531 100644 --- a/libc/nt/ntdll/NtFlushKey.S +++ b/libc/nt/ntdll/NtFlushKey.S @@ -2,9 +2,9 @@ .ntimp NtFlushKey,NtFlushKey .text.windows - .ftrace1 + .ftrace1 NtFlushKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtFlushVirtualMemory.S b/libc/nt/ntdll/NtFlushVirtualMemory.S index 8d6331fad..a6c0eb9fc 100644 --- a/libc/nt/ntdll/NtFlushVirtualMemory.S +++ b/libc/nt/ntdll/NtFlushVirtualMemory.S @@ -2,9 +2,9 @@ .ntimp NtFlushVirtualMemory,NtFlushVirtualMemory .text.windows - .ftrace1 + .ftrace1 NtFlushVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtFreeVirtualMemory.S b/libc/nt/ntdll/NtFreeVirtualMemory.S index a96ff4721..61c9d7c8e 100644 --- a/libc/nt/ntdll/NtFreeVirtualMemory.S +++ b/libc/nt/ntdll/NtFreeVirtualMemory.S @@ -2,9 +2,9 @@ .ntimp NtFreeVirtualMemory,NtFreeVirtualMemory .text.windows - .ftrace1 + .ftrace1 NtFreeVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtFsControlFile.S b/libc/nt/ntdll/NtFsControlFile.S index 8587d277a..b3545af90 100644 --- a/libc/nt/ntdll/NtFsControlFile.S +++ b/libc/nt/ntdll/NtFsControlFile.S @@ -2,9 +2,9 @@ .ntimp NtFsControlFile,NtFsControlFile .text.windows - .ftrace1 + .ftrace1 NtFsControlFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtGetContextThread.S b/libc/nt/ntdll/NtGetContextThread.S index 5c0ae8bb6..916c87c7e 100644 --- a/libc/nt/ntdll/NtGetContextThread.S +++ b/libc/nt/ntdll/NtGetContextThread.S @@ -2,9 +2,9 @@ .ntimp NtGetContextThread,NtGetContextThread .text.windows - .ftrace1 + .ftrace1 NtGetContextThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtMapViewOfSection.S b/libc/nt/ntdll/NtMapViewOfSection.S index de8a5bb0d..8753f2299 100644 --- a/libc/nt/ntdll/NtMapViewOfSection.S +++ b/libc/nt/ntdll/NtMapViewOfSection.S @@ -2,9 +2,9 @@ .ntimp NtMapViewOfSection,NtMapViewOfSection .text.windows - .ftrace1 + .ftrace1 NtMapViewOfSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenDirectoryObject.S b/libc/nt/ntdll/NtOpenDirectoryObject.S index 69b2af476..c61d23c62 100644 --- a/libc/nt/ntdll/NtOpenDirectoryObject.S +++ b/libc/nt/ntdll/NtOpenDirectoryObject.S @@ -2,9 +2,9 @@ .ntimp NtOpenDirectoryObject,NtOpenDirectoryObject .text.windows - .ftrace1 + .ftrace1 NtOpenDirectoryObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenFile.S b/libc/nt/ntdll/NtOpenFile.S index 
2a5d39c1b..b86e41d6b 100644 --- a/libc/nt/ntdll/NtOpenFile.S +++ b/libc/nt/ntdll/NtOpenFile.S @@ -2,9 +2,9 @@ .ntimp NtOpenFile,NtOpenFile .text.windows - .ftrace1 + .ftrace1 NtOpenFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenKey.S b/libc/nt/ntdll/NtOpenKey.S index d455c35cd..9ff5d3eab 100644 --- a/libc/nt/ntdll/NtOpenKey.S +++ b/libc/nt/ntdll/NtOpenKey.S @@ -2,9 +2,9 @@ .ntimp NtOpenKey,NtOpenKey .text.windows - .ftrace1 + .ftrace1 NtOpenKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenProcess.S b/libc/nt/ntdll/NtOpenProcess.S index 5335dc20f..e4bbf8119 100644 --- a/libc/nt/ntdll/NtOpenProcess.S +++ b/libc/nt/ntdll/NtOpenProcess.S @@ -2,9 +2,9 @@ .ntimp NtOpenProcess,NtOpenProcess .text.windows - .ftrace1 + .ftrace1 NtOpenProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenProcessToken.S b/libc/nt/ntdll/NtOpenProcessToken.S index 23ae6bc8c..abff707ea 100644 --- a/libc/nt/ntdll/NtOpenProcessToken.S +++ b/libc/nt/ntdll/NtOpenProcessToken.S @@ -2,9 +2,9 @@ .ntimp NtOpenProcessToken,NtOpenProcessToken .text.windows - .ftrace1 + .ftrace1 NtOpenProcessToken: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenSection.S b/libc/nt/ntdll/NtOpenSection.S index dba0cb27b..4d9552733 100644 --- a/libc/nt/ntdll/NtOpenSection.S +++ b/libc/nt/ntdll/NtOpenSection.S @@ -2,9 +2,9 @@ .ntimp NtOpenSection,NtOpenSection .text.windows - .ftrace1 + .ftrace1 NtOpenSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenSymbolicLinkObject.S b/libc/nt/ntdll/NtOpenSymbolicLinkObject.S index 1786ae9fb..3bf490ee4 100644 --- a/libc/nt/ntdll/NtOpenSymbolicLinkObject.S +++ b/libc/nt/ntdll/NtOpenSymbolicLinkObject.S @@ -2,9 +2,9 @@ .ntimp NtOpenSymbolicLinkObject,NtOpenSymbolicLinkObject .text.windows - .ftrace1 + .ftrace1 NtOpenSymbolicLinkObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenThread.S b/libc/nt/ntdll/NtOpenThread.S index 49bb3774e..daa34ddfe 100644 --- a/libc/nt/ntdll/NtOpenThread.S +++ b/libc/nt/ntdll/NtOpenThread.S @@ -2,9 +2,9 @@ .ntimp NtOpenThread,NtOpenThread .text.windows - .ftrace1 + .ftrace1 NtOpenThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtOpenThreadToken.S b/libc/nt/ntdll/NtOpenThreadToken.S index 9c1bc8c67..c6132050d 100644 --- a/libc/nt/ntdll/NtOpenThreadToken.S +++ b/libc/nt/ntdll/NtOpenThreadToken.S @@ -2,9 +2,9 @@ .ntimp NtOpenThreadToken,NtOpenThreadToken .text.windows - .ftrace1 + .ftrace1 NtOpenThreadToken: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtProtectVirtualMemory.S b/libc/nt/ntdll/NtProtectVirtualMemory.S index 722dec11c..d9908ef9f 100644 --- a/libc/nt/ntdll/NtProtectVirtualMemory.S +++ b/libc/nt/ntdll/NtProtectVirtualMemory.S @@ -2,9 +2,9 @@ .ntimp NtProtectVirtualMemory,NtProtectVirtualMemory .text.windows - .ftrace1 + .ftrace1 NtProtectVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryAttributesFile.S b/libc/nt/ntdll/NtQueryAttributesFile.S index d10fbcb53..d37a27b83 100644 --- a/libc/nt/ntdll/NtQueryAttributesFile.S +++ b/libc/nt/ntdll/NtQueryAttributesFile.S @@ -2,9 +2,9 @@ .ntimp NtQueryAttributesFile,NtQueryAttributesFile .text.windows - .ftrace1 + .ftrace1 NtQueryAttributesFile: - .ftrace2 + 
.ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryDirectoryFile.S b/libc/nt/ntdll/NtQueryDirectoryFile.S index 04de7f5c2..2e3aede1e 100644 --- a/libc/nt/ntdll/NtQueryDirectoryFile.S +++ b/libc/nt/ntdll/NtQueryDirectoryFile.S @@ -2,9 +2,9 @@ .ntimp NtQueryDirectoryFile,NtQueryDirectoryFile .text.windows - .ftrace1 + .ftrace1 NtQueryDirectoryFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryFullAttributesFile.S b/libc/nt/ntdll/NtQueryFullAttributesFile.S index 0ee2d33e9..5efcdf4a2 100644 --- a/libc/nt/ntdll/NtQueryFullAttributesFile.S +++ b/libc/nt/ntdll/NtQueryFullAttributesFile.S @@ -2,9 +2,9 @@ .ntimp NtQueryFullAttributesFile,NtQueryFullAttributesFile .text.windows - .ftrace1 + .ftrace1 NtQueryFullAttributesFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryInformationFile.S b/libc/nt/ntdll/NtQueryInformationFile.S index 44abc9fad..e7e3401ff 100644 --- a/libc/nt/ntdll/NtQueryInformationFile.S +++ b/libc/nt/ntdll/NtQueryInformationFile.S @@ -2,9 +2,9 @@ .ntimp NtQueryInformationFile,NtQueryInformationFile .text.windows - .ftrace1 + .ftrace1 NtQueryInformationFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryInformationJobObject.S b/libc/nt/ntdll/NtQueryInformationJobObject.S index e86d27e96..6eb9c469e 100644 --- a/libc/nt/ntdll/NtQueryInformationJobObject.S +++ b/libc/nt/ntdll/NtQueryInformationJobObject.S @@ -2,9 +2,9 @@ .ntimp NtQueryInformationJobObject,NtQueryInformationJobObject .text.windows - .ftrace1 + .ftrace1 NtQueryInformationJobObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryInformationProcess.S b/libc/nt/ntdll/NtQueryInformationProcess.S index 1c2466da5..6b2a7bdac 100644 --- a/libc/nt/ntdll/NtQueryInformationProcess.S +++ b/libc/nt/ntdll/NtQueryInformationProcess.S @@ -2,9 +2,9 @@ .ntimp NtQueryInformationProcess,NtQueryInformationProcess .text.windows - .ftrace1 + .ftrace1 NtQueryInformationProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryInformationThread.S b/libc/nt/ntdll/NtQueryInformationThread.S index 91ab1c108..48c2c185a 100644 --- a/libc/nt/ntdll/NtQueryInformationThread.S +++ b/libc/nt/ntdll/NtQueryInformationThread.S @@ -2,9 +2,9 @@ .ntimp NtQueryInformationThread,NtQueryInformationThread .text.windows - .ftrace1 + .ftrace1 NtQueryInformationThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryInformationToken.S b/libc/nt/ntdll/NtQueryInformationToken.S index 256a43bc5..cea9d1d43 100644 --- a/libc/nt/ntdll/NtQueryInformationToken.S +++ b/libc/nt/ntdll/NtQueryInformationToken.S @@ -2,9 +2,9 @@ .ntimp NtQueryInformationToken,NtQueryInformationToken .text.windows - .ftrace1 + .ftrace1 NtQueryInformationToken: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryIntervalProfile.S b/libc/nt/ntdll/NtQueryIntervalProfile.S index b9ce1a426..06e711bc4 100644 --- a/libc/nt/ntdll/NtQueryIntervalProfile.S +++ b/libc/nt/ntdll/NtQueryIntervalProfile.S @@ -2,9 +2,9 @@ .ntimp NtQueryIntervalProfile,NtQueryIntervalProfile .text.windows - .ftrace1 + .ftrace1 NtQueryIntervalProfile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryObject.S b/libc/nt/ntdll/NtQueryObject.S index 00a6804ad..616baaad0 100644 --- 
a/libc/nt/ntdll/NtQueryObject.S +++ b/libc/nt/ntdll/NtQueryObject.S @@ -2,9 +2,9 @@ .ntimp NtQueryObject,NtQueryObject .text.windows - .ftrace1 + .ftrace1 NtQueryObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryPerformanceCounter.S b/libc/nt/ntdll/NtQueryPerformanceCounter.S index cc0de99fd..bc416368b 100644 --- a/libc/nt/ntdll/NtQueryPerformanceCounter.S +++ b/libc/nt/ntdll/NtQueryPerformanceCounter.S @@ -2,9 +2,9 @@ .ntimp NtQueryPerformanceCounter,NtQueryPerformanceCounter .text.windows - .ftrace1 + .ftrace1 NtQueryPerformanceCounter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQuerySection.S b/libc/nt/ntdll/NtQuerySection.S index c1f314050..260091317 100644 --- a/libc/nt/ntdll/NtQuerySection.S +++ b/libc/nt/ntdll/NtQuerySection.S @@ -2,9 +2,9 @@ .ntimp NtQuerySection,NtQuerySection .text.windows - .ftrace1 + .ftrace1 NtQuerySection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQuerySecurityObject.S b/libc/nt/ntdll/NtQuerySecurityObject.S index 90c2f36a4..e99ff35c2 100644 --- a/libc/nt/ntdll/NtQuerySecurityObject.S +++ b/libc/nt/ntdll/NtQuerySecurityObject.S @@ -2,9 +2,9 @@ .ntimp NtQuerySecurityObject,NtQuerySecurityObject .text.windows - .ftrace1 + .ftrace1 NtQuerySecurityObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQuerySymbolicLinkObject.S b/libc/nt/ntdll/NtQuerySymbolicLinkObject.S index 4fe8d1a90..2f0201f50 100644 --- a/libc/nt/ntdll/NtQuerySymbolicLinkObject.S +++ b/libc/nt/ntdll/NtQuerySymbolicLinkObject.S @@ -2,9 +2,9 @@ .ntimp NtQuerySymbolicLinkObject,NtQuerySymbolicLinkObject .text.windows - .ftrace1 + .ftrace1 NtQuerySymbolicLinkObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQuerySystemInformation.S b/libc/nt/ntdll/NtQuerySystemInformation.S index 1c66c4390..472f7fa53 100644 --- a/libc/nt/ntdll/NtQuerySystemInformation.S +++ b/libc/nt/ntdll/NtQuerySystemInformation.S @@ -2,9 +2,9 @@ .ntimp NtQuerySystemInformation,NtQuerySystemInformation .text.windows - .ftrace1 + .ftrace1 NtQuerySystemInformation: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQuerySystemTime.S b/libc/nt/ntdll/NtQuerySystemTime.S index b900d498b..54f3b4e2d 100644 --- a/libc/nt/ntdll/NtQuerySystemTime.S +++ b/libc/nt/ntdll/NtQuerySystemTime.S @@ -2,9 +2,9 @@ .ntimp NtQuerySystemTime,NtQuerySystemTime .text.windows - .ftrace1 + .ftrace1 NtQuerySystemTime: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryValueKey.S b/libc/nt/ntdll/NtQueryValueKey.S index f48b62724..4bb1cad12 100644 --- a/libc/nt/ntdll/NtQueryValueKey.S +++ b/libc/nt/ntdll/NtQueryValueKey.S @@ -2,9 +2,9 @@ .ntimp NtQueryValueKey,NtQueryValueKey .text.windows - .ftrace1 + .ftrace1 NtQueryValueKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryVirtualMemory.S b/libc/nt/ntdll/NtQueryVirtualMemory.S index 102972387..8eff2da54 100644 --- a/libc/nt/ntdll/NtQueryVirtualMemory.S +++ b/libc/nt/ntdll/NtQueryVirtualMemory.S @@ -2,9 +2,9 @@ .ntimp NtQueryVirtualMemory,NtQueryVirtualMemory .text.windows - .ftrace1 + .ftrace1 NtQueryVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueryVolumeInformationFile.S b/libc/nt/ntdll/NtQueryVolumeInformationFile.S index 20d7658b6..ca1c55b08 100644 --- 
a/libc/nt/ntdll/NtQueryVolumeInformationFile.S +++ b/libc/nt/ntdll/NtQueryVolumeInformationFile.S @@ -2,9 +2,9 @@ .ntimp NtQueryVolumeInformationFile,NtQueryVolumeInformationFile .text.windows - .ftrace1 + .ftrace1 NtQueryVolumeInformationFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtQueueApcThread.S b/libc/nt/ntdll/NtQueueApcThread.S index f3966d0b4..26625b7ea 100644 --- a/libc/nt/ntdll/NtQueueApcThread.S +++ b/libc/nt/ntdll/NtQueueApcThread.S @@ -2,9 +2,9 @@ .ntimp NtQueueApcThread,NtQueueApcThread .text.windows - .ftrace1 + .ftrace1 NtQueueApcThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtRaiseException.S b/libc/nt/ntdll/NtRaiseException.S index c4f7ffaf1..cf857b1a0 100644 --- a/libc/nt/ntdll/NtRaiseException.S +++ b/libc/nt/ntdll/NtRaiseException.S @@ -2,9 +2,9 @@ .ntimp NtRaiseException,NtRaiseException .text.windows - .ftrace1 + .ftrace1 NtRaiseException: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtRaiseHardError.S b/libc/nt/ntdll/NtRaiseHardError.S index 5e8d3a71c..e0a50c07d 100644 --- a/libc/nt/ntdll/NtRaiseHardError.S +++ b/libc/nt/ntdll/NtRaiseHardError.S @@ -2,9 +2,9 @@ .ntimp NtRaiseHardError,NtRaiseHardError .text.windows - .ftrace1 + .ftrace1 NtRaiseHardError: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtReadFile.S b/libc/nt/ntdll/NtReadFile.S index 7da47cecc..2e9b0d83c 100644 --- a/libc/nt/ntdll/NtReadFile.S +++ b/libc/nt/ntdll/NtReadFile.S @@ -2,9 +2,9 @@ .ntimp NtReadFile,NtReadFile .text.windows - .ftrace1 + .ftrace1 NtReadFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtReadVirtualMemory.S b/libc/nt/ntdll/NtReadVirtualMemory.S index ebd104ca5..efe7cf73f 100644 --- a/libc/nt/ntdll/NtReadVirtualMemory.S +++ b/libc/nt/ntdll/NtReadVirtualMemory.S @@ -2,9 +2,9 @@ .ntimp NtReadVirtualMemory,NtReadVirtualMemory .text.windows - .ftrace1 + .ftrace1 NtReadVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtReleaseKeyedEvent.S b/libc/nt/ntdll/NtReleaseKeyedEvent.S index 91da68573..a37dfbe92 100644 --- a/libc/nt/ntdll/NtReleaseKeyedEvent.S +++ b/libc/nt/ntdll/NtReleaseKeyedEvent.S @@ -2,9 +2,9 @@ .ntimp NtReleaseKeyedEvent,NtReleaseKeyedEvent .text.windows - .ftrace1 + .ftrace1 NtReleaseKeyedEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtResumeThread.S b/libc/nt/ntdll/NtResumeThread.S index 87c0b5080..297fb8e26 100644 --- a/libc/nt/ntdll/NtResumeThread.S +++ b/libc/nt/ntdll/NtResumeThread.S @@ -2,9 +2,9 @@ .ntimp NtResumeThread,NtResumeThread .text.windows - .ftrace1 + .ftrace1 NtResumeThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSetContextThread.S b/libc/nt/ntdll/NtSetContextThread.S index c7dc25c42..2a9d21e91 100644 --- a/libc/nt/ntdll/NtSetContextThread.S +++ b/libc/nt/ntdll/NtSetContextThread.S @@ -2,9 +2,9 @@ .ntimp NtSetContextThread,NtSetContextThread .text.windows - .ftrace1 + .ftrace1 NtSetContextThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSetEvent.S b/libc/nt/ntdll/NtSetEvent.S index 949a1376a..b92aa93ed 100644 --- a/libc/nt/ntdll/NtSetEvent.S +++ b/libc/nt/ntdll/NtSetEvent.S @@ -2,9 +2,9 @@ .ntimp NtSetEvent,NtSetEvent .text.windows - .ftrace1 + .ftrace1 NtSetEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ 
push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSetInformationFile.S b/libc/nt/ntdll/NtSetInformationFile.S index ddf5cc03b..c5611ea25 100644 --- a/libc/nt/ntdll/NtSetInformationFile.S +++ b/libc/nt/ntdll/NtSetInformationFile.S @@ -2,9 +2,9 @@ .ntimp NtSetInformationFile,NtSetInformationFile .text.windows - .ftrace1 + .ftrace1 NtSetInformationFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSetInformationThread.S b/libc/nt/ntdll/NtSetInformationThread.S index 7618e38c5..82a739f2e 100644 --- a/libc/nt/ntdll/NtSetInformationThread.S +++ b/libc/nt/ntdll/NtSetInformationThread.S @@ -2,9 +2,9 @@ .ntimp NtSetInformationThread,NtSetInformationThread .text.windows - .ftrace1 + .ftrace1 NtSetInformationThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSetIntervalProfile.S b/libc/nt/ntdll/NtSetIntervalProfile.S index 8945e5ab9..df084af36 100644 --- a/libc/nt/ntdll/NtSetIntervalProfile.S +++ b/libc/nt/ntdll/NtSetIntervalProfile.S @@ -2,9 +2,9 @@ .ntimp NtSetIntervalProfile,NtSetIntervalProfile .text.windows - .ftrace1 + .ftrace1 NtSetIntervalProfile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSetTimer.S b/libc/nt/ntdll/NtSetTimer.S index fb05926f3..9856accc0 100644 --- a/libc/nt/ntdll/NtSetTimer.S +++ b/libc/nt/ntdll/NtSetTimer.S @@ -2,9 +2,9 @@ .ntimp NtSetTimer,NtSetTimer .text.windows - .ftrace1 + .ftrace1 NtSetTimer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSetValueKey.S b/libc/nt/ntdll/NtSetValueKey.S index 5098bd712..96ac75b1d 100644 --- a/libc/nt/ntdll/NtSetValueKey.S +++ b/libc/nt/ntdll/NtSetValueKey.S @@ -2,9 +2,9 @@ .ntimp NtSetValueKey,NtSetValueKey .text.windows - .ftrace1 + .ftrace1 NtSetValueKey: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSignalAndWaitForSingleObject.S b/libc/nt/ntdll/NtSignalAndWaitForSingleObject.S index 56fbed044..9414b5d33 100644 --- a/libc/nt/ntdll/NtSignalAndWaitForSingleObject.S +++ b/libc/nt/ntdll/NtSignalAndWaitForSingleObject.S @@ -2,9 +2,9 @@ .ntimp NtSignalAndWaitForSingleObject,NtSignalAndWaitForSingleObject .text.windows - .ftrace1 + .ftrace1 NtSignalAndWaitForSingleObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtStartProfile.S b/libc/nt/ntdll/NtStartProfile.S index 778634fb1..58857ca0b 100644 --- a/libc/nt/ntdll/NtStartProfile.S +++ b/libc/nt/ntdll/NtStartProfile.S @@ -2,9 +2,9 @@ .ntimp NtStartProfile,NtStartProfile .text.windows - .ftrace1 + .ftrace1 NtStartProfile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtStopProfile.S b/libc/nt/ntdll/NtStopProfile.S index 2043178d6..cf69686c2 100644 --- a/libc/nt/ntdll/NtStopProfile.S +++ b/libc/nt/ntdll/NtStopProfile.S @@ -2,9 +2,9 @@ .ntimp NtStopProfile,NtStopProfile .text.windows - .ftrace1 + .ftrace1 NtStopProfile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtSuspendThread.S b/libc/nt/ntdll/NtSuspendThread.S index 1074ea9d5..93863a616 100644 --- a/libc/nt/ntdll/NtSuspendThread.S +++ b/libc/nt/ntdll/NtSuspendThread.S @@ -2,9 +2,9 @@ .ntimp NtSuspendThread,NtSuspendThread .text.windows - .ftrace1 + .ftrace1 NtSuspendThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtTerminateProcess.S b/libc/nt/ntdll/NtTerminateProcess.S index 3a26e1563..93add9636 100644 --- 
a/libc/nt/ntdll/NtTerminateProcess.S +++ b/libc/nt/ntdll/NtTerminateProcess.S @@ -2,9 +2,9 @@ .ntimp NtTerminateProcess,NtTerminateProcess .text.windows - .ftrace1 + .ftrace1 NtTerminateProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtTerminateThread.S b/libc/nt/ntdll/NtTerminateThread.S index 999686231..cff4b3820 100644 --- a/libc/nt/ntdll/NtTerminateThread.S +++ b/libc/nt/ntdll/NtTerminateThread.S @@ -2,9 +2,9 @@ .ntimp NtTerminateThread,NtTerminateThread .text.windows - .ftrace1 + .ftrace1 NtTerminateThread: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtTestAlert.S b/libc/nt/ntdll/NtTestAlert.S index d13522758..3d3fa0c3b 100644 --- a/libc/nt/ntdll/NtTestAlert.S +++ b/libc/nt/ntdll/NtTestAlert.S @@ -2,9 +2,9 @@ .ntimp NtTestAlert,NtTestAlert .text.windows - .ftrace1 + .ftrace1 NtTestAlert: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtUnmapViewOfSection.S b/libc/nt/ntdll/NtUnmapViewOfSection.S index 34a3b2029..cfd770a19 100644 --- a/libc/nt/ntdll/NtUnmapViewOfSection.S +++ b/libc/nt/ntdll/NtUnmapViewOfSection.S @@ -2,9 +2,9 @@ .ntimp NtUnmapViewOfSection,NtUnmapViewOfSection .text.windows - .ftrace1 + .ftrace1 NtUnmapViewOfSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtWaitForKeyedEvent.S b/libc/nt/ntdll/NtWaitForKeyedEvent.S index d2758c18e..5e0de2d97 100644 --- a/libc/nt/ntdll/NtWaitForKeyedEvent.S +++ b/libc/nt/ntdll/NtWaitForKeyedEvent.S @@ -2,9 +2,9 @@ .ntimp NtWaitForKeyedEvent,NtWaitForKeyedEvent .text.windows - .ftrace1 + .ftrace1 NtWaitForKeyedEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtWaitForSingleObject.S b/libc/nt/ntdll/NtWaitForSingleObject.S index f869d9c7b..6e6927617 100644 --- a/libc/nt/ntdll/NtWaitForSingleObject.S +++ b/libc/nt/ntdll/NtWaitForSingleObject.S @@ -2,9 +2,9 @@ .ntimp NtWaitForSingleObject,NtWaitForSingleObject .text.windows - .ftrace1 + .ftrace1 NtWaitForSingleObject: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtWriteFile.S b/libc/nt/ntdll/NtWriteFile.S index 28e1ae810..997fffab6 100644 --- a/libc/nt/ntdll/NtWriteFile.S +++ b/libc/nt/ntdll/NtWriteFile.S @@ -2,9 +2,9 @@ .ntimp NtWriteFile,NtWriteFile .text.windows - .ftrace1 + .ftrace1 NtWriteFile: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtWriteVirtualMemory.S b/libc/nt/ntdll/NtWriteVirtualMemory.S index e7c6d710c..a71eeae27 100644 --- a/libc/nt/ntdll/NtWriteVirtualMemory.S +++ b/libc/nt/ntdll/NtWriteVirtualMemory.S @@ -2,9 +2,9 @@ .ntimp NtWriteVirtualMemory,NtWriteVirtualMemory .text.windows - .ftrace1 + .ftrace1 NtWriteVirtualMemory: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/NtYieldExecution.S b/libc/nt/ntdll/NtYieldExecution.S index 9881e9420..1f0105f0b 100644 --- a/libc/nt/ntdll/NtYieldExecution.S +++ b/libc/nt/ntdll/NtYieldExecution.S @@ -2,9 +2,9 @@ .ntimp NtYieldExecution,NtYieldExecution .text.windows - .ftrace1 + .ftrace1 NtYieldExecution: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlAllocateHeap.S b/libc/nt/ntdll/RtlAllocateHeap.S index aaa13c70f..2b20c33e2 100644 --- a/libc/nt/ntdll/RtlAllocateHeap.S +++ b/libc/nt/ntdll/RtlAllocateHeap.S @@ -2,9 +2,9 @@ .ntimp RtlAllocateHeap,RtlAllocateHeap .text.windows - .ftrace1 + .ftrace1 RtlAllocateHeap: - 
.ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlCloneUserProcess.S b/libc/nt/ntdll/RtlCloneUserProcess.S index 9c62f5819..f1d84b08e 100644 --- a/libc/nt/ntdll/RtlCloneUserProcess.S +++ b/libc/nt/ntdll/RtlCloneUserProcess.S @@ -2,9 +2,9 @@ .ntimp RtlCloneUserProcess,RtlCloneUserProcess .text.windows - .ftrace1 + .ftrace1 RtlCloneUserProcess: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlConvertSidToUnicodeString.S b/libc/nt/ntdll/RtlConvertSidToUnicodeString.S index baa215f76..691233810 100644 --- a/libc/nt/ntdll/RtlConvertSidToUnicodeString.S +++ b/libc/nt/ntdll/RtlConvertSidToUnicodeString.S @@ -2,9 +2,9 @@ .ntimp RtlConvertSidToUnicodeString,RtlConvertSidToUnicodeString .text.windows - .ftrace1 + .ftrace1 RtlConvertSidToUnicodeString: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlCreateHeap.S b/libc/nt/ntdll/RtlCreateHeap.S index ae45e2440..24141bd39 100644 --- a/libc/nt/ntdll/RtlCreateHeap.S +++ b/libc/nt/ntdll/RtlCreateHeap.S @@ -2,9 +2,9 @@ .ntimp RtlCreateHeap,RtlCreateHeap .text.windows - .ftrace1 + .ftrace1 RtlCreateHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlCreateProcessParameters.S b/libc/nt/ntdll/RtlCreateProcessParameters.S index 86efc2e26..1af4423b2 100644 --- a/libc/nt/ntdll/RtlCreateProcessParameters.S +++ b/libc/nt/ntdll/RtlCreateProcessParameters.S @@ -2,9 +2,9 @@ .ntimp RtlCreateProcessParameters,RtlCreateProcessParameters .text.windows - .ftrace1 + .ftrace1 RtlCreateProcessParameters: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlDeleteCriticalSection.S b/libc/nt/ntdll/RtlDeleteCriticalSection.S index 2231999ea..5c52ef1f8 100644 --- a/libc/nt/ntdll/RtlDeleteCriticalSection.S +++ b/libc/nt/ntdll/RtlDeleteCriticalSection.S @@ -2,9 +2,9 @@ .ntimp RtlDeleteCriticalSection,RtlDeleteCriticalSection .text.windows - .ftrace1 + .ftrace1 RtlDeleteCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlDestroyHeap.S b/libc/nt/ntdll/RtlDestroyHeap.S index 81073ce82..c0121bbed 100644 --- a/libc/nt/ntdll/RtlDestroyHeap.S +++ b/libc/nt/ntdll/RtlDestroyHeap.S @@ -2,9 +2,9 @@ .ntimp RtlDestroyHeap,RtlDestroyHeap .text.windows - .ftrace1 + .ftrace1 RtlDestroyHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlDestroyProcessParameters.S b/libc/nt/ntdll/RtlDestroyProcessParameters.S index d8c03db61..fe7fccbf3 100644 --- a/libc/nt/ntdll/RtlDestroyProcessParameters.S +++ b/libc/nt/ntdll/RtlDestroyProcessParameters.S @@ -2,9 +2,9 @@ .ntimp RtlDestroyProcessParameters,RtlDestroyProcessParameters .text.windows - .ftrace1 + .ftrace1 RtlDestroyProcessParameters: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlEnterCriticalSection.S b/libc/nt/ntdll/RtlEnterCriticalSection.S index f68cf2cfc..aef32af61 100644 --- a/libc/nt/ntdll/RtlEnterCriticalSection.S +++ b/libc/nt/ntdll/RtlEnterCriticalSection.S @@ -2,9 +2,9 @@ .ntimp RtlEnterCriticalSection,RtlEnterCriticalSection .text.windows - .ftrace1 + .ftrace1 RtlEnterCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlFreeHeap.S b/libc/nt/ntdll/RtlFreeHeap.S index d878a3c12..58b9da69e 100644 --- a/libc/nt/ntdll/RtlFreeHeap.S +++ b/libc/nt/ntdll/RtlFreeHeap.S @@ -2,9 +2,9 @@ .ntimp RtlFreeHeap,RtlFreeHeap 
.text.windows - .ftrace1 + .ftrace1 RtlFreeHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlFreeUnicodeString.S b/libc/nt/ntdll/RtlFreeUnicodeString.S index ea48ef15d..b0ff3f6ae 100644 --- a/libc/nt/ntdll/RtlFreeUnicodeString.S +++ b/libc/nt/ntdll/RtlFreeUnicodeString.S @@ -2,9 +2,9 @@ .ntimp RtlFreeUnicodeString,RtlFreeUnicodeString .text.windows - .ftrace1 + .ftrace1 RtlFreeUnicodeString: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlGetProcessHeaps.S b/libc/nt/ntdll/RtlGetProcessHeaps.S index 3697a6630..36973f65b 100644 --- a/libc/nt/ntdll/RtlGetProcessHeaps.S +++ b/libc/nt/ntdll/RtlGetProcessHeaps.S @@ -2,9 +2,9 @@ .ntimp RtlGetProcessHeaps,RtlGetProcessHeaps .text.windows - .ftrace1 + .ftrace1 RtlGetProcessHeaps: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlInitUnicodeString.S b/libc/nt/ntdll/RtlInitUnicodeString.S index 9e3eed1aa..1a411ad25 100644 --- a/libc/nt/ntdll/RtlInitUnicodeString.S +++ b/libc/nt/ntdll/RtlInitUnicodeString.S @@ -2,9 +2,9 @@ .ntimp RtlInitUnicodeString,RtlInitUnicodeString .text.windows - .ftrace1 + .ftrace1 RtlInitUnicodeString: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlInitializeCriticalSection.S b/libc/nt/ntdll/RtlInitializeCriticalSection.S index fa1f1c32b..f308fcff3 100644 --- a/libc/nt/ntdll/RtlInitializeCriticalSection.S +++ b/libc/nt/ntdll/RtlInitializeCriticalSection.S @@ -2,9 +2,9 @@ .ntimp RtlInitializeCriticalSection,RtlInitializeCriticalSection .text.windows - .ftrace1 + .ftrace1 RtlInitializeCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlLeaveCriticalSection.S b/libc/nt/ntdll/RtlLeaveCriticalSection.S index d0243521a..89ccaba6b 100644 --- a/libc/nt/ntdll/RtlLeaveCriticalSection.S +++ b/libc/nt/ntdll/RtlLeaveCriticalSection.S @@ -2,9 +2,9 @@ .ntimp RtlLeaveCriticalSection,RtlLeaveCriticalSection .text.windows - .ftrace1 + .ftrace1 RtlLeaveCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlLockHeap.S b/libc/nt/ntdll/RtlLockHeap.S index d5fe45c61..ed07abcc2 100644 --- a/libc/nt/ntdll/RtlLockHeap.S +++ b/libc/nt/ntdll/RtlLockHeap.S @@ -2,9 +2,9 @@ .ntimp RtlLockHeap,RtlLockHeap .text.windows - .ftrace1 + .ftrace1 RtlLockHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlNtStatusToDosError.S b/libc/nt/ntdll/RtlNtStatusToDosError.S index e4d9f3519..6c1640398 100644 --- a/libc/nt/ntdll/RtlNtStatusToDosError.S +++ b/libc/nt/ntdll/RtlNtStatusToDosError.S @@ -2,9 +2,9 @@ .ntimp RtlNtStatusToDosError,RtlNtStatusToDosError .text.windows - .ftrace1 + .ftrace1 RtlNtStatusToDosError: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlQueryEnvironmentVariable.S b/libc/nt/ntdll/RtlQueryEnvironmentVariable.S index b40f4060d..a2b0f4991 100644 --- a/libc/nt/ntdll/RtlQueryEnvironmentVariable.S +++ b/libc/nt/ntdll/RtlQueryEnvironmentVariable.S @@ -2,9 +2,9 @@ .ntimp RtlQueryEnvironmentVariable,RtlQueryEnvironmentVariable .text.windows - .ftrace1 + .ftrace1 RtlQueryEnvironmentVariable: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlReAllocateHeap.S b/libc/nt/ntdll/RtlReAllocateHeap.S index fea7827d3..b5384af88 100644 --- a/libc/nt/ntdll/RtlReAllocateHeap.S +++ b/libc/nt/ntdll/RtlReAllocateHeap.S @@ -2,9 +2,9 @@ 
.ntimp RtlReAllocateHeap,RtlReAllocateHeap .text.windows - .ftrace1 + .ftrace1 RtlReAllocateHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlSizeHeap.S b/libc/nt/ntdll/RtlSizeHeap.S index 7b97fc047..1c6290c76 100644 --- a/libc/nt/ntdll/RtlSizeHeap.S +++ b/libc/nt/ntdll/RtlSizeHeap.S @@ -2,9 +2,9 @@ .ntimp RtlSizeHeap,RtlSizeHeap .text.windows - .ftrace1 + .ftrace1 RtlSizeHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlTryEnterCriticalSection.S b/libc/nt/ntdll/RtlTryEnterCriticalSection.S index 0580e967a..0d469d4be 100644 --- a/libc/nt/ntdll/RtlTryEnterCriticalSection.S +++ b/libc/nt/ntdll/RtlTryEnterCriticalSection.S @@ -2,9 +2,9 @@ .ntimp RtlTryEnterCriticalSection,RtlTryEnterCriticalSection .text.windows - .ftrace1 + .ftrace1 RtlTryEnterCriticalSection: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlUnlockHeap.S b/libc/nt/ntdll/RtlUnlockHeap.S index e07b81852..9e4e13881 100644 --- a/libc/nt/ntdll/RtlUnlockHeap.S +++ b/libc/nt/ntdll/RtlUnlockHeap.S @@ -2,9 +2,9 @@ .ntimp RtlUnlockHeap,RtlUnlockHeap .text.windows - .ftrace1 + .ftrace1 RtlUnlockHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlValidateHeap.S b/libc/nt/ntdll/RtlValidateHeap.S index 1311d2762..83a0e2e8d 100644 --- a/libc/nt/ntdll/RtlValidateHeap.S +++ b/libc/nt/ntdll/RtlValidateHeap.S @@ -2,9 +2,9 @@ .ntimp RtlValidateHeap,RtlValidateHeap .text.windows - .ftrace1 + .ftrace1 RtlValidateHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/RtlWalkHeap.S b/libc/nt/ntdll/RtlWalkHeap.S index 859f3af8f..356fced4c 100644 --- a/libc/nt/ntdll/RtlWalkHeap.S +++ b/libc/nt/ntdll/RtlWalkHeap.S @@ -2,9 +2,9 @@ .ntimp RtlWalkHeap,RtlWalkHeap .text.windows - .ftrace1 + .ftrace1 RtlWalkHeap: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ntdll/ZwAreMappedFilesTheSame.S b/libc/nt/ntdll/ZwAreMappedFilesTheSame.S index 16bc0035c..10741f2ea 100644 --- a/libc/nt/ntdll/ZwAreMappedFilesTheSame.S +++ b/libc/nt/ntdll/ZwAreMappedFilesTheSame.S @@ -2,9 +2,9 @@ .ntimp ZwAreMappedFilesTheSame,ZwAreMappedFilesTheSame .text.windows - .ftrace1 + .ftrace1 ZwAreMappedFilesTheSame: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/pdh/PdhAddEnglishCounterW.S b/libc/nt/pdh/PdhAddEnglishCounterW.S index 5df0a96ba..9cf96c467 100644 --- a/libc/nt/pdh/PdhAddEnglishCounterW.S +++ b/libc/nt/pdh/PdhAddEnglishCounterW.S @@ -2,9 +2,9 @@ .imp pdh,__imp_PdhAddEnglishCounterW,PdhAddEnglishCounterW .text.windows - .ftrace1 + .ftrace1 PdhAddEnglishCounter: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/pdh/PdhCollectQueryDataEx.S b/libc/nt/pdh/PdhCollectQueryDataEx.S index 53feb44ea..9638a6c17 100644 --- a/libc/nt/pdh/PdhCollectQueryDataEx.S +++ b/libc/nt/pdh/PdhCollectQueryDataEx.S @@ -2,9 +2,9 @@ .imp pdh,__imp_PdhCollectQueryDataEx,PdhCollectQueryDataEx .text.windows - .ftrace1 + .ftrace1 PdhCollectQueryDataEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/pdh/PdhGetFormattedCounterValue.S b/libc/nt/pdh/PdhGetFormattedCounterValue.S index d7c8b5ca6..3ca1afc5b 100644 --- a/libc/nt/pdh/PdhGetFormattedCounterValue.S +++ b/libc/nt/pdh/PdhGetFormattedCounterValue.S @@ -2,9 +2,9 @@ .imp pdh,__imp_PdhGetFormattedCounterValue,PdhGetFormattedCounterValue .text.windows - .ftrace1 + .ftrace1 
PdhGetFormattedCounterValue: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/pdh/PdhOpenQueryW.S b/libc/nt/pdh/PdhOpenQueryW.S index 42730dba4..b3457355a 100644 --- a/libc/nt/pdh/PdhOpenQueryW.S +++ b/libc/nt/pdh/PdhOpenQueryW.S @@ -2,9 +2,9 @@ .imp pdh,__imp_PdhOpenQueryW,PdhOpenQueryW .text.windows - .ftrace1 + .ftrace1 PdhOpenQuery: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/psapi/EnumProcessModules.S b/libc/nt/psapi/EnumProcessModules.S index 1ef1a7744..e2cb316f8 100644 --- a/libc/nt/psapi/EnumProcessModules.S +++ b/libc/nt/psapi/EnumProcessModules.S @@ -2,9 +2,9 @@ .imp psapi,__imp_EnumProcessModules,EnumProcessModules .text.windows - .ftrace1 + .ftrace1 EnumProcessModules: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/psapi/EnumProcessModulesEx.S b/libc/nt/psapi/EnumProcessModulesEx.S index b9526f3cd..aa979e151 100644 --- a/libc/nt/psapi/EnumProcessModulesEx.S +++ b/libc/nt/psapi/EnumProcessModulesEx.S @@ -2,9 +2,9 @@ .imp psapi,__imp_EnumProcessModulesEx,EnumProcessModulesEx .text.windows - .ftrace1 + .ftrace1 EnumProcessModulesEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/psapi/EnumProcesses.S b/libc/nt/psapi/EnumProcesses.S index 46682c40c..df59cf115 100644 --- a/libc/nt/psapi/EnumProcesses.S +++ b/libc/nt/psapi/EnumProcesses.S @@ -2,9 +2,9 @@ .imp psapi,__imp_EnumProcesses,EnumProcesses .text.windows - .ftrace1 + .ftrace1 EnumProcesses: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/psapi/GetModuleBaseNameW.S b/libc/nt/psapi/GetModuleBaseNameW.S index 6210a49ac..8a891c806 100644 --- a/libc/nt/psapi/GetModuleBaseNameW.S +++ b/libc/nt/psapi/GetModuleBaseNameW.S @@ -2,9 +2,9 @@ .imp psapi,__imp_GetModuleBaseNameW,GetModuleBaseNameW .text.windows - .ftrace1 + .ftrace1 GetModuleBaseName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/psapi/GetProcessImageFileNameW.S b/libc/nt/psapi/GetProcessImageFileNameW.S index b7c0c69a4..802d11240 100644 --- a/libc/nt/psapi/GetProcessImageFileNameW.S +++ b/libc/nt/psapi/GetProcessImageFileNameW.S @@ -2,9 +2,9 @@ .imp psapi,__imp_GetProcessImageFileNameW,GetProcessImageFileNameW .text.windows - .ftrace1 + .ftrace1 GetProcessImageFileName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/psapi/GetProcessMemoryInfo.S b/libc/nt/psapi/GetProcessMemoryInfo.S index 543f4ab0f..69bf0bca6 100644 --- a/libc/nt/psapi/GetProcessMemoryInfo.S +++ b/libc/nt/psapi/GetProcessMemoryInfo.S @@ -2,9 +2,9 @@ .imp psapi,__imp_GetProcessMemoryInfo,GetProcessMemoryInfo .text.windows - .ftrace1 + .ftrace1 GetProcessMemoryInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/struct/processornumber.h b/libc/nt/struct/processornumber.h new file mode 100644 index 000000000..983cebbe1 --- /dev/null +++ b/libc/nt/struct/processornumber.h @@ -0,0 +1,12 @@ +#ifndef COSMOPOLITAN_LIBC_NT_STRUCT_PROCESSORNUMBER_H_ +#define COSMOPOLITAN_LIBC_NT_STRUCT_PROCESSORNUMBER_H_ +COSMOPOLITAN_C_START_ + +struct NtProcessorNumber { + uint16_t Group; + uint8_t Number; + uint8_t Reserved; +}; + +COSMOPOLITAN_C_END_ +#endif /* COSMOPOLITAN_LIBC_NT_STRUCT_PROCESSORNUMBER_H_ */ diff --git a/libc/nt/synchronization.h b/libc/nt/synchronization.h index e5a768859..d4cbef44f 100644 --- a/libc/nt/synchronization.h +++ b/libc/nt/synchronization.h @@ -4,6 +4,7 @@ #include 
"libc/nt/struct/criticalsection.h" #include "libc/nt/struct/filetime.h" #include "libc/nt/struct/linkedlist.h" +#include "libc/nt/struct/processornumber.h" #include "libc/nt/struct/securityattributes.h" #include "libc/nt/struct/systemtime.h" #include "libc/nt/thunk/msabi.h" @@ -115,6 +116,8 @@ bool32 GetSystemTimeAdjustment(uint32_t *lpTimeAdjustment, uint32_t *lpTimeIncrement, bool32 *lpTimeAdjustmentDisabled); +void GetCurrentProcessorNumberEx(struct NtProcessorNumber *out_ProcNumber); + #if ShouldUseMsabiAttribute() #include "libc/nt/thunk/synchronization.inc" #endif /* ShouldUseMsabiAttribute() */ diff --git a/libc/nt/user32/AdjustWindowRect.S b/libc/nt/user32/AdjustWindowRect.S index 2b537e267..5e9842e31 100644 --- a/libc/nt/user32/AdjustWindowRect.S +++ b/libc/nt/user32/AdjustWindowRect.S @@ -2,9 +2,9 @@ .imp user32,__imp_AdjustWindowRect,AdjustWindowRect .text.windows - .ftrace1 + .ftrace1 AdjustWindowRect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/AnimateWindow.S b/libc/nt/user32/AnimateWindow.S index 429820ad2..fa2de433c 100644 --- a/libc/nt/user32/AnimateWindow.S +++ b/libc/nt/user32/AnimateWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_AnimateWindow,AnimateWindow .text.windows - .ftrace1 + .ftrace1 AnimateWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/AppendMenuA.S b/libc/nt/user32/AppendMenuA.S index e7e4b8406..39aeaf081 100644 --- a/libc/nt/user32/AppendMenuA.S +++ b/libc/nt/user32/AppendMenuA.S @@ -2,9 +2,9 @@ .imp user32,__imp_AppendMenuA,AppendMenuA .text.windows - .ftrace1 + .ftrace1 AppendMenuA: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/AppendMenuW.S b/libc/nt/user32/AppendMenuW.S index 3565f8cc2..a6321876f 100644 --- a/libc/nt/user32/AppendMenuW.S +++ b/libc/nt/user32/AppendMenuW.S @@ -2,9 +2,9 @@ .imp user32,__imp_AppendMenuW,AppendMenuW .text.windows - .ftrace1 + .ftrace1 AppendMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/BeginPaint.S b/libc/nt/user32/BeginPaint.S index 3fa7a741d..3558a6727 100644 --- a/libc/nt/user32/BeginPaint.S +++ b/libc/nt/user32/BeginPaint.S @@ -2,9 +2,9 @@ .imp user32,__imp_BeginPaint,BeginPaint .text.windows - .ftrace1 + .ftrace1 BeginPaint: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/BringWindowToTop.S b/libc/nt/user32/BringWindowToTop.S index cad128121..fb24b673e 100644 --- a/libc/nt/user32/BringWindowToTop.S +++ b/libc/nt/user32/BringWindowToTop.S @@ -2,9 +2,9 @@ .imp user32,__imp_BringWindowToTop,BringWindowToTop .text.windows - .ftrace1 + .ftrace1 BringWindowToTop: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/CallNextHookEx.S b/libc/nt/user32/CallNextHookEx.S index 4f942681a..3227077fc 100644 --- a/libc/nt/user32/CallNextHookEx.S +++ b/libc/nt/user32/CallNextHookEx.S @@ -2,9 +2,9 @@ .imp user32,__imp_CallNextHookEx,CallNextHookEx .text.windows - .ftrace1 + .ftrace1 CallNextHookEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/CloseWindow.S b/libc/nt/user32/CloseWindow.S index baa555965..7ea6e7000 100644 --- a/libc/nt/user32/CloseWindow.S +++ b/libc/nt/user32/CloseWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_CloseWindow,CloseWindow .text.windows - .ftrace1 + .ftrace1 CloseWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/CreateIconIndirect.S 
b/libc/nt/user32/CreateIconIndirect.S index 138bf8248..5f34c5260 100644 --- a/libc/nt/user32/CreateIconIndirect.S +++ b/libc/nt/user32/CreateIconIndirect.S @@ -2,9 +2,9 @@ .imp user32,__imp_CreateIconIndirect,CreateIconIndirect .text.windows - .ftrace1 + .ftrace1 CreateIconIndirect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/CreateMenu.S b/libc/nt/user32/CreateMenu.S index 2c2755836..f3f978730 100644 --- a/libc/nt/user32/CreateMenu.S +++ b/libc/nt/user32/CreateMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_CreateMenu,CreateMenu .text.windows - .ftrace1 + .ftrace1 CreateMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/CreatePopupMenu.S b/libc/nt/user32/CreatePopupMenu.S index 5ff63a9b4..8640547c9 100644 --- a/libc/nt/user32/CreatePopupMenu.S +++ b/libc/nt/user32/CreatePopupMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_CreatePopupMenu,CreatePopupMenu .text.windows - .ftrace1 + .ftrace1 CreatePopupMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/CreateWindowExW.S b/libc/nt/user32/CreateWindowExW.S index 9491dde03..ce258a0e0 100644 --- a/libc/nt/user32/CreateWindowExW.S +++ b/libc/nt/user32/CreateWindowExW.S @@ -2,9 +2,9 @@ .imp user32,__imp_CreateWindowExW,CreateWindowExW .text.windows - .ftrace1 + .ftrace1 CreateWindowEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DefWindowProcW.S b/libc/nt/user32/DefWindowProcW.S index 798b00313..0f30f28a4 100644 --- a/libc/nt/user32/DefWindowProcW.S +++ b/libc/nt/user32/DefWindowProcW.S @@ -2,9 +2,9 @@ .imp user32,__imp_DefWindowProcW,DefWindowProcW .text.windows - .ftrace1 + .ftrace1 DefWindowProc: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DeleteMenu.S b/libc/nt/user32/DeleteMenu.S index 5be482983..df6d252fb 100644 --- a/libc/nt/user32/DeleteMenu.S +++ b/libc/nt/user32/DeleteMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_DeleteMenu,DeleteMenu .text.windows - .ftrace1 + .ftrace1 DeleteMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DestroyIcon.S b/libc/nt/user32/DestroyIcon.S index 8990343c6..b8941b68f 100644 --- a/libc/nt/user32/DestroyIcon.S +++ b/libc/nt/user32/DestroyIcon.S @@ -2,9 +2,9 @@ .imp user32,__imp_DestroyIcon,DestroyIcon .text.windows - .ftrace1 + .ftrace1 DestroyIcon: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DestroyMenu.S b/libc/nt/user32/DestroyMenu.S index c7c543012..602ea02d9 100644 --- a/libc/nt/user32/DestroyMenu.S +++ b/libc/nt/user32/DestroyMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_DestroyMenu,DestroyMenu .text.windows - .ftrace1 + .ftrace1 DestroyMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DestroyWindow.S b/libc/nt/user32/DestroyWindow.S index 10e9e41c8..45d364c81 100644 --- a/libc/nt/user32/DestroyWindow.S +++ b/libc/nt/user32/DestroyWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_DestroyWindow,DestroyWindow .text.windows - .ftrace1 + .ftrace1 DestroyWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DispatchMessageW.S b/libc/nt/user32/DispatchMessageW.S index c6408a0ba..ead562d9e 100644 --- a/libc/nt/user32/DispatchMessageW.S +++ b/libc/nt/user32/DispatchMessageW.S @@ -2,9 +2,9 @@ .imp user32,__imp_DispatchMessageW,DispatchMessageW .text.windows - .ftrace1 + .ftrace1 DispatchMessage: - .ftrace2 + .ftrace2 #ifdef 
__x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DrawTextExW.S b/libc/nt/user32/DrawTextExW.S index 6e02da576..8c1edd0d5 100644 --- a/libc/nt/user32/DrawTextExW.S +++ b/libc/nt/user32/DrawTextExW.S @@ -2,9 +2,9 @@ .imp user32,__imp_DrawTextExW,DrawTextExW .text.windows - .ftrace1 + .ftrace1 DrawTextEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/DrawTextW.S b/libc/nt/user32/DrawTextW.S index 104ef0317..7b6c32c62 100644 --- a/libc/nt/user32/DrawTextW.S +++ b/libc/nt/user32/DrawTextW.S @@ -2,9 +2,9 @@ .imp user32,__imp_DrawTextW,DrawTextW .text.windows - .ftrace1 + .ftrace1 DrawText: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/EndPaint.S b/libc/nt/user32/EndPaint.S index 631394e0d..76c5cb450 100644 --- a/libc/nt/user32/EndPaint.S +++ b/libc/nt/user32/EndPaint.S @@ -2,9 +2,9 @@ .imp user32,__imp_EndPaint,EndPaint .text.windows - .ftrace1 + .ftrace1 EndPaint: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/EnumChildWindows.S b/libc/nt/user32/EnumChildWindows.S index 528023daf..d3ccfec38 100644 --- a/libc/nt/user32/EnumChildWindows.S +++ b/libc/nt/user32/EnumChildWindows.S @@ -2,9 +2,9 @@ .imp user32,__imp_EnumChildWindows,EnumChildWindows .text.windows - .ftrace1 + .ftrace1 EnumChildWindows: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/FillRect.S b/libc/nt/user32/FillRect.S index f0652d8c6..d5b11e264 100644 --- a/libc/nt/user32/FillRect.S +++ b/libc/nt/user32/FillRect.S @@ -2,9 +2,9 @@ .imp user32,__imp_FillRect,FillRect .text.windows - .ftrace1 + .ftrace1 FillRect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/FindWindowExW.S b/libc/nt/user32/FindWindowExW.S index 33d107e52..4e2ab3905 100644 --- a/libc/nt/user32/FindWindowExW.S +++ b/libc/nt/user32/FindWindowExW.S @@ -2,9 +2,9 @@ .imp user32,__imp_FindWindowExW,FindWindowExW .text.windows - .ftrace1 + .ftrace1 FindWindowEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/FindWindowW.S b/libc/nt/user32/FindWindowW.S index 47c4fc6e0..afa9a8028 100644 --- a/libc/nt/user32/FindWindowW.S +++ b/libc/nt/user32/FindWindowW.S @@ -2,9 +2,9 @@ .imp user32,__imp_FindWindowW,FindWindowW .text.windows - .ftrace1 + .ftrace1 FindWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetClientRect.S b/libc/nt/user32/GetClientRect.S index 28f88904d..743b7f614 100644 --- a/libc/nt/user32/GetClientRect.S +++ b/libc/nt/user32/GetClientRect.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetClientRect,GetClientRect .text.windows - .ftrace1 + .ftrace1 GetClientRect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetCursor.S b/libc/nt/user32/GetCursor.S index fe3f4520a..f81efbe5a 100644 --- a/libc/nt/user32/GetCursor.S +++ b/libc/nt/user32/GetCursor.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetCursor,GetCursor .text.windows - .ftrace1 + .ftrace1 GetCursor: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetCursorPos.S b/libc/nt/user32/GetCursorPos.S index 08020daf7..1737ee1d1 100644 --- a/libc/nt/user32/GetCursorPos.S +++ b/libc/nt/user32/GetCursorPos.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetCursorPos,GetCursorPos .text.windows - .ftrace1 + .ftrace1 GetCursorPos: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetDC.S 
b/libc/nt/user32/GetDC.S index 113d2100c..e9ec21aca 100644 --- a/libc/nt/user32/GetDC.S +++ b/libc/nt/user32/GetDC.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetDC,GetDC .text.windows - .ftrace1 + .ftrace1 GetDC: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetDesktopWindow.S b/libc/nt/user32/GetDesktopWindow.S index 699670add..85e827fb9 100644 --- a/libc/nt/user32/GetDesktopWindow.S +++ b/libc/nt/user32/GetDesktopWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetDesktopWindow,GetDesktopWindow .text.windows - .ftrace1 + .ftrace1 GetDesktopWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetKeyState.S b/libc/nt/user32/GetKeyState.S index b001bee54..cb4a06e91 100644 --- a/libc/nt/user32/GetKeyState.S +++ b/libc/nt/user32/GetKeyState.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetKeyState,GetKeyState .text.windows - .ftrace1 + .ftrace1 GetKeyState: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetKeyboardLayout.S b/libc/nt/user32/GetKeyboardLayout.S index ff2f7678e..c8d59e5a9 100644 --- a/libc/nt/user32/GetKeyboardLayout.S +++ b/libc/nt/user32/GetKeyboardLayout.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetKeyboardLayout,GetKeyboardLayout .text.windows - .ftrace1 + .ftrace1 GetKeyboardLayout: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetMenu.S b/libc/nt/user32/GetMenu.S index 3c690d970..f9ab04f2c 100644 --- a/libc/nt/user32/GetMenu.S +++ b/libc/nt/user32/GetMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetMenu,GetMenu .text.windows - .ftrace1 + .ftrace1 GetMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetMessageW.S b/libc/nt/user32/GetMessageW.S index bef0fef10..7a6ef86d3 100644 --- a/libc/nt/user32/GetMessageW.S +++ b/libc/nt/user32/GetMessageW.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetMessageW,GetMessageW .text.windows - .ftrace1 + .ftrace1 GetMessage: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetParent.S b/libc/nt/user32/GetParent.S index 00411cf41..f0c384f03 100644 --- a/libc/nt/user32/GetParent.S +++ b/libc/nt/user32/GetParent.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetParent,GetParent .text.windows - .ftrace1 + .ftrace1 GetParent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetShellWindow.S b/libc/nt/user32/GetShellWindow.S index 50956f532..c5d46cb1d 100644 --- a/libc/nt/user32/GetShellWindow.S +++ b/libc/nt/user32/GetShellWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetShellWindow,GetShellWindow .text.windows - .ftrace1 + .ftrace1 GetShellWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetSystemMenu.S b/libc/nt/user32/GetSystemMenu.S index 8592ffad5..ea641cd9e 100644 --- a/libc/nt/user32/GetSystemMenu.S +++ b/libc/nt/user32/GetSystemMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetSystemMenu,GetSystemMenu .text.windows - .ftrace1 + .ftrace1 GetSystemMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetWindow.S b/libc/nt/user32/GetWindow.S index a37283934..005dc3419 100644 --- a/libc/nt/user32/GetWindow.S +++ b/libc/nt/user32/GetWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetWindow,GetWindow .text.windows - .ftrace1 + .ftrace1 GetWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetWindowPlacement.S b/libc/nt/user32/GetWindowPlacement.S index 
6395ba586..05ae59fa6 100644 --- a/libc/nt/user32/GetWindowPlacement.S +++ b/libc/nt/user32/GetWindowPlacement.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetWindowPlacement,GetWindowPlacement .text.windows - .ftrace1 + .ftrace1 GetWindowPlacement: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetWindowRect.S b/libc/nt/user32/GetWindowRect.S index 1f17fc9a4..a767517b5 100644 --- a/libc/nt/user32/GetWindowRect.S +++ b/libc/nt/user32/GetWindowRect.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetWindowRect,GetWindowRect .text.windows - .ftrace1 + .ftrace1 GetWindowRect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/GetWindowTextW.S b/libc/nt/user32/GetWindowTextW.S index 3f3b2a94f..04e7467d0 100644 --- a/libc/nt/user32/GetWindowTextW.S +++ b/libc/nt/user32/GetWindowTextW.S @@ -2,9 +2,9 @@ .imp user32,__imp_GetWindowTextW,GetWindowTextW .text.windows - .ftrace1 + .ftrace1 GetWindowText: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/InsertMenuW.S b/libc/nt/user32/InsertMenuW.S index 29f7d0966..34ec25e44 100644 --- a/libc/nt/user32/InsertMenuW.S +++ b/libc/nt/user32/InsertMenuW.S @@ -2,9 +2,9 @@ .imp user32,__imp_InsertMenuW,InsertMenuW .text.windows - .ftrace1 + .ftrace1 InsertMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/InvalidateRect.S b/libc/nt/user32/InvalidateRect.S index 122cb3af8..aa5874eb3 100644 --- a/libc/nt/user32/InvalidateRect.S +++ b/libc/nt/user32/InvalidateRect.S @@ -2,9 +2,9 @@ .imp user32,__imp_InvalidateRect,InvalidateRect .text.windows - .ftrace1 + .ftrace1 InvalidateRect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/IsChild.S b/libc/nt/user32/IsChild.S index 9b977c821..72d40b85e 100644 --- a/libc/nt/user32/IsChild.S +++ b/libc/nt/user32/IsChild.S @@ -2,9 +2,9 @@ .imp user32,__imp_IsChild,IsChild .text.windows - .ftrace1 + .ftrace1 IsChild: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/IsIconic.S b/libc/nt/user32/IsIconic.S index 50b87ad83..594acede2 100644 --- a/libc/nt/user32/IsIconic.S +++ b/libc/nt/user32/IsIconic.S @@ -2,9 +2,9 @@ .imp user32,__imp_IsIconic,IsIconic .text.windows - .ftrace1 + .ftrace1 IsIconic: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/IsMenu.S b/libc/nt/user32/IsMenu.S index e03c523e3..f820f34e8 100644 --- a/libc/nt/user32/IsMenu.S +++ b/libc/nt/user32/IsMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_IsMenu,IsMenu .text.windows - .ftrace1 + .ftrace1 IsMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/IsWindow.S b/libc/nt/user32/IsWindow.S index 4b56b4526..fcb6ceca7 100644 --- a/libc/nt/user32/IsWindow.S +++ b/libc/nt/user32/IsWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_IsWindow,IsWindow .text.windows - .ftrace1 + .ftrace1 IsWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/IsWindowVisible.S b/libc/nt/user32/IsWindowVisible.S index 58767213f..4902f2244 100644 --- a/libc/nt/user32/IsWindowVisible.S +++ b/libc/nt/user32/IsWindowVisible.S @@ -2,9 +2,9 @@ .imp user32,__imp_IsWindowVisible,IsWindowVisible .text.windows - .ftrace1 + .ftrace1 IsWindowVisible: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/IsZoomed.S b/libc/nt/user32/IsZoomed.S index 2b7c222ca..11e50f7cc 100644 --- a/libc/nt/user32/IsZoomed.S +++ 
b/libc/nt/user32/IsZoomed.S @@ -2,9 +2,9 @@ .imp user32,__imp_IsZoomed,IsZoomed .text.windows - .ftrace1 + .ftrace1 IsZoomed: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/KillTimer.S b/libc/nt/user32/KillTimer.S index a8b729367..6da6acf78 100644 --- a/libc/nt/user32/KillTimer.S +++ b/libc/nt/user32/KillTimer.S @@ -2,9 +2,9 @@ .imp user32,__imp_KillTimer,KillTimer .text.windows - .ftrace1 + .ftrace1 KillTimer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/LoadCursorW.S b/libc/nt/user32/LoadCursorW.S index 87657a5c9..b4019c66d 100644 --- a/libc/nt/user32/LoadCursorW.S +++ b/libc/nt/user32/LoadCursorW.S @@ -2,9 +2,9 @@ .imp user32,__imp_LoadCursorW,LoadCursorW .text.windows - .ftrace1 + .ftrace1 LoadCursor: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/LoadIconW.S b/libc/nt/user32/LoadIconW.S index a4f8ccb7d..1ab0fbf26 100644 --- a/libc/nt/user32/LoadIconW.S +++ b/libc/nt/user32/LoadIconW.S @@ -2,9 +2,9 @@ .imp user32,__imp_LoadIconW,LoadIconW .text.windows - .ftrace1 + .ftrace1 LoadIcon: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/LoadImageW.S b/libc/nt/user32/LoadImageW.S index bb1614167..9c9787c5e 100644 --- a/libc/nt/user32/LoadImageW.S +++ b/libc/nt/user32/LoadImageW.S @@ -2,9 +2,9 @@ .imp user32,__imp_LoadImageW,LoadImageW .text.windows - .ftrace1 + .ftrace1 LoadImage: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/MapVirtualKeyExW.S b/libc/nt/user32/MapVirtualKeyExW.S index 97fc81085..e8e1a8b8d 100644 --- a/libc/nt/user32/MapVirtualKeyExW.S +++ b/libc/nt/user32/MapVirtualKeyExW.S @@ -2,9 +2,9 @@ .imp user32,__imp_MapVirtualKeyExW,MapVirtualKeyExW .text.windows - .ftrace1 + .ftrace1 MapVirtualKeyEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/MessageBoxExW.S b/libc/nt/user32/MessageBoxExW.S index c2092f7d9..c4182c218 100644 --- a/libc/nt/user32/MessageBoxExW.S +++ b/libc/nt/user32/MessageBoxExW.S @@ -2,9 +2,9 @@ .imp user32,__imp_MessageBoxExW,MessageBoxExW .text.windows - .ftrace1 + .ftrace1 MessageBoxEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/MessageBoxW.S b/libc/nt/user32/MessageBoxW.S index 85345a4b9..6b0dd6bdf 100644 --- a/libc/nt/user32/MessageBoxW.S +++ b/libc/nt/user32/MessageBoxW.S @@ -2,9 +2,9 @@ .imp user32,__imp_MessageBoxW,MessageBoxW .text.windows - .ftrace1 + .ftrace1 MessageBox: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/MoveWindow.S b/libc/nt/user32/MoveWindow.S index 54ad218d6..5a313cd94 100644 --- a/libc/nt/user32/MoveWindow.S +++ b/libc/nt/user32/MoveWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_MoveWindow,MoveWindow .text.windows - .ftrace1 + .ftrace1 MoveWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/PeekMessageW.S b/libc/nt/user32/PeekMessageW.S index f2d9d8d1b..b094563ca 100644 --- a/libc/nt/user32/PeekMessageW.S +++ b/libc/nt/user32/PeekMessageW.S @@ -2,9 +2,9 @@ .imp user32,__imp_PeekMessageW,PeekMessageW .text.windows - .ftrace1 + .ftrace1 PeekMessage: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/PostQuitMessage.S b/libc/nt/user32/PostQuitMessage.S index 79c5de3fa..e45ee7a3c 100644 --- a/libc/nt/user32/PostQuitMessage.S +++ b/libc/nt/user32/PostQuitMessage.S @@ -2,9 +2,9 @@ .imp 
user32,__imp_PostQuitMessage,PostQuitMessage .text.windows - .ftrace1 + .ftrace1 PostQuitMessage: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/RedrawWindow.S b/libc/nt/user32/RedrawWindow.S index 312605512..05d90c361 100644 --- a/libc/nt/user32/RedrawWindow.S +++ b/libc/nt/user32/RedrawWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_RedrawWindow,RedrawWindow .text.windows - .ftrace1 + .ftrace1 RedrawWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/RegisterClassExW.S b/libc/nt/user32/RegisterClassExW.S index 48bf6c0ea..5100077af 100644 --- a/libc/nt/user32/RegisterClassExW.S +++ b/libc/nt/user32/RegisterClassExW.S @@ -2,9 +2,9 @@ .imp user32,__imp_RegisterClassExW,RegisterClassExW .text.windows - .ftrace1 + .ftrace1 RegisterClassEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/RegisterClassW.S b/libc/nt/user32/RegisterClassW.S index 40937b59a..cbc6b6053 100644 --- a/libc/nt/user32/RegisterClassW.S +++ b/libc/nt/user32/RegisterClassW.S @@ -2,9 +2,9 @@ .imp user32,__imp_RegisterClassW,RegisterClassW .text.windows - .ftrace1 + .ftrace1 RegisterClass: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/ReleaseCapture.S b/libc/nt/user32/ReleaseCapture.S index 0916a7ffd..ad9582655 100644 --- a/libc/nt/user32/ReleaseCapture.S +++ b/libc/nt/user32/ReleaseCapture.S @@ -2,9 +2,9 @@ .imp user32,__imp_ReleaseCapture,ReleaseCapture .text.windows - .ftrace1 + .ftrace1 ReleaseCapture: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/ReleaseDC.S b/libc/nt/user32/ReleaseDC.S index 21264c7e4..0d65255b0 100644 --- a/libc/nt/user32/ReleaseDC.S +++ b/libc/nt/user32/ReleaseDC.S @@ -2,9 +2,9 @@ .imp user32,__imp_ReleaseDC,ReleaseDC .text.windows - .ftrace1 + .ftrace1 ReleaseDC: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SendMessageW.S b/libc/nt/user32/SendMessageW.S index d81327891..20f92c623 100644 --- a/libc/nt/user32/SendMessageW.S +++ b/libc/nt/user32/SendMessageW.S @@ -2,9 +2,9 @@ .imp user32,__imp_SendMessageW,SendMessageW .text.windows - .ftrace1 + .ftrace1 SendMessage: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetCapture.S b/libc/nt/user32/SetCapture.S index 20055412b..b489d5183 100644 --- a/libc/nt/user32/SetCapture.S +++ b/libc/nt/user32/SetCapture.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetCapture,SetCapture .text.windows - .ftrace1 + .ftrace1 SetCapture: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetClassLongW.S b/libc/nt/user32/SetClassLongW.S index d5f18bebd..54bacfdfe 100644 --- a/libc/nt/user32/SetClassLongW.S +++ b/libc/nt/user32/SetClassLongW.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetClassLongW,SetClassLongW .text.windows - .ftrace1 + .ftrace1 SetClassLong: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetCursor.S b/libc/nt/user32/SetCursor.S index c5486f5dc..6bcd985c9 100644 --- a/libc/nt/user32/SetCursor.S +++ b/libc/nt/user32/SetCursor.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetCursor,SetCursor .text.windows - .ftrace1 + .ftrace1 SetCursor: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetParent.S b/libc/nt/user32/SetParent.S index e3c8e3dfa..d33f7dd08 100644 --- a/libc/nt/user32/SetParent.S +++ b/libc/nt/user32/SetParent.S @@ -2,9 +2,9 @@ .imp 
user32,__imp_SetParent,SetParent .text.windows - .ftrace1 + .ftrace1 SetParent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetTimer.S b/libc/nt/user32/SetTimer.S index 8d9961554..35658e9eb 100644 --- a/libc/nt/user32/SetTimer.S +++ b/libc/nt/user32/SetTimer.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetTimer,SetTimer .text.windows - .ftrace1 + .ftrace1 SetTimer: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetWindowLongW.S b/libc/nt/user32/SetWindowLongW.S index be98651e9..c6f5f29f2 100644 --- a/libc/nt/user32/SetWindowLongW.S +++ b/libc/nt/user32/SetWindowLongW.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetWindowLongW,SetWindowLongW .text.windows - .ftrace1 + .ftrace1 SetWindowLong: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetWindowPlacement.S b/libc/nt/user32/SetWindowPlacement.S index 0038ecbfb..cc61465fb 100644 --- a/libc/nt/user32/SetWindowPlacement.S +++ b/libc/nt/user32/SetWindowPlacement.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetWindowPlacement,SetWindowPlacement .text.windows - .ftrace1 + .ftrace1 SetWindowPlacement: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetWindowPos.S b/libc/nt/user32/SetWindowPos.S index cf0960512..bc680fc31 100644 --- a/libc/nt/user32/SetWindowPos.S +++ b/libc/nt/user32/SetWindowPos.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetWindowPos,SetWindowPos .text.windows - .ftrace1 + .ftrace1 SetWindowPos: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetWindowTextW.S b/libc/nt/user32/SetWindowTextW.S index eea86c3f2..5a1e0fe57 100644 --- a/libc/nt/user32/SetWindowTextW.S +++ b/libc/nt/user32/SetWindowTextW.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetWindowTextW,SetWindowTextW .text.windows - .ftrace1 + .ftrace1 SetWindowText: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetWindowsHookExW.S b/libc/nt/user32/SetWindowsHookExW.S index 65f219c72..f6b69a8a7 100644 --- a/libc/nt/user32/SetWindowsHookExW.S +++ b/libc/nt/user32/SetWindowsHookExW.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetWindowsHookExW,SetWindowsHookExW .text.windows - .ftrace1 + .ftrace1 SetWindowsHookEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/SetWindowsHookW.S b/libc/nt/user32/SetWindowsHookW.S index 909067516..6767721e8 100644 --- a/libc/nt/user32/SetWindowsHookW.S +++ b/libc/nt/user32/SetWindowsHookW.S @@ -2,9 +2,9 @@ .imp user32,__imp_SetWindowsHookW,SetWindowsHookW .text.windows - .ftrace1 + .ftrace1 SetWindowsHook: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/ShowCaret.S b/libc/nt/user32/ShowCaret.S index a1be01b76..578f85cf2 100644 --- a/libc/nt/user32/ShowCaret.S +++ b/libc/nt/user32/ShowCaret.S @@ -2,9 +2,9 @@ .imp user32,__imp_ShowCaret,ShowCaret .text.windows - .ftrace1 + .ftrace1 ShowCaret: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/ShowCursor.S b/libc/nt/user32/ShowCursor.S index b48a0ebc6..d60f4b1ff 100644 --- a/libc/nt/user32/ShowCursor.S +++ b/libc/nt/user32/ShowCursor.S @@ -2,9 +2,9 @@ .imp user32,__imp_ShowCursor,ShowCursor .text.windows - .ftrace1 + .ftrace1 ShowCursor: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/ShowWindow.S b/libc/nt/user32/ShowWindow.S index 43abe8746..2940cd941 100644 --- a/libc/nt/user32/ShowWindow.S +++ 
b/libc/nt/user32/ShowWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_ShowWindow,ShowWindow .text.windows - .ftrace1 + .ftrace1 ShowWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/TrackPopupMenu.S b/libc/nt/user32/TrackPopupMenu.S index 3e86d1c01..c14ba2eb8 100644 --- a/libc/nt/user32/TrackPopupMenu.S +++ b/libc/nt/user32/TrackPopupMenu.S @@ -2,9 +2,9 @@ .imp user32,__imp_TrackPopupMenu,TrackPopupMenu .text.windows - .ftrace1 + .ftrace1 TrackPopupMenu: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/TranslateMessage.S b/libc/nt/user32/TranslateMessage.S index 01a37634f..9dfecd8fc 100644 --- a/libc/nt/user32/TranslateMessage.S +++ b/libc/nt/user32/TranslateMessage.S @@ -2,9 +2,9 @@ .imp user32,__imp_TranslateMessage,TranslateMessage .text.windows - .ftrace1 + .ftrace1 TranslateMessage: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/UnhookWindowsHook.S b/libc/nt/user32/UnhookWindowsHook.S index b32e5fa06..a080b32d9 100644 --- a/libc/nt/user32/UnhookWindowsHook.S +++ b/libc/nt/user32/UnhookWindowsHook.S @@ -2,9 +2,9 @@ .imp user32,__imp_UnhookWindowsHook,UnhookWindowsHook .text.windows - .ftrace1 + .ftrace1 UnhookWindowsHook: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/UnhookWindowsHookEx.S b/libc/nt/user32/UnhookWindowsHookEx.S index 256e02313..4256822e3 100644 --- a/libc/nt/user32/UnhookWindowsHookEx.S +++ b/libc/nt/user32/UnhookWindowsHookEx.S @@ -2,9 +2,9 @@ .imp user32,__imp_UnhookWindowsHookEx,UnhookWindowsHookEx .text.windows - .ftrace1 + .ftrace1 UnhookWindowsHookEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/UpdateWindow.S b/libc/nt/user32/UpdateWindow.S index 77dc65aeb..5f9a00e6a 100644 --- a/libc/nt/user32/UpdateWindow.S +++ b/libc/nt/user32/UpdateWindow.S @@ -2,9 +2,9 @@ .imp user32,__imp_UpdateWindow,UpdateWindow .text.windows - .ftrace1 + .ftrace1 UpdateWindow: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/user32/WaitForInputIdle.S b/libc/nt/user32/WaitForInputIdle.S index c40ba58c4..a29d5f635 100644 --- a/libc/nt/user32/WaitForInputIdle.S +++ b/libc/nt/user32/WaitForInputIdle.S @@ -2,9 +2,9 @@ .imp user32,__imp_WaitForInputIdle,WaitForInputIdle .text.windows - .ftrace1 + .ftrace1 WaitForInputIdle: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/FreeAddrInfoExW.S b/libc/nt/ws2_32/FreeAddrInfoExW.S index 6f8484c9e..2b5e6cf52 100644 --- a/libc/nt/ws2_32/FreeAddrInfoExW.S +++ b/libc/nt/ws2_32/FreeAddrInfoExW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_FreeAddrInfoExW,FreeAddrInfoExW .text.windows - .ftrace1 + .ftrace1 FreeAddrInfoEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/FreeAddrInfoW.S b/libc/nt/ws2_32/FreeAddrInfoW.S index 98dee1830..e097490fb 100644 --- a/libc/nt/ws2_32/FreeAddrInfoW.S +++ b/libc/nt/ws2_32/FreeAddrInfoW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_FreeAddrInfoW,FreeAddrInfoW .text.windows - .ftrace1 + .ftrace1 FreeAddrInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/GetAddrInfoExCancel.S b/libc/nt/ws2_32/GetAddrInfoExCancel.S index 2b456f290..6672815f2 100644 --- a/libc/nt/ws2_32/GetAddrInfoExCancel.S +++ b/libc/nt/ws2_32/GetAddrInfoExCancel.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_GetAddrInfoExCancel,GetAddrInfoExCancel .text.windows - .ftrace1 + .ftrace1 
GetAddrInfoExCancel: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/GetAddrInfoExOverlappedResult.S b/libc/nt/ws2_32/GetAddrInfoExOverlappedResult.S index 116eaae5f..3bf60177a 100644 --- a/libc/nt/ws2_32/GetAddrInfoExOverlappedResult.S +++ b/libc/nt/ws2_32/GetAddrInfoExOverlappedResult.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_GetAddrInfoExOverlappedResult,GetAddrInfoExOverlappedResult .text.windows - .ftrace1 + .ftrace1 GetAddrInfoExOverlappedResult: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/GetAddrInfoExW.S b/libc/nt/ws2_32/GetAddrInfoExW.S index 5a9288465..14cfb3092 100644 --- a/libc/nt/ws2_32/GetAddrInfoExW.S +++ b/libc/nt/ws2_32/GetAddrInfoExW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_GetAddrInfoExW,GetAddrInfoExW .text.windows - .ftrace1 + .ftrace1 GetAddrInfoEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/GetAddrInfoW.S b/libc/nt/ws2_32/GetAddrInfoW.S index 36f98bf26..5069a891b 100644 --- a/libc/nt/ws2_32/GetAddrInfoW.S +++ b/libc/nt/ws2_32/GetAddrInfoW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_GetAddrInfoW,GetAddrInfoW .text.windows - .ftrace1 + .ftrace1 GetAddrInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/GetHostNameW.S b/libc/nt/ws2_32/GetHostNameW.S index 94d8e2b21..ab3f8c554 100644 --- a/libc/nt/ws2_32/GetHostNameW.S +++ b/libc/nt/ws2_32/GetHostNameW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_GetHostNameW,GetHostNameW .text.windows - .ftrace1 + .ftrace1 GetHostName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/GetNameInfoW.S b/libc/nt/ws2_32/GetNameInfoW.S index 846a590d1..14034198a 100644 --- a/libc/nt/ws2_32/GetNameInfoW.S +++ b/libc/nt/ws2_32/GetNameInfoW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_GetNameInfoW,GetNameInfoW .text.windows - .ftrace1 + .ftrace1 GetNameInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/SetAddrInfoExW.S b/libc/nt/ws2_32/SetAddrInfoExW.S index df51c42ed..9d22457e8 100644 --- a/libc/nt/ws2_32/SetAddrInfoExW.S +++ b/libc/nt/ws2_32/SetAddrInfoExW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_SetAddrInfoExW,SetAddrInfoExW .text.windows - .ftrace1 + .ftrace1 SetAddrInfoEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAAccept.S b/libc/nt/ws2_32/WSAAccept.S index ed271f694..f3a45f116 100644 --- a/libc/nt/ws2_32/WSAAccept.S +++ b/libc/nt/ws2_32/WSAAccept.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAAccept,WSAAccept .text.windows - .ftrace1 + .ftrace1 WSAAccept: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAAddressToStringW.S b/libc/nt/ws2_32/WSAAddressToStringW.S index 87bf5bd59..de741d27a 100644 --- a/libc/nt/ws2_32/WSAAddressToStringW.S +++ b/libc/nt/ws2_32/WSAAddressToStringW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAAddressToStringW,WSAAddressToStringW .text.windows - .ftrace1 + .ftrace1 WSAAddressToString: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAAsyncGetHostByAddr.S b/libc/nt/ws2_32/WSAAsyncGetHostByAddr.S index 953dc2115..742ffaa7e 100644 --- a/libc/nt/ws2_32/WSAAsyncGetHostByAddr.S +++ b/libc/nt/ws2_32/WSAAsyncGetHostByAddr.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAAsyncGetHostByAddr,WSAAsyncGetHostByAddr .text.windows - .ftrace1 + .ftrace1 WSAAsyncGetHostByAddr: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/ws2_32/WSAAsyncGetHostByName.S b/libc/nt/ws2_32/WSAAsyncGetHostByName.S index 4cc2ff532..f2d6a2edf 100644 --- a/libc/nt/ws2_32/WSAAsyncGetHostByName.S +++ b/libc/nt/ws2_32/WSAAsyncGetHostByName.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAAsyncGetHostByName,WSAAsyncGetHostByName .text.windows - .ftrace1 + .ftrace1 WSAAsyncGetHostByName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAAsyncGetProtoByName.S b/libc/nt/ws2_32/WSAAsyncGetProtoByName.S index ab421a152..c8e25cb4f 100644 --- a/libc/nt/ws2_32/WSAAsyncGetProtoByName.S +++ b/libc/nt/ws2_32/WSAAsyncGetProtoByName.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAAsyncGetProtoByName,WSAAsyncGetProtoByName .text.windows - .ftrace1 + .ftrace1 WSAAsyncGetProtoByName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAAsyncGetProtoByNumber.S b/libc/nt/ws2_32/WSAAsyncGetProtoByNumber.S index ea7c3737d..261a4cdca 100644 --- a/libc/nt/ws2_32/WSAAsyncGetProtoByNumber.S +++ b/libc/nt/ws2_32/WSAAsyncGetProtoByNumber.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAAsyncGetProtoByNumber,WSAAsyncGetProtoByNumber .text.windows - .ftrace1 + .ftrace1 WSAAsyncGetProtoByNumber: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSACleanup.S b/libc/nt/ws2_32/WSACleanup.S index 63b9fed73..88d40cacd 100644 --- a/libc/nt/ws2_32/WSACleanup.S +++ b/libc/nt/ws2_32/WSACleanup.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSACleanup,WSACleanup .text.windows - .ftrace1 + .ftrace1 WSACleanup: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSACloseEvent.S b/libc/nt/ws2_32/WSACloseEvent.S index a075aedd0..c716dbddf 100644 --- a/libc/nt/ws2_32/WSACloseEvent.S +++ b/libc/nt/ws2_32/WSACloseEvent.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSACloseEvent,WSACloseEvent .text.windows - .ftrace1 + .ftrace1 WSACloseEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAConnect.S b/libc/nt/ws2_32/WSAConnect.S index 28dae15ba..c2a59a4fd 100644 --- a/libc/nt/ws2_32/WSAConnect.S +++ b/libc/nt/ws2_32/WSAConnect.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAConnect,WSAConnect .text.windows - .ftrace1 + .ftrace1 WSAConnect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAConnectByList.S b/libc/nt/ws2_32/WSAConnectByList.S index 57f2eeef9..c640736ca 100644 --- a/libc/nt/ws2_32/WSAConnectByList.S +++ b/libc/nt/ws2_32/WSAConnectByList.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAConnectByList,WSAConnectByList .text.windows - .ftrace1 + .ftrace1 WSAConnectByList: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAConnectByNameW.S b/libc/nt/ws2_32/WSAConnectByNameW.S index 0994289c1..11563e7cd 100644 --- a/libc/nt/ws2_32/WSAConnectByNameW.S +++ b/libc/nt/ws2_32/WSAConnectByNameW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAConnectByNameW,WSAConnectByNameW .text.windows - .ftrace1 + .ftrace1 WSAConnectByName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSACreateEvent.S b/libc/nt/ws2_32/WSACreateEvent.S index 0a02cc9cb..86781abf9 100644 --- a/libc/nt/ws2_32/WSACreateEvent.S +++ b/libc/nt/ws2_32/WSACreateEvent.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSACreateEvent,WSACreateEvent .text.windows - .ftrace1 + .ftrace1 WSACreateEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSADuplicateSocketW.S b/libc/nt/ws2_32/WSADuplicateSocketW.S index 
c2ccc3a8d..7809bf023 100644 --- a/libc/nt/ws2_32/WSADuplicateSocketW.S +++ b/libc/nt/ws2_32/WSADuplicateSocketW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSADuplicateSocketW,WSADuplicateSocketW .text.windows - .ftrace1 + .ftrace1 WSADuplicateSocket: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAEnumNameSpaceProvidersExW.S b/libc/nt/ws2_32/WSAEnumNameSpaceProvidersExW.S index 1a86366c4..b96af20e6 100644 --- a/libc/nt/ws2_32/WSAEnumNameSpaceProvidersExW.S +++ b/libc/nt/ws2_32/WSAEnumNameSpaceProvidersExW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAEnumNameSpaceProvidersExW,WSAEnumNameSpaceProvidersExW .text.windows - .ftrace1 + .ftrace1 WSAEnumNameSpaceProvidersEx: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAEnumNameSpaceProvidersW.S b/libc/nt/ws2_32/WSAEnumNameSpaceProvidersW.S index aac16a9e6..0ae8f1169 100644 --- a/libc/nt/ws2_32/WSAEnumNameSpaceProvidersW.S +++ b/libc/nt/ws2_32/WSAEnumNameSpaceProvidersW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAEnumNameSpaceProvidersW,WSAEnumNameSpaceProvidersW .text.windows - .ftrace1 + .ftrace1 WSAEnumNameSpaceProviders: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAEnumNetworkEvents.S b/libc/nt/ws2_32/WSAEnumNetworkEvents.S index cadd9e753..be8b706e5 100644 --- a/libc/nt/ws2_32/WSAEnumNetworkEvents.S +++ b/libc/nt/ws2_32/WSAEnumNetworkEvents.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAEnumNetworkEvents,WSAEnumNetworkEvents .text.windows - .ftrace1 + .ftrace1 WSAEnumNetworkEvents: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAEnumProtocolsW.S b/libc/nt/ws2_32/WSAEnumProtocolsW.S index e7d0ba9ca..501ce31ec 100644 --- a/libc/nt/ws2_32/WSAEnumProtocolsW.S +++ b/libc/nt/ws2_32/WSAEnumProtocolsW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAEnumProtocolsW,WSAEnumProtocolsW .text.windows - .ftrace1 + .ftrace1 WSAEnumProtocols: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAEventSelect.S b/libc/nt/ws2_32/WSAEventSelect.S index ff84dcf24..20ca2a4dd 100644 --- a/libc/nt/ws2_32/WSAEventSelect.S +++ b/libc/nt/ws2_32/WSAEventSelect.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAEventSelect,WSAEventSelect .text.windows - .ftrace1 + .ftrace1 WSAEventSelect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAGetLastError.S b/libc/nt/ws2_32/WSAGetLastError.S index 102035f76..7cf7047ec 100644 --- a/libc/nt/ws2_32/WSAGetLastError.S +++ b/libc/nt/ws2_32/WSAGetLastError.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAGetLastError,WSAGetLastError .text.windows - .ftrace1 + .ftrace1 WSAGetLastError: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAGetQOSByName.S b/libc/nt/ws2_32/WSAGetQOSByName.S index e118faf91..6775b5b4f 100644 --- a/libc/nt/ws2_32/WSAGetQOSByName.S +++ b/libc/nt/ws2_32/WSAGetQOSByName.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAGetQOSByName,WSAGetQOSByName .text.windows - .ftrace1 + .ftrace1 WSAGetQOSByName: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAGetServiceClassInfoW.S b/libc/nt/ws2_32/WSAGetServiceClassInfoW.S index eb0476b94..3abee6573 100644 --- a/libc/nt/ws2_32/WSAGetServiceClassInfoW.S +++ b/libc/nt/ws2_32/WSAGetServiceClassInfoW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAGetServiceClassInfoW,WSAGetServiceClassInfoW .text.windows - .ftrace1 + .ftrace1 WSAGetServiceClassInfo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ 
push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAGetServiceClassNameByClassIdW.S b/libc/nt/ws2_32/WSAGetServiceClassNameByClassIdW.S index 3292718a3..8c21e487b 100644 --- a/libc/nt/ws2_32/WSAGetServiceClassNameByClassIdW.S +++ b/libc/nt/ws2_32/WSAGetServiceClassNameByClassIdW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAGetServiceClassNameByClassIdW,WSAGetServiceClassNameByClassIdW .text.windows - .ftrace1 + .ftrace1 WSAGetServiceClassNameByClassId: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAInstallServiceClassW.S b/libc/nt/ws2_32/WSAInstallServiceClassW.S index 7dd19f2e2..37b754b33 100644 --- a/libc/nt/ws2_32/WSAInstallServiceClassW.S +++ b/libc/nt/ws2_32/WSAInstallServiceClassW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAInstallServiceClassW,WSAInstallServiceClassW .text.windows - .ftrace1 + .ftrace1 WSAInstallServiceClass: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAIoctl.S b/libc/nt/ws2_32/WSAIoctl.S index 249792565..ed3c69ba6 100644 --- a/libc/nt/ws2_32/WSAIoctl.S +++ b/libc/nt/ws2_32/WSAIoctl.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAIoctl,WSAIoctl .text.windows - .ftrace1 + .ftrace1 WSAIoctl: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAJoinLeaf.S b/libc/nt/ws2_32/WSAJoinLeaf.S index 2a39bf462..0865e9be1 100644 --- a/libc/nt/ws2_32/WSAJoinLeaf.S +++ b/libc/nt/ws2_32/WSAJoinLeaf.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAJoinLeaf,WSAJoinLeaf .text.windows - .ftrace1 + .ftrace1 WSAJoinLeaf: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSALookupServiceBeginW.S b/libc/nt/ws2_32/WSALookupServiceBeginW.S index f327af282..08d955202 100644 --- a/libc/nt/ws2_32/WSALookupServiceBeginW.S +++ b/libc/nt/ws2_32/WSALookupServiceBeginW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSALookupServiceBeginW,WSALookupServiceBeginW .text.windows - .ftrace1 + .ftrace1 WSALookupServiceBegin: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSALookupServiceEnd.S b/libc/nt/ws2_32/WSALookupServiceEnd.S index 4d1bd5850..d068e9a8d 100644 --- a/libc/nt/ws2_32/WSALookupServiceEnd.S +++ b/libc/nt/ws2_32/WSALookupServiceEnd.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSALookupServiceEnd,WSALookupServiceEnd .text.windows - .ftrace1 + .ftrace1 WSALookupServiceEnd: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSALookupServiceNextW.S b/libc/nt/ws2_32/WSALookupServiceNextW.S index 9df3b7eb3..f0148e728 100644 --- a/libc/nt/ws2_32/WSALookupServiceNextW.S +++ b/libc/nt/ws2_32/WSALookupServiceNextW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSALookupServiceNextW,WSALookupServiceNextW .text.windows - .ftrace1 + .ftrace1 WSALookupServiceNext: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSANSPIoctl.S b/libc/nt/ws2_32/WSANSPIoctl.S index 25ac253b6..7d76a95a6 100644 --- a/libc/nt/ws2_32/WSANSPIoctl.S +++ b/libc/nt/ws2_32/WSANSPIoctl.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSANSPIoctl,WSANSPIoctl .text.windows - .ftrace1 + .ftrace1 WSANSPIoctl: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAPoll.S b/libc/nt/ws2_32/WSAPoll.S index b72842f56..24b7b3bcf 100644 --- a/libc/nt/ws2_32/WSAPoll.S +++ b/libc/nt/ws2_32/WSAPoll.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAPoll,WSAPoll .text.windows - .ftrace1 + .ftrace1 WSAPoll: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git 
a/libc/nt/ws2_32/WSAProviderConfigChange.S b/libc/nt/ws2_32/WSAProviderConfigChange.S index 7e225628c..9f607be19 100644 --- a/libc/nt/ws2_32/WSAProviderConfigChange.S +++ b/libc/nt/ws2_32/WSAProviderConfigChange.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAProviderConfigChange,WSAProviderConfigChange .text.windows - .ftrace1 + .ftrace1 WSAProviderConfigChange: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSARecvDisconnect.S b/libc/nt/ws2_32/WSARecvDisconnect.S index 2bc279b9e..a548691be 100644 --- a/libc/nt/ws2_32/WSARecvDisconnect.S +++ b/libc/nt/ws2_32/WSARecvDisconnect.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSARecvDisconnect,WSARecvDisconnect .text.windows - .ftrace1 + .ftrace1 WSARecvDisconnect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSARemoveServiceClass.S b/libc/nt/ws2_32/WSARemoveServiceClass.S index f6dbc5387..904408a6a 100644 --- a/libc/nt/ws2_32/WSARemoveServiceClass.S +++ b/libc/nt/ws2_32/WSARemoveServiceClass.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSARemoveServiceClass,WSARemoveServiceClass .text.windows - .ftrace1 + .ftrace1 WSARemoveServiceClass: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAResetEvent.S b/libc/nt/ws2_32/WSAResetEvent.S index 790f766c3..c367bd108 100644 --- a/libc/nt/ws2_32/WSAResetEvent.S +++ b/libc/nt/ws2_32/WSAResetEvent.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAResetEvent,WSAResetEvent .text.windows - .ftrace1 + .ftrace1 WSAResetEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASend.S b/libc/nt/ws2_32/WSASend.S index 5c6716eb8..cdab3401d 100644 --- a/libc/nt/ws2_32/WSASend.S +++ b/libc/nt/ws2_32/WSASend.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSASend,WSASend .text.windows - .ftrace1 + .ftrace1 WSASend: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASendDisconnect.S b/libc/nt/ws2_32/WSASendDisconnect.S index 89ea5b35a..361e0a327 100644 --- a/libc/nt/ws2_32/WSASendDisconnect.S +++ b/libc/nt/ws2_32/WSASendDisconnect.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSASendDisconnect,WSASendDisconnect .text.windows - .ftrace1 + .ftrace1 WSASendDisconnect: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASendMsg.S b/libc/nt/ws2_32/WSASendMsg.S index 97f92b776..a76cfc71a 100644 --- a/libc/nt/ws2_32/WSASendMsg.S +++ b/libc/nt/ws2_32/WSASendMsg.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSASendMsg,WSASendMsg .text.windows - .ftrace1 + .ftrace1 WSASendMsg: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASendTo.S b/libc/nt/ws2_32/WSASendTo.S index bce486bb2..96ee99df8 100644 --- a/libc/nt/ws2_32/WSASendTo.S +++ b/libc/nt/ws2_32/WSASendTo.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSASendTo,WSASendTo .text.windows - .ftrace1 + .ftrace1 WSASendTo: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASetEvent.S b/libc/nt/ws2_32/WSASetEvent.S index b260c2570..1c4661f5e 100644 --- a/libc/nt/ws2_32/WSASetEvent.S +++ b/libc/nt/ws2_32/WSASetEvent.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSASetEvent,WSASetEvent .text.windows - .ftrace1 + .ftrace1 WSASetEvent: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASetLastError.S b/libc/nt/ws2_32/WSASetLastError.S index 8b195595e..de265dfa5 100644 --- a/libc/nt/ws2_32/WSASetLastError.S +++ b/libc/nt/ws2_32/WSASetLastError.S @@ -2,9 +2,9 @@ .imp 
ws2_32,__imp_WSASetLastError,WSASetLastError .text.windows - .ftrace1 + .ftrace1 WSASetLastError: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASetServiceW.S b/libc/nt/ws2_32/WSASetServiceW.S index 44f70565c..7aaf2334d 100644 --- a/libc/nt/ws2_32/WSASetServiceW.S +++ b/libc/nt/ws2_32/WSASetServiceW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSASetServiceW,WSASetServiceW .text.windows - .ftrace1 + .ftrace1 WSASetService: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSASocketW.S b/libc/nt/ws2_32/WSASocketW.S index a235b2497..2e313efdb 100644 --- a/libc/nt/ws2_32/WSASocketW.S +++ b/libc/nt/ws2_32/WSASocketW.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSASocketW,WSASocketW .text.windows - .ftrace1 + .ftrace1 WSASocket: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/nt/ws2_32/WSAStartup.S b/libc/nt/ws2_32/WSAStartup.S index 5a0558ad7..80fb46438 100644 --- a/libc/nt/ws2_32/WSAStartup.S +++ b/libc/nt/ws2_32/WSAStartup.S @@ -2,9 +2,9 @@ .imp ws2_32,__imp_WSAStartup,WSAStartup .text.windows - .ftrace1 + .ftrace1 WSAStartup: - .ftrace2 + .ftrace2 #ifdef __x86_64__ push %rbp mov %rsp,%rbp diff --git a/libc/str/nltypes.h b/libc/str/nltypes.h index 6e9366be1..2b351603c 100644 --- a/libc/str/nltypes.h +++ b/libc/str/nltypes.h @@ -9,9 +9,9 @@ COSMOPOLITAN_C_START_ typedef int nl_item; typedef void *nl_catd; -nl_catd catopen(const char *, int) libcesque; -char *catgets(nl_catd, int, int, const char *) libcesque; -int catclose(nl_catd) libcesque; +nl_catd catopen(const char *, int); +char *catgets(nl_catd, int, int, const char *); +int catclose(nl_catd); COSMOPOLITAN_C_END_ #endif /* COSMOPOLITAN_LIBC_STR_NLTYPES_H_ */ diff --git a/libc/thread/pthread_timedjoin_np.c b/libc/thread/pthread_timedjoin_np.c index 9974b9101..f1a1e56a5 100644 --- a/libc/thread/pthread_timedjoin_np.c +++ b/libc/thread/pthread_timedjoin_np.c @@ -38,6 +38,7 @@ static const char *DescribeReturnValue(char buf[30], int err, void **value) { *p++ = '['; p = FormatHex64(p, (uintptr_t)*value, 1); *p++ = ']'; + *p = 0; return buf; } diff --git a/test/libcxx/BUILD.mk b/test/libcxx/BUILD.mk index 17bf2897a..348b4c3b1 100644 --- a/test/libcxx/BUILD.mk +++ b/test/libcxx/BUILD.mk @@ -12,10 +12,14 @@ TEST_LIBCXX_CHECKS = $(TEST_LIBCXX_COMS:%=%.runs) TEST_LIBCXX_TESTS = $(TEST_LIBCXX_COMS:%=%.ok) TEST_LIBCXX_DIRECTDEPS = \ + LIBC_CALLS \ LIBC_INTRIN \ LIBC_NEXGEN32E \ LIBC_RUNTIME \ - THIRD_PARTY_LIBCXX + LIBC_STDIO \ + THIRD_PARTY_LIBCXX \ + THIRD_PARTY_DOUBLECONVERSION \ + THIRD_PARTY_OPENMP TEST_LIBCXX_DEPS := \ $(call uniq,$(foreach x,$(TEST_LIBCXX_DIRECTDEPS),$($(x)))) @@ -34,6 +38,8 @@ o/$(MODE)/test/libcxx/%.com.dbg: \ $(TEST_LIBCXX_OBJS): private CCFLAGS += -fexceptions -frtti +o/$(MODE)/test/libcxx/openmp_test.o: private CXXFLAGS += -fopenmp -O3 + .PHONY: o/$(MODE)/test/libcxx o/$(MODE)/test/libcxx: \ $(TEST_LIBCXX_BINS) \ diff --git a/test/libcxx/openmp_test.cc b/test/libcxx/openmp_test.cc new file mode 100644 index 000000000..ab9c8bf37 --- /dev/null +++ b/test/libcxx/openmp_test.cc @@ -0,0 +1,236 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│ vi: set et ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi │ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2024 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ 
above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/calls/calls.h" +#include "libc/calls/struct/timespec.h" +#include "libc/fmt/itoa.h" +#include "libc/inttypes.h" +#include "libc/runtime/runtime.h" +#include "libc/stdio/stdio.h" +#include "libc/str/str.h" +#include "libc/sysv/consts/clock.h" +#include "third_party/double-conversion/double-to-string.h" +#include "third_party/double-conversion/utils.h" +#include "third_party/openmp/omp.h" + +#ifndef __FAST_MATH__ +#define FLAWLESS 0 +#else +#define FLAWLESS 1e-05 +#endif + +#if defined(__OPTIMIZE__) && !defined(__SANITIZE_ADDRESS__) +#define ITERATIONS 10 +#else +#define ITERATIONS 1 +#endif + +// m×n → (m×n)ᵀ +template +void transpose(long m, long n, const T *A, long sa, T *B, long sb) { +#pragma omp parallel for collapse(2) + for (long i = 0; i < m; ++i) { + for (long j = 0; j < n; ++j) { + B[sb * j + i] = A[sa * i + j]; + } + } +} + +// m×k * k×n → m×n +template +void matmul(long m, long n, long k, const T *A, long sa, const T *B, long sb, + T *C, long sc) { +#pragma omp parallel for collapse(2) + for (long i = 0; i < m; ++i) { + for (long j = 0; j < n; ++j) { + T sum = 0; + for (long l = 0; l < k; ++l) { + sum += A[sa * i + l] * B[sb * l + j]; + } + C[sc * i + j] = sum; + } + } +} + +template +void sgemmk(long k, const T *A, long sa, const T *B, long sb, T *C, long sc) { + T S[BM][BN] = {0}; + for (long l = 0; l < k; ++l) { + for (long i = 0; i < BM; ++i) { + for (long j = 0; j < BN; ++j) { + S[i][j] += A[sa * l + i] * B[sb * l + j]; + } + } + } + for (long i = 0; i < BM; ++i) { + for (long j = 0; j < BN; ++j) { + C[sc * i + j] = S[i][j]; + } + } +} + +// (m×k)ᵀ * k×n → m×n +template +void sgemm(long m, long n, long k, const T *A, long sa, const T *B, long sb, + T *C, long sc) { +#pragma omp parallel for collapse(2) + for (long i = 0; i < m; i += BM) { + for (long j = 0; j < n; j += BN) { + sgemmk(k, A + i, sa, B + j, sb, C + sc * i + j, sc); + } + } +} + +template +void show(long m, long n, const T *A, long sa) { + long max = 4; + printf("{"); + for (long i = 0; i < m; ++i) { + if (i) { + if (i == max) { + printf(", ..."); + break; + } else { + printf(", "); + } + } + printf("{"); + for (long j = 0; j < n; ++j) { + if (j) { + if (j == max) { + printf(", ..."); + break; + } else { + printf(", "); + } + } + printf("%g", static_cast(A[j + i * sa])); + } + printf("}"); + } + printf("}"); +} + +template +double diff(long m, long n, const T *A, long sa, const T *B, long sb) { + double s = 0; + for (long i = 0; i < m; ++i) { + for (long j = 0; j < n; ++j) { + s += fabs(A[sa * i + j] - B[sb * i + j]); + } + } + return s / m / n; +} + +template +void check(double tol, long m, long n, const T *A, long sa, const T *B, long sb, + const char *file, long line) { + double sad = diff(m, n, A, sa, B, sb); + if (sad > tol) { + printf("%s:%d: sad %g exceeds %g\n\twant ", file, line, sad, tol); + show(m, n, A, sa); + 
printf("\n\t got "); + show(m, n, B, sb); + printf("\n"); + exit(1); + } +} + +#define check(tol, m, n, A, sa, B, sb) \ + check(tol, m, n, A, sa, B, sb, __FILE__, __LINE__) + +long micros(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000; +} + +#define bench(x) \ + do { \ + long t1 = micros(); \ + for (long i = 0; i < ITERATIONS; ++i) { \ + asm volatile("" ::: "memory"); \ + x; \ + asm volatile("" ::: "memory"); \ + } \ + long t2 = micros(); \ + printf("%8" PRId64 " µs %s\n", (t2 - t1 + ITERATIONS - 1) / ITERATIONS, \ + #x); \ + } while (0) + +unsigned long rando(void) { + static unsigned long s; + unsigned long z = (s += 0x9e3779b97f4a7c15); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); +} + +double real01(unsigned long x) { // (0,1) + return 1. / 4503599627370496. * ((x >> 12) + .5); +} + +double numba(void) { // (-1,1) + return real01(rando()) * 2 - 1; +} + +template +void fill(T *A, long n) { + for (long i = 0; i < n; ++i) { + A[i] = numba(); + } +} + +void check_reference_gemm_is_ok(void) { + constexpr long m = 2; + constexpr long n = 2; + constexpr long k = 2; + float A[m][k] = {{1, 2}, {3, 4}}; + float B[k][n] = {{5, 6}, {7, 8}}; + float C[m][n] = {{666, 666}, {666, 666}}; + float G[m][n] = {{19, 22}, {43, 50}}; + bench(matmul(m, n, k, (float *)A, k, (float *)B, n, (float *)C, n)); + check(FLAWLESS, m, n, (float *)G, n, (float *)C, n); +} + +void check_transposed_blocking_gemm_is_ok(void) { + long m = 1024; + long k = 512; + long n = 80; + float *A = new float[m * k]; + float *B = new float[k * n]; + float *C = new float[m * n]; + float *D = new float[m * n]; + fill(A, m * k); + fill(B, k * n); + bench(matmul(m, n, k, A, k, B, n, C, n)); + float *At = new float[k * m]; + bench(transpose(m, k, A, k, At, m)); + bench((sgemm<8, 4>(m, n, k, At, m, B, n, D, n))); + check(FLAWLESS, m, n, C, n, D, n); + delete[] At; + delete[] D; + delete[] C; + delete[] B; + delete[] A; +} + +int main(int argc, char *argv[]) { + check_reference_gemm_is_ok(); + check_transposed_blocking_gemm_is_ok(); +} diff --git a/third_party/BUILD.mk b/third_party/BUILD.mk index 72b213b9f..36eef50ad 100644 --- a/third_party/BUILD.mk +++ b/third_party/BUILD.mk @@ -28,6 +28,7 @@ o/$(MODE)/third_party: \ o/$(MODE)/third_party/musl \ o/$(MODE)/third_party/ncurses \ o/$(MODE)/third_party/nsync \ + o/$(MODE)/third_party/openmp \ o/$(MODE)/third_party/pcre \ o/$(MODE)/third_party/puff \ o/$(MODE)/third_party/python \ diff --git a/third_party/compiler_rt/fp_compare_impl.inc b/third_party/compiler_rt/fp_compare_impl.inc new file mode 100644 index 000000000..1a6b75e30 --- /dev/null +++ b/third_party/compiler_rt/fp_compare_impl.inc @@ -0,0 +1,119 @@ +//===-- lib/fp_compare_impl.inc - Floating-point comparison -------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "fp_lib.inc" + +// GCC uses long (at least for x86_64) as the return type of the comparison +// functions. We need to ensure that the return value is sign-extended in the +// same way as GCC expects (since otherwise GCC-generated __builtin_isinf +// returns true for finite 128-bit floating-point numbers). 
+#ifdef __aarch64__ +// AArch64 GCC overrides libgcc_cmp_return to use int instead of long. +typedef int CMP_RESULT; +#elif __SIZEOF_POINTER__ == 8 && __SIZEOF_LONG__ == 4 +// LLP64 ABIs use long long instead of long. +typedef long long CMP_RESULT; +#elif __AVR__ +// AVR uses a single byte for the return value. +typedef char CMP_RESULT; +#else +// Otherwise the comparison functions return long. +typedef long CMP_RESULT; +#endif + +#if !defined(__clang__) && defined(__GNUC__) +// GCC uses a special __libgcc_cmp_return__ mode to define the return type, so +// check that we are ABI-compatible when compiling the builtins with GCC. +typedef int GCC_CMP_RESULT __attribute__((__mode__(__libgcc_cmp_return__))); +_Static_assert(sizeof(GCC_CMP_RESULT) == sizeof(CMP_RESULT), + "SOFTFP ABI not compatible with GCC"); +#endif + +enum { + LE_LESS = -1, + LE_EQUAL = 0, + LE_GREATER = 1, + LE_UNORDERED = 1, +}; + +static inline CMP_RESULT __leXf2__(fp_t a, fp_t b) { + const srep_t aInt = toRep(a); + const srep_t bInt = toRep(b); + const rep_t aAbs = aInt & absMask; + const rep_t bAbs = bInt & absMask; + + // If either a or b is NaN, they are unordered. + if (aAbs > infRep || bAbs > infRep) + return LE_UNORDERED; + + // If a and b are both zeros, they are equal. + if ((aAbs | bAbs) == 0) + return LE_EQUAL; + + // If at least one of a and b is positive, we get the same result comparing + // a and b as signed integers as we would with a floating-point compare. + if ((aInt & bInt) >= 0) { + if (aInt < bInt) + return LE_LESS; + else if (aInt == bInt) + return LE_EQUAL; + else + return LE_GREATER; + } else { + // Otherwise, both are negative, so we need to flip the sense of the + // comparison to get the correct result. (This assumes a twos- or ones- + // complement integer representation; if integers are represented in a + // sign-magnitude representation, then this flip is incorrect). 
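// Worked example of that flip, for the double-precision instantiation:
// toRep(-1.0) is 0xBFF0000000000000 and toRep(-2.0) is 0xC000000000000000.
// Interpreted as signed integers, toRep(-2.0) > toRep(-1.0), even though
// -2.0 < -1.0 as floating-point values, so for two negative operands a larger
// signed representation means a smaller float and the comparison below runs
// in the opposite direction.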
+ if (aInt > bInt) + return LE_LESS; + else if (aInt == bInt) + return LE_EQUAL; + else + return LE_GREATER; + } +} + +enum { + GE_LESS = -1, + GE_EQUAL = 0, + GE_GREATER = 1, + GE_UNORDERED = -1 // Note: different from LE_UNORDERED +}; + +static inline CMP_RESULT __geXf2__(fp_t a, fp_t b) { + const srep_t aInt = toRep(a); + const srep_t bInt = toRep(b); + const rep_t aAbs = aInt & absMask; + const rep_t bAbs = bInt & absMask; + + if (aAbs > infRep || bAbs > infRep) + return GE_UNORDERED; + if ((aAbs | bAbs) == 0) + return GE_EQUAL; + if ((aInt & bInt) >= 0) { + if (aInt < bInt) + return GE_LESS; + else if (aInt == bInt) + return GE_EQUAL; + else + return GE_GREATER; + } else { + if (aInt > bInt) + return GE_LESS; + else if (aInt == bInt) + return GE_EQUAL; + else + return GE_GREATER; + } +} + +static inline CMP_RESULT __unordXf2__(fp_t a, fp_t b) { + const rep_t aAbs = toRep(a) & absMask; + const rep_t bAbs = toRep(b) & absMask; + return aAbs > infRep || bAbs > infRep; +} diff --git a/third_party/libcxx/ctime b/third_party/libcxx/ctime index 7dfbae790..4879de5bb 100644 --- a/third_party/libcxx/ctime +++ b/third_party/libcxx/ctime @@ -11,10 +11,14 @@ #define _LIBCPP_CTIME #include "third_party/libcxx/__config" -#include "libc/calls/calls.h" #include "libc/calls/struct/timespec.h" +#include "libc/calls/struct/timeval.h" +#include "libc/sysv/consts/clock.h" +#include "libc/sysv/consts/sched.h" +#include "libc/sysv/consts/timer.h" #include "libc/calls/weirdtypes.h" #include "libc/time/struct/tm.h" +#include "libc/calls/calls.h" #include "libc/time/time.h" #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/third_party/libcxxabi/BUILD.mk b/third_party/libcxxabi/BUILD.mk index de4444019..062c9e827 100644 --- a/third_party/libcxxabi/BUILD.mk +++ b/third_party/libcxxabi/BUILD.mk @@ -83,6 +83,7 @@ $(THIRD_PARTY_LIBCXXABI_A).pkg: \ # TODO: Remove constinit hacks when we have C++20 $(THIRD_PARTY_LIBCXXABI_A_OBJS): private \ CXXFLAGS += \ + -fno-sanitize=all \ -ffunction-sections \ -fdata-sections \ -fexceptions \ diff --git a/third_party/libcxxabi/cxa_personality.cc b/third_party/libcxxabi/cxa_personality.cc index 0ad930749..38346323a 100644 --- a/third_party/libcxxabi/cxa_personality.cc +++ b/third_party/libcxxabi/cxa_personality.cc @@ -202,7 +202,7 @@ enum /// @link http://dwarfstd.org/Dwarf4.pdf @unlink /// @param data reference variable holding memory pointer to decode from /// @returns decoded value -static dontasan +static uintptr_t readULEB128(const uint8_t** data) { @@ -225,7 +225,7 @@ readULEB128(const uint8_t** data) /// @link http://dwarfstd.org/Dwarf4.pdf @unlink /// @param data reference variable holding memory pointer to decode from /// @returns decoded value -static dontasan +static intptr_t readSLEB128(const uint8_t** data) { @@ -542,7 +542,7 @@ struct scan_results } // unnamed namespace -static dontasan +static void set_registers(_Unwind_Exception* unwind_exception, _Unwind_Context* context, const scan_results& results) @@ -581,7 +581,6 @@ set_registers(_Unwind_Exception* unwind_exception, _Unwind_Context* context, _UA_CLEANUP_PHASE && !_UA_HANDLER_FRAME */ -dontasan static void scan_eh_tab(scan_results &results, _Unwind_Action actions, bool native_exception, _Unwind_Exception *unwind_exception, @@ -912,7 +911,6 @@ static _Unwind_Reason_Code __gxx_personality_imp #else _LIBCXXABI_FUNC_VIS _Unwind_Reason_Code #ifdef __USING_SJLJ_EXCEPTIONS__ -dontasan __gxx_personality_sj0 #elif defined(__MVS__) __zos_cxx_personality_v2 diff --git a/third_party/libunwind/BUILD.mk 
b/third_party/libunwind/BUILD.mk index cbdf7b55d..cc2274eb5 100644 --- a/third_party/libunwind/BUILD.mk +++ b/third_party/libunwind/BUILD.mk @@ -68,8 +68,16 @@ $(THIRD_PARTY_LIBUNWIND_A).pkg: \ $(THIRD_PARTY_LIBUNWIND_A_OBJS) \ $(foreach x,$(THIRD_PARTY_LIBUNWIND_A_DIRECTDEPS),$($(x)_A).pkg) +$(THIRD_PARTY_LIBUNWIND_A_OBJS): private \ + CFLAGS += \ + -fno-sanitize=all \ + -ffunction-sections \ + -fdata-sections \ + -D_LIBUNWIND_USE_DLADDR=0 + $(THIRD_PARTY_LIBUNWIND_A_OBJS): private \ CXXFLAGS += \ + -fno-sanitize=all \ -ffunction-sections \ -fdata-sections \ -D_LIBUNWIND_USE_DLADDR=0 diff --git a/third_party/libunwind/README.cosmo b/third_party/libunwind/README.cosmo index 3ed1afde6..fc8f5f242 100644 --- a/third_party/libunwind/README.cosmo +++ b/third_party/libunwind/README.cosmo @@ -15,5 +15,3 @@ LOCAL CHANGES - Fixed `_Unwind_FunctionContext` struct to be ABI-compatible with code generated by GCC. - - - Added `dontasan` annotations to functions that raised ASAN errors. diff --git a/third_party/libunwind/Unwind-sjlj.c b/third_party/libunwind/Unwind-sjlj.c index 085603321..514358e5b 100644 --- a/third_party/libunwind/Unwind-sjlj.c +++ b/third_party/libunwind/Unwind-sjlj.c @@ -105,7 +105,7 @@ __Unwind_SjLj_SetTopOfFunctionStack(struct _Unwind_FunctionContext *fc) { /// Called at start of each function that catches exceptions -_LIBUNWIND_EXPORT void dontasan +_LIBUNWIND_EXPORT void _Unwind_SjLj_Register(struct _Unwind_FunctionContext *fc) { fc->prev = __Unwind_SjLj_GetTopOfFunctionStack(); __Unwind_SjLj_SetTopOfFunctionStack(fc); @@ -113,7 +113,7 @@ _Unwind_SjLj_Register(struct _Unwind_FunctionContext *fc) { /// Called at end of each function that catches exceptions -_LIBUNWIND_EXPORT void dontasan +_LIBUNWIND_EXPORT void _Unwind_SjLj_Unregister(struct _Unwind_FunctionContext *fc) { __Unwind_SjLj_SetTopOfFunctionStack(fc->prev); } @@ -426,7 +426,6 @@ _LIBUNWIND_EXPORT uintptr_t _Unwind_GetGR(struct _Unwind_Context *context, /// Called by personality handler during phase 2 to alter register values. -dontasan _LIBUNWIND_EXPORT void _Unwind_SetGR(struct _Unwind_Context *context, int index, uintptr_t new_value) { _LIBUNWIND_TRACE_API("_Unwind_SetGR(context=%p, reg=%d, value=0x%" PRIuPTR diff --git a/third_party/musl/catclose.c b/third_party/musl/catclose.c new file mode 100644 index 000000000..54e24dd21 --- /dev/null +++ b/third_party/musl/catclose.c @@ -0,0 +1,14 @@ +#define _BSD_SOURCE +#include +#include +#include +#include + +#define V(p) be32toh(*(uint32_t *)(p)) + +int catclose (nl_catd catd) +{ + char *map = (char *)catd; + munmap(map, V(map+8)+20); + return 0; +} diff --git a/third_party/musl/catgets.c b/third_party/musl/catgets.c new file mode 100644 index 000000000..71c31c1d6 --- /dev/null +++ b/third_party/musl/catgets.c @@ -0,0 +1,38 @@ +#define _BSD_SOURCE +#include +#include +#include +#include +#include + +#define V(p) be32toh(*(uint32_t *)(p)) + +static int cmp(const void *a, const void *b) +{ + uint32_t x = V(a), y = V(b); + return xy ? 
1 : 0; +} + +char *catgets (nl_catd catd, int set_id, int msg_id, const char *s) +{ + const char *map = (const char *)catd; + uint32_t nsets = V(map+4); + const char *sets = map+20; + const char *msgs = map+20+V(map+12); + const char *strings = map+20+V(map+16); + uint32_t set_id_be = htobe32(set_id); + uint32_t msg_id_be = htobe32(msg_id); + const char *set = bsearch(&set_id_be, sets, nsets, 12, cmp); + if (!set) { + errno = ENOMSG; + return (char *)s; + } + uint32_t nmsgs = V(set+4); + msgs += 12*V(set+8); + const char *msg = bsearch(&msg_id_be, msgs, nmsgs, 12, cmp); + if (!msg) { + errno = ENOMSG; + return (char *)s; + } + return (char *)(strings + V(msg+8)); +} diff --git a/third_party/musl/catopen.c b/third_party/musl/catopen.c new file mode 100644 index 000000000..fc73e95ed --- /dev/null +++ b/third_party/musl/catopen.c @@ -0,0 +1,79 @@ +#define _BSD_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "third_party/musl/mapfile.internal.h" +#include + +#define V(p) be32toh(*(uint32_t *)(p)) + +static nl_catd do_catopen(const char *name) +{ + size_t size; + const unsigned char *map = __map_file(name, &size); + /* Size recorded in the file must match file size; otherwise + * the information needed to unmap the file will be lost. */ + if (!map || V(map) != 0xff88ff89 || 20+V(map+8) != size) { + if(map) munmap((void *)map, size); + errno = ENOENT; + return (nl_catd)-1; + } + return (nl_catd)map; +} + +nl_catd catopen(const char *name, int oflag) +{ + nl_catd catd; + + if (strchr(name, '/')) return do_catopen(name); + + char buf[PATH_MAX]; + size_t i; + const char *path, *lang, *p, *z; + if (issetugid() || !(path = getenv("NLSPATH"))) { + errno = ENOENT; + return (nl_catd)-1; + } + lang = oflag ? nl_langinfo(_NL_LOCALE_NAME(LC_MESSAGES)) : getenv("LANG"); + if (!lang) lang = ""; + for (p=path; *p; p=z) { + i = 0; + z = strchrnul(p, ':'); + for (; p= sizeof buf - i) { + break; + } + memcpy(buf+i, v, l); + i += l; + } + if (!*z && (p +#include +#include + +const char unsigned *__map_file(const char *pathname, size_t *size) +{ + struct stat st; + const unsigned char *map = MAP_FAILED; + int fd = open(pathname, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) return 0; + if (!fstat(fd, &st)) { + map = mmap(0, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + *size = st.st_size; + } + close(fd); + return map == MAP_FAILED ? 
0 : map; +} diff --git a/third_party/musl/mapfile.internal.h b/third_party/musl/mapfile.internal.h new file mode 100644 index 000000000..1d12509bc --- /dev/null +++ b/third_party/musl/mapfile.internal.h @@ -0,0 +1,8 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_MUSL_MAPFILE_INTERNAL_H_ +#define COSMOPOLITAN_THIRD_PARTY_MUSL_MAPFILE_INTERNAL_H_ +COSMOPOLITAN_C_START_ + +const char unsigned *__map_file(const char *, size_t *); + +COSMOPOLITAN_C_END_ +#endif /* COSMOPOLITAN_THIRD_PARTY_MUSL_MAPFILE_INTERNAL_H_ */ diff --git a/third_party/openmp/BUILD.mk b/third_party/openmp/BUILD.mk new file mode 100644 index 000000000..d788914aa --- /dev/null +++ b/third_party/openmp/BUILD.mk @@ -0,0 +1,82 @@ +#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐ +#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘ + +PKGS += THIRD_PARTY_OPENMP + +THIRD_PARTY_OPENMP_ARTIFACTS += THIRD_PARTY_OPENMP_A +THIRD_PARTY_OPENMP = $(THIRD_PARTY_OPENMP_A_DEPS) $(THIRD_PARTY_OPENMP_A) +THIRD_PARTY_OPENMP_A = o/$(MODE)/third_party/openmp/openmp.a +THIRD_PARTY_OPENMP_A_FILES := $(wildcard third_party/openmp/*) +THIRD_PARTY_OPENMP_A_HDRS = $(filter %.h,$(THIRD_PARTY_OPENMP_A_FILES)) +THIRD_PARTY_OPENMP_A_INCS = $(filter %.inc,$(THIRD_PARTY_OPENMP_A_FILES)) +THIRD_PARTY_OPENMP_A_SRCS_CPP = $(filter %.cpp,$(THIRD_PARTY_OPENMP_A_FILES)) +THIRD_PARTY_OPENMP_A_SRCS_S = $(filter %.S,$(THIRD_PARTY_OPENMP_A_FILES)) +THIRD_PARTY_OPENMP_A_SRCS = $(THIRD_PARTY_OPENMP_A_SRCS_CPP) $(THIRD_PARTY_OPENMP_A_SRCS_S) +THIRD_PARTY_OPENMP_A_OBJS_CPP = $(THIRD_PARTY_OPENMP_A_SRCS_CPP:%.cpp=o/$(MODE)/%.o) +THIRD_PARTY_OPENMP_A_OBJS_S = $(THIRD_PARTY_OPENMP_A_SRCS_S:%.S=o/$(MODE)/%.o) +THIRD_PARTY_OPENMP_A_OBJS = $(THIRD_PARTY_OPENMP_A_OBJS_CPP) $(THIRD_PARTY_OPENMP_A_OBJS_S) + +THIRD_PARTY_OPENMP_A_DIRECTDEPS = \ + LIBC_CALLS \ + LIBC_DLOPEN \ + LIBC_FMT \ + LIBC_INTRIN \ + LIBC_MEM \ + LIBC_NEXGEN32E \ + LIBC_PROC \ + LIBC_RUNTIME \ + LIBC_STDIO \ + LIBC_STR \ + LIBC_SYSV \ + LIBC_SYSV_CALLS \ + LIBC_THREAD \ + THIRD_PARTY_COMPILER_RT \ + THIRD_PARTY_GDTOA \ + THIRD_PARTY_LIBCXX \ + THIRD_PARTY_NSYNC \ + THIRD_PARTY_MUSL + +THIRD_PARTY_OPENMP_A_DEPS := \ + $(call uniq,$(foreach x,$(THIRD_PARTY_OPENMP_A_DIRECTDEPS),$($(x)))) + +THIRD_PARTY_OPENMP_A_CHECKS = \ + $(THIRD_PARTY_OPENMP_A).pkg + +$(THIRD_PARTY_OPENMP_A): \ + third_party/openmp/ \ + $(THIRD_PARTY_OPENMP_A).pkg \ + $(THIRD_PARTY_OPENMP_A_OBJS) + +$(THIRD_PARTY_OPENMP_A).pkg: \ + $(THIRD_PARTY_OPENMP_A_OBJS) \ + $(foreach x,$(THIRD_PARTY_OPENMP_A_DIRECTDEPS),$($(x)_A).pkg) + +$(THIRD_PARTY_OPENMP_A_OBJS): private \ + COPTS += \ + -fno-sanitize=all \ + -fdata-sections \ + -ffunction-sections \ + -Wno-maybe-uninitialized \ + -Wno-stringop-truncation \ + -Wno-class-memaccess \ + -Wno-unused-but-set-variable \ + -Wno-frame-address \ + -fno-strict-aliasing + +o/$(MODE)/third_party/openmp/util1.o: private COPTS += -fportcosmo + +# these assembly files are safe to build on aarch64 +o/$(MODE)/third_party/openmp/util2.o: third_party/openmp/util2.S + @$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $< + +THIRD_PARTY_OPENMP_LIBS = $(foreach x,$(THIRD_PARTY_OPENMP_ARTIFACTS),$($(x))) +THIRD_PARTY_OPENMP_SRCS = $(foreach x,$(THIRD_PARTY_OPENMP_ARTIFACTS),$($(x)_SRCS)) +THIRD_PARTY_OPENMP_HDRS = $(foreach x,$(THIRD_PARTY_OPENMP_ARTIFACTS),$($(x)_HDRS)) +THIRD_PARTY_OPENMP_INCS = $(foreach x,$(THIRD_PARTY_OPENMP_ARTIFACTS),$($(x)_INCS)) +THIRD_PARTY_OPENMP_CHECKS = $(foreach x,$(THIRD_PARTY_OPENMP_ARTIFACTS),$($(x)_CHECKS)) +THIRD_PARTY_OPENMP_OBJS = $(foreach 
x,$(THIRD_PARTY_OPENMP_ARTIFACTS),$($(x)_OBJS)) +$(THIRD_PARTY_OPENMP_OBJS): third_party/openmp/BUILD.mk + +.PHONY: o/$(MODE)/third_party/openmp +o/$(MODE)/third_party/openmp: \ + $(THIRD_PARTY_OPENMP_CHECKS) diff --git a/third_party/openmp/README.cosmo b/third_party/openmp/README.cosmo new file mode 100644 index 000000000..87543d2c9 --- /dev/null +++ b/third_party/openmp/README.cosmo @@ -0,0 +1,16 @@ +DESCRIPTION + + libomp - LLVM's Compiler Runtime for Multiprocessing + +ORIGIN + + https://github.com/llvm/llvm-project/ + commit 70c3e30e01bd123e87824e36b6e38a39451ac28d + date Mon Jan 29 09:54:34 2024 +0800 + +LOCAL CHANGES + + - Use Cosmo's gettid() function + - Ran third_party/openmp/generate.sh + - Removed usage of syscall() function + - Turned off quad floating point support (why does openmp have it?) diff --git a/third_party/openmp/generate.sh b/third_party/openmp/generate.sh new file mode 100755 index 000000000..c98353dfa --- /dev/null +++ b/third_party/openmp/generate.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +perl ~/vendor/llvm-project/openmp/runtime/tools/message-converter.pl \ + --os=lin --prefix=kmp_i18n --enum=third_party/openmp/kmp_i18n_id.inc \ + ~/vendor/llvm-project/openmp/runtime/src/i18n/en_US.txt || exit + +perl ~/vendor/llvm-project/openmp/runtime/tools/message-converter.pl \ + --os=lin --prefix=kmp_i18n --default=third_party/openmp/kmp_i18n_default.inc \ + ~/vendor/llvm-project/openmp/runtime/src/i18n/en_US.txt diff --git a/third_party/openmp/kmp.h b/third_party/openmp/kmp.h new file mode 100644 index 000000000..2339aad3c --- /dev/null +++ b/third_party/openmp/kmp.h @@ -0,0 +1,4835 @@ +/*! \file */ +/* + * kmp.h -- KPTS runtime header file. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_H +#define KMP_H + +#include "kmp_config.h" + +/* #define BUILD_PARALLEL_ORDERED 1 */ + +/* This fix replaces gettimeofday with clock_gettime for better scalability on + the Altix. Requires user code to be linked with -lrt. */ +//#define FIX_SGI_CLOCK + +/* Defines for OpenMP 3.0 tasking and auto scheduling */ + +#ifndef KMP_STATIC_STEAL_ENABLED +#define KMP_STATIC_STEAL_ENABLED 1 +#endif +#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \ + (KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64)) + +#define TASK_CURRENT_NOT_QUEUED 0 +#define TASK_CURRENT_QUEUED 1 + +#ifdef BUILD_TIED_TASK_STACK +#define TASK_STACK_EMPTY 0 // entries when the stack is empty +#define TASK_STACK_BLOCK_BITS 5 // Used in TASK_STACK_SIZE and TASK_STACK_MASK +// Number of entries in each task stack array +#define TASK_STACK_BLOCK_SIZE (1 << TASK_STACK_BLOCK_BITS) +// Mask for determining index into stack block +#define TASK_STACK_INDEX_MASK (TASK_STACK_BLOCK_SIZE - 1) +#endif // BUILD_TIED_TASK_STACK + +#define TASK_NOT_PUSHED 1 +#define TASK_SUCCESSFULLY_PUSHED 0 +#define TASK_TIED 1 +#define TASK_UNTIED 0 +#define TASK_EXPLICIT 1 +#define TASK_IMPLICIT 0 +#define TASK_PROXY 1 +#define TASK_FULL 0 +#define TASK_DETACHABLE 1 +#define TASK_UNDETACHABLE 0 + +#define KMP_CANCEL_THREADS +#define KMP_THREAD_ATTR + +// Android does not have pthread_cancel. 
Undefine KMP_CANCEL_THREADS if being +// built on Android +#if defined(__ANDROID__) +#undef KMP_CANCEL_THREADS +#endif + +// Some WASI targets (e.g., wasm32-wasi-threads) do not support thread +// cancellation. +#if KMP_OS_WASI +#undef KMP_CANCEL_THREADS +#endif + +#if !KMP_OS_WASI +#include +#endif +#include +#include +#include +#include +#include +#include +#include +/* include don't use; problems with /MD on Windows* OS NT due to bad + Microsoft library. Some macros provided below to replace these functions */ +#ifndef __ABSOFT_WIN +#include +#endif +#include +#include + +#include + +#include "kmp_os.h" + +#include "kmp_safe_c_api.h" + +#if KMP_STATS_ENABLED +class kmp_stats_list; +#endif + +#if KMP_USE_HIER_SCHED +// Only include hierarchical scheduling if affinity is supported +#undef KMP_USE_HIER_SCHED +#define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED +#endif + +#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED +// #include "hwloc.h" +#ifndef HWLOC_OBJ_NUMANODE +#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE +#endif +#ifndef HWLOC_OBJ_PACKAGE +#define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET +#endif +#endif + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#include +#endif + +// The below has to be defined before including "kmp_barrier.h". +#define KMP_INTERNAL_MALLOC(sz) malloc(sz) +#define KMP_INTERNAL_FREE(p) free(p) +#define KMP_INTERNAL_REALLOC(p, sz) realloc((p), (sz)) +#define KMP_INTERNAL_CALLOC(n, sz) calloc((n), (sz)) + +#include "kmp_debug.h" +#include "kmp_lock.h" +#include "kmp_version.h" +#include "kmp_barrier.h" +#if USE_DEBUGGER +#include "kmp_debugger.h" +#endif +#include "kmp_i18n.h" + +#define KMP_HANDLE_SIGNALS ((KMP_OS_UNIX && !KMP_OS_WASI) || KMP_OS_WINDOWS) + +#include "kmp_wrapper_malloc.h" +#if KMP_OS_UNIX +#include +#if !defined NSIG && defined _NSIG +#define NSIG _NSIG +#endif +#endif + +#if KMP_OS_LINUX +#pragma weak clock_gettime +#endif + +#if OMPT_SUPPORT +#include "ompt-internal.h" +#endif + +#if OMPD_SUPPORT +#include "ompd-specific.h" +#endif + +#ifndef UNLIKELY +#define UNLIKELY(x) (x) +#endif + +// Affinity format function +#include "kmp_str.h" + +// 0 - no fast memory allocation, alignment: 8-byte on x86, 16-byte on x64. +// 3 - fast allocation using sync, non-sync free lists of any size, non-self +// free lists of limited size. +#ifndef USE_FAST_MEMORY +#define USE_FAST_MEMORY 3 +#endif + +#ifndef KMP_NESTED_HOT_TEAMS +#define KMP_NESTED_HOT_TEAMS 0 +#define USE_NESTED_HOT_ARG(x) +#else +#if KMP_NESTED_HOT_TEAMS +#define USE_NESTED_HOT_ARG(x) , x +#else +#define USE_NESTED_HOT_ARG(x) +#endif +#endif + +// Assume using BGET compare_exchange instruction instead of lock by default. +#ifndef USE_CMP_XCHG_FOR_BGET +#define USE_CMP_XCHG_FOR_BGET 1 +#endif + +// Test to see if queuing lock is better than bootstrap lock for bget +// #ifndef USE_QUEUING_LOCK_FOR_BGET +// #define USE_QUEUING_LOCK_FOR_BGET +// #endif + +#define KMP_NSEC_PER_SEC 1000000000L +#define KMP_USEC_PER_SEC 1000000L +#define KMP_NSEC_PER_USEC 1000L + +/*! +@ingroup BASIC_TYPES +@{ +*/ + +/*! +Values for bit flags used in the ident_t to describe the fields. +*/ +enum { + /*! Use trampoline for internal microtasks */ + KMP_IDENT_IMB = 0x01, + /*! Use c-style ident structure */ + KMP_IDENT_KMPC = 0x02, + /* 0x04 is no longer used */ + /*! Entry point generated by auto-parallelization */ + KMP_IDENT_AUTOPAR = 0x08, + /*! Compiler generates atomic reduction option for kmpc_reduce* */ + KMP_IDENT_ATOMIC_REDUCE = 0x10, + /*! To mark a 'barrier' directive in user code */ + KMP_IDENT_BARRIER_EXPL = 0x20, + /*! 
To Mark implicit barriers. */ + KMP_IDENT_BARRIER_IMPL = 0x0040, + KMP_IDENT_BARRIER_IMPL_MASK = 0x01C0, + KMP_IDENT_BARRIER_IMPL_FOR = 0x0040, + KMP_IDENT_BARRIER_IMPL_SECTIONS = 0x00C0, + + KMP_IDENT_BARRIER_IMPL_SINGLE = 0x0140, + KMP_IDENT_BARRIER_IMPL_WORKSHARE = 0x01C0, + + /*! To mark a static loop in OMPT callbacks */ + KMP_IDENT_WORK_LOOP = 0x200, + /*! To mark a sections directive in OMPT callbacks */ + KMP_IDENT_WORK_SECTIONS = 0x400, + /*! To mark a distribute construct in OMPT callbacks */ + KMP_IDENT_WORK_DISTRIBUTE = 0x800, + /*! Atomic hint; bottom four bits as omp_sync_hint_t. Top four reserved and + not currently used. If one day we need more bits, then we can use + an invalid combination of hints to mean that another, larger field + should be used in a different flag. */ + KMP_IDENT_ATOMIC_HINT_MASK = 0xFF0000, + KMP_IDENT_ATOMIC_HINT_UNCONTENDED = 0x010000, + KMP_IDENT_ATOMIC_HINT_CONTENDED = 0x020000, + KMP_IDENT_ATOMIC_HINT_NONSPECULATIVE = 0x040000, + KMP_IDENT_ATOMIC_HINT_SPECULATIVE = 0x080000, + KMP_IDENT_OPENMP_SPEC_VERSION_MASK = 0xFF000000 +}; + +/*! + * The ident structure that describes a source location. + */ +typedef struct ident { + kmp_int32 reserved_1; /**< might be used in Fortran; see above */ + kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC + identifies this union member */ + kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ +#if USE_ITT_BUILD +/* but currently used for storing region-specific ITT */ +/* contextual information. */ +#endif /* USE_ITT_BUILD */ + kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields + which describe the source file, the function and a pair + of line numbers that delimit the construct. */ + // Returns the OpenMP version in form major*10+minor (e.g., 50 for 5.0) + kmp_int32 get_openmp_version() { + return (((flags & KMP_IDENT_OPENMP_SPEC_VERSION_MASK) >> 24) & 0xFF); + } +} ident_t; +/*! +@} +*/ + +// Some forward declarations. +typedef union kmp_team kmp_team_t; +typedef struct kmp_taskdata kmp_taskdata_t; +typedef union kmp_task_team kmp_task_team_t; +typedef union kmp_team kmp_team_p; +typedef union kmp_info kmp_info_p; +typedef union kmp_root kmp_root_p; + +template class kmp_flag_32; +template class kmp_flag_64; +template class kmp_atomic_flag_64; +class kmp_flag_oncore; + +#ifdef __cplusplus +extern "C" { +#endif + +/* ------------------------------------------------------------------------ */ + +/* Pack two 32-bit signed integers into a 64-bit signed integer */ +/* ToDo: Fix word ordering for big-endian machines. */ +#define KMP_PACK_64(HIGH_32, LOW_32) \ + ((kmp_int64)((((kmp_uint64)(HIGH_32)) << 32) | (kmp_uint64)(LOW_32))) + +// Generic string manipulation macros. Assume that _x is of type char * +#define SKIP_WS(_x) \ + { \ + while (*(_x) == ' ' || *(_x) == '\t') \ + (_x)++; \ + } +#define SKIP_DIGITS(_x) \ + { \ + while (*(_x) >= '0' && *(_x) <= '9') \ + (_x)++; \ + } +#define SKIP_TOKEN(_x) \ + { \ + while ((*(_x) >= '0' && *(_x) <= '9') || (*(_x) >= 'a' && *(_x) <= 'z') || \ + (*(_x) >= 'A' && *(_x) <= 'Z') || *(_x) == '_') \ + (_x)++; \ + } +#define SKIP_TO(_x, _c) \ + { \ + while (*(_x) != '\0' && *(_x) != (_c)) \ + (_x)++; \ + } + +/* ------------------------------------------------------------------------ */ + +#define KMP_MAX(x, y) ((x) > (y) ? (x) : (y)) +#define KMP_MIN(x, y) ((x) < (y) ? 
(x) : (y)) + +/* ------------------------------------------------------------------------ */ +/* Enumeration types */ + +enum kmp_state_timer { + ts_stop, + ts_start, + ts_pause, + + ts_last_state +}; + +enum dynamic_mode { + dynamic_default, +#ifdef USE_LOAD_BALANCE + dynamic_load_balance, +#endif /* USE_LOAD_BALANCE */ + dynamic_random, + dynamic_thread_limit, + dynamic_max +}; + +/* external schedule constants, duplicate enum omp_sched in omp.h in order to + * not include it here */ +#ifndef KMP_SCHED_TYPE_DEFINED +#define KMP_SCHED_TYPE_DEFINED +typedef enum kmp_sched { + kmp_sched_lower = 0, // lower and upper bounds are for routine parameter check + // Note: need to adjust __kmp_sch_map global array in case enum is changed + kmp_sched_static = 1, // mapped to kmp_sch_static_chunked (33) + kmp_sched_dynamic = 2, // mapped to kmp_sch_dynamic_chunked (35) + kmp_sched_guided = 3, // mapped to kmp_sch_guided_chunked (36) + kmp_sched_auto = 4, // mapped to kmp_sch_auto (38) + kmp_sched_upper_std = 5, // upper bound for standard schedules + kmp_sched_lower_ext = 100, // lower bound of Intel extension schedules + kmp_sched_trapezoidal = 101, // mapped to kmp_sch_trapezoidal (39) +#if KMP_STATIC_STEAL_ENABLED + kmp_sched_static_steal = 102, // mapped to kmp_sch_static_steal (44) +#endif + kmp_sched_upper, + kmp_sched_default = kmp_sched_static, // default scheduling + kmp_sched_monotonic = 0x80000000 +} kmp_sched_t; +#endif + +/*! + @ingroup WORK_SHARING + * Describes the loop schedule to be used for a parallel for loop. + */ +enum sched_type : kmp_int32 { + kmp_sch_lower = 32, /**< lower bound for unordered values */ + kmp_sch_static_chunked = 33, + kmp_sch_static = 34, /**< static unspecialized */ + kmp_sch_dynamic_chunked = 35, + kmp_sch_guided_chunked = 36, /**< guided unspecialized */ + kmp_sch_runtime = 37, + kmp_sch_auto = 38, /**< auto */ + kmp_sch_trapezoidal = 39, + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_static_greedy = 40, + kmp_sch_static_balanced = 41, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_guided_iterative_chunked = 42, + kmp_sch_guided_analytical_chunked = 43, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_static_steal = 44, + + /* static with chunk adjustment (e.g., simd) */ + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, /**< guided with chunk adjustment */ + kmp_sch_runtime_simd = 47, /**< runtime with chunk adjustment */ + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_upper, /**< upper bound for unordered values */ + + kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */ + kmp_ord_static_chunked = 65, + kmp_ord_static = 66, /**< ordered static unspecialized */ + kmp_ord_dynamic_chunked = 67, + kmp_ord_guided_chunked = 68, + kmp_ord_runtime = 69, + kmp_ord_auto = 70, /**< ordered auto */ + kmp_ord_trapezoidal = 71, + kmp_ord_upper, /**< upper bound for ordered values */ + + /* Schedules for Distribute construct */ + kmp_distribute_static_chunked = 91, /**< distribute static chunked */ + kmp_distribute_static = 92, /**< distribute static unspecialized */ + + /* For the "nomerge" versions, kmp_dispatch_next*() will always return a + single iteration/chunk, even if the loop is serialized. For the schedule + types listed above, the entire iteration vector is returned if the loop is + serialized. This doesn't work for gcc/gcomp sections. 
*/ + kmp_nm_lower = 160, /**< lower bound for nomerge values */ + + kmp_nm_static_chunked = + (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower), + kmp_nm_static = 162, /**< static unspecialized */ + kmp_nm_dynamic_chunked = 163, + kmp_nm_guided_chunked = 164, /**< guided unspecialized */ + kmp_nm_runtime = 165, + kmp_nm_auto = 166, /**< auto */ + kmp_nm_trapezoidal = 167, + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_nm_static_greedy = 168, + kmp_nm_static_balanced = 169, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_nm_guided_iterative_chunked = 170, + kmp_nm_guided_analytical_chunked = 171, + kmp_nm_static_steal = + 172, /* accessible only through OMP_SCHEDULE environment variable */ + + kmp_nm_ord_static_chunked = 193, + kmp_nm_ord_static = 194, /**< ordered static unspecialized */ + kmp_nm_ord_dynamic_chunked = 195, + kmp_nm_ord_guided_chunked = 196, + kmp_nm_ord_runtime = 197, + kmp_nm_ord_auto = 198, /**< auto */ + kmp_nm_ord_trapezoidal = 199, + kmp_nm_upper, /**< upper bound for nomerge values */ + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. Since + we need to distinguish the three possible cases (no modifier, monotonic + modifier, nonmonotonic modifier), we need separate bits for each modifier. + The absence of monotonic does not imply nonmonotonic, especially since 4.5 + says that the behaviour of the "no modifier" case is implementation defined + in 4.5, but will become "nonmonotonic" in 5.0. + + Since we're passing a full 32 bit value, we can use a couple of high bits + for these flags; out of paranoia we avoid the sign bit. + + These modifiers can be or-ed into non-static schedules by the compiler to + pass the additional information. They will be stripped early in the + processing in __kmp_dispatch_init when setting up schedules, so most of the + code won't ever see schedules with these bits set. 
*/ + kmp_sch_modifier_monotonic = + (1 << 29), /**< Set if the monotonic schedule modifier was present */ + kmp_sch_modifier_nonmonotonic = + (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */ + +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (enum sched_type)( \ + (s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0) +#define SCHEDULE_GET_MODIFIERS(s) \ + ((enum sched_type)( \ + (s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic))) +#define SCHEDULE_SET_MODIFIERS(s, m) \ + (s = (enum sched_type)((kmp_int32)s | (kmp_int32)m)) +#define SCHEDULE_NONMONOTONIC 0 +#define SCHEDULE_MONOTONIC 1 + + kmp_sch_default = kmp_sch_static /**< default scheduling algorithm */ +}; + +// Apply modifiers on internal kind to standard kind +static inline void +__kmp_sched_apply_mods_stdkind(kmp_sched_t *kind, + enum sched_type internal_kind) { + if (SCHEDULE_HAS_MONOTONIC(internal_kind)) { + *kind = (kmp_sched_t)((int)*kind | (int)kmp_sched_monotonic); + } +} + +// Apply modifiers on standard kind to internal kind +static inline void +__kmp_sched_apply_mods_intkind(kmp_sched_t kind, + enum sched_type *internal_kind) { + if ((int)kind & (int)kmp_sched_monotonic) { + *internal_kind = (enum sched_type)((int)*internal_kind | + (int)kmp_sch_modifier_monotonic); + } +} + +// Get standard schedule without modifiers +static inline kmp_sched_t __kmp_sched_without_mods(kmp_sched_t kind) { + return (kmp_sched_t)((int)kind & ~((int)kmp_sched_monotonic)); +} + +/* Type to keep runtime schedule set via OMP_SCHEDULE or omp_set_schedule() */ +typedef union kmp_r_sched { + struct { + enum sched_type r_sched_type; + int chunk; + }; + kmp_int64 sched; +} kmp_r_sched_t; + +extern enum sched_type __kmp_sch_map[]; // map OMP 3.0 schedule types with our +// internal schedule types + +enum library_type { + library_none, + library_serial, + library_turnaround, + library_throughput +}; + +#if KMP_OS_LINUX +enum clock_function_type { + clock_function_gettimeofday, + clock_function_clock_gettime +}; +#endif /* KMP_OS_LINUX */ + +#if KMP_MIC_SUPPORTED +enum mic_type { non_mic, mic1, mic2, mic3, dummy }; +#endif + +/* -- fast reduction stuff ------------------------------------------------ */ + +#undef KMP_FAST_REDUCTION_BARRIER +#define KMP_FAST_REDUCTION_BARRIER 1 + +#undef KMP_FAST_REDUCTION_CORE_DUO +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#define KMP_FAST_REDUCTION_CORE_DUO 1 +#endif + +enum _reduction_method { + reduction_method_not_defined = 0, + critical_reduce_block = (1 << 8), + atomic_reduce_block = (2 << 8), + tree_reduce_block = (3 << 8), + empty_reduce_block = (4 << 8) +}; + +// Description of the packed_reduction_method variable: +// The packed_reduction_method variable consists of two enum types variables +// that are packed together into 0-th byte and 1-st byte: +// 0: (packed_reduction_method & 0x000000FF) is a 'enum barrier_type' value of +// barrier that will be used in fast reduction: bs_plain_barrier or +// bs_reduction_barrier +// 1: (packed_reduction_method & 0x0000FF00) is a reduction method that will +// be used in fast reduction; +// Reduction method is of 'enum _reduction_method' type and it's defined the way +// so that the bits of 0-th byte are empty, so no need to execute a shift +// instruction 
while packing/unpacking + +#if KMP_FAST_REDUCTION_BARRIER +#define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type) \ + ((reduction_method) | (barrier_type)) + +#define UNPACK_REDUCTION_METHOD(packed_reduction_method) \ + ((enum _reduction_method)((packed_reduction_method) & (0x0000FF00))) + +#define UNPACK_REDUCTION_BARRIER(packed_reduction_method) \ + ((enum barrier_type)((packed_reduction_method) & (0x000000FF))) +#else +#define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type) \ + (reduction_method) + +#define UNPACK_REDUCTION_METHOD(packed_reduction_method) \ + (packed_reduction_method) + +#define UNPACK_REDUCTION_BARRIER(packed_reduction_method) (bs_plain_barrier) +#endif + +#define TEST_REDUCTION_METHOD(packed_reduction_method, which_reduction_block) \ + ((UNPACK_REDUCTION_METHOD(packed_reduction_method)) == \ + (which_reduction_block)) + +#if KMP_FAST_REDUCTION_BARRIER +#define TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER \ + (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_reduction_barrier)) + +#define TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER \ + (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_plain_barrier)) +#endif + +typedef int PACKED_REDUCTION_METHOD_T; + +/* -- end of fast reduction stuff ----------------------------------------- */ + +#if KMP_OS_WINDOWS +#define USE_CBLKDATA +#if KMP_MSVC_COMPAT +#pragma warning(push) +#pragma warning(disable : 271 310) +#endif +#include +#if KMP_MSVC_COMPAT +#pragma warning(pop) +#endif +#endif + +#if KMP_OS_UNIX +#if !KMP_OS_WASI +#include +#endif +#include +#endif + +enum kmp_hw_t : int { + KMP_HW_UNKNOWN = -1, + KMP_HW_SOCKET = 0, + KMP_HW_PROC_GROUP, + KMP_HW_NUMA, + KMP_HW_DIE, + KMP_HW_LLC, + KMP_HW_L3, + KMP_HW_TILE, + KMP_HW_MODULE, + KMP_HW_L2, + KMP_HW_L1, + KMP_HW_CORE, + KMP_HW_THREAD, + KMP_HW_LAST +}; + +typedef enum kmp_hw_core_type_t { + KMP_HW_CORE_TYPE_UNKNOWN = 0x0, +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + KMP_HW_CORE_TYPE_ATOM = 0x20, + KMP_HW_CORE_TYPE_CORE = 0x40, + KMP_HW_MAX_NUM_CORE_TYPES = 3, +#else + KMP_HW_MAX_NUM_CORE_TYPES = 1, +#endif +} kmp_hw_core_type_t; + +#define KMP_HW_MAX_NUM_CORE_EFFS 8 + +#define KMP_DEBUG_ASSERT_VALID_HW_TYPE(type) \ + KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) +#define KMP_ASSERT_VALID_HW_TYPE(type) \ + KMP_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST) + +#define KMP_FOREACH_HW_TYPE(type) \ + for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST; \ + type = (kmp_hw_t)((int)type + 1)) + +const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural = false); +const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false); +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type); + +/* Only Linux* OS and Windows* OS support thread affinity. */ +#if KMP_AFFINITY_SUPPORTED + +// GROUP_AFFINITY is already defined for _MSC_VER>=1600 (VS2010 and later). 
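// Worked example of the packing scheme described above, using only the enums
// and macros defined earlier in this header: with tree_reduce_block = (3 << 8)
// = 0x300,
//   PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_reduction_barrier)
//     == 0x300 | bs_reduction_barrier   // method in byte 1, barrier in byte 0
//   UNPACK_REDUCTION_METHOD(p)  == p & 0x0000FF00   // -> tree_reduce_block
//   UNPACK_REDUCTION_BARRIER(p) == p & 0x000000FF   // -> bs_reduction_barrier
// Because the low byte of every _reduction_method value is zero, the two
// fields can be packed and unpacked without any shift instructions.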
+#if KMP_OS_WINDOWS +#if _MSC_VER < 1600 && KMP_MSVC_COMPAT +typedef struct GROUP_AFFINITY { + KAFFINITY Mask; + WORD Group; + WORD Reserved[3]; +} GROUP_AFFINITY; +#endif /* _MSC_VER < 1600 */ +#if KMP_GROUP_AFFINITY +extern int __kmp_num_proc_groups; +#else +static const int __kmp_num_proc_groups = 1; +#endif /* KMP_GROUP_AFFINITY */ +typedef DWORD (*kmp_GetActiveProcessorCount_t)(WORD); +extern kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount; + +typedef WORD (*kmp_GetActiveProcessorGroupCount_t)(void); +extern kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount; + +typedef BOOL (*kmp_GetThreadGroupAffinity_t)(HANDLE, GROUP_AFFINITY *); +extern kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity; + +typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *, + GROUP_AFFINITY *); +extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity; +#endif /* KMP_OS_WINDOWS */ + +#if KMP_USE_HWLOC +extern hwloc_topology_t __kmp_hwloc_topology; +extern int __kmp_hwloc_error; +#endif + +extern size_t __kmp_affin_mask_size; +#define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0) +#define KMP_AFFINITY_DISABLE() (__kmp_affin_mask_size = 0) +#define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size) +#define KMP_CPU_SET_ITERATE(i, mask) \ + for (i = (mask)->begin(); (int)i != (mask)->end(); i = (mask)->next(i)) +#define KMP_CPU_SET(i, mask) (mask)->set(i) +#define KMP_CPU_ISSET(i, mask) (mask)->is_set(i) +#define KMP_CPU_CLR(i, mask) (mask)->clear(i) +#define KMP_CPU_ZERO(mask) (mask)->zero() +#define KMP_CPU_ISEMPTY(mask) (mask)->empty() +#define KMP_CPU_COPY(dest, src) (dest)->copy(src) +#define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src) +#define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not() +#define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src) +#define KMP_CPU_EQUAL(dest, src) (dest)->is_equal(src) +#define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask()) +#define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr) +#define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr) +#define KMP_CPU_FREE_FROM_STACK(ptr) KMP_CPU_FREE(ptr) +#define KMP_CPU_INTERNAL_ALLOC(ptr) KMP_CPU_ALLOC(ptr) +#define KMP_CPU_INTERNAL_FREE(ptr) KMP_CPU_FREE(ptr) +#define KMP_CPU_INDEX(arr, i) __kmp_affinity_dispatch->index_mask_array(arr, i) +#define KMP_CPU_ALLOC_ARRAY(arr, n) \ + (arr = __kmp_affinity_dispatch->allocate_mask_array(n)) +#define KMP_CPU_FREE_ARRAY(arr, n) \ + __kmp_affinity_dispatch->deallocate_mask_array(arr) +#define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) KMP_CPU_ALLOC_ARRAY(arr, n) +#define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) KMP_CPU_FREE_ARRAY(arr, n) +#define __kmp_get_system_affinity(mask, abort_bool) \ + (mask)->get_system_affinity(abort_bool) +#define __kmp_set_system_affinity(mask, abort_bool) \ + (mask)->set_system_affinity(abort_bool) +#define __kmp_get_proc_group(mask) (mask)->get_proc_group() + +class KMPAffinity { +public: + class Mask { + public: + void *operator new(size_t n); + void operator delete(void *p); + void *operator new[](size_t n); + void operator delete[](void *p); + virtual ~Mask() {} + // Set bit i to 1 + virtual void set(int i) {} + // Return bit i + virtual bool is_set(int i) const { return false; } + // Set bit i to 0 + virtual void clear(int i) {} + // Zero out entire mask + virtual void zero() {} + // Check whether mask is empty + virtual bool empty() const { return true; } + // Copy src into this mask + virtual void copy(const Mask *src) {} + // 
this &= rhs + virtual void bitwise_and(const Mask *rhs) {} + // this |= rhs + virtual void bitwise_or(const Mask *rhs) {} + // this = ~this + virtual void bitwise_not() {} + // this == rhs + virtual bool is_equal(const Mask *rhs) const { return false; } + // API for iterating over an affinity mask + // for (int i = mask->begin(); i != mask->end(); i = mask->next(i)) + virtual int begin() const { return 0; } + virtual int end() const { return 0; } + virtual int next(int previous) const { return 0; } +#if KMP_OS_WINDOWS + virtual int set_process_affinity(bool abort_on_error) const { return -1; } +#endif + // Set the system's affinity to this affinity mask's value + virtual int set_system_affinity(bool abort_on_error) const { return -1; } + // Set this affinity mask to the current system affinity + virtual int get_system_affinity(bool abort_on_error) { return -1; } + // Only 1 DWORD in the mask should have any procs set. + // Return the appropriate index, or -1 for an invalid mask. + virtual int get_proc_group() const { return -1; } + int get_max_cpu() const { + int cpu; + int max_cpu = -1; + KMP_CPU_SET_ITERATE(cpu, this) { + if (cpu > max_cpu) + max_cpu = cpu; + } + return max_cpu; + } + }; + void *operator new(size_t n); + void operator delete(void *p); + // Need virtual destructor + virtual ~KMPAffinity() = default; + // Determine if affinity is capable + virtual void determine_capable(const char *env_var) {} + // Bind the current thread to os proc + virtual void bind_thread(int proc) {} + // Factory functions to allocate/deallocate a mask + virtual Mask *allocate_mask() { return nullptr; } + virtual void deallocate_mask(Mask *m) {} + virtual Mask *allocate_mask_array(int num) { return nullptr; } + virtual void deallocate_mask_array(Mask *m) {} + virtual Mask *index_mask_array(Mask *m, int index) { return nullptr; } + static void pick_api(); + static void destroy_api(); + enum api_type { + NATIVE_OS +#if KMP_USE_HWLOC + , + HWLOC +#endif + }; + virtual api_type get_api_type() const { + KMP_ASSERT(0); + return NATIVE_OS; + } + +private: + static bool picked_api; +}; + +typedef KMPAffinity::Mask kmp_affin_mask_t; +extern KMPAffinity *__kmp_affinity_dispatch; + +class kmp_affinity_raii_t { + kmp_affin_mask_t *mask; + bool restored; + +public: + kmp_affinity_raii_t(const kmp_affin_mask_t *new_mask = nullptr) + : restored(false) { + if (KMP_AFFINITY_CAPABLE()) { + KMP_CPU_ALLOC(mask); + KMP_ASSERT(mask != NULL); + __kmp_get_system_affinity(mask, /*abort_on_error=*/true); + if (new_mask) + __kmp_set_system_affinity(new_mask, /*abort_on_error=*/true); + } + } + void restore() { + if (!restored && KMP_AFFINITY_CAPABLE()) { + __kmp_set_system_affinity(mask, /*abort_on_error=*/true); + KMP_CPU_FREE(mask); + } + restored = true; + } + ~kmp_affinity_raii_t() { restore(); } +}; + +// Declare local char buffers with this size for printing debug and info +// messages, using __kmp_affinity_print_mask(). 
+#define KMP_AFFIN_MASK_PRINT_LEN 1024 + +enum affinity_type { + affinity_none = 0, + affinity_physical, + affinity_logical, + affinity_compact, + affinity_scatter, + affinity_explicit, + affinity_balanced, + affinity_disabled, // not used outsize the env var parser + affinity_default +}; + +enum affinity_top_method { + affinity_top_method_all = 0, // try all (supported) methods, in order +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + affinity_top_method_apicid, + affinity_top_method_x2apicid, + affinity_top_method_x2apicid_1f, +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too +#if KMP_GROUP_AFFINITY + affinity_top_method_group, +#endif /* KMP_GROUP_AFFINITY */ + affinity_top_method_flat, +#if KMP_USE_HWLOC + affinity_top_method_hwloc, +#endif + affinity_top_method_default +}; + +#define affinity_respect_mask_default (2) + +typedef struct kmp_affinity_flags_t { + unsigned dups : 1; + unsigned verbose : 1; + unsigned warnings : 1; + unsigned respect : 2; + unsigned reset : 1; + unsigned initialized : 1; + unsigned core_types_gran : 1; + unsigned core_effs_gran : 1; + unsigned omp_places : 1; + unsigned reserved : 22; +} kmp_affinity_flags_t; +KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4); + +typedef struct kmp_affinity_ids_t { + int os_id; + int ids[KMP_HW_LAST]; +} kmp_affinity_ids_t; + +typedef struct kmp_affinity_attrs_t { + int core_type : 8; + int core_eff : 8; + unsigned valid : 1; + unsigned reserved : 15; +} kmp_affinity_attrs_t; +#define KMP_AFFINITY_ATTRS_UNKNOWN \ + { KMP_HW_CORE_TYPE_UNKNOWN, kmp_hw_attr_t::UNKNOWN_CORE_EFF, 0, 0 } + +typedef struct kmp_affinity_t { + char *proclist; + enum affinity_type type; + kmp_hw_t gran; + int gran_levels; + kmp_affinity_attrs_t core_attr_gran; + int compact; + int offset; + kmp_affinity_flags_t flags; + unsigned num_masks; + kmp_affin_mask_t *masks; + kmp_affinity_ids_t *ids; + kmp_affinity_attrs_t *attrs; + unsigned num_os_id_masks; + kmp_affin_mask_t *os_id_masks; + const char *env_var; +} kmp_affinity_t; + +#define KMP_AFFINITY_INIT(env) \ + { \ + nullptr, affinity_default, KMP_HW_UNKNOWN, -1, KMP_AFFINITY_ATTRS_UNKNOWN, \ + 0, 0, \ + {TRUE, FALSE, TRUE, affinity_respect_mask_default, FALSE, FALSE, \ + FALSE, FALSE, FALSE}, \ + 0, nullptr, nullptr, nullptr, 0, nullptr, env \ + } + +extern enum affinity_top_method __kmp_affinity_top_method; +extern kmp_affinity_t __kmp_affinity; +extern kmp_affinity_t __kmp_hh_affinity; +extern kmp_affinity_t *__kmp_affinities[2]; + +extern void __kmp_affinity_bind_thread(int which); + +extern kmp_affin_mask_t *__kmp_affin_fullMask; +extern kmp_affin_mask_t *__kmp_affin_origMask; +extern char *__kmp_cpuinfo_file; + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +extern int __kmp_first_osid_with_ecore; +#endif + +#endif /* KMP_AFFINITY_SUPPORTED */ + +// This needs to be kept in sync with the values in omp.h !!! 
+typedef enum kmp_proc_bind_t { + proc_bind_false = 0, + proc_bind_true, + proc_bind_primary, + proc_bind_close, + proc_bind_spread, + proc_bind_intel, // use KMP_AFFINITY interface + proc_bind_default +} kmp_proc_bind_t; + +typedef struct kmp_nested_proc_bind_t { + kmp_proc_bind_t *bind_types; + int size; + int used; +} kmp_nested_proc_bind_t; + +extern kmp_nested_proc_bind_t __kmp_nested_proc_bind; +extern kmp_proc_bind_t __kmp_teams_proc_bind; + +extern int __kmp_display_affinity; +extern char *__kmp_affinity_format; +static const size_t KMP_AFFINITY_FORMAT_SIZE = 512; +#if OMPT_SUPPORT +extern int __kmp_tool; +extern char *__kmp_tool_libraries; +#endif // OMPT_SUPPORT + +#if KMP_AFFINITY_SUPPORTED +#define KMP_PLACE_ALL (-1) +#define KMP_PLACE_UNDEFINED (-2) +// Is KMP_AFFINITY is being used instead of OMP_PROC_BIND/OMP_PLACES? +#define KMP_AFFINITY_NON_PROC_BIND \ + ((__kmp_nested_proc_bind.bind_types[0] == proc_bind_false || \ + __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) && \ + (__kmp_affinity.num_masks > 0 || __kmp_affinity.type == affinity_balanced)) +#endif /* KMP_AFFINITY_SUPPORTED */ + +extern int __kmp_affinity_num_places; + +typedef enum kmp_cancel_kind_t { + cancel_noreq = 0, + cancel_parallel = 1, + cancel_loop = 2, + cancel_sections = 3, + cancel_taskgroup = 4 +} kmp_cancel_kind_t; + +// KMP_HW_SUBSET support: +typedef struct kmp_hws_item { + int num; + int offset; +} kmp_hws_item_t; + +extern kmp_hws_item_t __kmp_hws_socket; +extern kmp_hws_item_t __kmp_hws_die; +extern kmp_hws_item_t __kmp_hws_node; +extern kmp_hws_item_t __kmp_hws_tile; +extern kmp_hws_item_t __kmp_hws_core; +extern kmp_hws_item_t __kmp_hws_proc; +extern int __kmp_hws_requested; +extern int __kmp_hws_abs_flag; // absolute or per-item number requested + +/* ------------------------------------------------------------------------ */ + +#define KMP_PAD(type, sz) \ + (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1)) + +// We need to avoid using -1 as a GTID as +1 is added to the gtid +// when storing it in a lock, and the value 0 is reserved. 
+#define KMP_GTID_DNE (-2) /* Does not exist */ +#define KMP_GTID_SHUTDOWN (-3) /* Library is shutting down */ +#define KMP_GTID_MONITOR (-4) /* Monitor thread ID */ +#define KMP_GTID_UNKNOWN (-5) /* Is not known */ +#define KMP_GTID_MIN (-6) /* Minimal gtid for low bound check in DEBUG */ + +/* OpenMP 5.0 Memory Management support */ + +#ifndef __OMP_H +// Duplicate type definitions from omp.h +typedef uintptr_t omp_uintptr_t; + +typedef enum { + omp_atk_sync_hint = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 +} omp_alloctrait_key_t; + +typedef enum { + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_serialized = 5, + omp_atv_sequential = omp_atv_serialized, // (deprecated) + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 +} omp_alloctrait_value_t; +#define omp_atv_default ((omp_uintptr_t)-1) + +typedef void *omp_memspace_handle_t; +extern omp_memspace_handle_t const omp_default_mem_space; +extern omp_memspace_handle_t const omp_large_cap_mem_space; +extern omp_memspace_handle_t const omp_const_mem_space; +extern omp_memspace_handle_t const omp_high_bw_mem_space; +extern omp_memspace_handle_t const omp_low_lat_mem_space; +extern omp_memspace_handle_t const llvm_omp_target_host_mem_space; +extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space; +extern omp_memspace_handle_t const llvm_omp_target_device_mem_space; + +typedef struct { + omp_alloctrait_key_t key; + omp_uintptr_t value; +} omp_alloctrait_t; + +typedef void *omp_allocator_handle_t; +extern omp_allocator_handle_t const omp_null_allocator; +extern omp_allocator_handle_t const omp_default_mem_alloc; +extern omp_allocator_handle_t const omp_large_cap_mem_alloc; +extern omp_allocator_handle_t const omp_const_mem_alloc; +extern omp_allocator_handle_t const omp_high_bw_mem_alloc; +extern omp_allocator_handle_t const omp_low_lat_mem_alloc; +extern omp_allocator_handle_t const omp_cgroup_mem_alloc; +extern omp_allocator_handle_t const omp_pteam_mem_alloc; +extern omp_allocator_handle_t const omp_thread_mem_alloc; +extern omp_allocator_handle_t const llvm_omp_target_host_mem_alloc; +extern omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc; +extern omp_allocator_handle_t const llvm_omp_target_device_mem_alloc; +extern omp_allocator_handle_t const kmp_max_mem_alloc; +extern omp_allocator_handle_t __kmp_def_allocator; + +// end of duplicate type definitions from omp.h +#endif + +extern int __kmp_memkind_available; + +typedef omp_memspace_handle_t kmp_memspace_t; // placeholder + +typedef struct kmp_allocator_t { + omp_memspace_handle_t memspace; + void **memkind; // pointer to memkind + size_t alignment; + omp_alloctrait_value_t fb; + kmp_allocator_t *fb_data; + kmp_uint64 pool_size; + kmp_uint64 pool_used; + bool pinned; +} kmp_allocator_t; + +extern omp_allocator_handle_t __kmpc_init_allocator(int gtid, + omp_memspace_handle_t, + int ntraits, + omp_alloctrait_t traits[]); +extern void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t al); +extern void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t al); +extern omp_allocator_handle_t 
__kmpc_get_default_allocator(int gtid); +// external interfaces, may be used by compiler +extern void *__kmpc_alloc(int gtid, size_t sz, omp_allocator_handle_t al); +extern void *__kmpc_aligned_alloc(int gtid, size_t align, size_t sz, + omp_allocator_handle_t al); +extern void *__kmpc_calloc(int gtid, size_t nmemb, size_t sz, + omp_allocator_handle_t al); +extern void *__kmpc_realloc(int gtid, void *ptr, size_t sz, + omp_allocator_handle_t al, + omp_allocator_handle_t free_al); +extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al); +// internal interfaces, contain real implementation +extern void *__kmp_alloc(int gtid, size_t align, size_t sz, + omp_allocator_handle_t al); +extern void *__kmp_calloc(int gtid, size_t align, size_t nmemb, size_t sz, + omp_allocator_handle_t al); +extern void *__kmp_realloc(int gtid, void *ptr, size_t sz, + omp_allocator_handle_t al, + omp_allocator_handle_t free_al); +extern void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al); + +extern void __kmp_init_memkind(); +extern void __kmp_fini_memkind(); +extern void __kmp_init_target_mem(); + +/* ------------------------------------------------------------------------ */ + +#if ENABLE_LIBOMPTARGET +extern void __kmp_init_target_task(); +#endif + +/* ------------------------------------------------------------------------ */ + +#define KMP_UINT64_MAX \ + (~((kmp_uint64)1 << ((sizeof(kmp_uint64) * (1 << 3)) - 1))) + +#define KMP_MIN_NTH 1 + +#ifndef KMP_MAX_NTH +#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX +#define KMP_MAX_NTH PTHREAD_THREADS_MAX +#else +#ifdef __ve__ +// VE's pthread supports only up to 64 threads per a VE process. +// Please check p. 14 of following documentation for more details. +// https://sxauroratsubasa.sakura.ne.jp/documents/veos/en/VEOS_high_level_design.pdf +#define KMP_MAX_NTH 64 +#else +#define KMP_MAX_NTH INT_MAX +#endif +#endif +#endif /* KMP_MAX_NTH */ + +#ifdef PTHREAD_STACK_MIN +#define KMP_MIN_STKSIZE ((size_t)PTHREAD_STACK_MIN) +#else +#define KMP_MIN_STKSIZE ((size_t)(32 * 1024)) +#endif + +#define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1))) + +#if KMP_ARCH_X86 +#define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024)) +#elif KMP_ARCH_X86_64 +#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024)) +#define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024)) +#elif KMP_ARCH_VE +// Minimum stack size for pthread for VE is 4MB. +// https://www.hpc.nec/documents/veos/en/glibc/Difference_Points_glibc.htm +#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024)) +#elif KMP_OS_AIX +// The default stack size for worker threads on AIX is 4MB. 
+#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024)) +#else +#define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024)) +#endif + +#define KMP_DEFAULT_MALLOC_POOL_INCR ((size_t)(1024 * 1024)) +#define KMP_MIN_MALLOC_POOL_INCR ((size_t)(4 * 1024)) +#define KMP_MAX_MALLOC_POOL_INCR \ + (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1))) + +#define KMP_MIN_STKOFFSET (0) +#define KMP_MAX_STKOFFSET KMP_MAX_STKSIZE +#if KMP_OS_DARWIN +#define KMP_DEFAULT_STKOFFSET KMP_MIN_STKOFFSET +#else +#define KMP_DEFAULT_STKOFFSET CACHE_LINE +#endif + +#define KMP_MIN_STKPADDING (0) +#define KMP_MAX_STKPADDING (2 * 1024 * 1024) + +#define KMP_BLOCKTIME_MULTIPLIER \ + (1000000) /* number of blocktime units per second */ +#define KMP_MIN_BLOCKTIME (0) +#define KMP_MAX_BLOCKTIME \ + (INT_MAX) /* Must be this for "infinite" setting the work */ + +/* __kmp_blocktime is in microseconds */ +#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200000)) + +#if KMP_USE_MONITOR +#define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024)) +#define KMP_MIN_MONITOR_WAKEUPS (1) // min times monitor wakes up per second +#define KMP_MAX_MONITOR_WAKEUPS (1000) // max times monitor can wake up per sec + +/* Calculate new number of monitor wakeups for a specific block time based on + previous monitor_wakeups. Only allow increasing number of wakeups */ +#define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups) \ + (((blocktime) == KMP_MAX_BLOCKTIME) ? (monitor_wakeups) \ + : ((blocktime) == KMP_MIN_BLOCKTIME) ? KMP_MAX_MONITOR_WAKEUPS \ + : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime))) \ + ? (monitor_wakeups) \ + : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime)) + +/* Calculate number of intervals for a specific block time based on + monitor_wakeups */ +#define KMP_INTERVALS_FROM_BLOCKTIME(blocktime, monitor_wakeups) \ + (((blocktime) + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) - 1) / \ + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups))) +#else +#define KMP_BLOCKTIME(team, tid) \ + (get__bt_set(team, tid) ? get__blocktime(team, tid) : __kmp_dflt_blocktime) +#if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) +// HW TSC is used to reduce overhead (clock tick instead of nanosecond). +extern kmp_uint64 __kmp_ticks_per_msec; +extern kmp_uint64 __kmp_ticks_per_usec; +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX +#define KMP_NOW() ((kmp_uint64)_rdtsc()) +#else +#define KMP_NOW() __kmp_hardware_timestamp() +#endif +#define KMP_BLOCKTIME_INTERVAL(team, tid) \ + ((kmp_uint64)KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_usec) +#define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW()) +#else +// System time is retrieved sporadically while blocking. 
+extern kmp_uint64 __kmp_now_nsec(); +#define KMP_NOW() __kmp_now_nsec() +#define KMP_BLOCKTIME_INTERVAL(team, tid) \ + ((kmp_uint64)KMP_BLOCKTIME(team, tid) * (kmp_uint64)KMP_NSEC_PER_USEC) +#define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW()) +#endif +#endif // KMP_USE_MONITOR + +#define KMP_MIN_STATSCOLS 40 +#define KMP_MAX_STATSCOLS 4096 +#define KMP_DEFAULT_STATSCOLS 80 + +#define KMP_MIN_INTERVAL 0 +#define KMP_MAX_INTERVAL (INT_MAX - 1) +#define KMP_DEFAULT_INTERVAL 0 + +#define KMP_MIN_CHUNK 1 +#define KMP_MAX_CHUNK (INT_MAX - 1) +#define KMP_DEFAULT_CHUNK 1 + +#define KMP_MIN_DISP_NUM_BUFF 1 +#define KMP_DFLT_DISP_NUM_BUFF 7 +#define KMP_MAX_DISP_NUM_BUFF 4096 + +#define KMP_MAX_ORDERED 8 + +#define KMP_MAX_FIELDS 32 + +#define KMP_MAX_BRANCH_BITS 31 + +#define KMP_MAX_ACTIVE_LEVELS_LIMIT INT_MAX + +#define KMP_MAX_DEFAULT_DEVICE_LIMIT INT_MAX + +#define KMP_MAX_TASK_PRIORITY_LIMIT INT_MAX + +/* Minimum number of threads before switch to TLS gtid (experimentally + determined) */ +/* josh TODO: what about OS X* tuning? */ +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#define KMP_TLS_GTID_MIN 5 +#else +#define KMP_TLS_GTID_MIN INT_MAX +#endif + +#define KMP_MASTER_TID(tid) (0 == (tid)) +#define KMP_WORKER_TID(tid) (0 != (tid)) + +#define KMP_MASTER_GTID(gtid) (0 == __kmp_tid_from_gtid((gtid))) +#define KMP_WORKER_GTID(gtid) (0 != __kmp_tid_from_gtid((gtid))) +#define KMP_INITIAL_GTID(gtid) (0 == (gtid)) + +#ifndef TRUE +#define FALSE 0 +#define TRUE (!FALSE) +#endif + +/* NOTE: all of the following constants must be even */ + +#if KMP_OS_WINDOWS +#define KMP_INIT_WAIT 64U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 32U /* susequent number of spin-tests */ +#elif KMP_OS_LINUX +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_DARWIN +/* TODO: tune for KMP_OS_DARWIN */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_DRAGONFLY +/* TODO: tune for KMP_OS_DRAGONFLY */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_FREEBSD +/* TODO: tune for KMP_OS_FREEBSD */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_NETBSD +/* TODO: tune for KMP_OS_NETBSD */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_OPENBSD +/* TODO: tune for KMP_OS_OPENBSD */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_HURD +/* TODO: tune for KMP_OS_HURD */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_SOLARIS +/* TODO: tune for KMP_OS_SOLARIS */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_WASI +/* TODO: tune for KMP_OS_WASI */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_AIX +/* TODO: tune for KMP_OS_AIX */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#endif + 
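+// [Editor's illustrative note, not part of the upstream libomp sources] A
+// minimal sketch of how the blocktime macros above combine in the
+// !KMP_USE_MONITOR configuration; `team`, `tid`, `count` and `done()` are
+// hypothetical placeholders, and the real wait loops live in the libomp
+// wait/release code:
+//
+//   kmp_uint64 goal = KMP_NOW() + KMP_BLOCKTIME_INTERVAL(team, tid);
+//   kmp_uint32 count = 0;
+//   while (!done()) {
+//     if (!KMP_BLOCKING(goal, ++count))
+//       break; // blocktime budget exhausted: stop spinning, suspend thread
+//     // otherwise keep spinning; KMP_INIT_WAIT / KMP_NEXT_WAIT above are the
+//     // per-platform initial and subsequent spin-test counts used by the
+//     // runtime's wait loops
+//   }
+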
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+typedef struct kmp_cpuid {
+  kmp_uint32 eax;
+  kmp_uint32 ebx;
+  kmp_uint32 ecx;
+  kmp_uint32 edx;
+} kmp_cpuid_t;
+
+typedef struct kmp_cpuinfo_flags_t {
+  unsigned sse2 : 1; // 0 if SSE2 instructions are not supported, 1 otherwise.
+  unsigned rtm : 1; // 0 if RTM instructions are not supported, 1 otherwise.
+  unsigned hybrid : 1;
+  unsigned reserved : 29; // Ensure size of 32 bits
+} kmp_cpuinfo_flags_t;
+
+typedef struct kmp_cpuinfo {
+  int initialized; // If 0, other fields are not initialized.
+  int signature; // CPUID(1).EAX
+  int family; // CPUID(1).EAX[27:20]+CPUID(1).EAX[11:8] (Extended Family+Family)
+  int model; // ( CPUID(1).EAX[19:16] << 4 ) + CPUID(1).EAX[7:4] ( ( Extended
+  // Model << 4 ) + Model)
+  int stepping; // CPUID(1).EAX[3:0] ( Stepping )
+  kmp_cpuinfo_flags_t flags;
+  int apic_id;
+  int physical_id;
+  int logical_id;
+  kmp_uint64 frequency; // Nominal CPU frequency in Hz.
+  char name[3 * sizeof(kmp_cpuid_t)]; // CPUID(0x80000002,0x80000003,0x80000004)
+} kmp_cpuinfo_t;
+
+extern void __kmp_query_cpuid(kmp_cpuinfo_t *p);
+
+#if KMP_OS_UNIX
+// subleaf is only needed for cache and topology discovery and can be set to
+// zero in most cases
+static inline void __kmp_x86_cpuid(int leaf, int subleaf, struct kmp_cpuid *p) {
+  __asm__ __volatile__("cpuid"
+                       : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
+                       : "a"(leaf), "c"(subleaf));
+}
+// Load p into FPU control word
+static inline void __kmp_load_x87_fpu_control_word(const kmp_int16 *p) {
+  __asm__ __volatile__("fldcw %0" : : "m"(*p));
+}
+// Store FPU control word into p
+static inline void __kmp_store_x87_fpu_control_word(kmp_int16 *p) {
+  __asm__ __volatile__("fstcw %0" : "=m"(*p));
+}
+static inline void __kmp_clear_x87_fpu_status_word() {
+#if KMP_MIC
+  // 32-bit protected mode x87 FPU state
+  struct x87_fpu_state {
+    unsigned cw;
+    unsigned sw;
+    unsigned tw;
+    unsigned fip;
+    unsigned fips;
+    unsigned fdp;
+    unsigned fds;
+  };
+  struct x87_fpu_state fpu_state = {0, 0, 0, 0, 0, 0, 0};
+  __asm__ __volatile__("fstenv %0\n\t" // store FP env
+                       "andw $0x7f00, %1\n\t" // clear 0-7,15 bits of FP SW
+                       "fldenv %0\n\t" // load FP env back
+                       : "+m"(fpu_state), "+m"(fpu_state.sw));
+#else
+  __asm__ __volatile__("fnclex");
+#endif // KMP_MIC
+}
+#if __SSE__
+static inline void __kmp_load_mxcsr(const kmp_uint32 *p) { _mm_setcsr(*p); }
+static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
+#else
+static inline void __kmp_load_mxcsr(const kmp_uint32 *p) {}
+static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = 0; }
+#endif
+#else
+// Windows still has these as external functions in assembly file
+extern void __kmp_x86_cpuid(int mode, int mode2, struct kmp_cpuid *p);
+extern void __kmp_load_x87_fpu_control_word(const kmp_int16 *p);
+extern void __kmp_store_x87_fpu_control_word(kmp_int16 *p);
+extern void __kmp_clear_x87_fpu_status_word();
+static inline void __kmp_load_mxcsr(const kmp_uint32 *p) { _mm_setcsr(*p); }
+static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
+#endif // KMP_OS_UNIX
+
+#define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */
+
+// User-level Monitor/Mwait
+#if KMP_HAVE_UMWAIT
+// We always try for UMWAIT first
+#if KMP_HAVE_WAITPKG_INTRINSICS
+#if KMP_HAVE_IMMINTRIN_H
+#include <immintrin.h>
+#elif KMP_HAVE_INTRIN_H
+#include <intrin.h>
+#endif
+#endif // KMP_HAVE_WAITPKG_INTRINSICS
+
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+ 
uint32_t timeHi = uint32_t(counter >> 32);
+  uint32_t timeLo = uint32_t(counter & 0xffffffff);
+  char flag;
+  __asm__ volatile("#tpause\n.byte 0x66, 0x0F, 0xAE, 0xF1\n"
+                   "setb %0"
+                   // The "=q" constraint means any register accessible as rl
+                   // in 32-bit mode: a, b, c, and d;
+                   // in 64-bit mode: any integer register
+                   : "=q"(flag)
+                   : "a"(timeLo), "d"(timeHi), "c"(hint)
+                   :);
+  return flag;
+#else
+  return _tpause(hint, counter);
+#endif
+}
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline void __kmp_umonitor(void *cacheline) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+  __asm__ volatile("# umonitor\n.byte 0xF3, 0x0F, 0xAE, 0x01 "
+                   :
+                   : "a"(cacheline)
+                   :);
+#else
+  _umonitor(cacheline);
+#endif
+}
+KMP_ATTRIBUTE_TARGET_WAITPKG
+static inline int __kmp_umwait(uint32_t hint, uint64_t counter) {
+#if !KMP_HAVE_WAITPKG_INTRINSICS
+  uint32_t timeHi = uint32_t(counter >> 32);
+  uint32_t timeLo = uint32_t(counter & 0xffffffff);
+  char flag;
+  __asm__ volatile("#umwait\n.byte 0xF2, 0x0F, 0xAE, 0xF1\n"
+                   "setb %0"
+                   // The "=q" constraint means any register accessible as rl
+                   // in 32-bit mode: a, b, c, and d;
+                   // in 64-bit mode: any integer register
+                   : "=q"(flag)
+                   : "a"(timeLo), "d"(timeHi), "c"(hint)
+                   :);
+  return flag;
+#else
+  return _umwait(hint, counter);
+#endif
+}
+#elif KMP_HAVE_MWAIT
+#if KMP_OS_UNIX
+#include <pmmintrin.h>
+#else
+#include <intrin.h>
+#endif
+#if KMP_OS_UNIX
+__attribute__((target("sse3")))
+#endif
+static inline void
+__kmp_mm_monitor(void *cacheline, unsigned extensions, unsigned hints) {
+  _mm_monitor(cacheline, extensions, hints);
+}
+#if KMP_OS_UNIX
+__attribute__((target("sse3")))
+#endif
+static inline void
+__kmp_mm_mwait(unsigned extensions, unsigned hints) {
+  _mm_mwait(extensions, hints);
+}
+#endif // KMP_HAVE_UMWAIT
+
+#if KMP_ARCH_X86
+extern void __kmp_x86_pause(void);
+#elif KMP_MIC
+// Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
+// regression after removal of extra PAUSE from spin loops. Changing
+// the delay from 100 to 300 showed even better performance than double PAUSE
+// on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
+static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
+#else
+static inline void __kmp_x86_pause(void) { _mm_pause(); }
+#endif
+#define KMP_CPU_PAUSE() __kmp_x86_pause()
+#elif KMP_ARCH_PPC64
+#define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1")
+#define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2")
+#define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory")
+#define KMP_CPU_PAUSE() \
+  do { \
+    KMP_PPC64_PRI_LOW(); \
+    KMP_PPC64_PRI_MED(); \
+    KMP_PPC64_PRI_LOC_MB(); \
+  } while (0)
+#else
+#define KMP_CPU_PAUSE() /* nothing to do */
+#endif
+
+#define KMP_INIT_YIELD(count) \
+  { (count) = __kmp_yield_init; }
+
+#define KMP_INIT_BACKOFF(time) \
+  { (time) = __kmp_pause_init; }
+
+#define KMP_OVERSUBSCRIBED \
+  (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
+
+#define KMP_TRY_YIELD \
+  ((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
+
+#define KMP_TRY_YIELD_OVERSUB \
+  ((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
+
+#define KMP_YIELD(cond) \
+  { \
+    KMP_CPU_PAUSE(); \
+    if ((cond) && (KMP_TRY_YIELD)) \
+      __kmp_yield(); \
+  }
+
+#define KMP_YIELD_OVERSUB() \
+  { \
+    KMP_CPU_PAUSE(); \
+    if ((KMP_TRY_YIELD_OVERSUB)) \
+      __kmp_yield(); \
+  }
+
+// Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround,
+// there should be no yielding since initial value from KMP_INIT_YIELD() is odd. 
+#define KMP_YIELD_SPIN(count) \ + { \ + KMP_CPU_PAUSE(); \ + if (KMP_TRY_YIELD) { \ + (count) -= 2; \ + if (!(count)) { \ + __kmp_yield(); \ + (count) = __kmp_yield_next; \ + } \ + } \ + } + +// If TPAUSE is available & enabled, use it. If oversubscribed, use the slower +// (C0.2) state, which improves performance of other SMT threads on the same +// core, otherwise, use the fast (C0.1) default state, or whatever the user has +// requested. Uses a timed TPAUSE, and exponential backoff. If TPAUSE isn't +// available, fall back to the regular CPU pause and yield combination. +#if KMP_HAVE_UMWAIT +#define KMP_TPAUSE_MAX_MASK ((kmp_uint64)0xFFFF) +#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \ + { \ + if (__kmp_tpause_enabled) { \ + if (KMP_OVERSUBSCRIBED) { \ + __kmp_tpause(0, (time)); \ + } else { \ + __kmp_tpause(__kmp_tpause_hint, (time)); \ + } \ + (time) = (time << 1 | 1) & KMP_TPAUSE_MAX_MASK; \ + } else { \ + KMP_CPU_PAUSE(); \ + if ((KMP_TRY_YIELD_OVERSUB)) { \ + __kmp_yield(); \ + } else if (__kmp_use_yield == 1) { \ + (count) -= 2; \ + if (!(count)) { \ + __kmp_yield(); \ + (count) = __kmp_yield_next; \ + } \ + } \ + } \ + } +#else +#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \ + { \ + KMP_CPU_PAUSE(); \ + if ((KMP_TRY_YIELD_OVERSUB)) \ + __kmp_yield(); \ + else if (__kmp_use_yield == 1) { \ + (count) -= 2; \ + if (!(count)) { \ + __kmp_yield(); \ + (count) = __kmp_yield_next; \ + } \ + } \ + } +#endif // KMP_HAVE_UMWAIT + +/* ------------------------------------------------------------------------ */ +/* Support datatypes for the orphaned construct nesting checks. */ +/* ------------------------------------------------------------------------ */ + +/* When adding to this enum, add its corresponding string in cons_text_c[] + * array in kmp_error.cpp */ +enum cons_type { + ct_none, + ct_parallel, + ct_pdo, + ct_pdo_ordered, + ct_psections, + ct_psingle, + ct_critical, + ct_ordered_in_parallel, + ct_ordered_in_pdo, + ct_master, + ct_reduce, + ct_barrier, + ct_masked +}; + +#define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered) + +struct cons_data { + ident_t const *ident; + enum cons_type type; + int prev; + kmp_user_lock_p + name; /* address exclusively for critical section name comparison */ +}; + +struct cons_header { + int p_top, w_top, s_top; + int stack_size, stack_top; + struct cons_data *stack_data; +}; + +struct kmp_region_info { + char *text; + int offset[KMP_MAX_FIELDS]; + int length[KMP_MAX_FIELDS]; +}; + +/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- */ + +#if KMP_OS_WINDOWS +typedef HANDLE kmp_thread_t; +typedef DWORD kmp_key_t; +#endif /* KMP_OS_WINDOWS */ + +#if KMP_OS_UNIX +typedef pthread_t kmp_thread_t; +typedef pthread_key_t kmp_key_t; +#endif + +extern kmp_key_t __kmp_gtid_threadprivate_key; + +typedef struct kmp_sys_info { + long maxrss; /* the maximum resident set size utilized (in kilobytes) */ + long minflt; /* the number of page faults serviced without any I/O */ + long majflt; /* the number of page faults serviced that required I/O */ + long nswap; /* the number of times a process was "swapped" out of memory */ + long inblock; /* the number of times the file system had to perform input */ + long oublock; /* the number of times the file system had to perform output */ + long nvcsw; /* the number of times a context switch was voluntarily */ + long nivcsw; /* the number of times a context switch was forced */ +} kmp_sys_info_t; + +#if 
USE_ITT_BUILD +// We cannot include "kmp_itt.h" due to circular dependency. Declare the only +// required type here. Later we will check the type meets requirements. +typedef int kmp_itt_mark_t; +#define KMP_ITT_DEBUG 0 +#endif /* USE_ITT_BUILD */ + +typedef kmp_int32 kmp_critical_name[8]; + +/*! +@ingroup PARALLEL +The type for a microtask which gets passed to @ref __kmpc_fork_call(). +The arguments to the outlined function are +@param global_tid the global thread identity of the thread executing the +function. +@param bound_tid the local identity of the thread executing the function +@param ... pointers to shared variables accessed by the function. +*/ +typedef void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...); +typedef void (*kmpc_micro_bound)(kmp_int32 *bound_tid, kmp_int32 *bound_nth, + ...); + +/*! +@ingroup THREADPRIVATE +@{ +*/ +/* --------------------------------------------------------------------------- + */ +/* Threadprivate initialization/finalization function declarations */ + +/* for non-array objects: __kmpc_threadprivate_register() */ + +/*! + Pointer to the constructor function. + The first argument is the this pointer +*/ +typedef void *(*kmpc_ctor)(void *); + +/*! + Pointer to the destructor function. + The first argument is the this pointer +*/ +typedef void (*kmpc_dtor)( + void * /*, size_t */); /* 2nd arg: magic number for KCC unused by Intel + compiler */ +/*! + Pointer to an alternate constructor. + The first argument is the this pointer. +*/ +typedef void *(*kmpc_cctor)(void *, void *); + +/* for array objects: __kmpc_threadprivate_register_vec() */ +/* First arg: "this" pointer */ +/* Last arg: number of array elements */ +/*! + Array constructor. + First argument is the this pointer + Second argument the number of array elements. +*/ +typedef void *(*kmpc_ctor_vec)(void *, size_t); +/*! + Pointer to the array destructor function. + The first argument is the this pointer + Second argument the number of array elements. +*/ +typedef void (*kmpc_dtor_vec)(void *, size_t); +/*! + Array constructor. + First argument is the this pointer + Third argument the number of array elements. +*/ +typedef void *(*kmpc_cctor_vec)(void *, void *, + size_t); /* function unused by compiler */ + +/*! 
+@} +*/ + +/* keeps tracked of threadprivate cache allocations for cleanup later */ +typedef struct kmp_cached_addr { + void **addr; /* address of allocated cache */ + void ***compiler_cache; /* pointer to compiler's cache */ + void *data; /* pointer to global data */ + struct kmp_cached_addr *next; /* pointer to next cached address */ +} kmp_cached_addr_t; + +struct private_data { + struct private_data *next; /* The next descriptor in the list */ + void *data; /* The data buffer for this descriptor */ + int more; /* The repeat count for this descriptor */ + size_t size; /* The data size for this descriptor */ +}; + +struct private_common { + struct private_common *next; + struct private_common *link; + void *gbl_addr; + void *par_addr; /* par_addr == gbl_addr for PRIMARY thread */ + size_t cmn_size; +}; + +struct shared_common { + struct shared_common *next; + struct private_data *pod_init; + void *obj_init; + void *gbl_addr; + union { + kmpc_ctor ctor; + kmpc_ctor_vec ctorv; + } ct; + union { + kmpc_cctor cctor; + kmpc_cctor_vec cctorv; + } cct; + union { + kmpc_dtor dtor; + kmpc_dtor_vec dtorv; + } dt; + size_t vec_len; + int is_vec; + size_t cmn_size; +}; + +#define KMP_HASH_TABLE_LOG2 9 /* log2 of the hash table size */ +#define KMP_HASH_TABLE_SIZE \ + (1 << KMP_HASH_TABLE_LOG2) /* size of the hash table */ +#define KMP_HASH_SHIFT 3 /* throw away this many low bits from the address */ +#define KMP_HASH(x) \ + ((((kmp_uintptr_t)x) >> KMP_HASH_SHIFT) & (KMP_HASH_TABLE_SIZE - 1)) + +struct common_table { + struct private_common *data[KMP_HASH_TABLE_SIZE]; +}; + +struct shared_table { + struct shared_common *data[KMP_HASH_TABLE_SIZE]; +}; + +/* ------------------------------------------------------------------------ */ + +#if KMP_USE_HIER_SCHED +// Shared barrier data that exists inside a single unit of the scheduling +// hierarchy +typedef struct kmp_hier_private_bdata_t { + kmp_int32 num_active; + kmp_uint64 index; + kmp_uint64 wait_val[2]; +} kmp_hier_private_bdata_t; +#endif + +typedef struct kmp_sched_flags { + unsigned ordered : 1; + unsigned nomerge : 1; + unsigned contains_last : 1; + unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code + unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code + unsigned unused : 27; +} kmp_sched_flags_t; + +KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4); + +#if KMP_STATIC_STEAL_ENABLED +typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { + kmp_int32 count; + kmp_int32 ub; + /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ + kmp_int32 lb; + kmp_int32 st; + kmp_int32 tc; + kmp_lock_t *steal_lock; // lock used for chunk stealing + + kmp_uint32 ordered_lower; + kmp_uint32 ordered_upper; + + // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) + // a) parm3 is properly aligned and + // b) all parm1-4 are on the same cache line. + // Because of parm1-4 are used together, performance seems to be better + // if they are on the same cache line (not measured though). 
+ + struct KMP_ALIGN(32) { + kmp_int32 parm1; + kmp_int32 parm2; + kmp_int32 parm3; + kmp_int32 parm4; + }; + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + kmp_uint32 pchunks; + kmp_uint32 num_procs_with_pcore; + kmp_int32 first_thread_with_ecore; +#endif +#if KMP_OS_WINDOWS + kmp_int32 last_upper; +#endif /* KMP_OS_WINDOWS */ +} dispatch_private_info32_t; + +#if CACHE_LINE <= 128 +KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128); +#endif + +typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { + kmp_int64 count; // current chunk number for static & static-steal scheduling + kmp_int64 ub; /* upper-bound */ + /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ + kmp_int64 lb; /* lower-bound */ + kmp_int64 st; /* stride */ + kmp_int64 tc; /* trip count (number of iterations) */ + kmp_lock_t *steal_lock; // lock used for chunk stealing + + kmp_uint64 ordered_lower; + kmp_uint64 ordered_upper; + /* parm[1-4] are used in different ways by different scheduling algorithms */ + + // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on ) + // a) parm3 is properly aligned and + // b) all parm1-4 are in the same cache line. + // Because of parm1-4 are used together, performance seems to be better + // if they are in the same line (not measured though). + struct KMP_ALIGN(32) { + kmp_int64 parm1; + kmp_int64 parm2; + kmp_int64 parm3; + kmp_int64 parm4; + }; + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + kmp_uint64 pchunks; + kmp_uint64 num_procs_with_pcore; + kmp_int64 first_thread_with_ecore; +#endif + +#if KMP_OS_WINDOWS + kmp_int64 last_upper; +#endif /* KMP_OS_WINDOWS */ +} dispatch_private_info64_t; + +#if CACHE_LINE <= 128 +KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128); +#endif + +#else /* KMP_STATIC_STEAL_ENABLED */ +typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { + kmp_int32 lb; + kmp_int32 ub; + kmp_int32 st; + kmp_int32 tc; + + kmp_int32 parm1; + kmp_int32 parm2; + kmp_int32 parm3; + kmp_int32 parm4; + + kmp_int32 count; + + kmp_uint32 ordered_lower; + kmp_uint32 ordered_upper; +#if KMP_OS_WINDOWS + kmp_int32 last_upper; +#endif /* KMP_OS_WINDOWS */ +} dispatch_private_info32_t; + +typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { + kmp_int64 lb; /* lower-bound */ + kmp_int64 ub; /* upper-bound */ + kmp_int64 st; /* stride */ + kmp_int64 tc; /* trip count (number of iterations) */ + + /* parm[1-4] are used in different ways by different scheduling algorithms */ + kmp_int64 parm1; + kmp_int64 parm2; + kmp_int64 parm3; + kmp_int64 parm4; + + kmp_int64 count; /* current chunk number for static scheduling */ + + kmp_uint64 ordered_lower; + kmp_uint64 ordered_upper; +#if KMP_OS_WINDOWS + kmp_int64 last_upper; +#endif /* KMP_OS_WINDOWS */ +} dispatch_private_info64_t; +#endif /* KMP_STATIC_STEAL_ENABLED */ + +typedef struct KMP_ALIGN_CACHE dispatch_private_info { + union private_info { + dispatch_private_info32_t p32; + dispatch_private_info64_t p64; + } u; + enum sched_type schedule; /* scheduling algorithm */ + kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) 
*/
+  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
+  kmp_int32 ordered_bumped;
+  // Stack of buffers for nest of serial regions
+  struct dispatch_private_info *next;
+  kmp_int32 type_size; /* the size of types in private_info */
+#if KMP_USE_HIER_SCHED
+  kmp_int32 hier_id;
+  void *parent; /* hierarchical scheduling parent pointer */
+#endif
+  enum cons_type pushed_ws;
+} dispatch_private_info_t;
+
+typedef struct dispatch_shared_info32 {
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile kmp_uint32 iteration;
+  volatile kmp_int32 num_done;
+  volatile kmp_uint32 ordered_iteration;
+  // Dummy to retain the structure size after making ordered_iteration scalar
+  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1];
+} dispatch_shared_info32_t;
+
+typedef struct dispatch_shared_info64 {
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile kmp_uint64 iteration;
+  volatile kmp_int64 num_done;
+  volatile kmp_uint64 ordered_iteration;
+  // Dummy to retain the structure size after making ordered_iteration scalar
+  kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3];
+} dispatch_shared_info64_t;
+
+typedef struct dispatch_shared_info {
+  union shared_info {
+    dispatch_shared_info32_t s32;
+    dispatch_shared_info64_t s64;
+  } u;
+  volatile kmp_uint32 buffer_index;
+  volatile kmp_int32 doacross_buf_idx; // teamwise index
+  volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
+  kmp_int32 doacross_num_done; // count finished threads
+#if KMP_USE_HIER_SCHED
+  void *hier;
+#endif
+#if KMP_USE_HWLOC
+  // When linking with libhwloc, the ORDERED EPCC test slows down on big
+  // machines (> 48 cores). Performance analysis showed that a cache thrash
+  // was occurring and this padding helps alleviate the problem. 
+ char padding[64]; +#endif +} dispatch_shared_info_t; + +typedef struct kmp_disp { + /* Vector for ORDERED SECTION */ + void (*th_deo_fcn)(int *gtid, int *cid, ident_t *); + /* Vector for END ORDERED SECTION */ + void (*th_dxo_fcn)(int *gtid, int *cid, ident_t *); + + dispatch_shared_info_t *th_dispatch_sh_current; + dispatch_private_info_t *th_dispatch_pr_current; + + dispatch_private_info_t *th_disp_buffer; + kmp_uint32 th_disp_index; + kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index + volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags + kmp_int64 *th_doacross_info; // info on loop bounds +#if KMP_USE_INTERNODE_ALIGNMENT + char more_padding[INTERNODE_CACHE_LINE]; +#endif +} kmp_disp_t; + +/* ------------------------------------------------------------------------ */ +/* Barrier stuff */ + +/* constants for barrier state update */ +#define KMP_INIT_BARRIER_STATE 0 /* should probably start from zero */ +#define KMP_BARRIER_SLEEP_BIT 0 /* bit used for suspend/sleep part of state */ +#define KMP_BARRIER_UNUSED_BIT 1 // bit that must never be set for valid state +#define KMP_BARRIER_BUMP_BIT 2 /* lsb used for bump of go/arrived state */ + +#define KMP_BARRIER_SLEEP_STATE (1 << KMP_BARRIER_SLEEP_BIT) +#define KMP_BARRIER_UNUSED_STATE (1 << KMP_BARRIER_UNUSED_BIT) +#define KMP_BARRIER_STATE_BUMP (1 << KMP_BARRIER_BUMP_BIT) + +#if (KMP_BARRIER_SLEEP_BIT >= KMP_BARRIER_BUMP_BIT) +#error "Barrier sleep bit must be smaller than barrier bump bit" +#endif +#if (KMP_BARRIER_UNUSED_BIT >= KMP_BARRIER_BUMP_BIT) +#error "Barrier unused bit must be smaller than barrier bump bit" +#endif + +// Constants for release barrier wait state: currently, hierarchical only +#define KMP_BARRIER_NOT_WAITING 0 // Normal state; worker not in wait_sleep +#define KMP_BARRIER_OWN_FLAG \ + 1 // Normal state; worker waiting on own b_go flag in release +#define KMP_BARRIER_PARENT_FLAG \ + 2 // Special state; worker waiting on parent's b_go flag in release +#define KMP_BARRIER_SWITCH_TO_OWN_FLAG \ + 3 // Special state; tells worker to shift from parent to own b_go +#define KMP_BARRIER_SWITCHING \ + 4 // Special state; worker resets appropriate flag on wake-up + +#define KMP_NOT_SAFE_TO_REAP \ + 0 // Thread th_reap_state: not safe to reap (tasking) +#define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking) + +// The flag_type describes the storage used for the flag. 
+enum flag_type { + flag32, /**< atomic 32 bit flags */ + flag64, /**< 64 bit flags */ + atomic_flag64, /**< atomic 64 bit flags */ + flag_oncore, /**< special 64-bit flag for on-core barrier (hierarchical) */ + flag_unset +}; + +enum barrier_type { + bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction + barriers if enabled) */ + bs_forkjoin_barrier, /* 1, All fork/join (parallel region) barriers */ +#if KMP_FAST_REDUCTION_BARRIER + bs_reduction_barrier, /* 2, All barriers that are used in reduction */ +#endif // KMP_FAST_REDUCTION_BARRIER + bs_last_barrier /* Just a placeholder to mark the end */ +}; + +// to work with reduction barriers just like with plain barriers +#if !KMP_FAST_REDUCTION_BARRIER +#define bs_reduction_barrier bs_plain_barrier +#endif // KMP_FAST_REDUCTION_BARRIER + +typedef enum kmp_bar_pat { /* Barrier communication patterns */ + bp_linear_bar = + 0, /* Single level (degenerate) tree */ + bp_tree_bar = + 1, /* Balanced tree with branching factor 2^n */ + bp_hyper_bar = 2, /* Hypercube-embedded tree with min + branching factor 2^n */ + bp_hierarchical_bar = 3, /* Machine hierarchy tree */ + bp_dist_bar = 4, /* Distributed barrier */ + bp_last_bar /* Placeholder to mark the end */ +} kmp_bar_pat_e; + +#define KMP_BARRIER_ICV_PUSH 1 + +/* Record for holding the values of the internal controls stack records */ +typedef struct kmp_internal_control { + int serial_nesting_level; /* corresponds to the value of the + th_team_serialized field */ + kmp_int8 dynamic; /* internal control for dynamic adjustment of threads (per + thread) */ + kmp_int8 + bt_set; /* internal control for whether blocktime is explicitly set */ + int blocktime; /* internal control for blocktime */ +#if KMP_USE_MONITOR + int bt_intervals; /* internal control for blocktime intervals */ +#endif + int nproc; /* internal control for #threads for next parallel region (per + thread) */ + int thread_limit; /* internal control for thread-limit-var */ + int task_thread_limit; /* internal control for thread-limit-var of a task*/ + int max_active_levels; /* internal control for max_active_levels */ + kmp_r_sched_t + sched; /* internal control for runtime schedule {sched,chunk} pair */ + kmp_proc_bind_t proc_bind; /* internal control for affinity */ + kmp_int32 default_device; /* internal control for default device */ + struct kmp_internal_control *next; +} kmp_internal_control_t; + +static inline void copy_icvs(kmp_internal_control_t *dst, + kmp_internal_control_t *src) { + *dst = *src; +} + +/* Thread barrier needs volatile barrier fields */ +typedef struct KMP_ALIGN_CACHE kmp_bstate { + // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all + // uses of it). It is not explicitly aligned below, because we *don't* want + // it to be padded -- instead, we fit b_go into the same cache line with + // th_fixed_icvs, enabling NGO cache lines stores in the hierarchical barrier. + kmp_internal_control_t th_fixed_icvs; // Initial ICVs for the thread + // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with + // same NGO store + volatile kmp_uint64 b_go; // STATE => task should proceed (hierarchical) + KMP_ALIGN_CACHE volatile kmp_uint64 + b_arrived; // STATE => task reached synch point. 
+  kmp_uint32 *skip_per_level;
+  kmp_uint32 my_level;
+  kmp_int32 parent_tid;
+  kmp_int32 old_tid;
+  kmp_uint32 depth;
+  struct kmp_bstate *parent_bar;
+  kmp_team_t *team;
+  kmp_uint64 leaf_state;
+  kmp_uint32 nproc;
+  kmp_uint8 base_leaf_kids;
+  kmp_uint8 leaf_kids;
+  kmp_uint8 offset;
+  kmp_uint8 wait_flag;
+  kmp_uint8 use_oncore_barrier;
+#if USE_DEBUGGER
+  // The following field is intended for the debugger solely. Only the worker
+  // thread itself accesses this field: the worker increases it by 1 when it
+  // arrives to a barrier.
+  KMP_ALIGN_CACHE kmp_uint b_worker_arrived;
+#endif /* USE_DEBUGGER */
+} kmp_bstate_t;
+
+union KMP_ALIGN_CACHE kmp_barrier_union {
+  double b_align; /* use worst case alignment */
+  char b_pad[KMP_PAD(kmp_bstate_t, CACHE_LINE)];
+  kmp_bstate_t bb;
+};
+
+typedef union kmp_barrier_union kmp_balign_t;
+
+/* Team barrier needs only non-volatile arrived counter */
+union KMP_ALIGN_CACHE kmp_barrier_team_union {
+  double b_align; /* use worst case alignment */
+  char b_pad[CACHE_LINE];
+  struct {
+    kmp_uint64 b_arrived; /* STATE => task reached synch point. */
+#if USE_DEBUGGER
+    // The following two fields are intended for the debugger solely. Only
+    // primary thread of the team accesses these fields: the first one is
+    // increased by 1 when the primary thread arrives to a barrier, the second
+    // one is increased by one when all the threads arrived.
+    kmp_uint b_master_arrived;
+    kmp_uint b_team_arrived;
+#endif
+  };
+};
+
+typedef union kmp_barrier_team_union kmp_balign_team_t;
+
+/* Padding for Linux* OS pthreads condition variables and mutexes used to signal
+   threads when a condition changes. This is to work around an NPTL bug where
+   padding was added to pthread_cond_t which caused the initialization routine
+   to write outside of the structure if compiled on pre-NPTL threads. */
+#if KMP_OS_WINDOWS
+typedef struct kmp_win32_mutex {
+  /* The Lock */
+  CRITICAL_SECTION cs;
+} kmp_win32_mutex_t;
+
+typedef struct kmp_win32_cond {
+  /* Count of the number of waiters. */
+  int waiters_count_;
+
+  /* Serialize access to waiters_count_ */
+  kmp_win32_mutex_t waiters_count_lock_;
+
+  /* Number of threads to release via a cond_broadcast or a cond_signal */
+  int release_count_;
+
+  /* Keeps track of the current "generation" so that we don't allow */
+  /* one thread to steal all the "releases" from the broadcast. */
+  int wait_generation_count_;
+
+  /* A manual-reset event that's used to block and release waiting threads. */
+  HANDLE event_;
+} kmp_win32_cond_t;
+#endif
+
+#if KMP_OS_UNIX
+
+union KMP_ALIGN_CACHE kmp_cond_union {
+  double c_align;
+  char c_pad[CACHE_LINE];
+  pthread_cond_t c_cond;
+};
+
+typedef union kmp_cond_union kmp_cond_align_t;
+
+union KMP_ALIGN_CACHE kmp_mutex_union {
+  double m_align;
+  char m_pad[CACHE_LINE];
+  pthread_mutex_t m_mutex;
+};
+
+typedef union kmp_mutex_union kmp_mutex_align_t;
+
+#endif /* KMP_OS_UNIX */
+
+typedef struct kmp_desc_base {
+  void *ds_stackbase;
+  size_t ds_stacksize;
+  int ds_stackgrow;
+  kmp_thread_t ds_thread;
+  volatile int ds_tid;
+  int ds_gtid;
+#if KMP_OS_WINDOWS
+  volatile int ds_alive;
+  DWORD ds_thread_id;
+/* ds_thread keeps thread handle on Windows* OS. It is enough for RTL purposes.
+   However, debugger support (libomp_db) cannot work with handles, because they
+   are not comparable. For example, debugger requests info about thread with handle h.
+   h is valid within debugger process, and meaningless within debugee process. 
+ Even if h is duped by call to DuplicateHandle(), so the result h' is valid + within debugee process, but it is a *new* handle which does *not* equal to + any other handle in debugee... The only way to compare handles is convert + them to system-wide ids. GetThreadId() function is available only in + Longhorn and Server 2003. :-( In contrast, GetCurrentThreadId() is available + on all Windows* OS flavours (including Windows* 95). Thus, we have to get + thread id by call to GetCurrentThreadId() from within the thread and save it + to let libomp_db identify threads. */ +#endif /* KMP_OS_WINDOWS */ +} kmp_desc_base_t; + +typedef union KMP_ALIGN_CACHE kmp_desc { + double ds_align; /* use worst case alignment */ + char ds_pad[KMP_PAD(kmp_desc_base_t, CACHE_LINE)]; + kmp_desc_base_t ds; +} kmp_desc_t; + +typedef struct kmp_local { + volatile int this_construct; /* count of single's encountered by thread */ + void *reduce_data; +#if KMP_USE_BGET + void *bget_data; + void *bget_list; +#if !USE_CMP_XCHG_FOR_BGET +#ifdef USE_QUEUING_LOCK_FOR_BGET + kmp_lock_t bget_lock; /* Lock for accessing bget free list */ +#else + kmp_bootstrap_lock_t bget_lock; // Lock for accessing bget free list. Must be +// bootstrap lock so we can use it at library +// shutdown. +#endif /* USE_LOCK_FOR_BGET */ +#endif /* ! USE_CMP_XCHG_FOR_BGET */ +#endif /* KMP_USE_BGET */ + + PACKED_REDUCTION_METHOD_T + packed_reduction_method; /* stored by __kmpc_reduce*(), used by + __kmpc_end_reduce*() */ + +} kmp_local_t; + +#define KMP_CHECK_UPDATE(a, b) \ + if ((a) != (b)) \ + (a) = (b) +#define KMP_CHECK_UPDATE_SYNC(a, b) \ + if ((a) != (b)) \ + TCW_SYNC_PTR((a), (b)) + +#define get__blocktime(xteam, xtid) \ + ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) +#define get__bt_set(xteam, xtid) \ + ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) +#if KMP_USE_MONITOR +#define get__bt_intervals(xteam, xtid) \ + ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) +#endif + +#define get__dynamic_2(xteam, xtid) \ + ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic) +#define get__nproc_2(xteam, xtid) \ + ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc) +#define get__sched_2(xteam, xtid) \ + ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched) + +#define set__blocktime_team(xteam, xtid, xval) \ + (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) = \ + (xval)) + +#if KMP_USE_MONITOR +#define set__bt_intervals_team(xteam, xtid, xval) \ + (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) = \ + (xval)) +#endif + +#define set__bt_set_team(xteam, xtid, xval) \ + (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) = (xval)) + +#define set__dynamic(xthread, xval) \ + (((xthread)->th.th_current_task->td_icvs.dynamic) = (xval)) +#define get__dynamic(xthread) \ + (((xthread)->th.th_current_task->td_icvs.dynamic) ? 
(FTN_TRUE) : (FTN_FALSE)) + +#define set__nproc(xthread, xval) \ + (((xthread)->th.th_current_task->td_icvs.nproc) = (xval)) + +#define set__thread_limit(xthread, xval) \ + (((xthread)->th.th_current_task->td_icvs.thread_limit) = (xval)) + +#define set__max_active_levels(xthread, xval) \ + (((xthread)->th.th_current_task->td_icvs.max_active_levels) = (xval)) + +#define get__max_active_levels(xthread) \ + ((xthread)->th.th_current_task->td_icvs.max_active_levels) + +#define set__sched(xthread, xval) \ + (((xthread)->th.th_current_task->td_icvs.sched) = (xval)) + +#define set__proc_bind(xthread, xval) \ + (((xthread)->th.th_current_task->td_icvs.proc_bind) = (xval)) +#define get__proc_bind(xthread) \ + ((xthread)->th.th_current_task->td_icvs.proc_bind) + +// OpenMP tasking data structures + +typedef enum kmp_tasking_mode { + tskm_immediate_exec = 0, + tskm_extra_barrier = 1, + tskm_task_teams = 2, + tskm_max = 2 +} kmp_tasking_mode_t; + +extern kmp_tasking_mode_t + __kmp_tasking_mode; /* determines how/when to execute tasks */ +extern int __kmp_task_stealing_constraint; +extern int __kmp_enable_task_throttling; +extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if +// specified, defaults to 0 otherwise +// Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise +extern kmp_int32 __kmp_max_task_priority; +// Set via KMP_TASKLOOP_MIN_TASKS if specified, defaults to 0 otherwise +extern kmp_uint64 __kmp_taskloop_min_tasks; + +/* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with + taskdata first */ +#define KMP_TASK_TO_TASKDATA(task) (((kmp_taskdata_t *)task) - 1) +#define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) + +// The tt_found_tasks flag is a signal to all threads in the team that tasks +// were spawned and queued since the previous barrier release. +#define KMP_TASKING_ENABLED(task_team) \ + (TRUE == TCR_SYNC_4((task_team)->tt.tt_found_tasks)) +/*! +@ingroup BASIC_TYPES +@{ +*/ + +/*! + */ +typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, void *); + +typedef union kmp_cmplrdata { + kmp_int32 priority; /**< priority specified by user for the task */ + kmp_routine_entry_t + destructors; /* pointer to function to invoke deconstructors of + firstprivate C++ objects */ + /* future data */ +} kmp_cmplrdata_t; + +/* sizeof_kmp_task_t passed as arg to kmpc_omp_task call */ +/*! + */ +typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */ + void *shareds; /**< pointer to block of pointers to shared vars */ + kmp_routine_entry_t + routine; /**< pointer to routine to call for executing task */ + kmp_int32 part_id; /**< part id for the task */ + kmp_cmplrdata_t + data1; /* Two known optional additions: destructors and priority */ + kmp_cmplrdata_t data2; /* Process destructors first, priority second */ + /* future data */ + /* private vars */ +} kmp_task_t; + +/*! 
+@} +*/ + +typedef struct kmp_taskgroup { + std::atomic count; // number of allocated and incomplete tasks + std::atomic + cancel_request; // request for cancellation of this taskgroup + struct kmp_taskgroup *parent; // parent taskgroup + // Block of data to perform task reduction + void *reduce_data; // reduction related info + kmp_int32 reduce_num_data; // number of data items to reduce + uintptr_t *gomp_data; // gomp reduction data +} kmp_taskgroup_t; + +// forward declarations +typedef union kmp_depnode kmp_depnode_t; +typedef struct kmp_depnode_list kmp_depnode_list_t; +typedef struct kmp_dephash_entry kmp_dephash_entry_t; + +// macros for checking dep flag as an integer +#define KMP_DEP_IN 0x1 +#define KMP_DEP_OUT 0x2 +#define KMP_DEP_INOUT 0x3 +#define KMP_DEP_MTX 0x4 +#define KMP_DEP_SET 0x8 +#define KMP_DEP_ALL 0x80 +// Compiler sends us this info: +typedef struct kmp_depend_info { + kmp_intptr_t base_addr; + size_t len; + union { + kmp_uint8 flag; // flag as an unsigned char + struct { // flag as a set of 8 bits +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* Same fields as in the #else branch, but in reverse order */ + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else + unsigned in : 1; + unsigned out : 1; + unsigned mtx : 1; + unsigned set : 1; + unsigned unused : 3; + unsigned all : 1; +#endif + } flags; + }; +} kmp_depend_info_t; + +// Internal structures to work with task dependencies: +struct kmp_depnode_list { + kmp_depnode_t *node; + kmp_depnode_list_t *next; +}; + +// Max number of mutexinoutset dependencies per node +#define MAX_MTX_DEPS 4 + +typedef struct kmp_base_depnode { + kmp_depnode_list_t *successors; /* used under lock */ + kmp_task_t *task; /* non-NULL if depnode is active, used under lock */ + kmp_lock_t *mtx_locks[MAX_MTX_DEPS]; /* lock mutexinoutset dependent tasks */ + kmp_int32 mtx_num_locks; /* number of locks in mtx_locks array */ + kmp_lock_t lock; /* guards shared fields: task, successors */ +#if KMP_SUPPORT_GRAPH_OUTPUT + kmp_uint32 id; +#endif + std::atomic npredecessors; + std::atomic nrefs; +} kmp_base_depnode_t; + +union KMP_ALIGN_CACHE kmp_depnode { + double dn_align; /* use worst case alignment */ + char dn_pad[KMP_PAD(kmp_base_depnode_t, CACHE_LINE)]; + kmp_base_depnode_t dn; +}; + +struct kmp_dephash_entry { + kmp_intptr_t addr; + kmp_depnode_t *last_out; + kmp_depnode_list_t *last_set; + kmp_depnode_list_t *prev_set; + kmp_uint8 last_flag; + kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */ + kmp_dephash_entry_t *next_in_bucket; +}; + +typedef struct kmp_dephash { + kmp_dephash_entry_t **buckets; + size_t size; + kmp_depnode_t *last_all; + size_t generation; + kmp_uint32 nelements; + kmp_uint32 nconflicts; +} kmp_dephash_t; + +typedef struct kmp_task_affinity_info { + kmp_intptr_t base_addr; + size_t len; + struct { + bool flag1 : 1; + bool flag2 : 1; + kmp_int32 reserved : 30; + } flags; +} kmp_task_affinity_info_t; + +typedef enum kmp_event_type_t { + KMP_EVENT_UNINITIALIZED = 0, + KMP_EVENT_ALLOW_COMPLETION = 1 +} kmp_event_type_t; + +typedef struct { + kmp_event_type_t type; + kmp_tas_lock_t lock; + union { + kmp_task_t *task; + } ed; +} kmp_event_t; + +#if OMPX_TASKGRAPH +// Initial number of allocated nodes while recording +#define INIT_MAPSIZE 50 + +typedef struct kmp_taskgraph_flags { /*This needs to be exactly 32 bits */ + unsigned nowait : 1; + unsigned re_record : 1; + unsigned reserved : 30; +} kmp_taskgraph_flags_t; + +/// 
Represents a TDG node
+typedef struct kmp_node_info {
+  kmp_task_t *task; // Pointer to the actual task
+  kmp_int32 *successors; // Array of the successor ids
+  kmp_int32 nsuccessors; // Number of successors of the node
+  std::atomic<kmp_int32>
+      npredecessors_counter; // Number of predecessors on the fly
+  kmp_int32 npredecessors; // Total number of predecessors
+  kmp_int32 successors_size; // Number of allocated successor ids
+  kmp_taskdata_t *parent_task; // Parent implicit task
+} kmp_node_info_t;
+
+/// Represents a TDG's current status
+typedef enum kmp_tdg_status {
+  KMP_TDG_NONE = 0,
+  KMP_TDG_RECORDING = 1,
+  KMP_TDG_READY = 2
+} kmp_tdg_status_t;
+
+/// Structure that contains a TDG
+typedef struct kmp_tdg_info {
+  kmp_int32 tdg_id; // Unique identifier of the TDG
+  kmp_taskgraph_flags_t tdg_flags; // Flags related to a TDG
+  kmp_int32 map_size; // Number of allocated TDG nodes
+  kmp_int32 num_roots; // Number of root tasks in the TDG
+  kmp_int32 *root_tasks; // Array of task identifiers that are roots
+  kmp_node_info_t *record_map; // Array of TDG nodes
+  kmp_tdg_status_t tdg_status =
+      KMP_TDG_NONE; // Status of the TDG (recording, ready...)
+  std::atomic<kmp_int32> num_tasks; // Number of TDG nodes
+  kmp_bootstrap_lock_t
+      graph_lock; // Protect graph attributes when updated via taskloop_recur
+  // Taskloop reduction related
+  void *rec_taskred_data; // Data to pass to __kmpc_task_reduction_init or
+                          // __kmpc_taskred_init
+  kmp_int32 rec_num_taskred;
+} kmp_tdg_info_t;
+
+extern int __kmp_tdg_dot;
+extern kmp_int32 __kmp_max_tdgs;
+extern kmp_tdg_info_t **__kmp_global_tdgs;
+extern kmp_int32 __kmp_curr_tdg_idx;
+extern kmp_int32 __kmp_successors_size;
+extern std::atomic<kmp_int32> __kmp_tdg_task_id;
+extern kmp_int32 __kmp_num_tdg;
+#endif
+
+#ifdef BUILD_TIED_TASK_STACK
+
+/* Tied Task stack definitions */
+typedef struct kmp_stack_block {
+  kmp_taskdata_t *sb_block[TASK_STACK_BLOCK_SIZE];
+  struct kmp_stack_block *sb_next;
+  struct kmp_stack_block *sb_prev;
+} kmp_stack_block_t;
+
+typedef struct kmp_task_stack {
+  kmp_stack_block_t ts_first_block; // first block of stack entries
+  kmp_taskdata_t **ts_top; // pointer to the top of stack
+  kmp_int32 ts_entries; // number of entries on the stack
+} kmp_task_stack_t;
+
+#endif // BUILD_TIED_TASK_STACK
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  /* Same fields as in the #else branch, but in reverse order */
+#if OMPX_TASKGRAPH
+  unsigned reserved31 : 6;
+  unsigned onced : 1;
+#else
+  unsigned reserved31 : 7;
+#endif
+  unsigned native : 1;
+  unsigned freed : 1;
+  unsigned complete : 1;
+  unsigned executing : 1;
+  unsigned started : 1;
+  unsigned team_serial : 1;
+  unsigned tasking_ser : 1;
+  unsigned task_serial : 1;
+  unsigned tasktype : 1;
+  unsigned reserved : 8;
+  unsigned hidden_helper : 1;
+  unsigned detachable : 1;
+  unsigned priority_specified : 1;
+  unsigned proxy : 1;
+  unsigned destructors_thunk : 1;
+  unsigned merged_if0 : 1;
+  unsigned final : 1;
+  unsigned tiedness : 1;
+#else
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
+  unsigned final : 1; /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+                              code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+                                     invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+                         context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+                                      setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1; /* 1 == hidden helper task */
+  unsigned reserved : 8; /* reserved for compiler use */
+
+  /* Library flags */ /* Total library flags must be 16 bits */
+  unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1; /* 1==started, 0==not started */
+  unsigned executing : 1; /* 1==executing, 0==not executing */
+  unsigned complete : 1; /* 1==complete, 0==not complete */
+  unsigned freed : 1; /* 1==freed, 0==allocated */
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+#if OMPX_TASKGRAPH
+  unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay purposes */
+  unsigned reserved31 : 6; /* reserved for library use */
+#else
+  unsigned reserved31 : 7; /* reserved for library use */
+#endif
+#endif
+} kmp_tasking_flags_t;
+
+typedef struct kmp_target_data {
+  void *async_handle; // libomptarget async handle for task completion query
+} kmp_target_data_t;
+
+struct kmp_taskdata { /* aligned during dynamic allocation */
+  kmp_int32 td_task_id; /* id, assigned by debugger */
+  kmp_tasking_flags_t td_flags; /* task flags */
+  kmp_team_t *td_team; /* team for this task */
+  kmp_info_p *td_alloc_thread; /* thread that allocated data structures */
+  /* Currently not used except for perhaps IDB */
+  kmp_taskdata_t *td_parent; /* parent task */
+  kmp_int32 td_level; /* task nesting level */
+  std::atomic<kmp_int32> td_untied_count; // untied task active parts counter
+  ident_t *td_ident; /* task identifier */
+  // Taskwait data.
+  ident_t *td_taskwait_ident;
+  kmp_uint32 td_taskwait_counter;
+  kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */
+  KMP_ALIGN_CACHE kmp_internal_control_t
+      td_icvs; /* Internal control variables for the task */
+  KMP_ALIGN_CACHE std::atomic<kmp_int32>
+      td_allocated_child_tasks; /* Child tasks (+ current task) not yet
+                                   deallocated */
+  std::atomic<kmp_int32>
+      td_incomplete_child_tasks; /* Child tasks not yet complete */
+  kmp_taskgroup_t
+      *td_taskgroup; // Each task keeps pointer to its current taskgroup
+  kmp_dephash_t
+      *td_dephash; // Dependencies for children tasks are tracked from here
+  kmp_depnode_t
+      *td_depnode; // Pointer to graph node if this task has dependencies
+  kmp_task_team_t *td_task_team;
+  size_t td_size_alloc; // Size of task structure, including shareds etc.
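
Aside: kmp_taskdata (continued below) is always allocated in a single block directly in front of its kmp_task_t, which is what the KMP_TASK_TO_TASKDATA / KMP_TASKDATA_TO_TASK macros earlier in this header rely on. The following is only a minimal standalone sketch of that layout; the demo_* names and field sets are illustrative, not runtime types.

#include <cstddef>
#include <cstdlib>

// Simplified stand-ins for kmp_taskdata_t and kmp_task_t.
struct demo_taskdata { int td_task_id; unsigned td_flags; };
struct demo_task { void *shareds; int part_id; };

// One allocation: taskdata first, then the task (plus its private data).
static demo_task *demo_task_alloc(std::size_t sizeof_task) {
  void *block = std::malloc(sizeof(demo_taskdata) + sizeof_task);
  demo_taskdata *td = static_cast<demo_taskdata *>(block);
  return reinterpret_cast<demo_task *>(td + 1); // same idea as KMP_TASKDATA_TO_TASK
}

static demo_taskdata *demo_task_to_taskdata(demo_task *task) {
  return reinterpret_cast<demo_taskdata *>(task) - 1; // same idea as KMP_TASK_TO_TASKDATA
}
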
+#if defined(KMP_GOMP_COMPAT) + // 4 or 8 byte integers for the loop bounds in GOMP_taskloop + kmp_int32 td_size_loop_bounds; +#endif + kmp_taskdata_t *td_last_tied; // keep tied task for task scheduling constraint +#if defined(KMP_GOMP_COMPAT) + // GOMP sends in a copy function for copy constructors + void (*td_copy_func)(void *, void *); +#endif + kmp_event_t td_allow_completion_event; +#if OMPT_SUPPORT + ompt_task_info_t ompt_task_info; +#endif +#if OMPX_TASKGRAPH + bool is_taskgraph = 0; // whether the task is within a TDG + kmp_tdg_info_t *tdg; // used to associate task with a TDG +#endif + kmp_target_data_t td_target_data; +}; // struct kmp_taskdata + +// Make sure padding above worked +KMP_BUILD_ASSERT(sizeof(kmp_taskdata_t) % sizeof(void *) == 0); + +// Data for task team but per thread +typedef struct kmp_base_thread_data { + kmp_info_p *td_thr; // Pointer back to thread info + // Used only in __kmp_execute_tasks_template, maybe not avail until task is + // queued? + kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque + kmp_taskdata_t * + *td_deque; // Deque of tasks encountered by td_thr, dynamically allocated + kmp_int32 td_deque_size; // Size of deck + kmp_uint32 td_deque_head; // Head of deque (will wrap) + kmp_uint32 td_deque_tail; // Tail of deque (will wrap) + kmp_int32 td_deque_ntasks; // Number of tasks in deque + // GEH: shouldn't this be volatile since used in while-spin? + kmp_int32 td_deque_last_stolen; // Thread number of last successful steal +#ifdef BUILD_TIED_TASK_STACK + kmp_task_stack_t td_susp_tied_tasks; // Stack of suspended tied tasks for task +// scheduling constraint +#endif // BUILD_TIED_TASK_STACK +} kmp_base_thread_data_t; + +#define TASK_DEQUE_BITS 8 // Used solely to define INITIAL_TASK_DEQUE_SIZE +#define INITIAL_TASK_DEQUE_SIZE (1 << TASK_DEQUE_BITS) + +#define TASK_DEQUE_SIZE(td) ((td).td_deque_size) +#define TASK_DEQUE_MASK(td) ((td).td_deque_size - 1) + +typedef union KMP_ALIGN_CACHE kmp_thread_data { + kmp_base_thread_data_t td; + double td_align; /* use worst case alignment */ + char td_pad[KMP_PAD(kmp_base_thread_data_t, CACHE_LINE)]; +} kmp_thread_data_t; + +typedef struct kmp_task_pri { + kmp_thread_data_t td; + kmp_int32 priority; + kmp_task_pri *next; +} kmp_task_pri_t; + +// Data for task teams which are used when tasking is enabled for the team +typedef struct kmp_base_task_team { + kmp_bootstrap_lock_t + tt_threads_lock; /* Lock used to allocate per-thread part of task team */ + /* must be bootstrap lock since used at library shutdown*/ + + // TODO: check performance vs kmp_tas_lock_t + kmp_bootstrap_lock_t tt_task_pri_lock; /* Lock to access priority tasks */ + kmp_task_pri_t *tt_task_pri_list; + + kmp_task_team_t *tt_next; /* For linking the task team free list */ + kmp_thread_data_t + *tt_threads_data; /* Array of per-thread structures for task team */ + /* Data survives task team deallocation */ + kmp_int32 tt_found_tasks; /* Have we found tasks and queued them while + executing this team? 
*/ + /* TRUE means tt_threads_data is set up and initialized */ + kmp_int32 tt_nproc; /* #threads in team */ + kmp_int32 tt_max_threads; // # entries allocated for threads_data array + kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier + kmp_int32 tt_untied_task_encountered; + std::atomic tt_num_task_pri; // number of priority tasks enqueued + // There is hidden helper thread encountered in this task team so that we must + // wait when waiting on task team + kmp_int32 tt_hidden_helper_task_encountered; + + KMP_ALIGN_CACHE + std::atomic tt_unfinished_threads; /* #threads still active */ + + KMP_ALIGN_CACHE + volatile kmp_uint32 + tt_active; /* is the team still actively executing tasks */ +} kmp_base_task_team_t; + +union KMP_ALIGN_CACHE kmp_task_team { + kmp_base_task_team_t tt; + double tt_align; /* use worst case alignment */ + char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)]; +}; + +#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5) +// Free lists keep same-size free memory slots for fast memory allocation +// routines +typedef struct kmp_free_list { + void *th_free_list_self; // Self-allocated tasks free list + void *th_free_list_sync; // Self-allocated tasks stolen/returned by other + // threads + void *th_free_list_other; // Non-self free list (to be returned to owner's + // sync list) +} kmp_free_list_t; +#endif +#if KMP_NESTED_HOT_TEAMS +// Hot teams array keeps hot teams and their sizes for given thread. Hot teams +// are not put in teams pool, and they don't put threads in threads pool. +typedef struct kmp_hot_team_ptr { + kmp_team_p *hot_team; // pointer to hot_team of given nesting level + kmp_int32 hot_team_nth; // number of threads allocated for the hot_team +} kmp_hot_team_ptr_t; +#endif +typedef struct kmp_teams_size { + kmp_int32 nteams; // number of teams in a league + kmp_int32 nth; // number of threads in each team of the league +} kmp_teams_size_t; + +// This struct stores a thread that acts as a "root" for a contention +// group. Contention groups are rooted at kmp_root threads, but also at +// each primary thread of each team created in the teams construct. +// This struct therefore also stores a thread_limit associated with +// that contention group, and a counter to track the number of threads +// active in that contention group. Each thread has a list of these: CG +// root threads have an entry in their list in which cg_root refers to +// the thread itself, whereas other workers in the CG will have a +// single entry where cg_root is same as the entry containing their CG +// root. When a thread encounters a teams construct, it will add a new +// entry to the front of its list, because it now roots a new CG. +typedef struct kmp_cg_root { + kmp_info_p *cg_root; // "root" thread for a contention group + // The CG root's limit comes from OMP_THREAD_LIMIT for root threads, or + // thread_limit clause for teams primary threads + kmp_int32 cg_thread_limit; + kmp_int32 cg_nthreads; // Count of active threads in CG rooted at cg_root + struct kmp_cg_root *up; // pointer to higher level CG root in list +} kmp_cg_root_t; + +// OpenMP thread data structures + +typedef struct KMP_ALIGN_CACHE kmp_base_info { + /* Start with the readonly data which is cache aligned and padded. This is + written before the thread starts working by the primary thread. Uber + masters may update themselves later. Usage does not consider serialized + regions. 
*/ + kmp_desc_t th_info; + kmp_team_p *th_team; /* team we belong to */ + kmp_root_p *th_root; /* pointer to root of task hierarchy */ + kmp_info_p *th_next_pool; /* next available thread in the pool */ + kmp_disp_t *th_dispatch; /* thread's dispatch data */ + int th_in_pool; /* in thread pool (32 bits for TCR/TCW) */ + + /* The following are cached from the team info structure */ + /* TODO use these in more places as determined to be needed via profiling */ + int th_team_nproc; /* number of threads in a team */ + kmp_info_p *th_team_master; /* the team's primary thread */ + int th_team_serialized; /* team is serialized */ + microtask_t th_teams_microtask; /* save entry address for teams construct */ + int th_teams_level; /* save initial level of teams construct */ +/* it is 0 on device but may be any on host */ + +/* The blocktime info is copied from the team struct to the thread struct */ +/* at the start of a barrier, and the values stored in the team are used */ +/* at points in the code where the team struct is no longer guaranteed */ +/* to exist (from the POV of worker threads). */ +#if KMP_USE_MONITOR + int th_team_bt_intervals; + int th_team_bt_set; +#else + kmp_uint64 th_team_bt_intervals; +#endif + +#if KMP_AFFINITY_SUPPORTED + kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */ + kmp_affinity_ids_t th_topology_ids; /* thread's current topology ids */ + kmp_affinity_attrs_t th_topology_attrs; /* thread's current topology attrs */ +#endif + omp_allocator_handle_t th_def_allocator; /* default allocator */ + /* The data set by the primary thread at reinit, then R/W by the worker */ + KMP_ALIGN_CACHE int + th_set_nproc; /* if > 0, then only use this request for the next fork */ +#if KMP_NESTED_HOT_TEAMS + kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */ +#endif + kmp_proc_bind_t + th_set_proc_bind; /* if != proc_bind_default, use request for next fork */ + kmp_teams_size_t + th_teams_size; /* number of teams/threads in teams construct */ +#if KMP_AFFINITY_SUPPORTED + int th_current_place; /* place currently bound to */ + int th_new_place; /* place to bind to in par reg */ + int th_first_place; /* first place in partition */ + int th_last_place; /* last place in partition */ +#endif + int th_prev_level; /* previous level for affinity format */ + int th_prev_num_threads; /* previous num_threads for affinity format */ +#if USE_ITT_BUILD + kmp_uint64 th_bar_arrive_time; /* arrival to barrier timestamp */ + kmp_uint64 th_bar_min_time; /* minimum arrival time at the barrier */ + kmp_uint64 th_frame_time; /* frame timestamp */ +#endif /* USE_ITT_BUILD */ + kmp_local_t th_local; + struct private_common *th_pri_head; + + /* Now the data only used by the worker (after initial allocation) */ + /* TODO the first serial team should actually be stored in the info_t + structure. 
this will help reduce initial allocation overhead */ + KMP_ALIGN_CACHE kmp_team_p + *th_serial_team; /*serialized team held in reserve*/ + +#if OMPT_SUPPORT + ompt_thread_info_t ompt_thread_info; +#endif + + /* The following are also read by the primary thread during reinit */ + struct common_table *th_pri_common; + + volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */ + /* while awaiting queuing lock acquire */ + + volatile void *th_sleep_loc; // this points at a kmp_flag + flag_type th_sleep_loc_type; // enum type of flag stored in th_sleep_loc + + ident_t *th_ident; + unsigned th_x; // Random number generator data + unsigned th_a; // Random number generator data + + /* Tasking-related data for the thread */ + kmp_task_team_t *th_task_team; // Task team struct + kmp_taskdata_t *th_current_task; // Innermost Task being executed + kmp_uint8 th_task_state; // alternating 0/1 for task team identification + kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state + // at nested levels + kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack + kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack + kmp_uint32 th_reap_state; // Non-zero indicates thread is not + // tasking, thus safe to reap + + /* More stuff for keeping track of active/sleeping threads (this part is + written by the worker thread) */ + kmp_uint8 th_active_in_pool; // included in count of #active threads in pool + int th_active; // ! sleeping; 32 bits for TCR/TCW + std::atomic th_used_in_team; // Flag indicating use in team + // 0 = not used in team; 1 = used in team; + // 2 = transitioning to not used in team; 3 = transitioning to used in team + struct cons_header *th_cons; // used for consistency check +#if KMP_USE_HIER_SCHED + // used for hierarchical scheduling + kmp_hier_private_bdata_t *th_hier_bar_data; +#endif + + /* Add the syncronizing data which is cache aligned and padded. */ + KMP_ALIGN_CACHE kmp_balign_t th_bar[bs_last_barrier]; + + KMP_ALIGN_CACHE volatile kmp_int32 + th_next_waiting; /* gtid+1 of next thread on lock wait queue, 0 if none */ + +#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5) +#define NUM_LISTS 4 + kmp_free_list_t th_free_lists[NUM_LISTS]; // Free lists for fast memory +// allocation routines +#endif + +#if KMP_OS_WINDOWS + kmp_win32_cond_t th_suspend_cv; + kmp_win32_mutex_t th_suspend_mx; + std::atomic th_suspend_init; +#endif +#if KMP_OS_UNIX + kmp_cond_align_t th_suspend_cv; + kmp_mutex_align_t th_suspend_mx; + std::atomic th_suspend_init_count; +#endif + +#if USE_ITT_BUILD + kmp_itt_mark_t th_itt_mark_single; +// alignment ??? 
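
Aside: th_used_in_team above is documented as a four-state flag (0 and 1 are the stable states, 2 and 3 are transitional). The sketch below only illustrates how such an encoding is typically driven with an atomic compare-exchange; it is not the runtime's actual transition code, which lives in the kmp_*.cpp sources, and demo_join_team() is a made-up name.

#include <atomic>

enum { NOT_IN_TEAM = 0, IN_TEAM = 1, LEAVING_TEAM = 2, JOINING_TEAM = 3 };

// Move a worker into a team: 0 -> 3 (transitioning to used), do the setup,
// then publish 1, so observers never see a half-initialized "in team" state.
static bool demo_join_team(std::atomic<int> &used_in_team) {
  int expected = NOT_IN_TEAM;
  if (!used_in_team.compare_exchange_strong(expected, JOINING_TEAM))
    return false; // another thread is already transitioning this worker
  /* ... per-thread team setup would happen here ... */
  used_in_team.store(IN_TEAM, std::memory_order_release);
  return true;
}
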
+#endif /* USE_ITT_BUILD */ +#if KMP_STATS_ENABLED + kmp_stats_list *th_stats; +#endif +#if KMP_OS_UNIX + std::atomic th_blocking; +#endif + kmp_cg_root_t *th_cg_roots; // list of cg_roots associated with this thread +} kmp_base_info_t; + +typedef union KMP_ALIGN_CACHE kmp_info { + double th_align; /* use worst case alignment */ + char th_pad[KMP_PAD(kmp_base_info_t, CACHE_LINE)]; + kmp_base_info_t th; +} kmp_info_t; + +// OpenMP thread team data structures + +typedef struct kmp_base_data { + volatile kmp_uint32 t_value; +} kmp_base_data_t; + +typedef union KMP_ALIGN_CACHE kmp_sleep_team { + double dt_align; /* use worst case alignment */ + char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)]; + kmp_base_data_t dt; +} kmp_sleep_team_t; + +typedef union KMP_ALIGN_CACHE kmp_ordered_team { + double dt_align; /* use worst case alignment */ + char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)]; + kmp_base_data_t dt; +} kmp_ordered_team_t; + +typedef int (*launch_t)(int gtid); + +/* Minimum number of ARGV entries to malloc if necessary */ +#define KMP_MIN_MALLOC_ARGV_ENTRIES 100 + +// Set up how many argv pointers will fit in cache lines containing +// t_inline_argv. Historically, we have supported at least 96 bytes. Using a +// larger value for more space between the primary write/worker read section and +// read/write by all section seems to buy more performance on EPCC PARALLEL. +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#define KMP_INLINE_ARGV_BYTES \ + (4 * CACHE_LINE - \ + ((3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + \ + sizeof(kmp_int16) + sizeof(kmp_uint32)) % \ + CACHE_LINE)) +#else +#define KMP_INLINE_ARGV_BYTES \ + (2 * CACHE_LINE - ((3 * KMP_PTR_SKIP + 2 * sizeof(int)) % CACHE_LINE)) +#endif +#define KMP_INLINE_ARGV_ENTRIES (int)(KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP) + +typedef struct KMP_ALIGN_CACHE kmp_base_team { + // Synchronization Data + // --------------------------------------------------------------------------- + KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered; + kmp_balign_team_t t_bar[bs_last_barrier]; + std::atomic t_construct; // count of single directive encountered by team + char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron + + // [0] - parallel / [1] - worksharing task reduction data shared by taskgroups + std::atomic t_tg_reduce_data[2]; // to support task modifier + std::atomic t_tg_fini_counter[2]; // sync end of task reductions + + // Primary thread only + // --------------------------------------------------------------------------- + KMP_ALIGN_CACHE int t_master_tid; // tid of primary thread in parent team + int t_master_this_cons; // "this_construct" single counter of primary thread + // in parent team + ident_t *t_ident; // if volatile, have to change too much other crud to + // volatile too + kmp_team_p *t_parent; // parent team + kmp_team_p *t_next_pool; // next free team in the team pool + kmp_disp_t *t_dispatch; // thread's dispatch data + kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2 + kmp_proc_bind_t t_proc_bind; // bind type for par region +#if USE_ITT_BUILD + kmp_uint64 t_region_time; // region begin timestamp +#endif /* USE_ITT_BUILD */ + + // Primary thread write, workers read + // -------------------------------------------------------------------------- + KMP_ALIGN_CACHE void **t_argv; + int t_argc; + int t_nproc; // number of threads in team + microtask_t t_pkfn; + launch_t t_invoke; // procedure to launch the microtask + +#if OMPT_SUPPORT + ompt_team_info_t ompt_team_info; + 
ompt_lw_taskteam_t *ompt_serialized_team_info; +#endif + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + kmp_int8 t_fp_control_saved; + kmp_int8 t_pad2b; + kmp_int16 t_x87_fpu_control_word; // FP control regs + kmp_uint32 t_mxcsr; +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + + void *t_inline_argv[KMP_INLINE_ARGV_ENTRIES]; + + KMP_ALIGN_CACHE kmp_info_t **t_threads; + kmp_taskdata_t + *t_implicit_task_taskdata; // Taskdata for the thread's implicit task + int t_level; // nested parallel level + + KMP_ALIGN_CACHE int t_max_argc; + int t_max_nproc; // max threads this team can handle (dynamically expandable) + int t_serialized; // levels deep of serialized teams + dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system + int t_id; // team's id, assigned by debugger. + int t_active_level; // nested active parallel level + kmp_r_sched_t t_sched; // run-time schedule for the team +#if KMP_AFFINITY_SUPPORTED + int t_first_place; // first & last place in parent thread's partition. + int t_last_place; // Restore these values to primary thread after par region. +#endif // KMP_AFFINITY_SUPPORTED + int t_display_affinity; + int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via + // omp_set_num_threads() call + omp_allocator_handle_t t_def_allocator; /* default allocator */ + +// Read/write by workers as well +#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) + // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf + // regression of epcc 'parallel' and 'barrier' on fxe256lin01. This extra + // padding serves to fix the performance of epcc 'parallel' and 'barrier' when + // CACHE_LINE=64. TODO: investigate more and get rid if this padding. + char dummy_padding[1024]; +#endif + // Internal control stack for additional nested teams. 
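
Aside: a worked instance of the KMP_INLINE_ARGV_BYTES / KMP_INLINE_ARGV_ENTRIES arithmetic defined a few lines above (used for t_inline_argv in this struct), assuming the typical x86-64 values CACHE_LINE == 64 and KMP_PTR_SKIP == sizeof(void *) == 8. The k* constants are illustrative stand-ins, not runtime symbols.

#include <cstdint>

constexpr int kCacheLine = 64; // stand-in for CACHE_LINE
constexpr int kPtrSkip = 8;    // stand-in for KMP_PTR_SKIP
// Tail occupied by the neighboring scalar members, modulo one cache line:
// 3*8 + 2*4 + 2*1 + 2 + 4 = 40 bytes.
constexpr int kTail = (3 * kPtrSkip + 2 * sizeof(int) + 2 * sizeof(int8_t) +
                       sizeof(int16_t) + sizeof(uint32_t)) %
                      kCacheLine;
constexpr int kInlineArgvBytes = 4 * kCacheLine - kTail;        // 256 - 40 = 216
constexpr int kInlineArgvEntries = kInlineArgvBytes / kPtrSkip; // 27 pointers

static_assert(kInlineArgvBytes >= 96, "keeps the historical 96-byte minimum");
static_assert(kInlineArgvEntries == 27, "27 inline argv slots under these assumptions");
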
+ KMP_ALIGN_CACHE kmp_internal_control_t *t_control_stack_top; + // for SERIALIZED teams nested 2 or more levels deep + // typed flag to store request state of cancellation + std::atomic t_cancel_request; + int t_master_active; // save on fork, restore on join + void *t_copypriv_data; // team specific pointer to copyprivate data array +#if KMP_OS_WINDOWS + std::atomic t_copyin_counter; +#endif +#if USE_ITT_BUILD + void *t_stack_id; // team specific stack stitching id (for ittnotify) +#endif /* USE_ITT_BUILD */ + distributedBarrier *b; // Distributed barrier data associated with team +} kmp_base_team_t; + +union KMP_ALIGN_CACHE kmp_team { + kmp_base_team_t t; + double t_align; /* use worst case alignment */ + char t_pad[KMP_PAD(kmp_base_team_t, CACHE_LINE)]; +}; + +typedef union KMP_ALIGN_CACHE kmp_time_global { + double dt_align; /* use worst case alignment */ + char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)]; + kmp_base_data_t dt; +} kmp_time_global_t; + +typedef struct kmp_base_global { + /* cache-aligned */ + kmp_time_global_t g_time; + + /* non cache-aligned */ + volatile int g_abort; + volatile int g_done; + + int g_dynamic; + enum dynamic_mode g_dynamic_mode; +} kmp_base_global_t; + +typedef union KMP_ALIGN_CACHE kmp_global { + kmp_base_global_t g; + double g_align; /* use worst case alignment */ + char g_pad[KMP_PAD(kmp_base_global_t, CACHE_LINE)]; +} kmp_global_t; + +typedef struct kmp_base_root { + // TODO: GEH - combine r_active with r_in_parallel then r_active == + // (r_in_parallel>= 0) + // TODO: GEH - then replace r_active with t_active_levels if we can to reduce + // the synch overhead or keeping r_active + volatile int r_active; /* TRUE if some region in a nest has > 1 thread */ + // keeps a count of active parallel regions per root + std::atomic r_in_parallel; + // GEH: This is misnamed, should be r_active_levels + kmp_team_t *r_root_team; + kmp_team_t *r_hot_team; + kmp_info_t *r_uber_thread; + kmp_lock_t r_begin_lock; + volatile int r_begin; + int r_blocktime; /* blocktime for this root and descendants */ +#if KMP_AFFINITY_SUPPORTED + int r_affinity_assigned; +#endif // KMP_AFFINITY_SUPPORTED +} kmp_base_root_t; + +typedef union KMP_ALIGN_CACHE kmp_root { + kmp_base_root_t r; + double r_align; /* use worst case alignment */ + char r_pad[KMP_PAD(kmp_base_root_t, CACHE_LINE)]; +} kmp_root_t; + +struct fortran_inx_info { + kmp_int32 data; +}; + +// This list type exists to hold old __kmp_threads arrays so that +// old references to them may complete while reallocation takes place when +// expanding the array. The items in this list are kept alive until library +// shutdown. 
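
Aside: the comment above describes why kmp_old_threads_list_t (declared next) exists: when __kmp_threads is reallocated to a larger array, the old array is retired onto this list instead of being freed, so readers still holding the old pointer stay valid until library shutdown. A minimal sketch of that retire-instead-of-free pattern, with hypothetical demo_* names and the locking omitted:

#include <cstdlib>
#include <cstring>

struct demo_old_list {
  void **threads;
  demo_old_list *next;
};

// Grow a pointer table from old_cap to new_cap entries. The old table is kept
// on *retired (freed only at shutdown) so concurrent readers remain valid.
static void demo_expand(void ***table, int old_cap, int new_cap,
                        demo_old_list **retired) {
  void **bigger = static_cast<void **>(std::calloc(new_cap, sizeof(void *)));
  std::memcpy(bigger, *table, old_cap * sizeof(void *));
  demo_old_list *node =
      static_cast<demo_old_list *>(std::malloc(sizeof(demo_old_list)));
  node->threads = *table; // retire, don't free
  node->next = *retired;
  *retired = node;
  *table = bigger; // the real code publishes this under the fork/join lock
}
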
+typedef struct kmp_old_threads_list_t { + kmp_info_t **threads; + struct kmp_old_threads_list_t *next; +} kmp_old_threads_list_t; + +/* ------------------------------------------------------------------------ */ + +extern int __kmp_settings; +extern int __kmp_duplicate_library_ok; +#if USE_ITT_BUILD +extern int __kmp_forkjoin_frames; +extern int __kmp_forkjoin_frames_mode; +#endif +extern PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method; +extern int __kmp_determ_red; + +#ifdef KMP_DEBUG +extern int kmp_a_debug; +extern int kmp_b_debug; +extern int kmp_c_debug; +extern int kmp_d_debug; +extern int kmp_e_debug; +extern int kmp_f_debug; +#endif /* KMP_DEBUG */ + +/* For debug information logging using rotating buffer */ +#define KMP_DEBUG_BUF_LINES_INIT 512 +#define KMP_DEBUG_BUF_LINES_MIN 1 + +#define KMP_DEBUG_BUF_CHARS_INIT 128 +#define KMP_DEBUG_BUF_CHARS_MIN 2 + +extern int + __kmp_debug_buf; /* TRUE means use buffer, FALSE means print to stderr */ +extern int __kmp_debug_buf_lines; /* How many lines of debug stored in buffer */ +extern int + __kmp_debug_buf_chars; /* How many characters allowed per line in buffer */ +extern int __kmp_debug_buf_atomic; /* TRUE means use atomic update of buffer + entry pointer */ + +extern char *__kmp_debug_buffer; /* Debug buffer itself */ +extern std::atomic __kmp_debug_count; /* Counter for number of lines + printed in buffer so far */ +extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase + recommended in warnings */ +/* end rotating debug buffer */ + +#ifdef KMP_DEBUG +extern int __kmp_par_range; /* +1 => only go par for constructs in range */ + +#define KMP_PAR_RANGE_ROUTINE_LEN 1024 +extern char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN]; +#define KMP_PAR_RANGE_FILENAME_LEN 1024 +extern char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN]; +extern int __kmp_par_range_lb; +extern int __kmp_par_range_ub; +#endif + +/* For printing out dynamic storage map for threads and teams */ +extern int + __kmp_storage_map; /* True means print storage map for threads and teams */ +extern int __kmp_storage_map_verbose; /* True means storage map includes + placement info */ +extern int __kmp_storage_map_verbose_specified; + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +extern kmp_cpuinfo_t __kmp_cpuinfo; +static inline bool __kmp_is_hybrid_cpu() { return __kmp_cpuinfo.flags.hybrid; } +#elif KMP_OS_DARWIN && KMP_ARCH_AARCH64 +static inline bool __kmp_is_hybrid_cpu() { return true; } +#else +static inline bool __kmp_is_hybrid_cpu() { return false; } +#endif + +extern volatile int __kmp_init_serial; +extern volatile int __kmp_init_gtid; +extern volatile int __kmp_init_common; +extern volatile int __kmp_need_register_serial; +extern volatile int __kmp_init_middle; +extern volatile int __kmp_init_parallel; +#if KMP_USE_MONITOR +extern volatile int __kmp_init_monitor; +#endif +extern volatile int __kmp_init_user_locks; +extern volatile int __kmp_init_hidden_helper_threads; +extern int __kmp_init_counter; +extern int __kmp_root_counter; +extern int __kmp_version; + +/* list of address of allocated caches for commons */ +extern kmp_cached_addr_t *__kmp_threadpriv_cache_list; + +/* Barrier algorithm types and options */ +extern kmp_uint32 __kmp_barrier_gather_bb_dflt; +extern kmp_uint32 __kmp_barrier_release_bb_dflt; +extern kmp_bar_pat_e __kmp_barrier_gather_pat_dflt; +extern kmp_bar_pat_e __kmp_barrier_release_pat_dflt; +extern kmp_uint32 __kmp_barrier_gather_branch_bits[bs_last_barrier]; +extern kmp_uint32 
__kmp_barrier_release_branch_bits[bs_last_barrier]; +extern kmp_bar_pat_e __kmp_barrier_gather_pattern[bs_last_barrier]; +extern kmp_bar_pat_e __kmp_barrier_release_pattern[bs_last_barrier]; +extern char const *__kmp_barrier_branch_bit_env_name[bs_last_barrier]; +extern char const *__kmp_barrier_pattern_env_name[bs_last_barrier]; +extern char const *__kmp_barrier_type_name[bs_last_barrier]; +extern char const *__kmp_barrier_pattern_name[bp_last_bar]; + +/* Global Locks */ +extern kmp_bootstrap_lock_t __kmp_initz_lock; /* control initialization */ +extern kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */ +extern kmp_bootstrap_lock_t __kmp_task_team_lock; +extern kmp_bootstrap_lock_t + __kmp_exit_lock; /* exit() is not always thread-safe */ +#if KMP_USE_MONITOR +extern kmp_bootstrap_lock_t + __kmp_monitor_lock; /* control monitor thread creation */ +#endif +extern kmp_bootstrap_lock_t + __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and + __kmp_threads expansion to co-exist */ + +extern kmp_lock_t __kmp_global_lock; /* control OS/global access */ +extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access */ +extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */ + +extern enum library_type __kmp_library; + +extern enum sched_type __kmp_sched; /* default runtime scheduling */ +extern enum sched_type __kmp_static; /* default static scheduling method */ +extern enum sched_type __kmp_guided; /* default guided scheduling method */ +extern enum sched_type __kmp_auto; /* default auto scheduling method */ +extern int __kmp_chunk; /* default runtime chunk size */ +extern int __kmp_force_monotonic; /* whether monotonic scheduling forced */ + +extern size_t __kmp_stksize; /* stack size per thread */ +#if KMP_USE_MONITOR +extern size_t __kmp_monitor_stksize; /* stack size for monitor thread */ +#endif +extern size_t __kmp_stkoffset; /* stack offset per thread */ +extern int __kmp_stkpadding; /* Should we pad root thread(s) stack */ + +extern size_t + __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */ +extern int __kmp_env_stksize; /* was KMP_STACKSIZE specified? */ +extern int __kmp_env_blocktime; /* was KMP_BLOCKTIME specified? */ +extern int __kmp_env_checks; /* was KMP_CHECKS specified? */ +extern int __kmp_env_consistency_check; // was KMP_CONSISTENCY_CHECK specified? +extern int __kmp_generate_warnings; /* should we issue warnings? */ +extern int __kmp_reserve_warn; /* have we issued reserve_threads warning? 
*/ + +#ifdef DEBUG_SUSPEND +extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */ +#endif + +extern kmp_int32 __kmp_use_yield; +extern kmp_int32 __kmp_use_yield_exp_set; +extern kmp_uint32 __kmp_yield_init; +extern kmp_uint32 __kmp_yield_next; +extern kmp_uint64 __kmp_pause_init; + +/* ------------------------------------------------------------------------- */ +extern int __kmp_allThreadsSpecified; + +extern size_t __kmp_align_alloc; +/* following data protected by initialization routines */ +extern int __kmp_xproc; /* number of processors in the system */ +extern int __kmp_avail_proc; /* number of processors available to the process */ +extern size_t __kmp_sys_min_stksize; /* system-defined minimum stack size */ +extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */ +// maximum total number of concurrently-existing threads on device +extern int __kmp_max_nth; +// maximum total number of concurrently-existing threads in a contention group +extern int __kmp_cg_max_nth; +extern int __kmp_task_max_nth; // max threads used in a task +extern int __kmp_teams_max_nth; // max threads used in a teams construct +extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and + __kmp_root */ +extern int __kmp_dflt_team_nth; /* default number of threads in a parallel + region a la OMP_NUM_THREADS */ +extern int __kmp_dflt_team_nth_ub; /* upper bound on "" determined at serial + initialization */ +extern int __kmp_tp_capacity; /* capacity of __kmp_threads if threadprivate is + used (fixed) */ +extern int __kmp_tp_cached; /* whether threadprivate cache has been created + (__kmpc_threadprivate_cached()) */ +extern int __kmp_dflt_blocktime; /* number of microseconds to wait before + blocking (env setting) */ +extern char __kmp_blocktime_units; /* 'm' or 'u' to note units specified */ +extern bool __kmp_wpolicy_passive; /* explicitly set passive wait policy */ + +// Convert raw blocktime from ms to us if needed. +static inline void __kmp_aux_convert_blocktime(int *bt) { + if (__kmp_blocktime_units == 'm') { + if (*bt > INT_MAX / 1000) { + *bt = INT_MAX / 1000; + KMP_INFORM(MaxValueUsing, "kmp_set_blocktime(ms)", bt); + } + *bt = *bt * 1000; + } +} + +#if KMP_USE_MONITOR +extern int + __kmp_monitor_wakeups; /* number of times monitor wakes up per second */ +extern int __kmp_bt_intervals; /* number of monitor timestamp intervals before + blocking */ +#endif +#ifdef KMP_ADJUST_BLOCKTIME +extern int __kmp_zero_bt; /* whether blocktime has been forced to zero */ +#endif /* KMP_ADJUST_BLOCKTIME */ +#ifdef KMP_DFLT_NTH_CORES +extern int __kmp_ncores; /* Total number of cores for threads placement */ +#endif +/* Number of millisecs to delay on abort for Intel(R) VTune(TM) tools */ +extern int __kmp_abort_delay; + +extern int __kmp_need_register_atfork_specified; +extern int __kmp_need_register_atfork; /* At initialization, call pthread_atfork + to install fork handler */ +extern int __kmp_gtid_mode; /* Method of getting gtid, values: + 0 - not set, will be set at runtime + 1 - using stack search + 2 - dynamic TLS (pthread_getspecific(Linux* OS/OS + X*) or TlsGetValue(Windows* OS)) + 3 - static TLS (__declspec(thread) __kmp_gtid), + Linux* OS .so only. 
*/ +extern int + __kmp_adjust_gtid_mode; /* If true, adjust method based on #threads */ +#ifdef KMP_TDATA_GTID +extern KMP_THREAD_LOCAL int __kmp_gtid; +#endif +extern int __kmp_tls_gtid_min; /* #threads below which use sp search for gtid */ +extern int __kmp_foreign_tp; // If true, separate TP var for each foreign thread +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +extern int __kmp_inherit_fp_control; // copy fp creg(s) parent->workers at fork +extern kmp_int16 __kmp_init_x87_fpu_control_word; // init thread's FP ctrl reg +extern kmp_uint32 __kmp_init_mxcsr; /* init thread's mxscr */ +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +// max_active_levels for nested parallelism enabled by default via +// OMP_MAX_ACTIVE_LEVELS, OMP_NESTED, OMP_NUM_THREADS, and OMP_PROC_BIND +extern int __kmp_dflt_max_active_levels; +// Indicates whether value of __kmp_dflt_max_active_levels was already +// explicitly set by OMP_MAX_ACTIVE_LEVELS or OMP_NESTED=false +extern bool __kmp_dflt_max_active_levels_set; +extern int __kmp_dispatch_num_buffers; /* max possible dynamic loops in + concurrent execution per team */ +#if KMP_NESTED_HOT_TEAMS +extern int __kmp_hot_teams_mode; +extern int __kmp_hot_teams_max_level; +#endif + +#if KMP_OS_LINUX +extern enum clock_function_type __kmp_clock_function; +extern int __kmp_clock_function_param; +#endif /* KMP_OS_LINUX */ + +#if KMP_MIC_SUPPORTED +extern enum mic_type __kmp_mic_type; +#endif + +#ifdef USE_LOAD_BALANCE +extern double __kmp_load_balance_interval; // load balance algorithm interval +#endif /* USE_LOAD_BALANCE */ + +// OpenMP 3.1 - Nested num threads array +typedef struct kmp_nested_nthreads_t { + int *nth; + int size; + int used; +} kmp_nested_nthreads_t; + +extern kmp_nested_nthreads_t __kmp_nested_nth; + +#if KMP_USE_ADAPTIVE_LOCKS + +// Parameters for the speculative lock backoff system. +struct kmp_adaptive_backoff_params_t { + // Number of soft retries before it counts as a hard retry. + kmp_uint32 max_soft_retries; + // Badness is a bit mask : 0,1,3,7,15,... 
on each hard failure we move one to + // the right + kmp_uint32 max_badness; +}; + +extern kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params; + +#if KMP_DEBUG_ADAPTIVE_LOCKS +extern const char *__kmp_speculative_statsfile; +#endif + +#endif // KMP_USE_ADAPTIVE_LOCKS + +extern int __kmp_display_env; /* TRUE or FALSE */ +extern int __kmp_display_env_verbose; /* TRUE if OMP_DISPLAY_ENV=VERBOSE */ +extern int __kmp_omp_cancellation; /* TRUE or FALSE */ +extern int __kmp_nteams; +extern int __kmp_teams_thread_limit; + +/* ------------------------------------------------------------------------- */ + +/* the following are protected by the fork/join lock */ +/* write: lock read: anytime */ +extern kmp_info_t **__kmp_threads; /* Descriptors for the threads */ +/* Holds old arrays of __kmp_threads until library shutdown */ +extern kmp_old_threads_list_t *__kmp_old_threads_list; +/* read/write: lock */ +extern volatile kmp_team_t *__kmp_team_pool; +extern volatile kmp_info_t *__kmp_thread_pool; +extern kmp_info_t *__kmp_thread_pool_insert_pt; + +// total num threads reachable from some root thread including all root threads +extern volatile int __kmp_nth; +/* total number of threads reachable from some root thread including all root + threads, and those in the thread pool */ +extern volatile int __kmp_all_nth; +extern std::atomic __kmp_thread_pool_active_nth; + +extern kmp_root_t **__kmp_root; /* root of thread hierarchy */ +/* end data protected by fork/join lock */ +/* ------------------------------------------------------------------------- */ + +#define __kmp_get_gtid() __kmp_get_global_thread_id() +#define __kmp_entry_gtid() __kmp_get_global_thread_id_reg() +#define __kmp_get_tid() (__kmp_tid_from_gtid(__kmp_get_gtid())) +#define __kmp_get_team() (__kmp_threads[(__kmp_get_gtid())]->th.th_team) +#define __kmp_get_thread() (__kmp_thread_from_gtid(__kmp_get_gtid())) + +// AT: Which way is correct? +// AT: 1. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc; +// AT: 2. 
nproc = __kmp_threads[ ( gtid ) ] -> th.th_team_nproc; +#define __kmp_get_team_num_threads(gtid) \ + (__kmp_threads[(gtid)]->th.th_team->t.t_nproc) + +static inline bool KMP_UBER_GTID(int gtid) { + KMP_DEBUG_ASSERT(gtid >= KMP_GTID_MIN); + KMP_DEBUG_ASSERT(gtid < __kmp_threads_capacity); + return (gtid >= 0 && __kmp_root[gtid] && __kmp_threads[gtid] && + __kmp_threads[gtid] == __kmp_root[gtid]->r.r_uber_thread); +} + +static inline int __kmp_tid_from_gtid(int gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + return __kmp_threads[gtid]->th.th_info.ds.ds_tid; +} + +static inline int __kmp_gtid_from_tid(int tid, const kmp_team_t *team) { + KMP_DEBUG_ASSERT(tid >= 0 && team); + return team->t.t_threads[tid]->th.th_info.ds.ds_gtid; +} + +static inline int __kmp_gtid_from_thread(const kmp_info_t *thr) { + KMP_DEBUG_ASSERT(thr); + return thr->th.th_info.ds.ds_gtid; +} + +static inline kmp_info_t *__kmp_thread_from_gtid(int gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + return __kmp_threads[gtid]; +} + +static inline kmp_team_t *__kmp_team_from_gtid(int gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + return __kmp_threads[gtid]->th.th_team; +} + +static inline void __kmp_assert_valid_gtid(kmp_int32 gtid) { + if (UNLIKELY(gtid < 0 || gtid >= __kmp_threads_capacity)) + KMP_FATAL(ThreadIdentInvalid); +} + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +extern int __kmp_user_level_mwait; // TRUE or FALSE; from KMP_USER_LEVEL_MWAIT +extern int __kmp_umwait_enabled; // Runtime check if user-level mwait enabled +extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled +extern int __kmp_mwait_hints; // Hints to pass in to mwait +#endif + +#if KMP_HAVE_UMWAIT +extern int __kmp_waitpkg_enabled; // Runtime check if waitpkg exists +extern int __kmp_tpause_state; // 0 (default), 1=C0.1, 2=C0.2; from KMP_TPAUSE +extern int __kmp_tpause_hint; // 1=C0.1 (default), 0=C0.2; from KMP_TPAUSE +extern int __kmp_tpause_enabled; // 0 (default), 1 (KMP_TPAUSE is non-zero) +#endif + +/* ------------------------------------------------------------------------- */ + +extern kmp_global_t __kmp_global; /* global status */ + +extern kmp_info_t __kmp_monitor; +// For Debugging Support Library +extern std::atomic __kmp_team_counter; +// For Debugging Support Library +extern std::atomic __kmp_task_counter; + +#if USE_DEBUGGER +#define _KMP_GEN_ID(counter) \ + (__kmp_debugging ? 
KMP_ATOMIC_INC(&counter) + 1 : ~0) +#else +#define _KMP_GEN_ID(counter) (~0) +#endif /* USE_DEBUGGER */ + +#define KMP_GEN_TASK_ID() _KMP_GEN_ID(__kmp_task_counter) +#define KMP_GEN_TEAM_ID() _KMP_GEN_ID(__kmp_team_counter) + +/* ------------------------------------------------------------------------ */ + +extern void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, + size_t size, char const *format, ...); + +extern void __kmp_serial_initialize(void); +extern void __kmp_middle_initialize(void); +extern void __kmp_parallel_initialize(void); + +extern void __kmp_internal_begin(void); +extern void __kmp_internal_end_library(int gtid); +extern void __kmp_internal_end_thread(int gtid); +extern void __kmp_internal_end_atexit(void); +extern void __kmp_internal_end_dtor(void); +extern void __kmp_internal_end_dest(void *); + +extern int __kmp_register_root(int initial_thread); +extern void __kmp_unregister_root(int gtid); +extern void __kmp_unregister_library(void); // called by __kmp_internal_end() + +extern int __kmp_ignore_mppbeg(void); +extern int __kmp_ignore_mppend(void); + +extern int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws); +extern void __kmp_exit_single(int gtid); + +extern void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref); +extern void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref); + +#ifdef USE_LOAD_BALANCE +extern int __kmp_get_load_balance(int); +#endif + +extern int __kmp_get_global_thread_id(void); +extern int __kmp_get_global_thread_id_reg(void); +extern void __kmp_exit_thread(int exit_status); +extern void __kmp_abort(char const *format, ...); +extern void __kmp_abort_thread(void); +KMP_NORETURN extern void __kmp_abort_process(void); +extern void __kmp_warn(char const *format, ...); + +extern void __kmp_set_num_threads(int new_nth, int gtid); + +extern bool __kmp_detect_shm(); +extern bool __kmp_detect_tmp(); + +// Returns current thread (pointer to kmp_info_t). Current thread *must* be +// registered. 
+static inline kmp_info_t *__kmp_entry_thread() { + int gtid = __kmp_entry_gtid(); + + return __kmp_threads[gtid]; +} + +extern void __kmp_set_max_active_levels(int gtid, int new_max_active_levels); +extern int __kmp_get_max_active_levels(int gtid); +extern int __kmp_get_ancestor_thread_num(int gtid, int level); +extern int __kmp_get_team_size(int gtid, int level); +extern void __kmp_set_schedule(int gtid, kmp_sched_t new_sched, int chunk); +extern void __kmp_get_schedule(int gtid, kmp_sched_t *sched, int *chunk); + +extern unsigned short __kmp_get_random(kmp_info_t *thread); +extern void __kmp_init_random(kmp_info_t *thread); + +extern kmp_r_sched_t __kmp_get_schedule_global(void); +extern void __kmp_adjust_num_threads(int new_nproc); +extern void __kmp_check_stksize(size_t *val); + +extern void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL); +extern void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL); +extern void ___kmp_free(void *ptr KMP_SRC_LOC_DECL); +#define __kmp_allocate(size) ___kmp_allocate((size)KMP_SRC_LOC_CURR) +#define __kmp_page_allocate(size) ___kmp_page_allocate((size)KMP_SRC_LOC_CURR) +#define __kmp_free(ptr) ___kmp_free((ptr)KMP_SRC_LOC_CURR) + +#if USE_FAST_MEMORY +extern void *___kmp_fast_allocate(kmp_info_t *this_thr, + size_t size KMP_SRC_LOC_DECL); +extern void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL); +extern void __kmp_free_fast_memory(kmp_info_t *this_thr); +extern void __kmp_initialize_fast_memory(kmp_info_t *this_thr); +#define __kmp_fast_allocate(this_thr, size) \ + ___kmp_fast_allocate((this_thr), (size)KMP_SRC_LOC_CURR) +#define __kmp_fast_free(this_thr, ptr) \ + ___kmp_fast_free((this_thr), (ptr)KMP_SRC_LOC_CURR) +#endif + +extern void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL); +extern void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem, + size_t elsize KMP_SRC_LOC_DECL); +extern void *___kmp_thread_realloc(kmp_info_t *th, void *ptr, + size_t size KMP_SRC_LOC_DECL); +extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL); +#define __kmp_thread_malloc(th, size) \ + ___kmp_thread_malloc((th), (size)KMP_SRC_LOC_CURR) +#define __kmp_thread_calloc(th, nelem, elsize) \ + ___kmp_thread_calloc((th), (nelem), (elsize)KMP_SRC_LOC_CURR) +#define __kmp_thread_realloc(th, ptr, size) \ + ___kmp_thread_realloc((th), (ptr), (size)KMP_SRC_LOC_CURR) +#define __kmp_thread_free(th, ptr) \ + ___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR) + +extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads); + +extern void __kmp_push_proc_bind(ident_t *loc, int gtid, + kmp_proc_bind_t proc_bind); +extern void __kmp_push_num_teams(ident_t *loc, int gtid, int num_teams, + int num_threads); +extern void __kmp_push_num_teams_51(ident_t *loc, int gtid, int num_teams_lb, + int num_teams_ub, int num_threads); + +extern void __kmp_yield(); + +extern void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 lb, + kmp_int32 ub, kmp_int32 st, kmp_int32 chunk); +extern void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint32 lb, + kmp_uint32 ub, kmp_int32 st, + kmp_int32 chunk); +extern void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int64 lb, + kmp_int64 ub, kmp_int64 st, kmp_int64 chunk); +extern void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint64 lb, + kmp_uint64 ub, kmp_int64 st, + kmp_int64 chunk); + +extern int 
__kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, + kmp_int32 *p_last, kmp_int32 *p_lb, + kmp_int32 *p_ub, kmp_int32 *p_st); +extern int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, + kmp_int32 *p_last, kmp_uint32 *p_lb, + kmp_uint32 *p_ub, kmp_int32 *p_st); +extern int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, + kmp_int32 *p_last, kmp_int64 *p_lb, + kmp_int64 *p_ub, kmp_int64 *p_st); +extern int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, + kmp_int32 *p_last, kmp_uint64 *p_lb, + kmp_uint64 *p_ub, kmp_int64 *p_st); + +extern void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid); +extern void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid); +extern void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid); +extern void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid); + +#ifdef KMP_GOMP_COMPAT + +extern void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 lb, + kmp_int32 ub, kmp_int32 st, + kmp_int32 chunk, int push_ws); +extern void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint32 lb, + kmp_uint32 ub, kmp_int32 st, + kmp_int32 chunk, int push_ws); +extern void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int64 lb, + kmp_int64 ub, kmp_int64 st, + kmp_int64 chunk, int push_ws); +extern void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint64 lb, + kmp_uint64 ub, kmp_int64 st, + kmp_int64 chunk, int push_ws); +extern void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid); +extern void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid); +extern void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid); +extern void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid); + +#endif /* KMP_GOMP_COMPAT */ + +extern kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker); +extern kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker); +extern kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker); +extern kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker); +extern kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker); +extern kmp_uint32 __kmp_wait_4(kmp_uint32 volatile *spinner, kmp_uint32 checker, + kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), + void *obj); +extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, + kmp_uint32 (*pred)(void *, kmp_uint32), void *obj); + +extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, + int final_spin +#if USE_ITT_BUILD + , + void *itt_sync_obj +#endif +); +extern void __kmp_release_64(kmp_flag_64<> *flag); + +extern void __kmp_infinite_loop(void); + +extern void __kmp_cleanup(void); + +#if KMP_HANDLE_SIGNALS +extern int __kmp_handle_signals; +extern void __kmp_install_signals(int parallel_init); +extern void __kmp_remove_signals(void); +#endif + +extern void __kmp_clear_system_time(void); +extern void __kmp_read_system_time(double *delta); + +extern void __kmp_check_stack_overlap(kmp_info_t *thr); + +extern void __kmp_expand_host_name(char *buffer, size_t size); +extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern); + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || (KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)) +extern void +__kmp_initialize_system_tick(void); /* Initialize timer tick value */ +#endif + +extern void +__kmp_runtime_initialize(void); /* machine specific initialization */ +extern void 
__kmp_runtime_destroy(void); + +#if KMP_AFFINITY_SUPPORTED +extern char *__kmp_affinity_print_mask(char *buf, int buf_len, + kmp_affin_mask_t *mask); +extern kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, + kmp_affin_mask_t *mask); +extern void __kmp_affinity_initialize(kmp_affinity_t &affinity); +extern void __kmp_affinity_uninitialize(void); +extern void __kmp_affinity_set_init_mask( + int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */ +void __kmp_affinity_bind_init_mask(int gtid); +extern void __kmp_affinity_bind_place(int gtid); +extern void __kmp_affinity_determine_capable(const char *env_var); +extern int __kmp_aux_set_affinity(void **mask); +extern int __kmp_aux_get_affinity(void **mask); +extern int __kmp_aux_get_affinity_max_proc(); +extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask); +extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask); +extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask); +extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size); +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +extern int __kmp_get_first_osid_with_ecore(void); +#endif +#if KMP_OS_LINUX || KMP_OS_FREEBSD +extern int kmp_set_thread_affinity_mask_initial(void); +#endif +static inline void __kmp_assign_root_init_mask() { + int gtid = __kmp_entry_gtid(); + kmp_root_t *r = __kmp_threads[gtid]->th.th_root; + if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) { + __kmp_affinity_set_init_mask(gtid, /*isa_root=*/TRUE); + __kmp_affinity_bind_init_mask(gtid); + r->r.r_affinity_assigned = TRUE; + } +} +static inline void __kmp_reset_root_init_mask(int gtid) { + if (!KMP_AFFINITY_CAPABLE()) + return; + kmp_info_t *th = __kmp_threads[gtid]; + kmp_root_t *r = th->th.th_root; + if (r->r.r_uber_thread == th && r->r.r_affinity_assigned) { + __kmp_set_system_affinity(__kmp_affin_origMask, FALSE); + KMP_CPU_COPY(th->th.th_affin_mask, __kmp_affin_origMask); + r->r.r_affinity_assigned = FALSE; + } +} +#else /* KMP_AFFINITY_SUPPORTED */ +#define __kmp_assign_root_init_mask() /* Nothing */ +static inline void __kmp_reset_root_init_mask(int gtid) {} +#endif /* KMP_AFFINITY_SUPPORTED */ +// No need for KMP_AFFINITY_SUPPORTED guard as only one field in the +// format string is for affinity, so platforms that do not support +// affinity can still use the other fields, e.g., %n for num_threads +extern size_t __kmp_aux_capture_affinity(int gtid, const char *format, + kmp_str_buf_t *buffer); +extern void __kmp_aux_display_affinity(int gtid, const char *format); + +extern void __kmp_cleanup_hierarchy(); +extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar); + +#if KMP_USE_FUTEX + +extern int __kmp_futex_determine_capable(void); + +#endif // KMP_USE_FUTEX + +extern void __kmp_gtid_set_specific(int gtid); +extern int __kmp_gtid_get_specific(void); + +extern double __kmp_read_cpu_time(void); + +extern int __kmp_read_system_info(struct kmp_sys_info *info); + +#if KMP_USE_MONITOR +extern void __kmp_create_monitor(kmp_info_t *th); +#endif + +extern void *__kmp_launch_thread(kmp_info_t *thr); + +extern void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size); + +#if KMP_OS_WINDOWS +extern int __kmp_still_running(kmp_info_t *th); +extern int __kmp_is_thread_alive(kmp_info_t *th, DWORD *exit_val); +extern void __kmp_free_handle(kmp_thread_t tHandle); +#endif + +#if KMP_USE_MONITOR +extern void __kmp_reap_monitor(kmp_info_t *th); +#endif +extern void __kmp_reap_worker(kmp_info_t *th); +extern void 
__kmp_terminate_thread(int gtid); + +extern int __kmp_try_suspend_mx(kmp_info_t *th); +extern void __kmp_lock_suspend_mx(kmp_info_t *th); +extern void __kmp_unlock_suspend_mx(kmp_info_t *th); + +extern void __kmp_elapsed(double *); +extern void __kmp_elapsed_tick(double *); + +extern void __kmp_enable(int old_state); +extern void __kmp_disable(int *old_state); + +extern void __kmp_thread_sleep(int millis); + +extern void __kmp_common_initialize(void); +extern void __kmp_common_destroy(void); +extern void __kmp_common_destroy_gtid(int gtid); + +#if KMP_OS_UNIX +extern void __kmp_register_atfork(void); +#endif +extern void __kmp_suspend_initialize(void); +extern void __kmp_suspend_initialize_thread(kmp_info_t *th); +extern void __kmp_suspend_uninitialize_thread(kmp_info_t *th); + +extern kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, + int tid); +extern kmp_team_t * +__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data, +#endif + kmp_proc_bind_t proc_bind, kmp_internal_control_t *new_icvs, + int argc USE_NESTED_HOT_ARG(kmp_info_t *thr)); +extern void __kmp_free_thread(kmp_info_t *); +extern void __kmp_free_team(kmp_root_t *, + kmp_team_t *USE_NESTED_HOT_ARG(kmp_info_t *)); +extern kmp_team_t *__kmp_reap_team(kmp_team_t *); + +/* ------------------------------------------------------------------------ */ + +extern void __kmp_initialize_bget(kmp_info_t *th); +extern void __kmp_finalize_bget(kmp_info_t *th); + +KMP_EXPORT void *kmpc_malloc(size_t size); +KMP_EXPORT void *kmpc_aligned_malloc(size_t size, size_t alignment); +KMP_EXPORT void *kmpc_calloc(size_t nelem, size_t elsize); +KMP_EXPORT void *kmpc_realloc(void *ptr, size_t size); +KMP_EXPORT void kmpc_free(void *ptr); + +/* declarations for internal use */ + +extern int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, + size_t reduce_size, void *reduce_data, + void (*reduce)(void *, void *)); +extern void __kmp_end_split_barrier(enum barrier_type bt, int gtid); +extern int __kmp_barrier_gomp_cancel(int gtid); + +/*! + * Tell the fork call which compiler generated the fork call, and therefore how + * to deal with the call. + */ +enum fork_context_e { + fork_context_gnu, /**< Called from GNU generated code, so must not invoke the + microtask internally. */ + fork_context_intel, /**< Called from Intel generated code. 
*/ + fork_context_last +}; +extern int __kmp_fork_call(ident_t *loc, int gtid, + enum fork_context_e fork_context, kmp_int32 argc, + microtask_t microtask, launch_t invoker, + kmp_va_list ap); + +extern void __kmp_join_call(ident_t *loc, int gtid +#if OMPT_SUPPORT + , + enum fork_context_e fork_context +#endif + , + int exit_teams = 0); + +extern void __kmp_serialized_parallel(ident_t *id, kmp_int32 gtid); +extern void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team); +extern void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team); +extern int __kmp_invoke_task_func(int gtid); +extern void __kmp_run_before_invoked_task(int gtid, int tid, + kmp_info_t *this_thr, + kmp_team_t *team); +extern void __kmp_run_after_invoked_task(int gtid, int tid, + kmp_info_t *this_thr, + kmp_team_t *team); + +// should never have been exported +KMP_EXPORT int __kmpc_invoke_task_func(int gtid); +extern int __kmp_invoke_teams_master(int gtid); +extern void __kmp_teams_master(int gtid); +extern int __kmp_aux_get_team_num(); +extern int __kmp_aux_get_num_teams(); +extern void __kmp_save_internal_controls(kmp_info_t *thread); +extern void __kmp_user_set_library(enum library_type arg); +extern void __kmp_aux_set_library(enum library_type arg); +extern void __kmp_aux_set_stacksize(size_t arg); +extern void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid); +extern void __kmp_aux_set_defaults(char const *str, size_t len); + +/* Functions called from __kmp_aux_env_initialize() in kmp_settings.cpp */ +void kmpc_set_blocktime(int arg); +void ompc_set_nested(int flag); +void ompc_set_dynamic(int flag); +void ompc_set_num_threads(int arg); + +extern void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, + kmp_team_t *team, int tid); +extern void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr); +extern kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, + kmp_tasking_flags_t *flags, + size_t sizeof_kmp_task_t, + size_t sizeof_shareds, + kmp_routine_entry_t task_entry); +extern void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, + kmp_team_t *team, int tid, + int set_curr_task); +extern void __kmp_finish_implicit_task(kmp_info_t *this_thr); +extern void __kmp_free_implicit_task(kmp_info_t *this_thr); + +extern kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, + int gtid, + kmp_task_t *task); +extern void __kmp_fulfill_event(kmp_event_t *event); + +extern void __kmp_free_task_team(kmp_info_t *thread, + kmp_task_team_t *task_team); +extern void __kmp_reap_task_teams(void); +extern void __kmp_wait_to_unref_task_teams(void); +extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, + int always); +extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team); +extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team +#if USE_ITT_BUILD + , + void *itt_sync_obj +#endif /* USE_ITT_BUILD */ + , + int wait = 1); +extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, + int gtid); + +extern int __kmp_is_address_mapped(void *addr); +extern kmp_uint64 __kmp_hardware_timestamp(void); + +#if KMP_OS_UNIX +extern int __kmp_read_from_file(char const *path, char const *format, ...); +#endif + +/* ------------------------------------------------------------------------ */ +// +// Assembly routines that have no compiler intrinsic replacement +// + +extern int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int npr, int argc, + void *argv[] +#if OMPT_SUPPORT + , + void **exit_frame_ptr 
+#endif +); + +/* ------------------------------------------------------------------------ */ + +KMP_EXPORT void __kmpc_begin(ident_t *, kmp_int32 flags); +KMP_EXPORT void __kmpc_end(ident_t *); + +KMP_EXPORT void __kmpc_threadprivate_register_vec(ident_t *, void *data, + kmpc_ctor_vec ctor, + kmpc_cctor_vec cctor, + kmpc_dtor_vec dtor, + size_t vector_length); +KMP_EXPORT void __kmpc_threadprivate_register(ident_t *, void *data, + kmpc_ctor ctor, kmpc_cctor cctor, + kmpc_dtor dtor); +KMP_EXPORT void *__kmpc_threadprivate(ident_t *, kmp_int32 global_tid, + void *data, size_t size); + +KMP_EXPORT kmp_int32 __kmpc_global_thread_num(ident_t *); +KMP_EXPORT kmp_int32 __kmpc_global_num_threads(ident_t *); +KMP_EXPORT kmp_int32 __kmpc_bound_thread_num(ident_t *); +KMP_EXPORT kmp_int32 __kmpc_bound_num_threads(ident_t *); + +KMP_EXPORT kmp_int32 __kmpc_ok_to_fork(ident_t *); +KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs, + kmpc_micro microtask, ...); +KMP_EXPORT void __kmpc_fork_call_if(ident_t *loc, kmp_int32 nargs, + kmpc_micro microtask, kmp_int32 cond, + void *args); + +KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid); + +KMP_EXPORT void __kmpc_flush(ident_t *); +KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid); +KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid); +KMP_EXPORT kmp_int32 __kmpc_masked(ident_t *, kmp_int32 global_tid, + kmp_int32 filter); +KMP_EXPORT void __kmpc_end_masked(ident_t *, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid, + kmp_critical_name *); +KMP_EXPORT void __kmpc_end_critical(ident_t *, kmp_int32 global_tid, + kmp_critical_name *); +KMP_EXPORT void __kmpc_critical_with_hint(ident_t *, kmp_int32 global_tid, + kmp_critical_name *, uint32_t hint); + +KMP_EXPORT kmp_int32 __kmpc_barrier_master(ident_t *, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_end_barrier_master(ident_t *, kmp_int32 global_tid); + +KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *, + kmp_int32 global_tid); + +KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid); + +KMP_EXPORT kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 global_tid); +KMP_EXPORT kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 global_tid, + kmp_int32 numberOfSections); +KMP_EXPORT void __kmpc_end_sections(ident_t *loc, kmp_int32 global_tid); + +KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid, + kmp_int32 schedtype, kmp_int32 *plastiter, + kmp_int *plower, kmp_int *pupper, + kmp_int *pstride, kmp_int incr, + kmp_int chunk); + +KMP_EXPORT void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid); + +KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid, + size_t cpy_size, void *cpy_data, + void (*cpy_func)(void *, void *), + kmp_int32 didit); + +KMP_EXPORT void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, + void *cpy_data); + +extern void KMPC_SET_NUM_THREADS(int arg); +extern void KMPC_SET_DYNAMIC(int flag); +extern void KMPC_SET_NESTED(int flag); + +/* OMP 3.0 tasking interface routines */ +KMP_EXPORT kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, + 
kmp_task_t *new_task); +KMP_EXPORT kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 flags, + size_t sizeof_kmp_task_t, + size_t sizeof_shareds, + kmp_routine_entry_t task_entry); +KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc( + ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, + size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id); +KMP_EXPORT void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task); +KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task); +KMP_EXPORT kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task); +KMP_EXPORT kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid); +KMP_EXPORT kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, + int end_part); + +#if TASK_UNUSED +void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task); +void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task); +#endif // TASK_UNUSED + +/* ------------------------------------------------------------------------ */ + +KMP_EXPORT void __kmpc_taskgroup(ident_t *loc, int gtid); +KMP_EXPORT void __kmpc_end_taskgroup(ident_t *loc, int gtid); + +KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps( + ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps, + kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list); + +KMP_EXPORT kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task); + +KMP_EXPORT kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task); + +KMP_EXPORT void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 ndeps, + kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list); +/* __kmpc_omp_taskwait_deps_51 : Function for OpenMP 5.1 nowait clause. 
+ * Placeholder for taskwait with nowait clause.*/ +KMP_EXPORT void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 ndeps, + kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list, + kmp_int32 has_no_wait); + +extern kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, + bool serialize_immediate); + +KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 cncl_kind); +KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 cncl_kind); +KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t *loc_ref, kmp_int32 gtid); +KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind); + +KMP_EXPORT void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask); +KMP_EXPORT void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask); +KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, + kmp_int32 if_val, kmp_uint64 *lb, + kmp_uint64 *ub, kmp_int64 st, kmp_int32 nogroup, + kmp_int32 sched, kmp_uint64 grainsize, + void *task_dup); +KMP_EXPORT void __kmpc_taskloop_5(ident_t *loc, kmp_int32 gtid, + kmp_task_t *task, kmp_int32 if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + kmp_int32 nogroup, kmp_int32 sched, + kmp_uint64 grainsize, kmp_int32 modifier, + void *task_dup); +KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data); +KMP_EXPORT void *__kmpc_taskred_init(int gtid, int num_data, void *data); +KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d); +KMP_EXPORT void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, + int is_ws, int num, + void *data); +KMP_EXPORT void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, + int num, void *data); +KMP_EXPORT void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, + int is_ws); +KMP_EXPORT kmp_int32 __kmpc_omp_reg_task_with_affinity( + ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins, + kmp_task_affinity_info_t *affin_list); +KMP_EXPORT void __kmp_set_num_teams(int num_teams); +KMP_EXPORT int __kmp_get_max_teams(void); +KMP_EXPORT void __kmp_set_teams_thread_limit(int limit); +KMP_EXPORT int __kmp_get_teams_thread_limit(void); + +/* Interface target task integration */ +KMP_EXPORT void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid); +KMP_EXPORT bool __kmpc_omp_has_task_team(kmp_int32 gtid); + +/* Lock interface routines (fast versions with gtid passed in) */ +KMP_EXPORT void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); +KMP_EXPORT void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); +KMP_EXPORT void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); +KMP_EXPORT void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); +KMP_EXPORT void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); +KMP_EXPORT void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); +KMP_EXPORT void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); +KMP_EXPORT void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); +KMP_EXPORT int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); +KMP_EXPORT int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, + void **user_lock); + +KMP_EXPORT void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, + void **user_lock, uintptr_t hint); +KMP_EXPORT void 
__kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, + void **user_lock, + uintptr_t hint); + +#if OMPX_TASKGRAPH +// Taskgraph's Record & Replay mechanism +// __kmp_tdg_is_recording: check whether a given TDG is recording +// status: the tdg's current status +static inline bool __kmp_tdg_is_recording(kmp_tdg_status_t status) { + return status == KMP_TDG_RECORDING; +} + +KMP_EXPORT kmp_int32 __kmpc_start_record_task(ident_t *loc, kmp_int32 gtid, + kmp_int32 input_flags, + kmp_int32 tdg_id); +KMP_EXPORT void __kmpc_end_record_task(ident_t *loc, kmp_int32 gtid, + kmp_int32 input_flags, kmp_int32 tdg_id); +#endif +/* Interface to fast scalable reduce methods routines */ + +KMP_EXPORT kmp_int32 __kmpc_reduce_nowait( + ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, + void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck); +KMP_EXPORT void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *lck); +KMP_EXPORT kmp_int32 __kmpc_reduce( + ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, + void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck); +KMP_EXPORT void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *lck); + +/* Internal fast reduction routines */ + +extern PACKED_REDUCTION_METHOD_T __kmp_determine_reduction_method( + ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, + void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck); + +// this function is for testing set/get/determine reduce method +KMP_EXPORT kmp_int32 __kmp_get_reduce_method(void); + +KMP_EXPORT kmp_uint64 __kmpc_get_taskid(); +KMP_EXPORT kmp_uint64 __kmpc_get_parent_taskid(); + +// C++ port +// missing 'extern "C"' declarations + +KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc); +KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid); +KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, + kmp_int32 num_threads); + +KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, + int proc_bind); +KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, + kmp_int32 num_teams, + kmp_int32 num_threads); +KMP_EXPORT void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid, + kmp_int32 thread_limit); +/* Function for OpenMP 5.1 num_teams clause */ +KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid, + kmp_int32 num_teams_lb, + kmp_int32 num_teams_ub, + kmp_int32 num_threads); +KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, + kmpc_micro microtask, ...); +struct kmp_dim { // loop bounds info casted to kmp_int64 + kmp_int64 lo; // lower + kmp_int64 up; // upper + kmp_int64 st; // stride +}; +KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, + kmp_int32 num_dims, + const struct kmp_dim *dims); +KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, + const kmp_int64 *vec); +KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, + const kmp_int64 *vec); +KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid); + +KMP_EXPORT void *__kmpc_threadprivate_cached(ident_t *loc, kmp_int32 global_tid, + void *data, size_t size, + void ***cache); + +// The routines below are not exported. +// Consider making them 'static' in corresponding source files. 
+void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr, + void *data_addr, size_t pc_size); +struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr, + void *data_addr, + size_t pc_size); +void __kmp_threadprivate_resize_cache(int newCapacity); +void __kmp_cleanup_threadprivate_caches(); + +// ompc_, kmpc_ entries moved from omp.h. +#if KMP_OS_WINDOWS +#define KMPC_CONVENTION __cdecl +#else +#define KMPC_CONVENTION +#endif + +#ifndef __OMP_H +typedef enum omp_sched_t { + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4 +} omp_sched_t; +typedef void *kmp_affinity_mask_t; +#endif + +KMP_EXPORT void KMPC_CONVENTION ompc_set_max_active_levels(int); +KMP_EXPORT void KMPC_CONVENTION ompc_set_schedule(omp_sched_t, int); +KMP_EXPORT int KMPC_CONVENTION ompc_get_ancestor_thread_num(int); +KMP_EXPORT int KMPC_CONVENTION ompc_get_team_size(int); +KMP_EXPORT int KMPC_CONVENTION +kmpc_set_affinity_mask_proc(int, kmp_affinity_mask_t *); +KMP_EXPORT int KMPC_CONVENTION +kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *); +KMP_EXPORT int KMPC_CONVENTION +kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *); + +KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize(int); +KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t); +KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int); +KMP_EXPORT void KMPC_CONVENTION kmpc_set_defaults(char const *); +KMP_EXPORT void KMPC_CONVENTION kmpc_set_disp_num_buffers(int); +void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format); +size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size); +void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format); +size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size, + char const *format); + +enum kmp_target_offload_kind { + tgt_disabled = 0, + tgt_default = 1, + tgt_mandatory = 2 +}; +typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; +// Set via OMP_TARGET_OFFLOAD if specified, defaults to tgt_default otherwise +extern kmp_target_offload_kind_t __kmp_target_offload; +extern int __kmpc_get_target_offload(); + +// Constants used in libomptarget +#define KMP_DEVICE_DEFAULT -1 // This is libomptarget's default device. +#define KMP_DEVICE_ALL -11 // This is libomptarget's "all devices". + +// OMP Pause Resource + +// The following enum is used both to set the status in __kmp_pause_status, and +// as the internal equivalent of the externally-visible omp_pause_resource_t. +typedef enum kmp_pause_status_t { + kmp_not_paused = 0, // status is not paused, or, requesting resume + kmp_soft_paused = 1, // status is soft-paused, or, requesting soft pause + kmp_hard_paused = 2 // status is hard-paused, or, requesting hard pause +} kmp_pause_status_t; + +// This stores the pause state of the runtime +extern kmp_pause_status_t __kmp_pause_status; +extern int __kmpc_pause_resource(kmp_pause_status_t level); +extern int __kmp_pause_resource(kmp_pause_status_t level); +// Soft resume sets __kmp_pause_status, and wakes up all threads. +extern void __kmp_resume_if_soft_paused(); +// Hard resume simply resets the status to not paused. Library will appear to +// be uninitialized after hard pause. Let OMP constructs trigger required +// initializations. 
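// [Illustrative sketch, not part of the upstream patch.] The kmp_pause_status_t
// enum above is the internal mirror of OpenMP 5.0's omp_pause_resource_t; the
// hard-resume helper the runtime uses for itself follows right below. From an
// application, these states are driven through the public <omp.h> API. A
// minimal, hedged usage sketch (assumes a toolchain whose libomp implements
// OpenMP 5.0 pause resources):
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//   #pragma omp parallel
//     { /* warm up the runtime so there is something to pause */ }
//     // Soft pause: release threads/buffers; they are re-created lazily the
//     // next time an OpenMP construct runs. Returns 0 on success.
//     if (omp_pause_resource_all(omp_pause_soft) != 0)
//       printf("pause not supported by this runtime\n");
//   #pragma omp parallel
//     { /* runtime re-initializes whatever it needs */ }
//     return 0;
//   }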
+static inline void __kmp_resume_if_hard_paused() {
+  if (__kmp_pause_status == kmp_hard_paused) {
+    __kmp_pause_status = kmp_not_paused;
+  }
+}
+
+extern void __kmp_omp_display_env(int verbose);
+
+// 1: it is initializing hidden helper team
+extern volatile int __kmp_init_hidden_helper;
+// 1: the hidden helper team is done
+extern volatile int __kmp_hidden_helper_team_done;
+// 1: enable hidden helper task
+extern kmp_int32 __kmp_enable_hidden_helper;
+// Main thread of hidden helper team
+extern kmp_info_t *__kmp_hidden_helper_main_thread;
+// Descriptors for the hidden helper threads
+extern kmp_info_t **__kmp_hidden_helper_threads;
+// Number of hidden helper threads
+extern kmp_int32 __kmp_hidden_helper_threads_num;
+// Number of hidden helper tasks that have not been executed yet
+extern std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
+
+extern void __kmp_hidden_helper_initialize();
+extern void __kmp_hidden_helper_threads_initz_routine();
+extern void __kmp_do_initialize_hidden_helper_threads();
+extern void __kmp_hidden_helper_threads_initz_wait();
+extern void __kmp_hidden_helper_initz_release();
+extern void __kmp_hidden_helper_threads_deinitz_wait();
+extern void __kmp_hidden_helper_threads_deinitz_release();
+extern void __kmp_hidden_helper_main_thread_wait();
+extern void __kmp_hidden_helper_worker_thread_wait();
+extern void __kmp_hidden_helper_worker_thread_signal();
+extern void __kmp_hidden_helper_main_thread_release();
+
+// Check whether a given thread is a hidden helper thread
+#define KMP_HIDDEN_HELPER_THREAD(gtid)                                         \
+  ((gtid) >= 1 && (gtid) <= __kmp_hidden_helper_threads_num)
+
+#define KMP_HIDDEN_HELPER_WORKER_THREAD(gtid)                                  \
+  ((gtid) > 1 && (gtid) <= __kmp_hidden_helper_threads_num)
+
+#define KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)                                    \
+  ((gtid) == 1 && (gtid) <= __kmp_hidden_helper_threads_num)
+
+#define KMP_HIDDEN_HELPER_TEAM(team)                                           \
+  (team->t.t_threads[0] == __kmp_hidden_helper_main_thread)
+
+// Map a gtid to a hidden helper thread. The first hidden helper thread, a.k.a
+// main thread, is skipped.
+#define KMP_GTID_TO_SHADOW_GTID(gtid)                                          \
+  ((gtid) % (__kmp_hidden_helper_threads_num - 1) + 2)
+
+// Return the adjusted gtid value by subtracting from gtid the number
+// of hidden helper threads. This adjusted value is the gtid the thread would
+// have received if there were no hidden helper threads.
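// [Illustrative sketch, not part of the upstream patch.] The mapping macros
// above reserve gtids 1..N for hidden helper threads (N =
// __kmp_hidden_helper_threads_num; 8 is the usual LLVM default, assumed here),
// with gtid 1 acting as their main thread. KMP_GTID_TO_SHADOW_GTID spreads
// regular gtids over the worker helpers 2..N, and the adjustment helper
// defined just below subtracts N so user-visible thread numbering is
// unchanged. Standalone demo of the same arithmetic:
//
//   #include <cstdio>
//
//   static int shadow_gtid(int gtid, int num_hidden) {
//     return gtid % (num_hidden - 1) + 2; // same formula as the macro above
//   }
//
//   int main() {
//     const int N = 8; // assumed helper count
//     for (int gtid = 9; gtid <= 12; ++gtid)
//       std::printf("gtid %d -> shadow %d, adjusted %d\n", gtid,
//                   shadow_gtid(gtid, N), gtid - N);
//   }
//
// e.g. gtid 9 maps to shadow gtid 4 and is reported to the user as gtid 1.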
+static inline int __kmp_adjust_gtid_for_hidden_helpers(int gtid) {
+  int adjusted_gtid = gtid;
+  if (__kmp_hidden_helper_threads_num > 0 && gtid > 0 &&
+      gtid - __kmp_hidden_helper_threads_num >= 0) {
+    adjusted_gtid -= __kmp_hidden_helper_threads_num;
+  }
+  return adjusted_gtid;
+}
+
+// Support for error directive
+typedef enum kmp_severity_t {
+  severity_warning = 1,
+  severity_fatal = 2
+} kmp_severity_t;
+extern void __kmpc_error(ident_t *loc, int severity, const char *message);
+
+// Support for scope directive
+KMP_EXPORT void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved);
+KMP_EXPORT void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved);
+
+#ifdef __cplusplus
+}
+#endif
+
+template <bool C, bool S>
+extern void __kmp_suspend_32(int th_gtid, kmp_flag_32<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_atomic_suspend_64(int th_gtid,
+                                    kmp_atomic_flag_64<C, S> *flag);
+extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag);
+#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
+template <bool C, bool S>
+extern void __kmp_mwait_32(int th_gtid, kmp_flag_32<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag);
+extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag);
+#endif
+template <bool C, bool S>
+extern void __kmp_resume_32(int target_gtid, kmp_flag_32<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag);
+template <bool C, bool S>
+extern void __kmp_atomic_resume_64(int target_gtid,
+                                   kmp_atomic_flag_64<C, S> *flag);
+extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag);
+
+template <bool C, bool S>
+int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid,
+                           kmp_flag_32<C, S> *flag, int final_spin,
+                           int *thread_finished,
+#if USE_ITT_BUILD
+                           void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                           kmp_int32 is_constrained);
+template <bool C, bool S>
+int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
+                           kmp_flag_64<C, S> *flag, int final_spin,
+                           int *thread_finished,
+#if USE_ITT_BUILD
+                           void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                           kmp_int32 is_constrained);
+template <bool C, bool S>
+int __kmp_atomic_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
+                                  kmp_atomic_flag_64<C, S> *flag,
+                                  int final_spin, int *thread_finished,
+#if USE_ITT_BUILD
+                                  void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                                  kmp_int32 is_constrained);
+int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
+                               kmp_flag_oncore *flag, int final_spin,
+                               int *thread_finished,
+#if USE_ITT_BUILD
+                               void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                               kmp_int32 is_constrained);
+
+extern int __kmp_nesting_mode;
+extern int __kmp_nesting_mode_nlevels;
+extern int *__kmp_nesting_nth_level;
+extern void __kmp_init_nesting_mode();
+extern void __kmp_set_nesting_mode_threads();
+
+/// This class safely opens and closes a C-style FILE* object using RAII
+/// semantics. There are also methods which allow using stdout or stderr as
+/// the underlying FILE* object. With the implicit conversion operator to
+/// FILE*, an object with this type can be used in any function which takes
+/// a FILE* object e.g., fprintf().
+/// No close method is needed at use sites.
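// [Illustrative sketch, not part of the upstream patch.] The class defined
// just below wraps a FILE* so call sites never close it themselves. A
// stripped-down standalone analogue of the same RAII idea, without the
// runtime's KMP_ASSERT/__kmp_fatal error plumbing (scoped_file is a
// hypothetical name used only for this sketch):
//
//   #include <cstdio>
//
//   class scoped_file {
//     std::FILE *f = nullptr;
//
//   public:
//     scoped_file(const char *path, const char *mode)
//         : f(std::fopen(path, mode)) {}
//     ~scoped_file() {
//       if (f && f != stdout && f != stderr)
//         std::fclose(f);
//     }
//     operator std::FILE *() const { return f; }
//     explicit operator bool() const { return f != nullptr; }
//   };
//
//   // Usage: closed automatically when 'out' goes out of scope.
//   //   scoped_file out("/tmp/report.txt", "w");  // path is just an example
//   //   if (out) std::fprintf(out, "hello\n");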
+class kmp_safe_raii_file_t {
+  FILE *f;
+
+  void close() {
+    if (f && f != stdout && f != stderr) {
+      fclose(f);
+      f = nullptr;
+    }
+  }
+
+public:
+  kmp_safe_raii_file_t() : f(nullptr) {}
+  kmp_safe_raii_file_t(const char *filename, const char *mode,
+                       const char *env_var = nullptr)
+      : f(nullptr) {
+    open(filename, mode, env_var);
+  }
+  ~kmp_safe_raii_file_t() { close(); }
+
+  /// Open filename using mode. This is automatically closed in the destructor.
+  /// The env_var parameter indicates the environment variable the filename
+  /// came from if != nullptr.
+  void open(const char *filename, const char *mode,
+            const char *env_var = nullptr) {
+    KMP_ASSERT(!f);
+    f = fopen(filename, mode);
+    if (!f) {
+      int code = errno;
+      if (env_var) {
+        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
+                    KMP_HNT(CheckEnvVar, env_var, filename), __kmp_msg_null);
+      } else {
+        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
+                    __kmp_msg_null);
+      }
+    }
+  }
+  /// Instead of erroring out, return non-zero when
+  /// unsuccessful fopen() for any reason
+  int try_open(const char *filename, const char *mode) {
+    KMP_ASSERT(!f);
+    f = fopen(filename, mode);
+    if (!f)
+      return errno;
+    return 0;
+  }
+  /// Set the FILE* object to stdout and output there
+  /// No open call should happen before this call.
+  void set_stdout() {
+    KMP_ASSERT(!f);
+    f = stdout;
+  }
+  /// Set the FILE* object to stderr and output there
+  /// No open call should happen before this call.
+  void set_stderr() {
+    KMP_ASSERT(!f);
+    f = stderr;
+  }
+  operator bool() { return bool(f); }
+  operator FILE *() { return f; }
+};
+
+template <typename SourceType, typename TargetType,
+          bool isSourceSmaller = (sizeof(SourceType) < sizeof(TargetType)),
+          bool isSourceEqual = (sizeof(SourceType) == sizeof(TargetType)),
+          bool isSourceSigned = std::is_signed<SourceType>::value,
+          bool isTargetSigned = std::is_signed<TargetType>::value>
+struct kmp_convert {};
+
+// Both types are signed; Source smaller
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, true, false, true, true> {
+  static TargetType to(SourceType src) { return (TargetType)src; }
+};
+// Source equal
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, true, true, true> {
+  static TargetType to(SourceType src) { return src; }
+};
+// Source bigger
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, false, true, true> {
+  static TargetType to(SourceType src) {
+    /* KMP_ASSERT(src <= static_cast<SourceType>( */
+    /*     (std::numeric_limits<TargetType>::max)())); */
+    /* KMP_ASSERT(src >= static_cast<SourceType>( */
+    /*     (std::numeric_limits<TargetType>::min)())); */
+    return (TargetType)src;
+  }
+};
+
+// Source signed, Target unsigned
+// Source smaller
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, true, false, true, false> {
+  static TargetType to(SourceType src) {
+    KMP_ASSERT(src >= 0);
+    return (TargetType)src;
+  }
+};
+// Source equal
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, true, true, false> {
+  static TargetType to(SourceType src) {
+    KMP_ASSERT(src >= 0);
+    return (TargetType)src;
+  }
+};
+// Source bigger
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, false, true, false> {
+  static TargetType to(SourceType src) {
+    KMP_ASSERT(src >= 0);
+    KMP_ASSERT(src <= static_cast<SourceType>(
+                          (std::numeric_limits<TargetType>::max)()));
+    return (TargetType)src;
+  }
+};
+
+// Source unsigned, Target signed
+// Source smaller
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, true, false, false, true> {
+  static TargetType to(SourceType src) { return (TargetType)src; }
+};
+// Source equal
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, true, false, true> {
+  static TargetType to(SourceType src) {
+    KMP_ASSERT(src <= static_cast<SourceType>(
+                          (std::numeric_limits<TargetType>::max)()));
+    return (TargetType)src;
+  }
+};
+// Source bigger
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, false, false, true> {
+  static TargetType to(SourceType src) {
+    KMP_ASSERT(src <= static_cast<SourceType>(
+                          (std::numeric_limits<TargetType>::max)()));
+    return (TargetType)src;
+  }
+};
+
+// Source unsigned, Target unsigned
+// Source smaller
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, true, false, false, false> {
+  static TargetType to(SourceType src) { return (TargetType)src; }
+};
+// Source equal
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, true, false, false> {
+  static TargetType to(SourceType src) { return src; }
+};
+// Source bigger
+template <typename SourceType, typename TargetType>
+struct kmp_convert<SourceType, TargetType, false, false, false, false> {
+  static TargetType to(SourceType src) {
+    KMP_ASSERT(src <= static_cast<SourceType>(
+                          (std::numeric_limits<TargetType>::max)()));
+    return (TargetType)src;
+  }
+};
+
+template <typename T1, typename T2>
+static inline void __kmp_type_convert(T1 src, T2 *dest) {
+  *dest = kmp_convert<T1, T2>::to(src);
+}
+
+#endif /* KMP_H */
diff --git a/third_party/openmp/kmp_affinity.cpp b/third_party/openmp/kmp_affinity.cpp
new file mode 100644
index 000000000..6a41d34b0
--- /dev/null
+++ b/third_party/openmp/kmp_affinity.cpp
@@ -0,0 +1,5599 @@
+/*
+ * kmp_affinity.cpp -- affinity management
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_str.h"
+#include "kmp_wrapper_getpid.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
+#if KMP_USE_HWLOC
+// Copied from hwloc
+#define HWLOC_GROUP_KIND_INTEL_MODULE 102
+#define HWLOC_GROUP_KIND_INTEL_TILE 103
+#define HWLOC_GROUP_KIND_INTEL_DIE 104
+#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
+#endif
+#include <ctype.h>
+
+// The machine topology
+kmp_topology_t *__kmp_topology = nullptr;
+// KMP_HW_SUBSET environment variable
+kmp_hw_subset_t *__kmp_hw_subset = nullptr;
+
+// Store the real or imagined machine hierarchy here
+static hierarchy_info machine_hierarchy;
+
+void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
+
+#if KMP_AFFINITY_SUPPORTED
+// Helper class to see if place lists further restrict the fullMask
+class kmp_full_mask_modifier_t {
+  kmp_affin_mask_t *mask;
+
+public:
+  kmp_full_mask_modifier_t() {
+    KMP_CPU_ALLOC(mask);
+    KMP_CPU_ZERO(mask);
+  }
+  ~kmp_full_mask_modifier_t() {
+    KMP_CPU_FREE(mask);
+    mask = nullptr;
+  }
+  void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
+  // If the new full mask is different from the current full mask,
+  // then switch them. Returns true if full mask was affected, false otherwise.
+  bool restrict_to_mask() {
+    // See if the new mask further restricts or changes the full mask
+    if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
+      return false;
+    return __kmp_topology->restrict_to_mask(mask);
+  }
+};
+
+static inline const char *
+__kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
+                           bool for_binding = false) {
+  if (affinity.flags.omp_places) {
+    if (for_binding)
+      return "OMP_PROC_BIND";
+    return "OMP_PLACES";
+  }
+  return affinity.env_var;
+}
+#endif // KMP_AFFINITY_SUPPORTED
+
+void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
+  kmp_uint32 depth;
+  // The test below is true if affinity is available, but set to "none". Need to
+  // init on first use of hierarchical barrier.
+ if (TCR_1(machine_hierarchy.uninitialized)) + machine_hierarchy.init(nproc); + + // Adjust the hierarchy in case num threads exceeds original + if (nproc > machine_hierarchy.base_num_threads) + machine_hierarchy.resize(nproc); + + depth = machine_hierarchy.depth; + KMP_DEBUG_ASSERT(depth > 0); + + thr_bar->depth = depth; + __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1, + &(thr_bar->base_leaf_kids)); + thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; +} + +static int nCoresPerPkg, nPackages; +static int __kmp_nThreadsPerCore; +#ifndef KMP_DFLT_NTH_CORES +static int __kmp_ncores; +#endif + +const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { + switch (type) { + case KMP_HW_SOCKET: + return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket)); + case KMP_HW_DIE: + return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die)); + case KMP_HW_MODULE: + return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module)); + case KMP_HW_TILE: + return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile)); + case KMP_HW_NUMA: + return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain)); + case KMP_HW_L3: + return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache)); + case KMP_HW_L2: + return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache)); + case KMP_HW_L1: + return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache)); + case KMP_HW_LLC: + return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache)); + case KMP_HW_CORE: + return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core)); + case KMP_HW_THREAD: + return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); + case KMP_HW_PROC_GROUP: + return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return KMP_I18N_STR(Unknown); + } + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + KMP_BUILTIN_UNREACHABLE; +} + +const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { + switch (type) { + case KMP_HW_SOCKET: + return ((plural) ? "sockets" : "socket"); + case KMP_HW_DIE: + return ((plural) ? "dice" : "die"); + case KMP_HW_MODULE: + return ((plural) ? "modules" : "module"); + case KMP_HW_TILE: + return ((plural) ? "tiles" : "tile"); + case KMP_HW_NUMA: + return ((plural) ? "numa_domains" : "numa_domain"); + case KMP_HW_L3: + return ((plural) ? "l3_caches" : "l3_cache"); + case KMP_HW_L2: + return ((plural) ? "l2_caches" : "l2_cache"); + case KMP_HW_L1: + return ((plural) ? "l1_caches" : "l1_cache"); + case KMP_HW_LLC: + return ((plural) ? "ll_caches" : "ll_cache"); + case KMP_HW_CORE: + return ((plural) ? "cores" : "core"); + case KMP_HW_THREAD: + return ((plural) ? "threads" : "thread"); + case KMP_HW_PROC_GROUP: + return ((plural) ? "proc_groups" : "proc_group"); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return ((plural) ? 
"unknowns" : "unknown"); + } + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + KMP_BUILTIN_UNREACHABLE; +} + +const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "Intel Atom(R) processor"; + case KMP_HW_CORE_TYPE_CORE: + return "Intel(R) Core(TM) processor"; +#endif + } + KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); + KMP_BUILTIN_UNREACHABLE; +} + +#if KMP_AFFINITY_SUPPORTED +// If affinity is supported, check the affinity +// verbose and warning flags before printing warning +#define KMP_AFF_WARNING(s, ...) \ + if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \ + KMP_WARNING(__VA_ARGS__); \ + } +#else +#define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__) +#endif + +//////////////////////////////////////////////////////////////////////////////// +// kmp_hw_thread_t methods +int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { + const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a; + const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b; + int depth = __kmp_topology->get_depth(); + for (int level = 0; level < depth; ++level) { + if (ahwthread->ids[level] < bhwthread->ids[level]) + return -1; + else if (ahwthread->ids[level] > bhwthread->ids[level]) + return 1; + } + if (ahwthread->os_id < bhwthread->os_id) + return -1; + else if (ahwthread->os_id > bhwthread->os_id) + return 1; + return 0; +} + +#if KMP_AFFINITY_SUPPORTED +int kmp_hw_thread_t::compare_compact(const void *a, const void *b) { + int i; + const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a; + const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b; + int depth = __kmp_topology->get_depth(); + int compact = __kmp_topology->compact; + KMP_DEBUG_ASSERT(compact >= 0); + KMP_DEBUG_ASSERT(compact <= depth); + for (i = 0; i < compact; i++) { + int j = depth - i - 1; + if (aa->sub_ids[j] < bb->sub_ids[j]) + return -1; + if (aa->sub_ids[j] > bb->sub_ids[j]) + return 1; + } + for (; i < depth; i++) { + int j = i - compact; + if (aa->sub_ids[j] < bb->sub_ids[j]) + return -1; + if (aa->sub_ids[j] > bb->sub_ids[j]) + return 1; + } + return 0; +} +#endif + +void kmp_hw_thread_t::print() const { + int depth = __kmp_topology->get_depth(); + printf("%4d ", os_id); + for (int i = 0; i < depth; ++i) { + printf("%4d ", ids[i]); + } + if (attrs) { + if (attrs.is_core_type_valid()) + printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type())); + if (attrs.is_core_eff_valid()) + printf(" (eff=%d)", attrs.get_core_eff()); + } + if (leader) + printf(" (leader)"); + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// kmp_topology_t methods + +// Add a layer to the topology based on the ids. 
Assume the topology +// is perfectly nested (i.e., so no object has more than one parent) +void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) { + // Figure out where the layer should go by comparing the ids of the current + // layers with the new ids + int target_layer; + int previous_id = kmp_hw_thread_t::UNKNOWN_ID; + int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID; + + // Start from the highest layer and work down to find target layer + // If new layer is equal to another layer then put the new layer above + for (target_layer = 0; target_layer < depth; ++target_layer) { + bool layers_equal = true; + bool strictly_above_target_layer = false; + for (int i = 0; i < num_hw_threads; ++i) { + int id = hw_threads[i].ids[target_layer]; + int new_id = ids[i]; + if (id != previous_id && new_id == previous_new_id) { + // Found the layer we are strictly above + strictly_above_target_layer = true; + layers_equal = false; + break; + } else if (id == previous_id && new_id != previous_new_id) { + // Found a layer we are below. Move to next layer and check. + layers_equal = false; + break; + } + previous_id = id; + previous_new_id = new_id; + } + if (strictly_above_target_layer || layers_equal) + break; + } + + // Found the layer we are above. Now move everything to accommodate the new + // layer. And put the new ids and type into the topology. + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + types[j] = types[i]; + types[target_layer] = type; + for (int k = 0; k < num_hw_threads; ++k) { + for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) + hw_threads[k].ids[j] = hw_threads[k].ids[i]; + hw_threads[k].ids[target_layer] = ids[k]; + } + equivalent[type] = type; + depth++; +} + +#if KMP_GROUP_AFFINITY +// Insert the Windows Processor Group structure into the topology +void kmp_topology_t::_insert_windows_proc_groups() { + // Do not insert the processor group structure for a single group + if (__kmp_num_proc_groups == 1) + return; + kmp_affin_mask_t *mask; + int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads); + KMP_CPU_ALLOC(mask); + for (int i = 0; i < num_hw_threads; ++i) { + KMP_CPU_ZERO(mask); + KMP_CPU_SET(hw_threads[i].os_id, mask); + ids[i] = __kmp_get_proc_group(mask); + } + KMP_CPU_FREE(mask); + _insert_layer(KMP_HW_PROC_GROUP, ids); + __kmp_free(ids); +} +#endif + +// Remove layers that don't add information to the topology. 
+// This is done by having the layer take on the id = UNKNOWN_ID (-1) +void kmp_topology_t::_remove_radix1_layers() { + int preference[KMP_HW_LAST]; + int top_index1, top_index2; + // Set up preference associative array + preference[KMP_HW_SOCKET] = 110; + preference[KMP_HW_PROC_GROUP] = 100; + preference[KMP_HW_CORE] = 95; + preference[KMP_HW_THREAD] = 90; + preference[KMP_HW_NUMA] = 85; + preference[KMP_HW_DIE] = 80; + preference[KMP_HW_TILE] = 75; + preference[KMP_HW_MODULE] = 73; + preference[KMP_HW_L3] = 70; + preference[KMP_HW_L2] = 65; + preference[KMP_HW_L1] = 60; + preference[KMP_HW_LLC] = 5; + top_index1 = 0; + top_index2 = 1; + while (top_index1 < depth - 1 && top_index2 < depth) { + kmp_hw_t type1 = types[top_index1]; + kmp_hw_t type2 = types[top_index2]; + KMP_ASSERT_VALID_HW_TYPE(type1); + KMP_ASSERT_VALID_HW_TYPE(type2); + // Do not allow the three main topology levels (sockets, cores, threads) to + // be compacted down + if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE || + type1 == KMP_HW_SOCKET) && + (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE || + type2 == KMP_HW_SOCKET)) { + top_index1 = top_index2++; + continue; + } + bool radix1 = true; + bool all_same = true; + int id1 = hw_threads[0].ids[top_index1]; + int id2 = hw_threads[0].ids[top_index2]; + int pref1 = preference[type1]; + int pref2 = preference[type2]; + for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) { + if (hw_threads[hwidx].ids[top_index1] == id1 && + hw_threads[hwidx].ids[top_index2] != id2) { + radix1 = false; + break; + } + if (hw_threads[hwidx].ids[top_index2] != id2) + all_same = false; + id1 = hw_threads[hwidx].ids[top_index1]; + id2 = hw_threads[hwidx].ids[top_index2]; + } + if (radix1) { + // Select the layer to remove based on preference + kmp_hw_t remove_type, keep_type; + int remove_layer, remove_layer_ids; + if (pref1 > pref2) { + remove_type = type2; + remove_layer = remove_layer_ids = top_index2; + keep_type = type1; + } else { + remove_type = type1; + remove_layer = remove_layer_ids = top_index1; + keep_type = type2; + } + // If all the indexes for the second (deeper) layer are the same. 
+ // e.g., all are zero, then make sure to keep the first layer's ids + if (all_same) + remove_layer_ids = top_index2; + // Remove radix one type by setting the equivalence, removing the id from + // the hw threads and removing the layer from types and depth + set_equivalent_type(remove_type, keep_type); + for (int idx = 0; idx < num_hw_threads; ++idx) { + kmp_hw_thread_t &hw_thread = hw_threads[idx]; + for (int d = remove_layer_ids; d < depth - 1; ++d) + hw_thread.ids[d] = hw_thread.ids[d + 1]; + } + for (int idx = remove_layer; idx < depth - 1; ++idx) + types[idx] = types[idx + 1]; + depth--; + } else { + top_index1 = top_index2++; + } + } + KMP_ASSERT(depth > 0); +} + +void kmp_topology_t::_set_last_level_cache() { + if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L3); + else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); +#if KMP_MIC_SUPPORTED + else if (__kmp_mic_type == mic3) { + if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); + else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE); + // L2/Tile wasn't detected so just say L1 + else + set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); + } +#endif + else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); + // Fallback is to set last level cache to socket or core + if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) { + if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET); + else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN) + set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE); + } + KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN); +} + +// Gather the count of each topology layer and the ratio +void kmp_topology_t::_gather_enumeration_information() { + int previous_id[KMP_HW_LAST]; + int max[KMP_HW_LAST]; + + for (int i = 0; i < depth; ++i) { + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; + max[i] = 0; + count[i] = 0; + ratio[i] = 0; + } + int core_level = get_level(KMP_HW_CORE); + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + for (int layer = 0; layer < depth; ++layer) { + int id = hw_thread.ids[layer]; + if (id != previous_id[layer]) { + // Add an additional increment to each count + for (int l = layer; l < depth; ++l) + count[l]++; + // Keep track of topology layer ratio statistics + max[layer]++; + for (int l = layer + 1; l < depth; ++l) { + if (max[l] > ratio[l]) + ratio[l] = max[l]; + max[l] = 1; + } + // Figure out the number of different core types + // and efficiencies for hybrid CPUs + if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) { + if (hw_thread.attrs.is_core_eff_valid() && + hw_thread.attrs.core_eff >= num_core_efficiencies) { + // Because efficiencies can range from 0 to max efficiency - 1, + // the number of efficiencies is max efficiency + 1 + num_core_efficiencies = hw_thread.attrs.core_eff + 1; + } + if (hw_thread.attrs.is_core_type_valid()) { + bool found = false; + for (int j = 0; j < num_core_types; ++j) { + if (hw_thread.attrs.get_core_type() == core_types[j]) { + found = true; + break; + } + } + if (!found) { + KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES); + core_types[num_core_types++] = hw_thread.attrs.get_core_type(); + } + } + } + break; + } + } + for (int layer = 0; layer < depth; ++layer) { + previous_id[layer] 
= hw_thread.ids[layer]; + } + } + for (int layer = 0; layer < depth; ++layer) { + if (max[layer] > ratio[layer]) + ratio[layer] = max[layer]; + } +} + +int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr, + int above_level, + bool find_all) const { + int current, current_max; + int previous_id[KMP_HW_LAST]; + for (int i = 0; i < depth; ++i) + previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; + int core_level = get_level(KMP_HW_CORE); + if (find_all) + above_level = -1; + KMP_ASSERT(above_level < core_level); + current_max = 0; + current = 0; + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) { + if (current > current_max) + current_max = current; + current = hw_thread.attrs.contains(attr); + } else { + for (int level = above_level + 1; level <= core_level; ++level) { + if (hw_thread.ids[level] != previous_id[level]) { + if (hw_thread.attrs.contains(attr)) + current++; + break; + } + } + } + for (int level = 0; level < depth; ++level) + previous_id[level] = hw_thread.ids[level]; + } + if (current > current_max) + current_max = current; + return current_max; +} + +// Find out if the topology is uniform +void kmp_topology_t::_discover_uniformity() { + int num = 1; + for (int level = 0; level < depth; ++level) + num *= ratio[level]; + flags.uniform = (num == count[depth - 1]); +} + +// Set all the sub_ids for each hardware thread +void kmp_topology_t::_set_sub_ids() { + int previous_id[KMP_HW_LAST]; + int sub_id[KMP_HW_LAST]; + + for (int i = 0; i < depth; ++i) { + previous_id[i] = -1; + sub_id[i] = -1; + } + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Setup the sub_id + for (int j = 0; j < depth; ++j) { + if (hw_thread.ids[j] != previous_id[j]) { + sub_id[j]++; + for (int k = j + 1; k < depth; ++k) { + sub_id[k] = 0; + } + break; + } + } + // Set previous_id + for (int j = 0; j < depth; ++j) { + previous_id[j] = hw_thread.ids[j]; + } + // Set the sub_ids field + for (int j = 0; j < depth; ++j) { + hw_thread.sub_ids[j] = sub_id[j]; + } + } +} + +void kmp_topology_t::_set_globals() { + // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores + int core_level, thread_level, package_level; + package_level = get_level(KMP_HW_SOCKET); +#if KMP_GROUP_AFFINITY + if (package_level == -1) + package_level = get_level(KMP_HW_PROC_GROUP); +#endif + core_level = get_level(KMP_HW_CORE); + thread_level = get_level(KMP_HW_THREAD); + + KMP_ASSERT(core_level != -1); + KMP_ASSERT(thread_level != -1); + + __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level); + if (package_level != -1) { + nCoresPerPkg = calculate_ratio(core_level, package_level); + nPackages = get_count(package_level); + } else { + // assume one socket + nCoresPerPkg = get_count(core_level); + nPackages = 1; + } +#ifndef KMP_DFLT_NTH_CORES + __kmp_ncores = get_count(core_level); +#endif +} + +kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, + const kmp_hw_t *types) { + kmp_topology_t *retval; + // Allocate all data in one large allocation + size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + + sizeof(int) * (size_t)KMP_HW_LAST * 3; + char *bytes = (char *)__kmp_allocate(size); + retval = (kmp_topology_t *)bytes; + if (nproc > 0) { + retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t)); + } else { + retval->hw_threads = nullptr; + } + retval->num_hw_threads = nproc; + retval->depth = ndepth; + 
int *arr =
+      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
+  retval->types = (kmp_hw_t *)arr;
+  retval->ratio = arr + (size_t)KMP_HW_LAST;
+  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
+  retval->num_core_efficiencies = 0;
+  retval->num_core_types = 0;
+  retval->compact = 0;
+  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
+    retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
+  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
+  for (int i = 0; i < ndepth; ++i) {
+    retval->types[i] = types[i];
+    retval->equivalent[types[i]] = types[i];
+  }
+  return retval;
+}
+
+void kmp_topology_t::deallocate(kmp_topology_t *topology) {
+  if (topology)
+    __kmp_free(topology);
+}
+
+bool kmp_topology_t::check_ids() const {
+  // Assume ids have been sorted
+  if (num_hw_threads == 0)
+    return true;
+  for (int i = 1; i < num_hw_threads; ++i) {
+    kmp_hw_thread_t &current_thread = hw_threads[i];
+    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
+    bool unique = false;
+    for (int j = 0; j < depth; ++j) {
+      if (previous_thread.ids[j] != current_thread.ids[j]) {
+        unique = true;
+        break;
+      }
+    }
+    if (unique)
+      continue;
+    return false;
+  }
+  return true;
+}
+
+void kmp_topology_t::dump() const {
+  printf("***********************\n");
+  printf("*** __kmp_topology: ***\n");
+  printf("***********************\n");
+  printf("* depth: %d\n", depth);
+
+  printf("* types: ");
+  for (int i = 0; i < depth; ++i)
+    printf("%15s ", __kmp_hw_get_keyword(types[i]));
+  printf("\n");
+
+  printf("* ratio: ");
+  for (int i = 0; i < depth; ++i) {
+    printf("%15d ", ratio[i]);
+  }
+  printf("\n");
+
+  printf("* count: ");
+  for (int i = 0; i < depth; ++i) {
+    printf("%15d ", count[i]);
+  }
+  printf("\n");
+
+  printf("* num_core_eff: %d\n", num_core_efficiencies);
+  printf("* num_core_types: %d\n", num_core_types);
+  printf("* core_types: ");
+  for (int i = 0; i < num_core_types; ++i)
+    printf("%3d ", core_types[i]);
+  printf("\n");
+
+  printf("* equivalent map:\n");
+  KMP_FOREACH_HW_TYPE(i) {
+    const char *key = __kmp_hw_get_keyword(i);
+    const char *value = __kmp_hw_get_keyword(equivalent[i]);
+    printf("%-15s -> %-15s\n", key, value);
+  }
+
+  printf("* uniform: %s\n", (is_uniform() ?
"Yes" : "No")); + + printf("* num_hw_threads: %d\n", num_hw_threads); + printf("* hw_threads:\n"); + for (int i = 0; i < num_hw_threads; ++i) { + hw_threads[i].print(); + } + printf("***********************\n"); +} + +void kmp_topology_t::print(const char *env_var) const { + kmp_str_buf_t buf; + int print_types_depth; + __kmp_str_buf_init(&buf); + kmp_hw_t print_types[KMP_HW_LAST + 2]; + + // Num Available Threads + if (num_hw_threads) { + KMP_INFORM(AvailableOSProc, env_var, num_hw_threads); + } else { + KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc); + } + + // Uniform or not + if (is_uniform()) { + KMP_INFORM(Uniform, env_var); + } else { + KMP_INFORM(NonUniform, env_var); + } + + // Equivalent types + KMP_FOREACH_HW_TYPE(type) { + kmp_hw_t eq_type = equivalent[type]; + if (eq_type != KMP_HW_UNKNOWN && eq_type != type) { + KMP_INFORM(AffEqualTopologyTypes, env_var, + __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(eq_type)); + } + } + + // Quick topology + KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST); + // Create a print types array that always guarantees printing + // the core and thread level + print_types_depth = 0; + for (int level = 0; level < depth; ++level) + print_types[print_types_depth++] = types[level]; + if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) { + // Force in the core level for quick topology + if (print_types[print_types_depth - 1] == KMP_HW_THREAD) { + // Force core before thread e.g., 1 socket X 2 threads/socket + // becomes 1 socket X 1 core/socket X 2 threads/socket + print_types[print_types_depth - 1] = KMP_HW_CORE; + print_types[print_types_depth++] = KMP_HW_THREAD; + } else { + print_types[print_types_depth++] = KMP_HW_CORE; + } + } + // Always put threads at very end of quick topology + if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD) + print_types[print_types_depth++] = KMP_HW_THREAD; + + __kmp_str_buf_clear(&buf); + kmp_hw_t numerator_type; + kmp_hw_t denominator_type = KMP_HW_UNKNOWN; + int core_level = get_level(KMP_HW_CORE); + int ncores = get_count(core_level); + + for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) { + int c; + bool plural; + numerator_type = print_types[plevel]; + KMP_ASSERT_VALID_HW_TYPE(numerator_type); + if (equivalent[numerator_type] != numerator_type) + c = 1; + else + c = get_ratio(level++); + plural = (c > 1); + if (plevel == 0) { + __kmp_str_buf_print(&buf, "%d %s", c, + __kmp_hw_get_catalog_string(numerator_type, plural)); + } else { + __kmp_str_buf_print(&buf, " x %d %s/%s", c, + __kmp_hw_get_catalog_string(numerator_type, plural), + __kmp_hw_get_catalog_string(denominator_type)); + } + denominator_type = numerator_type; + } + KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); + + // Hybrid topology information + if (__kmp_is_hybrid_cpu()) { + for (int i = 0; i < num_core_types; ++i) { + kmp_hw_core_type_t core_type = core_types[i]; + kmp_hw_attr_t attr; + attr.clear(); + attr.set_core_type(core_type); + int ncores = get_ncores_with_attr(attr); + if (ncores > 0) { + KMP_INFORM(TopologyHybrid, env_var, ncores, + __kmp_hw_get_core_type_string(core_type)); + KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS) + for (int eff = 0; eff < num_core_efficiencies; ++eff) { + attr.set_core_eff(eff); + int ncores_with_eff = get_ncores_with_attr(attr); + if (ncores_with_eff > 0) { + KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff); + } + } + } + } + } + + if (num_hw_threads <= 0) { + __kmp_str_buf_free(&buf); + return; + } + + // Full OS proc to hardware thread map + 
KMP_INFORM(OSProcToPhysicalThreadMap, env_var); + for (int i = 0; i < num_hw_threads; i++) { + __kmp_str_buf_clear(&buf); + for (int level = 0; level < depth; ++level) { + kmp_hw_t type = types[level]; + __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); + __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); + } + if (__kmp_is_hybrid_cpu()) + __kmp_str_buf_print( + &buf, "(%s)", + __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type())); + KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); + } + + __kmp_str_buf_free(&buf); +} + +#if KMP_AFFINITY_SUPPORTED +void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const { + const char *env_var = __kmp_get_affinity_env_var(affinity); + // If requested hybrid CPU attributes for granularity (either OMP_PLACES or + // KMP_AFFINITY), but none exist, then reset granularity and have below method + // select a granularity and warn user. + if (!__kmp_is_hybrid_cpu()) { + if (affinity.core_attr_gran.valid) { + // OMP_PLACES with cores: but non-hybrid arch, use cores + // instead + KMP_AFF_WARNING( + affinity, AffIgnoringNonHybrid, env_var, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); + affinity.gran = KMP_HW_CORE; + affinity.gran_levels = -1; + affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; + affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; + } else if (affinity.flags.core_types_gran || + affinity.flags.core_effs_gran) { + // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead + if (affinity.flags.omp_places) { + KMP_AFF_WARNING( + affinity, AffIgnoringNonHybrid, env_var, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); + } else { + // KMP_AFFINITY=granularity=core_type|core_eff,... + KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, + "Intel(R) Hybrid Technology core attribute", + __kmp_hw_get_catalog_string(KMP_HW_CORE)); + } + affinity.gran = KMP_HW_CORE; + affinity.gran_levels = -1; + affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; + affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; + } + } + // Set the number of affinity granularity levels + if (affinity.gran_levels < 0) { + kmp_hw_t gran_type = get_equivalent_type(affinity.gran); + // Check if user's granularity request is valid + if (gran_type == KMP_HW_UNKNOWN) { + // First try core, then thread, then package + kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET}; + for (auto g : gran_types) { + if (get_equivalent_type(g) != KMP_HW_UNKNOWN) { + gran_type = g; + break; + } + } + KMP_ASSERT(gran_type != KMP_HW_UNKNOWN); + // Warn user what granularity setting will be used instead + KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, + __kmp_hw_get_catalog_string(affinity.gran), + __kmp_hw_get_catalog_string(gran_type)); + affinity.gran = gran_type; + } +#if KMP_GROUP_AFFINITY + // If more than one processor group exists, and the level of + // granularity specified by the user is too coarse, then the + // granularity must be adjusted "down" to processor group affinity + // because threads can only exist within one processor group. + // For example, if a user sets granularity=socket and there are two + // processor groups that cover a socket, then the runtime must + // restrict the granularity down to the processor group level. 
+ if (__kmp_num_proc_groups > 1) { + int gran_depth = get_level(gran_type); + int proc_group_depth = get_level(KMP_HW_PROC_GROUP); + if (gran_depth >= 0 && proc_group_depth >= 0 && + gran_depth < proc_group_depth) { + KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var, + __kmp_hw_get_catalog_string(affinity.gran)); + affinity.gran = gran_type = KMP_HW_PROC_GROUP; + } + } +#endif + affinity.gran_levels = 0; + for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) + affinity.gran_levels++; + } +} +#endif + +void kmp_topology_t::canonicalize() { +#if KMP_GROUP_AFFINITY + _insert_windows_proc_groups(); +#endif + _remove_radix1_layers(); + _gather_enumeration_information(); + _discover_uniformity(); + _set_sub_ids(); + _set_globals(); + _set_last_level_cache(); + +#if KMP_MIC_SUPPORTED + // Manually Add L2 = Tile equivalence + if (__kmp_mic_type == mic3) { + if (get_level(KMP_HW_L2) != -1) + set_equivalent_type(KMP_HW_TILE, KMP_HW_L2); + else if (get_level(KMP_HW_TILE) != -1) + set_equivalent_type(KMP_HW_L2, KMP_HW_TILE); + } +#endif + + // Perform post canonicalization checking + KMP_ASSERT(depth > 0); + for (int level = 0; level < depth; ++level) { + // All counts, ratios, and types must be valid + KMP_ASSERT(count[level] > 0 && ratio[level] > 0); + KMP_ASSERT_VALID_HW_TYPE(types[level]); + // Detected types must point to themselves + KMP_ASSERT(equivalent[types[level]] == types[level]); + } +} + +// Canonicalize an explicit packages X cores/pkg X threads/core topology +void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, + int nthreads_per_core, int ncores) { + int ndepth = 3; + depth = ndepth; + KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; } + for (int level = 0; level < depth; ++level) { + count[level] = 0; + ratio[level] = 0; + } + count[0] = npackages; + count[1] = ncores; + count[2] = __kmp_xproc; + ratio[0] = npackages; + ratio[1] = ncores_per_pkg; + ratio[2] = nthreads_per_core; + equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET; + equivalent[KMP_HW_CORE] = KMP_HW_CORE; + equivalent[KMP_HW_THREAD] = KMP_HW_THREAD; + types[0] = KMP_HW_SOCKET; + types[1] = KMP_HW_CORE; + types[2] = KMP_HW_THREAD; + //__kmp_avail_proc = __kmp_xproc; + _discover_uniformity(); +} + +// Represents running sub IDs for a single core attribute where +// attribute values have SIZE possibilities. 
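+// For instance, with SIZE equal to the number of core types and an indexer
+// that maps a hardware thread to its core type, sub_id[idx] tracks how many
+// cores of that type have been seen so far while walking the sorted hardware
+// threads (restarting when a higher topology level changes); the KMP_HW_SUBSET
+// filtering below compares this against each item's num/offset. (Illustrative
+// reading of the code that follows.)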
+template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t { + int last_level; // last level in topology to consider for sub_ids + int sub_id[SIZE]; // The sub ID for a given attribute value + int prev_sub_id[KMP_HW_LAST]; + IndexFunc indexer; + +public: + kmp_sub_ids_t(int last_level) : last_level(last_level) { + KMP_ASSERT(last_level < KMP_HW_LAST); + for (size_t i = 0; i < SIZE; ++i) + sub_id[i] = -1; + for (size_t i = 0; i < KMP_HW_LAST; ++i) + prev_sub_id[i] = -1; + } + void update(const kmp_hw_thread_t &hw_thread) { + int idx = indexer(hw_thread); + KMP_ASSERT(idx < (int)SIZE); + for (int level = 0; level <= last_level; ++level) { + if (hw_thread.sub_ids[level] != prev_sub_id[level]) { + if (level < last_level) + sub_id[idx] = -1; + sub_id[idx]++; + break; + } + } + for (int level = 0; level <= last_level; ++level) + prev_sub_id[level] = hw_thread.sub_ids[level]; + } + int get_sub_id(const kmp_hw_thread_t &hw_thread) const { + return sub_id[indexer(hw_thread)]; + } +}; + +#if KMP_AFFINITY_SUPPORTED +static kmp_str_buf_t * +__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, + bool plural) { + __kmp_str_buf_init(buf); + if (attr.is_core_type_valid()) + __kmp_str_buf_print(buf, "%s %s", + __kmp_hw_get_core_type_string(attr.get_core_type()), + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural)); + else + __kmp_str_buf_print(buf, "%s eff=%d", + __kmp_hw_get_catalog_string(KMP_HW_CORE, plural), + attr.get_core_eff()); + return buf; +} + +bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) { + // Apply the filter + bool affected; + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + int os_id = hw_threads[i].os_id; + if (KMP_CPU_ISSET(os_id, mask)) { + if (i != new_index) + hw_threads[new_index] = hw_threads[i]; + new_index++; + } else { + KMP_CPU_CLR(os_id, __kmp_affin_fullMask); + __kmp_avail_proc--; + } + } + + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); + affected = (num_hw_threads != new_index); + num_hw_threads = new_index; + + // Post hardware subset canonicalization + if (affected) { + _gather_enumeration_information(); + _discover_uniformity(); + _set_globals(); + _set_last_level_cache(); +#if KMP_OS_WINDOWS + // Copy filtered full mask if topology has single processor group + if (__kmp_num_proc_groups <= 1) +#endif + __kmp_affin_origMask->copy(__kmp_affin_fullMask); + } + return affected; +} + +// Apply the KMP_HW_SUBSET envirable to the topology +// Returns true if KMP_HW_SUBSET filtered any processors +// otherwise, returns false +bool kmp_topology_t::filter_hw_subset() { + // If KMP_HW_SUBSET wasn't requested, then do nothing. 
+ if (!__kmp_hw_subset) + return false; + + // First, sort the KMP_HW_SUBSET items by the machine topology + __kmp_hw_subset->sort(); + + // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology + bool using_core_types = false; + bool using_core_effs = false; + int hw_subset_depth = __kmp_hw_subset->get_depth(); + kmp_hw_t specified[KMP_HW_LAST]; + int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth); + KMP_ASSERT(hw_subset_depth > 0); + KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } + int core_level = get_level(KMP_HW_CORE); + for (int i = 0; i < hw_subset_depth; ++i) { + int max_count; + const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i); + int num = item.num[0]; + int offset = item.offset[0]; + kmp_hw_t type = item.type; + kmp_hw_t equivalent_type = equivalent[type]; + int level = get_level(type); + topology_levels[i] = level; + + // Check to see if current layer is in detected machine topology + if (equivalent_type != KMP_HW_UNKNOWN) { + __kmp_hw_subset->at(i).type = equivalent_type; + } else { + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric, + __kmp_hw_get_catalog_string(type)); + return false; + } + + // Check to see if current layer has already been + // specified either directly or through an equivalent type + if (specified[equivalent_type] != KMP_HW_UNKNOWN) { + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers, + __kmp_hw_get_catalog_string(type), + __kmp_hw_get_catalog_string(specified[equivalent_type])); + return false; + } + specified[equivalent_type] = type; + + // Check to see if each layer's num & offset parameters are valid + max_count = get_ratio(level); + if (max_count < 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { + bool plural = (num > 1); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, + __kmp_hw_get_catalog_string(type, plural)); + return false; + } + + // Check to see if core attributes are consistent + if (core_level == level) { + // Determine which core attributes are specified + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_type_valid()) + using_core_types = true; + if (item.attr[j].is_core_eff_valid()) + using_core_effs = true; + } + + // Check if using a single core attribute on non-hybrid arch. + // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute. + // + // Check if using multiple core attributes on non-hybrid arch. + // Ignore all of KMP_HW_SUBSET if this is the case. 
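+      // For example (illustrative, assuming the usual KMP_HW_SUBSET syntax):
+      // on a non-hybrid CPU a core layer such as 4c:intel_atom keeps the 4c
+      // request and merely drops :intel_atom, whereas a core layer combining
+      // several core attributes causes the whole KMP_HW_SUBSET to be ignored,
+      // as implemented below.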
+ if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { + if (item.num_attrs == 1) { + if (using_core_effs) { + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr, + "efficiency"); + } else { + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr, + "core_type"); + } + using_core_effs = false; + using_core_types = false; + } else { + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid); + return false; + } + } + + // Check if using both core types and core efficiencies together + if (using_core_types && using_core_effs) { + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type", + "efficiency"); + return false; + } + + // Check that core efficiency values are valid + if (using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + if (item.attr[j].is_core_eff_valid()) { + int core_eff = item.attr[j].get_core_eff(); + if (core_eff < 0 || core_eff >= num_core_efficiencies) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff()); + __kmp_msg(kmp_ms_warning, + KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str), + KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1), + __kmp_msg_null); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + // Check that the number of requested cores with attributes is valid + if (using_core_types || using_core_effs) { + for (int j = 0; j < item.num_attrs; ++j) { + int num = item.num[j]; + int offset = item.offset[j]; + int level_above = core_level - 1; + if (level_above >= 0) { + max_count = get_ncores_with_attr_per(item.attr[j], level_above); + if (max_count <= 0 || + (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + + if ((using_core_types || using_core_effs) && item.num_attrs > 1) { + for (int j = 0; j < item.num_attrs; ++j) { + // Ambiguous use of specific core attribute + generic core + // e.g., 4c & 3c:intel_core or 4c & 3c:eff1 + if (!item.attr[j]) { + kmp_hw_attr_t other_attr; + for (int k = 0; k < item.num_attrs; ++k) { + if (item.attr[k] != item.attr[j]) { + other_attr = item.attr[k]; + break; + } + } + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, + __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); + __kmp_str_buf_free(&buf); + return false; + } + // Allow specifying a specific core type or core eff exactly once + for (int k = 0; k < j; ++k) { + if (!item.attr[j] || !item.attr[k]) + continue; + if (item.attr[k] == item.attr[j]) { + kmp_str_buf_t buf; + __kmp_hw_get_catalog_core_string(item.attr[j], &buf, + item.num[j] > 0); + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str); + __kmp_str_buf_free(&buf); + return false; + } + } + } + } + } + } + + struct core_type_indexer { + int operator()(const kmp_hw_thread_t &t) const { + switch (t.attrs.get_core_type()) { + case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: + return 0; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return 1; + case KMP_HW_CORE_TYPE_CORE: + return 2; +#endif + } + KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration"); + KMP_BUILTIN_UNREACHABLE; + } + }; + struct core_eff_indexer { + int operator()(const kmp_hw_thread_t &t) const { + return t.attrs.get_core_eff(); + } + }; + + 
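+  // The two trackers below keep one running sub id per core type and one per
+  // core efficiency; the indexer functors above select the slot used for a
+  // given hardware thread.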
kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids( + core_level); + kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids( + core_level); + + // Determine which hardware threads should be filtered. + int num_filtered = 0; + kmp_affin_mask_t *filtered_mask; + KMP_CPU_ALLOC(filtered_mask); + KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask); + for (int i = 0; i < num_hw_threads; ++i) { + kmp_hw_thread_t &hw_thread = hw_threads[i]; + // Update type_sub_id + if (using_core_types) + core_type_sub_ids.update(hw_thread); + if (using_core_effs) + core_eff_sub_ids.update(hw_thread); + + // Check to see if this hardware thread should be filtered + bool should_be_filtered = false; + for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth; + ++hw_subset_index) { + const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index); + int level = topology_levels[hw_subset_index]; + if (level == -1) + continue; + if ((using_core_effs || using_core_types) && level == core_level) { + // Look for the core attribute in KMP_HW_SUBSET which corresponds + // to this hardware thread's core attribute. Use this num,offset plus + // the running sub_id for the particular core attribute of this hardware + // thread to determine if the hardware thread should be filtered or not. + int attr_idx; + kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type(); + int core_eff = hw_thread.attrs.get_core_eff(); + for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) { + if (using_core_types && + hw_subset_item.attr[attr_idx].get_core_type() == core_type) + break; + if (using_core_effs && + hw_subset_item.attr[attr_idx].get_core_eff() == core_eff) + break; + } + // This core attribute isn't in the KMP_HW_SUBSET so always filter it. + if (attr_idx == hw_subset_item.num_attrs) { + should_be_filtered = true; + break; + } + int sub_id; + int num = hw_subset_item.num[attr_idx]; + int offset = hw_subset_item.offset[attr_idx]; + if (using_core_types) + sub_id = core_type_sub_ids.get_sub_id(hw_thread); + else + sub_id = core_eff_sub_ids.get_sub_id(hw_thread); + if (sub_id < offset || + (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { + should_be_filtered = true; + break; + } + } else { + int num = hw_subset_item.num[0]; + int offset = hw_subset_item.offset[0]; + if (hw_thread.sub_ids[level] < offset || + (num != kmp_hw_subset_t::USE_ALL && + hw_thread.sub_ids[level] >= offset + num)) { + should_be_filtered = true; + break; + } + } + } + // Collect filtering information + if (should_be_filtered) { + KMP_CPU_CLR(hw_thread.os_id, filtered_mask); + num_filtered++; + } + } + + // One last check that we shouldn't allow filtering entire machine + if (num_filtered == num_hw_threads) { + KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered); + return false; + } + + // Apply the filter + restrict_to_mask(filtered_mask); + return true; +} + +bool kmp_topology_t::is_close(int hwt1, int hwt2, + const kmp_affinity_t &stgs) const { + int hw_level = stgs.gran_levels; + if (hw_level >= depth) + return true; + bool retval = true; + const kmp_hw_thread_t &t1 = hw_threads[hwt1]; + const kmp_hw_thread_t &t2 = hw_threads[hwt2]; + if (stgs.flags.core_types_gran) + return t1.attrs.get_core_type() == t2.attrs.get_core_type(); + if (stgs.flags.core_effs_gran) + return t1.attrs.get_core_eff() == t2.attrs.get_core_eff(); + for (int i = 0; i < (depth - hw_level); ++i) { + if (t1.ids[i] != t2.ids[i]) + return false; + } + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// + +bool KMPAffinity::picked_api = false; + 
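+// These overloads route all KMPAffinity allocations through the runtime's own
+// __kmp_allocate/__kmp_free pair instead of the default global heap
+// (presumably so this memory is managed by the same allocator as the rest of
+// the runtime).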
+void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } +void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); } +void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); } +void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); } +void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); } +void KMPAffinity::operator delete(void *p) { __kmp_free(p); } + +void KMPAffinity::pick_api() { + KMPAffinity *affinity_dispatch; + if (picked_api) + return; +#if KMP_USE_HWLOC + // Only use Hwloc if affinity isn't explicitly disabled and + // user requests Hwloc topology method + if (__kmp_affinity_top_method == affinity_top_method_hwloc && + __kmp_affinity.type != affinity_disabled) { + affinity_dispatch = new KMPHwlocAffinity(); + } else +#endif + { + affinity_dispatch = new KMPNativeAffinity(); + } + __kmp_affinity_dispatch = affinity_dispatch; + picked_api = true; +} + +void KMPAffinity::destroy_api() { + if (__kmp_affinity_dispatch != NULL) { + delete __kmp_affinity_dispatch; + __kmp_affinity_dispatch = NULL; + picked_api = false; + } +} + +#define KMP_ADVANCE_SCAN(scan) \ + while (*scan != '\0') { \ + scan++; \ + } + +// Print the affinity mask to the character array in a pretty format. +// The format is a comma separated list of non-negative integers or integer +// ranges: e.g., 1,2,3-5,7,9-15 +// The format can also be the string "{}" if no bits are set in mask +char *__kmp_affinity_print_mask(char *buf, int buf_len, + kmp_affin_mask_t *mask) { + int start = 0, finish = 0, previous = 0; + bool first_range; + KMP_ASSERT(buf); + KMP_ASSERT(buf_len >= 40); + KMP_ASSERT(mask); + char *scan = buf; + char *end = buf + buf_len - 1; + + // Check for empty set. + if (mask->begin() == mask->end()) { + KMP_SNPRINTF(scan, end - scan + 1, "{}"); + KMP_ADVANCE_SCAN(scan); + KMP_ASSERT(scan <= end); + return buf; + } + + first_range = true; + start = mask->begin(); + while (1) { + // Find next range + // [start, previous] is inclusive range of contiguous bits in mask + for (finish = mask->next(start), previous = start; + finish == previous + 1 && finish != mask->end(); + finish = mask->next(finish)) { + previous = finish; + } + + // The first range does not need a comma printed before it, but the rest + // of the ranges do need a comma beforehand + if (!first_range) { + KMP_SNPRINTF(scan, end - scan + 1, "%s", ","); + KMP_ADVANCE_SCAN(scan); + } else { + first_range = false; + } + // Range with three or more contiguous bits in the affinity mask + if (previous - start > 1) { + KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous); + } else { + // Range with one or two contiguous bits in the affinity mask + KMP_SNPRINTF(scan, end - scan + 1, "%u", start); + KMP_ADVANCE_SCAN(scan); + if (previous - start > 0) { + KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous); + } + } + KMP_ADVANCE_SCAN(scan); + // Start over with new start point + start = finish; + if (start == mask->end()) + break; + // Check for overflow + if (end - scan < 2) + break; + } + + // Check for overflow + KMP_ASSERT(scan <= end); + return buf; +} +#undef KMP_ADVANCE_SCAN + +// Print the affinity mask to the string buffer object in a pretty format +// The format is a comma separated list of non-negative integers or integer +// ranges: e.g., 1,2,3-5,7,9-15 +// The format can also be the string "{}" if no bits are set in mask +kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, + kmp_affin_mask_t *mask) { + int start = 0, finish = 0, previous = 0; 
+ bool first_range; + KMP_ASSERT(buf); + KMP_ASSERT(mask); + + __kmp_str_buf_clear(buf); + + // Check for empty set. + if (mask->begin() == mask->end()) { + __kmp_str_buf_print(buf, "%s", "{}"); + return buf; + } + + first_range = true; + start = mask->begin(); + while (1) { + // Find next range + // [start, previous] is inclusive range of contiguous bits in mask + for (finish = mask->next(start), previous = start; + finish == previous + 1 && finish != mask->end(); + finish = mask->next(finish)) { + previous = finish; + } + + // The first range does not need a comma printed before it, but the rest + // of the ranges do need a comma beforehand + if (!first_range) { + __kmp_str_buf_print(buf, "%s", ","); + } else { + first_range = false; + } + // Range with three or more contiguous bits in the affinity mask + if (previous - start > 1) { + __kmp_str_buf_print(buf, "%u-%u", start, previous); + } else { + // Range with one or two contiguous bits in the affinity mask + __kmp_str_buf_print(buf, "%u", start); + if (previous - start > 0) { + __kmp_str_buf_print(buf, ",%u", previous); + } + } + // Start over with new start point + start = finish; + if (start == mask->end()) + break; + } + return buf; +} + +// Return (possibly empty) affinity mask representing the offline CPUs +// Caller must free the mask +kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { + kmp_affin_mask_t *offline; + KMP_CPU_ALLOC(offline); + KMP_CPU_ZERO(offline); +#if KMP_OS_LINUX + int n, begin_cpu, end_cpu; + kmp_safe_raii_file_t offline_file; + auto skip_ws = [](FILE *f) { + int c; + do { + c = fgetc(f); + } while (isspace(c)); + if (c != EOF) + ungetc(c, f); + }; + // File contains CSV of integer ranges representing the offline CPUs + // e.g., 1,2,4-7,9,11-15 + int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r"); + if (status != 0) + return offline; + while (!feof(offline_file)) { + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &begin_cpu); + if (n != 1) + break; + skip_ws(offline_file); + int c = fgetc(offline_file); + if (c == EOF || c == ',') { + // Just single CPU + end_cpu = begin_cpu; + } else if (c == '-') { + // Range of CPUs + skip_ws(offline_file); + n = fscanf(offline_file, "%d", &end_cpu); + if (n != 1) + break; + skip_ws(offline_file); + c = fgetc(offline_file); // skip ',' + } else { + // Syntax problem + break; + } + // Ensure a valid range of CPUs + if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 || + end_cpu >= __kmp_xproc || begin_cpu > end_cpu) { + continue; + } + // Insert [begin_cpu, end_cpu] into offline mask + for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) { + KMP_CPU_SET(cpu, offline); + } + } +#endif + return offline; +} + +// Return the number of available procs +int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { + int avail_proc = 0; + KMP_CPU_ZERO(mask); + +#if KMP_GROUP_AFFINITY + + if (__kmp_num_proc_groups > 1) { + int group; + KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); + for (group = 0; group < __kmp_num_proc_groups; group++) { + int i; + int num = __kmp_GetActiveProcessorCount(group); + for (i = 0; i < num; i++) { + KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); + avail_proc++; + } + } + } else + +#endif /* KMP_GROUP_AFFINITY */ + + { + int proc; + kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus(); + for (proc = 0; proc < __kmp_xproc; proc++) { + // Skip offline CPUs + if (KMP_CPU_ISSET(proc, offline_cpus)) + continue; + KMP_CPU_SET(proc, mask); + avail_proc++; + } + 
KMP_CPU_FREE(offline_cpus); + } + + return avail_proc; +} + +// All of the __kmp_affinity_create_*_map() routines should allocate the +// internal topology object and set the layer ids for it. Each routine +// returns a boolean on whether it was successful at doing so. +kmp_affin_mask_t *__kmp_affin_fullMask = NULL; +// Original mask is a subset of full mask in multiple processor groups topology +kmp_affin_mask_t *__kmp_affin_origMask = NULL; + +#if KMP_USE_HWLOC +static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) { +#if HWLOC_API_VERSION >= 0x00020000 + return hwloc_obj_type_is_cache(obj->type); +#else + return obj->type == HWLOC_OBJ_CACHE; +#endif +} + +// Returns KMP_HW_* type derived from HWLOC_* type +static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) { + + if (__kmp_hwloc_is_cache_type(obj)) { + if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION) + return KMP_HW_UNKNOWN; + switch (obj->attr->cache.depth) { + case 1: + return KMP_HW_L1; + case 2: +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type == mic3) { + return KMP_HW_TILE; + } +#endif + return KMP_HW_L2; + case 3: + return KMP_HW_L3; + } + return KMP_HW_UNKNOWN; + } + + switch (obj->type) { + case HWLOC_OBJ_PACKAGE: + return KMP_HW_SOCKET; + case HWLOC_OBJ_NUMANODE: + return KMP_HW_NUMA; + case HWLOC_OBJ_CORE: + return KMP_HW_CORE; + case HWLOC_OBJ_PU: + return KMP_HW_THREAD; + case HWLOC_OBJ_GROUP: +#if HWLOC_API_VERSION >= 0x00020000 + if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE) + return KMP_HW_DIE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE) + return KMP_HW_TILE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE) + return KMP_HW_MODULE; + else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP) + return KMP_HW_PROC_GROUP; +#endif + return KMP_HW_UNKNOWN; +#if HWLOC_API_VERSION >= 0x00020100 + case HWLOC_OBJ_DIE: + return KMP_HW_DIE; +#endif + } + return KMP_HW_UNKNOWN; +} + +// Returns the number of objects of type 'type' below 'obj' within the topology +// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is +// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET +// object. 
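+// (Illustrative) for a package object with 4 cores and 2 PUs per core, asking
+// for HWLOC_OBJ_PU returns 8 and asking for HWLOC_OBJ_CORE returns 4.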
+static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, + hwloc_obj_type_t type) { + int retval = 0; + hwloc_obj_t first; + for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, + obj->logical_index, type, 0); + first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, + obj->type, first) == obj; + first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, + first)) { + ++retval; + } + return retval; +} + +// This gets the sub_id for a lower object under a higher object in the +// topology tree +static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher, + hwloc_obj_t lower) { + hwloc_obj_t obj; + hwloc_obj_type_t ltype = lower->type; + int lindex = lower->logical_index - 1; + int sub_id = 0; + // Get the previous lower object + obj = hwloc_get_obj_by_type(t, ltype, lindex); + while (obj && lindex >= 0 && + hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) { + if (obj->userdata) { + sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata)); + break; + } + sub_id++; + lindex--; + obj = hwloc_get_obj_by_type(t, ltype, lindex); + } + // store sub_id + 1 so that 0 is differed from NULL + lower->userdata = RCAST(void *, sub_id + 1); + return sub_id; +} + +static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { + kmp_hw_t type; + int hw_thread_index, sub_id; + int depth; + hwloc_obj_t pu, obj, root, prev; + kmp_hw_t types[KMP_HW_LAST]; + hwloc_obj_type_t hwloc_types[KMP_HW_LAST]; + + hwloc_topology_t tp = __kmp_hwloc_topology; + *msg_id = kmp_i18n_null; + if (__kmp_affinity.flags.verbose) { + KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); + } + + if (!KMP_AFFINITY_CAPABLE()) { + // Hack to try and infer the machine topology using only the data + // available from hwloc on the current thread, and __kmp_xproc. 
+ KMP_ASSERT(__kmp_affinity.type == affinity_none); + // hwloc only guarantees existence of PU object, so check PACKAGE and CORE + hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); + if (o != NULL) + nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE); + else + nCoresPerPkg = 1; // no PACKAGE found + o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0); + if (o != NULL) + __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU); + else + __kmp_nThreadsPerCore = 1; // no CORE found + __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; + if (nCoresPerPkg == 0) + nCoresPerPkg = 1; // to prevent possible division by 0 + nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; + return true; + } + +#if HWLOC_API_VERSION >= 0x00020400 + // Handle multiple types of cores if they exist on the system + int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0); + + typedef struct kmp_hwloc_cpukinds_info_t { + int efficiency; + kmp_hw_core_type_t core_type; + hwloc_bitmap_t mask; + } kmp_hwloc_cpukinds_info_t; + kmp_hwloc_cpukinds_info_t *cpukinds = nullptr; + + if (nr_cpu_kinds > 0) { + unsigned nr_infos; + struct hwloc_info_s *infos; + cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate( + sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds); + for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) { + cpukinds[idx].efficiency = -1; + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN; + cpukinds[idx].mask = hwloc_bitmap_alloc(); + if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask, + &cpukinds[idx].efficiency, &nr_infos, &infos, + 0) == 0) { + for (unsigned i = 0; i < nr_infos; ++i) { + if (__kmp_str_match("CoreType", 8, infos[i].name)) { +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("IntelAtom", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM; + break; + } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) { + cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE; + break; + } +#endif + } + } + } + } + } +#endif + + root = hwloc_get_root_obj(tp); + + // Figure out the depth and types in the topology + depth = 0; + pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin()); + KMP_ASSERT(pu); + obj = pu; + types[depth] = KMP_HW_THREAD; + hwloc_types[depth] = obj->type; + depth++; + while (obj != root && obj != NULL) { + obj = obj->parent; +#if HWLOC_API_VERSION >= 0x00020000 + if (obj->memory_arity) { + hwloc_obj_t memory; + for (memory = obj->memory_first_child; memory; + memory = hwloc_get_next_child(tp, obj, memory)) { + if (memory->type == HWLOC_OBJ_NUMANODE) + break; + } + if (memory && memory->type == HWLOC_OBJ_NUMANODE) { + types[depth] = KMP_HW_NUMA; + hwloc_types[depth] = memory->type; + depth++; + } + } +#endif + type = __kmp_hwloc_type_2_topology_type(obj); + if (type != KMP_HW_UNKNOWN) { + types[depth] = type; + hwloc_types[depth] = obj->type; + depth++; + } + } + KMP_ASSERT(depth > 0); + + // Get the order for the types correct + for (int i = 0, j = depth - 1; i < j; ++i, --j) { + hwloc_obj_type_t hwloc_temp = hwloc_types[i]; + kmp_hw_t temp = types[i]; + types[i] = types[j]; + types[j] = temp; + hwloc_types[i] = hwloc_types[j]; + hwloc_types[j] = hwloc_temp; + } + + // Allocate the data structure to be returned. 
+ __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); + + hw_thread_index = 0; + pu = NULL; + while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) { + int index = depth - 1; + bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask); + kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); + if (included) { + hw_thread.clear(); + hw_thread.ids[index] = pu->logical_index; + hw_thread.os_id = pu->os_index; + // If multiple core types, then set that attribute for the hardware thread +#if HWLOC_API_VERSION >= 0x00020400 + if (cpukinds) { + int cpukind_index = -1; + for (int i = 0; i < nr_cpu_kinds; ++i) { + if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) { + cpukind_index = i; + break; + } + } + if (cpukind_index >= 0) { + hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type); + hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); + } + } +#endif + index--; + } + obj = pu; + prev = obj; + while (obj != root && obj != NULL) { + obj = obj->parent; +#if HWLOC_API_VERSION >= 0x00020000 + // NUMA Nodes are handled differently since they are not within the + // parent/child structure anymore. They are separate children + // of obj (memory_first_child points to first memory child) + if (obj->memory_arity) { + hwloc_obj_t memory; + for (memory = obj->memory_first_child; memory; + memory = hwloc_get_next_child(tp, obj, memory)) { + if (memory->type == HWLOC_OBJ_NUMANODE) + break; + } + if (memory && memory->type == HWLOC_OBJ_NUMANODE) { + sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev); + if (included) { + hw_thread.ids[index] = memory->logical_index; + hw_thread.ids[index + 1] = sub_id; + index--; + } + prev = memory; + } + prev = obj; + } +#endif + type = __kmp_hwloc_type_2_topology_type(obj); + if (type != KMP_HW_UNKNOWN) { + sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev); + if (included) { + hw_thread.ids[index] = obj->logical_index; + hw_thread.ids[index + 1] = sub_id; + index--; + } + prev = obj; + } + } + if (included) + hw_thread_index++; + } + +#if HWLOC_API_VERSION >= 0x00020400 + // Free the core types information + if (cpukinds) { + for (int idx = 0; idx < nr_cpu_kinds; ++idx) + hwloc_bitmap_free(cpukinds[idx].mask); + __kmp_free(cpukinds); + } +#endif + __kmp_topology->sort_ids(); + return true; +} +#endif // KMP_USE_HWLOC + +// If we don't know how to retrieve the machine's processor topology, or +// encounter an error in doing so, this routine is called to form a "flat" +// mapping of os thread id's <-> processor id's. +static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) { + *msg_id = kmp_i18n_null; + int depth = 3; + kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}; + + if (__kmp_affinity.flags.verbose) { + KMP_INFORM(UsingFlatOS, "KMP_AFFINITY"); + } + + // Even if __kmp_affinity.type == affinity_none, this routine might still + // be called to set __kmp_ncores, as well as + // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + if (!KMP_AFFINITY_CAPABLE()) { + KMP_ASSERT(__kmp_affinity.type == affinity_none); + __kmp_ncores = nPackages = __kmp_xproc; + __kmp_nThreadsPerCore = nCoresPerPkg = 1; + return true; + } + + // When affinity is off, this routine will still be called to set + // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + // Make sure all these vars are set correctly, and return now if affinity is + // not enabled. 
+ __kmp_ncores = nPackages = __kmp_avail_proc; + __kmp_nThreadsPerCore = nCoresPerPkg = 1; + + // Construct the data structure to be returned. + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); + int avail_ct = 0; + int i; + KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { + // Skip this proc if it is not included in the machine model. + if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { + continue; + } + kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct); + hw_thread.clear(); + hw_thread.os_id = i; + hw_thread.ids[0] = i; + hw_thread.ids[1] = 0; + hw_thread.ids[2] = 0; + avail_ct++; + } + if (__kmp_affinity.flags.verbose) { + KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); + } + return true; +} + +#if KMP_GROUP_AFFINITY +// If multiple Windows* OS processor groups exist, we can create a 2-level +// topology map with the groups at level 0 and the individual procs at level 1. +// This facilitates letting the threads float among all procs in a group, +// if granularity=group (the default when there are multiple groups). +static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) { + *msg_id = kmp_i18n_null; + int depth = 3; + kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD}; + const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR); + + if (__kmp_affinity.flags.verbose) { + KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); + } + + // If we aren't affinity capable, then use flat topology + if (!KMP_AFFINITY_CAPABLE()) { + KMP_ASSERT(__kmp_affinity.type == affinity_none); + nPackages = __kmp_num_proc_groups; + __kmp_nThreadsPerCore = 1; + __kmp_ncores = __kmp_xproc; + nCoresPerPkg = nPackages / __kmp_ncores; + return true; + } + + // Construct the data structure to be returned. + __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); + int avail_ct = 0; + int i; + KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { + // Skip this proc if it is not included in the machine model. 
+ if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { + continue; + } + kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++); + hw_thread.clear(); + hw_thread.os_id = i; + hw_thread.ids[0] = i / BITS_PER_GROUP; + hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP; + } + return true; +} +#endif /* KMP_GROUP_AFFINITY */ + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +template <kmp_uint32 LSB, kmp_uint32 MSB> +static inline unsigned __kmp_extract_bits(kmp_uint32 v) { + const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB; + const kmp_uint32 SHIFT_RIGHT = LSB; + kmp_uint32 retval = v; + retval <<= SHIFT_LEFT; + retval >>= (SHIFT_LEFT + SHIFT_RIGHT); + return retval; +} + +static int __kmp_cpuid_mask_width(int count) { + int r = 0; + + while ((1 << r) < count) + ++r; + return r; +} + +class apicThreadInfo { +public: + unsigned osId; // param to __kmp_affinity_bind_thread + unsigned apicId; // from cpuid after binding + unsigned maxCoresPerPkg; // "" + unsigned maxThreadsPerPkg; // "" + unsigned pkgId; // inferred from above values + unsigned coreId; // "" + unsigned threadId; // "" +}; + +static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, + const void *b) { + const apicThreadInfo *aa = (const apicThreadInfo *)a; + const apicThreadInfo *bb = (const apicThreadInfo *)b; + if (aa->pkgId < bb->pkgId) + return -1; + if (aa->pkgId > bb->pkgId) + return 1; + if (aa->coreId < bb->coreId) + return -1; + if (aa->coreId > bb->coreId) + return 1; + if (aa->threadId < bb->threadId) + return -1; + if (aa->threadId > bb->threadId) + return 1; + return 0; +} + +class kmp_cache_info_t { +public: + struct info_t { + unsigned level, mask; + }; + kmp_cache_info_t() : depth(0) { get_leaf4_levels(); } + size_t get_depth() const { return depth; } + info_t &operator[](size_t index) { return table[index]; } + const info_t &operator[](size_t index) const { return table[index]; } + + static kmp_hw_t get_topology_type(unsigned level) { + KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL); + switch (level) { + case 1: + return KMP_HW_L1; + case 2: + return KMP_HW_L2; + case 3: + return KMP_HW_L3; + } + return KMP_HW_UNKNOWN; + } + +private: + static const int MAX_CACHE_LEVEL = 3; + + size_t depth; + info_t table[MAX_CACHE_LEVEL]; + + void get_leaf4_levels() { + unsigned level = 0; + while (depth < MAX_CACHE_LEVEL) { + unsigned cache_type, max_threads_sharing; + unsigned cache_level, cache_mask_width; + kmp_cpuid buf2; + __kmp_x86_cpuid(4, level, &buf2); + cache_type = __kmp_extract_bits<0, 4>(buf2.eax); + if (!cache_type) + break; + // Skip instruction caches + if (cache_type == 2) { + level++; + continue; + } + max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1; + cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing); + cache_level = __kmp_extract_bits<5, 7>(buf2.eax); + table[depth].level = cache_level; + table[depth].mask = ((-1) << cache_mask_width); + depth++; + level++; + } + } +}; + +// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use +// an algorithm which cycles through the available os threads, setting +// the current thread's affinity mask to that thread, and then retrieves +// the Apic Id for each thread context using the cpuid instruction. +static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { + kmp_cpuid buf; + *msg_id = kmp_i18n_null; + + if (__kmp_affinity.flags.verbose) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); + } + + // Check if cpuid leaf 4 is supported. 
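+  // cpuid(0) reports the highest supported standard leaf in EAX, so leaf 4
+  // (deterministic cache parameters) can only be used when that value is >= 4.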
+ __kmp_x86_cpuid(0, 0, &buf); + if (buf.eax < 4) { + *msg_id = kmp_i18n_str_NoLeaf4Support; + return false; + } + + // The algorithm used starts by setting the affinity to each available thread + // and retrieving info from the cpuid instruction, so if we are not capable of + // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we + // need to do something else - use the defaults that we calculated from + // issuing cpuid without binding to each proc. + if (!KMP_AFFINITY_CAPABLE()) { + // Hack to try and infer the machine topology using only the data + // available from cpuid on the current thread, and __kmp_xproc. + KMP_ASSERT(__kmp_affinity.type == affinity_none); + + // Get an upper bound on the number of threads per package using cpuid(1). + // On some OS/chip combinations where HT is supported by the chip but is + // disabled, this value will be 2 on a single core chip. Usually, it will be + // 2 if HT is enabled and 1 if HT is disabled. + __kmp_x86_cpuid(1, 0, &buf); + int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; + if (maxThreadsPerPkg == 0) { + maxThreadsPerPkg = 1; + } + + // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded + // value. + // + // The author of cpu_count.cpp treated this as only an upper bound on the + // number of cores, but I haven't seen any cases where it was greater than + // the actual number of cores, so we will treat it as exact in this block of + // code. + // + // First, we need to check if cpuid(4) is supported on this chip. To see if + // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or + // greater. + __kmp_x86_cpuid(0, 0, &buf); + if (buf.eax >= 4) { + __kmp_x86_cpuid(4, 0, &buf); + nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; + } else { + nCoresPerPkg = 1; + } + + // There is no way to reliably tell if HT is enabled without issuing the + // cpuid instruction from every thread and correlating the cpuid info, so + // if the machine is not affinity capable, we assume that HT is off. We have + // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine + // does not support HT. + // + // - Older OSes are usually found on machines with older chips, which do not + // support HT. + // - The performance penalty for mistakenly identifying a machine as HT when + // it isn't (which results in blocktime being incorrectly set to 0) is + // greater than the penalty for mistakenly identifying a machine as + // being 1 thread/core when it is really HT enabled (which results in + // blocktime being incorrectly set to a positive value). + __kmp_ncores = __kmp_xproc; + nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; + __kmp_nThreadsPerCore = 1; + return true; + } + + // From here on, we can assume that it is safe to call + // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if + // __kmp_affinity.type = affinity_none. + + // Save the affinity mask for the current thread. + kmp_affinity_raii_t previous_affinity; + + // Run through each of the available contexts, binding the current thread + // to it, and obtaining the pertinent information using the cpuid instr. + // + // The relevant information is: + // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context + // has a unique Apic Id, which is of the form pkg# : core# : thread#. + // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value + // of this field determines the width of the core# + thread# fields in the + // Apic Id. 
It is also an upper bound on the number of threads per + // package, but it has been verified that situations happen where it is not + // exact. In particular, on certain OS/chip combinations where Intel(R) + // Hyper-Threading Technology is supported by the chip but has been + // disabled, the value of this field will be 2 (for a single core chip). + // On other OS/chip combinations supporting Intel(R) Hyper-Threading + // Technology, the value of this field will be 1 when Intel(R) + // Hyper-Threading Technology is disabled and 2 when it is enabled. + // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value + // of this field (+1) determines the width of the core# field in the Apic + // Id. The comments in "cpucount.cpp" say that this value is an upper + // bound, but the IA-32 architecture manual says that it is exactly the + // number of cores per package, and I haven't seen any case where it + // wasn't. + // + // From this information, deduce the package Id, core Id, and thread Id, + // and set the corresponding fields in the apicThreadInfo struct. + unsigned i; + apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( + __kmp_avail_proc * sizeof(apicThreadInfo)); + unsigned nApics = 0; + KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { + // Skip this proc if it is not included in the machine model. + if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { + continue; + } + KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); + + __kmp_affinity_dispatch->bind_thread(i); + threadInfo[nApics].osId = i; + + // The apic id and max threads per pkg come from cpuid(1). + __kmp_x86_cpuid(1, 0, &buf); + if (((buf.edx >> 9) & 1) == 0) { + __kmp_free(threadInfo); + *msg_id = kmp_i18n_str_ApicNotPresent; + return false; + } + threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; + threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; + if (threadInfo[nApics].maxThreadsPerPkg == 0) { + threadInfo[nApics].maxThreadsPerPkg = 1; + } + + // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded + // value. + // + // First, we need to check if cpuid(4) is supported on this chip. To see if + // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n + // or greater. + __kmp_x86_cpuid(0, 0, &buf); + if (buf.eax >= 4) { + __kmp_x86_cpuid(4, 0, &buf); + threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; + } else { + threadInfo[nApics].maxCoresPerPkg = 1; + } + + // Infer the pkgId / coreId / threadId using only the info obtained locally. + int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg); + threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; + + int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg); + int widthT = widthCT - widthC; + if (widthT < 0) { + // I've never seen this one happen, but I suppose it could, if the cpuid + // instruction on a chip was really screwed up. Make sure to restore the + // affinity mask before the tail call. + __kmp_free(threadInfo); + *msg_id = kmp_i18n_str_InvalidCpuidInfo; + return false; + } + + int maskC = (1 << widthC) - 1; + threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC; + + int maskT = (1 << widthT) - 1; + threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT; + + nApics++; + } + + // We've collected all the info we need. + // Restore the old affinity mask for this thread. + previous_affinity.restore(); + + // Sort the threadInfo table by physical Id. 
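+  // The comparator orders entries by pkgId, then coreId, then threadId, so
+  // after the qsort threads sharing a core, and cores sharing a package, end
+  // up adjacent in the table.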
+ qsort(threadInfo, nApics, sizeof(*threadInfo), + __kmp_affinity_cmp_apicThreadInfo_phys_id); + + // The table is now sorted by pkgId / coreId / threadId, but we really don't + // know the radix of any of the fields. pkgId's may be sparsely assigned among + // the chips on a system. Although coreId's are usually assigned + // [0 .. coresPerPkg-1] and threadId's are usually assigned + // [0..threadsPerCore-1], we don't want to make any such assumptions. + // + // For that matter, we don't know what coresPerPkg and threadsPerCore (or the + // total # packages) are at this point - we want to determine that now. We + // only have an upper bound on the first two figures. + // + // We also perform a consistency check at this point: the values returned by + // the cpuid instruction for any thread bound to a given package had better + // return the same info for maxThreadsPerPkg and maxCoresPerPkg. + nPackages = 1; + nCoresPerPkg = 1; + __kmp_nThreadsPerCore = 1; + unsigned nCores = 1; + + unsigned pkgCt = 1; // to determine radii + unsigned lastPkgId = threadInfo[0].pkgId; + unsigned coreCt = 1; + unsigned lastCoreId = threadInfo[0].coreId; + unsigned threadCt = 1; + unsigned lastThreadId = threadInfo[0].threadId; + + // intra-pkg consist checks + unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; + unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; + + for (i = 1; i < nApics; i++) { + if (threadInfo[i].pkgId != lastPkgId) { + nCores++; + pkgCt++; + lastPkgId = threadInfo[i].pkgId; + if ((int)coreCt > nCoresPerPkg) + nCoresPerPkg = coreCt; + coreCt = 1; + lastCoreId = threadInfo[i].coreId; + if ((int)threadCt > __kmp_nThreadsPerCore) + __kmp_nThreadsPerCore = threadCt; + threadCt = 1; + lastThreadId = threadInfo[i].threadId; + + // This is a different package, so go on to the next iteration without + // doing any consistency checks. Reset the consistency check vars, though. + prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; + prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; + continue; + } + + if (threadInfo[i].coreId != lastCoreId) { + nCores++; + coreCt++; + lastCoreId = threadInfo[i].coreId; + if ((int)threadCt > __kmp_nThreadsPerCore) + __kmp_nThreadsPerCore = threadCt; + threadCt = 1; + lastThreadId = threadInfo[i].threadId; + } else if (threadInfo[i].threadId != lastThreadId) { + threadCt++; + lastThreadId = threadInfo[i].threadId; + } else { + __kmp_free(threadInfo); + *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; + return false; + } + + // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg + // fields agree between all the threads bounds to a given package. + if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || + (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { + __kmp_free(threadInfo); + *msg_id = kmp_i18n_str_InconsistentCpuidInfo; + return false; + } + } + // When affinity is off, this routine will still be called to set + // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + // Make sure all these vars are set correctly + nPackages = pkgCt; + if ((int)coreCt > nCoresPerPkg) + nCoresPerPkg = coreCt; + if ((int)threadCt > __kmp_nThreadsPerCore) + __kmp_nThreadsPerCore = threadCt; + __kmp_ncores = nCores; + KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc); + + // Now that we've determined the number of packages, the number of cores per + // package, and the number of threads per core, we can construct the data + // structure that is to be returned. 
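+  // (Illustrative) a 2-package machine with 8 cores/package and 2 threads/core
+  // would come out as depth 3 with types {SOCKET, CORE, THREAD}; this legacy
+  // APIC path always records exactly those three levels.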
+ int idx = 0; + int pkgLevel = 0; + int coreLevel = 1; + int threadLevel = 2; + //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1); + int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); + kmp_hw_t types[3]; + if (pkgLevel >= 0) + types[idx++] = KMP_HW_SOCKET; + if (coreLevel >= 0) + types[idx++] = KMP_HW_CORE; + if (threadLevel >= 0) + types[idx++] = KMP_HW_THREAD; + + KMP_ASSERT(depth > 0); + __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); + + for (i = 0; i < nApics; ++i) { + idx = 0; + unsigned os = threadInfo[i].osId; + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + hw_thread.clear(); + + if (pkgLevel >= 0) { + hw_thread.ids[idx++] = threadInfo[i].pkgId; + } + if (coreLevel >= 0) { + hw_thread.ids[idx++] = threadInfo[i].coreId; + } + if (threadLevel >= 0) { + hw_thread.ids[idx++] = threadInfo[i].threadId; + } + hw_thread.os_id = os; + } + + __kmp_free(threadInfo); + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; + return false; + } + return true; +} + +// Hybrid cpu detection using CPUID.1A +// Thread should be pinned to processor already +static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency, + unsigned *native_model_id) { + kmp_cpuid buf; + __kmp_x86_cpuid(0x1a, 0, &buf); + *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax); + switch (*type) { + case KMP_HW_CORE_TYPE_ATOM: + *efficiency = 0; + break; + case KMP_HW_CORE_TYPE_CORE: + *efficiency = 1; + break; + default: + *efficiency = 0; + } + *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); +} + +// Intel(R) microarchitecture code name Nehalem, Dunnington and later +// architectures support a newer interface for specifying the x2APIC Ids, +// based on CPUID.B or CPUID.1F +/* + * CPUID.B or 1F, Input ECX (sub leaf # aka level number) + Bits Bits Bits Bits + 31-16 15-8 7-4 4-0 +---+-----------+--------------+-------------+-----------------+ +EAX| reserved | reserved | reserved | Bits to Shift | +---+-----------|--------------+-------------+-----------------| +EBX| reserved | Num logical processors at level (16 bits) | +---+-----------|--------------+-------------------------------| +ECX| reserved | Level Type | Level Number (8 bits) | +---+-----------+--------------+-------------------------------| +EDX| X2APIC ID (32 bits) | +---+----------------------------------------------------------+ +*/ + +enum { + INTEL_LEVEL_TYPE_INVALID = 0, // Package level + INTEL_LEVEL_TYPE_SMT = 1, + INTEL_LEVEL_TYPE_CORE = 2, + INTEL_LEVEL_TYPE_MODULE = 3, + INTEL_LEVEL_TYPE_TILE = 4, + INTEL_LEVEL_TYPE_DIE = 5, + INTEL_LEVEL_TYPE_LAST = 6, +}; + +struct cpuid_level_info_t { + unsigned level_type, mask, mask_width, nitems, cache_mask; +}; + +static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { + switch (intel_type) { + case INTEL_LEVEL_TYPE_INVALID: + return KMP_HW_SOCKET; + case INTEL_LEVEL_TYPE_SMT: + return KMP_HW_THREAD; + case INTEL_LEVEL_TYPE_CORE: + return KMP_HW_CORE; + case INTEL_LEVEL_TYPE_TILE: + return KMP_HW_TILE; + case INTEL_LEVEL_TYPE_MODULE: + return KMP_HW_MODULE; + case INTEL_LEVEL_TYPE_DIE: + return KMP_HW_DIE; + } + return KMP_HW_UNKNOWN; +} + +// This function takes the topology leaf, a levels array to store the levels +// detected and a bitmap of the known levels. 
+// Returns the number of levels in the topology +static unsigned +__kmp_x2apicid_get_levels(int leaf, + cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], + kmp_uint64 known_levels) { + unsigned level, levels_index; + unsigned level_type, mask_width, nitems; + kmp_cpuid buf; + + // New algorithm has known topology layers act as highest unknown topology + // layers when unknown topology layers exist. + // e.g., Suppose layers were SMT CORE PACKAGE, where + // are unknown topology layers, Then SMT will take the characteristics of + // (SMT x ) and CORE will take the characteristics of (CORE x x ). + // This eliminates unknown portions of the topology while still keeping the + // correct structure. + level = levels_index = 0; + do { + __kmp_x86_cpuid(leaf, level, &buf); + level_type = __kmp_extract_bits<8, 15>(buf.ecx); + mask_width = __kmp_extract_bits<0, 4>(buf.eax); + nitems = __kmp_extract_bits<0, 15>(buf.ebx); + if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) + return 0; + + if (known_levels & (1ull << level_type)) { + // Add a new level to the topology + KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); + levels[levels_index].level_type = level_type; + levels[levels_index].mask_width = mask_width; + levels[levels_index].nitems = nitems; + levels_index++; + } else { + // If it is an unknown level, then logically move the previous layer up + if (levels_index > 0) { + levels[levels_index - 1].mask_width = mask_width; + levels[levels_index - 1].nitems = nitems; + } + } + level++; + } while (level_type != INTEL_LEVEL_TYPE_INVALID); + + // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first + if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID) + return 0; + + // Set the masks to & with apicid + for (unsigned i = 0; i < levels_index; ++i) { + if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { + levels[i].mask = ~((-1) << levels[i].mask_width); + levels[i].cache_mask = (-1) << levels[i].mask_width; + for (unsigned j = 0; j < i; ++j) + levels[i].mask ^= levels[j].mask; + } else { + KMP_DEBUG_ASSERT(i > 0); + levels[i].mask = (-1) << levels[i - 1].mask_width; + levels[i].cache_mask = 0; + } + } + return levels_index; +} + +static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { + + cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; + kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; + unsigned levels_index; + kmp_cpuid buf; + kmp_uint64 known_levels; + int topology_leaf, highest_leaf, apic_id; + int num_leaves; + static int leaves[] = {0, 0}; + + kmp_i18n_id_t leaf_message_id; + + KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); + + *msg_id = kmp_i18n_null; + if (__kmp_affinity.flags.verbose) { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); + } + + // Figure out the known topology levels + known_levels = 0ull; + for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { + if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { + known_levels |= (1ull << i); + } + } + + // Get the highest cpuid leaf supported + __kmp_x86_cpuid(0, 0, &buf); + highest_leaf = buf.eax; + + // If a specific topology method was requested, only allow that specific leaf + // otherwise, try both leaves 31 and 11 in that order + num_leaves = 0; + if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { + num_leaves = 1; + leaves[0] = 11; + leaf_message_id = kmp_i18n_str_NoLeaf11Support; + } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { + num_leaves = 1; + leaves[0] = 31; + leaf_message_id = 
kmp_i18n_str_NoLeaf31Support; + } else { + num_leaves = 2; + leaves[0] = 31; + leaves[1] = 11; + leaf_message_id = kmp_i18n_str_NoLeaf11Support; + } + + // Check to see if cpuid leaf 31 or 11 is supported. + __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; + topology_leaf = -1; + for (int i = 0; i < num_leaves; ++i) { + int leaf = leaves[i]; + if (highest_leaf < leaf) + continue; + __kmp_x86_cpuid(leaf, 0, &buf); + if (buf.ebx == 0) + continue; + topology_leaf = leaf; + levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); + if (levels_index == 0) + continue; + break; + } + if (topology_leaf == -1 || levels_index == 0) { + *msg_id = leaf_message_id; + return false; + } + KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); + + // The algorithm used starts by setting the affinity to each available thread + // and retrieving info from the cpuid instruction, so if we are not capable of + // calling __kmp_get_system_affinity() and __kmp_get_system_affinity(), then + // we need to do something else - use the defaults that we calculated from + // issuing cpuid without binding to each proc. + if (!KMP_AFFINITY_CAPABLE()) { + // Hack to try and infer the machine topology using only the data + // available from cpuid on the current thread, and __kmp_xproc. + KMP_ASSERT(__kmp_affinity.type == affinity_none); + for (unsigned i = 0; i < levels_index; ++i) { + if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { + __kmp_nThreadsPerCore = levels[i].nitems; + } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { + nCoresPerPkg = levels[i].nitems; + } + } + __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; + nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; + return true; + } + + // Allocate the data structure to be returned. + int depth = levels_index; + for (int i = depth - 1, j = 0; i >= 0; --i, ++j) + types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); + __kmp_topology = + kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); + + // Insert equivalent cache types if they exist + kmp_cache_info_t cache_info; + for (size_t i = 0; i < cache_info.get_depth(); ++i) { + const kmp_cache_info_t::info_t &info = cache_info[i]; + unsigned cache_mask = info.mask; + unsigned cache_level = info.level; + for (unsigned j = 0; j < levels_index; ++j) { + unsigned hw_cache_mask = levels[j].cache_mask; + kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); + if (hw_cache_mask == cache_mask && j < levels_index - 1) { + kmp_hw_t type = + __kmp_intel_type_2_topology_type(levels[j + 1].level_type); + __kmp_topology->set_equivalent_type(cache_type, type); + } + } + } + + // From here on, we can assume that it is safe to call + // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if + // __kmp_affinity.type = affinity_none. + + // Save the affinity mask for the current thread. + kmp_affinity_raii_t previous_affinity; + + // Run through each of the available contexts, binding the current thread + // to it, and obtaining the pertinent information using the cpuid instr. + unsigned int proc; + int hw_thread_index = 0; + KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { + cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; + unsigned my_levels_index; + + // Skip this proc if it is not included in the machine model. 
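// Worked example of the arithmetic in the !KMP_AFFINITY_CAPABLE() branch above,
// where only __kmp_xproc and the per-level nitems from one cpuid query are
// available. The numbers model a hypothetical 2-socket machine with 4 cores per
// socket and 2 hardware threads per core (16 logical CPUs).
#include <cstdio>

int main() {
  int xproc = 16;       // stand-in for __kmp_xproc
  int smt_nitems = 2;   // SMT level nitems (stored into __kmp_nThreadsPerCore)
  int core_nitems = 8;  // CORE level nitems (stored into nCoresPerPkg)
  int ncores = xproc / smt_nitems;                          // 8 cores in total
  int npackages = (xproc + core_nitems - 1) / core_nitems;  // ceil(16/8) = 2 packages
  std::printf("ncores=%d npackages=%d\n", ncores, npackages);
}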
+ if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { + continue; + } + KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); + + __kmp_affinity_dispatch->bind_thread(proc); + + // New algorithm + __kmp_x86_cpuid(topology_leaf, 0, &buf); + apic_id = buf.edx; + kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); + my_levels_index = + __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); + if (my_levels_index == 0 || my_levels_index != levels_index) { + *msg_id = kmp_i18n_str_InvalidCpuidInfo; + return false; + } + hw_thread.clear(); + hw_thread.os_id = proc; + // Put in topology information + for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { + hw_thread.ids[idx] = apic_id & my_levels[j].mask; + if (j > 0) { + hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; + } + } + // Hybrid information + if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { + kmp_hw_core_type_t type; + unsigned native_model_id; + int efficiency; + __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); + hw_thread.attrs.set_core_type(type); + hw_thread.attrs.set_core_eff(efficiency); + } + hw_thread_index++; + } + KMP_ASSERT(hw_thread_index > 0); + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; + return false; + } + return true; +} +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#define osIdIndex 0 +#define threadIdIndex 1 +#define coreIdIndex 2 +#define pkgIdIndex 3 +#define nodeIdIndex 4 + +typedef unsigned *ProcCpuInfo; +static unsigned maxIndex = pkgIdIndex; + +static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, + const void *b) { + unsigned i; + const unsigned *aa = *(unsigned *const *)a; + const unsigned *bb = *(unsigned *const *)b; + for (i = maxIndex;; i--) { + if (aa[i] < bb[i]) + return -1; + if (aa[i] > bb[i]) + return 1; + if (i == osIdIndex) + break; + } + return 0; +} + +#if KMP_USE_HIER_SCHED +// Set the array sizes for the hierarchy layers +static void __kmp_dispatch_set_hierarchy_values() { + // Set the maximum number of L1's to number of cores + // Set the maximum number of L2's to either number of cores / 2 for + // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing + // Or the number of cores for Intel(R) Xeon(R) processors + // Set the maximum number of NUMA nodes and L3's to number of packages + __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = + nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; + __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED + if (__kmp_mic_type >= mic3) + __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; + else +#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) + __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; + __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; + __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; + __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; + // Set the number of threads per unit + // Number of hardware threads per L1/L2/L3/NUMA/LOOP + __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; + __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = + __kmp_nThreadsPerCore; +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ + KMP_MIC_SUPPORTED + if (__kmp_mic_type >= mic3) + 
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = + 2 * __kmp_nThreadsPerCore; + else +#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) + __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = + __kmp_nThreadsPerCore; + __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = + nCoresPerPkg * __kmp_nThreadsPerCore; + __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = + nCoresPerPkg * __kmp_nThreadsPerCore; + __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = + nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; +} + +// Return the index into the hierarchy for this tid and layer type (L1, L2, etc) +// i.e., this thread's L1 or this thread's L2, etc. +int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { + int index = type + 1; + int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; + KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); + if (type == kmp_hier_layer_e::LAYER_THREAD) + return tid; + else if (type == kmp_hier_layer_e::LAYER_LOOP) + return 0; + KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); + if (tid >= num_hw_threads) + tid = tid % num_hw_threads; + return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; +} + +// Return the number of t1's per t2 +int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { + int i1 = t1 + 1; + int i2 = t2 + 1; + KMP_DEBUG_ASSERT(i1 <= i2); + KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); + KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); + KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); + // (nthreads/t2) / (nthreads/t1) = t1 / t2 + return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; +} +#endif // KMP_USE_HIER_SCHED + +static inline const char *__kmp_cpuinfo_get_filename() { + const char *filename; + if (__kmp_cpuinfo_file != nullptr) + filename = __kmp_cpuinfo_file; + else + filename = "/proc/cpuinfo"; + return filename; +} + +static inline const char *__kmp_cpuinfo_get_envvar() { + const char *envvar = nullptr; + if (__kmp_cpuinfo_file != nullptr) + envvar = "KMP_CPUINFO_FILE"; + return envvar; +} + +// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the +// affinity map. +static bool __kmp_affinity_create_cpuinfo_map(int *line, + kmp_i18n_id_t *const msg_id) { + const char *filename = __kmp_cpuinfo_get_filename(); + const char *envvar = __kmp_cpuinfo_get_envvar(); + *msg_id = kmp_i18n_null; + + if (__kmp_affinity.flags.verbose) { + KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); + } + + kmp_safe_raii_file_t f(filename, "r", envvar); + + // Scan of the file, and count the number of "processor" (osId) fields, + // and find the highest value of for a node_ field. + char buf[256]; + unsigned num_records = 0; + while (!feof(f)) { + buf[sizeof(buf) - 1] = 1; + if (!fgets(buf, sizeof(buf), f)) { + // Read errors presumably because of EOF + break; + } + + char s1[] = "processor"; + if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { + num_records++; + continue; + } + + // FIXME - this will match "node_ " + unsigned level; + if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { + // validate the input fisrt: + if (level > (unsigned)__kmp_xproc) { // level is too big + level = __kmp_xproc; + } + if (nodeIdIndex + level >= maxIndex) { + maxIndex = nodeIdIndex + level; + } + continue; + } + } + + // Check for empty file / no valid processor records, or too many. The number + // of records can't exceed the number of valid bits in the affinity mask. 
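// Numeric sketch of the tid-to-unit mapping computed by __kmp_dispatch_get_index()
// above. The layer sizes below are hypothetical and describe a machine with
// 2 packages x 4 cores x 2 hardware threads (16 hw threads, 8 cores, 2 packages).
#include <cstdio>

static int unit_index(int tid, int threads_per_unit, int num_units,
                      int num_hw_threads) {
  if (tid >= num_hw_threads) tid %= num_hw_threads;  // wrap oversubscribed tids
  return (tid / threads_per_unit) % num_units;
}

int main() {
  const int num_hw_threads = 16;
  const int tids[] = {0, 1, 5, 13, 18};
  for (int tid : tids) {
    // L1 (per core): 2 hw threads per unit, 8 units.
    // L3/NUMA (per package): 8 hw threads per unit, 2 units.
    std::printf("tid %2d -> core %d, package %d\n", tid,
                unit_index(tid, 2, 8, num_hw_threads),
                unit_index(tid, 8, 2, num_hw_threads));
  }
  // tid 18 exceeds the hw thread count and wraps to tid 2.
}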
+ if (num_records == 0) { + *msg_id = kmp_i18n_str_NoProcRecords; + return false; + } + if (num_records > (unsigned)__kmp_xproc) { + *msg_id = kmp_i18n_str_TooManyProcRecords; + return false; + } + + // Set the file pointer back to the beginning, so that we can scan the file + // again, this time performing a full parse of the data. Allocate a vector of + // ProcCpuInfo object, where we will place the data. Adding an extra element + // at the end allows us to remove a lot of extra checks for termination + // conditions. + if (fseek(f, 0, SEEK_SET) != 0) { + *msg_id = kmp_i18n_str_CantRewindCpuinfo; + return false; + } + + // Allocate the array of records to store the proc info in. The dummy + // element at the end makes the logic in filling them out easier to code. + unsigned **threadInfo = + (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); + unsigned i; + for (i = 0; i <= num_records; i++) { + threadInfo[i] = + (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); + } + +#define CLEANUP_THREAD_INFO \ + for (i = 0; i <= num_records; i++) { \ + __kmp_free(threadInfo[i]); \ + } \ + __kmp_free(threadInfo); + + // A value of UINT_MAX means that we didn't find the field + unsigned __index; + +#define INIT_PROC_INFO(p) \ + for (__index = 0; __index <= maxIndex; __index++) { \ + (p)[__index] = UINT_MAX; \ + } + + for (i = 0; i <= num_records; i++) { + INIT_PROC_INFO(threadInfo[i]); + } + + unsigned num_avail = 0; + *line = 0; +#if KMP_ARCH_S390X + bool reading_s390x_sys_info = true; +#endif + while (!feof(f)) { + // Create an inner scoping level, so that all the goto targets at the end of + // the loop appear in an outer scoping level. This avoids warnings about + // jumping past an initialization to a target in the same block. + { + buf[sizeof(buf) - 1] = 1; + bool long_line = false; + if (!fgets(buf, sizeof(buf), f)) { + // Read errors presumably because of EOF + // If there is valid data in threadInfo[num_avail], then fake + // a blank line in ensure that the last address gets parsed. + bool valid = false; + for (i = 0; i <= maxIndex; i++) { + if (threadInfo[num_avail][i] != UINT_MAX) { + valid = true; + } + } + if (!valid) { + break; + } + buf[0] = 0; + } else if (!buf[sizeof(buf) - 1]) { + // The line is longer than the buffer. Set a flag and don't + // emit an error if we were going to ignore the line, anyway. + long_line = true; + +#define CHECK_LINE \ + if (long_line) { \ + CLEANUP_THREAD_INFO; \ + *msg_id = kmp_i18n_str_LongLineCpuinfo; \ + return false; \ + } + } + (*line)++; + +#if KMP_ARCH_LOONGARCH64 + // The parsing logic of /proc/cpuinfo in this function highly depends on + // the blank lines between each processor info block. But on LoongArch a + // blank line exists before the first processor info block (i.e. after the + // "system type" line). This blank line was added because the "system + // type" line is unrelated to any of the CPUs. We must skip this line so + // that the original logic works on LoongArch. + if (*buf == '\n' && *line == 2) + continue; +#endif +#if KMP_ARCH_S390X + // s390x /proc/cpuinfo starts with a variable number of lines containing + // the overall system information. Skip them. 
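// Standalone demo of the long-line detection trick used by the parse loops above:
// the last byte of the buffer is set to a nonzero sentinel before fgets(), and
// fgets() only overwrites that byte (with the terminating '\0') when the incoming
// line fills the whole buffer, so a cleared sentinel means the line was truncated.
// The sample input written to a temporary file is made up.
#include <cstdio>

int main() {
  std::FILE *f = std::tmpfile();
  if (!f) return 1;
  std::fputs("processor\t: 0\n", f);
  for (int i = 0; i < 40; ++i) std::fputs("xxxxxxxxxx", f); // one 400-character line
  std::fputs("\n", f);
  std::rewind(f);

  char buf[32];
  while (!std::feof(f)) {
    buf[sizeof(buf) - 1] = 1;                      // sentinel
    if (!std::fgets(buf, (int)sizeof(buf), f)) break;
    bool long_line = !buf[sizeof(buf) - 1];        // fgets overwrote the sentinel
    std::printf("long_line=%d chunk=\"%.10s\"\n", (int)long_line, buf);
  }
  std::fclose(f);
}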
+ if (reading_s390x_sys_info) { + if (*buf == '\n') + reading_s390x_sys_info = false; + continue; + } +#endif + +#if KMP_ARCH_S390X + char s1[] = "cpu number"; +#else + char s1[] = "processor"; +#endif + if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { + CHECK_LINE; + char *p = strchr(buf + sizeof(s1) - 1, ':'); + unsigned val; + if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) + goto no_val; + if (threadInfo[num_avail][osIdIndex] != UINT_MAX) +#if KMP_ARCH_AARCH64 + // Handle the old AArch64 /proc/cpuinfo layout differently, + // it contains all of the 'processor' entries listed in a + // single 'Processor' section, therefore the normal looking + // for duplicates in that section will always fail. + num_avail++; +#else + goto dup_field; +#endif + threadInfo[num_avail][osIdIndex] = val; +#if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) + char path[256]; + KMP_SNPRINTF( + path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); + +#if KMP_ARCH_S390X + // Disambiguate physical_package_id. + unsigned book_id; + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/book_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &book_id); + threadInfo[num_avail][pkgIdIndex] |= (book_id << 8); + + unsigned drawer_id; + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/drawer_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &drawer_id); + threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16); +#endif + + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/core_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); + continue; +#else + } + char s2[] = "physical id"; + if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { + CHECK_LINE; + char *p = strchr(buf + sizeof(s2) - 1, ':'); + unsigned val; + if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) + goto no_val; + if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) + goto dup_field; + threadInfo[num_avail][pkgIdIndex] = val; + continue; + } + char s3[] = "core id"; + if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { + CHECK_LINE; + char *p = strchr(buf + sizeof(s3) - 1, ':'); + unsigned val; + if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) + goto no_val; + if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) + goto dup_field; + threadInfo[num_avail][coreIdIndex] = val; + continue; +#endif // KMP_OS_LINUX && USE_SYSFS_INFO + } + char s4[] = "thread id"; + if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { + CHECK_LINE; + char *p = strchr(buf + sizeof(s4) - 1, ':'); + unsigned val; + if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) + goto no_val; + if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) + goto dup_field; + threadInfo[num_avail][threadIdIndex] = val; + continue; + } + unsigned level; + if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { + CHECK_LINE; + char *p = strchr(buf + sizeof(s4) - 1, ':'); + unsigned val; + if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) + goto no_val; + // validate the input before using level: + if (level > (unsigned)__kmp_xproc) { // level is too big + level = __kmp_xproc; + } + if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) + goto dup_field; + threadInfo[num_avail][nodeIdIndex + level] = val; + continue; + } + + // We didn't recognize the leading token on the 
line. There are lots of + // leading tokens that we don't recognize - if the line isn't empty, go on + // to the next line. + if ((*buf != 0) && (*buf != '\n')) { + // If the line is longer than the buffer, read characters + // until we find a newline. + if (long_line) { + int ch; + while (((ch = fgetc(f)) != EOF) && (ch != '\n')) + ; + } + continue; + } + + // A newline has signalled the end of the processor record. + // Check that there aren't too many procs specified. + if ((int)num_avail == __kmp_xproc) { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_TooManyEntries; + return false; + } + + // Check for missing fields. The osId field must be there, and we + // currently require that the physical id field is specified, also. + if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_MissingProcField; + return false; + } + if (threadInfo[0][pkgIdIndex] == UINT_MAX) { + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_MissingPhysicalIDField; + return false; + } + + // Skip this proc if it is not included in the machine model. + if (KMP_AFFINITY_CAPABLE() && + !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], + __kmp_affin_fullMask)) { + INIT_PROC_INFO(threadInfo[num_avail]); + continue; + } + + // We have a successful parse of this proc's info. + // Increment the counter, and prepare for the next proc. + num_avail++; + KMP_ASSERT(num_avail <= num_records); + INIT_PROC_INFO(threadInfo[num_avail]); + } + continue; + + no_val: + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_MissingValCpuinfo; + return false; + + dup_field: + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; + return false; + } + *line = 0; + +#if KMP_MIC && REDUCE_TEAM_SIZE + unsigned teamSize = 0; +#endif // KMP_MIC && REDUCE_TEAM_SIZE + + // check for num_records == __kmp_xproc ??? + + // If it is configured to omit the package level when there is only a single + // package, the logic at the end of this routine won't work if there is only a + // single thread + KMP_ASSERT(num_avail > 0); + KMP_ASSERT(num_avail <= num_records); + + // Sort the threadInfo table by physical Id. + qsort(threadInfo, num_avail, sizeof(*threadInfo), + __kmp_affinity_cmp_ProcCpuInfo_phys_id); + + // The table is now sorted by pkgId / coreId / threadId, but we really don't + // know the radix of any of the fields. pkgId's may be sparsely assigned among + // the chips on a system. Although coreId's are usually assigned + // [0 .. coresPerPkg-1] and threadId's are usually assigned + // [0..threadsPerCore-1], we don't want to make any such assumptions. + // + // For that matter, we don't know what coresPerPkg and threadsPerCore (or the + // total # packages) are at this point - we want to determine that now. We + // only have an upper bound on the first two figures. + unsigned *counts = + (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); + unsigned *maxCt = + (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); + unsigned *totals = + (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); + unsigned *lastId = + (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); + + bool assign_thread_ids = false; + unsigned threadIdCt; + unsigned index; + +restart_radix_check: + threadIdCt = 0; + + // Initialize the counter arrays with data from threadInfo[0]. 
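// Minimal sketch of the per-line field matching used above: match a leading key
// with strncmp, find the ':' separator, then read the unsigned value after it.
// parse_field() and the sample /proc/cpuinfo lines are hypothetical, not part of
// the parser itself.
#include <cstdio>
#include <cstring>

static bool parse_field(const char *line, const char *key, unsigned *val) {
  size_t n = std::strlen(key);
  if (std::strncmp(line, key, n) != 0) return false;
  const char *p = std::strchr(line + n, ':');
  return p && std::sscanf(p + 1, "%u", val) == 1;
}

int main() {
  const char *lines[] = {"processor\t: 3", "physical id\t: 0", "core id\t\t: 1",
                         "model name\t: Some CPU"};
  for (const char *line : lines) {
    unsigned v;
    if (parse_field(line, "processor", &v))        std::printf("osId   = %u\n", v);
    else if (parse_field(line, "physical id", &v)) std::printf("pkgId  = %u\n", v);
    else if (parse_field(line, "core id", &v))     std::printf("coreId = %u\n", v);
    else                                           std::printf("ignored: %s\n", line);
  }
}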
+ if (assign_thread_ids) { + if (threadInfo[0][threadIdIndex] == UINT_MAX) { + threadInfo[0][threadIdIndex] = threadIdCt++; + } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { + threadIdCt = threadInfo[0][threadIdIndex] + 1; + } + } + for (index = 0; index <= maxIndex; index++) { + counts[index] = 1; + maxCt[index] = 1; + totals[index] = 1; + lastId[index] = threadInfo[0][index]; + ; + } + + // Run through the rest of the OS procs. + for (i = 1; i < num_avail; i++) { + // Find the most significant index whose id differs from the id for the + // previous OS proc. + for (index = maxIndex; index >= threadIdIndex; index--) { + if (assign_thread_ids && (index == threadIdIndex)) { + // Auto-assign the thread id field if it wasn't specified. + if (threadInfo[i][threadIdIndex] == UINT_MAX) { + threadInfo[i][threadIdIndex] = threadIdCt++; + } + // Apparently the thread id field was specified for some entries and not + // others. Start the thread id counter off at the next higher thread id. + else if (threadIdCt <= threadInfo[i][threadIdIndex]) { + threadIdCt = threadInfo[i][threadIdIndex] + 1; + } + } + if (threadInfo[i][index] != lastId[index]) { + // Run through all indices which are less significant, and reset the + // counts to 1. At all levels up to and including index, we need to + // increment the totals and record the last id. + unsigned index2; + for (index2 = threadIdIndex; index2 < index; index2++) { + totals[index2]++; + if (counts[index2] > maxCt[index2]) { + maxCt[index2] = counts[index2]; + } + counts[index2] = 1; + lastId[index2] = threadInfo[i][index2]; + } + counts[index]++; + totals[index]++; + lastId[index] = threadInfo[i][index]; + + if (assign_thread_ids && (index > threadIdIndex)) { + +#if KMP_MIC && REDUCE_TEAM_SIZE + // The default team size is the total #threads in the machine + // minus 1 thread for every core that has 3 or more threads. + teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); +#endif // KMP_MIC && REDUCE_TEAM_SIZE + + // Restart the thread counter, as we are on a new core. + threadIdCt = 0; + + // Auto-assign the thread id field if it wasn't specified. + if (threadInfo[i][threadIdIndex] == UINT_MAX) { + threadInfo[i][threadIdIndex] = threadIdCt++; + } + + // Apparently the thread id field was specified for some entries and + // not others. Start the thread id counter off at the next higher + // thread id. + else if (threadIdCt <= threadInfo[i][threadIdIndex]) { + threadIdCt = threadInfo[i][threadIdIndex] + 1; + } + } + break; + } + } + if (index < threadIdIndex) { + // If thread ids were specified, it is an error if they are not unique. + // Also, check that we waven't already restarted the loop (to be safe - + // shouldn't need to). + if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { + __kmp_free(lastId); + __kmp_free(totals); + __kmp_free(maxCt); + __kmp_free(counts); + CLEANUP_THREAD_INFO; + *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; + return false; + } + + // If the thread ids were not specified and we see entries that + // are duplicates, start the loop over and assign the thread ids manually. + assign_thread_ids = true; + goto restart_radix_check; + } + } + +#if KMP_MIC && REDUCE_TEAM_SIZE + // The default team size is the total #threads in the machine + // minus 1 thread for every core that has 3 or more threads. + teamSize += (threadIdCt <= 2) ? 
(threadIdCt) : (threadIdCt - 1); +#endif // KMP_MIC && REDUCE_TEAM_SIZE + + for (index = threadIdIndex; index <= maxIndex; index++) { + if (counts[index] > maxCt[index]) { + maxCt[index] = counts[index]; + } + } + + __kmp_nThreadsPerCore = maxCt[threadIdIndex]; + nCoresPerPkg = maxCt[coreIdIndex]; + nPackages = totals[pkgIdIndex]; + + // When affinity is off, this routine will still be called to set + // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. + // Make sure all these vars are set correctly, and return now if affinity is + // not enabled. + __kmp_ncores = totals[coreIdIndex]; + if (!KMP_AFFINITY_CAPABLE()) { + KMP_ASSERT(__kmp_affinity.type == affinity_none); + return true; + } + +#if KMP_MIC && REDUCE_TEAM_SIZE + // Set the default team size. + if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { + __kmp_dflt_team_nth = teamSize; + KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " + "__kmp_dflt_team_nth = %d\n", + __kmp_dflt_team_nth)); + } +#endif // KMP_MIC && REDUCE_TEAM_SIZE + + KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); + + // Count the number of levels which have more nodes at that level than at the + // parent's level (with there being an implicit root node of the top level). + // This is equivalent to saying that there is at least one node at this level + // which has a sibling. These levels are in the map, and the package level is + // always in the map. + bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); + for (index = threadIdIndex; index < maxIndex; index++) { + KMP_ASSERT(totals[index] >= totals[index + 1]); + inMap[index] = (totals[index] > totals[index + 1]); + } + inMap[maxIndex] = (totals[maxIndex] > 1); + inMap[pkgIdIndex] = true; + inMap[coreIdIndex] = true; + inMap[threadIdIndex] = true; + + int depth = 0; + int idx = 0; + kmp_hw_t types[KMP_HW_LAST]; + int pkgLevel = -1; + int coreLevel = -1; + int threadLevel = -1; + for (index = threadIdIndex; index <= maxIndex; index++) { + if (inMap[index]) { + depth++; + } + } + if (inMap[pkgIdIndex]) { + pkgLevel = idx; + types[idx++] = KMP_HW_SOCKET; + } + if (inMap[coreIdIndex]) { + coreLevel = idx; + types[idx++] = KMP_HW_CORE; + } + if (inMap[threadIdIndex]) { + threadLevel = idx; + types[idx++] = KMP_HW_THREAD; + } + KMP_ASSERT(depth > 0); + + // Construct the data structure that is to be returned. + __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); + + for (i = 0; i < num_avail; ++i) { + unsigned os = threadInfo[i][osIdIndex]; + int src_index; + kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + hw_thread.clear(); + hw_thread.os_id = os; + + idx = 0; + for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { + if (!inMap[src_index]) { + continue; + } + if (src_index == pkgIdIndex) { + hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; + } else if (src_index == coreIdIndex) { + hw_thread.ids[coreLevel] = threadInfo[i][src_index]; + } else if (src_index == threadIdIndex) { + hw_thread.ids[threadLevel] = threadInfo[i][src_index]; + } + } + } + + __kmp_free(inMap); + __kmp_free(lastId); + __kmp_free(totals); + __kmp_free(maxCt); + __kmp_free(counts); + CLEANUP_THREAD_INFO; + __kmp_topology->sort_ids(); + if (!__kmp_topology->check_ids()) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; + return false; + } + return true; +} + +// Create and return a table of affinity masks, indexed by OS thread ID. 
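// Worked example of the level-selection step above: a column of the cpuinfo table
// is kept only when it has more distinct units than its parent column (package,
// core and thread are always kept). The totals below are made up and describe a
// 2-package x 4-core x 2-thread table with a single "node_0 id" value.
#include <cstdio>

int main() {
  enum { osIdIndex, threadIdIndex, coreIdIndex, pkgIdIndex, nodeIdIndex };
  const unsigned maxIndex = nodeIdIndex;
  unsigned totals[] = {16, 16, 8, 2, 1}; // per-column unit counts after the radix scan
  bool inMap[maxIndex + 1] = {};
  for (unsigned index = threadIdIndex; index < maxIndex; index++)
    inMap[index] = totals[index] > totals[index + 1];
  inMap[maxIndex] = totals[maxIndex] > 1;  // node column dropped: only one node
  inMap[pkgIdIndex] = inMap[coreIdIndex] = inMap[threadIdIndex] = true;
  int depth = 0;
  for (unsigned index = threadIdIndex; index <= maxIndex; index++)
    if (inMap[index]) depth++;
  std::printf("depth=%d (socket/core/thread)\n", depth); // prints depth=3
}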
+// This routine handles OR'ing together all the affinity masks of threads +// that are sufficiently close, if granularity > fine. +template +static void __kmp_create_os_id_masks(unsigned *numUnique, + kmp_affinity_t &affinity, + FindNextFunctionType find_next) { + // First form a table of affinity masks in order of OS thread id. + int maxOsId; + int i; + int numAddrs = __kmp_topology->get_num_hw_threads(); + int depth = __kmp_topology->get_depth(); + const char *env_var = __kmp_get_affinity_env_var(affinity); + KMP_ASSERT(numAddrs); + KMP_ASSERT(depth); + + i = find_next(-1); + // If could not find HW thread location with attributes, then return and + // fallback to increment find_next and disregard core attributes. + if (i >= numAddrs) + return; + + maxOsId = 0; + for (i = numAddrs - 1;; --i) { + int osId = __kmp_topology->at(i).os_id; + if (osId > maxOsId) { + maxOsId = osId; + } + if (i == 0) + break; + } + affinity.num_os_id_masks = maxOsId + 1; + KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks); + KMP_ASSERT(affinity.gran_levels >= 0); + if (affinity.flags.verbose && (affinity.gran_levels > 0)) { + KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels); + } + if (affinity.gran_levels >= (int)depth) { + KMP_AFF_WARNING(affinity, AffThreadsMayMigrate); + } + + // Run through the table, forming the masks for all threads on each core. + // Threads on the same core will have identical kmp_hw_thread_t objects, not + // considering the last level, which must be the thread id. All threads on a + // core will appear consecutively. + int unique = 0; + int j = 0; // index of 1st thread on core + int leader = 0; + kmp_affin_mask_t *sum; + KMP_CPU_ALLOC_ON_STACK(sum); + KMP_CPU_ZERO(sum); + + i = j = leader = find_next(-1); + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); + kmp_full_mask_modifier_t full_mask; + for (i = find_next(i); i < numAddrs; i = find_next(i)) { + // If this thread is sufficiently close to the leader (within the + // granularity setting), then set the bit for this os thread in the + // affinity mask for this group, and go on to the next thread. + if (__kmp_topology->is_close(leader, i, affinity)) { + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); + continue; + } + + // For every thread in this group, copy the mask to the thread's entry in + // the OS Id mask table. Mark the first address as a leader. + for (; j < i; j = find_next(j)) { + int osId = __kmp_topology->at(j).os_id; + KMP_DEBUG_ASSERT(osId <= maxOsId); + kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); + KMP_CPU_COPY(mask, sum); + __kmp_topology->at(j).leader = (j == leader); + } + unique++; + + // Start a new mask. + leader = i; + full_mask.include(sum); + KMP_CPU_ZERO(sum); + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); + } + + // For every thread in last group, copy the mask to the thread's + // entry in the OS Id mask table. + for (; j < i; j = find_next(j)) { + int osId = __kmp_topology->at(j).os_id; + KMP_DEBUG_ASSERT(osId <= maxOsId); + kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); + KMP_CPU_COPY(mask, sum); + __kmp_topology->at(j).leader = (j == leader); + } + full_mask.include(sum); + unique++; + KMP_CPU_FREE_FROM_STACK(sum); + + // See if the OS Id mask table further restricts or changes the full mask + if (full_mask.restrict_to_mask() && affinity.flags.verbose) { + __kmp_topology->print(env_var); + } + + *numUnique = unique; +} + +// Stuff for the affinity proclist parsers. 
It's easier to declare these vars +// as file-static than to try and pass them through the calling sequence of +// the recursive-descent OMP_PLACES parser. +static kmp_affin_mask_t *newMasks; +static int numNewMasks; +static int nextNewMask; + +#define ADD_MASK(_mask) \ + { \ + if (nextNewMask >= numNewMasks) { \ + int i; \ + numNewMasks *= 2; \ + kmp_affin_mask_t *temp; \ + KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ + for (i = 0; i < numNewMasks / 2; i++) { \ + kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ + kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ + KMP_CPU_COPY(dest, src); \ + } \ + KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ + newMasks = temp; \ + } \ + KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ + nextNewMask++; \ + } + +#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ + { \ + if (((_osId) > _maxOsId) || \ + (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \ + } else { \ + ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ + } \ + } + +// Re-parse the proclist (for the explicit affinity type), and form the list +// of affinity newMasks indexed by gtid. +static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) { + int i; + kmp_affin_mask_t **out_masks = &affinity.masks; + unsigned *out_numMasks = &affinity.num_masks; + const char *proclist = affinity.proclist; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; + int maxOsId = affinity.num_os_id_masks - 1; + const char *scan = proclist; + const char *next = proclist; + + // We use malloc() for the temporary mask vector, so that we can use + // realloc() to extend it. + numNewMasks = 2; + KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); + nextNewMask = 0; + kmp_affin_mask_t *sumMask; + KMP_CPU_ALLOC(sumMask); + int setSize = 0; + + for (;;) { + int start, end, stride; + + SKIP_WS(scan); + next = scan; + if (*next == '\0') { + break; + } + + if (*next == '{') { + int num; + setSize = 0; + next++; // skip '{' + SKIP_WS(next); + scan = next; + + // Read the first integer in the set. + KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); + SKIP_DIGITS(next); + num = __kmp_str_to_int(scan, *next); + KMP_ASSERT2(num >= 0, "bad explicit proc list"); + + // Copy the mask for that osId to the sum (union) mask. + if ((num > maxOsId) || + (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); + KMP_CPU_ZERO(sumMask); + } else { + KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); + setSize = 1; + } + + for (;;) { + // Check for end of set. + SKIP_WS(next); + if (*next == '}') { + next++; // skip '}' + break; + } + + // Skip optional comma. + if (*next == ',') { + next++; + } + SKIP_WS(next); + + // Read the next integer in the set. + scan = next; + KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); + + SKIP_DIGITS(next); + num = __kmp_str_to_int(scan, *next); + KMP_ASSERT2(num >= 0, "bad explicit proc list"); + + // Add the mask for that osId to the sum mask. + if ((num > maxOsId) || + (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); + } else { + KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); + setSize++; + } + } + if (setSize > 0) { + ADD_MASK(sumMask); + } + + SKIP_WS(next); + if (*next == ',') { + next++; + } + scan = next; + continue; + } + + // Read the first integer. 
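// Sketch of how a "start-end:stride" item accepted by this parser expands into
// individual OS proc ids; the brace-set and comma handling is omitted and
// expand_range() is hypothetical.
#include <cstdio>
#include <vector>

static std::vector<int> expand_range(int start, int end, int stride) {
  std::vector<int> ids;
  if (stride > 0) {
    do { ids.push_back(start); start += stride; } while (start <= end);
  } else {
    do { ids.push_back(start); start += stride; } while (start >= end);
  }
  return ids;
}

int main() {
  for (int id : expand_range(2, 8, 2)) std::printf("%d ", id);  // "2-8:2"  -> 2 4 6 8
  std::printf("\n");
  for (int id : expand_range(9, 3, -3)) std::printf("%d ", id); // "9-3:-3" -> 9 6 3
  std::printf("\n");
}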
+ KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); + SKIP_DIGITS(next); + start = __kmp_str_to_int(scan, *next); + KMP_ASSERT2(start >= 0, "bad explicit proc list"); + SKIP_WS(next); + + // If this isn't a range, then add a mask to the list and go on. + if (*next != '-') { + ADD_MASK_OSID(start, osId2Mask, maxOsId); + + // Skip optional comma. + if (*next == ',') { + next++; + } + scan = next; + continue; + } + + // This is a range. Skip over the '-' and read in the 2nd int. + next++; // skip '-' + SKIP_WS(next); + scan = next; + KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); + SKIP_DIGITS(next); + end = __kmp_str_to_int(scan, *next); + KMP_ASSERT2(end >= 0, "bad explicit proc list"); + + // Check for a stride parameter + stride = 1; + SKIP_WS(next); + if (*next == ':') { + // A stride is specified. Skip over the ':" and read the 3rd int. + int sign = +1; + next++; // skip ':' + SKIP_WS(next); + scan = next; + if (*next == '-') { + sign = -1; + next++; + SKIP_WS(next); + scan = next; + } + KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); + SKIP_DIGITS(next); + stride = __kmp_str_to_int(scan, *next); + KMP_ASSERT2(stride >= 0, "bad explicit proc list"); + stride *= sign; + } + + // Do some range checks. + KMP_ASSERT2(stride != 0, "bad explicit proc list"); + if (stride > 0) { + KMP_ASSERT2(start <= end, "bad explicit proc list"); + } else { + KMP_ASSERT2(start >= end, "bad explicit proc list"); + } + KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); + + // Add the mask for each OS proc # to the list. + if (stride > 0) { + do { + ADD_MASK_OSID(start, osId2Mask, maxOsId); + start += stride; + } while (start <= end); + } else { + do { + ADD_MASK_OSID(start, osId2Mask, maxOsId); + start += stride; + } while (start >= end); + } + + // Skip optional comma. + SKIP_WS(next); + if (*next == ',') { + next++; + } + scan = next; + } + + *out_numMasks = nextNewMask; + if (nextNewMask == 0) { + *out_masks = NULL; + KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); + return; + } + KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); + for (i = 0; i < nextNewMask; i++) { + kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); + kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); + KMP_CPU_COPY(dest, src); + } + KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); + KMP_CPU_FREE(sumMask); +} + +/*----------------------------------------------------------------------------- +Re-parse the OMP_PLACES proc id list, forming the newMasks for the different +places. Again, Here is the grammar: + +place_list := place +place_list := place , place_list +place := num +place := place : num +place := place : num : signed +place := { subplacelist } +place := ! 
place // (lowest priority) +subplace_list := subplace +subplace_list := subplace , subplace_list +subplace := num +subplace := num : num +subplace := num : num : signed +signed := num +signed := + signed +signed := - signed +-----------------------------------------------------------------------------*/ +static void __kmp_process_subplace_list(const char **scan, + kmp_affinity_t &affinity, int maxOsId, + kmp_affin_mask_t *tempMask, + int *setSize) { + const char *next; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; + + for (;;) { + int start, count, stride, i; + + // Read in the starting proc id + SKIP_WS(*scan); + KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); + next = *scan; + SKIP_DIGITS(next); + start = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(start >= 0); + *scan = next; + + // valid follow sets are ',' ':' and '}' + SKIP_WS(*scan); + if (**scan == '}' || **scan == ',') { + if ((start > maxOsId) || + (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); + } else { + KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); + (*setSize)++; + } + if (**scan == '}') { + break; + } + (*scan)++; // skip ',' + continue; + } + KMP_ASSERT2(**scan == ':', "bad explicit places list"); + (*scan)++; // skip ':' + + // Read count parameter + SKIP_WS(*scan); + KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); + next = *scan; + SKIP_DIGITS(next); + count = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(count >= 0); + *scan = next; + + // valid follow sets are ',' ':' and '}' + SKIP_WS(*scan); + if (**scan == '}' || **scan == ',') { + for (i = 0; i < count; i++) { + if ((start > maxOsId) || + (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); + break; // don't proliferate warnings for large count + } else { + KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); + start++; + (*setSize)++; + } + } + if (**scan == '}') { + break; + } + (*scan)++; // skip ',' + continue; + } + KMP_ASSERT2(**scan == ':', "bad explicit places list"); + (*scan)++; // skip ':' + + // Read stride parameter + int sign = +1; + for (;;) { + SKIP_WS(*scan); + if (**scan == '+') { + (*scan)++; // skip '+' + continue; + } + if (**scan == '-') { + sign *= -1; + (*scan)++; // skip '-' + continue; + } + break; + } + SKIP_WS(*scan); + KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); + next = *scan; + SKIP_DIGITS(next); + stride = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(stride >= 0); + *scan = next; + stride *= sign; + + // valid follow sets are ',' and '}' + SKIP_WS(*scan); + if (**scan == '}' || **scan == ',') { + for (i = 0; i < count; i++) { + if ((start > maxOsId) || + (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); + break; // don't proliferate warnings for large count + } else { + KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); + start += stride; + (*setSize)++; + } + } + if (**scan == '}') { + break; + } + (*scan)++; // skip ',' + continue; + } + + KMP_ASSERT2(0, "bad explicit places list"); + } +} + +static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity, + int maxOsId, kmp_affin_mask_t *tempMask, + int *setSize) { + const char *next; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; + + // valid follow sets are '{' '!' 
and num + SKIP_WS(*scan); + if (**scan == '{') { + (*scan)++; // skip '{' + __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize); + KMP_ASSERT2(**scan == '}', "bad explicit places list"); + (*scan)++; // skip '}' + } else if (**scan == '!') { + (*scan)++; // skip '!' + __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize); + KMP_CPU_COMPLEMENT(maxOsId, tempMask); + } else if ((**scan >= '0') && (**scan <= '9')) { + next = *scan; + SKIP_DIGITS(next); + int num = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(num >= 0); + if ((num > maxOsId) || + (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); + } else { + KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); + (*setSize)++; + } + *scan = next; // skip num + } else { + KMP_ASSERT2(0, "bad explicit places list"); + } +} + +// static void +void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) { + int i, j, count, stride, sign; + kmp_affin_mask_t **out_masks = &affinity.masks; + unsigned *out_numMasks = &affinity.num_masks; + const char *placelist = affinity.proclist; + kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; + int maxOsId = affinity.num_os_id_masks - 1; + const char *scan = placelist; + const char *next = placelist; + + numNewMasks = 2; + KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); + nextNewMask = 0; + + // tempMask is modified based on the previous or initial + // place to form the current place + // previousMask contains the previous place + kmp_affin_mask_t *tempMask; + kmp_affin_mask_t *previousMask; + KMP_CPU_ALLOC(tempMask); + KMP_CPU_ZERO(tempMask); + KMP_CPU_ALLOC(previousMask); + KMP_CPU_ZERO(previousMask); + int setSize = 0; + + for (;;) { + __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize); + + // valid follow sets are ',' ':' and EOL + SKIP_WS(scan); + if (*scan == '\0' || *scan == ',') { + if (setSize > 0) { + ADD_MASK(tempMask); + } + KMP_CPU_ZERO(tempMask); + setSize = 0; + if (*scan == '\0') { + break; + } + scan++; // skip ',' + continue; + } + + KMP_ASSERT2(*scan == ':', "bad explicit places list"); + scan++; // skip ':' + + // Read count parameter + SKIP_WS(scan); + KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); + next = scan; + SKIP_DIGITS(next); + count = __kmp_str_to_int(scan, *next); + KMP_ASSERT(count >= 0); + scan = next; + + // valid follow sets are ',' ':' and EOL + SKIP_WS(scan); + if (*scan == '\0' || *scan == ',') { + stride = +1; + } else { + KMP_ASSERT2(*scan == ':', "bad explicit places list"); + scan++; // skip ':' + + // Read stride parameter + sign = +1; + for (;;) { + SKIP_WS(scan); + if (*scan == '+') { + scan++; // skip '+' + continue; + } + if (*scan == '-') { + sign *= -1; + scan++; // skip '-' + continue; + } + break; + } + SKIP_WS(scan); + KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); + next = scan; + SKIP_DIGITS(next); + stride = __kmp_str_to_int(scan, *next); + KMP_DEBUG_ASSERT(stride >= 0); + scan = next; + stride *= sign; + } + + // Add places determined by initial_place : count : stride + for (i = 0; i < count; i++) { + if (setSize == 0) { + break; + } + // Add the current place, then build the next place (tempMask) from that + KMP_CPU_COPY(previousMask, tempMask); + ADD_MASK(previousMask); + KMP_CPU_ZERO(tempMask); + setSize = 0; + KMP_CPU_SET_ITERATE(j, previousMask) { + if (!KMP_CPU_ISSET(j, previousMask)) { + continue; + } + if ((j + stride > maxOsId) || (j + stride < 0) || + (!KMP_CPU_ISSET(j, 
__kmp_affin_fullMask)) || + (!KMP_CPU_ISSET(j + stride, + KMP_CPU_INDEX(osId2Mask, j + stride)))) { + if (i < count - 1) { + KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride); + } + continue; + } + KMP_CPU_SET(j + stride, tempMask); + setSize++; + } + } + KMP_CPU_ZERO(tempMask); + setSize = 0; + + // valid follow sets are ',' and EOL + SKIP_WS(scan); + if (*scan == '\0') { + break; + } + if (*scan == ',') { + scan++; // skip ',' + continue; + } + + KMP_ASSERT2(0, "bad explicit places list"); + } + + *out_numMasks = nextNewMask; + if (nextNewMask == 0) { + *out_masks = NULL; + KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); + return; + } + KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); + KMP_CPU_FREE(tempMask); + KMP_CPU_FREE(previousMask); + for (i = 0; i < nextNewMask; i++) { + kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); + kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); + KMP_CPU_COPY(dest, src); + } + KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); +} + +#undef ADD_MASK +#undef ADD_MASK_OSID + +// This function figures out the deepest level at which there is at least one +// cluster/core with more than one processing unit bound to it. +static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { + int core_level = 0; + + for (int i = 0; i < nprocs; i++) { + const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); + for (int j = bottom_level; j > 0; j--) { + if (hw_thread.ids[j] > 0) { + if (core_level < (j - 1)) { + core_level = j - 1; + } + } + } + } + return core_level; +} + +// This function counts number of clusters/cores at given level. +static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, + int core_level) { + return __kmp_topology->get_count(core_level); +} +// This function finds to which cluster/core given processing unit is bound. +static int __kmp_affinity_find_core(int proc, int bottom_level, + int core_level) { + int core = 0; + KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); + for (int i = 0; i <= proc; ++i) { + if (i + 1 <= proc) { + for (int j = 0; j <= core_level; ++j) { + if (__kmp_topology->at(i + 1).sub_ids[j] != + __kmp_topology->at(i).sub_ids[j]) { + core++; + break; + } + } + } + } + return core; +} + +// This function finds maximal number of processing units bound to a +// cluster/core at given level. 
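// Standalone sketch mirroring the counting walk in __kmp_affinity_find_core()
// above: the core index is bumped whenever any sub id at or above core_level
// changes between consecutive hw threads. The sub_ids table is a made-up
// 2-socket, 2-cores-per-socket, 2-threads-per-core machine.
#include <cstdio>

int main() {
  // sub_ids[i] = {socket, core, thread} for hw thread i, in topology order
  const int sub_ids[8][3] = {{0,0,0}, {0,0,1}, {0,1,0}, {0,1,1},
                             {1,0,0}, {1,0,1}, {1,1,0}, {1,1,1}};
  const int core_level = 1; // ids 0..1 (socket, core) identify a core
  for (int proc = 0; proc < 8; ++proc) {
    int core = 0;
    for (int i = 0; i + 1 <= proc; ++i)
      for (int j = 0; j <= core_level; ++j)
        if (sub_ids[i + 1][j] != sub_ids[i][j]) { core++; break; }
    std::printf("hw thread %d -> core %d\n", proc, core);
  }
  // hw threads 0,1 -> core 0; 2,3 -> core 1; 4,5 -> core 2; 6,7 -> core 3
}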
+static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, + int core_level) { + if (core_level >= bottom_level) + return 1; + int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); + return __kmp_topology->calculate_ratio(thread_level, core_level); +} + +static int *procarr = NULL; +static int __kmp_aff_depth = 0; +static int *__kmp_osid_to_hwthread_map = NULL; + +static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask, + kmp_affinity_ids_t &ids, + kmp_affinity_attrs_t &attrs) { + if (!KMP_AFFINITY_CAPABLE()) + return; + + // Initiailze ids and attrs thread data + for (int i = 0; i < KMP_HW_LAST; ++i) + ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID; + attrs = KMP_AFFINITY_ATTRS_UNKNOWN; + + // Iterate through each os id within the mask and determine + // the topology id and attribute information + int cpu; + int depth = __kmp_topology->get_depth(); + KMP_CPU_SET_ITERATE(cpu, mask) { + int osid_idx = __kmp_osid_to_hwthread_map[cpu]; + ids.os_id = cpu; + const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx); + for (int level = 0; level < depth; ++level) { + kmp_hw_t type = __kmp_topology->get_type(level); + int id = hw_thread.sub_ids[level]; + if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) { + ids.ids[type] = id; + } else { + // This mask spans across multiple topology units, set it as such + // and mark every level below as such as well. + ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID; + for (; level < depth; ++level) { + kmp_hw_t type = __kmp_topology->get_type(level); + ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID; + } + } + } + if (!attrs.valid) { + attrs.core_type = hw_thread.attrs.get_core_type(); + attrs.core_eff = hw_thread.attrs.get_core_eff(); + attrs.valid = 1; + } else { + // This mask spans across multiple attributes, set it as such + if (attrs.core_type != hw_thread.attrs.get_core_type()) + attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN; + if (attrs.core_eff != hw_thread.attrs.get_core_eff()) + attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF; + } + } +} + +static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) { + if (!KMP_AFFINITY_CAPABLE()) + return; + const kmp_affin_mask_t *mask = th->th.th_affin_mask; + kmp_affinity_ids_t &ids = th->th.th_topology_ids; + kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs; + __kmp_affinity_get_mask_topology_info(mask, ids, attrs); +} + +// Assign the topology information to each place in the place list +// A thread can then grab not only its affinity mask, but the topology +// information associated with that mask. 
e.g., Which socket is a thread on +static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) { + if (!KMP_AFFINITY_CAPABLE()) + return; + if (affinity.type != affinity_none) { + KMP_ASSERT(affinity.num_os_id_masks); + KMP_ASSERT(affinity.os_id_masks); + } + KMP_ASSERT(affinity.num_masks); + KMP_ASSERT(affinity.masks); + KMP_ASSERT(__kmp_affin_fullMask); + + int max_cpu = __kmp_affin_fullMask->get_max_cpu(); + int num_hw_threads = __kmp_topology->get_num_hw_threads(); + + // Allocate thread topology information + if (!affinity.ids) { + affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate( + sizeof(kmp_affinity_ids_t) * affinity.num_masks); + } + if (!affinity.attrs) { + affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate( + sizeof(kmp_affinity_attrs_t) * affinity.num_masks); + } + if (!__kmp_osid_to_hwthread_map) { + // Want the +1 because max_cpu should be valid index into map + __kmp_osid_to_hwthread_map = + (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1)); + } + + // Create the OS proc to hardware thread map + for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) { + int os_id = __kmp_topology->at(hw_thread).os_id; + if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask)) + __kmp_osid_to_hwthread_map[os_id] = hw_thread; + } + + for (unsigned i = 0; i < affinity.num_masks; ++i) { + kmp_affinity_ids_t &ids = affinity.ids[i]; + kmp_affinity_attrs_t &attrs = affinity.attrs[i]; + kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i); + __kmp_affinity_get_mask_topology_info(mask, ids, attrs); + } +} + +// Called when __kmp_topology is ready +static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) { + // Initialize other data structures which depend on the topology + if (__kmp_topology && __kmp_topology->get_num_hw_threads()) { + machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); + __kmp_affinity_get_topology_info(affinity); +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore(); +#endif + } +} + +// Create a one element mask array (set of places) which only contains the +// initial process's affinity mask +static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) { + KMP_ASSERT(__kmp_affin_fullMask != NULL); + KMP_ASSERT(affinity.type == affinity_none); + KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); + affinity.num_masks = 1; + KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks); + kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0); + KMP_CPU_COPY(dest, __kmp_affin_fullMask); + __kmp_aux_affinity_initialize_other_data(affinity); +} + +static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) { + // Create the "full" mask - this defines all of the processors that we + // consider to be in the machine model. If respect is set, then it is the + // initialization thread's affinity mask. Otherwise, it is all processors that + // we know about on the machine. 
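// Simple sketch of the OS-proc-to-hardware-thread lookup built above: the map is
// sized by the largest OS proc id plus one so an OS id can be used directly as an
// index, with -1 marking ids outside the machine model. The os_ids below are made
// up (gaps are normal when CPUs are offline or excluded from the full mask).
#include <cstdio>
#include <vector>

int main() {
  const int os_ids[] = {0, 1, 4, 5, 8, 9}; // os_id of hw threads 0..5
  const int num_hw_threads = 6;
  const int max_cpu = 9;
  std::vector<int> osid_to_hwthread(max_cpu + 1, -1);
  for (int hw = 0; hw < num_hw_threads; ++hw)
    osid_to_hwthread[os_ids[hw]] = hw;
  const int queries[] = {5, 8, 2};
  for (int cpu : queries)
    std::printf("os id %d -> hw thread %d\n", cpu, osid_to_hwthread[cpu]);
  // Prints: os id 5 -> 3, os id 8 -> 4, os id 2 -> -1
}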
+ int verbose = affinity.flags.verbose; + const char *env_var = affinity.env_var; + + // Already initialized + if (__kmp_affin_fullMask && __kmp_affin_origMask) + return; + + if (__kmp_affin_fullMask == NULL) { + KMP_CPU_ALLOC(__kmp_affin_fullMask); + } + if (__kmp_affin_origMask == NULL) { + KMP_CPU_ALLOC(__kmp_affin_origMask); + } + if (KMP_AFFINITY_CAPABLE()) { + __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); + // Make a copy before possible expanding to the entire machine mask + __kmp_affin_origMask->copy(__kmp_affin_fullMask); + if (affinity.flags.respect) { + // Count the number of available processors. + unsigned i; + __kmp_avail_proc = 0; + KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { + if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { + continue; + } + __kmp_avail_proc++; + } + if (__kmp_avail_proc > __kmp_xproc) { + KMP_AFF_WARNING(affinity, ErrorInitializeAffinity); + affinity.type = affinity_none; + KMP_AFFINITY_DISABLE(); + return; + } + + if (verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + __kmp_affin_fullMask); + KMP_INFORM(InitOSProcSetRespect, env_var, buf); + } + } else { + if (verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + __kmp_affin_fullMask); + KMP_INFORM(InitOSProcSetNotRespect, env_var, buf); + } + __kmp_avail_proc = + __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); +#if KMP_OS_WINDOWS + if (__kmp_num_proc_groups <= 1) { + // Copy expanded full mask if topology has single processor group + __kmp_affin_origMask->copy(__kmp_affin_fullMask); + } + // Set the process affinity mask since threads' affinity + // masks must be subset of process mask in Windows* OS + __kmp_affin_fullMask->set_process_affinity(true); +#endif + } + } +} + +static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) { + bool success = false; + const char *env_var = affinity.env_var; + kmp_i18n_id_t msg_id = kmp_i18n_null; + int verbose = affinity.flags.verbose; + + // For backward compatibility, setting KMP_CPUINFO_FILE => + // KMP_TOPOLOGY_METHOD=cpuinfo + if ((__kmp_cpuinfo_file != NULL) && + (__kmp_affinity_top_method == affinity_top_method_all)) { + __kmp_affinity_top_method = affinity_top_method_cpuinfo; + } + + if (__kmp_affinity_top_method == affinity_top_method_all) { +// In the default code path, errors are not fatal - we just try using +// another method. We only emit a warning message if affinity is on, or the +// verbose flag is set, an the nowarnings flag was not set. 
+#if KMP_USE_HWLOC + if (!success && + __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { + if (!__kmp_hwloc_error) { + success = __kmp_affinity_create_hwloc_map(&msg_id); + if (!success && verbose) { + KMP_INFORM(AffIgnoringHwloc, env_var); + } + } else if (verbose) { + KMP_INFORM(AffIgnoringHwloc, env_var); + } + } +#endif + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (!success) { + success = __kmp_affinity_create_x2apicid_map(&msg_id); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); + } + } + if (!success) { + success = __kmp_affinity_create_apicid_map(&msg_id); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); + } + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#if KMP_OS_LINUX + if (!success) { + int line = 0; + success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); + } + } +#endif /* KMP_OS_LINUX */ + +#if KMP_GROUP_AFFINITY + if (!success && (__kmp_num_proc_groups > 1)) { + success = __kmp_affinity_create_proc_group_map(&msg_id); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); + } + } +#endif /* KMP_GROUP_AFFINITY */ + + if (!success) { + success = __kmp_affinity_create_flat_map(&msg_id); + if (!success && verbose && msg_id != kmp_i18n_null) { + KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id)); + } + KMP_ASSERT(success); + } + } + +// If the user has specified that a paricular topology discovery method is to be +// used, then we abort if that method fails. The exception is group affinity, +// which might have been implicitly set. 
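+// For example, KMP_TOPOLOGY_METHOD=cpuinfo (or KMP_CPUINFO_FILE, which implies
+// it as handled above) selects only the cpuinfo branch below, and a parse
+// failure is then fatal instead of falling back to the flat map.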
+#if KMP_USE_HWLOC + else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { + KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); + success = __kmp_affinity_create_hwloc_map(&msg_id); + if (!success) { + KMP_ASSERT(msg_id != kmp_i18n_null); + KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); + } + } +#endif // KMP_USE_HWLOC + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || + __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { + success = __kmp_affinity_create_x2apicid_map(&msg_id); + if (!success) { + KMP_ASSERT(msg_id != kmp_i18n_null); + KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); + } + } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { + success = __kmp_affinity_create_apicid_map(&msg_id); + if (!success) { + KMP_ASSERT(msg_id != kmp_i18n_null); + KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); + } + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + + else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { + int line = 0; + success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); + if (!success) { + KMP_ASSERT(msg_id != kmp_i18n_null); + const char *filename = __kmp_cpuinfo_get_filename(); + if (line > 0) { + KMP_FATAL(FileLineMsgExiting, filename, line, + __kmp_i18n_catgets(msg_id)); + } else { + KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); + } + } + } + +#if KMP_GROUP_AFFINITY + else if (__kmp_affinity_top_method == affinity_top_method_group) { + success = __kmp_affinity_create_proc_group_map(&msg_id); + KMP_ASSERT(success); + if (!success) { + KMP_ASSERT(msg_id != kmp_i18n_null); + KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); + } + } +#endif /* KMP_GROUP_AFFINITY */ + + else if (__kmp_affinity_top_method == affinity_top_method_flat) { + success = __kmp_affinity_create_flat_map(&msg_id); + // should not fail + KMP_ASSERT(success); + } + + // Early exit if topology could not be created + if (!__kmp_topology) { + if (KMP_AFFINITY_CAPABLE()) { + KMP_AFF_WARNING(affinity, ErrorInitializeAffinity); + } + if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && + __kmp_ncores > 0) { + __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); + __kmp_topology->canonicalize(nPackages, nCoresPerPkg, + __kmp_nThreadsPerCore, __kmp_ncores); + if (verbose) { + __kmp_topology->print(env_var); + } + } + return false; + } + + // Canonicalize, print (if requested), apply KMP_HW_SUBSET + __kmp_topology->canonicalize(); + if (verbose) + __kmp_topology->print(env_var); + bool filtered = __kmp_topology->filter_hw_subset(); + if (filtered && verbose) + __kmp_topology->print("KMP_HW_SUBSET"); + return success; +} + +static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { + bool is_regular_affinity = (&affinity == &__kmp_affinity); + bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity); + const char *env_var = __kmp_get_affinity_env_var(affinity); + + if (affinity.flags.initialized) { + KMP_ASSERT(__kmp_affin_fullMask != NULL); + return; + } + + if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask)) + __kmp_aux_affinity_initialize_masks(affinity); + + if (is_regular_affinity && !__kmp_topology) { + bool success = __kmp_aux_affinity_initialize_topology(affinity); + if (success) { + KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); + } else { + affinity.type = affinity_none; + KMP_AFFINITY_DISABLE(); + } + } + + // If KMP_AFFINITY=none, then only create 
the single "none" place + // which is the process's initial affinity mask or the number of + // hardware threads depending on respect,norespect + if (affinity.type == affinity_none) { + __kmp_create_affinity_none_places(affinity); +#if KMP_USE_HIER_SCHED + __kmp_dispatch_set_hierarchy_values(); +#endif + affinity.flags.initialized = TRUE; + return; + } + + __kmp_topology->set_granularity(affinity); + int depth = __kmp_topology->get_depth(); + + // Create the table of masks, indexed by thread Id. + unsigned numUnique; + int numAddrs = __kmp_topology->get_num_hw_threads(); + // If OMP_PLACES=cores: specified, then attempt + // to make OS Id mask table using those attributes + if (affinity.core_attr_gran.valid) { + __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) { + KMP_ASSERT(idx >= -1); + for (int i = idx + 1; i < numAddrs; ++i) + if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran)) + return i; + return numAddrs; + }); + if (!affinity.os_id_masks) { + const char *core_attribute; + if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF) + core_attribute = "core_efficiency"; + else + core_attribute = "core_type"; + KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var, + core_attribute, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)) + } + } + // If core attributes did not work, or none were specified, + // then make OS Id mask table using typical incremental way. + if (!affinity.os_id_masks) { + __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) { + KMP_ASSERT(idx >= -1); + return idx + 1; + }); + } + if (affinity.gran_levels == 0) { + KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); + } + + switch (affinity.type) { + + case affinity_explicit: + KMP_DEBUG_ASSERT(affinity.proclist != NULL); + if (is_hidden_helper_affinity || + __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { + __kmp_affinity_process_proclist(affinity); + } else { + __kmp_affinity_process_placelist(affinity); + } + if (affinity.num_masks == 0) { + KMP_AFF_WARNING(affinity, AffNoValidProcID); + affinity.type = affinity_none; + __kmp_create_affinity_none_places(affinity); + affinity.flags.initialized = TRUE; + return; + } + break; + + // The other affinity types rely on sorting the hardware threads according to + // some permutation of the machine topology tree. Set affinity.compact + // and affinity.offset appropriately, then jump to a common code + // fragment to do the sort and create the array of affinity masks. 
+ case affinity_logical: + affinity.compact = 0; + if (affinity.offset) { + affinity.offset = + __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc; + } + goto sortTopology; + + case affinity_physical: + if (__kmp_nThreadsPerCore > 1) { + affinity.compact = 1; + if (affinity.compact >= depth) { + affinity.compact = 0; + } + } else { + affinity.compact = 0; + } + if (affinity.offset) { + affinity.offset = + __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc; + } + goto sortTopology; + + case affinity_scatter: + if (affinity.compact >= depth) { + affinity.compact = 0; + } else { + affinity.compact = depth - 1 - affinity.compact; + } + goto sortTopology; + + case affinity_compact: + if (affinity.compact >= depth) { + affinity.compact = depth - 1; + } + goto sortTopology; + + case affinity_balanced: + if (depth <= 1 || is_hidden_helper_affinity) { + KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var); + affinity.type = affinity_none; + __kmp_create_affinity_none_places(affinity); + affinity.flags.initialized = TRUE; + return; + } else if (!__kmp_topology->is_uniform()) { + // Save the depth for further usage + __kmp_aff_depth = depth; + + int core_level = + __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); + int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, + core_level); + int maxprocpercore = __kmp_affinity_max_proc_per_core( + __kmp_avail_proc, depth - 1, core_level); + + int nproc = ncores * maxprocpercore; + if ((nproc < 2) || (nproc < __kmp_avail_proc)) { + KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var); + affinity.type = affinity_none; + __kmp_create_affinity_none_places(affinity); + affinity.flags.initialized = TRUE; + return; + } + + procarr = (int *)__kmp_allocate(sizeof(int) * nproc); + for (int i = 0; i < nproc; i++) { + procarr[i] = -1; + } + + int lastcore = -1; + int inlastcore = 0; + for (int i = 0; i < __kmp_avail_proc; i++) { + int proc = __kmp_topology->at(i).os_id; + int core = __kmp_affinity_find_core(i, depth - 1, core_level); + + if (core == lastcore) { + inlastcore++; + } else { + inlastcore = 0; + } + lastcore = core; + + procarr[core * maxprocpercore + inlastcore] = proc; + } + } + if (affinity.compact >= depth) { + affinity.compact = depth - 1; + } + + sortTopology: + // Allocate the gtid->affinity mask table. + if (affinity.flags.dups) { + affinity.num_masks = __kmp_avail_proc; + } else { + affinity.num_masks = numUnique; + } + + if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && + (__kmp_affinity_num_places > 0) && + ((unsigned)__kmp_affinity_num_places < affinity.num_masks) && + !is_hidden_helper_affinity) { + affinity.num_masks = __kmp_affinity_num_places; + } + + KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks); + + // Sort the topology table according to the current setting of + // affinity.compact, then fill out affinity.masks. 
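+    // Worked example (illustrative): with a depth-3 topology (socket, core,
+    // thread), scatter rewrites a requested compact of 0 to depth - 1 - 0 == 2
+    // while compact leaves it at 0; and with __kmp_nThreadsPerCore == 2 and
+    // __kmp_avail_proc == 16, a requested offset of 3 is rescaled above to
+    // 2 * 3 % 16 == 6.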
+ __kmp_topology->sort_compact(affinity); + { + int i; + unsigned j; + int num_hw_threads = __kmp_topology->get_num_hw_threads(); + kmp_full_mask_modifier_t full_mask; + for (i = 0, j = 0; i < num_hw_threads; i++) { + if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) { + continue; + } + int osId = __kmp_topology->at(i).os_id; + + kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId); + kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j); + KMP_ASSERT(KMP_CPU_ISSET(osId, src)); + KMP_CPU_COPY(dest, src); + full_mask.include(src); + if (++j >= affinity.num_masks) { + break; + } + } + KMP_DEBUG_ASSERT(j == affinity.num_masks); + // See if the places list further restricts or changes the full mask + if (full_mask.restrict_to_mask() && affinity.flags.verbose) { + __kmp_topology->print(env_var); + } + } + // Sort the topology back using ids + __kmp_topology->sort_ids(); + break; + + default: + KMP_ASSERT2(0, "Unexpected affinity setting"); + } + __kmp_aux_affinity_initialize_other_data(affinity); + affinity.flags.initialized = TRUE; +} + +void __kmp_affinity_initialize(kmp_affinity_t &affinity) { + // Much of the code above was written assuming that if a machine was not + // affinity capable, then affinity type == affinity_none. + // We now explicitly represent this as affinity type == affinity_disabled. + // There are too many checks for affinity type == affinity_none in this code. + // Instead of trying to change them all, check if + // affinity type == affinity_disabled, and if so, slam it with affinity_none, + // call the real initialization routine, then restore affinity type to + // affinity_disabled. + int disabled = (affinity.type == affinity_disabled); + if (!KMP_AFFINITY_CAPABLE()) + KMP_ASSERT(disabled); + if (disabled) + affinity.type = affinity_none; + __kmp_aux_affinity_initialize(affinity); + if (disabled) + affinity.type = affinity_disabled; +} + +void __kmp_affinity_uninitialize(void) { + for (kmp_affinity_t *affinity : __kmp_affinities) { + if (affinity->masks != NULL) + KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks); + if (affinity->os_id_masks != NULL) + KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks); + if (affinity->proclist != NULL) + __kmp_free(affinity->proclist); + if (affinity->ids != NULL) + __kmp_free(affinity->ids); + if (affinity->attrs != NULL) + __kmp_free(affinity->attrs); + *affinity = KMP_AFFINITY_INIT(affinity->env_var); + } + if (__kmp_affin_origMask != NULL) { + if (KMP_AFFINITY_CAPABLE()) { + __kmp_set_system_affinity(__kmp_affin_origMask, FALSE); + } + KMP_CPU_FREE(__kmp_affin_origMask); + __kmp_affin_origMask = NULL; + } + __kmp_affinity_num_places = 0; + if (procarr != NULL) { + __kmp_free(procarr); + procarr = NULL; + } + if (__kmp_osid_to_hwthread_map) { + __kmp_free(__kmp_osid_to_hwthread_map); + __kmp_osid_to_hwthread_map = NULL; + } +#if KMP_USE_HWLOC + if (__kmp_hwloc_topology != NULL) { + hwloc_topology_destroy(__kmp_hwloc_topology); + __kmp_hwloc_topology = NULL; + } +#endif + if (__kmp_hw_subset) { + kmp_hw_subset_t::deallocate(__kmp_hw_subset); + __kmp_hw_subset = nullptr; + } + if (__kmp_topology) { + kmp_topology_t::deallocate(__kmp_topology); + __kmp_topology = nullptr; + } + KMPAffinity::destroy_api(); +} + +static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity, + int *place, kmp_affin_mask_t **mask) { + int mask_idx; + bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); + if (is_hidden_helper) + // The first gtid is the regular primary thread, the 
second gtid is the main + // thread of hidden team which does not participate in task execution. + mask_idx = gtid - 2; + else + mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); + KMP_DEBUG_ASSERT(affinity->num_masks > 0); + *place = (mask_idx + affinity->offset) % affinity->num_masks; + *mask = KMP_CPU_INDEX(affinity->masks, *place); +} + +// This function initializes the per-thread data concerning affinity including +// the mask and topology information +void __kmp_affinity_set_init_mask(int gtid, int isa_root) { + + kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); + + // Set the thread topology information to default of unknown + for (int id = 0; id < KMP_HW_LAST; ++id) + th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID; + th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN; + + if (!KMP_AFFINITY_CAPABLE()) { + return; + } + + if (th->th.th_affin_mask == NULL) { + KMP_CPU_ALLOC(th->th.th_affin_mask); + } else { + KMP_CPU_ZERO(th->th.th_affin_mask); + } + + // Copy the thread mask to the kmp_info_t structure. If + // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e. + // one that has all of the OS proc ids set, or if + // __kmp_affinity.flags.respect is set, then the full mask is the + // same as the mask of the initialization thread. + kmp_affin_mask_t *mask; + int i; + const kmp_affinity_t *affinity; + bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); + + if (is_hidden_helper) + affinity = &__kmp_hh_affinity; + else + affinity = &__kmp_affinity; + + if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) { + if ((affinity->type == affinity_none) || + (affinity->type == affinity_balanced) || + KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) { +#if KMP_GROUP_AFFINITY + if (__kmp_num_proc_groups > 1) { + return; + } +#endif + KMP_ASSERT(__kmp_affin_fullMask != NULL); + i = 0; + mask = __kmp_affin_fullMask; + } else { + __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask); + } + } else { + if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) { +#if KMP_GROUP_AFFINITY + if (__kmp_num_proc_groups > 1) { + return; + } +#endif + KMP_ASSERT(__kmp_affin_fullMask != NULL); + i = KMP_PLACE_ALL; + mask = __kmp_affin_fullMask; + } else { + __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask); + } + } + + th->th.th_current_place = i; + if (isa_root && !is_hidden_helper) { + th->th.th_new_place = i; + th->th.th_first_place = 0; + th->th.th_last_place = affinity->num_masks - 1; + } else if (KMP_AFFINITY_NON_PROC_BIND) { + // When using a Non-OMP_PROC_BIND affinity method, + // set all threads' place-partition-var to the entire place list + th->th.th_first_place = 0; + th->th.th_last_place = affinity->num_masks - 1; + } + // Copy topology information associated with the place + if (i >= 0) { + th->th.th_topology_ids = __kmp_affinity.ids[i]; + th->th.th_topology_attrs = __kmp_affinity.attrs[i]; + } + + if (i == KMP_PLACE_ALL) { + KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n", + gtid)); + } else { + KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n", + gtid, i)); + } + + KMP_CPU_COPY(th->th.th_affin_mask, mask); +} + +void __kmp_affinity_bind_init_mask(int gtid) { + if (!KMP_AFFINITY_CAPABLE()) { + return; + } + kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); + const kmp_affinity_t *affinity; + const char *env_var; + bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); + + if (is_hidden_helper) + affinity = &__kmp_hh_affinity; + else + affinity = &__kmp_affinity; + 
env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true); + /* to avoid duplicate printing (will be correctly printed on barrier) */ + if (affinity->flags.verbose && (affinity->type == affinity_none || + (th->th.th_current_place != KMP_PLACE_ALL && + affinity->type != affinity_balanced)) && + !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + th->th.th_affin_mask); + KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), + gtid, buf); + } + +#if KMP_OS_WINDOWS + // On Windows* OS, the process affinity mask might have changed. If the user + // didn't request affinity and this call fails, just continue silently. + // See CQ171393. + if (affinity->type == affinity_none) { + __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); + } else +#endif + __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); +} + +void __kmp_affinity_bind_place(int gtid) { + // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND + if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) { + return; + } + + kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); + + KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current " + "place = %d)\n", + gtid, th->th.th_new_place, th->th.th_current_place)); + + // Check that the new place is within this thread's partition. + KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); + KMP_ASSERT(th->th.th_new_place >= 0); + KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks); + if (th->th.th_first_place <= th->th.th_last_place) { + KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && + (th->th.th_new_place <= th->th.th_last_place)); + } else { + KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || + (th->th.th_new_place >= th->th.th_last_place)); + } + + // Copy the thread mask to the kmp_info_t structure, + // and set this thread's affinity. 
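+  // For example (illustrative): with th_first_place = 2 and th_last_place = 5
+  // the new place must fall within 2..5; a wrapped partition (first > last)
+  // is accepted by the else branch above instead.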
+ kmp_affin_mask_t *mask = + KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place); + KMP_CPU_COPY(th->th.th_affin_mask, mask); + th->th.th_current_place = th->th.th_new_place; + + if (__kmp_affinity.flags.verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + th->th.th_affin_mask); + KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), + __kmp_gettid(), gtid, buf); + } + __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); +} + +int __kmp_aux_set_affinity(void **mask) { + int gtid; + kmp_info_t *th; + int retval; + + if (!KMP_AFFINITY_CAPABLE()) { + return -1; + } + + gtid = __kmp_entry_gtid(); + KA_TRACE( + 1000, (""); { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + (kmp_affin_mask_t *)(*mask)); + __kmp_debug_printf( + "kmp_set_affinity: setting affinity mask for thread %d = %s\n", + gtid, buf); + }); + + if (__kmp_env_consistency_check) { + if ((mask == NULL) || (*mask == NULL)) { + KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); + } else { + unsigned proc; + int num_procs = 0; + + KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { + if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { + KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); + } + if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { + continue; + } + num_procs++; + } + if (num_procs == 0) { + KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); + } + +#if KMP_GROUP_AFFINITY + if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { + KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); + } +#endif /* KMP_GROUP_AFFINITY */ + } + } + + th = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); + retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); + if (retval == 0) { + KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); + } + + th->th.th_current_place = KMP_PLACE_UNDEFINED; + th->th.th_new_place = KMP_PLACE_UNDEFINED; + th->th.th_first_place = 0; + th->th.th_last_place = __kmp_affinity.num_masks - 1; + + // Turn off 4.0 affinity for the current tread at this parallel level. 
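+  // ("4.0 affinity" refers to the OpenMP 4.0 places/proc_bind machinery: once
+  // a thread has set its own mask explicitly, proc_bind is forced to false for
+  // the current task so the runtime will not re-bind this thread.)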
+ th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; + + return retval; +} + +int __kmp_aux_get_affinity(void **mask) { + int gtid; + int retval; +#if KMP_OS_WINDOWS || KMP_DEBUG + kmp_info_t *th; +#endif + if (!KMP_AFFINITY_CAPABLE()) { + return -1; + } + + gtid = __kmp_entry_gtid(); +#if KMP_OS_WINDOWS || KMP_DEBUG + th = __kmp_threads[gtid]; +#else + (void)gtid; // unused variable +#endif + KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); + + KA_TRACE( + 1000, (""); { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + th->th.th_affin_mask); + __kmp_printf( + "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, + buf); + }); + + if (__kmp_env_consistency_check) { + if ((mask == NULL) || (*mask == NULL)) { + KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); + } + } + +#if !KMP_OS_WINDOWS + + retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); + KA_TRACE( + 1000, (""); { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + (kmp_affin_mask_t *)(*mask)); + __kmp_printf( + "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, + buf); + }); + return retval; + +#else + (void)retval; + + KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); + return 0; + +#endif /* KMP_OS_WINDOWS */ +} + +int __kmp_aux_get_affinity_max_proc() { + if (!KMP_AFFINITY_CAPABLE()) { + return 0; + } +#if KMP_GROUP_AFFINITY + if (__kmp_num_proc_groups > 1) { + return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); + } +#endif + return __kmp_xproc; +} + +int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { + if (!KMP_AFFINITY_CAPABLE()) { + return -1; + } + + KA_TRACE( + 1000, (""); { + int gtid = __kmp_entry_gtid(); + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + (kmp_affin_mask_t *)(*mask)); + __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " + "affinity mask for thread %d = %s\n", + proc, gtid, buf); + }); + + if (__kmp_env_consistency_check) { + if ((mask == NULL) || (*mask == NULL)) { + KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); + } + } + + if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { + return -1; + } + if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { + return -2; + } + + KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); + return 0; +} + +int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { + if (!KMP_AFFINITY_CAPABLE()) { + return -1; + } + + KA_TRACE( + 1000, (""); { + int gtid = __kmp_entry_gtid(); + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + (kmp_affin_mask_t *)(*mask)); + __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " + "affinity mask for thread %d = %s\n", + proc, gtid, buf); + }); + + if (__kmp_env_consistency_check) { + if ((mask == NULL) || (*mask == NULL)) { + KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); + } + } + + if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { + return -1; + } + if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { + return -2; + } + + KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); + return 0; +} + +int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { + if (!KMP_AFFINITY_CAPABLE()) { + return -1; + } + + KA_TRACE( + 1000, (""); { + int gtid = __kmp_entry_gtid(); + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, + 
(kmp_affin_mask_t *)(*mask)); + __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " + "affinity mask for thread %d = %s\n", + proc, gtid, buf); + }); + + if (__kmp_env_consistency_check) { + if ((mask == NULL) || (*mask == NULL)) { + KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); + } + } + + if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { + return -1; + } + if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { + return 0; + } + + return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); +} + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +// Returns first os proc id with ATOM core +int __kmp_get_first_osid_with_ecore(void) { + int low = 0; + int high = __kmp_topology->get_num_hw_threads() - 1; + int mid = 0; + while (high - low > 1) { + mid = (high + low) / 2; + if (__kmp_topology->at(mid).attrs.get_core_type() == + KMP_HW_CORE_TYPE_CORE) { + low = mid + 1; + } else { + high = mid; + } + } + if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) { + return mid; + } + return -1; +} +#endif + +// Dynamic affinity settings - Affinity balanced +void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { + KMP_DEBUG_ASSERT(th); + bool fine_gran = true; + int tid = th->th.th_info.ds.ds_tid; + const char *env_var = "KMP_AFFINITY"; + + // Do not perform balanced affinity for the hidden helper threads + if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th))) + return; + + switch (__kmp_affinity.gran) { + case KMP_HW_THREAD: + break; + case KMP_HW_CORE: + if (__kmp_nThreadsPerCore > 1) { + fine_gran = false; + } + break; + case KMP_HW_SOCKET: + if (nCoresPerPkg > 1) { + fine_gran = false; + } + break; + default: + fine_gran = false; + } + + if (__kmp_topology->is_uniform()) { + int coreID; + int threadID; + // Number of hyper threads per core in HT machine + int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; + // Number of cores + int ncores = __kmp_ncores; + if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { + __kmp_nth_per_core = __kmp_avail_proc / nPackages; + ncores = nPackages; + } + // How many threads will be bound to each core + int chunk = nthreads / ncores; + // How many cores will have an additional thread bound to it - "big cores" + int big_cores = nthreads % ncores; + // Number of threads on the big cores + int big_nth = (chunk + 1) * big_cores; + if (tid < big_nth) { + coreID = tid / (chunk + 1); + threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; + } else { // tid >= big_nth + coreID = (tid - big_cores) / chunk; + threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; + } + KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal set affinity operation when not capable"); + + kmp_affin_mask_t *mask = th->th.th_affin_mask; + KMP_CPU_ZERO(mask); + + if (fine_gran) { + int osID = + __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; + KMP_CPU_SET(osID, mask); + } else { + for (int i = 0; i < __kmp_nth_per_core; i++) { + int osID; + osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; + KMP_CPU_SET(osID, mask); + } + } + if (__kmp_affinity.flags.verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); + KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), + tid, buf); + } + __kmp_affinity_get_thread_topology_info(th); + __kmp_set_system_affinity(mask, TRUE); + } else { // Non-uniform topology + + kmp_affin_mask_t *mask = th->th.th_affin_mask; + KMP_CPU_ZERO(mask); + + int core_level = + 
__kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); + int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, + __kmp_aff_depth - 1, core_level); + int nth_per_core = __kmp_affinity_max_proc_per_core( + __kmp_avail_proc, __kmp_aff_depth - 1, core_level); + + // For performance gain consider the special case nthreads == + // __kmp_avail_proc + if (nthreads == __kmp_avail_proc) { + if (fine_gran) { + int osID = __kmp_topology->at(tid).os_id; + KMP_CPU_SET(osID, mask); + } else { + int core = + __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); + for (int i = 0; i < __kmp_avail_proc; i++) { + int osID = __kmp_topology->at(i).os_id; + if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == + core) { + KMP_CPU_SET(osID, mask); + } + } + } + } else if (nthreads <= ncores) { + + int core = 0; + for (int i = 0; i < ncores; i++) { + // Check if this core from procarr[] is in the mask + int in_mask = 0; + for (int j = 0; j < nth_per_core; j++) { + if (procarr[i * nth_per_core + j] != -1) { + in_mask = 1; + break; + } + } + if (in_mask) { + if (tid == core) { + for (int j = 0; j < nth_per_core; j++) { + int osID = procarr[i * nth_per_core + j]; + if (osID != -1) { + KMP_CPU_SET(osID, mask); + // For fine granularity it is enough to set the first available + // osID for this core + if (fine_gran) { + break; + } + } + } + break; + } else { + core++; + } + } + } + } else { // nthreads > ncores + // Array to save the number of processors at each core + int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); + // Array to save the number of cores with "x" available processors; + int *ncores_with_x_procs = + (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); + // Array to save the number of cores with # procs from x to nth_per_core + int *ncores_with_x_to_max_procs = + (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); + + for (int i = 0; i <= nth_per_core; i++) { + ncores_with_x_procs[i] = 0; + ncores_with_x_to_max_procs[i] = 0; + } + + for (int i = 0; i < ncores; i++) { + int cnt = 0; + for (int j = 0; j < nth_per_core; j++) { + if (procarr[i * nth_per_core + j] != -1) { + cnt++; + } + } + nproc_at_core[i] = cnt; + ncores_with_x_procs[cnt]++; + } + + for (int i = 0; i <= nth_per_core; i++) { + for (int j = i; j <= nth_per_core; j++) { + ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; + } + } + + // Max number of processors + int nproc = nth_per_core * ncores; + // An array to keep number of threads per each context + int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); + for (int i = 0; i < nproc; i++) { + newarr[i] = 0; + } + + int nth = nthreads; + int flag = 0; + while (nth > 0) { + for (int j = 1; j <= nth_per_core; j++) { + int cnt = ncores_with_x_to_max_procs[j]; + for (int i = 0; i < ncores; i++) { + // Skip the core with 0 processors + if (nproc_at_core[i] == 0) { + continue; + } + for (int k = 0; k < nth_per_core; k++) { + if (procarr[i * nth_per_core + k] != -1) { + if (newarr[i * nth_per_core + k] == 0) { + newarr[i * nth_per_core + k] = 1; + cnt--; + nth--; + break; + } else { + if (flag != 0) { + newarr[i * nth_per_core + k]++; + cnt--; + nth--; + break; + } + } + } + } + if (cnt == 0 || nth == 0) { + break; + } + } + if (nth == 0) { + break; + } + } + flag = 1; + } + int sum = 0; + for (int i = 0; i < nproc; i++) { + sum += newarr[i]; + if (sum > tid) { + if (fine_gran) { + int osID = procarr[i]; + KMP_CPU_SET(osID, mask); + } else { + int coreID = i / nth_per_core; + for (int ii = 0; ii < nth_per_core; ii++) { + int 
osID = procarr[coreID * nth_per_core + ii]; + if (osID != -1) { + KMP_CPU_SET(osID, mask); + } + } + } + break; + } + } + __kmp_free(newarr); + } + + if (__kmp_affinity.flags.verbose) { + char buf[KMP_AFFIN_MASK_PRINT_LEN]; + __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); + KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), + tid, buf); + } + __kmp_affinity_get_thread_topology_info(th); + __kmp_set_system_affinity(mask, TRUE); + } +} + +#if KMP_OS_LINUX || KMP_OS_FREEBSD +// We don't need this entry for Windows because +// there is GetProcessAffinityMask() api +// +// The intended usage is indicated by these steps: +// 1) The user gets the current affinity mask +// 2) Then sets the affinity by calling this function +// 3) Error check the return value +// 4) Use non-OpenMP parallelization +// 5) Reset the affinity to what was stored in step 1) +#ifdef __cplusplus +extern "C" +#endif + int + kmp_set_thread_affinity_mask_initial() +// the function returns 0 on success, +// -1 if we cannot bind thread +// >0 (errno) if an error happened during binding +{ + int gtid = __kmp_get_gtid(); + if (gtid < 0) { + // Do not touch non-omp threads + KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " + "non-omp thread, returning\n")); + return -1; + } + if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { + KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " + "affinity not initialized, returning\n")); + return -1; + } + KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " + "set full mask for thread %d\n", + gtid)); + KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); + return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); +} +#endif + +#endif // KMP_AFFINITY_SUPPORTED diff --git a/third_party/openmp/kmp_affinity.h b/third_party/openmp/kmp_affinity.h new file mode 100644 index 000000000..8a0e2989b --- /dev/null +++ b/third_party/openmp/kmp_affinity.h @@ -0,0 +1,1321 @@ +/* + * kmp_affinity.h -- header for affinity management + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_AFFINITY_H +#define KMP_AFFINITY_H + +#include "kmp.h" +#include "kmp_os.h" +#include + +#if KMP_AFFINITY_SUPPORTED +#if KMP_USE_HWLOC +class KMPHwlocAffinity : public KMPAffinity { +public: + class Mask : public KMPAffinity::Mask { + hwloc_cpuset_t mask; + + public: + Mask() { + mask = hwloc_bitmap_alloc(); + this->zero(); + } + ~Mask() { hwloc_bitmap_free(mask); } + void set(int i) override { hwloc_bitmap_set(mask, i); } + bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); } + void clear(int i) override { hwloc_bitmap_clr(mask, i); } + void zero() override { hwloc_bitmap_zero(mask); } + bool empty() const override { return hwloc_bitmap_iszero(mask); } + void copy(const KMPAffinity::Mask *src) override { + const Mask *convert = static_cast(src); + hwloc_bitmap_copy(mask, convert->mask); + } + void bitwise_and(const KMPAffinity::Mask *rhs) override { + const Mask *convert = static_cast(rhs); + hwloc_bitmap_and(mask, mask, convert->mask); + } + void bitwise_or(const KMPAffinity::Mask *rhs) override { + const Mask *convert = static_cast(rhs); + hwloc_bitmap_or(mask, mask, convert->mask); + } + void bitwise_not() override { hwloc_bitmap_not(mask, mask); } + bool is_equal(const KMPAffinity::Mask *rhs) const override { + const Mask *convert = static_cast(rhs); + return hwloc_bitmap_isequal(mask, convert->mask); + } + int begin() const override { return hwloc_bitmap_first(mask); } + int end() const override { return -1; } + int next(int previous) const override { + return hwloc_bitmap_next(mask, previous); + } + int get_system_affinity(bool abort_on_error) override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal get affinity operation when not capable"); + long retval = + hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); + if (retval >= 0) { + return 0; + } + int error = errno; + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"), + KMP_ERR(error), __kmp_msg_null); + } + return error; + } + int set_system_affinity(bool abort_on_error) const override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal set affinity operation when not capable"); + long retval = + hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); + if (retval >= 0) { + return 0; + } + int error = errno; + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"), + KMP_ERR(error), __kmp_msg_null); + } + return error; + } +#if KMP_OS_WINDOWS + int set_process_affinity(bool abort_on_error) const override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal set process affinity operation when not capable"); + int error = 0; + const hwloc_topology_support *support = + hwloc_topology_get_support(__kmp_hwloc_topology); + if (support->cpubind->set_proc_cpubind) { + int retval; + retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask, + HWLOC_CPUBIND_PROCESS); + if (retval >= 0) + return 0; + error = errno; + if (abort_on_error) + __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"), + KMP_ERR(error), __kmp_msg_null); + } + return error; + } +#endif + int get_proc_group() const override { + int group = -1; +#if KMP_OS_WINDOWS + if (__kmp_num_proc_groups == 1) { + return 1; + } + for (int i = 0; i < __kmp_num_proc_groups; i++) { + // On windows, the long type is always 32 bits + unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2); + unsigned long 
second_32_bits = + hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1); + if (first_32_bits == 0 && second_32_bits == 0) { + continue; + } + if (group >= 0) { + return -1; + } + group = i; + } +#endif /* KMP_OS_WINDOWS */ + return group; + } + }; + void determine_capable(const char *var) override { + const hwloc_topology_support *topology_support; + if (__kmp_hwloc_topology == NULL) { + if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) { + __kmp_hwloc_error = TRUE; + if (__kmp_affinity.flags.verbose) { + KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()"); + } + } + if (hwloc_topology_load(__kmp_hwloc_topology) < 0) { + __kmp_hwloc_error = TRUE; + if (__kmp_affinity.flags.verbose) { + KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()"); + } + } + } + topology_support = hwloc_topology_get_support(__kmp_hwloc_topology); + // Is the system capable of setting/getting this thread's affinity? + // Also, is topology discovery possible? (pu indicates ability to discover + // processing units). And finally, were there no errors when calling any + // hwloc_* API functions? + if (topology_support && topology_support->cpubind->set_thisthread_cpubind && + topology_support->cpubind->get_thisthread_cpubind && + topology_support->discovery->pu && !__kmp_hwloc_error) { + // enables affinity according to KMP_AFFINITY_CAPABLE() macro + KMP_AFFINITY_ENABLE(TRUE); + } else { + // indicate that hwloc didn't work and disable affinity + __kmp_hwloc_error = TRUE; + KMP_AFFINITY_DISABLE(); + } + } + void bind_thread(int which) override { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal set affinity operation when not capable"); + KMPAffinity::Mask *mask; + KMP_CPU_ALLOC_ON_STACK(mask); + KMP_CPU_ZERO(mask); + KMP_CPU_SET(which, mask); + __kmp_set_system_affinity(mask, TRUE); + KMP_CPU_FREE_FROM_STACK(mask); + } + KMPAffinity::Mask *allocate_mask() override { return new Mask(); } + void deallocate_mask(KMPAffinity::Mask *m) override { delete m; } + KMPAffinity::Mask *allocate_mask_array(int num) override { + return new Mask[num]; + } + void deallocate_mask_array(KMPAffinity::Mask *array) override { + Mask *hwloc_array = static_cast(array); + delete[] hwloc_array; + } + KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, + int index) override { + Mask *hwloc_array = static_cast(array); + return &(hwloc_array[index]); + } + api_type get_api_type() const override { return HWLOC; } +}; +#endif /* KMP_USE_HWLOC */ + +#if KMP_OS_LINUX || KMP_OS_FREEBSD +#if KMP_OS_LINUX +/* On some of the older OS's that we build on, these constants aren't present + in #included from . They must be the same on + all systems of the same arch where they are defined, and they cannot change. + stone forever. */ +#include +#if KMP_ARCH_X86 || KMP_ARCH_ARM +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 241 +#elif __NR_sched_setaffinity != 241 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 242 +#elif __NR_sched_getaffinity != 242 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_AARCH64 +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 122 +#elif __NR_sched_setaffinity != 122 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 123 +#elif __NR_sched_getaffinity != 123 +#error Wrong code for getaffinity system call. 
+#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_X86_64 +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 203 +#elif __NR_sched_setaffinity != 203 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 204 +#elif __NR_sched_getaffinity != 204 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_PPC64 +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 222 +#elif __NR_sched_setaffinity != 222 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 223 +#elif __NR_sched_getaffinity != 223 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_MIPS +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 4239 +#elif __NR_sched_setaffinity != 4239 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 4240 +#elif __NR_sched_getaffinity != 4240 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_MIPS64 +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 5195 +#elif __NR_sched_setaffinity != 5195 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 5196 +#elif __NR_sched_getaffinity != 5196 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_LOONGARCH64 +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 122 +#elif __NR_sched_setaffinity != 122 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 123 +#elif __NR_sched_getaffinity != 123 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_RISCV64 +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 122 +#elif __NR_sched_setaffinity != 122 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 123 +#elif __NR_sched_getaffinity != 123 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_VE +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 203 +#elif __NR_sched_setaffinity != 203 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 204 +#elif __NR_sched_getaffinity != 204 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_S390X +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 239 +#elif __NR_sched_setaffinity != 239 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 240 +#elif __NR_sched_getaffinity != 240 +#error Wrong code for getaffinity system call. 
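+// The pattern above repeats for every architecture: if the libc headers do not
+// define the syscall number, hard-code it (e.g. 239/240 on s390x); if they
+// define a different value, fail the build rather than invoke the wrong
+// system call.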
+#endif /* __NR_sched_getaffinity */
+#else
+#error Unknown or unsupported architecture
+#endif /* KMP_ARCH_* */
+#elif KMP_OS_FREEBSD
+#include
+#endif
+class KMPNativeAffinity : public KMPAffinity {
+  class Mask : public KMPAffinity::Mask {
+    typedef unsigned long mask_t;
+    typedef decltype(__kmp_affin_mask_size) mask_size_type;
+    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
+    static const mask_t ONE = 1;
+    mask_size_type get_num_mask_types() const {
+      return __kmp_affin_mask_size / sizeof(mask_t);
+    }
+
+  public:
+    mask_t *mask;
+    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
+    ~Mask() {
+      if (mask)
+        __kmp_free(mask);
+    }
+    void set(int i) override {
+      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
+    }
+    bool is_set(int i) const override {
+      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
+    }
+    void clear(int i) override {
+      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
+    }
+    void zero() override {
+      mask_size_type e = get_num_mask_types();
+      for (mask_size_type i = 0; i < e; ++i)
+        mask[i] = (mask_t)0;
+    }
+    bool empty() const override {
+      mask_size_type e = get_num_mask_types();
+      for (mask_size_type i = 0; i < e; ++i)
+        if (mask[i] != (mask_t)0)
+          return false;
+      return true;
+    }
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      mask_size_type e = get_num_mask_types();
+      for (mask_size_type i = 0; i < e; ++i)
+        mask[i] = convert->mask[i];
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      mask_size_type e = get_num_mask_types();
+      for (mask_size_type i = 0; i < e; ++i)
+        mask[i] &= convert->mask[i];
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      mask_size_type e = get_num_mask_types();
+      for (mask_size_type i = 0; i < e; ++i)
+        mask[i] |= convert->mask[i];
+    }
+    void bitwise_not() override {
+      mask_size_type e = get_num_mask_types();
+      for (mask_size_type i = 0; i < e; ++i)
+        mask[i] = ~(mask[i]);
+    }
+    bool is_equal(const KMPAffinity::Mask *rhs) const override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      mask_size_type e = get_num_mask_types();
+      for (mask_size_type i = 0; i < e; ++i)
+        if (mask[i] != convert->mask[i])
+          return false;
+      return true;
+    }
+    int begin() const override {
+      int retval = 0;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int end() const override {
+      int e;
+      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
+      return e;
+    }
+    int next(int previous) const override {
+      int retval = previous + 1;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int get_system_affinity(bool abort_on_error) override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal get affinity operation when not capable");
+#if KMP_OS_LINUX && !defined(__COSMOPOLITAN__)
+      long retval =
+          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
+#elif KMP_OS_FREEBSD || defined(__COSMOPOLITAN__)
+      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
+                                     reinterpret_cast<cpuset_t *>(mask));
+      int retval = (r == 0 ? 0 : -1);
+#endif
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
+                    KMP_ERR(error), __kmp_msg_null);
+      }
+      return error;
+    }
+    int set_system_affinity(bool abort_on_error) const override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal set affinity operation when not capable");
+#if KMP_OS_LINUX && !defined(__COSMOPOLITAN__)
+      long retval =
+          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
+#elif KMP_OS_FREEBSD || defined(__COSMOPOLITAN__)
+      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
+                                     reinterpret_cast<cpuset_t *>(mask));
+      int retval = (r == 0 ? 0 : -1);
+#endif
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
+                    KMP_ERR(error), __kmp_msg_null);
+      }
+      return error;
+    }
+  };
+  void determine_capable(const char *env_var) override {
+    __kmp_affinity_determine_capable(env_var);
+  }
+  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
+  KMPAffinity::Mask *allocate_mask() override {
+    KMPNativeAffinity::Mask *retval = new Mask();
+    return retval;
+  }
+  void deallocate_mask(KMPAffinity::Mask *m) override {
+    KMPNativeAffinity::Mask *native_mask =
+        static_cast<KMPNativeAffinity::Mask *>(m);
+    delete native_mask;
+  }
+  KMPAffinity::Mask *allocate_mask_array(int num) override {
+    return new Mask[num];
+  }
+  void deallocate_mask_array(KMPAffinity::Mask *array) override {
+    Mask *linux_array = static_cast<Mask *>(array);
+    delete[] linux_array;
+  }
+  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
+                                      int index) override {
+    Mask *linux_array = static_cast<Mask *>(array);
+    return &(linux_array[index]);
+  }
+  api_type get_api_type() const override { return NATIVE_OS; }
+};
+#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
+
+#if KMP_OS_WINDOWS
+class KMPNativeAffinity : public KMPAffinity {
+  class Mask : public KMPAffinity::Mask {
+    typedef ULONG_PTR mask_t;
+    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
+    mask_t *mask;
+
+  public:
+    Mask() {
+      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
+    }
+    ~Mask() {
+      if (mask)
+        __kmp_free(mask);
+    }
+    void set(int i) override {
+      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    bool is_set(int i) const override {
+      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
+    }
+    void clear(int i) override {
+      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    void zero() override {
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = 0;
+    }
+    bool empty() const override {
+      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+        if (mask[i])
+          return false;
+      return true;
+    }
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = convert->mask[i];
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] &= convert->mask[i];
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] |= convert->mask[i];
+    }
+    void bitwise_not() override {
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = ~(mask[i]);
+    }
+    bool is_equal(const KMPAffinity::Mask *rhs) const override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+
for (size_t i = 0; i < __kmp_num_proc_groups; ++i) + if (mask[i] != convert->mask[i]) + return false; + return true; + } + int begin() const override { + int retval = 0; + while (retval < end() && !is_set(retval)) + ++retval; + return retval; + } + int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; } + int next(int previous) const override { + int retval = previous + 1; + while (retval < end() && !is_set(retval)) + ++retval; + return retval; + } + int set_process_affinity(bool abort_on_error) const override { + if (__kmp_num_proc_groups <= 1) { + if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), + __kmp_msg_null); + } + return error; + } + } + return 0; + } + int set_system_affinity(bool abort_on_error) const override { + if (__kmp_num_proc_groups > 1) { + // Check for a valid mask. + GROUP_AFFINITY ga; + int group = get_proc_group(); + if (group < 0) { + if (abort_on_error) { + KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); + } + return -1; + } + // Transform the bit vector into a GROUP_AFFINITY struct + // and make the system call to set affinity. + ga.Group = group; + ga.Mask = mask[group]; + ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0; + + KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL); + if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), + __kmp_msg_null); + } + return error; + } + } else { + if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), + __kmp_msg_null); + } + return error; + } + } + return 0; + } + int get_system_affinity(bool abort_on_error) override { + if (__kmp_num_proc_groups > 1) { + this->zero(); + GROUP_AFFINITY ga; + KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL); + if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"), + KMP_ERR(error), __kmp_msg_null); + } + return error; + } + if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) || + (ga.Mask == 0)) { + return -1; + } + mask[ga.Group] = ga.Mask; + } else { + mask_t newMask, sysMask, retval; + if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"), + KMP_ERR(error), __kmp_msg_null); + } + return error; + } + retval = SetThreadAffinityMask(GetCurrentThread(), newMask); + if (!retval) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), + KMP_ERR(error), __kmp_msg_null); + } + return error; + } + newMask = SetThreadAffinityMask(GetCurrentThread(), retval); + if (!newMask) { + DWORD error = GetLastError(); + if (abort_on_error) { + __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), + KMP_ERR(error), __kmp_msg_null); + } + } + *mask = retval; + } + return 0; + } + int get_proc_group() const override { + int group = -1; + if (__kmp_num_proc_groups == 1) { + return 1; + } + for (int i = 0; i < __kmp_num_proc_groups; i++) { + if (mask[i] == 0) + continue; + if (group >= 0) + return -1; + group = i; + } + return group; + } + }; + void 
determine_capable(const char *env_var) override { + __kmp_affinity_determine_capable(env_var); + } + void bind_thread(int which) override { __kmp_affinity_bind_thread(which); } + KMPAffinity::Mask *allocate_mask() override { return new Mask(); } + void deallocate_mask(KMPAffinity::Mask *m) override { delete m; } + KMPAffinity::Mask *allocate_mask_array(int num) override { + return new Mask[num]; + } + void deallocate_mask_array(KMPAffinity::Mask *array) override { + Mask *windows_array = static_cast(array); + delete[] windows_array; + } + KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, + int index) override { + Mask *windows_array = static_cast(array); + return &(windows_array[index]); + } + api_type get_api_type() const override { return NATIVE_OS; } +}; +#endif /* KMP_OS_WINDOWS */ +#endif /* KMP_AFFINITY_SUPPORTED */ + +// Describe an attribute for a level in the machine topology +struct kmp_hw_attr_t { + int core_type : 8; + int core_eff : 8; + unsigned valid : 1; + unsigned reserved : 15; + + static const int UNKNOWN_CORE_EFF = -1; + + kmp_hw_attr_t() + : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF), + valid(0), reserved(0) {} + void set_core_type(kmp_hw_core_type_t type) { + valid = 1; + core_type = type; + } + void set_core_eff(int eff) { + valid = 1; + core_eff = eff; + } + kmp_hw_core_type_t get_core_type() const { + return (kmp_hw_core_type_t)core_type; + } + int get_core_eff() const { return core_eff; } + bool is_core_type_valid() const { + return core_type != KMP_HW_CORE_TYPE_UNKNOWN; + } + bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; } + operator bool() const { return valid; } + void clear() { + core_type = KMP_HW_CORE_TYPE_UNKNOWN; + core_eff = UNKNOWN_CORE_EFF; + valid = 0; + } + bool contains(const kmp_hw_attr_t &other) const { + if (!valid && !other.valid) + return true; + if (valid && other.valid) { + if (other.is_core_type_valid()) { + if (!is_core_type_valid() || (get_core_type() != other.get_core_type())) + return false; + } + if (other.is_core_eff_valid()) { + if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff())) + return false; + } + return true; + } + return false; + } +#if KMP_AFFINITY_SUPPORTED + bool contains(const kmp_affinity_attrs_t &attr) const { + if (!valid && !attr.valid) + return true; + if (valid && attr.valid) { + if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN) + return (is_core_type_valid() && + (get_core_type() == (kmp_hw_core_type_t)attr.core_type)); + if (attr.core_eff != UNKNOWN_CORE_EFF) + return (is_core_eff_valid() && (get_core_eff() == attr.core_eff)); + return true; + } + return false; + } +#endif // KMP_AFFINITY_SUPPORTED + bool operator==(const kmp_hw_attr_t &rhs) const { + return (rhs.valid == valid && rhs.core_eff == core_eff && + rhs.core_type == core_type); + } + bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); } +}; + +#if KMP_AFFINITY_SUPPORTED +KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t)); +#endif + +class kmp_hw_thread_t { +public: + static const int UNKNOWN_ID = -1; + static const int MULTIPLE_ID = -2; + static int compare_ids(const void *a, const void *b); + static int compare_compact(const void *a, const void *b); + int ids[KMP_HW_LAST]; + int sub_ids[KMP_HW_LAST]; + bool leader; + int os_id; + kmp_hw_attr_t attrs; + + void print() const; + void clear() { + for (int i = 0; i < (int)KMP_HW_LAST; ++i) + ids[i] = UNKNOWN_ID; + leader = false; + attrs.clear(); + } +}; + +class kmp_topology_t { + + struct 
flags_t { + int uniform : 1; + int reserved : 31; + }; + + int depth; + + // The following arrays are all 'depth' long and have been + // allocated to hold up to KMP_HW_LAST number of objects if + // needed so layers can be added without reallocation of any array + + // Orderd array of the types in the topology + kmp_hw_t *types; + + // Keep quick topology ratios, for non-uniform topologies, + // this ratio holds the max number of itemAs per itemB + // e.g., [ 4 packages | 6 cores / package | 2 threads / core ] + int *ratio; + + // Storage containing the absolute number of each topology layer + int *count; + + // The number of core efficiencies. This is only useful for hybrid + // topologies. Core efficiencies will range from 0 to num efficiencies - 1 + int num_core_efficiencies; + int num_core_types; + kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES]; + + // The hardware threads array + // hw_threads is num_hw_threads long + // Each hw_thread's ids and sub_ids are depth deep + int num_hw_threads; + kmp_hw_thread_t *hw_threads; + + // Equivalence hash where the key is the hardware topology item + // and the value is the equivalent hardware topology type in the + // types[] array, if the value is KMP_HW_UNKNOWN, then there is no + // known equivalence for the topology type + kmp_hw_t equivalent[KMP_HW_LAST]; + + // Flags describing the topology + flags_t flags; + + // Compact value used during sort_compact() + int compact; + + // Insert a new topology layer after allocation + void _insert_layer(kmp_hw_t type, const int *ids); + +#if KMP_GROUP_AFFINITY + // Insert topology information about Windows Processor groups + void _insert_windows_proc_groups(); +#endif + + // Count each item & get the num x's per y + // e.g., get the number of cores and the number of threads per core + // for each (x, y) in (KMP_HW_* , KMP_HW_*) + void _gather_enumeration_information(); + + // Remove layers that don't add information to the topology. + // This is done by having the layer take on the id = UNKNOWN_ID (-1) + void _remove_radix1_layers(); + + // Find out if the topology is uniform + void _discover_uniformity(); + + // Set all the sub_ids for each hardware thread + void _set_sub_ids(); + + // Set global affinity variables describing the number of threads per + // core, the number of packages, the number of cores per package, and + // the number of cores. + void _set_globals(); + + // Set the last level cache equivalent type + void _set_last_level_cache(); + + // Return the number of cores with a particular attribute, 'attr'. 
+ // If 'find_all' is true, then find all cores on the machine, otherwise find + // all cores per the layer 'above' + int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above, + bool find_all = false) const; + +public: + // Force use of allocate()/deallocate() + kmp_topology_t() = delete; + kmp_topology_t(const kmp_topology_t &t) = delete; + kmp_topology_t(kmp_topology_t &&t) = delete; + kmp_topology_t &operator=(const kmp_topology_t &t) = delete; + kmp_topology_t &operator=(kmp_topology_t &&t) = delete; + + static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types); + static void deallocate(kmp_topology_t *); + + // Functions used in create_map() routines + kmp_hw_thread_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + const kmp_hw_thread_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); + return hw_threads[index]; + } + int get_num_hw_threads() const { return num_hw_threads; } + void sort_ids() { + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_ids); + } + // Check if the hardware ids are unique, if they are + // return true, otherwise return false + bool check_ids() const; + + // Function to call after the create_map() routine + void canonicalize(); + void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores); + +// Functions used after canonicalize() called + +#if KMP_AFFINITY_SUPPORTED + // Set the granularity for affinity settings + void set_granularity(kmp_affinity_t &stgs) const; + bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const; + bool restrict_to_mask(const kmp_affin_mask_t *mask); + bool filter_hw_subset(); +#endif + bool is_uniform() const { return flags.uniform; } + // Tell whether a type is a valid type in the topology + // returns KMP_HW_UNKNOWN when there is no equivalent type + kmp_hw_t get_equivalent_type(kmp_hw_t type) const { + if (type == KMP_HW_UNKNOWN) + return KMP_HW_UNKNOWN; + return equivalent[type]; + } + // Set type1 = type2 + void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1); + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2); + kmp_hw_t real_type2 = equivalent[type2]; + if (real_type2 == KMP_HW_UNKNOWN) + real_type2 = type2; + equivalent[type1] = real_type2; + // This loop is required since any of the types may have been set to + // be equivalent to type1. They all must be checked and reset to type2. 
+ KMP_FOREACH_HW_TYPE(type) { + if (equivalent[type] == type1) { + equivalent[type] = real_type2; + } + } + } + // Calculate number of types corresponding to level1 + // per types corresponding to level2 (e.g., number of threads per core) + int calculate_ratio(int level1, int level2) const { + KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth); + KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth); + int r = 1; + for (int level = level1; level > level2; --level) + r *= ratio[level]; + return r; + } + int get_ratio(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return ratio[level]; + } + int get_depth() const { return depth; }; + kmp_hw_t get_type(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return types[level]; + } + int get_level(kmp_hw_t type) const { + KMP_DEBUG_ASSERT_VALID_HW_TYPE(type); + int eq_type = equivalent[type]; + if (eq_type == KMP_HW_UNKNOWN) + return -1; + for (int i = 0; i < depth; ++i) + if (types[i] == eq_type) + return i; + return -1; + } + int get_count(int level) const { + KMP_DEBUG_ASSERT(level >= 0 && level < depth); + return count[level]; + } + // Return the total number of cores with attribute 'attr' + int get_ncores_with_attr(const kmp_hw_attr_t &attr) const { + return _get_ncores_with_attr(attr, -1, true); + } + // Return the number of cores with attribute + // 'attr' per topology level 'above' + int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const { + return _get_ncores_with_attr(attr, above, false); + } + +#if KMP_AFFINITY_SUPPORTED + friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b); + void sort_compact(kmp_affinity_t &affinity) { + compact = affinity.compact; + qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), + kmp_hw_thread_t::compare_compact); + } +#endif + void print(const char *env_var = "KMP_AFFINITY") const; + void dump() const; +}; +extern kmp_topology_t *__kmp_topology; + +class kmp_hw_subset_t { + const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS; + +public: + // Describe a machine topology item in KMP_HW_SUBSET + struct item_t { + kmp_hw_t type; + int num_attrs; + int num[MAX_ATTRS]; + int offset[MAX_ATTRS]; + kmp_hw_attr_t attr[MAX_ATTRS]; + }; + // Put parenthesis around max to avoid accidental use of Windows max macro. 
+ const static int USE_ALL = (std::numeric_limits<int>::max)(); + +private: + int depth; + int capacity; + item_t *items; + kmp_uint64 set; + bool absolute; + // The set must be able to handle up to KMP_HW_LAST number of layers + KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); + // Sorting the KMP_HW_SUBSET items to follow topology order + // All unknown topology types will be at the beginning of the subset + static int hw_subset_compare(const void *i1, const void *i2) { + kmp_hw_t type1 = ((const item_t *)i1)->type; + kmp_hw_t type2 = ((const item_t *)i2)->type; + int level1 = __kmp_topology->get_level(type1); + int level2 = __kmp_topology->get_level(type2); + return level1 - level2; + } + +public: + // Force use of allocate()/deallocate() + kmp_hw_subset_t() = delete; + kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete; + kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete; + kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete; + + static kmp_hw_subset_t *allocate() { + int initial_capacity = 5; + kmp_hw_subset_t *retval = + (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t)); + retval->depth = 0; + retval->capacity = initial_capacity; + retval->set = 0ull; + retval->absolute = false; + retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity); + return retval; + } + static void deallocate(kmp_hw_subset_t *subset) { + __kmp_free(subset->items); + __kmp_free(subset); + } + void set_absolute() { absolute = true; } + bool is_absolute() const { return absolute; } + void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) { + for (int i = 0; i < depth; ++i) { + // Found an existing item for this layer type + // Add the num, offset, and attr to this item + if (items[i].type == type) { + int idx = items[i].num_attrs++; + if ((size_t)idx >= MAX_ATTRS) + return; + items[i].num[idx] = num; + items[i].offset[idx] = offset; + items[i].attr[idx] = attr; + return; + } + } + if (depth == capacity - 1) { + capacity *= 2; + item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); + for (int i = 0; i < depth; ++i) + new_items[i] = items[i]; + __kmp_free(items); + items = new_items; + } + items[depth].num_attrs = 1; + items[depth].type = type; + items[depth].num[0] = num; + items[depth].offset[0] = offset; + items[depth].attr[0] = attr; + depth++; + set |= (1ull << type); + } + int get_depth() const { return depth; } + const item_t &at(int index) const { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; + } + item_t &at(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + return items[index]; + } + void remove(int index) { + KMP_DEBUG_ASSERT(index >= 0 && index < depth); + set &= ~(1ull << items[index].type); + for (int j = index + 1; j < depth; ++j) { + items[j - 1] = items[j]; + } + depth--; + } + void sort() { + KMP_DEBUG_ASSERT(__kmp_topology); + qsort(items, depth, sizeof(item_t), hw_subset_compare); + } + bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } + void dump() const { + printf("**********************\n"); + printf("*** kmp_hw_subset: ***\n"); + printf("* depth: %d\n", depth); + printf("* items:\n"); + for (int i = 0; i < depth; ++i) { + printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type)); + for (int j = 0; j < items[i].num_attrs; ++j) { + printf(" num: %d, offset: %d, attr: ", items[i].num[j], + items[i].offset[j]); + if (!items[i].attr[j]) { + printf(" (none)\n"); + } else { + printf( + " core_type = %s, 
core_eff = %d\n", + __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()), + items[i].attr[j].get_core_eff()); + } + } + } + printf("* set: 0x%llx\n", set); + printf("* absolute: %d\n", absolute); + printf("**********************\n"); + } +}; +extern kmp_hw_subset_t *__kmp_hw_subset; + +/* A structure for holding machine-specific hierarchy info to be computed once + at init. This structure represents a mapping of threads to the actual machine + hierarchy, or to our best guess at what the hierarchy might be, for the + purpose of performing an efficient barrier. In the worst case, when there is + no machine hierarchy information, it produces a tree suitable for a barrier, + similar to the tree used in the hyper barrier. */ +class hierarchy_info { +public: + /* Good default values for number of leaves and branching factor, given no + affinity information. Behaves a bit like hyper barrier. */ + static const kmp_uint32 maxLeaves = 4; + static const kmp_uint32 minBranch = 4; + /** Number of levels in the hierarchy. Typical levels are threads/core, + cores/package or socket, packages/node, nodes/machine, etc. We don't want + to get specific with nomenclature. When the machine is oversubscribed we + add levels to duplicate the hierarchy, doubling the thread capacity of the + hierarchy each time we add a level. */ + kmp_uint32 maxLevels; + + /** This is specifically the depth of the machine configuration hierarchy, in + terms of the number of levels along the longest path from root to any + leaf. It corresponds to the number of entries in numPerLevel if we exclude + all but one trailing 1. */ + kmp_uint32 depth; + kmp_uint32 base_num_threads; + enum init_status { initialized = 0, not_initialized = 1, initializing = 2 }; + volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized, + // 2=initialization in progress + volatile kmp_int8 resizing; // 0=not resizing, 1=resizing + + /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children + the parent of a node at level i has. For example, if we have a machine + with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel = + {2, 4, 4, 1, 1}. All empty levels are set to 1. */ + kmp_uint32 *numPerLevel; + kmp_uint32 *skipPerLevel; + + void deriveLevels() { + int hier_depth = __kmp_topology->get_depth(); + for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) { + numPerLevel[level] = __kmp_topology->get_ratio(i); + } + } + + hierarchy_info() + : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {} + + void fini() { + if (!uninitialized && numPerLevel) { + __kmp_free(numPerLevel); + numPerLevel = NULL; + uninitialized = not_initialized; + } + } + + void init(int num_addrs) { + kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8( + &uninitialized, not_initialized, initializing); + if (bool_result == 0) { // Wait for initialization + while (TCR_1(uninitialized) != initialized) + KMP_CPU_PAUSE(); + return; + } + KMP_DEBUG_ASSERT(bool_result == 1); + + /* Added explicit initialization of the data fields here to prevent usage of + dirty value observed when static library is re-initialized multiple times + (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses + OpenMP). 
*/ + depth = 1; + resizing = 0; + maxLevels = 7; + numPerLevel = + (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32)); + skipPerLevel = &(numPerLevel[maxLevels]); + for (kmp_uint32 i = 0; i < maxLevels; + ++i) { // init numPerLevel[*] to 1 item per level + numPerLevel[i] = 1; + skipPerLevel[i] = 1; + } + + // Sort table by physical ID + if (__kmp_topology && __kmp_topology->get_depth() > 0) { + deriveLevels(); + } else { + numPerLevel[0] = maxLeaves; + numPerLevel[1] = num_addrs / maxLeaves; + if (num_addrs % maxLeaves) + numPerLevel[1]++; + } + + base_num_threads = num_addrs; + for (int i = maxLevels - 1; i >= 0; + --i) // count non-empty levels to get depth + if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1' + depth++; + + kmp_uint32 branch = minBranch; + if (numPerLevel[0] == 1) + branch = num_addrs / maxLeaves; + if (branch < minBranch) + branch = minBranch; + for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width + while (numPerLevel[d] > branch || + (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0! + if (numPerLevel[d] & 1) + numPerLevel[d]++; + numPerLevel[d] = numPerLevel[d] >> 1; + if (numPerLevel[d + 1] == 1) + depth++; + numPerLevel[d + 1] = numPerLevel[d + 1] << 1; + } + if (numPerLevel[0] == 1) { + branch = branch >> 1; + if (branch < 4) + branch = minBranch; + } + } + + for (kmp_uint32 i = 1; i < depth; ++i) + skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1]; + // Fill in hierarchy in the case of oversubscription + for (kmp_uint32 i = depth; i < maxLevels; ++i) + skipPerLevel[i] = 2 * skipPerLevel[i - 1]; + + uninitialized = initialized; // One writer + } + + // Resize the hierarchy if nproc changes to something larger than before + void resize(kmp_uint32 nproc) { + kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1); + while (bool_result == 0) { // someone else is trying to resize + KMP_CPU_PAUSE(); + if (nproc <= base_num_threads) // happy with other thread's resize + return; + else // try to resize + bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1); + } + KMP_DEBUG_ASSERT(bool_result != 0); + if (nproc <= base_num_threads) + return; // happy with other thread's resize + + // Calculate new maxLevels + kmp_uint32 old_sz = skipPerLevel[depth - 1]; + kmp_uint32 incs = 0, old_maxLevels = maxLevels; + // First see if old maxLevels is enough to contain new size + for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) { + skipPerLevel[i] = 2 * skipPerLevel[i - 1]; + numPerLevel[i - 1] *= 2; + old_sz *= 2; + depth++; + } + if (nproc > old_sz) { // Not enough space, need to expand hierarchy + while (nproc > old_sz) { + old_sz *= 2; + incs++; + depth++; + } + maxLevels += incs; + + // Resize arrays + kmp_uint32 *old_numPerLevel = numPerLevel; + kmp_uint32 *old_skipPerLevel = skipPerLevel; + numPerLevel = skipPerLevel = NULL; + numPerLevel = + (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32)); + skipPerLevel = &(numPerLevel[maxLevels]); + + // Copy old elements from old arrays + for (kmp_uint32 i = 0; i < old_maxLevels; ++i) { + // init numPerLevel[*] to 1 item per level + numPerLevel[i] = old_numPerLevel[i]; + skipPerLevel[i] = old_skipPerLevel[i]; + } + + // Init new elements in arrays to 1 + for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) { + // init numPerLevel[*] to 1 item per level + numPerLevel[i] = 1; + skipPerLevel[i] = 1; + } + + // Free old arrays + __kmp_free(old_numPerLevel); + } + + // Fill in oversubscription levels of hierarchy + for 
(kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) + skipPerLevel[i] = 2 * skipPerLevel[i - 1]; + + base_num_threads = nproc; + resizing = 0; // One writer + } +}; +#endif // KMP_AFFINITY_H diff --git a/third_party/openmp/kmp_alloc.cpp b/third_party/openmp/kmp_alloc.cpp new file mode 100644 index 000000000..fb1b0eb5f --- /dev/null +++ b/third_party/openmp/kmp_alloc.cpp @@ -0,0 +1,2324 @@ +/* + * kmp_alloc.cpp -- private/shared dynamic memory allocation and management + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_io.h" +#include "kmp_wrapper_malloc.h" + +// Disable bget when it is not used +#if KMP_USE_BGET + +/* Thread private buffer management code */ + +typedef int (*bget_compact_t)(size_t, int); +typedef void *(*bget_acquire_t)(size_t); +typedef void (*bget_release_t)(void *); + +/* NOTE: bufsize must be a signed datatype */ + +#if KMP_OS_WINDOWS +#if KMP_ARCH_X86 || KMP_ARCH_ARM +typedef kmp_int32 bufsize; +#else +typedef kmp_int64 bufsize; +#endif +#else +typedef ssize_t bufsize; +#endif // KMP_OS_WINDOWS + +/* The three modes of operation are, fifo search, lifo search, and best-fit */ + +typedef enum bget_mode { + bget_mode_fifo = 0, + bget_mode_lifo = 1, + bget_mode_best = 2 +} bget_mode_t; + +static void bpool(kmp_info_t *th, void *buffer, bufsize len); +static void *bget(kmp_info_t *th, bufsize size); +static void *bgetz(kmp_info_t *th, bufsize size); +static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize); +static void brel(kmp_info_t *th, void *buf); +static void bectl(kmp_info_t *th, bget_compact_t compact, + bget_acquire_t acquire, bget_release_t release, + bufsize pool_incr); + +/* BGET CONFIGURATION */ +/* Buffer allocation size quantum: all buffers allocated are a + multiple of this size. This MUST be a power of two. */ + +/* On IA-32 architecture with Linux* OS, malloc() does not + ensure 16 byte alignment */ + +#if KMP_ARCH_X86 || !KMP_HAVE_QUAD + +#define SizeQuant 8 +#define AlignType double + +#else + +#define SizeQuant 16 +#define AlignType _Quad + +#endif + +// Define this symbol to enable the bstats() function which calculates the +// total free space in the buffer pool, the largest available buffer, and the +// total space currently allocated. +#define BufStats 1 + +#ifdef KMP_DEBUG + +// Define this symbol to enable the bpoold() function which dumps the buffers +// in a buffer pool. +#define BufDump 1 + +// Define this symbol to enable the bpoolv() function for validating a buffer +// pool. +#define BufValid 1 + +// Define this symbol to enable the bufdump() function which allows dumping the +// contents of an allocated or free buffer. +#define DumpData 1 + +#ifdef NOT_USED_NOW + +// Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants +// who attempt to use pointers into released buffers. +#define FreeWipe 1 + +// Use a best fit algorithm when searching for space for an allocation request. +// This uses memory more efficiently, but allocation will be much slower. 
+#define BestFit 1 + +#endif /* NOT_USED_NOW */ +#endif /* KMP_DEBUG */ + +static bufsize bget_bin_size[] = { + 0, + // 1 << 6, /* .5 Cache line */ + 1 << 7, /* 1 Cache line, new */ + 1 << 8, /* 2 Cache lines */ + 1 << 9, /* 4 Cache lines, new */ + 1 << 10, /* 8 Cache lines */ + 1 << 11, /* 16 Cache lines, new */ + 1 << 12, 1 << 13, /* new */ + 1 << 14, 1 << 15, /* new */ + 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /* 1MB */ + 1 << 21, /* 2MB */ + 1 << 22, /* 4MB */ + 1 << 23, /* 8MB */ + 1 << 24, /* 16MB */ + 1 << 25, /* 32MB */ +}; + +#define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize)) + +struct bfhead; + +// Declare the interface, including the requested buffer size type, bufsize. + +/* Queue links */ +typedef struct qlinks { + struct bfhead *flink; /* Forward link */ + struct bfhead *blink; /* Backward link */ +} qlinks_t; + +/* Header in allocated and free buffers */ +typedef struct bhead2 { + kmp_info_t *bthr; /* The thread which owns the buffer pool */ + bufsize prevfree; /* Relative link back to previous free buffer in memory or + 0 if previous buffer is allocated. */ + bufsize bsize; /* Buffer size: positive if free, negative if allocated. */ +} bhead2_t; + +/* Make sure the bhead structure is a multiple of SizeQuant in size. */ +typedef union bhead { + KMP_ALIGN(SizeQuant) + AlignType b_align; + char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))]; + bhead2_t bb; +} bhead_t; +#define BH(p) ((bhead_t *)(p)) + +/* Header in directly allocated buffers (by acqfcn) */ +typedef struct bdhead { + bufsize tsize; /* Total size, including overhead */ + bhead_t bh; /* Common header */ +} bdhead_t; +#define BDH(p) ((bdhead_t *)(p)) + +/* Header in free buffers */ +typedef struct bfhead { + bhead_t bh; /* Common allocated/free header */ + qlinks_t ql; /* Links on free list */ +} bfhead_t; +#define BFH(p) ((bfhead_t *)(p)) + +typedef struct thr_data { + bfhead_t freelist[MAX_BGET_BINS]; +#if BufStats + size_t totalloc; /* Total space currently allocated */ + long numget, numrel; /* Number of bget() and brel() calls */ + long numpblk; /* Number of pool blocks */ + long numpget, numprel; /* Number of block gets and rels */ + long numdget, numdrel; /* Number of direct gets and rels */ +#endif /* BufStats */ + + /* Automatic expansion block management functions */ + bget_compact_t compfcn; + bget_acquire_t acqfcn; + bget_release_t relfcn; + + bget_mode_t mode; /* what allocation mode to use? */ + + bufsize exp_incr; /* Expansion block size */ + bufsize pool_len; /* 0: no bpool calls have been made + -1: not all pool blocks are the same size + >0: (common) block size for all bpool calls made so far + */ + bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */ +} thr_data_t; + +/* Minimum allocation quantum: */ +#define QLSize (sizeof(qlinks_t)) +#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize) +#define MaxSize \ + (bufsize)( \ + ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1))) +// Maximum for the requested size. + +/* End sentinel: value placed in bsize field of dummy block delimiting + end of pool block. The most negative number which will fit in a + bufsize, defined in a way that the compiler will accept. 
*/ + +#define ESent \ + ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2)) + +/* Thread Data management routines */ +static int bget_get_bin(bufsize size) { + // binary chop bins + int lo = 0, hi = MAX_BGET_BINS - 1; + + KMP_DEBUG_ASSERT(size > 0); + + while ((hi - lo) > 1) { + int mid = (lo + hi) >> 1; + if (size < bget_bin_size[mid]) + hi = mid - 1; + else + lo = mid; + } + + KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS)); + + return lo; +} + +static void set_thr_data(kmp_info_t *th) { + int i; + thr_data_t *data; + + data = (thr_data_t *)((!th->th.th_local.bget_data) + ? __kmp_allocate(sizeof(*data)) + : th->th.th_local.bget_data); + + memset(data, '\0', sizeof(*data)); + + for (i = 0; i < MAX_BGET_BINS; ++i) { + data->freelist[i].ql.flink = &data->freelist[i]; + data->freelist[i].ql.blink = &data->freelist[i]; + } + + th->th.th_local.bget_data = data; + th->th.th_local.bget_list = 0; +#if !USE_CMP_XCHG_FOR_BGET +#ifdef USE_QUEUING_LOCK_FOR_BGET + __kmp_init_lock(&th->th.th_local.bget_lock); +#else + __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock); +#endif /* USE_LOCK_FOR_BGET */ +#endif /* ! USE_CMP_XCHG_FOR_BGET */ +} + +static thr_data_t *get_thr_data(kmp_info_t *th) { + thr_data_t *data; + + data = (thr_data_t *)th->th.th_local.bget_data; + + KMP_DEBUG_ASSERT(data != 0); + + return data; +} + +/* Walk the free list and release the enqueued buffers */ +static void __kmp_bget_dequeue(kmp_info_t *th) { + void *p = TCR_SYNC_PTR(th->th.th_local.bget_list); + + if (p != 0) { +#if USE_CMP_XCHG_FOR_BGET + { + volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list); + while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, + CCAST(void *, old_value), nullptr)) { + KMP_CPU_PAUSE(); + old_value = TCR_SYNC_PTR(th->th.th_local.bget_list); + } + p = CCAST(void *, old_value); + } +#else /* ! 
USE_CMP_XCHG_FOR_BGET */ +#ifdef USE_QUEUING_LOCK_FOR_BGET + __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th)); +#else + __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock); +#endif /* USE_QUEUING_LOCK_FOR_BGET */ + + p = (void *)th->th.th_local.bget_list; + th->th.th_local.bget_list = 0; + +#ifdef USE_QUEUING_LOCK_FOR_BGET + __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th)); +#else + __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock); +#endif +#endif /* USE_CMP_XCHG_FOR_BGET */ + + /* Check again to make sure the list is not empty */ + while (p != 0) { + void *buf = p; + bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t)); + + KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0); + KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) == + (kmp_uintptr_t)th); // clear possible mark + KMP_DEBUG_ASSERT(b->ql.blink == 0); + + p = (void *)b->ql.flink; + + brel(th, buf); + } + } +} + +/* Chain together the free buffers by using the thread owner field */ +static void __kmp_bget_enqueue(kmp_info_t *th, void *buf +#ifdef USE_QUEUING_LOCK_FOR_BGET + , + kmp_int32 rel_gtid +#endif +) { + bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t)); + + KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0); + KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) == + (kmp_uintptr_t)th); // clear possible mark + + b->ql.blink = 0; + + KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n", + __kmp_gtid_from_thread(th))); + +#if USE_CMP_XCHG_FOR_BGET + { + volatile void *old_value = TCR_PTR(th->th.th_local.bget_list); + /* the next pointer must be set before setting bget_list to buf to avoid + exposing a broken list to other threads, even for an instant. */ + b->ql.flink = BFH(CCAST(void *, old_value)); + + while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list, + CCAST(void *, old_value), buf)) { + KMP_CPU_PAUSE(); + old_value = TCR_PTR(th->th.th_local.bget_list); + /* the next pointer must be set before setting bget_list to buf to avoid + exposing a broken list to other threads, even for an instant. */ + b->ql.flink = BFH(CCAST(void *, old_value)); + } + } +#else /* ! 
USE_CMP_XCHG_FOR_BGET */ +#ifdef USE_QUEUING_LOCK_FOR_BGET + __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid); +#else + __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock); +#endif + + b->ql.flink = BFH(th->th.th_local.bget_list); + th->th.th_local.bget_list = (void *)buf; + +#ifdef USE_QUEUING_LOCK_FOR_BGET + __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid); +#else + __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock); +#endif +#endif /* USE_CMP_XCHG_FOR_BGET */ +} + +/* insert buffer back onto a new freelist */ +static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) { + int bin; + + KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0); + KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0); + + bin = bget_get_bin(b->bh.bb.bsize); + + KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink == + &thr->freelist[bin]); + KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink == + &thr->freelist[bin]); + + b->ql.flink = &thr->freelist[bin]; + b->ql.blink = thr->freelist[bin].ql.blink; + + thr->freelist[bin].ql.blink = b; + b->ql.blink->ql.flink = b; +} + +/* unlink the buffer from the old freelist */ +static void __kmp_bget_remove_from_freelist(bfhead_t *b) { + KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b); + KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b); + + b->ql.blink->ql.flink = b->ql.flink; + b->ql.flink->ql.blink = b->ql.blink; +} + +/* GET STATS -- check info on free list */ +static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) { + thr_data_t *thr = get_thr_data(th); + int bin; + + *total_free = *max_free = 0; + + for (bin = 0; bin < MAX_BGET_BINS; ++bin) { + bfhead_t *b, *best; + + best = &thr->freelist[bin]; + b = best->ql.flink; + + while (b != &thr->freelist[bin]) { + *total_free += (b->bh.bb.bsize - sizeof(bhead_t)); + if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize)) + best = b; + + /* Link to next buffer */ + b = b->ql.flink; + } + + if (*max_free < best->bh.bb.bsize) + *max_free = best->bh.bb.bsize; + } + + if (*max_free > (bufsize)sizeof(bhead_t)) + *max_free -= sizeof(bhead_t); +} + +/* BGET -- Allocate a buffer. */ +static void *bget(kmp_info_t *th, bufsize requested_size) { + thr_data_t *thr = get_thr_data(th); + bufsize size = requested_size; + bfhead_t *b; + void *buf; + int compactseq = 0; + int use_blink = 0; + /* For BestFit */ + bfhead_t *best; + + if (size < 0 || size + sizeof(bhead_t) > MaxSize) { + return NULL; + } + + __kmp_bget_dequeue(th); /* Release any queued buffers */ + + if (size < (bufsize)SizeQ) { // Need at least room for the queue links. + size = SizeQ; + } +#if defined(SizeQuant) && (SizeQuant > 1) + size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1)); +#endif + + size += sizeof(bhead_t); // Add overhead in allocated buffer to size required. + KMP_DEBUG_ASSERT(size >= 0); + KMP_DEBUG_ASSERT(size % SizeQuant == 0); + + use_blink = (thr->mode == bget_mode_lifo); + + /* If a compact function was provided in the call to bectl(), wrap + a loop around the allocation process to allow compaction to + intervene in case we don't find a suitable buffer in the chain. */ + + for (;;) { + int bin; + + for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) { + /* Link to next buffer */ + b = (use_blink ? thr->freelist[bin].ql.blink + : thr->freelist[bin].ql.flink); + + if (thr->mode == bget_mode_best) { + best = &thr->freelist[bin]; + + /* Scan the free list searching for the first buffer big enough + to hold the requested size buffer. 
*/ + while (b != &thr->freelist[bin]) { + if (b->bh.bb.bsize >= (bufsize)size) { + if ((best == &thr->freelist[bin]) || + (b->bh.bb.bsize < best->bh.bb.bsize)) { + best = b; + } + } + + /* Link to next buffer */ + b = (use_blink ? b->ql.blink : b->ql.flink); + } + b = best; + } + + while (b != &thr->freelist[bin]) { + if ((bufsize)b->bh.bb.bsize >= (bufsize)size) { + + // Buffer is big enough to satisfy the request. Allocate it to the + // caller. We must decide whether the buffer is large enough to split + // into the part given to the caller and a free buffer that remains + // on the free list, or whether the entire buffer should be removed + // from the free list and given to the caller in its entirety. We + // only split the buffer if enough room remains for a header plus the + // minimum quantum of allocation. + if ((b->bh.bb.bsize - (bufsize)size) > + (bufsize)(SizeQ + (sizeof(bhead_t)))) { + bhead_t *ba, *bn; + + ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size)); + bn = BH(((char *)ba) + size); + + KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize); + + /* Subtract size from length of free block. */ + b->bh.bb.bsize -= (bufsize)size; + + /* Link allocated buffer to the previous free buffer. */ + ba->bb.prevfree = b->bh.bb.bsize; + + /* Plug negative size into user buffer. */ + ba->bb.bsize = -size; + + /* Mark this buffer as owned by this thread. */ + TCW_PTR(ba->bb.bthr, + th); // not an allocated address (do not mark it) + /* Mark buffer after this one not preceded by free block. */ + bn->bb.prevfree = 0; + + // unlink buffer from old freelist, and reinsert into new freelist + __kmp_bget_remove_from_freelist(b); + __kmp_bget_insert_into_freelist(thr, b); +#if BufStats + thr->totalloc += (size_t)size; + thr->numget++; /* Increment number of bget() calls */ +#endif + buf = (void *)((((char *)ba) + sizeof(bhead_t))); + KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0); + return buf; + } else { + bhead_t *ba; + + ba = BH(((char *)b) + b->bh.bb.bsize); + + KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize); + + /* The buffer isn't big enough to split. Give the whole + shebang to the caller and remove it from the free list. */ + + __kmp_bget_remove_from_freelist(b); +#if BufStats + thr->totalloc += (size_t)b->bh.bb.bsize; + thr->numget++; /* Increment number of bget() calls */ +#endif + /* Negate size to mark buffer allocated. */ + b->bh.bb.bsize = -(b->bh.bb.bsize); + + /* Mark this buffer as owned by this thread. */ + TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark) + /* Zero the back pointer in the next buffer in memory + to indicate that this buffer is allocated. */ + ba->bb.prevfree = 0; + + /* Give user buffer starting at queue links. */ + buf = (void *)&(b->ql); + KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0); + return buf; + } + } + + /* Link to next buffer */ + b = (use_blink ? b->ql.blink : b->ql.flink); + } + } + + /* We failed to find a buffer. If there's a compact function defined, + notify it of the size requested. If it returns TRUE, try the allocation + again. */ + + if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) { + break; + } + } + + /* No buffer available with requested size free. */ + + /* Don't give up yet -- look in the reserve supply. */ + if (thr->acqfcn != 0) { + if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) { + /* Request is too large to fit in a single expansion block. + Try to satisfy it by a direct buffer acquisition. 
*/ + bdhead_t *bdh; + + size += sizeof(bdhead_t) - sizeof(bhead_t); + + KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size)); + + /* richryan */ + bdh = BDH((*thr->acqfcn)((bufsize)size)); + if (bdh != NULL) { + + // Mark the buffer special by setting size field of its header to zero. + bdh->bh.bb.bsize = 0; + + /* Mark this buffer as owned by this thread. */ + TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated, + // because direct buffer never goes to free list + bdh->bh.bb.prevfree = 0; + bdh->tsize = size; +#if BufStats + thr->totalloc += (size_t)size; + thr->numget++; /* Increment number of bget() calls */ + thr->numdget++; /* Direct bget() call count */ +#endif + buf = (void *)(bdh + 1); + KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0); + return buf; + } + + } else { + + /* Try to obtain a new expansion block */ + void *newpool; + + KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr)); + + /* richryan */ + newpool = (*thr->acqfcn)((bufsize)thr->exp_incr); + KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0); + if (newpool != NULL) { + bpool(th, newpool, thr->exp_incr); + buf = bget( + th, requested_size); /* This can't, I say, can't get into a loop. */ + return buf; + } + } + } + + /* Still no buffer available */ + + return NULL; +} + +/* BGETZ -- Allocate a buffer and clear its contents to zero. We clear + the entire contents of the buffer to zero, not just the + region requested by the caller. */ + +static void *bgetz(kmp_info_t *th, bufsize size) { + char *buf = (char *)bget(th, size); + + if (buf != NULL) { + bhead_t *b; + bufsize rsize; + + b = BH(buf - sizeof(bhead_t)); + rsize = -(b->bb.bsize); + if (rsize == 0) { + bdhead_t *bd; + + bd = BDH(buf - sizeof(bdhead_t)); + rsize = bd->tsize - (bufsize)sizeof(bdhead_t); + } else { + rsize -= sizeof(bhead_t); + } + + KMP_DEBUG_ASSERT(rsize >= size); + + (void)memset(buf, 0, (bufsize)rsize); + } + return ((void *)buf); +} + +/* BGETR -- Reallocate a buffer. This is a minimal implementation, + simply in terms of brel() and bget(). It could be + enhanced to allow the buffer to grow into adjacent free + blocks and to avoid moving data unnecessarily. */ + +static void *bgetr(kmp_info_t *th, void *buf, bufsize size) { + void *nbuf; + bufsize osize; /* Old size of buffer */ + bhead_t *b; + + nbuf = bget(th, size); + if (nbuf == NULL) { /* Acquire new buffer */ + return NULL; + } + if (buf == NULL) { + return nbuf; + } + b = BH(((char *)buf) - sizeof(bhead_t)); + osize = -b->bb.bsize; + if (osize == 0) { + /* Buffer acquired directly through acqfcn. */ + bdhead_t *bd; + + bd = BDH(((char *)buf) - sizeof(bdhead_t)); + osize = bd->tsize - (bufsize)sizeof(bdhead_t); + } else { + osize -= sizeof(bhead_t); + } + + KMP_DEBUG_ASSERT(osize > 0); + + (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */ + (size_t)((size < osize) ? size : osize)); + brel(th, buf); + + return nbuf; +} + +/* BREL -- Release a buffer. */ +static void brel(kmp_info_t *th, void *buf) { + thr_data_t *thr = get_thr_data(th); + bfhead_t *b, *bn; + kmp_info_t *bth; + + KMP_DEBUG_ASSERT(buf != NULL); + KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0); + + b = BFH(((char *)buf) - sizeof(bhead_t)); + + if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? 
*/ + bdhead_t *bdh; + + bdh = BDH(((char *)buf) - sizeof(bdhead_t)); + KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0); +#if BufStats + thr->totalloc -= (size_t)bdh->tsize; + thr->numdrel++; /* Number of direct releases */ + thr->numrel++; /* Increment number of brel() calls */ +#endif /* BufStats */ +#ifdef FreeWipe + (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t))); +#endif /* FreeWipe */ + + KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh)); + + KMP_DEBUG_ASSERT(thr->relfcn != 0); + (*thr->relfcn)((void *)bdh); /* Release it directly. */ + return; + } + + bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & + ~1); // clear possible mark before comparison + if (bth != th) { + /* Add this buffer to be released by the owning thread later */ + __kmp_bget_enqueue(bth, buf +#ifdef USE_QUEUING_LOCK_FOR_BGET + , + __kmp_gtid_from_thread(th) +#endif + ); + return; + } + + /* Buffer size must be negative, indicating that the buffer is allocated. */ + if (b->bh.bb.bsize >= 0) { + bn = NULL; + } + KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0); + + /* Back pointer in next buffer must be zero, indicating the same thing: */ + + KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0); + +#if BufStats + thr->numrel++; /* Increment number of brel() calls */ + thr->totalloc += (size_t)b->bh.bb.bsize; +#endif + + /* If the back link is nonzero, the previous buffer is free. */ + + if (b->bh.bb.prevfree != 0) { + /* The previous buffer is free. Consolidate this buffer with it by adding + the length of this buffer to the previous free buffer. Note that we + subtract the size in the buffer being released, since it's negative to + indicate that the buffer is allocated. */ + bufsize size = b->bh.bb.bsize; + + /* Make the previous buffer the one we're working on. */ + KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize == + b->bh.bb.prevfree); + b = BFH(((char *)b) - b->bh.bb.prevfree); + b->bh.bb.bsize -= size; + + /* unlink the buffer from the old freelist */ + __kmp_bget_remove_from_freelist(b); + } else { + /* The previous buffer isn't allocated. Mark this buffer size as positive + (i.e. free) and fall through to place the buffer on the free list as an + isolated free block. */ + b->bh.bb.bsize = -b->bh.bb.bsize; + } + + /* insert buffer back onto a new freelist */ + __kmp_bget_insert_into_freelist(thr, b); + + /* Now we look at the next buffer in memory, located by advancing from + the start of this buffer by its size, to see if that buffer is + free. If it is, we combine this buffer with the next one in + memory, dechaining the second buffer from the free list. */ + bn = BFH(((char *)b) + b->bh.bb.bsize); + if (bn->bh.bb.bsize > 0) { + + /* The buffer is free. Remove it from the free list and add + its size to that of our buffer. */ + KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree == + bn->bh.bb.bsize); + + __kmp_bget_remove_from_freelist(bn); + + b->bh.bb.bsize += bn->bh.bb.bsize; + + /* unlink the buffer from the old freelist, and reinsert it into the new + * freelist */ + __kmp_bget_remove_from_freelist(b); + __kmp_bget_insert_into_freelist(thr, b); + + /* Finally, advance to the buffer that follows the newly + consolidated free block. We must set its backpointer to the + head of the consolidated free block. We know the next block + must be an allocated block because the process of recombination + guarantees that two free blocks will never be contiguous in + memory. 
*/ + bn = BFH(((char *)b) + b->bh.bb.bsize); + } +#ifdef FreeWipe + (void)memset(((char *)b) + sizeof(bfhead_t), 0x55, + (size_t)(b->bh.bb.bsize - sizeof(bfhead_t))); +#endif + KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0); + + /* The next buffer is allocated. Set the backpointer in it to point + to this buffer; the previous free buffer in memory. */ + + bn->bh.bb.prevfree = b->bh.bb.bsize; + + /* If a block-release function is defined, and this free buffer + constitutes the entire block, release it. Note that pool_len + is defined in such a way that the test will fail unless all + pool blocks are the same size. */ + if (thr->relfcn != 0 && + b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) { +#if BufStats + if (thr->numpblk != + 1) { /* Do not release the last buffer until finalization time */ +#endif + + KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0); + KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent); + KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree == + b->bh.bb.bsize); + + /* Unlink the buffer from the free list */ + __kmp_bget_remove_from_freelist(b); + + KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b)); + + (*thr->relfcn)(b); +#if BufStats + thr->numprel++; /* Nr of expansion block releases */ + thr->numpblk--; /* Total number of blocks */ + KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel); + + // avoid leaving stale last_pool pointer around if it is being dealloced + if (thr->last_pool == b) + thr->last_pool = 0; + } else { + thr->last_pool = b; + } +#endif /* BufStats */ + } +} + +/* BECTL -- Establish automatic pool expansion control */ +static void bectl(kmp_info_t *th, bget_compact_t compact, + bget_acquire_t acquire, bget_release_t release, + bufsize pool_incr) { + thr_data_t *thr = get_thr_data(th); + + thr->compfcn = compact; + thr->acqfcn = acquire; + thr->relfcn = release; + thr->exp_incr = pool_incr; +} + +/* BPOOL -- Add a region of memory to the buffer pool. */ +static void bpool(kmp_info_t *th, void *buf, bufsize len) { + /* int bin = 0; */ + thr_data_t *thr = get_thr_data(th); + bfhead_t *b = BFH(buf); + bhead_t *bn; + + __kmp_bget_dequeue(th); /* Release any queued buffers */ + +#ifdef SizeQuant + len &= ~((bufsize)(SizeQuant - 1)); +#endif + if (thr->pool_len == 0) { + thr->pool_len = len; + } else if (len != thr->pool_len) { + thr->pool_len = -1; + } +#if BufStats + thr->numpget++; /* Number of block acquisitions */ + thr->numpblk++; /* Number of blocks total */ + KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel); +#endif /* BufStats */ + + /* Since the block is initially occupied by a single free buffer, + it had better not be (much) larger than the largest buffer + whose size we can store in bhead.bb.bsize. */ + KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1)); + + /* Clear the backpointer at the start of the block to indicate that + there is no free block prior to this one. That blocks + recombination when the first block in memory is released. */ + b->bh.bb.prevfree = 0; + + /* Create a dummy allocated buffer at the end of the pool. This dummy + buffer is seen when a buffer at the end of the pool is released and + blocks recombination of the last buffer with the dummy buffer at + the end. The length in the dummy buffer is set to the largest + negative number to denote the end of the pool for diagnostic + routines (this specific value is not counted on by the actual + allocation and release functions). 
*/ + len -= sizeof(bhead_t); + b->bh.bb.bsize = (bufsize)len; + /* Set the owner of this buffer */ + TCW_PTR(b->bh.bb.bthr, + (kmp_info_t *)((kmp_uintptr_t)th | + 1)); // mark the buffer as allocated address + + /* Chain the new block to the free list. */ + __kmp_bget_insert_into_freelist(thr, b); + +#ifdef FreeWipe + (void)memset(((char *)b) + sizeof(bfhead_t), 0x55, + (size_t)(len - sizeof(bfhead_t))); +#endif + bn = BH(((char *)b) + len); + bn->bb.prevfree = (bufsize)len; + /* Definition of ESent assumes two's complement! */ + KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0)); + + bn->bb.bsize = ESent; +} + +/* BFREED -- Dump the free lists for this thread. */ +static void bfreed(kmp_info_t *th) { + int bin = 0, count = 0; + int gtid = __kmp_gtid_from_thread(th); + thr_data_t *thr = get_thr_data(th); + +#if BufStats + __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC + " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC + " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC + " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC + " drel=%" KMP_INT64_SPEC "\n", + gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget, + (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk, + (kmp_int64)thr->numpget, (kmp_int64)thr->numprel, + (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel); +#endif + + for (bin = 0; bin < MAX_BGET_BINS; ++bin) { + bfhead_t *b; + + for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin]; + b = b->ql.flink) { + bufsize bs = b->bh.bb.bsize; + + KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b); + KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b); + KMP_DEBUG_ASSERT(bs > 0); + + count += 1; + + __kmp_printf_no_lock( + "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b, + (long)bs); +#ifdef FreeWipe + { + char *lerr = ((char *)b) + sizeof(bfhead_t); + if ((bs > sizeof(bfhead_t)) && + ((*lerr != 0x55) || + (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) != + 0))) { + __kmp_printf_no_lock("__kmp_printpool: T#%d (Contents of above " + "free block have been overstored.)\n", + gtid); + } + } +#endif + } + } + + if (count == 0) + __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid); +} + +void __kmp_initialize_bget(kmp_info_t *th) { + KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0)); + + set_thr_data(th); + + bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free, + (bufsize)__kmp_malloc_pool_incr); +} + +void __kmp_finalize_bget(kmp_info_t *th) { + thr_data_t *thr; + bfhead_t *b; + + KMP_DEBUG_ASSERT(th != 0); + +#if BufStats + thr = (thr_data_t *)th->th.th_local.bget_data; + KMP_DEBUG_ASSERT(thr != NULL); + b = thr->last_pool; + + /* If a block-release function is defined, and this free buffer constitutes + the entire block, release it. Note that pool_len is defined in such a way + that the test will fail unless all pool blocks are the same size. 
*/ + + // Deallocate the last pool if one exists because we no longer do it in brel() + if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 && + b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) { + KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0); + KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent); + KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree == + b->bh.bb.bsize); + + /* Unlink the buffer from the free list */ + __kmp_bget_remove_from_freelist(b); + + KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b)); + + (*thr->relfcn)(b); + thr->numprel++; /* Nr of expansion block releases */ + thr->numpblk--; /* Total number of blocks */ + KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel); + } +#endif /* BufStats */ + + /* Deallocate bget_data */ + if (th->th.th_local.bget_data != NULL) { + __kmp_free(th->th.th_local.bget_data); + th->th.th_local.bget_data = NULL; + } +} + +void kmpc_set_poolsize(size_t size) { + bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc, + (bget_release_t)free, (bufsize)size); +} + +size_t kmpc_get_poolsize(void) { + thr_data_t *p; + + p = get_thr_data(__kmp_get_thread()); + + return p->exp_incr; +} + +void kmpc_set_poolmode(int mode) { + thr_data_t *p; + + if (mode == bget_mode_fifo || mode == bget_mode_lifo || + mode == bget_mode_best) { + p = get_thr_data(__kmp_get_thread()); + p->mode = (bget_mode_t)mode; + } +} + +int kmpc_get_poolmode(void) { + thr_data_t *p; + + p = get_thr_data(__kmp_get_thread()); + + return p->mode; +} + +void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) { + kmp_info_t *th = __kmp_get_thread(); + bufsize a, b; + + __kmp_bget_dequeue(th); /* Release any queued buffers */ + + bcheck(th, &a, &b); + + *maxmem = a; + *allmem = b; +} + +void kmpc_poolprint(void) { + kmp_info_t *th = __kmp_get_thread(); + + __kmp_bget_dequeue(th); /* Release any queued buffers */ + + bfreed(th); +} + +#endif // #if KMP_USE_BGET + +void *kmpc_malloc(size_t size) { + void *ptr; + ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr))); + if (ptr != NULL) { + // save allocated pointer just before one returned to user + *(void **)ptr = ptr; + ptr = (void **)ptr + 1; + } + return ptr; +} + +#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0) + +void *kmpc_aligned_malloc(size_t size, size_t alignment) { + void *ptr; + void *ptr_allocated; + KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big + if (!IS_POWER_OF_TWO(alignment)) { + // AC: do we need to issue a warning here? + errno = EINVAL; + return NULL; + } + size = size + sizeof(void *) + alignment; + ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size); + if (ptr_allocated != NULL) { + // save allocated pointer just before one returned to user + ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) & + ~(alignment - 1)); + *((void **)ptr - 1) = ptr_allocated; + } else { + ptr = NULL; + } + return ptr; +} + +void *kmpc_calloc(size_t nelem, size_t elsize) { + void *ptr; + ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr))); + if (ptr != NULL) { + // save allocated pointer just before one returned to user + *(void **)ptr = ptr; + ptr = (void **)ptr + 1; + } + return ptr; +} + +void *kmpc_realloc(void *ptr, size_t size) { + void *result = NULL; + if (ptr == NULL) { + // If pointer is NULL, realloc behaves like malloc. 
+ result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr))); + // save allocated pointer just before one returned to user + if (result != NULL) { + *(void **)result = result; + result = (void **)result + 1; + } + } else if (size == 0) { + // If size is 0, realloc behaves like free. + // The thread must be registered by the call to kmpc_malloc() or + // kmpc_calloc() before. + // So it should be safe to call __kmp_get_thread(), not + // __kmp_entry_thread(). + KMP_ASSERT(*((void **)ptr - 1)); + brel(__kmp_get_thread(), *((void **)ptr - 1)); + } else { + result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1), + (bufsize)(size + sizeof(ptr))); + if (result != NULL) { + *(void **)result = result; + result = (void **)result + 1; + } + } + return result; +} + +// NOTE: the library must have already been initialized by a previous allocate +void kmpc_free(void *ptr) { + if (!__kmp_init_serial) { + return; + } + if (ptr != NULL) { + kmp_info_t *th = __kmp_get_thread(); + __kmp_bget_dequeue(th); /* Release any queued buffers */ + // extract allocated pointer and free it + KMP_ASSERT(*((void **)ptr - 1)); + brel(th, *((void **)ptr - 1)); + } +} + +void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) { + void *ptr; + KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th, + (int)size KMP_SRC_LOC_PARM)); + ptr = bget(th, (bufsize)size); + KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr)); + return ptr; +} + +void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem, + size_t elsize KMP_SRC_LOC_DECL) { + void *ptr; + KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th, + (int)nelem, (int)elsize KMP_SRC_LOC_PARM)); + ptr = bgetz(th, (bufsize)(nelem * elsize)); + KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr)); + return ptr; +} + +void *___kmp_thread_realloc(kmp_info_t *th, void *ptr, + size_t size KMP_SRC_LOC_DECL) { + KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th, + ptr, (int)size KMP_SRC_LOC_PARM)); + ptr = bgetr(th, ptr, (bufsize)size); + KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr)); + return ptr; +} + +void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) { + KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th, + ptr KMP_SRC_LOC_PARM)); + if (ptr != NULL) { + __kmp_bget_dequeue(th); /* Release any queued buffers */ + brel(th, ptr); + } + KE_TRACE(30, ("<- __kmp_thread_free()\n")); +} + +/* OMP 5.0 Memory Management support */ +static const char *kmp_mk_lib_name; +static void *h_memkind; +/* memkind experimental API: */ +// memkind_alloc +static void *(*kmp_mk_alloc)(void *k, size_t sz); +// memkind_free +static void (*kmp_mk_free)(void *kind, void *ptr); +// memkind_check_available +static int (*kmp_mk_check)(void *kind); +// kinds we are going to use +static void **mk_default; +static void **mk_interleave; +static void **mk_hbw; +static void **mk_hbw_interleave; +static void **mk_hbw_preferred; +static void **mk_hugetlb; +static void **mk_hbw_hugetlb; +static void **mk_hbw_preferred_hugetlb; +static void **mk_dax_kmem; +static void **mk_dax_kmem_all; +static void **mk_dax_kmem_preferred; +static void *(*kmp_target_alloc_host)(size_t size, int device); +static void *(*kmp_target_alloc_shared)(size_t size, int device); +static void *(*kmp_target_alloc_device)(size_t size, int device); +static void *(*kmp_target_lock_mem)(void *ptr, size_t size, int device); +static void *(*kmp_target_unlock_mem)(void *ptr, int device); +static 
void *(*kmp_target_free_host)(void *ptr, int device); +static void *(*kmp_target_free_shared)(void *ptr, int device); +static void *(*kmp_target_free_device)(void *ptr, int device); +static bool __kmp_target_mem_available; +#define KMP_IS_TARGET_MEM_SPACE(MS) \ + (MS == llvm_omp_target_host_mem_space || \ + MS == llvm_omp_target_shared_mem_space || \ + MS == llvm_omp_target_device_mem_space) +#define KMP_IS_TARGET_MEM_ALLOC(MA) \ + (MA == llvm_omp_target_host_mem_alloc || \ + MA == llvm_omp_target_shared_mem_alloc || \ + MA == llvm_omp_target_device_mem_alloc) + +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN +static inline void chk_kind(void ***pkind) { + KMP_DEBUG_ASSERT(pkind); + if (*pkind) // symbol found + if (kmp_mk_check(**pkind)) // kind not available or error + *pkind = NULL; +} +#endif + +void __kmp_init_memkind() { +// as of 2018-07-31 memkind does not support Windows*, exclude it for now +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN + // use of statically linked memkind is problematic, as it depends on libnuma + kmp_mk_lib_name = "libmemkind.so"; + h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY); + if (h_memkind) { + kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available"); + kmp_mk_alloc = + (void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc"); + kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free"); + mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT"); + if (kmp_mk_check && kmp_mk_alloc && kmp_mk_free && mk_default && + !kmp_mk_check(*mk_default)) { + __kmp_memkind_available = 1; + mk_interleave = (void **)dlsym(h_memkind, "MEMKIND_INTERLEAVE"); + chk_kind(&mk_interleave); + mk_hbw = (void **)dlsym(h_memkind, "MEMKIND_HBW"); + chk_kind(&mk_hbw); + mk_hbw_interleave = (void **)dlsym(h_memkind, "MEMKIND_HBW_INTERLEAVE"); + chk_kind(&mk_hbw_interleave); + mk_hbw_preferred = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED"); + chk_kind(&mk_hbw_preferred); + mk_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HUGETLB"); + chk_kind(&mk_hugetlb); + mk_hbw_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_HUGETLB"); + chk_kind(&mk_hbw_hugetlb); + mk_hbw_preferred_hugetlb = + (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB"); + chk_kind(&mk_hbw_preferred_hugetlb); + mk_dax_kmem = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM"); + chk_kind(&mk_dax_kmem); + mk_dax_kmem_all = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_ALL"); + chk_kind(&mk_dax_kmem_all); + mk_dax_kmem_preferred = + (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_PREFERRED"); + chk_kind(&mk_dax_kmem_preferred); + KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n")); + return; // success + } + dlclose(h_memkind); // failure + } +#else // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB) + kmp_mk_lib_name = ""; +#endif // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB) + h_memkind = NULL; + kmp_mk_check = NULL; + kmp_mk_alloc = NULL; + kmp_mk_free = NULL; + mk_default = NULL; + mk_interleave = NULL; + mk_hbw = NULL; + mk_hbw_interleave = NULL; + mk_hbw_preferred = NULL; + mk_hugetlb = NULL; + mk_hbw_hugetlb = NULL; + mk_hbw_preferred_hugetlb = NULL; + mk_dax_kmem = NULL; + mk_dax_kmem_all = NULL; + mk_dax_kmem_preferred = NULL; +} + +void __kmp_fini_memkind() { +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB + if (__kmp_memkind_available) + KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n")); + if (h_memkind) { + dlclose(h_memkind); + h_memkind = NULL; + } + kmp_mk_check = NULL; + kmp_mk_alloc = NULL; + kmp_mk_free = NULL; + mk_default = NULL; + mk_interleave = NULL; + 
mk_hbw = NULL; + mk_hbw_interleave = NULL; + mk_hbw_preferred = NULL; + mk_hugetlb = NULL; + mk_hbw_hugetlb = NULL; + mk_hbw_preferred_hugetlb = NULL; + mk_dax_kmem = NULL; + mk_dax_kmem_all = NULL; + mk_dax_kmem_preferred = NULL; +#endif +} + +void __kmp_init_target_mem() { + *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host"); + *(void **)(&kmp_target_alloc_shared) = + KMP_DLSYM("llvm_omp_target_alloc_shared"); + *(void **)(&kmp_target_alloc_device) = + KMP_DLSYM("llvm_omp_target_alloc_device"); + *(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host"); + *(void **)(&kmp_target_free_shared) = + KMP_DLSYM("llvm_omp_target_free_shared"); + *(void **)(&kmp_target_free_device) = + KMP_DLSYM("llvm_omp_target_free_device"); + __kmp_target_mem_available = + kmp_target_alloc_host && kmp_target_alloc_shared && + kmp_target_alloc_device && kmp_target_free_host && + kmp_target_free_shared && kmp_target_free_device; + // lock/pin and unlock/unpin target calls + *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem"); + *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem"); +} + +omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, + int ntraits, + omp_alloctrait_t traits[]) { + // OpenMP 5.0 only allows predefined memspaces + KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space || + ms == omp_large_cap_mem_space || ms == omp_const_mem_space || + ms == omp_high_bw_mem_space || KMP_IS_TARGET_MEM_SPACE(ms)); + kmp_allocator_t *al; + int i; + al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed + al->memspace = ms; // not used currently + for (i = 0; i < ntraits; ++i) { + switch (traits[i].key) { + case omp_atk_sync_hint: + case omp_atk_access: + break; + case omp_atk_pinned: + al->pinned = true; + break; + case omp_atk_alignment: + __kmp_type_convert(traits[i].value, &(al->alignment)); + KMP_ASSERT(IS_POWER_OF_TWO(al->alignment)); + break; + case omp_atk_pool_size: + al->pool_size = traits[i].value; + break; + case omp_atk_fallback: + al->fb = (omp_alloctrait_value_t)traits[i].value; + KMP_DEBUG_ASSERT( + al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb || + al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb); + break; + case omp_atk_fb_data: + al->fb_data = RCAST(kmp_allocator_t *, traits[i].value); + break; + case omp_atk_partition: + al->memkind = RCAST(void **, traits[i].value); + break; + default: + KMP_ASSERT2(0, "Unexpected allocator trait"); + } + } + if (al->fb == 0) { + // set default allocator + al->fb = omp_atv_default_mem_fb; + al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al->fb_data != NULL); + } else if (al->fb == omp_atv_default_mem_fb) { + al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc; + } + if (__kmp_memkind_available) { + // Let's use memkind library if available + if (ms == omp_high_bw_mem_space) { + if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) { + al->memkind = mk_hbw_interleave; + } else if (mk_hbw_preferred) { + // AC: do not try to use MEMKIND_HBW for now, because memkind library + // cannot reliably detect exhaustion of HBW memory. + // It could be possible using hbw_verify_memory_region() but memkind + // manual says: "Using this function in production code may result in + // serious performance penalty". 
+ al->memkind = mk_hbw_preferred; + } else { + // HBW is requested but not available --> return NULL allocator + __kmp_free(al); + return omp_null_allocator; + } + } else if (ms == omp_large_cap_mem_space) { + if (mk_dax_kmem_all) { + // All pmem nodes are visited + al->memkind = mk_dax_kmem_all; + } else if (mk_dax_kmem) { + // Only closest pmem node is visited + al->memkind = mk_dax_kmem; + } else { + __kmp_free(al); + return omp_null_allocator; + } + } else { + if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) { + al->memkind = mk_interleave; + } else { + al->memkind = mk_default; + } + } + } else if (KMP_IS_TARGET_MEM_SPACE(ms) && !__kmp_target_mem_available) { + __kmp_free(al); + return omp_null_allocator; + } else { + if (ms == omp_high_bw_mem_space) { + // cannot detect HBW memory presence without memkind library + __kmp_free(al); + return omp_null_allocator; + } + } + return (omp_allocator_handle_t)al; +} + +void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) { + if (allocator > kmp_max_mem_alloc) + __kmp_free(allocator); +} + +void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) { + if (allocator == omp_null_allocator) + allocator = omp_default_mem_alloc; + __kmp_threads[gtid]->th.th_def_allocator = allocator; +} + +omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) { + return __kmp_threads[gtid]->th.th_def_allocator; +} + +typedef struct kmp_mem_desc { // Memory block descriptor + void *ptr_alloc; // Pointer returned by allocator + size_t size_a; // Size of allocated memory block (initial+descriptor+align) + size_t size_orig; // Original size requested + void *ptr_align; // Pointer to aligned memory, returned + kmp_allocator_t *allocator; // allocator +} kmp_mem_desc_t; +static int alignment = sizeof(void *); // align to pointer size by default + +// external interfaces are wrappers over internal implementation +void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator)); + void *ptr = __kmp_alloc(gtid, 0, size, allocator); + KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size, + omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn, + (int)size, allocator)); + void *ptr = __kmp_alloc(gtid, algn, size, allocator); + KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_calloc(int gtid, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb, + (int)size, allocator)); + void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator); + KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid)); + return ptr; +} + +void *__kmpc_realloc(int gtid, void *ptr, size_t size, + omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { + KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size, + allocator, free_allocator)); + void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator); + KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid)); + return nptr; +} + +void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { + KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator)); + ___kmpc_free(gtid, ptr, allocator); + KE_TRACE(10, ("__kmpc_free: T#%d freed %p 
(%p)\n", gtid, ptr, allocator)); + return; +} + +// internal implementation, called from inside the library +void *__kmp_alloc(int gtid, size_t algn, size_t size, + omp_allocator_handle_t allocator) { + void *ptr = NULL; + kmp_allocator_t *al; + KMP_DEBUG_ASSERT(__kmp_init_serial); + if (size == 0) + return NULL; + if (allocator == omp_null_allocator) + allocator = __kmp_threads[gtid]->th.th_def_allocator; + kmp_int32 default_device = + __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device; + + al = RCAST(kmp_allocator_t *, allocator); + + int sz_desc = sizeof(kmp_mem_desc_t); + kmp_mem_desc_t desc; + kmp_uintptr_t addr; // address returned by allocator + kmp_uintptr_t addr_align; // address to return to caller + kmp_uintptr_t addr_descr; // address of memory block descriptor + size_t align = alignment; // default alignment + if (allocator > kmp_max_mem_alloc && al->alignment > align) + align = al->alignment; // alignment required by allocator trait + if (align < algn) + align = algn; // max of allocator trait, parameter and sizeof(void*) + desc.size_orig = size; + desc.size_a = size + sz_desc + align; + bool is_pinned = false; + if (allocator > kmp_max_mem_alloc) + is_pinned = al->pinned; + + // Use default allocator if libmemkind is not available + int use_default_allocator = (__kmp_memkind_available) ? false : true; + + if (KMP_IS_TARGET_MEM_ALLOC(allocator)) { + // Use size input directly as the memory may not be accessible on host. + // Use default device for now. + if (__kmp_target_mem_available) { + kmp_int32 device = + __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device; + if (allocator == llvm_omp_target_host_mem_alloc) + ptr = kmp_target_alloc_host(size, device); + else if (allocator == llvm_omp_target_shared_mem_alloc) + ptr = kmp_target_alloc_shared(size, device); + else // allocator == llvm_omp_target_device_mem_alloc + ptr = kmp_target_alloc_device(size, device); + return ptr; + } else { + KMP_INFORM(TargetMemNotAvailable); + } + } + + if (allocator >= kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)) { + if (__kmp_target_mem_available) { + kmp_int32 device = + __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device; + if (al->memspace == llvm_omp_target_host_mem_space) + ptr = kmp_target_alloc_host(size, device); + else if (al->memspace == llvm_omp_target_shared_mem_space) + ptr = kmp_target_alloc_shared(size, device); + else // al->memspace == llvm_omp_target_device_mem_space + ptr = kmp_target_alloc_device(size, device); + return ptr; + } else { + KMP_INFORM(TargetMemNotAvailable); + } + } + + if (__kmp_memkind_available) { + if (allocator < kmp_max_mem_alloc) { + // pre-defined allocator + if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) { + ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a); + } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) { + ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a); + } else { + ptr = kmp_mk_alloc(*mk_default, desc.size_a); + } + } else if (al->pool_size > 0) { + // custom allocator with pool size requested + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a); + if (used + desc.size_a > al->pool_size) { + // not enough space, need to go fallback path + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = kmp_mk_alloc(*mk_default, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } 
else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + if (is_pinned && kmp_target_lock_mem) + kmp_target_lock_mem(ptr, size, default_device); + return ptr; + } // else ptr == NULL; + } else { + // pool has enough space + ptr = kmp_mk_alloc(*al->memkind, desc.size_a); + if (ptr == NULL) { + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = kmp_mk_alloc(*mk_default, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + if (is_pinned && kmp_target_lock_mem) + kmp_target_lock_mem(ptr, size, default_device); + return ptr; + } + } + } + } else { + // custom allocator, pool size not requested + ptr = kmp_mk_alloc(*al->memkind, desc.size_a); + if (ptr == NULL) { + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = kmp_mk_alloc(*mk_default, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + if (is_pinned && kmp_target_lock_mem) + kmp_target_lock_mem(ptr, size, default_device); + return ptr; + } + } + } + } else if (allocator < kmp_max_mem_alloc) { + // pre-defined allocator + if (allocator == omp_high_bw_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc"); + } else if (allocator == omp_large_cap_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc"); + } else if (allocator == omp_const_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc"); + } else if (allocator == omp_low_lat_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc"); + } else if (allocator == omp_cgroup_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc"); + } else if (allocator == omp_pteam_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc"); + } else if (allocator == omp_thread_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc"); + } else { // default allocator requested + use_default_allocator = true; + } + if (use_default_allocator) { + ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); + use_default_allocator = false; + } + } else if (al->pool_size > 0) { + // custom allocator with pool size requested + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a); + if (used + desc.size_a > al->pool_size) { + // not enough space, need to go fallback path + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + if (is_pinned && kmp_target_lock_mem) + kmp_target_lock_mem(ptr, size, default_device); + return ptr; + } // else ptr == NULL; + } else { + // pool has enough space + ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); 
+ if (ptr == NULL && al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } // no sense to look for another fallback because of same internal alloc + } + } else { + // custom allocator, pool size not requested + ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); + if (ptr == NULL && al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } // no sense to look for another fallback because of same internal alloc + } + KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a)); + if (ptr == NULL) + return NULL; + + if (is_pinned && kmp_target_lock_mem) + kmp_target_lock_mem(ptr, desc.size_a, default_device); + + addr = (kmp_uintptr_t)ptr; + addr_align = (addr + sz_desc + align - 1) & ~(align - 1); + addr_descr = addr_align - sz_desc; + + desc.ptr_alloc = ptr; + desc.ptr_align = (void *)addr_align; + desc.allocator = al; + *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents + KMP_MB(); + + return desc.ptr_align; +} + +void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + void *ptr = NULL; + kmp_allocator_t *al; + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (allocator == omp_null_allocator) + allocator = __kmp_threads[gtid]->th.th_def_allocator; + + al = RCAST(kmp_allocator_t *, allocator); + + if (nmemb == 0 || size == 0) + return ptr; + + if ((SIZE_MAX - sizeof(kmp_mem_desc_t)) / size < nmemb) { + if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); + } + return ptr; + } + + ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator); + + if (ptr) { + memset(ptr, 0x00, nmemb * size); + } + return ptr; +} + +void *__kmp_realloc(int gtid, void *ptr, size_t size, + omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { + void *nptr = NULL; + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (size == 0) { + if (ptr != NULL) + ___kmpc_free(gtid, ptr, free_allocator); + return nptr; + } + + nptr = __kmp_alloc(gtid, 0, size, allocator); + + if (nptr != NULL && ptr != NULL) { + kmp_mem_desc_t desc; + kmp_uintptr_t addr_align; // address to return to caller + kmp_uintptr_t addr_descr; // address of memory block descriptor + + addr_align = (kmp_uintptr_t)ptr; + addr_descr = addr_align - sizeof(kmp_mem_desc_t); + desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor + + KMP_DEBUG_ASSERT(desc.ptr_align == ptr); + KMP_DEBUG_ASSERT(desc.size_orig > 0); + KMP_DEBUG_ASSERT(desc.size_orig < desc.size_a); + KMP_MEMCPY((char *)nptr, (char *)ptr, + (size_t)((size < desc.size_orig) ? 
size : desc.size_orig)); + } + + if (nptr != NULL) { + ___kmpc_free(gtid, ptr, free_allocator); + } + + return nptr; +} + +void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { + if (ptr == NULL) + return; + + kmp_allocator_t *al; + omp_allocator_handle_t oal; + al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator)); + kmp_mem_desc_t desc; + kmp_uintptr_t addr_align; // address to return to caller + kmp_uintptr_t addr_descr; // address of memory block descriptor + if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) || + (allocator > kmp_max_mem_alloc && + KMP_IS_TARGET_MEM_SPACE(al->memspace)))) { + kmp_int32 device = + __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device; + if (allocator == llvm_omp_target_host_mem_alloc) { + kmp_target_free_host(ptr, device); + } else if (allocator == llvm_omp_target_shared_mem_alloc) { + kmp_target_free_shared(ptr, device); + } else if (allocator == llvm_omp_target_device_mem_alloc) { + kmp_target_free_device(ptr, device); + } + return; + } + + addr_align = (kmp_uintptr_t)ptr; + addr_descr = addr_align - sizeof(kmp_mem_desc_t); + desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor + + KMP_DEBUG_ASSERT(desc.ptr_align == ptr); + if (allocator) { + KMP_DEBUG_ASSERT(desc.allocator == al || desc.allocator == al->fb_data); + } + al = desc.allocator; + oal = (omp_allocator_handle_t)al; // cast to void* for comparisons + KMP_DEBUG_ASSERT(al); + + if (allocator > kmp_max_mem_alloc && kmp_target_unlock_mem && al->pinned) { + kmp_int32 device = + __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device; + kmp_target_unlock_mem(desc.ptr_alloc, device); + } + + if (__kmp_memkind_available) { + if (oal < kmp_max_mem_alloc) { + // pre-defined allocator + if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) { + kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc); + } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) { + kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc); + } else { + kmp_mk_free(*mk_default, desc.ptr_alloc); + } + } else { + if (al->pool_size > 0) { // custom allocator with pool size requested + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + (void)used; // to suppress compiler warning + KMP_DEBUG_ASSERT(used >= desc.size_a); + } + kmp_mk_free(*al->memkind, desc.ptr_alloc); + } + } else { + if (oal > kmp_max_mem_alloc && al->pool_size > 0) { + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + (void)used; // to suppress compiler warning + KMP_DEBUG_ASSERT(used >= desc.size_a); + } + __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc); + } +} + +/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes + memory leaks, but it may be useful for debugging memory corruptions, used + freed pointers, etc. */ +/* #define LEAK_MEMORY */ +struct kmp_mem_descr { // Memory block descriptor. + void *ptr_allocated; // Pointer returned by malloc(), subject for free(). + size_t size_allocated; // Size of allocated memory block. + void *ptr_aligned; // Pointer to aligned memory, to be used by client code. + size_t size_aligned; // Size of aligned memory block. +}; +typedef struct kmp_mem_descr kmp_mem_descr_t; + +/* Allocate memory on requested boundary, fill allocated memory with 0x00. + NULL is NEVER returned, __kmp_abort() is called in case of memory allocation + error. Must use __kmp_free when freeing memory allocated by this routine! 
*/ +static void *___kmp_allocate_align(size_t size, + size_t alignment KMP_SRC_LOC_DECL) { + /* __kmp_allocate() allocates (by call to malloc()) bigger memory block than + requested to return properly aligned pointer. Original pointer returned + by malloc() and size of allocated block is saved in descriptor just + before the aligned pointer. This information used by __kmp_free() -- it + has to pass to free() original pointer, not aligned one. + + +---------+------------+-----------------------------------+---------+ + | padding | descriptor | aligned block | padding | + +---------+------------+-----------------------------------+---------+ + ^ ^ + | | + | +- Aligned pointer returned to caller + +- Pointer returned by malloc() + + Aligned block is filled with zeros, paddings are filled with 0xEF. */ + + kmp_mem_descr_t descr; + kmp_uintptr_t addr_allocated; // Address returned by malloc(). + kmp_uintptr_t addr_aligned; // Aligned address to return to caller. + kmp_uintptr_t addr_descr; // Address of memory block descriptor. + + KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n", + (int)size, (int)alignment KMP_SRC_LOC_PARM)); + + KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too + KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t)); + // Make sure kmp_uintptr_t is enough to store addresses. + + descr.size_aligned = size; + descr.size_allocated = + descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment; + +#if KMP_DEBUG + descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_); +#else + descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM); +#endif + KE_TRACE(10, (" malloc( %d ) returned %p\n", (int)descr.size_allocated, + descr.ptr_allocated)); + if (descr.ptr_allocated == NULL) { + KMP_FATAL(OutOfHeapMemory); + } + + addr_allocated = (kmp_uintptr_t)descr.ptr_allocated; + addr_aligned = + (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1); + addr_descr = addr_aligned - sizeof(kmp_mem_descr_t); + + descr.ptr_aligned = (void *)addr_aligned; + + KE_TRACE(26, (" ___kmp_allocate_align: " + "ptr_allocated=%p, size_allocated=%d, " + "ptr_aligned=%p, size_aligned=%d\n", + descr.ptr_allocated, (int)descr.size_allocated, + descr.ptr_aligned, (int)descr.size_aligned)); + + KMP_DEBUG_ASSERT(addr_allocated <= addr_descr); + KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned); + KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <= + addr_allocated + descr.size_allocated); + KMP_DEBUG_ASSERT(addr_aligned % alignment == 0); +#ifdef KMP_DEBUG + memset(descr.ptr_allocated, 0xEF, descr.size_allocated); +// Fill allocated memory block with 0xEF. +#endif + memset(descr.ptr_aligned, 0x00, descr.size_aligned); + // Fill the aligned memory block (which is intended for using by caller) with + // 0x00. Do not + // put this filling under KMP_DEBUG condition! Many callers expect zeroed + // memory. (Padding + // bytes remain filled with 0xEF in debugging library.) + *((kmp_mem_descr_t *)addr_descr) = descr; + + KMP_MB(); + + KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned)); + return descr.ptr_aligned; +} // func ___kmp_allocate_align + +/* Allocate memory on cache line boundary, fill allocated memory with 0x00. + Do not call this func directly! Use __kmp_allocate macro instead. + NULL is NEVER returned, __kmp_abort() is called in case of memory allocation + error. Must use __kmp_free when freeing memory allocated by this routine! 
*/ +void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) { + void *ptr; + KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n", + (int)size KMP_SRC_LOC_PARM)); + ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM); + KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr)); + return ptr; +} // func ___kmp_allocate + +/* Allocate memory on page boundary, fill allocated memory with 0x00. + Does not call this func directly! Use __kmp_page_allocate macro instead. + NULL is NEVER returned, __kmp_abort() is called in case of memory allocation + error. Must use __kmp_free when freeing memory allocated by this routine! */ +void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) { + int page_size = 8 * 1024; + void *ptr; + + KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n", + (int)size KMP_SRC_LOC_PARM)); + ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM); + KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr)); + return ptr; +} // ___kmp_page_allocate + +/* Free memory allocated by __kmp_allocate() and __kmp_page_allocate(). + In debug mode, fill the memory block with 0xEF before call to free(). */ +void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) { + kmp_mem_descr_t descr; +#if KMP_DEBUG + kmp_uintptr_t addr_allocated; // Address returned by malloc(). + kmp_uintptr_t addr_aligned; // Aligned address passed by caller. +#endif + KE_TRACE(25, + ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM)); + KMP_ASSERT(ptr != NULL); + + descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)); + + KE_TRACE(26, (" __kmp_free: " + "ptr_allocated=%p, size_allocated=%d, " + "ptr_aligned=%p, size_aligned=%d\n", + descr.ptr_allocated, (int)descr.size_allocated, + descr.ptr_aligned, (int)descr.size_aligned)); +#if KMP_DEBUG + addr_allocated = (kmp_uintptr_t)descr.ptr_allocated; + addr_aligned = (kmp_uintptr_t)descr.ptr_aligned; + KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0); + KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr); + KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned); + KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated); + KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <= + addr_allocated + descr.size_allocated); + memset(descr.ptr_allocated, 0xEF, descr.size_allocated); +// Fill memory block with 0xEF, it helps catch using freed memory. +#endif + +#ifndef LEAK_MEMORY + KE_TRACE(10, (" free( %p )\n", descr.ptr_allocated)); +#ifdef KMP_DEBUG + _free_src_loc(descr.ptr_allocated, _file_, _line_); +#else + free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM); +#endif +#endif + KMP_MB(); + KE_TRACE(25, ("<- __kmp_free() returns\n")); +} // func ___kmp_free + +#if USE_FAST_MEMORY == 3 +// Allocate fast memory by first scanning the thread's free lists +// If a chunk the right size exists, grab it off the free list. +// Otherwise allocate normally using kmp_thread_malloc. + +// AC: How to choose the limit? Just get 16 for now... 
+#define KMP_FREE_LIST_LIMIT 16 + +// Always use 128 bytes for determining buckets for caching memory blocks +#define DCACHE_LINE 128 + +void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) { + void *ptr; + size_t num_lines, idx; + int index; + void *alloc_ptr; + size_t alloc_size; + kmp_mem_descr_t *descr; + + KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n", + __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM)); + + num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE; + idx = num_lines - 1; + KMP_DEBUG_ASSERT(idx >= 0); + if (idx < 2) { + index = 0; // idx is [ 0, 1 ], use first free list + num_lines = 2; // 1, 2 cache lines or less than cache line + } else if ((idx >>= 2) == 0) { + index = 1; // idx is [ 2, 3 ], use second free list + num_lines = 4; // 3, 4 cache lines + } else if ((idx >>= 2) == 0) { + index = 2; // idx is [ 4, 15 ], use third free list + num_lines = 16; // 5, 6, ..., 16 cache lines + } else if ((idx >>= 2) == 0) { + index = 3; // idx is [ 16, 63 ], use fourth free list + num_lines = 64; // 17, 18, ..., 64 cache lines + } else { + goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists + } + + ptr = this_thr->th.th_free_lists[index].th_free_list_self; + if (ptr != NULL) { + // pop the head of no-sync free list + this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr); + KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - + sizeof(kmp_mem_descr_t))) + ->ptr_aligned); + goto end; + } + ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync); + if (ptr != NULL) { + // no-sync free list is empty, use sync free list (filled in by other + // threads only) + // pop the head of the sync free list, push NULL instead + while (!KMP_COMPARE_AND_STORE_PTR( + &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) { + KMP_CPU_PAUSE(); + ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync); + } + // push the rest of chain into no-sync free list (can be NULL if there was + // the only block) + this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr); + KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - + sizeof(kmp_mem_descr_t))) + ->ptr_aligned); + goto end; + } + +alloc_call: + // haven't found block in the free lists, thus allocate it + size = num_lines * DCACHE_LINE; + + alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE; + KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with " + "alloc_size %d\n", + __kmp_gtid_from_thread(this_thr), alloc_size)); + alloc_ptr = bget(this_thr, (bufsize)alloc_size); + + // align ptr to DCACHE_LINE + ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) + + DCACHE_LINE) & + ~(DCACHE_LINE - 1)); + descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t)); + + descr->ptr_allocated = alloc_ptr; // remember allocated pointer + // we don't need size_allocated + descr->ptr_aligned = (void *)this_thr; // remember allocating thread + // (it is already saved in bget buffer, + // but we may want to use another allocator in future) + descr->size_aligned = size; + +end: + KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n", + __kmp_gtid_from_thread(this_thr), ptr)); + return ptr; +} // func __kmp_fast_allocate + +// Free fast memory and place it on the thread's free list if it is of +// the correct size. 
+void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) { + kmp_mem_descr_t *descr; + kmp_info_t *alloc_thr; + size_t size; + size_t idx; + int index; + + KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n", + __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM)); + KMP_ASSERT(ptr != NULL); + + descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t)); + + KE_TRACE(26, (" __kmp_fast_free: size_aligned=%d\n", + (int)descr->size_aligned)); + + size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines + + idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block + if (idx == size) { + index = 0; // 2 cache lines + } else if ((idx <<= 1) == size) { + index = 1; // 4 cache lines + } else if ((idx <<= 2) == size) { + index = 2; // 16 cache lines + } else if ((idx <<= 2) == size) { + index = 3; // 64 cache lines + } else { + KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64); + goto free_call; // 65 or more cache lines ( > 8KB ) + } + + alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block + if (alloc_thr == this_thr) { + // push block to self no-sync free list, linking previous head (LIFO) + *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self; + this_thr->th.th_free_lists[index].th_free_list_self = ptr; + } else { + void *head = this_thr->th.th_free_lists[index].th_free_list_other; + if (head == NULL) { + // Create new free list + this_thr->th.th_free_lists[index].th_free_list_other = ptr; + *((void **)ptr) = NULL; // mark the tail of the list + descr->size_allocated = (size_t)1; // head of the list keeps its length + } else { + // need to check existed "other" list's owner thread and size of queue + kmp_mem_descr_t *dsc = + (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t)); + // allocating thread, same for all queue nodes + kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned); + size_t q_sz = + dsc->size_allocated + 1; // new size in case we add current task + if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) { + // we can add current task to "other" list, no sync needed + *((void **)ptr) = head; + descr->size_allocated = q_sz; + this_thr->th.th_free_lists[index].th_free_list_other = ptr; + } else { + // either queue blocks owner is changing or size limit exceeded + // return old queue to allocating thread (q_th) synchronously, + // and start new list for alloc_thr's tasks + void *old_ptr; + void *tail = head; + void *next = *((void **)head); + while (next != NULL) { + KMP_DEBUG_ASSERT( + // queue size should decrease by 1 each step through the list + ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t))) + ->size_allocated + + 1 == + ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t))) + ->size_allocated); + tail = next; // remember tail node + next = *((void **)next); + } + KMP_DEBUG_ASSERT(q_th != NULL); + // push block to owner's sync free list + old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync); + /* the next pointer must be set before setting free_list to ptr to avoid + exposing a broken list to other threads, even for an instant. 
*/ + *((void **)tail) = old_ptr; + + while (!KMP_COMPARE_AND_STORE_PTR( + &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) { + KMP_CPU_PAUSE(); + old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync); + *((void **)tail) = old_ptr; + } + + // start new list of not-selt tasks + this_thr->th.th_free_lists[index].th_free_list_other = ptr; + *((void **)ptr) = NULL; + descr->size_allocated = (size_t)1; // head of queue keeps its length + } + } + } + goto end; + +free_call: + KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n", + __kmp_gtid_from_thread(this_thr), size)); + __kmp_bget_dequeue(this_thr); /* Release any queued buffers */ + brel(this_thr, descr->ptr_allocated); + +end: + KE_TRACE(25, ("<- __kmp_fast_free() returns\n")); + +} // func __kmp_fast_free + +// Initialize the thread free lists related to fast memory +// Only do this when a thread is initially created. +void __kmp_initialize_fast_memory(kmp_info_t *this_thr) { + KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr)); + + memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t)); +} + +// Free the memory in the thread free lists related to fast memory +// Only do this when a thread is being reaped (destroyed). +void __kmp_free_fast_memory(kmp_info_t *th) { + // Suppose we use BGET underlying allocator, walk through its structures... + int bin; + thr_data_t *thr = get_thr_data(th); + void **lst = NULL; + + KE_TRACE( + 5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th))); + + __kmp_bget_dequeue(th); // Release any queued buffers + + // Dig through free lists and extract all allocated blocks + for (bin = 0; bin < MAX_BGET_BINS; ++bin) { + bfhead_t *b = thr->freelist[bin].ql.flink; + while (b != &thr->freelist[bin]) { + if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is allocated address + *((void **)b) = + lst; // link the list (override bthr, but keep flink yet) + lst = (void **)b; // push b into lst + } + b = b->ql.flink; // get next buffer + } + } + while (lst != NULL) { + void *next = *lst; + KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n", + lst, next, th, __kmp_gtid_from_thread(th))); + (*thr->relfcn)(lst); +#if BufStats + // count blocks to prevent problems in __kmp_finalize_bget() + thr->numprel++; /* Nr of expansion block releases */ + thr->numpblk--; /* Total number of blocks */ +#endif + lst = (void **)next; + } + + KE_TRACE( + 5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th))); +} + +#endif // USE_FAST_MEMORY diff --git a/third_party/openmp/kmp_atomic.cpp b/third_party/openmp/kmp_atomic.cpp new file mode 100644 index 000000000..261e9f1be --- /dev/null +++ b/third_party/openmp/kmp_atomic.cpp @@ -0,0 +1,3877 @@ +/* + * kmp_atomic.cpp -- ATOMIC implementation routines + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_atomic.h" +#include "kmp.h" // TRUE, asm routines prototypes + +typedef unsigned char uchar; +typedef unsigned short ushort; + +/*! +@defgroup ATOMIC_OPS Atomic Operations +These functions are used for implementing the many different varieties of atomic +operations. 
+ +The compiler is at liberty to inline atomic operations that are naturally +supported by the target architecture. For instance on IA-32 architecture an +atomic like this can be inlined +@code +static int s = 0; +#pragma omp atomic + s++; +@endcode +using the single instruction: `lock; incl s` + +However the runtime does provide entrypoints for these operations to support +compilers that choose not to inline them. (For instance, +`__kmpc_atomic_fixed4_add` could be used to perform the increment above.) + +The names of the functions are encoded by using the data type name and the +operation name, as in these tables. + +Data Type | Data type encoding +-----------|--------------- +int8_t | `fixed1` +uint8_t | `fixed1u` +int16_t | `fixed2` +uint16_t | `fixed2u` +int32_t | `fixed4` +uint32_t | `fixed4u` +int64_t | `fixed8` +uint64_t | `fixed8u` +float | `float4` +double | `float8` +float 10 (8087 eighty bit float) | `float10` +complex<float> | `cmplx4` +complex<double> | `cmplx8` +complex<long double> | `cmplx10` +
+ +Operation | Operation encoding +----------|------------------- ++ | add +- | sub +\* | mul +/ | div +& | andb +<< | shl +\>\> | shr +\| | orb +^ | xor +&& | andl +\|\| | orl +maximum | max +minimum | min +.eqv. | eqv +.neqv. | neqv + +
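For example, combining the two tables above: an atomic update of a `double` (encoded `float8`) using the `*` operator (encoded `mul`) maps to the entry point `__kmpc_atomic_float8_mul`. A compiler that chooses not to inline the construct below could therefore emit a call to that routine (shown as a sketch of one possible lowering, not a mandated one):
@code
void scale(double *x) {
#pragma omp atomic
  *x *= 3.0; // possible lowering: __kmpc_atomic_float8_mul(&loc, gtid, x, 3.0)
}
@endcode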
+For non-commutative operations, `_rev` can also be added for the reversed +operation. For the functions that capture the result, the suffix `_cpt` is +added. + +Update Functions +================ +The general form of an atomic function that just performs an update (without a +`capture`) +@code +void __kmpc_atomic_<type>_<op>( ident_t *id_ref, int gtid, TYPE * +lhs, TYPE rhs ); +@endcode +@param ident_t a pointer to source location +@param gtid the global thread id +@param lhs a pointer to the left operand +@param rhs the right operand + +`capture` functions +=================== +The capture functions perform an atomic update and return a result, which is +either the value before the capture, or that after. They take an additional +argument to determine which result is returned. +Their general form is therefore +@code +TYPE __kmpc_atomic_<type>_<op>_cpt( ident_t *id_ref, int gtid, TYPE * +lhs, TYPE rhs, int flag ); +@endcode +@param ident_t a pointer to source location +@param gtid the global thread id +@param lhs a pointer to the left operand +@param rhs the right operand +@param flag one if the result is to be captured *after* the operation, zero if +captured *before*. + +The one set of exceptions to this is the `complex<float>` type where the value +is not returned, rather an extra argument pointer is passed. + +They look like +@code +void __kmpc_atomic_cmplx4_<op>_cpt( ident_t *id_ref, int gtid, kmp_cmplx32 * +lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag ); +@endcode + +Read and Write Operations +========================= +The OpenMP* standard now supports atomic operations that simply +ensure that the value is read or written atomically, with no modification +performed. In many cases on IA-32 architecture these operations can be inlined +since the architecture guarantees that no tearing occurs on aligned objects +accessed with a single memory operation of up to 64 bits in size. + +The general form of the read operations is +@code +TYPE __kmpc_atomic_<type>_rd ( ident_t *id_ref, int gtid, TYPE * loc ); +@endcode + +For the write operations the form is +@code +void __kmpc_atomic_<type>_wr ( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs +); +@endcode + +Full list of functions +====================== +This leads to the generation of 376 atomic functions, as follows. + +Functions for integers +--------------------- +There are versions here for integers of size 1,2,4 and 8 bytes both signed and +unsigned (where that matters). 
+@code + __kmpc_atomic_fixed1_add + __kmpc_atomic_fixed1_add_cpt + __kmpc_atomic_fixed1_add_fp + __kmpc_atomic_fixed1_andb + __kmpc_atomic_fixed1_andb_cpt + __kmpc_atomic_fixed1_andl + __kmpc_atomic_fixed1_andl_cpt + __kmpc_atomic_fixed1_div + __kmpc_atomic_fixed1_div_cpt + __kmpc_atomic_fixed1_div_cpt_rev + __kmpc_atomic_fixed1_div_float8 + __kmpc_atomic_fixed1_div_fp + __kmpc_atomic_fixed1_div_rev + __kmpc_atomic_fixed1_eqv + __kmpc_atomic_fixed1_eqv_cpt + __kmpc_atomic_fixed1_max + __kmpc_atomic_fixed1_max_cpt + __kmpc_atomic_fixed1_min + __kmpc_atomic_fixed1_min_cpt + __kmpc_atomic_fixed1_mul + __kmpc_atomic_fixed1_mul_cpt + __kmpc_atomic_fixed1_mul_float8 + __kmpc_atomic_fixed1_mul_fp + __kmpc_atomic_fixed1_neqv + __kmpc_atomic_fixed1_neqv_cpt + __kmpc_atomic_fixed1_orb + __kmpc_atomic_fixed1_orb_cpt + __kmpc_atomic_fixed1_orl + __kmpc_atomic_fixed1_orl_cpt + __kmpc_atomic_fixed1_rd + __kmpc_atomic_fixed1_shl + __kmpc_atomic_fixed1_shl_cpt + __kmpc_atomic_fixed1_shl_cpt_rev + __kmpc_atomic_fixed1_shl_rev + __kmpc_atomic_fixed1_shr + __kmpc_atomic_fixed1_shr_cpt + __kmpc_atomic_fixed1_shr_cpt_rev + __kmpc_atomic_fixed1_shr_rev + __kmpc_atomic_fixed1_sub + __kmpc_atomic_fixed1_sub_cpt + __kmpc_atomic_fixed1_sub_cpt_rev + __kmpc_atomic_fixed1_sub_fp + __kmpc_atomic_fixed1_sub_rev + __kmpc_atomic_fixed1_swp + __kmpc_atomic_fixed1_wr + __kmpc_atomic_fixed1_xor + __kmpc_atomic_fixed1_xor_cpt + __kmpc_atomic_fixed1u_add_fp + __kmpc_atomic_fixed1u_sub_fp + __kmpc_atomic_fixed1u_mul_fp + __kmpc_atomic_fixed1u_div + __kmpc_atomic_fixed1u_div_cpt + __kmpc_atomic_fixed1u_div_cpt_rev + __kmpc_atomic_fixed1u_div_fp + __kmpc_atomic_fixed1u_div_rev + __kmpc_atomic_fixed1u_shr + __kmpc_atomic_fixed1u_shr_cpt + __kmpc_atomic_fixed1u_shr_cpt_rev + __kmpc_atomic_fixed1u_shr_rev + __kmpc_atomic_fixed2_add + __kmpc_atomic_fixed2_add_cpt + __kmpc_atomic_fixed2_add_fp + __kmpc_atomic_fixed2_andb + __kmpc_atomic_fixed2_andb_cpt + __kmpc_atomic_fixed2_andl + __kmpc_atomic_fixed2_andl_cpt + __kmpc_atomic_fixed2_div + __kmpc_atomic_fixed2_div_cpt + __kmpc_atomic_fixed2_div_cpt_rev + __kmpc_atomic_fixed2_div_float8 + __kmpc_atomic_fixed2_div_fp + __kmpc_atomic_fixed2_div_rev + __kmpc_atomic_fixed2_eqv + __kmpc_atomic_fixed2_eqv_cpt + __kmpc_atomic_fixed2_max + __kmpc_atomic_fixed2_max_cpt + __kmpc_atomic_fixed2_min + __kmpc_atomic_fixed2_min_cpt + __kmpc_atomic_fixed2_mul + __kmpc_atomic_fixed2_mul_cpt + __kmpc_atomic_fixed2_mul_float8 + __kmpc_atomic_fixed2_mul_fp + __kmpc_atomic_fixed2_neqv + __kmpc_atomic_fixed2_neqv_cpt + __kmpc_atomic_fixed2_orb + __kmpc_atomic_fixed2_orb_cpt + __kmpc_atomic_fixed2_orl + __kmpc_atomic_fixed2_orl_cpt + __kmpc_atomic_fixed2_rd + __kmpc_atomic_fixed2_shl + __kmpc_atomic_fixed2_shl_cpt + __kmpc_atomic_fixed2_shl_cpt_rev + __kmpc_atomic_fixed2_shl_rev + __kmpc_atomic_fixed2_shr + __kmpc_atomic_fixed2_shr_cpt + __kmpc_atomic_fixed2_shr_cpt_rev + __kmpc_atomic_fixed2_shr_rev + __kmpc_atomic_fixed2_sub + __kmpc_atomic_fixed2_sub_cpt + __kmpc_atomic_fixed2_sub_cpt_rev + __kmpc_atomic_fixed2_sub_fp + __kmpc_atomic_fixed2_sub_rev + __kmpc_atomic_fixed2_swp + __kmpc_atomic_fixed2_wr + __kmpc_atomic_fixed2_xor + __kmpc_atomic_fixed2_xor_cpt + __kmpc_atomic_fixed2u_add_fp + __kmpc_atomic_fixed2u_sub_fp + __kmpc_atomic_fixed2u_mul_fp + __kmpc_atomic_fixed2u_div + __kmpc_atomic_fixed2u_div_cpt + __kmpc_atomic_fixed2u_div_cpt_rev + __kmpc_atomic_fixed2u_div_fp + __kmpc_atomic_fixed2u_div_rev + __kmpc_atomic_fixed2u_shr + __kmpc_atomic_fixed2u_shr_cpt + __kmpc_atomic_fixed2u_shr_cpt_rev + 
__kmpc_atomic_fixed2u_shr_rev + __kmpc_atomic_fixed4_add + __kmpc_atomic_fixed4_add_cpt + __kmpc_atomic_fixed4_add_fp + __kmpc_atomic_fixed4_andb + __kmpc_atomic_fixed4_andb_cpt + __kmpc_atomic_fixed4_andl + __kmpc_atomic_fixed4_andl_cpt + __kmpc_atomic_fixed4_div + __kmpc_atomic_fixed4_div_cpt + __kmpc_atomic_fixed4_div_cpt_rev + __kmpc_atomic_fixed4_div_float8 + __kmpc_atomic_fixed4_div_fp + __kmpc_atomic_fixed4_div_rev + __kmpc_atomic_fixed4_eqv + __kmpc_atomic_fixed4_eqv_cpt + __kmpc_atomic_fixed4_max + __kmpc_atomic_fixed4_max_cpt + __kmpc_atomic_fixed4_min + __kmpc_atomic_fixed4_min_cpt + __kmpc_atomic_fixed4_mul + __kmpc_atomic_fixed4_mul_cpt + __kmpc_atomic_fixed4_mul_float8 + __kmpc_atomic_fixed4_mul_fp + __kmpc_atomic_fixed4_neqv + __kmpc_atomic_fixed4_neqv_cpt + __kmpc_atomic_fixed4_orb + __kmpc_atomic_fixed4_orb_cpt + __kmpc_atomic_fixed4_orl + __kmpc_atomic_fixed4_orl_cpt + __kmpc_atomic_fixed4_rd + __kmpc_atomic_fixed4_shl + __kmpc_atomic_fixed4_shl_cpt + __kmpc_atomic_fixed4_shl_cpt_rev + __kmpc_atomic_fixed4_shl_rev + __kmpc_atomic_fixed4_shr + __kmpc_atomic_fixed4_shr_cpt + __kmpc_atomic_fixed4_shr_cpt_rev + __kmpc_atomic_fixed4_shr_rev + __kmpc_atomic_fixed4_sub + __kmpc_atomic_fixed4_sub_cpt + __kmpc_atomic_fixed4_sub_cpt_rev + __kmpc_atomic_fixed4_sub_fp + __kmpc_atomic_fixed4_sub_rev + __kmpc_atomic_fixed4_swp + __kmpc_atomic_fixed4_wr + __kmpc_atomic_fixed4_xor + __kmpc_atomic_fixed4_xor_cpt + __kmpc_atomic_fixed4u_add_fp + __kmpc_atomic_fixed4u_sub_fp + __kmpc_atomic_fixed4u_mul_fp + __kmpc_atomic_fixed4u_div + __kmpc_atomic_fixed4u_div_cpt + __kmpc_atomic_fixed4u_div_cpt_rev + __kmpc_atomic_fixed4u_div_fp + __kmpc_atomic_fixed4u_div_rev + __kmpc_atomic_fixed4u_shr + __kmpc_atomic_fixed4u_shr_cpt + __kmpc_atomic_fixed4u_shr_cpt_rev + __kmpc_atomic_fixed4u_shr_rev + __kmpc_atomic_fixed8_add + __kmpc_atomic_fixed8_add_cpt + __kmpc_atomic_fixed8_add_fp + __kmpc_atomic_fixed8_andb + __kmpc_atomic_fixed8_andb_cpt + __kmpc_atomic_fixed8_andl + __kmpc_atomic_fixed8_andl_cpt + __kmpc_atomic_fixed8_div + __kmpc_atomic_fixed8_div_cpt + __kmpc_atomic_fixed8_div_cpt_rev + __kmpc_atomic_fixed8_div_float8 + __kmpc_atomic_fixed8_div_fp + __kmpc_atomic_fixed8_div_rev + __kmpc_atomic_fixed8_eqv + __kmpc_atomic_fixed8_eqv_cpt + __kmpc_atomic_fixed8_max + __kmpc_atomic_fixed8_max_cpt + __kmpc_atomic_fixed8_min + __kmpc_atomic_fixed8_min_cpt + __kmpc_atomic_fixed8_mul + __kmpc_atomic_fixed8_mul_cpt + __kmpc_atomic_fixed8_mul_float8 + __kmpc_atomic_fixed8_mul_fp + __kmpc_atomic_fixed8_neqv + __kmpc_atomic_fixed8_neqv_cpt + __kmpc_atomic_fixed8_orb + __kmpc_atomic_fixed8_orb_cpt + __kmpc_atomic_fixed8_orl + __kmpc_atomic_fixed8_orl_cpt + __kmpc_atomic_fixed8_rd + __kmpc_atomic_fixed8_shl + __kmpc_atomic_fixed8_shl_cpt + __kmpc_atomic_fixed8_shl_cpt_rev + __kmpc_atomic_fixed8_shl_rev + __kmpc_atomic_fixed8_shr + __kmpc_atomic_fixed8_shr_cpt + __kmpc_atomic_fixed8_shr_cpt_rev + __kmpc_atomic_fixed8_shr_rev + __kmpc_atomic_fixed8_sub + __kmpc_atomic_fixed8_sub_cpt + __kmpc_atomic_fixed8_sub_cpt_rev + __kmpc_atomic_fixed8_sub_fp + __kmpc_atomic_fixed8_sub_rev + __kmpc_atomic_fixed8_swp + __kmpc_atomic_fixed8_wr + __kmpc_atomic_fixed8_xor + __kmpc_atomic_fixed8_xor_cpt + __kmpc_atomic_fixed8u_add_fp + __kmpc_atomic_fixed8u_sub_fp + __kmpc_atomic_fixed8u_mul_fp + __kmpc_atomic_fixed8u_div + __kmpc_atomic_fixed8u_div_cpt + __kmpc_atomic_fixed8u_div_cpt_rev + __kmpc_atomic_fixed8u_div_fp + __kmpc_atomic_fixed8u_div_rev + __kmpc_atomic_fixed8u_shr + __kmpc_atomic_fixed8u_shr_cpt + 
__kmpc_atomic_fixed8u_shr_cpt_rev + __kmpc_atomic_fixed8u_shr_rev +@endcode + +Functions for floating point +---------------------------- +There are versions here for floating point numbers of size 4, 8, 10 and 16 +bytes. (Ten byte floats are used by X87, but are now rare). +@code + __kmpc_atomic_float4_add + __kmpc_atomic_float4_add_cpt + __kmpc_atomic_float4_add_float8 + __kmpc_atomic_float4_add_fp + __kmpc_atomic_float4_div + __kmpc_atomic_float4_div_cpt + __kmpc_atomic_float4_div_cpt_rev + __kmpc_atomic_float4_div_float8 + __kmpc_atomic_float4_div_fp + __kmpc_atomic_float4_div_rev + __kmpc_atomic_float4_max + __kmpc_atomic_float4_max_cpt + __kmpc_atomic_float4_min + __kmpc_atomic_float4_min_cpt + __kmpc_atomic_float4_mul + __kmpc_atomic_float4_mul_cpt + __kmpc_atomic_float4_mul_float8 + __kmpc_atomic_float4_mul_fp + __kmpc_atomic_float4_rd + __kmpc_atomic_float4_sub + __kmpc_atomic_float4_sub_cpt + __kmpc_atomic_float4_sub_cpt_rev + __kmpc_atomic_float4_sub_float8 + __kmpc_atomic_float4_sub_fp + __kmpc_atomic_float4_sub_rev + __kmpc_atomic_float4_swp + __kmpc_atomic_float4_wr + __kmpc_atomic_float8_add + __kmpc_atomic_float8_add_cpt + __kmpc_atomic_float8_add_fp + __kmpc_atomic_float8_div + __kmpc_atomic_float8_div_cpt + __kmpc_atomic_float8_div_cpt_rev + __kmpc_atomic_float8_div_fp + __kmpc_atomic_float8_div_rev + __kmpc_atomic_float8_max + __kmpc_atomic_float8_max_cpt + __kmpc_atomic_float8_min + __kmpc_atomic_float8_min_cpt + __kmpc_atomic_float8_mul + __kmpc_atomic_float8_mul_cpt + __kmpc_atomic_float8_mul_fp + __kmpc_atomic_float8_rd + __kmpc_atomic_float8_sub + __kmpc_atomic_float8_sub_cpt + __kmpc_atomic_float8_sub_cpt_rev + __kmpc_atomic_float8_sub_fp + __kmpc_atomic_float8_sub_rev + __kmpc_atomic_float8_swp + __kmpc_atomic_float8_wr + __kmpc_atomic_float10_add + __kmpc_atomic_float10_add_cpt + __kmpc_atomic_float10_add_fp + __kmpc_atomic_float10_div + __kmpc_atomic_float10_div_cpt + __kmpc_atomic_float10_div_cpt_rev + __kmpc_atomic_float10_div_fp + __kmpc_atomic_float10_div_rev + __kmpc_atomic_float10_mul + __kmpc_atomic_float10_mul_cpt + __kmpc_atomic_float10_mul_fp + __kmpc_atomic_float10_rd + __kmpc_atomic_float10_sub + __kmpc_atomic_float10_sub_cpt + __kmpc_atomic_float10_sub_cpt_rev + __kmpc_atomic_float10_sub_fp + __kmpc_atomic_float10_sub_rev + __kmpc_atomic_float10_swp + __kmpc_atomic_float10_wr + __kmpc_atomic_float16_add + __kmpc_atomic_float16_add_cpt + __kmpc_atomic_float16_div + __kmpc_atomic_float16_div_cpt + __kmpc_atomic_float16_div_cpt_rev + __kmpc_atomic_float16_div_rev + __kmpc_atomic_float16_max + __kmpc_atomic_float16_max_cpt + __kmpc_atomic_float16_min + __kmpc_atomic_float16_min_cpt + __kmpc_atomic_float16_mul + __kmpc_atomic_float16_mul_cpt + __kmpc_atomic_float16_rd + __kmpc_atomic_float16_sub + __kmpc_atomic_float16_sub_cpt + __kmpc_atomic_float16_sub_cpt_rev + __kmpc_atomic_float16_sub_rev + __kmpc_atomic_float16_swp + __kmpc_atomic_float16_wr +@endcode + +Functions for Complex types +--------------------------- +Functions for complex types whose component floating point variables are of size +4,8,10 or 16 bytes. The names here are based on the size of the component float, +*not* the size of the complex type. So `__kmpc_atomic_cmplx8_add` is an +operation on a `complex` or `complex(kind=8)`, *not* `complex`. 
+ +@code + __kmpc_atomic_cmplx4_add + __kmpc_atomic_cmplx4_add_cmplx8 + __kmpc_atomic_cmplx4_add_cpt + __kmpc_atomic_cmplx4_div + __kmpc_atomic_cmplx4_div_cmplx8 + __kmpc_atomic_cmplx4_div_cpt + __kmpc_atomic_cmplx4_div_cpt_rev + __kmpc_atomic_cmplx4_div_rev + __kmpc_atomic_cmplx4_mul + __kmpc_atomic_cmplx4_mul_cmplx8 + __kmpc_atomic_cmplx4_mul_cpt + __kmpc_atomic_cmplx4_rd + __kmpc_atomic_cmplx4_sub + __kmpc_atomic_cmplx4_sub_cmplx8 + __kmpc_atomic_cmplx4_sub_cpt + __kmpc_atomic_cmplx4_sub_cpt_rev + __kmpc_atomic_cmplx4_sub_rev + __kmpc_atomic_cmplx4_swp + __kmpc_atomic_cmplx4_wr + __kmpc_atomic_cmplx8_add + __kmpc_atomic_cmplx8_add_cpt + __kmpc_atomic_cmplx8_div + __kmpc_atomic_cmplx8_div_cpt + __kmpc_atomic_cmplx8_div_cpt_rev + __kmpc_atomic_cmplx8_div_rev + __kmpc_atomic_cmplx8_mul + __kmpc_atomic_cmplx8_mul_cpt + __kmpc_atomic_cmplx8_rd + __kmpc_atomic_cmplx8_sub + __kmpc_atomic_cmplx8_sub_cpt + __kmpc_atomic_cmplx8_sub_cpt_rev + __kmpc_atomic_cmplx8_sub_rev + __kmpc_atomic_cmplx8_swp + __kmpc_atomic_cmplx8_wr + __kmpc_atomic_cmplx10_add + __kmpc_atomic_cmplx10_add_cpt + __kmpc_atomic_cmplx10_div + __kmpc_atomic_cmplx10_div_cpt + __kmpc_atomic_cmplx10_div_cpt_rev + __kmpc_atomic_cmplx10_div_rev + __kmpc_atomic_cmplx10_mul + __kmpc_atomic_cmplx10_mul_cpt + __kmpc_atomic_cmplx10_rd + __kmpc_atomic_cmplx10_sub + __kmpc_atomic_cmplx10_sub_cpt + __kmpc_atomic_cmplx10_sub_cpt_rev + __kmpc_atomic_cmplx10_sub_rev + __kmpc_atomic_cmplx10_swp + __kmpc_atomic_cmplx10_wr + __kmpc_atomic_cmplx16_add + __kmpc_atomic_cmplx16_add_cpt + __kmpc_atomic_cmplx16_div + __kmpc_atomic_cmplx16_div_cpt + __kmpc_atomic_cmplx16_div_cpt_rev + __kmpc_atomic_cmplx16_div_rev + __kmpc_atomic_cmplx16_mul + __kmpc_atomic_cmplx16_mul_cpt + __kmpc_atomic_cmplx16_rd + __kmpc_atomic_cmplx16_sub + __kmpc_atomic_cmplx16_sub_cpt + __kmpc_atomic_cmplx16_sub_cpt_rev + __kmpc_atomic_cmplx16_swp + __kmpc_atomic_cmplx16_wr +@endcode +*/ + +/*! 
+@ingroup ATOMIC_OPS +@{ +*/ + +/* + * Global vars + */ + +#ifndef KMP_GOMP_COMPAT +int __kmp_atomic_mode = 1; // Intel perf +#else +int __kmp_atomic_mode = 2; // GOMP compatibility +#endif /* KMP_GOMP_COMPAT */ + +KMP_ALIGN(128) + +// Control access to all user coded atomics in Gnu compat mode +kmp_atomic_lock_t __kmp_atomic_lock; +// Control access to all user coded atomics for 1-byte fixed data types +kmp_atomic_lock_t __kmp_atomic_lock_1i; +// Control access to all user coded atomics for 2-byte fixed data types +kmp_atomic_lock_t __kmp_atomic_lock_2i; +// Control access to all user coded atomics for 4-byte fixed data types +kmp_atomic_lock_t __kmp_atomic_lock_4i; +// Control access to all user coded atomics for kmp_real32 data type +kmp_atomic_lock_t __kmp_atomic_lock_4r; +// Control access to all user coded atomics for 8-byte fixed data types +kmp_atomic_lock_t __kmp_atomic_lock_8i; +// Control access to all user coded atomics for kmp_real64 data type +kmp_atomic_lock_t __kmp_atomic_lock_8r; +// Control access to all user coded atomics for complex byte data type +kmp_atomic_lock_t __kmp_atomic_lock_8c; +// Control access to all user coded atomics for long double data type +kmp_atomic_lock_t __kmp_atomic_lock_10r; +// Control access to all user coded atomics for _Quad data type +kmp_atomic_lock_t __kmp_atomic_lock_16r; +// Control access to all user coded atomics for double complex data type +kmp_atomic_lock_t __kmp_atomic_lock_16c; +// Control access to all user coded atomics for long double complex type +kmp_atomic_lock_t __kmp_atomic_lock_20c; +// Control access to all user coded atomics for _Quad complex data type +kmp_atomic_lock_t __kmp_atomic_lock_32c; + +/* 2007-03-02: + Without "volatile" specifier in OP_CMPXCHG and MIN_MAX_CMPXCHG we have a bug + on *_32 and *_32e. This is just a temporary workaround for the problem. It + seems the right solution is writing OP_CMPXCHG and MIN_MAX_CMPXCHG routines + in assembler language. 
*/ +#define KMP_ATOMIC_VOLATILE volatile + +#if (KMP_ARCH_X86) && KMP_HAVE_QUAD + +static inline Quad_a4_t operator+(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q + rhs.q; +} +static inline Quad_a4_t operator-(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q - rhs.q; +} +static inline Quad_a4_t operator*(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q * rhs.q; +} +static inline Quad_a4_t operator/(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q / rhs.q; +} +static inline bool operator<(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q < rhs.q; +} +static inline bool operator>(Quad_a4_t &lhs, Quad_a4_t &rhs) { + return lhs.q > rhs.q; +} + +static inline Quad_a16_t operator+(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q + rhs.q; +} +static inline Quad_a16_t operator-(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q - rhs.q; +} +static inline Quad_a16_t operator*(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q * rhs.q; +} +static inline Quad_a16_t operator/(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q / rhs.q; +} +static inline bool operator<(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q < rhs.q; +} +static inline bool operator>(Quad_a16_t &lhs, Quad_a16_t &rhs) { + return lhs.q > rhs.q; +} + +static inline kmp_cmplx128_a4_t operator+(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q + rhs.q; +} +static inline kmp_cmplx128_a4_t operator-(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q - rhs.q; +} +static inline kmp_cmplx128_a4_t operator*(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q * rhs.q; +} +static inline kmp_cmplx128_a4_t operator/(kmp_cmplx128_a4_t &lhs, + kmp_cmplx128_a4_t &rhs) { + return lhs.q / rhs.q; +} + +static inline kmp_cmplx128_a16_t operator+(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q + rhs.q; +} +static inline kmp_cmplx128_a16_t operator-(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q - rhs.q; +} +static inline kmp_cmplx128_a16_t operator*(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q * rhs.q; +} +static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs, + kmp_cmplx128_a16_t &rhs) { + return lhs.q / rhs.q; +} + +#endif // (KMP_ARCH_X86) && KMP_HAVE_QUAD + +// ATOMIC implementation routines ----------------------------------------- +// One routine for each operation and operand type. +// All routines declarations looks like +// void __kmpc_atomic_RTYPE_OP( ident_t*, int, TYPE *lhs, TYPE rhs ); + +#define KMP_CHECK_GTID \ + if (gtid == KMP_GTID_UNKNOWN) { \ + gtid = __kmp_entry_gtid(); \ + } // check and get gtid when needed + +// Beginning of a definition (provides name, parameters, gebug trace) +// TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned +// fixed) +// OP_ID - operation identifier (add, sub, mul, ...) 
+// TYPE - operands' type +#define ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, RET_TYPE) \ + RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid, \ + TYPE *lhs, TYPE rhs) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid)); + +// ------------------------------------------------------------------------ +// Lock variables used for critical sections for various size operands +#define ATOMIC_LOCK0 __kmp_atomic_lock // all types, for Gnu compat +#define ATOMIC_LOCK1i __kmp_atomic_lock_1i // char +#define ATOMIC_LOCK2i __kmp_atomic_lock_2i // short +#define ATOMIC_LOCK4i __kmp_atomic_lock_4i // long int +#define ATOMIC_LOCK4r __kmp_atomic_lock_4r // float +#define ATOMIC_LOCK8i __kmp_atomic_lock_8i // long long int +#define ATOMIC_LOCK8r __kmp_atomic_lock_8r // double +#define ATOMIC_LOCK8c __kmp_atomic_lock_8c // float complex +#define ATOMIC_LOCK10r __kmp_atomic_lock_10r // long double +#define ATOMIC_LOCK16r __kmp_atomic_lock_16r // _Quad +#define ATOMIC_LOCK16c __kmp_atomic_lock_16c // double complex +#define ATOMIC_LOCK20c __kmp_atomic_lock_20c // long double complex +#define ATOMIC_LOCK32c __kmp_atomic_lock_32c // _Quad complex + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs bound by critical section +// OP - operator (it's supposed to contain an assignment) +// LCK_ID - lock identifier +// Note: don't check gtid as it should always be valid +// 1, 2-byte - expect valid parameter, other - check before this macro +#define OP_CRITICAL(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + (*lhs) OP(rhs); \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + +#define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + (*lhs) = (TYPE)((*lhs)OP rhs); \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + +// ------------------------------------------------------------------------ +// For GNU compatibility, we may need to use a critical section, +// even though it is not required by the ISA. +// +// On IA-32 architecture, all atomic operations except for fixed 4 byte add, +// sub, and bitwise logical ops, and 1 & 2 byte logical ops use a common +// critical section. On Intel(R) 64, all atomic operations are done with fetch +// and add or compare and exchange. Therefore, the FLAG parameter to this +// macro is either KMP_ARCH_X86 or 0 (or 1, for Intel-specific extension which +// require a critical section, where we predict that they will be implemented +// in the Gnu codegen by calling GOMP_atomic_start() / GOMP_atomic_end()). +// +// When the OP_GOMP_CRITICAL macro is used in a *CRITICAL* macro construct, +// the FLAG parameter should always be 1. If we know that we will be using +// a critical section, then we want to make certain that we use the generic +// lock __kmp_atomic_lock to protect the atomic update, and not of of the +// locks that are specialized based upon the size or type of the data. +// +// If FLAG is 0, then we are relying on dead code elimination by the build +// compiler to get rid of the useless block of code, and save a needless +// branch at runtime. 
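Editorial note, not part of the upstream patch: as a worked example of how the building blocks above compose, here is a hand-expanded sketch (hedged; the macros themselves are authoritative) of the lock-based long double add entry point generated further below by ATOMIC_CRITICAL(float10, add, long double, +, 10r, 1). It shows ATOMIC_BEGIN supplying the name and trace, the KMP_GOMP_COMPAT branch (macros just below) taking the generic __kmp_atomic_lock as the comment above explains, and OP_UPDATE_CRITICAL taking the size-specific lock otherwise.

// Hand-expanded sketch of __kmpc_atomic_float10_add (illustrative only).
void __kmpc_atomic_float10_add(ident_t *id_ref, int gtid, long double *lhs,
                               long double rhs) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KA_TRACE(100, ("__kmpc_atomic_float10_add: T#%d\n", gtid));
#ifdef KMP_GOMP_COMPAT
  if ((1) && (__kmp_atomic_mode == 2)) {   // GOMP_FLAG is 1 for this entry
    if (gtid == KMP_GTID_UNKNOWN)          // KMP_CHECK_GTID
      gtid = __kmp_entry_gtid();
    __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); // generic lock (LCK_ID 0)
    (*lhs) = (long double)((*lhs) + rhs);
    __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid);
    return;
  }
#endif
  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_10r, gtid); // ATOMIC_LOCK10r
  (*lhs) = (long double)((*lhs) + rhs);
  __kmp_release_atomic_lock(&__kmp_atomic_lock_10r, gtid);
}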
+ +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL(OP, 0); \ + return; \ + } + +#define OP_UPDATE_GOMP_CRITICAL(TYPE, OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL(TYPE, OP, 0); \ + return; \ + } +#else +#define OP_GOMP_CRITICAL(OP, FLAG) +#define OP_UPDATE_GOMP_CRITICAL(TYPE, OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +#if KMP_MIC +#define KMP_DO_PAUSE _mm_delay_32(1) +#else +#define KMP_DO_PAUSE +#endif /* KMP_MIC */ + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs using "compare_and_store" routine +// TYPE - operands' type +// BITS - size in bits, used to distinguish low level calls +// OP - operator +#define OP_CMPXCHG(TYPE, BITS, OP) \ + { \ + TYPE old_value, new_value; \ + old_value = *(TYPE volatile *)lhs; \ + new_value = (TYPE)(old_value OP rhs); \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ + KMP_DO_PAUSE; \ + \ + old_value = *(TYPE volatile *)lhs; \ + new_value = (TYPE)(old_value OP rhs); \ + } \ + } + +#if USE_CMPXCHG_FIX +// 2007-06-25: +// workaround for C78287 (complex(kind=4) data type). lin_32, lin_32e, win_32 +// and win_32e are affected (I verified the asm). Compiler ignores the volatile +// qualifier of the temp_val in the OP_CMPXCHG macro. This is a problem of the +// compiler. Related tracker is C76005, targeted to 11.0. I verified the asm of +// the workaround. +#define OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP) \ + { \ + struct _sss { \ + TYPE cmp; \ + kmp_int##BITS *vvv; \ + }; \ + struct _sss old_value, new_value; \ + old_value.vvv = (kmp_int##BITS *)&old_value.cmp; \ + new_value.vvv = (kmp_int##BITS *)&new_value.cmp; \ + *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \ + new_value.cmp = (TYPE)(old_value.cmp OP rhs); \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) old_value.vvv, \ + *VOLATILE_CAST(kmp_int##BITS *) new_value.vvv)) { \ + KMP_DO_PAUSE; \ + \ + *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \ + new_value.cmp = (TYPE)(old_value.cmp OP rhs); \ + } \ + } +// end of the first part of the workaround for C78287 +#endif // USE_CMPXCHG_FIX + +#if KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM) +// Undo explicit type casts to get MSVC ARM64 to build. 
Uses +// OP_CMPXCHG_WORKAROUND definition for OP_CMPXCHG +#undef OP_CMPXCHG +#define OP_CMPXCHG(TYPE, BITS, OP) \ + { \ + struct _sss { \ + TYPE cmp; \ + kmp_int##BITS *vvv; \ + }; \ + struct _sss old_value, new_value; \ + old_value.vvv = (kmp_int##BITS *)&old_value.cmp; \ + new_value.vvv = (kmp_int##BITS *)&new_value.cmp; \ + *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \ + new_value.cmp = old_value.cmp OP rhs; \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) old_value.vvv, \ + *VOLATILE_CAST(kmp_int##BITS *) new_value.vvv)) { \ + KMP_DO_PAUSE; \ + \ + *old_value.vvv = *(volatile kmp_int##BITS *)lhs; \ + new_value.cmp = old_value.cmp OP rhs; \ + } \ + } + +#undef OP_UPDATE_CRITICAL +#define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + (*lhs) = (*lhs)OP rhs; \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + +#endif // KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM) + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// ------------------------------------------------------------------------ +// X86 or X86_64: no alignment problems ==================================== +#define ATOMIC_FIXED_ADD(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ + KMP_TEST_THEN_ADD##BITS(lhs, OP rhs); \ + } +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG(TYPE, BITS, OP) \ + } +#if USE_CMPXCHG_FIX +// ------------------------------------------------------------------------- +// workaround for C78287 (complex(kind=4) data type) +#define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, \ + MASK, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP) \ + } +// end of the second part of the workaround for C78287 +#endif // USE_CMPXCHG_FIX + +#else +// ------------------------------------------------------------------------- +// Code for other architectures that don't handle unaligned accesses. 
+#define ATOMIC_FIXED_ADD(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ + KMP_TEST_THEN_ADD##BITS(lhs, OP rhs); \ + } else { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ + } \ + } +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ + } else { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ + } \ + } +#if USE_CMPXCHG_FIX +// ------------------------------------------------------------------------- +// workaround for C78287 (complex(kind=4) data type) +#define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, \ + MASK, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ + } else { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ + } \ + } +// end of the second part of the workaround for C78287 +#endif // USE_CMPXCHG_FIX +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +// Routines for ATOMIC 4-byte operands addition and subtraction +ATOMIC_FIXED_ADD(fixed4, add, kmp_int32, 32, +, 4i, 3, + 0) // __kmpc_atomic_fixed4_add +ATOMIC_FIXED_ADD(fixed4, sub, kmp_int32, 32, -, 4i, 3, + 0) // __kmpc_atomic_fixed4_sub + +ATOMIC_CMPXCHG(float4, add, kmp_real32, 32, +, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_add +ATOMIC_CMPXCHG(float4, sub, kmp_real32, 32, -, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub + +// Routines for ATOMIC 8-byte operands addition and subtraction +ATOMIC_FIXED_ADD(fixed8, add, kmp_int64, 64, +, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_add +ATOMIC_FIXED_ADD(fixed8, sub, kmp_int64, 64, -, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub + +ATOMIC_CMPXCHG(float8, add, kmp_real64, 64, +, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_add +ATOMIC_CMPXCHG(float8, sub, kmp_real64, 64, -, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub + +// ------------------------------------------------------------------------ +// Entries definition for integer operands +// TYPE_ID - operands type and size (fixed4, float4) +// OP_ID - operation identifier (add, sub, mul, ...) 
+// TYPE - operand type +// BITS - size in bits, used to distinguish low level calls +// OP - operator (used in critical section) +// LCK_ID - lock identifier, used to possibly distinguish lock variable +// MASK - used for alignment check + +// TYPE_ID,OP_ID, TYPE, BITS,OP,LCK_ID,MASK,GOMP_FLAG +// ------------------------------------------------------------------------ +// Routines for ATOMIC integer operands, other operators +// ------------------------------------------------------------------------ +// TYPE_ID,OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG +ATOMIC_CMPXCHG(fixed1, add, kmp_int8, 8, +, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_add +ATOMIC_CMPXCHG(fixed1, andb, kmp_int8, 8, &, 1i, 0, + 0) // __kmpc_atomic_fixed1_andb +ATOMIC_CMPXCHG(fixed1, div, kmp_int8, 8, /, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div +ATOMIC_CMPXCHG(fixed1u, div, kmp_uint8, 8, /, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div +ATOMIC_CMPXCHG(fixed1, mul, kmp_int8, 8, *, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul +ATOMIC_CMPXCHG(fixed1, orb, kmp_int8, 8, |, 1i, 0, + 0) // __kmpc_atomic_fixed1_orb +ATOMIC_CMPXCHG(fixed1, shl, kmp_int8, 8, <<, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shl +ATOMIC_CMPXCHG(fixed1, shr, kmp_int8, 8, >>, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shr +ATOMIC_CMPXCHG(fixed1u, shr, kmp_uint8, 8, >>, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_shr +ATOMIC_CMPXCHG(fixed1, sub, kmp_int8, 8, -, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub +ATOMIC_CMPXCHG(fixed1, xor, kmp_int8, 8, ^, 1i, 0, + 0) // __kmpc_atomic_fixed1_xor +ATOMIC_CMPXCHG(fixed2, add, kmp_int16, 16, +, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_add +ATOMIC_CMPXCHG(fixed2, andb, kmp_int16, 16, &, 2i, 1, + 0) // __kmpc_atomic_fixed2_andb +ATOMIC_CMPXCHG(fixed2, div, kmp_int16, 16, /, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div +ATOMIC_CMPXCHG(fixed2u, div, kmp_uint16, 16, /, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div +ATOMIC_CMPXCHG(fixed2, mul, kmp_int16, 16, *, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul +ATOMIC_CMPXCHG(fixed2, orb, kmp_int16, 16, |, 2i, 1, + 0) // __kmpc_atomic_fixed2_orb +ATOMIC_CMPXCHG(fixed2, shl, kmp_int16, 16, <<, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shl +ATOMIC_CMPXCHG(fixed2, shr, kmp_int16, 16, >>, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shr +ATOMIC_CMPXCHG(fixed2u, shr, kmp_uint16, 16, >>, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_shr +ATOMIC_CMPXCHG(fixed2, sub, kmp_int16, 16, -, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub +ATOMIC_CMPXCHG(fixed2, xor, kmp_int16, 16, ^, 2i, 1, + 0) // __kmpc_atomic_fixed2_xor +ATOMIC_CMPXCHG(fixed4, andb, kmp_int32, 32, &, 4i, 3, + 0) // __kmpc_atomic_fixed4_andb +ATOMIC_CMPXCHG(fixed4, div, kmp_int32, 32, /, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_div +ATOMIC_CMPXCHG(fixed4u, div, kmp_uint32, 32, /, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_div +ATOMIC_CMPXCHG(fixed4, mul, kmp_int32, 32, *, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_mul +ATOMIC_CMPXCHG(fixed4, orb, kmp_int32, 32, |, 4i, 3, + 0) // __kmpc_atomic_fixed4_orb +ATOMIC_CMPXCHG(fixed4, shl, kmp_int32, 32, <<, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shl +ATOMIC_CMPXCHG(fixed4, shr, kmp_int32, 32, >>, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shr +ATOMIC_CMPXCHG(fixed4u, shr, kmp_uint32, 32, >>, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_shr +ATOMIC_CMPXCHG(fixed4, xor, kmp_int32, 32, ^, 4i, 3, + 0) // __kmpc_atomic_fixed4_xor +ATOMIC_CMPXCHG(fixed8, andb, kmp_int64, 64, &, 
8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_andb +ATOMIC_CMPXCHG(fixed8, div, kmp_int64, 64, /, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div +ATOMIC_CMPXCHG(fixed8u, div, kmp_uint64, 64, /, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div +ATOMIC_CMPXCHG(fixed8, mul, kmp_int64, 64, *, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul +ATOMIC_CMPXCHG(fixed8, orb, kmp_int64, 64, |, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_orb +ATOMIC_CMPXCHG(fixed8, shl, kmp_int64, 64, <<, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shl +ATOMIC_CMPXCHG(fixed8, shr, kmp_int64, 64, >>, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shr +ATOMIC_CMPXCHG(fixed8u, shr, kmp_uint64, 64, >>, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_shr +ATOMIC_CMPXCHG(fixed8, xor, kmp_int64, 64, ^, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_xor +ATOMIC_CMPXCHG(float4, div, kmp_real32, 32, /, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_div +ATOMIC_CMPXCHG(float4, mul, kmp_real32, 32, *, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_mul +ATOMIC_CMPXCHG(float8, div, kmp_real64, 64, /, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_div +ATOMIC_CMPXCHG(float8, mul, kmp_real64, 64, *, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_mul +// TYPE_ID,OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG + +/* ------------------------------------------------------------------------ */ +/* Routines for C/C++ Reduction operators && and || */ + +// ------------------------------------------------------------------------ +// Need separate macros for &&, || because there is no combined assignment +// TODO: eliminate ATOMIC_CRIT_{L,EQV} macros as not used +#define ATOMIC_CRIT_L(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(= *lhs OP, GOMP_FLAG) \ + OP_CRITICAL(= *lhs OP, LCK_ID) \ + } + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// ------------------------------------------------------------------------ +// X86 or X86_64: no alignment problems =================================== +#define ATOMIC_CMPX_L(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(= *lhs OP, GOMP_FLAG) \ + OP_CMPXCHG(TYPE, BITS, OP) \ + } + +#else +// ------------------------------------------------------------------------ +// Code for other architectures that don't handle unaligned accesses. 
+#define ATOMIC_CMPX_L(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(= *lhs OP, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ + } else { \ + KMP_CHECK_GTID; \ + OP_CRITICAL(= *lhs OP, LCK_ID) /* unaligned - use critical */ \ + } \ + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +ATOMIC_CMPX_L(fixed1, andl, char, 8, &&, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_andl +ATOMIC_CMPX_L(fixed1, orl, char, 8, ||, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_orl +ATOMIC_CMPX_L(fixed2, andl, short, 16, &&, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_andl +ATOMIC_CMPX_L(fixed2, orl, short, 16, ||, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_orl +ATOMIC_CMPX_L(fixed4, andl, kmp_int32, 32, &&, 4i, 3, + 0) // __kmpc_atomic_fixed4_andl +ATOMIC_CMPX_L(fixed4, orl, kmp_int32, 32, ||, 4i, 3, + 0) // __kmpc_atomic_fixed4_orl +ATOMIC_CMPX_L(fixed8, andl, kmp_int64, 64, &&, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_andl +ATOMIC_CMPX_L(fixed8, orl, kmp_int64, 64, ||, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_orl + +/* ------------------------------------------------------------------------- */ +/* Routines for Fortran operators that matched no one in C: */ +/* MAX, MIN, .EQV., .NEQV. */ +/* Operators .AND., .OR. are covered by __kmpc_atomic_*_{andl,orl} */ +/* Intrinsics IAND, IOR, IEOR are covered by __kmpc_atomic_*_{andb,orb,xor} */ + +// ------------------------------------------------------------------------- +// MIN and MAX need separate macros +// OP - operator to check if we need any actions? +#define MIN_MAX_CRITSECT(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (*lhs OP rhs) { /* still need actions? */ \ + *lhs = rhs; \ + } \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + +// ------------------------------------------------------------------------- +#ifdef KMP_GOMP_COMPAT +#define GOMP_MIN_MAX_CRITSECT(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + MIN_MAX_CRITSECT(OP, 0); \ + return; \ + } +#else +#define GOMP_MIN_MAX_CRITSECT(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +// ------------------------------------------------------------------------- +#define MIN_MAX_CMPXCHG(TYPE, BITS, OP) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + TYPE old_value; \ + temp_val = *lhs; \ + old_value = temp_val; \ + while (old_value OP rhs && /* still need actions? */ \ + !KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, \ + *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & rhs)) { \ + temp_val = *lhs; \ + old_value = temp_val; \ + } \ + } + +// ------------------------------------------------------------------------- +// 1-byte, 2-byte operands - use critical section +#define MIN_MAX_CRITICAL(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + if (*lhs OP rhs) { /* need actions? 
*/ \ + GOMP_MIN_MAX_CRITSECT(OP, GOMP_FLAG) \ + MIN_MAX_CRITSECT(OP, LCK_ID) \ + } \ + } + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// ------------------------------------------------------------------------- +// X86 or X86_64: no alignment problems ==================================== +#define MIN_MAX_COMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + if (*lhs OP rhs) { \ + GOMP_MIN_MAX_CRITSECT(OP, GOMP_FLAG) \ + MIN_MAX_CMPXCHG(TYPE, BITS, OP) \ + } \ + } + +#else +// ------------------------------------------------------------------------- +// Code for other architectures that don't handle unaligned accesses. +#define MIN_MAX_COMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + if (*lhs OP rhs) { \ + GOMP_MIN_MAX_CRITSECT(OP, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + MIN_MAX_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ + } else { \ + KMP_CHECK_GTID; \ + MIN_MAX_CRITSECT(OP, LCK_ID) /* unaligned address */ \ + } \ + } \ + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +MIN_MAX_COMPXCHG(fixed1, max, char, 8, <, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_max +MIN_MAX_COMPXCHG(fixed1, min, char, 8, >, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_min +MIN_MAX_COMPXCHG(fixed2, max, short, 16, <, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_max +MIN_MAX_COMPXCHG(fixed2, min, short, 16, >, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_min +MIN_MAX_COMPXCHG(fixed4, max, kmp_int32, 32, <, 4i, 3, + 0) // __kmpc_atomic_fixed4_max +MIN_MAX_COMPXCHG(fixed4, min, kmp_int32, 32, >, 4i, 3, + 0) // __kmpc_atomic_fixed4_min +MIN_MAX_COMPXCHG(fixed8, max, kmp_int64, 64, <, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_max +MIN_MAX_COMPXCHG(fixed8, min, kmp_int64, 64, >, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_min +MIN_MAX_COMPXCHG(float4, max, kmp_real32, 32, <, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_max +MIN_MAX_COMPXCHG(float4, min, kmp_real32, 32, >, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_min +MIN_MAX_COMPXCHG(float8, max, kmp_real64, 64, <, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_max +MIN_MAX_COMPXCHG(float8, min, kmp_real64, 64, >, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_min +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +MIN_MAX_CRITICAL(float10, max, long double, <, 10r, + 1) // __kmpc_atomic_float10_max +MIN_MAX_CRITICAL(float10, min, long double, >, 10r, + 1) // __kmpc_atomic_float10_min +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if KMP_HAVE_QUAD +MIN_MAX_CRITICAL(float16, max, QUAD_LEGACY, <, 16r, + 1) // __kmpc_atomic_float16_max +MIN_MAX_CRITICAL(float16, min, QUAD_LEGACY, >, 16r, + 1) // __kmpc_atomic_float16_min +#if (KMP_ARCH_X86) +MIN_MAX_CRITICAL(float16, max_a16, Quad_a16_t, <, 16r, + 1) // __kmpc_atomic_float16_max_a16 +MIN_MAX_CRITICAL(float16, min_a16, Quad_a16_t, >, 16r, + 1) // __kmpc_atomic_float16_min_a16 +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD +// ------------------------------------------------------------------------ +// Need separate macros for .EQV. 
because of the need of complement (~) +// OP ignored for critical sections, ^=~ used instead +#define ATOMIC_CRIT_EQV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(^= (TYPE) ~, GOMP_FLAG) /* send assignment */ \ + OP_CRITICAL(^= (TYPE) ~, LCK_ID) /* send assignment and complement */ \ + } + +// ------------------------------------------------------------------------ +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// ------------------------------------------------------------------------ +// X86 or X86_64: no alignment problems =================================== +#define ATOMIC_CMPX_EQV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(^= (TYPE) ~, GOMP_FLAG) /* send assignment */ \ + OP_CMPXCHG(TYPE, BITS, OP) \ + } +// ------------------------------------------------------------------------ +#else +// ------------------------------------------------------------------------ +// Code for other architectures that don't handle unaligned accesses. +#define ATOMIC_CMPX_EQV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, \ + GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(^= (TYPE) ~, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ + } else { \ + KMP_CHECK_GTID; \ + OP_CRITICAL(^= (TYPE) ~, LCK_ID) /* unaligned address - use critical */ \ + } \ + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +ATOMIC_CMPXCHG(fixed1, neqv, kmp_int8, 8, ^, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_neqv +ATOMIC_CMPXCHG(fixed2, neqv, kmp_int16, 16, ^, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_neqv +ATOMIC_CMPXCHG(fixed4, neqv, kmp_int32, 32, ^, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_neqv +ATOMIC_CMPXCHG(fixed8, neqv, kmp_int64, 64, ^, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_neqv +ATOMIC_CMPX_EQV(fixed1, eqv, kmp_int8, 8, ^~, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_eqv +ATOMIC_CMPX_EQV(fixed2, eqv, kmp_int16, 16, ^~, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_eqv +ATOMIC_CMPX_EQV(fixed4, eqv, kmp_int32, 32, ^~, 4i, 3, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_eqv +ATOMIC_CMPX_EQV(fixed8, eqv, kmp_int64, 64, ^~, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_eqv + +// ------------------------------------------------------------------------ +// Routines for Extended types: long double, _Quad, complex flavours (use +// critical section) +// TYPE_ID, OP_ID, TYPE - detailed above +// OP - operator +// LCK_ID - lock identifier, used to possibly distinguish lock variable +#define ATOMIC_CRITICAL(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) /* send assignment */ \ + } + +/* ------------------------------------------------------------------------- */ +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// routines for long double type +ATOMIC_CRITICAL(float10, add, long double, +, 10r, + 1) // __kmpc_atomic_float10_add +ATOMIC_CRITICAL(float10, sub, long double, -, 10r, + 1) // __kmpc_atomic_float10_sub +ATOMIC_CRITICAL(float10, mul, long double, *, 10r, + 1) // __kmpc_atomic_float10_mul +ATOMIC_CRITICAL(float10, div, long double, /, 10r, + 1) // __kmpc_atomic_float10_div +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if KMP_HAVE_QUAD +// routines for _Quad type +ATOMIC_CRITICAL(float16, add, QUAD_LEGACY, +, 16r, + 1) // 
__kmpc_atomic_float16_add +ATOMIC_CRITICAL(float16, sub, QUAD_LEGACY, -, 16r, + 1) // __kmpc_atomic_float16_sub +ATOMIC_CRITICAL(float16, mul, QUAD_LEGACY, *, 16r, + 1) // __kmpc_atomic_float16_mul +ATOMIC_CRITICAL(float16, div, QUAD_LEGACY, /, 16r, + 1) // __kmpc_atomic_float16_div +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL(float16, add_a16, Quad_a16_t, +, 16r, + 1) // __kmpc_atomic_float16_add_a16 +ATOMIC_CRITICAL(float16, sub_a16, Quad_a16_t, -, 16r, + 1) // __kmpc_atomic_float16_sub_a16 +ATOMIC_CRITICAL(float16, mul_a16, Quad_a16_t, *, 16r, + 1) // __kmpc_atomic_float16_mul_a16 +ATOMIC_CRITICAL(float16, div_a16, Quad_a16_t, /, 16r, + 1) // __kmpc_atomic_float16_div_a16 +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD +// routines for complex types + +#if USE_CMPXCHG_FIX +// workaround for C78287 (complex(kind=4) data type) +ATOMIC_CMPXCHG_WORKAROUND(cmplx4, add, kmp_cmplx32, 64, +, 8c, 7, + 1) // __kmpc_atomic_cmplx4_add +ATOMIC_CMPXCHG_WORKAROUND(cmplx4, sub, kmp_cmplx32, 64, -, 8c, 7, + 1) // __kmpc_atomic_cmplx4_sub +ATOMIC_CMPXCHG_WORKAROUND(cmplx4, mul, kmp_cmplx32, 64, *, 8c, 7, + 1) // __kmpc_atomic_cmplx4_mul +ATOMIC_CMPXCHG_WORKAROUND(cmplx4, div, kmp_cmplx32, 64, /, 8c, 7, + 1) // __kmpc_atomic_cmplx4_div +// end of the workaround for C78287 +#else +ATOMIC_CRITICAL(cmplx4, add, kmp_cmplx32, +, 8c, 1) // __kmpc_atomic_cmplx4_add +ATOMIC_CRITICAL(cmplx4, sub, kmp_cmplx32, -, 8c, 1) // __kmpc_atomic_cmplx4_sub +ATOMIC_CRITICAL(cmplx4, mul, kmp_cmplx32, *, 8c, 1) // __kmpc_atomic_cmplx4_mul +ATOMIC_CRITICAL(cmplx4, div, kmp_cmplx32, /, 8c, 1) // __kmpc_atomic_cmplx4_div +#endif // USE_CMPXCHG_FIX + +ATOMIC_CRITICAL(cmplx8, add, kmp_cmplx64, +, 16c, 1) // __kmpc_atomic_cmplx8_add +ATOMIC_CRITICAL(cmplx8, sub, kmp_cmplx64, -, 16c, 1) // __kmpc_atomic_cmplx8_sub +ATOMIC_CRITICAL(cmplx8, mul, kmp_cmplx64, *, 16c, 1) // __kmpc_atomic_cmplx8_mul +ATOMIC_CRITICAL(cmplx8, div, kmp_cmplx64, /, 16c, 1) // __kmpc_atomic_cmplx8_div +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +ATOMIC_CRITICAL(cmplx10, add, kmp_cmplx80, +, 20c, + 1) // __kmpc_atomic_cmplx10_add +ATOMIC_CRITICAL(cmplx10, sub, kmp_cmplx80, -, 20c, + 1) // __kmpc_atomic_cmplx10_sub +ATOMIC_CRITICAL(cmplx10, mul, kmp_cmplx80, *, 20c, + 1) // __kmpc_atomic_cmplx10_mul +ATOMIC_CRITICAL(cmplx10, div, kmp_cmplx80, /, 20c, + 1) // __kmpc_atomic_cmplx10_div +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL(cmplx16, add, CPLX128_LEG, +, 32c, + 1) // __kmpc_atomic_cmplx16_add +ATOMIC_CRITICAL(cmplx16, sub, CPLX128_LEG, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub +ATOMIC_CRITICAL(cmplx16, mul, CPLX128_LEG, *, 32c, + 1) // __kmpc_atomic_cmplx16_mul +ATOMIC_CRITICAL(cmplx16, div, CPLX128_LEG, /, 32c, + 1) // __kmpc_atomic_cmplx16_div +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL(cmplx16, add_a16, kmp_cmplx128_a16_t, +, 32c, + 1) // __kmpc_atomic_cmplx16_add_a16 +ATOMIC_CRITICAL(cmplx16, sub_a16, kmp_cmplx128_a16_t, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub_a16 +ATOMIC_CRITICAL(cmplx16, mul_a16, kmp_cmplx128_a16_t, *, 32c, + 1) // __kmpc_atomic_cmplx16_mul_a16 +ATOMIC_CRITICAL(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, + 1) // __kmpc_atomic_cmplx16_div_a16 +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// OpenMP 4.0: x = expr binop x for non-commutative operations. 
+// Supported only on IA-32 architecture and Intel(R) 64 +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs bound by critical section +// OP - operator (it's supposed to contain an assignment) +// LCK_ID - lock identifier +// Note: don't check gtid as it should always be valid +// 1, 2-byte - expect valid parameter, other - check before this macro +#define OP_CRITICAL_REV(TYPE, OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + (*lhs) = (TYPE)((rhs)OP(*lhs)); \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_REV(TYPE, OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_REV(TYPE, OP, 0); \ + return; \ + } + +#else +#define OP_GOMP_CRITICAL_REV(TYPE, OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +// Beginning of a definition (provides name, parameters, gebug trace) +// TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned +// fixed) +// OP_ID - operation identifier (add, sub, mul, ...) +// TYPE - operands' type +#define ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, RET_TYPE) \ + RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID##_rev(ident_t *id_ref, int gtid, \ + TYPE *lhs, TYPE rhs) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_rev: T#%d\n", gtid)); + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs using "compare_and_store" routine +// TYPE - operands' type +// BITS - size in bits, used to distinguish low level calls +// OP - operator +// Note: temp_val introduced in order to force the compiler to read +// *lhs only once (w/o it the compiler reads *lhs twice) +#define OP_CMPXCHG_REV(TYPE, BITS, OP) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + TYPE old_value, new_value; \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = (TYPE)(rhs OP old_value); \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ + KMP_DO_PAUSE; \ + \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = (TYPE)(rhs OP old_value); \ + } \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_REV(TYPE, BITS, OP) \ + } + +// ------------------------------------------------------------------------ +// Entries definition for integer operands +// TYPE_ID - operands type and size (fixed4, float4) +// OP_ID - operation identifier (add, sub, mul, ...) 
+// TYPE - operand type +// BITS - size in bits, used to distinguish low level calls +// OP - operator (used in critical section) +// LCK_ID - lock identifier, used to possibly distinguish lock variable + +// TYPE_ID,OP_ID, TYPE, BITS,OP,LCK_ID,GOMP_FLAG +// ------------------------------------------------------------------------ +// Routines for ATOMIC integer operands, other operators +// ------------------------------------------------------------------------ +// TYPE_ID,OP_ID, TYPE, BITS, OP, LCK_ID, GOMP_FLAG +ATOMIC_CMPXCHG_REV(fixed1, div, kmp_int8, 8, /, 1i, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_rev +ATOMIC_CMPXCHG_REV(fixed1u, div, kmp_uint8, 8, /, 1i, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_rev +ATOMIC_CMPXCHG_REV(fixed1, shl, kmp_int8, 8, <<, 1i, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shl_rev +ATOMIC_CMPXCHG_REV(fixed1, shr, kmp_int8, 8, >>, 1i, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shr_rev +ATOMIC_CMPXCHG_REV(fixed1u, shr, kmp_uint8, 8, >>, 1i, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_shr_rev +ATOMIC_CMPXCHG_REV(fixed1, sub, kmp_int8, 8, -, 1i, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_rev + +ATOMIC_CMPXCHG_REV(fixed2, div, kmp_int16, 16, /, 2i, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_rev +ATOMIC_CMPXCHG_REV(fixed2u, div, kmp_uint16, 16, /, 2i, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_rev +ATOMIC_CMPXCHG_REV(fixed2, shl, kmp_int16, 16, <<, 2i, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shl_rev +ATOMIC_CMPXCHG_REV(fixed2, shr, kmp_int16, 16, >>, 2i, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shr_rev +ATOMIC_CMPXCHG_REV(fixed2u, shr, kmp_uint16, 16, >>, 2i, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_shr_rev +ATOMIC_CMPXCHG_REV(fixed2, sub, kmp_int16, 16, -, 2i, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_rev + +ATOMIC_CMPXCHG_REV(fixed4, div, kmp_int32, 32, /, 4i, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_div_rev +ATOMIC_CMPXCHG_REV(fixed4u, div, kmp_uint32, 32, /, 4i, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_div_rev +ATOMIC_CMPXCHG_REV(fixed4, shl, kmp_int32, 32, <<, 4i, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shl_rev +ATOMIC_CMPXCHG_REV(fixed4, shr, kmp_int32, 32, >>, 4i, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shr_rev +ATOMIC_CMPXCHG_REV(fixed4u, shr, kmp_uint32, 32, >>, 4i, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_shr_rev +ATOMIC_CMPXCHG_REV(fixed4, sub, kmp_int32, 32, -, 4i, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_sub_rev + +ATOMIC_CMPXCHG_REV(fixed8, div, kmp_int64, 64, /, 8i, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_rev +ATOMIC_CMPXCHG_REV(fixed8u, div, kmp_uint64, 64, /, 8i, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_rev +ATOMIC_CMPXCHG_REV(fixed8, shl, kmp_int64, 64, <<, 8i, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shl_rev +ATOMIC_CMPXCHG_REV(fixed8, shr, kmp_int64, 64, >>, 8i, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shr_rev +ATOMIC_CMPXCHG_REV(fixed8u, shr, kmp_uint64, 64, >>, 8i, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_shr_rev +ATOMIC_CMPXCHG_REV(fixed8, sub, kmp_int64, 64, -, 8i, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_rev + +ATOMIC_CMPXCHG_REV(float4, div, kmp_real32, 32, /, 4r, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_rev +ATOMIC_CMPXCHG_REV(float4, sub, kmp_real32, 32, -, 4r, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_rev + +ATOMIC_CMPXCHG_REV(float8, div, kmp_real64, 64, /, 8r, + KMP_ARCH_X86) // __kmpc_atomic_float8_div_rev +ATOMIC_CMPXCHG_REV(float8, sub, kmp_real64, 64, -, 8r, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub_rev +// TYPE_ID,OP_ID, TYPE, BITS,OP,LCK_ID, GOMP_FLAG + +// 
------------------------------------------------------------------------ +// Routines for Extended types: long double, _Quad, complex flavours (use +// critical section) +// TYPE_ID, OP_ID, TYPE - detailed above +// OP - operator +// LCK_ID - lock identifier, used to possibly distinguish lock variable +#define ATOMIC_CRITICAL_REV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ + OP_CRITICAL_REV(TYPE, OP, LCK_ID) \ + } + +/* ------------------------------------------------------------------------- */ +// routines for long double type +ATOMIC_CRITICAL_REV(float10, sub, long double, -, 10r, + 1) // __kmpc_atomic_float10_sub_rev +ATOMIC_CRITICAL_REV(float10, div, long double, /, 10r, + 1) // __kmpc_atomic_float10_div_rev +#if KMP_HAVE_QUAD +// routines for _Quad type +ATOMIC_CRITICAL_REV(float16, sub, QUAD_LEGACY, -, 16r, + 1) // __kmpc_atomic_float16_sub_rev +ATOMIC_CRITICAL_REV(float16, div, QUAD_LEGACY, /, 16r, + 1) // __kmpc_atomic_float16_div_rev +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_REV(float16, sub_a16, Quad_a16_t, -, 16r, + 1) // __kmpc_atomic_float16_sub_a16_rev +ATOMIC_CRITICAL_REV(float16, div_a16, Quad_a16_t, /, 16r, + 1) // __kmpc_atomic_float16_div_a16_rev +#endif // KMP_ARCH_X86 +#endif // KMP_HAVE_QUAD + +// routines for complex types +ATOMIC_CRITICAL_REV(cmplx4, sub, kmp_cmplx32, -, 8c, + 1) // __kmpc_atomic_cmplx4_sub_rev +ATOMIC_CRITICAL_REV(cmplx4, div, kmp_cmplx32, /, 8c, + 1) // __kmpc_atomic_cmplx4_div_rev +ATOMIC_CRITICAL_REV(cmplx8, sub, kmp_cmplx64, -, 16c, + 1) // __kmpc_atomic_cmplx8_sub_rev +ATOMIC_CRITICAL_REV(cmplx8, div, kmp_cmplx64, /, 16c, + 1) // __kmpc_atomic_cmplx8_div_rev +ATOMIC_CRITICAL_REV(cmplx10, sub, kmp_cmplx80, -, 20c, + 1) // __kmpc_atomic_cmplx10_sub_rev +ATOMIC_CRITICAL_REV(cmplx10, div, kmp_cmplx80, /, 20c, + 1) // __kmpc_atomic_cmplx10_div_rev +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_REV(cmplx16, sub, CPLX128_LEG, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub_rev +ATOMIC_CRITICAL_REV(cmplx16, div, CPLX128_LEG, /, 32c, + 1) // __kmpc_atomic_cmplx16_div_rev +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_REV(cmplx16, sub_a16, kmp_cmplx128_a16_t, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub_a16_rev +ATOMIC_CRITICAL_REV(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, + 1) // __kmpc_atomic_cmplx16_div_a16_rev +#endif // KMP_ARCH_X86 +#endif // KMP_HAVE_QUAD + +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 +// End of OpenMP 4.0: x = expr binop x for non-commutative operations. 
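Editorial note, not part of the upstream patch: a short usage sketch of why the reversed entry points exist. An `omp atomic` update that keeps the shared location on the right-hand side of a non-commutative operator has no fetch-and-op form, so an OpenMP compiler targeting this runtime routes it through the _rev family above; the call named in the comment below is the conceptual mapping only, since the exact lowering is compiler-specific.

/* Hypothetical usage sketch (illustrative only). */
void scale_reciprocal(double *x, double num) {
#pragma omp atomic
  *x = num / *x; /* shared value on the RHS of '/': conceptually lowered to
                    __kmpc_atomic_float8_div_rev(&loc, gtid, x, num),
                    i.e. *x = num / *x performed atomically */
}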
+ +/* ------------------------------------------------------------------------ */ +/* Routines for mixed types of LHS and RHS, when RHS is "larger" */ +/* Note: in order to reduce the total number of types combinations */ +/* it is supposed that compiler converts RHS to longest floating type,*/ +/* that is _Quad, before call to any of these routines */ +/* Conversion to _Quad will be done by the compiler during calculation, */ +/* conversion back to TYPE - before the assignment, like: */ +/* *lhs = (TYPE)( (_Quad)(*lhs) OP rhs ) */ +/* Performance penalty expected because of SW emulation use */ +/* ------------------------------------------------------------------------ */ + +#define ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + void __kmpc_atomic_##TYPE_ID##_##OP_ID##_##RTYPE_ID( \ + ident_t *id_ref, int gtid, TYPE *lhs, RTYPE rhs) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, \ + ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_" #RTYPE_ID ": T#%d\n", \ + gtid)); + +// ------------------------------------------------------------------------- +#define ATOMIC_CRITICAL_FP(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, LCK_ID, \ + GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) /* send assignment */ \ + } + +// ------------------------------------------------------------------------- +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// ------------------------------------------------------------------------- +// X86 or X86_64: no alignment problems ==================================== +#define ATOMIC_CMPXCHG_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ + LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG(TYPE, BITS, OP) \ + } +// ------------------------------------------------------------------------- +#else +// ------------------------------------------------------------------------ +// Code for other architectures that don't handle unaligned accesses. 
+#define ATOMIC_CMPXCHG_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ + LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ + } else { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ + } \ + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +// ------------------------------------------------------------------------- +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_REV_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, \ + RTYPE, LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_REV(TYPE, BITS, OP) \ + } +#define ATOMIC_CRITICAL_REV_FP(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, \ + LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_GOMP_CRITICAL_REV(TYPE, OP, GOMP_FLAG) \ + OP_CRITICAL_REV(TYPE, OP, LCK_ID) \ + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +// RHS=float8 +ATOMIC_CMPXCHG_MIX(fixed1, char, mul, 8, *, float8, kmp_real64, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_float8 +ATOMIC_CMPXCHG_MIX(fixed1, char, div, 8, /, float8, kmp_real64, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_float8 +ATOMIC_CMPXCHG_MIX(fixed2, short, mul, 16, *, float8, kmp_real64, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_float8 +ATOMIC_CMPXCHG_MIX(fixed2, short, div, 16, /, float8, kmp_real64, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_float8 +ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, mul, 32, *, float8, kmp_real64, 4i, 3, + 0) // __kmpc_atomic_fixed4_mul_float8 +ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, div, 32, /, float8, kmp_real64, 4i, 3, + 0) // __kmpc_atomic_fixed4_div_float8 +ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, mul, 64, *, float8, kmp_real64, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_float8 +ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, div, 64, /, float8, kmp_real64, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_float8 +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, add, 32, +, float8, kmp_real64, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_add_float8 +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, sub, 32, -, float8, kmp_real64, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_float8 +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, mul, 32, *, float8, kmp_real64, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_mul_float8 +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, div, 32, /, float8, kmp_real64, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_float8 + +// RHS=float16 (deprecated, to be removed when we are sure the compiler does not +// use them) +#if KMP_HAVE_QUAD +ATOMIC_CMPXCHG_MIX(fixed1, char, add, 8, +, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_add_fp +ATOMIC_CMPXCHG_MIX(fixed1u, uchar, add, 8, +, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_add_fp +ATOMIC_CMPXCHG_MIX(fixed1, char, sub, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_fp +ATOMIC_CMPXCHG_MIX(fixed1u, uchar, sub, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_fp +ATOMIC_CMPXCHG_MIX(fixed1, char, mul, 8, *, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_fp +ATOMIC_CMPXCHG_MIX(fixed1u, uchar, mul, 8, *, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_mul_fp 
+ATOMIC_CMPXCHG_MIX(fixed1, char, div, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_fp +ATOMIC_CMPXCHG_MIX(fixed1u, uchar, div, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_fp + +ATOMIC_CMPXCHG_MIX(fixed2, short, add, 16, +, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_add_fp +ATOMIC_CMPXCHG_MIX(fixed2u, ushort, add, 16, +, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_add_fp +ATOMIC_CMPXCHG_MIX(fixed2, short, sub, 16, -, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_fp +ATOMIC_CMPXCHG_MIX(fixed2u, ushort, sub, 16, -, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_fp +ATOMIC_CMPXCHG_MIX(fixed2, short, mul, 16, *, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_fp +ATOMIC_CMPXCHG_MIX(fixed2u, ushort, mul, 16, *, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_mul_fp +ATOMIC_CMPXCHG_MIX(fixed2, short, div, 16, /, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_fp +ATOMIC_CMPXCHG_MIX(fixed2u, ushort, div, 16, /, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_fp + +ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, add, 32, +, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_add_fp +ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, add, 32, +, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_add_fp +ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, sub, 32, -, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_sub_fp +ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, sub, 32, -, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_sub_fp +ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, mul, 32, *, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_mul_fp +ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, mul, 32, *, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_mul_fp +ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, div, 32, /, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_div_fp +ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, div, 32, /, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_div_fp + +ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, add, 64, +, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_add_fp +ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, add, 64, +, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_add_fp +ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, sub, 64, -, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_fp +ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, sub, 64, -, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_fp +ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, mul, 64, *, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_fp +ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, mul, 64, *, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_mul_fp +ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, div, 64, /, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_fp +ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, div, 64, /, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_fp + +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, add, 32, +, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_add_fp +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, sub, 32, -, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_fp +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, mul, 32, *, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_mul_fp +ATOMIC_CMPXCHG_MIX(float4, kmp_real32, div, 32, /, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_fp + +ATOMIC_CMPXCHG_MIX(float8, kmp_real64, add, 64, +, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_add_fp +ATOMIC_CMPXCHG_MIX(float8, 
kmp_real64, sub, 64, -, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub_fp +ATOMIC_CMPXCHG_MIX(float8, kmp_real64, mul, 64, *, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_mul_fp +ATOMIC_CMPXCHG_MIX(float8, kmp_real64, div, 64, /, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_div_fp + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +ATOMIC_CRITICAL_FP(float10, long double, add, +, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_add_fp +ATOMIC_CRITICAL_FP(float10, long double, sub, -, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_sub_fp +ATOMIC_CRITICAL_FP(float10, long double, mul, *, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_mul_fp +ATOMIC_CRITICAL_FP(float10, long double, div, /, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_div_fp + +// Reverse operations +ATOMIC_CMPXCHG_REV_MIX(fixed1, char, sub_rev, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed1u, uchar, sub_rev, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed1, char, div_rev, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed1u, uchar, div_rev, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_rev_fp + +ATOMIC_CMPXCHG_REV_MIX(fixed2, short, sub_rev, 16, -, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed2u, ushort, sub_rev, 16, -, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed2, short, div_rev, 16, /, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed2u, ushort, div_rev, 16, /, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_rev_fp + +ATOMIC_CMPXCHG_REV_MIX(fixed4, kmp_int32, sub_rev, 32, -, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed4u, kmp_uint32, sub_rev, 32, -, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed4, kmp_int32, div_rev, 32, /, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_div_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed4u, kmp_uint32, div_rev, 32, /, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_div_rev_fp + +ATOMIC_CMPXCHG_REV_MIX(fixed8, kmp_int64, sub_rev, 64, -, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed8u, kmp_uint64, sub_rev, 64, -, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed8, kmp_int64, div_rev, 64, /, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_rev_fp +ATOMIC_CMPXCHG_REV_MIX(fixed8u, kmp_uint64, div_rev, 64, /, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_rev_fp + +ATOMIC_CMPXCHG_REV_MIX(float4, kmp_real32, sub_rev, 32, -, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(float4, kmp_real32, div_rev, 32, /, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_rev_fp + +ATOMIC_CMPXCHG_REV_MIX(float8, kmp_real64, sub_rev, 64, -, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub_rev_fp +ATOMIC_CMPXCHG_REV_MIX(float8, kmp_real64, div_rev, 64, /, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_div_rev_fp + +ATOMIC_CRITICAL_REV_FP(float10, long double, sub_rev, -, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_sub_rev_fp +ATOMIC_CRITICAL_REV_FP(float10, long double, div_rev, /, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_div_rev_fp +#endif 
/* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#endif // KMP_HAVE_QUAD + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// ------------------------------------------------------------------------ +// X86 or X86_64: no alignment problems ==================================== +#if USE_CMPXCHG_FIX +// workaround for C78287 (complex(kind=4) data type) +#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ + LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP) \ + } +// end of the second part of the workaround for C78287 +#else +#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ + LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG(TYPE, BITS, OP) \ + } +#endif // USE_CMPXCHG_FIX +#else +// ------------------------------------------------------------------------ +// Code for other architectures that don't handle unaligned accesses. +#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, \ + LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE) \ + OP_UPDATE_GOMP_CRITICAL(TYPE, OP, GOMP_FLAG) \ + if (!((kmp_uintptr_t)lhs & 0x##MASK)) { \ + OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */ \ + } else { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL(TYPE, OP, \ + LCK_ID) /* unaligned address - use critical */ \ + } \ + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, add, 64, +, cmplx8, kmp_cmplx64, 8c, + 7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_add_cmplx8 +ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, sub, 64, -, cmplx8, kmp_cmplx64, 8c, + 7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_sub_cmplx8 +ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, mul, 64, *, cmplx8, kmp_cmplx64, 8c, + 7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_mul_cmplx8 +ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, div, 64, /, cmplx8, kmp_cmplx64, 8c, + 7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_div_cmplx8 + +// READ, WRITE, CAPTURE + +// ------------------------------------------------------------------------ +// Atomic READ routines + +// ------------------------------------------------------------------------ +// Beginning of a definition (provides name, parameters, gebug trace) +// TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned +// fixed) +// OP_ID - operation identifier (add, sub, mul, ...) 
+// TYPE - operands' type +#define ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, RET_TYPE) \ + RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid, \ + TYPE *loc) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid)); + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs using "compare_and_store_ret" routine +// TYPE - operands' type +// BITS - size in bits, used to distinguish low level calls +// OP - operator +// Note: temp_val introduced in order to force the compiler to read +// *lhs only once (w/o it the compiler reads *lhs twice) +// TODO: check if it is still necessary +// Return old value regardless of the result of "compare & swap# operation +#define OP_CMPXCHG_READ(TYPE, BITS, OP) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + union f_i_union { \ + TYPE f_val; \ + kmp_int##BITS i_val; \ + }; \ + union f_i_union old_value; \ + temp_val = *loc; \ + old_value.f_val = temp_val; \ + old_value.i_val = KMP_COMPARE_AND_STORE_RET##BITS( \ + (kmp_int##BITS *)loc, \ + *VOLATILE_CAST(kmp_int##BITS *) & old_value.i_val, \ + *VOLATILE_CAST(kmp_int##BITS *) & old_value.i_val); \ + new_value = old_value.f_val; \ + return new_value; \ + } + +// ------------------------------------------------------------------------- +// Operation on *lhs, rhs bound by critical section +// OP - operator (it's supposed to contain an assignment) +// LCK_ID - lock identifier +// Note: don't check gtid as it should always be valid +// 1, 2-byte - expect valid parameter, other - check before this macro +#define OP_CRITICAL_READ(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + new_value = (*loc); \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + +// ------------------------------------------------------------------------- +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_READ(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_READ(OP, 0); \ + return new_value; \ + } +#else +#define OP_GOMP_CRITICAL_READ(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +// ------------------------------------------------------------------------- +#define ATOMIC_FIXED_READ(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + OP_GOMP_CRITICAL_READ(OP## =, GOMP_FLAG) \ + new_value = KMP_TEST_THEN_ADD##BITS(loc, OP 0); \ + return new_value; \ + } +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_READ(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + OP_GOMP_CRITICAL_READ(OP## =, GOMP_FLAG) \ + OP_CMPXCHG_READ(TYPE, BITS, OP) \ + } +// ------------------------------------------------------------------------ +// Routines for Extended types: long double, _Quad, complex flavours (use +// critical section) +// TYPE_ID, OP_ID, TYPE - detailed above +// OP - operator +// LCK_ID - lock identifier, used to possibly distinguish lock variable +#define ATOMIC_CRITICAL_READ(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + OP_GOMP_CRITICAL_READ(OP## =, GOMP_FLAG) /* send assignment */ \ + OP_CRITICAL_READ(OP, LCK_ID) /* send assignment */ \ + return new_value; \ + } + +// ------------------------------------------------------------------------ +// Fix for cmplx4 read (CQ220361) on Windows* OS. 
Regular routine with return +// value doesn't work. +// Let's return the read value through the additional parameter. +#if (KMP_OS_WINDOWS) + +#define OP_CRITICAL_READ_WRK(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + (*out) = (*loc); \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); +// ------------------------------------------------------------------------ +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_READ_WRK(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_READ_WRK(OP, 0); \ + } +#else +#define OP_GOMP_CRITICAL_READ_WRK(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ +// ------------------------------------------------------------------------ +#define ATOMIC_BEGIN_READ_WRK(TYPE_ID, OP_ID, TYPE) \ + void __kmpc_atomic_##TYPE_ID##_##OP_ID(TYPE *out, ident_t *id_ref, int gtid, \ + TYPE *loc) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid)); + +// ------------------------------------------------------------------------ +#define ATOMIC_CRITICAL_READ_WRK(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_READ_WRK(TYPE_ID, OP_ID, TYPE) \ + OP_GOMP_CRITICAL_READ_WRK(OP## =, GOMP_FLAG) /* send assignment */ \ + OP_CRITICAL_READ_WRK(OP, LCK_ID) /* send assignment */ \ + } + +#endif // KMP_OS_WINDOWS + +// ------------------------------------------------------------------------ +// TYPE_ID,OP_ID, TYPE, OP, GOMP_FLAG +ATOMIC_FIXED_READ(fixed4, rd, kmp_int32, 32, +, 0) // __kmpc_atomic_fixed4_rd +ATOMIC_FIXED_READ(fixed8, rd, kmp_int64, 64, +, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_rd +ATOMIC_CMPXCHG_READ(float4, rd, kmp_real32, 32, +, + KMP_ARCH_X86) // __kmpc_atomic_float4_rd +ATOMIC_CMPXCHG_READ(float8, rd, kmp_real64, 64, +, + KMP_ARCH_X86) // __kmpc_atomic_float8_rd + +// !!! 
TODO: Remove lock operations for "char" since it can't be non-atomic +ATOMIC_CMPXCHG_READ(fixed1, rd, kmp_int8, 8, +, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_rd +ATOMIC_CMPXCHG_READ(fixed2, rd, kmp_int16, 16, +, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_rd + +ATOMIC_CRITICAL_READ(float10, rd, long double, +, 10r, + 1) // __kmpc_atomic_float10_rd +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_READ(float16, rd, QUAD_LEGACY, +, 16r, + 1) // __kmpc_atomic_float16_rd +#endif // KMP_HAVE_QUAD + +// Fix for CQ220361 on Windows* OS +#if (KMP_OS_WINDOWS) +ATOMIC_CRITICAL_READ_WRK(cmplx4, rd, kmp_cmplx32, +, 8c, + 1) // __kmpc_atomic_cmplx4_rd +#else +ATOMIC_CRITICAL_READ(cmplx4, rd, kmp_cmplx32, +, 8c, + 1) // __kmpc_atomic_cmplx4_rd +#endif // (KMP_OS_WINDOWS) +ATOMIC_CRITICAL_READ(cmplx8, rd, kmp_cmplx64, +, 16c, + 1) // __kmpc_atomic_cmplx8_rd +ATOMIC_CRITICAL_READ(cmplx10, rd, kmp_cmplx80, +, 20c, + 1) // __kmpc_atomic_cmplx10_rd +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_READ(cmplx16, rd, CPLX128_LEG, +, 32c, + 1) // __kmpc_atomic_cmplx16_rd +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_READ(float16, a16_rd, Quad_a16_t, +, 16r, + 1) // __kmpc_atomic_float16_a16_rd +ATOMIC_CRITICAL_READ(cmplx16, a16_rd, kmp_cmplx128_a16_t, +, 32c, + 1) // __kmpc_atomic_cmplx16_a16_rd +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// ------------------------------------------------------------------------ +// Atomic WRITE routines + +#define ATOMIC_XCHG_WR(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(OP, GOMP_FLAG) \ + KMP_XCHG_FIXED##BITS(lhs, rhs); \ + } +// ------------------------------------------------------------------------ +#define ATOMIC_XCHG_FLOAT_WR(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(OP, GOMP_FLAG) \ + KMP_XCHG_REAL##BITS(lhs, rhs); \ + } + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs using "compare_and_store" routine +// TYPE - operands' type +// BITS - size in bits, used to distinguish low level calls +// OP - operator +// Note: temp_val introduced in order to force the compiler to read +// *lhs only once (w/o it the compiler reads *lhs twice) +#define OP_CMPXCHG_WR(TYPE, BITS, OP) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + TYPE old_value, new_value; \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = rhs; \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = rhs; \ + } \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_WR(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(OP, GOMP_FLAG) \ + OP_CMPXCHG_WR(TYPE, BITS, OP) \ + } + +// ------------------------------------------------------------------------ +// Routines for Extended types: long double, _Quad, complex flavours (use +// critical section) +// TYPE_ID, OP_ID, TYPE - detailed above +// OP - operator +// LCK_ID - lock identifier, used to possibly distinguish lock variable +#define ATOMIC_CRITICAL_WR(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void) \ + OP_GOMP_CRITICAL(OP, GOMP_FLAG) /* send assignment */ \ + OP_CRITICAL(OP, LCK_ID) /* send assignment */ \ + } +// 
------------------------------------------------------------------------- + +ATOMIC_XCHG_WR(fixed1, wr, kmp_int8, 8, =, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_wr +ATOMIC_XCHG_WR(fixed2, wr, kmp_int16, 16, =, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_wr +ATOMIC_XCHG_WR(fixed4, wr, kmp_int32, 32, =, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_wr +#if (KMP_ARCH_X86) +ATOMIC_CMPXCHG_WR(fixed8, wr, kmp_int64, 64, =, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_wr +#else +ATOMIC_XCHG_WR(fixed8, wr, kmp_int64, 64, =, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_wr +#endif // (KMP_ARCH_X86) + +ATOMIC_XCHG_FLOAT_WR(float4, wr, kmp_real32, 32, =, + KMP_ARCH_X86) // __kmpc_atomic_float4_wr +#if (KMP_ARCH_X86) +ATOMIC_CMPXCHG_WR(float8, wr, kmp_real64, 64, =, + KMP_ARCH_X86) // __kmpc_atomic_float8_wr +#else +ATOMIC_XCHG_FLOAT_WR(float8, wr, kmp_real64, 64, =, + KMP_ARCH_X86) // __kmpc_atomic_float8_wr +#endif // (KMP_ARCH_X86) + +ATOMIC_CRITICAL_WR(float10, wr, long double, =, 10r, + 1) // __kmpc_atomic_float10_wr +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_WR(float16, wr, QUAD_LEGACY, =, 16r, + 1) // __kmpc_atomic_float16_wr +#endif // KMP_HAVE_QUAD +ATOMIC_CRITICAL_WR(cmplx4, wr, kmp_cmplx32, =, 8c, 1) // __kmpc_atomic_cmplx4_wr +ATOMIC_CRITICAL_WR(cmplx8, wr, kmp_cmplx64, =, 16c, + 1) // __kmpc_atomic_cmplx8_wr +ATOMIC_CRITICAL_WR(cmplx10, wr, kmp_cmplx80, =, 20c, + 1) // __kmpc_atomic_cmplx10_wr +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_WR(cmplx16, wr, CPLX128_LEG, =, 32c, + 1) // __kmpc_atomic_cmplx16_wr +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_WR(float16, a16_wr, Quad_a16_t, =, 16r, + 1) // __kmpc_atomic_float16_a16_wr +ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c, + 1) // __kmpc_atomic_cmplx16_a16_wr +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// ------------------------------------------------------------------------ +// Atomic CAPTURE routines + +// Beginning of a definition (provides name, parameters, gebug trace) +// TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned +// fixed) +// OP_ID - operation identifier (add, sub, mul, ...) 
+// TYPE - operands' type +#define ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, RET_TYPE) \ + RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid, \ + TYPE *lhs, TYPE rhs, int flag) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid)); + +// ------------------------------------------------------------------------- +// Operation on *lhs, rhs bound by critical section +// OP - operator (it's supposed to contain an assignment) +// LCK_ID - lock identifier +// Note: don't check gtid as it should always be valid +// 1, 2-byte - expect valid parameter, other - check before this macro +#define OP_CRITICAL_CPT(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (flag) { \ + (*lhs) OP rhs; \ + new_value = (*lhs); \ + } else { \ + new_value = (*lhs); \ + (*lhs) OP rhs; \ + } \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return new_value; + +#define OP_UPDATE_CRITICAL_CPT(TYPE, OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (flag) { \ + (*lhs) = (TYPE)((*lhs)OP rhs); \ + new_value = (*lhs); \ + } else { \ + new_value = (*lhs); \ + (*lhs) = (TYPE)((*lhs)OP rhs); \ + } \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return new_value; + +// ------------------------------------------------------------------------ +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_CPT(TYPE, OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_UPDATE_CRITICAL_CPT(TYPE, OP, 0); \ + } +#else +#define OP_GOMP_CRITICAL_CPT(TYPE, OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs using "compare_and_store" routine +// TYPE - operands' type +// BITS - size in bits, used to distinguish low level calls +// OP - operator +// Note: temp_val introduced in order to force the compiler to read +// *lhs only once (w/o it the compiler reads *lhs twice) +#define OP_CMPXCHG_CPT(TYPE, BITS, OP) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + TYPE old_value, new_value; \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = (TYPE)(old_value OP rhs); \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = (TYPE)(old_value OP rhs); \ + } \ + if (flag) { \ + return new_value; \ + } else \ + return old_value; \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_CPT(TYPE, BITS, OP) \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_FIXED_ADD_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE old_value, new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \ + /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ + old_value = KMP_TEST_THEN_ADD##BITS(lhs, OP rhs); \ + if (flag) { \ + return old_value OP rhs; \ + } else \ + return old_value; \ + } +// ------------------------------------------------------------------------- + +ATOMIC_FIXED_ADD_CPT(fixed4, add_cpt, kmp_int32, 32, +, 
+ 0) // __kmpc_atomic_fixed4_add_cpt +ATOMIC_FIXED_ADD_CPT(fixed4, sub_cpt, kmp_int32, 32, -, + 0) // __kmpc_atomic_fixed4_sub_cpt +ATOMIC_FIXED_ADD_CPT(fixed8, add_cpt, kmp_int64, 64, +, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_add_cpt +ATOMIC_FIXED_ADD_CPT(fixed8, sub_cpt, kmp_int64, 64, -, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt + +ATOMIC_CMPXCHG_CPT(float4, add_cpt, kmp_real32, 32, +, + KMP_ARCH_X86) // __kmpc_atomic_float4_add_cpt +ATOMIC_CMPXCHG_CPT(float4, sub_cpt, kmp_real32, 32, -, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt +ATOMIC_CMPXCHG_CPT(float8, add_cpt, kmp_real64, 64, +, + KMP_ARCH_X86) // __kmpc_atomic_float8_add_cpt +ATOMIC_CMPXCHG_CPT(float8, sub_cpt, kmp_real64, 64, -, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt + +// ------------------------------------------------------------------------ +// Entries definition for integer operands +// TYPE_ID - operands type and size (fixed4, float4) +// OP_ID - operation identifier (add, sub, mul, ...) +// TYPE - operand type +// BITS - size in bits, used to distinguish low level calls +// OP - operator (used in critical section) +// TYPE_ID,OP_ID, TYPE, BITS,OP,GOMP_FLAG +// ------------------------------------------------------------------------ +// Routines for ATOMIC integer operands, other operators +// ------------------------------------------------------------------------ +// TYPE_ID,OP_ID, TYPE, OP, GOMP_FLAG +ATOMIC_CMPXCHG_CPT(fixed1, add_cpt, kmp_int8, 8, +, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_add_cpt +ATOMIC_CMPXCHG_CPT(fixed1, andb_cpt, kmp_int8, 8, &, + 0) // __kmpc_atomic_fixed1_andb_cpt +ATOMIC_CMPXCHG_CPT(fixed1, div_cpt, kmp_int8, 8, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt +ATOMIC_CMPXCHG_CPT(fixed1u, div_cpt, kmp_uint8, 8, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt +ATOMIC_CMPXCHG_CPT(fixed1, mul_cpt, kmp_int8, 8, *, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_cpt +ATOMIC_CMPXCHG_CPT(fixed1, orb_cpt, kmp_int8, 8, |, + 0) // __kmpc_atomic_fixed1_orb_cpt +ATOMIC_CMPXCHG_CPT(fixed1, shl_cpt, kmp_int8, 8, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shl_cpt +ATOMIC_CMPXCHG_CPT(fixed1, shr_cpt, kmp_int8, 8, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed1u, shr_cpt, kmp_uint8, 8, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed1, sub_cpt, kmp_int8, 8, -, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt +ATOMIC_CMPXCHG_CPT(fixed1, xor_cpt, kmp_int8, 8, ^, + 0) // __kmpc_atomic_fixed1_xor_cpt +ATOMIC_CMPXCHG_CPT(fixed2, add_cpt, kmp_int16, 16, +, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_add_cpt +ATOMIC_CMPXCHG_CPT(fixed2, andb_cpt, kmp_int16, 16, &, + 0) // __kmpc_atomic_fixed2_andb_cpt +ATOMIC_CMPXCHG_CPT(fixed2, div_cpt, kmp_int16, 16, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt +ATOMIC_CMPXCHG_CPT(fixed2u, div_cpt, kmp_uint16, 16, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt +ATOMIC_CMPXCHG_CPT(fixed2, mul_cpt, kmp_int16, 16, *, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_cpt +ATOMIC_CMPXCHG_CPT(fixed2, orb_cpt, kmp_int16, 16, |, + 0) // __kmpc_atomic_fixed2_orb_cpt +ATOMIC_CMPXCHG_CPT(fixed2, shl_cpt, kmp_int16, 16, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shl_cpt +ATOMIC_CMPXCHG_CPT(fixed2, shr_cpt, kmp_int16, 16, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed2u, shr_cpt, kmp_uint16, 16, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed2, sub_cpt, kmp_int16, 16, -, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt +ATOMIC_CMPXCHG_CPT(fixed2, 
xor_cpt, kmp_int16, 16, ^, + 0) // __kmpc_atomic_fixed2_xor_cpt +ATOMIC_CMPXCHG_CPT(fixed4, andb_cpt, kmp_int32, 32, &, + 0) // __kmpc_atomic_fixed4_andb_cpt +ATOMIC_CMPXCHG_CPT(fixed4, div_cpt, kmp_int32, 32, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_div_cpt +ATOMIC_CMPXCHG_CPT(fixed4u, div_cpt, kmp_uint32, 32, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_div_cpt +ATOMIC_CMPXCHG_CPT(fixed4, mul_cpt, kmp_int32, 32, *, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_mul_cpt +ATOMIC_CMPXCHG_CPT(fixed4, orb_cpt, kmp_int32, 32, |, + 0) // __kmpc_atomic_fixed4_orb_cpt +ATOMIC_CMPXCHG_CPT(fixed4, shl_cpt, kmp_int32, 32, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shl_cpt +ATOMIC_CMPXCHG_CPT(fixed4, shr_cpt, kmp_int32, 32, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed4u, shr_cpt, kmp_uint32, 32, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed4, xor_cpt, kmp_int32, 32, ^, + 0) // __kmpc_atomic_fixed4_xor_cpt +ATOMIC_CMPXCHG_CPT(fixed8, andb_cpt, kmp_int64, 64, &, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_andb_cpt +ATOMIC_CMPXCHG_CPT(fixed8, div_cpt, kmp_int64, 64, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt +ATOMIC_CMPXCHG_CPT(fixed8u, div_cpt, kmp_uint64, 64, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt +ATOMIC_CMPXCHG_CPT(fixed8, mul_cpt, kmp_int64, 64, *, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_cpt +ATOMIC_CMPXCHG_CPT(fixed8, orb_cpt, kmp_int64, 64, |, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_orb_cpt +ATOMIC_CMPXCHG_CPT(fixed8, shl_cpt, kmp_int64, 64, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shl_cpt +ATOMIC_CMPXCHG_CPT(fixed8, shr_cpt, kmp_int64, 64, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed8u, shr_cpt, kmp_uint64, 64, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_shr_cpt +ATOMIC_CMPXCHG_CPT(fixed8, xor_cpt, kmp_int64, 64, ^, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_xor_cpt +ATOMIC_CMPXCHG_CPT(float4, div_cpt, kmp_real32, 32, /, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt +ATOMIC_CMPXCHG_CPT(float4, mul_cpt, kmp_real32, 32, *, + KMP_ARCH_X86) // __kmpc_atomic_float4_mul_cpt +ATOMIC_CMPXCHG_CPT(float8, div_cpt, kmp_real64, 64, /, + KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt +ATOMIC_CMPXCHG_CPT(float8, mul_cpt, kmp_real64, 64, *, + KMP_ARCH_X86) // __kmpc_atomic_float8_mul_cpt +// TYPE_ID,OP_ID, TYPE, OP, GOMP_FLAG + +// CAPTURE routines for mixed types RHS=float16 +#if KMP_HAVE_QUAD + +// Beginning of a definition (provides name, parameters, gebug trace) +// TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned +// fixed) +// OP_ID - operation identifier (add, sub, mul, ...) 
+// TYPE - operands' type +#define ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ + TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID##_##RTYPE_ID( \ + ident_t *id_ref, int gtid, TYPE *lhs, RTYPE rhs, int flag) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, \ + ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_" #RTYPE_ID ": T#%d\n", \ + gtid)); + +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_CPT_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, \ + RTYPE, LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ + TYPE new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_CPT(TYPE, BITS, OP) \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_CRITICAL_CPT_MIX(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, \ + LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ + TYPE new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL_CPT(TYPE, OP, LCK_ID) /* send assignment */ \ + } + +ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, add_cpt, 8, +, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, add_cpt, 8, +, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, sub_cpt, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, sub_cpt, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, mul_cpt, 8, *, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, mul_cpt, 8, *, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, div_cpt, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, div_cpt, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt_fp + +ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, add_cpt, 16, +, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, add_cpt, 16, +, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, sub_cpt, 16, -, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, sub_cpt, 16, -, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, mul_cpt, 16, *, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, mul_cpt, 16, *, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, div_cpt, 16, /, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, div_cpt, 16, /, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt_fp + +ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, add_cpt, 32, +, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed4u, kmp_uint32, add_cpt, 32, +, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, sub_cpt, 32, -, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed4u, 
kmp_uint32, sub_cpt, 32, -, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, mul_cpt, 32, *, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed4u, kmp_uint32, mul_cpt, 32, *, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, div_cpt, 32, /, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4_div_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed4u, kmp_uint32, div_cpt, 32, /, fp, _Quad, 4i, 3, + 0) // __kmpc_atomic_fixed4u_div_cpt_fp + +ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, add_cpt, 64, +, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, add_cpt, 64, +, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, sub_cpt, 64, -, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, sub_cpt, 64, -, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, mul_cpt, 64, *, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, mul_cpt, 64, *, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, div_cpt, 64, /, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, div_cpt, 64, /, fp, _Quad, 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt_fp + +ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, add_cpt, 32, +, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, sub_cpt, 32, -, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, mul_cpt, 32, *, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, div_cpt, 32, /, fp, _Quad, 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt_fp + +ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, add_cpt, 64, +, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_add_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, sub_cpt, 64, -, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, mul_cpt, 64, *, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_mul_cpt_fp +ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, div_cpt, 64, /, fp, _Quad, 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt_fp + +ATOMIC_CRITICAL_CPT_MIX(float10, long double, add_cpt, +, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_add_cpt_fp +ATOMIC_CRITICAL_CPT_MIX(float10, long double, sub_cpt, -, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_sub_cpt_fp +ATOMIC_CRITICAL_CPT_MIX(float10, long double, mul_cpt, *, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_mul_cpt_fp +ATOMIC_CRITICAL_CPT_MIX(float10, long double, div_cpt, /, fp, _Quad, 10r, + 1) // __kmpc_atomic_float10_div_cpt_fp + +#endif // KMP_HAVE_QUAD + +// ------------------------------------------------------------------------ +// Routines for C/C++ Reduction operators && and || + +// ------------------------------------------------------------------------- +// Operation on *lhs, rhs bound by critical section +// OP - operator (it's supposed to contain an assignment) +// LCK_ID - lock identifier +// Note: don't check gtid as it should always be valid +// 1, 2-byte - 
expect valid parameter, other - check before this macro +#define OP_CRITICAL_L_CPT(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (flag) { \ + new_value OP rhs; \ + (*lhs) = new_value; \ + } else { \ + new_value = (*lhs); \ + (*lhs) OP rhs; \ + } \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); + +// ------------------------------------------------------------------------ +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_L_CPT(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_L_CPT(OP, 0); \ + return new_value; \ + } +#else +#define OP_GOMP_CRITICAL_L_CPT(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +// ------------------------------------------------------------------------ +// Need separate macros for &&, || because there is no combined assignment +#define ATOMIC_CMPX_L_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_L_CPT(= *lhs OP, GOMP_FLAG) \ + OP_CMPXCHG_CPT(TYPE, BITS, OP) \ + } + +ATOMIC_CMPX_L_CPT(fixed1, andl_cpt, char, 8, &&, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_andl_cpt +ATOMIC_CMPX_L_CPT(fixed1, orl_cpt, char, 8, ||, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_orl_cpt +ATOMIC_CMPX_L_CPT(fixed2, andl_cpt, short, 16, &&, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_andl_cpt +ATOMIC_CMPX_L_CPT(fixed2, orl_cpt, short, 16, ||, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_orl_cpt +ATOMIC_CMPX_L_CPT(fixed4, andl_cpt, kmp_int32, 32, &&, + 0) // __kmpc_atomic_fixed4_andl_cpt +ATOMIC_CMPX_L_CPT(fixed4, orl_cpt, kmp_int32, 32, ||, + 0) // __kmpc_atomic_fixed4_orl_cpt +ATOMIC_CMPX_L_CPT(fixed8, andl_cpt, kmp_int64, 64, &&, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_andl_cpt +ATOMIC_CMPX_L_CPT(fixed8, orl_cpt, kmp_int64, 64, ||, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_orl_cpt + +// ------------------------------------------------------------------------- +// Routines for Fortran operators that matched no one in C: +// MAX, MIN, .EQV., .NEQV. +// Operators .AND., .OR. are covered by __kmpc_atomic_*_{andl,orl}_cpt +// Intrinsics IAND, IOR, IEOR are covered by __kmpc_atomic_*_{andb,orb,xor}_cpt + +// ------------------------------------------------------------------------- +// MIN and MAX need separate macros +// OP - operator to check if we need any actions? +#define MIN_MAX_CRITSECT_CPT(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (*lhs OP rhs) { /* still need actions? */ \ + old_value = *lhs; \ + *lhs = rhs; \ + if (flag) \ + new_value = rhs; \ + else \ + new_value = old_value; \ + } else { \ + new_value = *lhs; \ + } \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return new_value; + +// ------------------------------------------------------------------------- +#ifdef KMP_GOMP_COMPAT +#define GOMP_MIN_MAX_CRITSECT_CPT(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + MIN_MAX_CRITSECT_CPT(OP, 0); \ + } +#else +#define GOMP_MIN_MAX_CRITSECT_CPT(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +// ------------------------------------------------------------------------- +#define MIN_MAX_CMPXCHG_CPT(TYPE, BITS, OP) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + /*TYPE old_value; */ \ + temp_val = *lhs; \ + old_value = temp_val; \ + while (old_value OP rhs && /* still need actions? 
*/ \ + !KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, \ + *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & rhs)) { \ + temp_val = *lhs; \ + old_value = temp_val; \ + } \ + if (flag) \ + return rhs; \ + else \ + return old_value; \ + } + +// ------------------------------------------------------------------------- +// 1-byte, 2-byte operands - use critical section +#define MIN_MAX_CRITICAL_CPT(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value, old_value; \ + if (*lhs OP rhs) { /* need actions? */ \ + GOMP_MIN_MAX_CRITSECT_CPT(OP, GOMP_FLAG) \ + MIN_MAX_CRITSECT_CPT(OP, LCK_ID) \ + } \ + return *lhs; \ + } + +#define MIN_MAX_COMPXCHG_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value, old_value; \ + (void)new_value; \ + if (*lhs OP rhs) { \ + GOMP_MIN_MAX_CRITSECT_CPT(OP, GOMP_FLAG) \ + MIN_MAX_CMPXCHG_CPT(TYPE, BITS, OP) \ + } \ + return *lhs; \ + } + +MIN_MAX_COMPXCHG_CPT(fixed1, max_cpt, char, 8, <, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_max_cpt +MIN_MAX_COMPXCHG_CPT(fixed1, min_cpt, char, 8, >, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_min_cpt +MIN_MAX_COMPXCHG_CPT(fixed2, max_cpt, short, 16, <, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_max_cpt +MIN_MAX_COMPXCHG_CPT(fixed2, min_cpt, short, 16, >, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_min_cpt +MIN_MAX_COMPXCHG_CPT(fixed4, max_cpt, kmp_int32, 32, <, + 0) // __kmpc_atomic_fixed4_max_cpt +MIN_MAX_COMPXCHG_CPT(fixed4, min_cpt, kmp_int32, 32, >, + 0) // __kmpc_atomic_fixed4_min_cpt +MIN_MAX_COMPXCHG_CPT(fixed8, max_cpt, kmp_int64, 64, <, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_max_cpt +MIN_MAX_COMPXCHG_CPT(fixed8, min_cpt, kmp_int64, 64, >, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_min_cpt +MIN_MAX_COMPXCHG_CPT(float4, max_cpt, kmp_real32, 32, <, + KMP_ARCH_X86) // __kmpc_atomic_float4_max_cpt +MIN_MAX_COMPXCHG_CPT(float4, min_cpt, kmp_real32, 32, >, + KMP_ARCH_X86) // __kmpc_atomic_float4_min_cpt +MIN_MAX_COMPXCHG_CPT(float8, max_cpt, kmp_real64, 64, <, + KMP_ARCH_X86) // __kmpc_atomic_float8_max_cpt +MIN_MAX_COMPXCHG_CPT(float8, min_cpt, kmp_real64, 64, >, + KMP_ARCH_X86) // __kmpc_atomic_float8_min_cpt +MIN_MAX_CRITICAL_CPT(float10, max_cpt, long double, <, 10r, + 1) // __kmpc_atomic_float10_max_cpt +MIN_MAX_CRITICAL_CPT(float10, min_cpt, long double, >, 10r, + 1) // __kmpc_atomic_float10_min_cpt +#if KMP_HAVE_QUAD +MIN_MAX_CRITICAL_CPT(float16, max_cpt, QUAD_LEGACY, <, 16r, + 1) // __kmpc_atomic_float16_max_cpt +MIN_MAX_CRITICAL_CPT(float16, min_cpt, QUAD_LEGACY, >, 16r, + 1) // __kmpc_atomic_float16_min_cpt +#if (KMP_ARCH_X86) +MIN_MAX_CRITICAL_CPT(float16, max_a16_cpt, Quad_a16_t, <, 16r, + 1) // __kmpc_atomic_float16_max_a16_cpt +MIN_MAX_CRITICAL_CPT(float16, min_a16_cpt, Quad_a16_t, >, 16r, + 1) // __kmpc_atomic_float16_mix_a16_cpt +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// ------------------------------------------------------------------------ +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_EQV_CPT(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_CPT(OP, 0); \ + } +#else +#define OP_GOMP_CRITICAL_EQV_CPT(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ +// ------------------------------------------------------------------------ +#define ATOMIC_CMPX_EQV_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + (void)new_value; \ + 
OP_GOMP_CRITICAL_EQV_CPT(^= (TYPE) ~, GOMP_FLAG) /* send assignment */ \ + OP_CMPXCHG_CPT(TYPE, BITS, OP) \ + } + +// ------------------------------------------------------------------------ + +ATOMIC_CMPXCHG_CPT(fixed1, neqv_cpt, kmp_int8, 8, ^, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_neqv_cpt +ATOMIC_CMPXCHG_CPT(fixed2, neqv_cpt, kmp_int16, 16, ^, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_neqv_cpt +ATOMIC_CMPXCHG_CPT(fixed4, neqv_cpt, kmp_int32, 32, ^, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_neqv_cpt +ATOMIC_CMPXCHG_CPT(fixed8, neqv_cpt, kmp_int64, 64, ^, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_neqv_cpt +ATOMIC_CMPX_EQV_CPT(fixed1, eqv_cpt, kmp_int8, 8, ^~, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_eqv_cpt +ATOMIC_CMPX_EQV_CPT(fixed2, eqv_cpt, kmp_int16, 16, ^~, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_eqv_cpt +ATOMIC_CMPX_EQV_CPT(fixed4, eqv_cpt, kmp_int32, 32, ^~, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_eqv_cpt +ATOMIC_CMPX_EQV_CPT(fixed8, eqv_cpt, kmp_int64, 64, ^~, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_eqv_cpt + +// ------------------------------------------------------------------------ +// Routines for Extended types: long double, _Quad, complex flavours (use +// critical section) +// TYPE_ID, OP_ID, TYPE - detailed above +// OP - operator +// LCK_ID - lock identifier, used to possibly distinguish lock variable +#define ATOMIC_CRITICAL_CPT(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + OP_GOMP_CRITICAL_CPT(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_UPDATE_CRITICAL_CPT(TYPE, OP, LCK_ID) /* send assignment */ \ + } + +// ------------------------------------------------------------------------ +// Workaround for cmplx4. Regular routines with return value don't work +// on Win_32e. Let's return captured values through the additional parameter. 
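+// For illustration (a sketch, not the exact preprocessor output): with the
+// macros defined just below, ATOMIC_CRITICAL_CPT_WRK(cmplx4, add_cpt,
+// kmp_cmplx32, +, 8c, 1) expands to roughly this shape, with the debug
+// assert, trace, and GOMP-compat lines omitted:
+//
+//   void __kmpc_atomic_cmplx4_add_cpt(ident_t *id_ref, int gtid,
+//                                     kmp_cmplx32 *lhs, kmp_cmplx32 rhs,
+//                                     kmp_cmplx32 *out, int flag) {
+//     __kmp_acquire_atomic_lock(&ATOMIC_LOCK8c, gtid);
+//     if (flag) {          // capture the value after the update
+//       (*lhs) += rhs;
+//       (*out) = (*lhs);
+//     } else {             // capture the value before the update
+//       (*out) = (*lhs);
+//       (*lhs) += rhs;
+//     }
+//     __kmp_release_atomic_lock(&ATOMIC_LOCK8c, gtid);
+//   }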
+#define OP_CRITICAL_CPT_WRK(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (flag) { \ + (*lhs) OP rhs; \ + (*out) = (*lhs); \ + } else { \ + (*out) = (*lhs); \ + (*lhs) OP rhs; \ + } \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return; +// ------------------------------------------------------------------------ + +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_CPT_WRK(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_CPT_WRK(OP## =, 0); \ + } +#else +#define OP_GOMP_CRITICAL_CPT_WRK(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ +// ------------------------------------------------------------------------ + +#define ATOMIC_BEGIN_WRK(TYPE_ID, OP_ID, TYPE) \ + void __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid, TYPE *lhs, \ + TYPE rhs, TYPE *out, int flag) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid)); +// ------------------------------------------------------------------------ + +#define ATOMIC_CRITICAL_CPT_WRK(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_WRK(TYPE_ID, OP_ID, TYPE) \ + OP_GOMP_CRITICAL_CPT_WRK(OP, GOMP_FLAG) \ + OP_CRITICAL_CPT_WRK(OP## =, LCK_ID) \ + } +// The end of workaround for cmplx4 + +/* ------------------------------------------------------------------------- */ +// routines for long double type +ATOMIC_CRITICAL_CPT(float10, add_cpt, long double, +, 10r, + 1) // __kmpc_atomic_float10_add_cpt +ATOMIC_CRITICAL_CPT(float10, sub_cpt, long double, -, 10r, + 1) // __kmpc_atomic_float10_sub_cpt +ATOMIC_CRITICAL_CPT(float10, mul_cpt, long double, *, 10r, + 1) // __kmpc_atomic_float10_mul_cpt +ATOMIC_CRITICAL_CPT(float10, div_cpt, long double, /, 10r, + 1) // __kmpc_atomic_float10_div_cpt +#if KMP_HAVE_QUAD +// routines for _Quad type +ATOMIC_CRITICAL_CPT(float16, add_cpt, QUAD_LEGACY, +, 16r, + 1) // __kmpc_atomic_float16_add_cpt +ATOMIC_CRITICAL_CPT(float16, sub_cpt, QUAD_LEGACY, -, 16r, + 1) // __kmpc_atomic_float16_sub_cpt +ATOMIC_CRITICAL_CPT(float16, mul_cpt, QUAD_LEGACY, *, 16r, + 1) // __kmpc_atomic_float16_mul_cpt +ATOMIC_CRITICAL_CPT(float16, div_cpt, QUAD_LEGACY, /, 16r, + 1) // __kmpc_atomic_float16_div_cpt +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_CPT(float16, add_a16_cpt, Quad_a16_t, +, 16r, + 1) // __kmpc_atomic_float16_add_a16_cpt +ATOMIC_CRITICAL_CPT(float16, sub_a16_cpt, Quad_a16_t, -, 16r, + 1) // __kmpc_atomic_float16_sub_a16_cpt +ATOMIC_CRITICAL_CPT(float16, mul_a16_cpt, Quad_a16_t, *, 16r, + 1) // __kmpc_atomic_float16_mul_a16_cpt +ATOMIC_CRITICAL_CPT(float16, div_a16_cpt, Quad_a16_t, /, 16r, + 1) // __kmpc_atomic_float16_div_a16_cpt +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// routines for complex types + +// cmplx4 routines to return void +ATOMIC_CRITICAL_CPT_WRK(cmplx4, add_cpt, kmp_cmplx32, +, 8c, + 1) // __kmpc_atomic_cmplx4_add_cpt +ATOMIC_CRITICAL_CPT_WRK(cmplx4, sub_cpt, kmp_cmplx32, -, 8c, + 1) // __kmpc_atomic_cmplx4_sub_cpt +ATOMIC_CRITICAL_CPT_WRK(cmplx4, mul_cpt, kmp_cmplx32, *, 8c, + 1) // __kmpc_atomic_cmplx4_mul_cpt +ATOMIC_CRITICAL_CPT_WRK(cmplx4, div_cpt, kmp_cmplx32, /, 8c, + 1) // __kmpc_atomic_cmplx4_div_cpt + +ATOMIC_CRITICAL_CPT(cmplx8, add_cpt, kmp_cmplx64, +, 16c, + 1) // __kmpc_atomic_cmplx8_add_cpt +ATOMIC_CRITICAL_CPT(cmplx8, sub_cpt, kmp_cmplx64, -, 16c, + 1) // __kmpc_atomic_cmplx8_sub_cpt +ATOMIC_CRITICAL_CPT(cmplx8, mul_cpt, kmp_cmplx64, *, 16c, + 1) // __kmpc_atomic_cmplx8_mul_cpt +ATOMIC_CRITICAL_CPT(cmplx8, div_cpt, 
kmp_cmplx64, /, 16c, + 1) // __kmpc_atomic_cmplx8_div_cpt +ATOMIC_CRITICAL_CPT(cmplx10, add_cpt, kmp_cmplx80, +, 20c, + 1) // __kmpc_atomic_cmplx10_add_cpt +ATOMIC_CRITICAL_CPT(cmplx10, sub_cpt, kmp_cmplx80, -, 20c, + 1) // __kmpc_atomic_cmplx10_sub_cpt +ATOMIC_CRITICAL_CPT(cmplx10, mul_cpt, kmp_cmplx80, *, 20c, + 1) // __kmpc_atomic_cmplx10_mul_cpt +ATOMIC_CRITICAL_CPT(cmplx10, div_cpt, kmp_cmplx80, /, 20c, + 1) // __kmpc_atomic_cmplx10_div_cpt +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_CPT(cmplx16, add_cpt, CPLX128_LEG, +, 32c, + 1) // __kmpc_atomic_cmplx16_add_cpt +ATOMIC_CRITICAL_CPT(cmplx16, sub_cpt, CPLX128_LEG, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub_cpt +ATOMIC_CRITICAL_CPT(cmplx16, mul_cpt, CPLX128_LEG, *, 32c, + 1) // __kmpc_atomic_cmplx16_mul_cpt +ATOMIC_CRITICAL_CPT(cmplx16, div_cpt, CPLX128_LEG, /, 32c, + 1) // __kmpc_atomic_cmplx16_div_cpt +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_CPT(cmplx16, add_a16_cpt, kmp_cmplx128_a16_t, +, 32c, + 1) // __kmpc_atomic_cmplx16_add_a16_cpt +ATOMIC_CRITICAL_CPT(cmplx16, sub_a16_cpt, kmp_cmplx128_a16_t, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub_a16_cpt +ATOMIC_CRITICAL_CPT(cmplx16, mul_a16_cpt, kmp_cmplx128_a16_t, *, 32c, + 1) // __kmpc_atomic_cmplx16_mul_a16_cpt +ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c, + 1) // __kmpc_atomic_cmplx16_div_a16_cpt +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// OpenMP 4.0: v = x = expr binop x; { v = x; x = expr binop x; } { x = expr +// binop x; v = x; } for non-commutative operations. +// Supported only on IA-32 architecture and Intel(R) 64 + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// ------------------------------------------------------------------------- +// Operation on *lhs, rhs bound by critical section +// OP - operator (it's supposed to contain an assignment) +// LCK_ID - lock identifier +// Note: don't check gtid as it should always be valid +// 1, 2-byte - expect valid parameter, other - check before this macro +#define OP_CRITICAL_CPT_REV(TYPE, OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (flag) { \ + /*temp_val = (*lhs);*/ \ + (*lhs) = (TYPE)((rhs)OP(*lhs)); \ + new_value = (*lhs); \ + } else { \ + new_value = (*lhs); \ + (*lhs) = (TYPE)((rhs)OP(*lhs)); \ + } \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return new_value; + +// ------------------------------------------------------------------------ +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_CPT_REV(TYPE, OP, 0); \ + } +#else +#define OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ + +// ------------------------------------------------------------------------ +// Operation on *lhs, rhs using "compare_and_store" routine +// TYPE - operands' type +// BITS - size in bits, used to distinguish low level calls +// OP - operator +// Note: temp_val introduced in order to force the compiler to read +// *lhs only once (w/o it the compiler reads *lhs twice) +#define OP_CMPXCHG_CPT_REV(TYPE, BITS, OP) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + TYPE old_value, new_value; \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = (TYPE)(rhs OP old_value); \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = (TYPE)(rhs OP old_value); \ + } \ + if (flag) { \ + return 
new_value; \ + } else \ + return old_value; \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_CPT_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_CPT_REV(TYPE, BITS, OP) \ + } + +ATOMIC_CMPXCHG_CPT_REV(fixed1, div_cpt_rev, kmp_int8, 8, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed1u, div_cpt_rev, kmp_uint8, 8, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed1, shl_cpt_rev, kmp_int8, 8, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shl_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed1, shr_cpt_rev, kmp_int8, 8, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed1u, shr_cpt_rev, kmp_uint8, 8, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed1, sub_cpt_rev, kmp_int8, 8, -, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed2, div_cpt_rev, kmp_int16, 16, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed2u, div_cpt_rev, kmp_uint16, 16, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed2, shl_cpt_rev, kmp_int16, 16, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shl_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed2, shr_cpt_rev, kmp_int16, 16, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed2u, shr_cpt_rev, kmp_uint16, 16, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed2, sub_cpt_rev, kmp_int16, 16, -, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed4, div_cpt_rev, kmp_int32, 32, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed4u, div_cpt_rev, kmp_uint32, 32, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed4, shl_cpt_rev, kmp_int32, 32, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shl_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed4, shr_cpt_rev, kmp_int32, 32, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed4u, shr_cpt_rev, kmp_uint32, 32, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed4u_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed4, sub_cpt_rev, kmp_int32, 32, -, + KMP_ARCH_X86) // __kmpc_atomic_fixed4_sub_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed8, div_cpt_rev, kmp_int64, 64, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed8u, div_cpt_rev, kmp_uint64, 64, /, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed8, shl_cpt_rev, kmp_int64, 64, <<, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shl_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed8, shr_cpt_rev, kmp_int64, 64, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed8u, shr_cpt_rev, kmp_uint64, 64, >>, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_shr_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(fixed8, sub_cpt_rev, kmp_int64, 64, -, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(float4, div_cpt_rev, kmp_real32, 32, /, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(float4, sub_cpt_rev, kmp_real32, 32, -, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt_rev +ATOMIC_CMPXCHG_CPT_REV(float8, div_cpt_rev, kmp_real64, 64, /, + KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt_rev 
+ATOMIC_CMPXCHG_CPT_REV(float8, sub_cpt_rev, kmp_real64, 64, -, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt_rev +// TYPE_ID,OP_ID, TYPE, OP, GOMP_FLAG + +// ------------------------------------------------------------------------ +// Routines for Extended types: long double, _Quad, complex flavours (use +// critical section) +// TYPE_ID, OP_ID, TYPE - detailed above +// OP - operator +// LCK_ID - lock identifier, used to possibly distinguish lock variable +#define ATOMIC_CRITICAL_CPT_REV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE) \ + TYPE new_value; \ + /*printf("__kmp_atomic_mode = %d\n", __kmp_atomic_mode);*/ \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \ + OP_CRITICAL_CPT_REV(TYPE, OP, LCK_ID) \ + } + +/* ------------------------------------------------------------------------- */ +// routines for long double type +ATOMIC_CRITICAL_CPT_REV(float10, sub_cpt_rev, long double, -, 10r, + 1) // __kmpc_atomic_float10_sub_cpt_rev +ATOMIC_CRITICAL_CPT_REV(float10, div_cpt_rev, long double, /, 10r, + 1) // __kmpc_atomic_float10_div_cpt_rev +#if KMP_HAVE_QUAD +// routines for _Quad type +ATOMIC_CRITICAL_CPT_REV(float16, sub_cpt_rev, QUAD_LEGACY, -, 16r, + 1) // __kmpc_atomic_float16_sub_cpt_rev +ATOMIC_CRITICAL_CPT_REV(float16, div_cpt_rev, QUAD_LEGACY, /, 16r, + 1) // __kmpc_atomic_float16_div_cpt_rev +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_CPT_REV(float16, sub_a16_cpt_rev, Quad_a16_t, -, 16r, + 1) // __kmpc_atomic_float16_sub_a16_cpt_rev +ATOMIC_CRITICAL_CPT_REV(float16, div_a16_cpt_rev, Quad_a16_t, /, 16r, + 1) // __kmpc_atomic_float16_div_a16_cpt_rev +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// routines for complex types + +// ------------------------------------------------------------------------ +// Workaround for cmplx4. Regular routines with return value don't work +// on Win_32e. Let's return captured values through the additional parameter. +#define OP_CRITICAL_CPT_REV_WRK(OP, LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + if (flag) { \ + (*lhs) = (rhs)OP(*lhs); \ + (*out) = (*lhs); \ + } else { \ + (*out) = (*lhs); \ + (*lhs) = (rhs)OP(*lhs); \ + } \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return; +// ------------------------------------------------------------------------ + +#ifdef KMP_GOMP_COMPAT +#define OP_GOMP_CRITICAL_CPT_REV_WRK(OP, FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + OP_CRITICAL_CPT_REV_WRK(OP, 0); \ + } +#else +#define OP_GOMP_CRITICAL_CPT_REV_WRK(OP, FLAG) +#endif /* KMP_GOMP_COMPAT */ +// ------------------------------------------------------------------------ + +#define ATOMIC_CRITICAL_CPT_REV_WRK(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, \ + GOMP_FLAG) \ + ATOMIC_BEGIN_WRK(TYPE_ID, OP_ID, TYPE) \ + OP_GOMP_CRITICAL_CPT_REV_WRK(OP, GOMP_FLAG) \ + OP_CRITICAL_CPT_REV_WRK(OP, LCK_ID) \ + } +// The end of workaround for cmplx4 + +// !!! 
TODO: check if we need to return void for cmplx4 routines +// cmplx4 routines to return void +ATOMIC_CRITICAL_CPT_REV_WRK(cmplx4, sub_cpt_rev, kmp_cmplx32, -, 8c, + 1) // __kmpc_atomic_cmplx4_sub_cpt_rev +ATOMIC_CRITICAL_CPT_REV_WRK(cmplx4, div_cpt_rev, kmp_cmplx32, /, 8c, + 1) // __kmpc_atomic_cmplx4_div_cpt_rev + +ATOMIC_CRITICAL_CPT_REV(cmplx8, sub_cpt_rev, kmp_cmplx64, -, 16c, + 1) // __kmpc_atomic_cmplx8_sub_cpt_rev +ATOMIC_CRITICAL_CPT_REV(cmplx8, div_cpt_rev, kmp_cmplx64, /, 16c, + 1) // __kmpc_atomic_cmplx8_div_cpt_rev +ATOMIC_CRITICAL_CPT_REV(cmplx10, sub_cpt_rev, kmp_cmplx80, -, 20c, + 1) // __kmpc_atomic_cmplx10_sub_cpt_rev +ATOMIC_CRITICAL_CPT_REV(cmplx10, div_cpt_rev, kmp_cmplx80, /, 20c, + 1) // __kmpc_atomic_cmplx10_div_cpt_rev +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_CPT_REV(cmplx16, sub_cpt_rev, CPLX128_LEG, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub_cpt_rev +ATOMIC_CRITICAL_CPT_REV(cmplx16, div_cpt_rev, CPLX128_LEG, /, 32c, + 1) // __kmpc_atomic_cmplx16_div_cpt_rev +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_CPT_REV(cmplx16, sub_a16_cpt_rev, kmp_cmplx128_a16_t, -, 32c, + 1) // __kmpc_atomic_cmplx16_sub_a16_cpt_rev +ATOMIC_CRITICAL_CPT_REV(cmplx16, div_a16_cpt_rev, kmp_cmplx128_a16_t, /, 32c, + 1) // __kmpc_atomic_cmplx16_div_a16_cpt_rev +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// Capture reverse for mixed type: RHS=float16 +#if KMP_HAVE_QUAD + +// Beginning of a definition (provides name, parameters, gebug trace) +// TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned +// fixed) +// OP_ID - operation identifier (add, sub, mul, ...) +// TYPE - operands' type +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_CPT_REV_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, \ + RTYPE, LCK_ID, MASK, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ + TYPE new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) \ + OP_CMPXCHG_CPT_REV(TYPE, BITS, OP) \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_CRITICAL_CPT_REV_MIX(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, \ + LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE) \ + TYPE new_value; \ + (void)new_value; \ + OP_GOMP_CRITICAL_CPT_REV(TYPE, OP, GOMP_FLAG) /* send assignment */ \ + OP_CRITICAL_CPT_REV(TYPE, OP, LCK_ID) /* send assignment */ \ + } + +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1, char, sub_cpt_rev, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1u, uchar, sub_cpt_rev, 8, -, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1, char, div_cpt_rev, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1u, uchar, div_cpt_rev, 8, /, fp, _Quad, 1i, 0, + KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt_rev_fp + +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2, short, sub_cpt_rev, 16, -, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2u, ushort, sub_cpt_rev, 16, -, fp, _Quad, 2i, + 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2, short, div_cpt_rev, 16, /, fp, _Quad, 2i, 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2u, ushort, div_cpt_rev, 16, /, fp, _Quad, 2i, + 1, + KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt_rev_fp + 
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4, kmp_int32, sub_cpt_rev, 32, -, fp, _Quad, 4i, + 3, 0) // __kmpc_atomic_fixed4_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4u, kmp_uint32, sub_cpt_rev, 32, -, fp, _Quad, + 4i, 3, 0) // __kmpc_atomic_fixed4u_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4, kmp_int32, div_cpt_rev, 32, /, fp, _Quad, 4i, + 3, 0) // __kmpc_atomic_fixed4_div_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4u, kmp_uint32, div_cpt_rev, 32, /, fp, _Quad, + 4i, 3, 0) // __kmpc_atomic_fixed4u_div_cpt_rev_fp + +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8, kmp_int64, sub_cpt_rev, 64, -, fp, _Quad, 8i, + 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8u, kmp_uint64, sub_cpt_rev, 64, -, fp, _Quad, + 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8, kmp_int64, div_cpt_rev, 64, /, fp, _Quad, 8i, + 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8u, kmp_uint64, div_cpt_rev, 64, /, fp, _Quad, + 8i, 7, + KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt_rev_fp + +ATOMIC_CMPXCHG_CPT_REV_MIX(float4, kmp_real32, sub_cpt_rev, 32, -, fp, _Quad, + 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(float4, kmp_real32, div_cpt_rev, 32, /, fp, _Quad, + 4r, 3, + KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt_rev_fp + +ATOMIC_CMPXCHG_CPT_REV_MIX(float8, kmp_real64, sub_cpt_rev, 64, -, fp, _Quad, + 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt_rev_fp +ATOMIC_CMPXCHG_CPT_REV_MIX(float8, kmp_real64, div_cpt_rev, 64, /, fp, _Quad, + 8r, 7, + KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt_rev_fp + +ATOMIC_CRITICAL_CPT_REV_MIX(float10, long double, sub_cpt_rev, -, fp, _Quad, + 10r, 1) // __kmpc_atomic_float10_sub_cpt_rev_fp +ATOMIC_CRITICAL_CPT_REV_MIX(float10, long double, div_cpt_rev, /, fp, _Quad, + 10r, 1) // __kmpc_atomic_float10_div_cpt_rev_fp + +#endif // KMP_HAVE_QUAD + +// OpenMP 4.0 Capture-write (swap): {v = x; x = expr;} + +#define ATOMIC_BEGIN_SWP(TYPE_ID, TYPE) \ + TYPE __kmpc_atomic_##TYPE_ID##_swp(ident_t *id_ref, int gtid, TYPE *lhs, \ + TYPE rhs) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_swp: T#%d\n", gtid)); + +#define CRITICAL_SWP(LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + old_value = (*lhs); \ + (*lhs) = rhs; \ + \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return old_value; + +// ------------------------------------------------------------------------ +#ifdef KMP_GOMP_COMPAT +#define GOMP_CRITICAL_SWP(FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + CRITICAL_SWP(0); \ + } +#else +#define GOMP_CRITICAL_SWP(FLAG) +#endif /* KMP_GOMP_COMPAT */ + +#define ATOMIC_XCHG_SWP(TYPE_ID, TYPE, BITS, GOMP_FLAG) \ + ATOMIC_BEGIN_SWP(TYPE_ID, TYPE) \ + TYPE old_value; \ + GOMP_CRITICAL_SWP(GOMP_FLAG) \ + old_value = KMP_XCHG_FIXED##BITS(lhs, rhs); \ + return old_value; \ + } +// ------------------------------------------------------------------------ +#define ATOMIC_XCHG_FLOAT_SWP(TYPE_ID, TYPE, BITS, GOMP_FLAG) \ + ATOMIC_BEGIN_SWP(TYPE_ID, TYPE) \ + TYPE old_value; \ + GOMP_CRITICAL_SWP(GOMP_FLAG) \ + old_value = KMP_XCHG_REAL##BITS(lhs, rhs); \ + return old_value; \ + } + +// ------------------------------------------------------------------------ +#define CMPXCHG_SWP(TYPE, BITS) \ + { \ + TYPE KMP_ATOMIC_VOLATILE temp_val; \ + TYPE old_value, new_value; \ + temp_val = *lhs; \ + old_value = temp_val; \ + 
new_value = rhs; \ + while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \ + (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \ + *VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \ + temp_val = *lhs; \ + old_value = temp_val; \ + new_value = rhs; \ + } \ + return old_value; \ + } + +// ------------------------------------------------------------------------- +#define ATOMIC_CMPXCHG_SWP(TYPE_ID, TYPE, BITS, GOMP_FLAG) \ + ATOMIC_BEGIN_SWP(TYPE_ID, TYPE) \ + TYPE old_value; \ + (void)old_value; \ + GOMP_CRITICAL_SWP(GOMP_FLAG) \ + CMPXCHG_SWP(TYPE, BITS) \ + } + +ATOMIC_XCHG_SWP(fixed1, kmp_int8, 8, KMP_ARCH_X86) // __kmpc_atomic_fixed1_swp +ATOMIC_XCHG_SWP(fixed2, kmp_int16, 16, KMP_ARCH_X86) // __kmpc_atomic_fixed2_swp +ATOMIC_XCHG_SWP(fixed4, kmp_int32, 32, KMP_ARCH_X86) // __kmpc_atomic_fixed4_swp + +ATOMIC_XCHG_FLOAT_SWP(float4, kmp_real32, 32, + KMP_ARCH_X86) // __kmpc_atomic_float4_swp + +#if (KMP_ARCH_X86) +ATOMIC_CMPXCHG_SWP(fixed8, kmp_int64, 64, + KMP_ARCH_X86) // __kmpc_atomic_fixed8_swp +ATOMIC_CMPXCHG_SWP(float8, kmp_real64, 64, + KMP_ARCH_X86) // __kmpc_atomic_float8_swp +#else +ATOMIC_XCHG_SWP(fixed8, kmp_int64, 64, KMP_ARCH_X86) // __kmpc_atomic_fixed8_swp +ATOMIC_XCHG_FLOAT_SWP(float8, kmp_real64, 64, + KMP_ARCH_X86) // __kmpc_atomic_float8_swp +#endif // (KMP_ARCH_X86) + +// ------------------------------------------------------------------------ +// Routines for Extended types: long double, _Quad, complex flavours (use +// critical section) +#define ATOMIC_CRITICAL_SWP(TYPE_ID, TYPE, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_SWP(TYPE_ID, TYPE) \ + TYPE old_value; \ + GOMP_CRITICAL_SWP(GOMP_FLAG) \ + CRITICAL_SWP(LCK_ID) \ + } + +// ------------------------------------------------------------------------ +// !!! TODO: check if we need to return void for cmplx4 routines +// Workaround for cmplx4. Regular routines with return value don't work +// on Win_32e. Let's return captured values through the additional parameter. 
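+// For illustration (a sketch, not the exact preprocessor output): the only
+// difference from the regular swap routines above is how the old value gets
+// back to the caller.  ATOMIC_CRITICAL_SWP_WRK(cmplx4, kmp_cmplx32, 8c, 1)
+// below comes out roughly as
+//
+//   void __kmpc_atomic_cmplx4_swp(ident_t *id_ref, int gtid,
+//                                 kmp_cmplx32 *lhs, kmp_cmplx32 rhs,
+//                                 kmp_cmplx32 *out) {
+//     __kmp_acquire_atomic_lock(&ATOMIC_LOCK8c, gtid);
+//     kmp_cmplx32 tmp = (*lhs);   // capture the old value
+//     (*lhs) = (rhs);             // store the new one
+//     (*out) = tmp;               // hand the old value back via *out
+//     __kmp_release_atomic_lock(&ATOMIC_LOCK8c, gtid);
+//   }
+//
+// while the other extended types (cmplx8, cmplx10, ...) keep the usual
+// "TYPE __kmpc_atomic_<type>_swp(...)" form and return the old value directly.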
+ +#define ATOMIC_BEGIN_SWP_WRK(TYPE_ID, TYPE) \ + void __kmpc_atomic_##TYPE_ID##_swp(ident_t *id_ref, int gtid, TYPE *lhs, \ + TYPE rhs, TYPE *out) { \ + KMP_DEBUG_ASSERT(__kmp_init_serial); \ + KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_swp: T#%d\n", gtid)); + +#define CRITICAL_SWP_WRK(LCK_ID) \ + __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + \ + tmp = (*lhs); \ + (*lhs) = (rhs); \ + (*out) = tmp; \ + __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \ + return; +// ------------------------------------------------------------------------ + +#ifdef KMP_GOMP_COMPAT +#define GOMP_CRITICAL_SWP_WRK(FLAG) \ + if ((FLAG) && (__kmp_atomic_mode == 2)) { \ + KMP_CHECK_GTID; \ + CRITICAL_SWP_WRK(0); \ + } +#else +#define GOMP_CRITICAL_SWP_WRK(FLAG) +#endif /* KMP_GOMP_COMPAT */ +// ------------------------------------------------------------------------ + +#define ATOMIC_CRITICAL_SWP_WRK(TYPE_ID, TYPE, LCK_ID, GOMP_FLAG) \ + ATOMIC_BEGIN_SWP_WRK(TYPE_ID, TYPE) \ + TYPE tmp; \ + GOMP_CRITICAL_SWP_WRK(GOMP_FLAG) \ + CRITICAL_SWP_WRK(LCK_ID) \ + } +// The end of workaround for cmplx4 + +ATOMIC_CRITICAL_SWP(float10, long double, 10r, 1) // __kmpc_atomic_float10_swp +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_SWP(float16, QUAD_LEGACY, 16r, 1) // __kmpc_atomic_float16_swp +#endif // KMP_HAVE_QUAD +// cmplx4 routine to return void +ATOMIC_CRITICAL_SWP_WRK(cmplx4, kmp_cmplx32, 8c, 1) // __kmpc_atomic_cmplx4_swp + +// ATOMIC_CRITICAL_SWP( cmplx4, kmp_cmplx32, 8c, 1 ) // +// __kmpc_atomic_cmplx4_swp + +ATOMIC_CRITICAL_SWP(cmplx8, kmp_cmplx64, 16c, 1) // __kmpc_atomic_cmplx8_swp +ATOMIC_CRITICAL_SWP(cmplx10, kmp_cmplx80, 20c, 1) // __kmpc_atomic_cmplx10_swp +#if KMP_HAVE_QUAD +ATOMIC_CRITICAL_SWP(cmplx16, CPLX128_LEG, 32c, 1) // __kmpc_atomic_cmplx16_swp +#if (KMP_ARCH_X86) +ATOMIC_CRITICAL_SWP(float16_a16, Quad_a16_t, 16r, + 1) // __kmpc_atomic_float16_a16_swp +ATOMIC_CRITICAL_SWP(cmplx16_a16, kmp_cmplx128_a16_t, 32c, + 1) // __kmpc_atomic_cmplx16_a16_swp +#endif // (KMP_ARCH_X86) +#endif // KMP_HAVE_QUAD + +// End of OpenMP 4.0 Capture + +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 + +#undef OP_CRITICAL + +/* ------------------------------------------------------------------------ */ +/* Generic atomic routines */ + +void __kmpc_atomic_1(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if ( +#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT) + FALSE /* must use lock */ +#else + TRUE +#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT) + ) { + kmp_int8 old_value, new_value; + + old_value = *(kmp_int8 *)lhs; + (*f)(&new_value, &old_value, rhs); + + /* TODO: Should this be acquire or release? */ + while (!KMP_COMPARE_AND_STORE_ACQ8((kmp_int8 *)lhs, *(kmp_int8 *)&old_value, + *(kmp_int8 *)&new_value)) { + KMP_CPU_PAUSE(); + + old_value = *(kmp_int8 *)lhs; + (*f)(&new_value, &old_value, rhs); + } + + return; + } else { + // All 1-byte data is of integer data type. 
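+    // These generic entry points take the operation itself as a callback:
+    // f(result, a, b) is expected to store "*a OP *b" into *result for the
+    // 1-byte type involved.  A compiler targeting this runtime might, for
+    // example, lower an atomic char update into something roughly like the
+    // following (illustrative only; the helper and variable names here are
+    // made up, not part of this file):
+    //
+    //   static void add_char(void *r, void *a, void *b) {
+    //     *(char *)r = *(char *)a + *(char *)b;
+    //   }
+    //   ...
+    //   __kmpc_atomic_1(&loc, gtid, &x, &v, add_char);
+    //
+    // On this locked path the callback is applied in place on *lhs while
+    // holding __kmp_atomic_lock_1i, or the single global __kmp_atomic_lock
+    // when GOMP compatibility forces one lock (__kmp_atomic_mode == 2).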
+ +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_1i, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_1i, gtid); + } +} + +void __kmpc_atomic_2(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + if ( +#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT) + FALSE /* must use lock */ +#elif KMP_ARCH_X86 || KMP_ARCH_X86_64 + TRUE /* no alignment problems */ +#else + !((kmp_uintptr_t)lhs & 0x1) /* make sure address is 2-byte aligned */ +#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT) + ) { + kmp_int16 old_value, new_value; + + old_value = *(kmp_int16 *)lhs; + (*f)(&new_value, &old_value, rhs); + + /* TODO: Should this be acquire or release? */ + while (!KMP_COMPARE_AND_STORE_ACQ16( + (kmp_int16 *)lhs, *(kmp_int16 *)&old_value, *(kmp_int16 *)&new_value)) { + KMP_CPU_PAUSE(); + + old_value = *(kmp_int16 *)lhs; + (*f)(&new_value, &old_value, rhs); + } + + return; + } else { + // All 2-byte data is of integer data type. + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_2i, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_2i, gtid); + } +} + +void __kmpc_atomic_4(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if ( +// FIXME: On IA-32 architecture, gcc uses cmpxchg only for 4-byte ints. +// Gomp compatibility is broken if this routine is called for floats. +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + TRUE /* no alignment problems */ +#else + !((kmp_uintptr_t)lhs & 0x3) /* make sure address is 4-byte aligned */ +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 + ) { + kmp_int32 old_value, new_value; + + old_value = *(kmp_int32 *)lhs; + (*f)(&new_value, &old_value, rhs); + + /* TODO: Should this be acquire or release? */ + while (!KMP_COMPARE_AND_STORE_ACQ32( + (kmp_int32 *)lhs, *(kmp_int32 *)&old_value, *(kmp_int32 *)&new_value)) { + KMP_CPU_PAUSE(); + + old_value = *(kmp_int32 *)lhs; + (*f)(&new_value, &old_value, rhs); + } + + return; + } else { + // Use __kmp_atomic_lock_4i for all 4-byte data, + // even if it isn't of integer data type. 
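// The generic routines treat f as "f(out, old_lhs, rhs_ptr): *out = combined
// value", both here under the lock and in the CAS branch above. A hedged
// sketch with a hypothetical combiner (only __kmpc_atomic_4 itself is real):
static void my_mul4(void *out, void *old_lhs, void *rhs) {
  *(kmp_int32 *)out = *(kmp_int32 *)old_lhs * *(kmp_int32 *)rhs;
}
// A lowering of "#pragma omp atomic" for x *= y could then call:
//   __kmpc_atomic_4(loc, gtid, &x, &y, my_mul4);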
+ +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_4i, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_4i, gtid); + } +} + +void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + if ( + +#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT) + FALSE /* must use lock */ +#elif KMP_ARCH_X86 || KMP_ARCH_X86_64 + TRUE /* no alignment problems */ +#else + !((kmp_uintptr_t)lhs & 0x7) /* make sure address is 8-byte aligned */ +#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT) + ) { + kmp_int64 old_value, new_value; + + old_value = *(kmp_int64 *)lhs; + (*f)(&new_value, &old_value, rhs); + /* TODO: Should this be acquire or release? */ + while (!KMP_COMPARE_AND_STORE_ACQ64( + (kmp_int64 *)lhs, *(kmp_int64 *)&old_value, *(kmp_int64 *)&new_value)) { + KMP_CPU_PAUSE(); + + old_value = *(kmp_int64 *)lhs; + (*f)(&new_value, &old_value, rhs); + } + + return; + } else { + // Use __kmp_atomic_lock_8i for all 8-byte data, + // even if it isn't of integer data type. + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_8i, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_8i, gtid); + } +} +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_10r, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_10r, gtid); +} +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 + +void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_16c, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_16c, gtid); +} +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_20c, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef 
KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_20c, gtid); +} +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 +void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_acquire_atomic_lock(&__kmp_atomic_lock_32c, gtid); + + (*f)(lhs, lhs, rhs); + +#ifdef KMP_GOMP_COMPAT + if (__kmp_atomic_mode == 2) { + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); + } else +#endif /* KMP_GOMP_COMPAT */ + __kmp_release_atomic_lock(&__kmp_atomic_lock_32c, gtid); +} + +// AC: same two routines as GOMP_atomic_start/end, but will be called by our +// compiler; duplicated in order to not use 3-party names in pure Intel code +// TODO: consider adding GTID parameter after consultation with Ernesto/Xinmin. +void __kmpc_atomic_start(void) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("__kmpc_atomic_start: T#%d\n", gtid)); + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); +} + +void __kmpc_atomic_end(void) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("__kmpc_atomic_end: T#%d\n", gtid)); + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); +} + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// OpenMP 5.1 compare and swap + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@return Result of comparison + +Implements Compare And Swap atomic operation. + +Sample code: +#pragma omp atomic compare update capture + { r = x == e; if(r) { x = d; } } +*/ +bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d) { + return KMP_COMPARE_AND_STORE_ACQ8(x, e, d); +} +bool __kmpc_atomic_bool_2_cas(ident_t *loc, int gtid, short *x, short e, + short d) { + return KMP_COMPARE_AND_STORE_ACQ16(x, e, d); +} +bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e, + kmp_int32 d) { + return KMP_COMPARE_AND_STORE_ACQ32(x, e, d); +} +bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e, + kmp_int64 d) { + return KMP_COMPARE_AND_STORE_ACQ64(x, e, d); +} + +/*! +@param loc Source code location +@param gtid Global thread id +@param x Memory location to operate on +@param e Expected value +@param d Desired value +@return Old value of x + +Implements Compare And Swap atomic operation. + +Sample code: +#pragma omp atomic compare update capture + { v = x; if (x == e) { x = d; } } +*/ +char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d) { + return KMP_COMPARE_AND_STORE_RET8(x, e, d); +} +short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e, + short d) { + return KMP_COMPARE_AND_STORE_RET16(x, e, d); +} +kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d) { + return KMP_COMPARE_AND_STORE_RET32(x, e, d); +} +kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d) { + return KMP_COMPARE_AND_STORE_RET64(x, e, d); +} + +/*! 
+@param loc Source code location
+@param gtid Global thread id
+@param x Memory location to operate on
+@param e Expected value
+@param d Desired value
+@param pv Captured value location
+@return Result of comparison
+
+Implements Compare And Swap + Capture atomic operation.
+
+v gets old value of x if comparison failed, untouched otherwise.
+Sample code:
+#pragma omp atomic compare update capture
+  { r = x == e; if(r) { x = d; } else { v = x; } }
+*/
+bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e,
+                                  char d, char *pv) {
+  char old = KMP_COMPARE_AND_STORE_RET8(x, e, d);
+  if (old == e)
+    return true;
+  KMP_ASSERT(pv != NULL);
+  *pv = old;
+  return false;
+}
+bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e,
+                                  short d, short *pv) {
+  short old = KMP_COMPARE_AND_STORE_RET16(x, e, d);
+  if (old == e)
+    return true;
+  KMP_ASSERT(pv != NULL);
+  *pv = old;
+  return false;
+}
+bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x,
+                                  kmp_int32 e, kmp_int32 d, kmp_int32 *pv) {
+  kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d);
+  if (old == e)
+    return true;
+  KMP_ASSERT(pv != NULL);
+  *pv = old;
+  return false;
+}
+bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x,
+                                  kmp_int64 e, kmp_int64 d, kmp_int64 *pv) {
+  kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d);
+  if (old == e)
+    return true;
+  KMP_ASSERT(pv != NULL);
+  *pv = old;
+  return false;
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+@param x Memory location to operate on
+@param e Expected value
+@param d Desired value
+@param pv Captured value location
+@return Old value of x
+
+Implements Compare And Swap + Capture atomic operation.
+
+v gets new value of x.
+Sample code:
+#pragma omp atomic compare update capture
+  { if (x == e) { x = d; }; v = x; }
+*/
+char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e,
+                                 char d, char *pv) {
+  char old = KMP_COMPARE_AND_STORE_RET8(x, e, d);
+  KMP_ASSERT(pv != NULL);
+  *pv = old == e ? d : old;
+  return old;
+}
+short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e,
+                                  short d, short *pv) {
+  short old = KMP_COMPARE_AND_STORE_RET16(x, e, d);
+  KMP_ASSERT(pv != NULL);
+  *pv = old == e ? d : old;
+  return old;
+}
+kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x,
+                                      kmp_int32 e, kmp_int32 d, kmp_int32 *pv) {
+  kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(x, e, d);
+  KMP_ASSERT(pv != NULL);
+  *pv = old == e ? d : old;
+  return old;
+}
+kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x,
+                                      kmp_int64 e, kmp_int64 d, kmp_int64 *pv) {
+  kmp_int64 old = KMP_COMPARE_AND_STORE_RET64(x, e, d);
+  KMP_ASSERT(pv != NULL);
+  *pv = old == e ? d : old;
+  return old;
+}
+
+// End OpenMP 5.1 compare + capture
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+/*!
+@}
+*/
+
+// end of file
diff --git a/third_party/openmp/kmp_atomic.h b/third_party/openmp/kmp_atomic.h
new file mode 100644
index 000000000..4fc51ee42
--- /dev/null
+++ b/third_party/openmp/kmp_atomic.h
@@ -0,0 +1,1855 @@
+/*
+ * kmp_atomic.h - ATOMIC header file
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_ATOMIC_H
+#define KMP_ATOMIC_H
+
+#include "kmp_lock.h"
+#include "kmp_os.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+// C++ build port.
+// Intel compiler does not support _Complex datatype on win.
+// Intel compiler supports _Complex datatype on lin and mac.
+// On the other hand, there is a problem of stack alignment on lin_32 and mac_32
+// if the rhs is cmplx80 or cmplx128 typedef'ed datatype.
+// The decision is: to use compiler supported _Complex type on lin and mac,
+// to use typedef'ed types on win.
+// Condition for WIN64 was modified in anticipation of 10.1 build compiler.
+
+#if defined(__cplusplus) && (KMP_OS_WINDOWS)
+// create shortcuts for c99 complex types
+
+// Visual Studio cannot have function parameters that have the
+// align __declspec attribute, so we must remove it. (Compiler Error C2719)
+#if KMP_COMPILER_MSVC
+#undef KMP_DO_ALIGN
+#define KMP_DO_ALIGN(alignment) /* Nothing */
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600) && defined(_DEBUG)
+// Workaround for the problem of _DebugHeapTag unresolved external.
+// This problem prevented use of our static debug library for C tests
+// compiled with /MDd option (the library itself built with /MTd),
+#undef _DEBUG
+#define _DEBUG_TEMPORARILY_UNSET_
+#endif
+
+#include <complex>
+
+template <typename type_lhs, typename type_rhs>
+std::complex<type_lhs> __kmp_lhs_div_rhs(const std::complex<type_lhs> &lhs,
+                                         const std::complex<type_rhs> &rhs) {
+  type_lhs a = lhs.real();
+  type_lhs b = lhs.imag();
+  type_rhs c = rhs.real();
+  type_rhs d = rhs.imag();
+  type_rhs den = c * c + d * d;
+  type_rhs r = (a * c + b * d);
+  type_rhs i = (b * c - a * d);
+  std::complex<type_lhs> ret(r / den, i / den);
+  return ret;
+}
+
+// complex8
+struct __kmp_cmplx64_t : std::complex<double> {
+
+  __kmp_cmplx64_t() : std::complex<double>() {}
+
+  __kmp_cmplx64_t(const std::complex<double> &cd) : std::complex<double>(cd) {}
+
+  void operator/=(const __kmp_cmplx64_t &rhs) {
+    std::complex<double> lhs = *this;
+    *this = __kmp_lhs_div_rhs(lhs, rhs);
+  }
+
+  __kmp_cmplx64_t operator/(const __kmp_cmplx64_t &rhs) {
+    std::complex<double> lhs = *this;
+    return __kmp_lhs_div_rhs(lhs, rhs);
+  }
+};
+typedef struct __kmp_cmplx64_t kmp_cmplx64;
+
+// complex4
+struct __kmp_cmplx32_t : std::complex<float> {
+
+  __kmp_cmplx32_t() : std::complex<float>() {}
+
+  __kmp_cmplx32_t(const std::complex<float> &cf) : std::complex<float>(cf) {}
+
+  __kmp_cmplx32_t operator+(const __kmp_cmplx32_t &b) {
+    std::complex<float> lhs = *this;
+    std::complex<float> rhs = b;
+    return (lhs + rhs);
+  }
+  __kmp_cmplx32_t operator-(const __kmp_cmplx32_t &b) {
+    std::complex<float> lhs = *this;
+    std::complex<float> rhs = b;
+    return (lhs - rhs);
+  }
+  __kmp_cmplx32_t operator*(const __kmp_cmplx32_t &b) {
+    std::complex<float> lhs = *this;
+    std::complex<float> rhs = b;
+    return (lhs * rhs);
+  }
+
+  __kmp_cmplx32_t operator+(const kmp_cmplx64 &b) {
+    kmp_cmplx64 t = kmp_cmplx64(*this) + b;
+    std::complex<double> d(t);
+    std::complex<float> f(d);
+    __kmp_cmplx32_t r(f);
+    return r;
+  }
+  __kmp_cmplx32_t operator-(const kmp_cmplx64 &b) {
+    kmp_cmplx64 t = kmp_cmplx64(*this) - b;
+    std::complex<double> d(t);
+    std::complex<float> f(d);
+    __kmp_cmplx32_t r(f);
+    return r;
+  }
+  __kmp_cmplx32_t operator*(const kmp_cmplx64 &b) {
+    kmp_cmplx64 t = kmp_cmplx64(*this) * b;
+    std::complex<double> d(t);
+    std::complex<float> f(d);
+    __kmp_cmplx32_t r(f);
+    return r;
+  }
+
+  void operator/=(const __kmp_cmplx32_t &rhs) {
+    std::complex<float> lhs = *this;
+    *this = __kmp_lhs_div_rhs(lhs, rhs);
+  }
+
+  __kmp_cmplx32_t operator/(const __kmp_cmplx32_t &rhs) {
+    std::complex<float> lhs = *this;
+    return __kmp_lhs_div_rhs(lhs, rhs);
+  }
+
+  void operator/=(const kmp_cmplx64 &rhs) {
+    std::complex<float> lhs = *this;
+    *this = __kmp_lhs_div_rhs(lhs, rhs);
+  }
+
+  __kmp_cmplx32_t operator/(const kmp_cmplx64 &rhs) {
+    std::complex<float> lhs = *this;
+    return __kmp_lhs_div_rhs(lhs, rhs);
+  }
+};
+typedef struct __kmp_cmplx32_t kmp_cmplx32;
+
+// complex10
+struct KMP_DO_ALIGN(16) __kmp_cmplx80_t : std::complex<long double> {
+
+  __kmp_cmplx80_t() : std::complex<long double>() {}
+
+  __kmp_cmplx80_t(const std::complex<long double> &cld)
+      : std::complex<long double>(cld) {}
+
+  void operator/=(const __kmp_cmplx80_t &rhs) {
+    std::complex<long double> lhs = *this;
+    *this = __kmp_lhs_div_rhs(lhs, rhs);
+  }
+
+  __kmp_cmplx80_t operator/(const __kmp_cmplx80_t &rhs) {
+    std::complex<long double> lhs = *this;
+    return __kmp_lhs_div_rhs(lhs, rhs);
+  }
+};
+typedef KMP_DO_ALIGN(16) struct __kmp_cmplx80_t kmp_cmplx80;
+
+// complex16
+#if KMP_HAVE_QUAD
+struct __kmp_cmplx128_t : std::complex<_Quad> {
+
+  __kmp_cmplx128_t() : std::complex<_Quad>() {}
+
+  __kmp_cmplx128_t(const std::complex<_Quad> &cq) : std::complex<_Quad>(cq) {}
+
+  void operator/=(const __kmp_cmplx128_t &rhs) {
+    std::complex<_Quad> lhs = *this;
+    *this = __kmp_lhs_div_rhs(lhs, rhs);
+  }
+
+  __kmp_cmplx128_t operator/(const __kmp_cmplx128_t &rhs) {
+    std::complex<_Quad> lhs = *this;
+    return __kmp_lhs_div_rhs(lhs, rhs);
+  }
+};
+typedef struct __kmp_cmplx128_t kmp_cmplx128;
+#endif /* KMP_HAVE_QUAD */
+
+#ifdef _DEBUG_TEMPORARILY_UNSET_
+#undef _DEBUG_TEMPORARILY_UNSET_
+// Set it back now
+#define _DEBUG 1
+#endif
+
+#else
+// create shortcuts for c99 complex types
+typedef float _Complex kmp_cmplx32;
+typedef double _Complex kmp_cmplx64;
+typedef long double _Complex kmp_cmplx80;
+#if KMP_HAVE_QUAD
+typedef _Quad _Complex kmp_cmplx128;
+#endif
+#endif
+
+// Compiler 12.0 changed alignment of 16 and 32-byte arguments (like _Quad
+// and kmp_cmplx128) on IA-32 architecture. The following aligned structures
+// are implemented to support the old alignment in 10.1, 11.0, 11.1 and
+// introduce the new alignment in 12.0. See CQ88405.
+#if KMP_ARCH_X86 && KMP_HAVE_QUAD
+
+// 4-byte aligned structures for backward compatibility.
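// __kmp_lhs_div_rhs above is the textbook complex division
//   (a + b*i) / (c + d*i) = ((a*c + b*d) + (b*c - a*d)*i) / (c*c + d*d).
// A standalone numeric check against std::complex, for illustration only
// (not part of the runtime):
#include <cassert>
#include <cmath>
#include <complex>

int main() {
  std::complex<float> lhs(3.0f, 4.0f);
  std::complex<double> rhs(1.0, -2.0);
  double a = lhs.real(), b = lhs.imag();
  double c = rhs.real(), d = rhs.imag();
  double den = c * c + d * d;
  std::complex<float> got((a * c + b * d) / den, (b * c - a * d) / den);
  std::complex<float> want(std::complex<double>(lhs) / rhs); // expect -1 + 2i
  assert(std::abs(got.real() - want.real()) < 1e-5f);
  assert(std::abs(got.imag() - want.imag()) < 1e-5f);
  return 0;
}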
+ +#pragma pack(push, 4) + +struct KMP_DO_ALIGN(4) Quad_a4_t { + _Quad q; + + Quad_a4_t() : q() {} + Quad_a4_t(const _Quad &cq) : q(cq) {} + + Quad_a4_t operator+(const Quad_a4_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a4_t)(lhs + rhs); + } + + Quad_a4_t operator-(const Quad_a4_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a4_t)(lhs - rhs); + } + Quad_a4_t operator*(const Quad_a4_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a4_t)(lhs * rhs); + } + + Quad_a4_t operator/(const Quad_a4_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a4_t)(lhs / rhs); + } +}; + +struct KMP_DO_ALIGN(4) kmp_cmplx128_a4_t { + kmp_cmplx128 q; + + kmp_cmplx128_a4_t() : q() {} + +#if defined(__cplusplus) && (KMP_OS_WINDOWS) + kmp_cmplx128_a4_t(const std::complex<_Quad> &c128) : q(c128) {} +#endif + kmp_cmplx128_a4_t(const kmp_cmplx128 &c128) : q(c128) {} + + kmp_cmplx128_a4_t operator+(const kmp_cmplx128_a4_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a4_t)(lhs + rhs); + } + kmp_cmplx128_a4_t operator-(const kmp_cmplx128_a4_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a4_t)(lhs - rhs); + } + kmp_cmplx128_a4_t operator*(const kmp_cmplx128_a4_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a4_t)(lhs * rhs); + } + + kmp_cmplx128_a4_t operator/(const kmp_cmplx128_a4_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a4_t)(lhs / rhs); + } +}; + +#pragma pack(pop) + +// New 16-byte aligned structures for 12.0 compiler. +struct KMP_DO_ALIGN(16) Quad_a16_t { + _Quad q; + + Quad_a16_t() : q() {} + Quad_a16_t(const _Quad &cq) : q(cq) {} + + Quad_a16_t operator+(const Quad_a16_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a16_t)(lhs + rhs); + } + + Quad_a16_t operator-(const Quad_a16_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a16_t)(lhs - rhs); + } + Quad_a16_t operator*(const Quad_a16_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a16_t)(lhs * rhs); + } + + Quad_a16_t operator/(const Quad_a16_t &b) { + _Quad lhs = (*this).q; + _Quad rhs = b.q; + return (Quad_a16_t)(lhs / rhs); + } +}; + +struct KMP_DO_ALIGN(16) kmp_cmplx128_a16_t { + kmp_cmplx128 q; + + kmp_cmplx128_a16_t() : q() {} + +#if defined(__cplusplus) && (KMP_OS_WINDOWS) + kmp_cmplx128_a16_t(const std::complex<_Quad> &c128) : q(c128) {} +#endif + kmp_cmplx128_a16_t(const kmp_cmplx128 &c128) : q(c128) {} + + kmp_cmplx128_a16_t operator+(const kmp_cmplx128_a16_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a16_t)(lhs + rhs); + } + kmp_cmplx128_a16_t operator-(const kmp_cmplx128_a16_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a16_t)(lhs - rhs); + } + kmp_cmplx128_a16_t operator*(const kmp_cmplx128_a16_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a16_t)(lhs * rhs); + } + + kmp_cmplx128_a16_t operator/(const kmp_cmplx128_a16_t &b) { + kmp_cmplx128 lhs = (*this).q; + kmp_cmplx128 rhs = b.q; + return (kmp_cmplx128_a16_t)(lhs / rhs); + } +}; + +#endif + +#if (KMP_ARCH_X86) +#define QUAD_LEGACY Quad_a4_t +#define CPLX128_LEG kmp_cmplx128_a4_t +#else +#define QUAD_LEGACY _Quad +#define CPLX128_LEG kmp_cmplx128 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +extern int __kmp_atomic_mode; + +// Atomic locks can easily become 
contended, so we use queuing locks for them. +typedef kmp_queuing_lock_t kmp_atomic_lock_t; + +static inline void __kmp_acquire_atomic_lock(kmp_atomic_lock_t *lck, + kmp_int32 gtid) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_atomic, 0, kmp_mutex_impl_queuing, + (ompt_wait_id_t)(uintptr_t)lck, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + + __kmp_acquire_queuing_lock(lck, gtid); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_mutex_acquired) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif +} + +static inline int __kmp_test_atomic_lock(kmp_atomic_lock_t *lck, + kmp_int32 gtid) { + return __kmp_test_queuing_lock(lck, gtid); +} + +static inline void __kmp_release_atomic_lock(kmp_atomic_lock_t *lck, + kmp_int32 gtid) { + __kmp_release_queuing_lock(lck, gtid); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_mutex_released) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif +} + +static inline void __kmp_init_atomic_lock(kmp_atomic_lock_t *lck) { + __kmp_init_queuing_lock(lck); +} + +static inline void __kmp_destroy_atomic_lock(kmp_atomic_lock_t *lck) { + __kmp_destroy_queuing_lock(lck); +} + +// Global Locks +extern kmp_atomic_lock_t __kmp_atomic_lock; /* Control access to all user coded + atomics in Gnu compat mode */ +extern kmp_atomic_lock_t __kmp_atomic_lock_1i; /* Control access to all user + coded atomics for 1-byte fixed + data types */ +extern kmp_atomic_lock_t __kmp_atomic_lock_2i; /* Control access to all user + coded atomics for 2-byte fixed + data types */ +extern kmp_atomic_lock_t __kmp_atomic_lock_4i; /* Control access to all user + coded atomics for 4-byte fixed + data types */ +extern kmp_atomic_lock_t __kmp_atomic_lock_4r; /* Control access to all user + coded atomics for kmp_real32 + data type */ +extern kmp_atomic_lock_t __kmp_atomic_lock_8i; /* Control access to all user + coded atomics for 8-byte fixed + data types */ +extern kmp_atomic_lock_t __kmp_atomic_lock_8r; /* Control access to all user + coded atomics for kmp_real64 + data type */ +extern kmp_atomic_lock_t + __kmp_atomic_lock_8c; /* Control access to all user coded atomics for + complex byte data type */ +extern kmp_atomic_lock_t + __kmp_atomic_lock_10r; /* Control access to all user coded atomics for long + double data type */ +extern kmp_atomic_lock_t __kmp_atomic_lock_16r; /* Control access to all user + coded atomics for _Quad data + type */ +extern kmp_atomic_lock_t __kmp_atomic_lock_16c; /* Control access to all user + coded atomics for double + complex data type*/ +extern kmp_atomic_lock_t + __kmp_atomic_lock_20c; /* Control access to all user coded atomics for long + double complex type*/ +extern kmp_atomic_lock_t __kmp_atomic_lock_32c; /* Control access to all user + coded atomics for _Quad + complex data type */ + +// Below routines for atomic UPDATE are listed + +// 1-byte +void __kmpc_atomic_fixed1_add(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_andb(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_div(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1u_div(ident_t *id_ref, int gtid, unsigned char *lhs, + unsigned char rhs); +void 
__kmpc_atomic_fixed1_mul(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_orb(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_shl(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_shr(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1u_shr(ident_t *id_ref, int gtid, unsigned char *lhs, + unsigned char rhs); +void __kmpc_atomic_fixed1_sub(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_xor(ident_t *id_ref, int gtid, char *lhs, char rhs); +// 2-byte +void __kmpc_atomic_fixed2_add(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2_andb(ident_t *id_ref, int gtid, short *lhs, + short rhs); +void __kmpc_atomic_fixed2_div(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2u_div(ident_t *id_ref, int gtid, unsigned short *lhs, + unsigned short rhs); +void __kmpc_atomic_fixed2_mul(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2_orb(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2_shl(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2_shr(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2u_shr(ident_t *id_ref, int gtid, unsigned short *lhs, + unsigned short rhs); +void __kmpc_atomic_fixed2_sub(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2_xor(ident_t *id_ref, int gtid, short *lhs, short rhs); +// 4-byte add / sub fixed +void __kmpc_atomic_fixed4_add(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_sub(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +// 4-byte add / sub float +void __kmpc_atomic_float4_add(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real32 rhs); +void __kmpc_atomic_float4_sub(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real32 rhs); +// 8-byte add / sub fixed +void __kmpc_atomic_fixed8_add(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_sub(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +// 8-byte add / sub float +void __kmpc_atomic_float8_add(ident_t *id_ref, int gtid, kmp_real64 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float8_sub(ident_t *id_ref, int gtid, kmp_real64 *lhs, + kmp_real64 rhs); +// 4-byte fixed +void __kmpc_atomic_fixed4_andb(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_div(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4u_div(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + kmp_uint32 rhs); +void __kmpc_atomic_fixed4_mul(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_orb(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_shl(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_shr(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4u_shr(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + kmp_uint32 rhs); +void __kmpc_atomic_fixed4_xor(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +// 8-byte fixed +void __kmpc_atomic_fixed8_andb(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_div(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8u_div(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + kmp_uint64 rhs); +void 
__kmpc_atomic_fixed8_mul(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_orb(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_shl(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_shr(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8u_shr(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + kmp_uint64 rhs); +void __kmpc_atomic_fixed8_xor(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +// 4-byte float +void __kmpc_atomic_float4_div(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real32 rhs); +void __kmpc_atomic_float4_mul(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real32 rhs); +// 8-byte float +void __kmpc_atomic_float8_div(ident_t *id_ref, int gtid, kmp_real64 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float8_mul(ident_t *id_ref, int gtid, kmp_real64 *lhs, + kmp_real64 rhs); +// 1-, 2-, 4-, 8-byte logical (&&, ||) +void __kmpc_atomic_fixed1_andl(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_orl(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed2_andl(ident_t *id_ref, int gtid, short *lhs, + short rhs); +void __kmpc_atomic_fixed2_orl(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed4_andl(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_orl(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed8_andl(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_orl(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +// MIN / MAX +void __kmpc_atomic_fixed1_max(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed1_min(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed2_max(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed2_min(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed4_max(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_min(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed8_max(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_min(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_float4_max(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real32 rhs); +void __kmpc_atomic_float4_min(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real32 rhs); +void __kmpc_atomic_float8_max(ident_t *id_ref, int gtid, kmp_real64 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float8_min(ident_t *id_ref, int gtid, kmp_real64 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float10_max(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +void __kmpc_atomic_float10_min(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +#if KMP_HAVE_QUAD +void __kmpc_atomic_float16_max(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +void __kmpc_atomic_float16_min(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +#if (KMP_ARCH_X86) +// Routines with 16-byte arguments aligned to 16-byte boundary; IA-32 +// architecture only +void __kmpc_atomic_float16_max_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs, + Quad_a16_t rhs); +void __kmpc_atomic_float16_min_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs, + Quad_a16_t rhs); +#endif +#endif +// .NEQV. 
(same as xor) +void __kmpc_atomic_fixed1_neqv(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed2_neqv(ident_t *id_ref, int gtid, short *lhs, + short rhs); +void __kmpc_atomic_fixed4_neqv(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed8_neqv(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +// .EQV. (same as ~xor) +void __kmpc_atomic_fixed1_eqv(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed2_eqv(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed4_eqv(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed8_eqv(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +// long double type +void __kmpc_atomic_float10_add(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +void __kmpc_atomic_float10_sub(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +void __kmpc_atomic_float10_mul(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +void __kmpc_atomic_float10_div(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +// _Quad type +#if KMP_HAVE_QUAD +void __kmpc_atomic_float16_add(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +void __kmpc_atomic_float16_sub(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +void __kmpc_atomic_float16_mul(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +void __kmpc_atomic_float16_div(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +#if (KMP_ARCH_X86) +// Routines with 16-byte arguments aligned to 16-byte boundary +void __kmpc_atomic_float16_add_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs, + Quad_a16_t rhs); +void __kmpc_atomic_float16_sub_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs, + Quad_a16_t rhs); +void __kmpc_atomic_float16_mul_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs, + Quad_a16_t rhs); +void __kmpc_atomic_float16_div_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs, + Quad_a16_t rhs); +#endif +#endif +// routines for complex types +void __kmpc_atomic_cmplx4_add(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs); +void __kmpc_atomic_cmplx4_sub(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs); +void __kmpc_atomic_cmplx4_mul(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs); +void __kmpc_atomic_cmplx4_div(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs); +void __kmpc_atomic_cmplx8_add(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs, + kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx8_sub(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs, + kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx8_mul(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs, + kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx8_div(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs, + kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx10_add(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs, + kmp_cmplx80 rhs); +void __kmpc_atomic_cmplx10_sub(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs, + kmp_cmplx80 rhs); +void __kmpc_atomic_cmplx10_mul(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs, + kmp_cmplx80 rhs); +void __kmpc_atomic_cmplx10_div(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs, + kmp_cmplx80 rhs); +#if KMP_HAVE_QUAD +void __kmpc_atomic_cmplx16_add(ident_t *id_ref, int gtid, CPLX128_LEG *lhs, + CPLX128_LEG rhs); +void __kmpc_atomic_cmplx16_sub(ident_t *id_ref, int gtid, CPLX128_LEG *lhs, + CPLX128_LEG rhs); +void __kmpc_atomic_cmplx16_mul(ident_t *id_ref, int gtid, CPLX128_LEG *lhs, + CPLX128_LEG 
rhs); +void __kmpc_atomic_cmplx16_div(ident_t *id_ref, int gtid, CPLX128_LEG *lhs, + CPLX128_LEG rhs); +#if (KMP_ARCH_X86) +// Routines with 16-byte arguments aligned to 16-byte boundary +void __kmpc_atomic_cmplx16_add_a16(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +void __kmpc_atomic_cmplx16_sub_a16(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +void __kmpc_atomic_cmplx16_mul_a16(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +void __kmpc_atomic_cmplx16_div_a16(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +#endif +#endif + +// OpenMP 4.0: x = expr binop x for non-commutative operations. +// Supported only on IA-32 architecture and Intel(R) 64 +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +void __kmpc_atomic_fixed1_sub_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs); +void __kmpc_atomic_fixed1_div_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs); +void __kmpc_atomic_fixed1u_div_rev(ident_t *id_ref, int gtid, + unsigned char *lhs, unsigned char rhs); +void __kmpc_atomic_fixed1_shl_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs); +void __kmpc_atomic_fixed1_shr_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs); +void __kmpc_atomic_fixed1u_shr_rev(ident_t *id_ref, int gtid, + unsigned char *lhs, unsigned char rhs); +void __kmpc_atomic_fixed2_sub_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs); +void __kmpc_atomic_fixed2_div_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs); +void __kmpc_atomic_fixed2u_div_rev(ident_t *id_ref, int gtid, + unsigned short *lhs, unsigned short rhs); +void __kmpc_atomic_fixed2_shl_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs); +void __kmpc_atomic_fixed2_shr_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs); +void __kmpc_atomic_fixed2u_shr_rev(ident_t *id_ref, int gtid, + unsigned short *lhs, unsigned short rhs); +void __kmpc_atomic_fixed4_sub_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_div_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4u_div_rev(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + kmp_uint32 rhs); +void __kmpc_atomic_fixed4_shl_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4_shr_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed4u_shr_rev(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + kmp_uint32 rhs); +void __kmpc_atomic_fixed8_sub_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_div_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8u_div_rev(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + kmp_uint64 rhs); +void __kmpc_atomic_fixed8_shl_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8_shr_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_fixed8u_shr_rev(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + kmp_uint64 rhs); +void __kmpc_atomic_float4_sub_rev(ident_t *id_ref, int gtid, float *lhs, + float rhs); +void __kmpc_atomic_float4_div_rev(ident_t *id_ref, int gtid, float *lhs, + float rhs); +void __kmpc_atomic_float8_sub_rev(ident_t *id_ref, int gtid, double *lhs, + double rhs); +void __kmpc_atomic_float8_div_rev(ident_t *id_ref, int gtid, double *lhs, + double rhs); +void __kmpc_atomic_float10_sub_rev(ident_t *id_ref, int gtid, long double *lhs, 
+ long double rhs); +void __kmpc_atomic_float10_div_rev(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +#if KMP_HAVE_QUAD +void __kmpc_atomic_float16_sub_rev(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +void __kmpc_atomic_float16_div_rev(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +#endif +void __kmpc_atomic_cmplx4_sub_rev(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs); +void __kmpc_atomic_cmplx4_div_rev(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs); +void __kmpc_atomic_cmplx8_sub_rev(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs, + kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx8_div_rev(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs, + kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx10_sub_rev(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs, + kmp_cmplx80 rhs); +void __kmpc_atomic_cmplx10_div_rev(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs, + kmp_cmplx80 rhs); +#if KMP_HAVE_QUAD +void __kmpc_atomic_cmplx16_sub_rev(ident_t *id_ref, int gtid, CPLX128_LEG *lhs, + CPLX128_LEG rhs); +void __kmpc_atomic_cmplx16_div_rev(ident_t *id_ref, int gtid, CPLX128_LEG *lhs, + CPLX128_LEG rhs); +#if (KMP_ARCH_X86) +// Routines with 16-byte arguments aligned to 16-byte boundary +void __kmpc_atomic_float16_sub_a16_rev(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs); +void __kmpc_atomic_float16_div_a16_rev(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs); +void __kmpc_atomic_cmplx16_sub_a16_rev(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +void __kmpc_atomic_cmplx16_div_a16_rev(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +#endif +#endif // KMP_HAVE_QUAD + +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 + +// routines for mixed types + +// RHS=float8 +void __kmpc_atomic_fixed1_mul_float8(ident_t *id_ref, int gtid, char *lhs, + kmp_real64 rhs); +void __kmpc_atomic_fixed1_div_float8(ident_t *id_ref, int gtid, char *lhs, + kmp_real64 rhs); +void __kmpc_atomic_fixed2_mul_float8(ident_t *id_ref, int gtid, short *lhs, + kmp_real64 rhs); +void __kmpc_atomic_fixed2_div_float8(ident_t *id_ref, int gtid, short *lhs, + kmp_real64 rhs); +void __kmpc_atomic_fixed4_mul_float8(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_fixed4_div_float8(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_fixed8_mul_float8(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_fixed8_div_float8(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float4_add_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float4_sub_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float4_mul_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float4_div_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real64 rhs); + +// RHS=float16 (deprecated, to be removed when we are sure the compiler does not +// use them) +#if KMP_HAVE_QUAD +void __kmpc_atomic_fixed1_add_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1u_add_fp(ident_t *id_ref, int gtid, unsigned char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1_sub_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1u_sub_fp(ident_t *id_ref, int gtid, unsigned char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1_mul_fp(ident_t *id_ref, int 
gtid, char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1u_mul_fp(ident_t *id_ref, int gtid, unsigned char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1_div_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1u_div_fp(ident_t *id_ref, int gtid, unsigned char *lhs, + _Quad rhs); + +void __kmpc_atomic_fixed2_add_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs); +void __kmpc_atomic_fixed2u_add_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs); +void __kmpc_atomic_fixed2_sub_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs); +void __kmpc_atomic_fixed2u_sub_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs); +void __kmpc_atomic_fixed2_mul_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs); +void __kmpc_atomic_fixed2u_mul_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs); +void __kmpc_atomic_fixed2_div_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs); +void __kmpc_atomic_fixed2u_div_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs); + +void __kmpc_atomic_fixed4_add_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4u_add_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4_sub_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4u_sub_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4_mul_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4u_mul_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4_div_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4u_div_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs, + _Quad rhs); + +void __kmpc_atomic_fixed8_add_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8u_add_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8_sub_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8u_sub_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8_mul_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8u_mul_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8_div_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8u_div_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs, + _Quad rhs); + +void __kmpc_atomic_float4_add_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs, + _Quad rhs); +void __kmpc_atomic_float4_sub_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs, + _Quad rhs); +void __kmpc_atomic_float4_mul_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs, + _Quad rhs); +void __kmpc_atomic_float4_div_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs, + _Quad rhs); + +void __kmpc_atomic_float8_add_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs, + _Quad rhs); +void __kmpc_atomic_float8_sub_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs, + _Quad rhs); +void __kmpc_atomic_float8_mul_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs, + _Quad rhs); +void __kmpc_atomic_float8_div_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs, + _Quad rhs); + +void __kmpc_atomic_float10_add_fp(ident_t *id_ref, int gtid, long double *lhs, + _Quad rhs); +void __kmpc_atomic_float10_sub_fp(ident_t *id_ref, int gtid, long double *lhs, + _Quad rhs); +void __kmpc_atomic_float10_mul_fp(ident_t *id_ref, int gtid, long double *lhs, + _Quad rhs); +void 
__kmpc_atomic_float10_div_fp(ident_t *id_ref, int gtid, long double *lhs, + _Quad rhs); + +// Reverse operations +void __kmpc_atomic_fixed1_sub_rev_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1u_sub_rev_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, _Quad rhs); +void __kmpc_atomic_fixed1_div_rev_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs); +void __kmpc_atomic_fixed1u_div_rev_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, _Quad rhs); +void __kmpc_atomic_fixed2_sub_rev_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs); +void __kmpc_atomic_fixed2u_sub_rev_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs); +void __kmpc_atomic_fixed2_div_rev_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs); +void __kmpc_atomic_fixed2u_div_rev_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs); +void __kmpc_atomic_fixed4_sub_rev_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4u_sub_rev_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs); +void __kmpc_atomic_fixed4_div_rev_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed4u_div_rev_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs); +void __kmpc_atomic_fixed8_sub_rev_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8u_sub_rev_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs); +void __kmpc_atomic_fixed8_div_rev_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs, + _Quad rhs); +void __kmpc_atomic_fixed8u_div_rev_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs); +void __kmpc_atomic_float4_sub_rev_fp(ident_t *id_ref, int gtid, float *lhs, + _Quad rhs); +void __kmpc_atomic_float4_div_rev_fp(ident_t *id_ref, int gtid, float *lhs, + _Quad rhs); +void __kmpc_atomic_float8_sub_rev_fp(ident_t *id_ref, int gtid, double *lhs, + _Quad rhs); +void __kmpc_atomic_float8_div_rev_fp(ident_t *id_ref, int gtid, double *lhs, + _Quad rhs); +void __kmpc_atomic_float10_sub_rev_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs); +void __kmpc_atomic_float10_div_rev_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs); + +#endif // KMP_HAVE_QUAD + +// RHS=cmplx8 +void __kmpc_atomic_cmplx4_add_cmplx8(ident_t *id_ref, int gtid, + kmp_cmplx32 *lhs, kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx4_sub_cmplx8(ident_t *id_ref, int gtid, + kmp_cmplx32 *lhs, kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx4_mul_cmplx8(ident_t *id_ref, int gtid, + kmp_cmplx32 *lhs, kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx4_div_cmplx8(ident_t *id_ref, int gtid, + kmp_cmplx32 *lhs, kmp_cmplx64 rhs); + +// generic atomic routines +void __kmpc_atomic_1(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)); +void __kmpc_atomic_2(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)); +void __kmpc_atomic_4(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)); +void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)); +void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)); +void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)); +void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, void *, void *)); +void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs, + void (*f)(void *, 
void *, void *)); + +// READ, WRITE, CAPTURE + +// Below routines for atomic READ are listed +char __kmpc_atomic_fixed1_rd(ident_t *id_ref, int gtid, char *loc); +short __kmpc_atomic_fixed2_rd(ident_t *id_ref, int gtid, short *loc); +kmp_int32 __kmpc_atomic_fixed4_rd(ident_t *id_ref, int gtid, kmp_int32 *loc); +kmp_int64 __kmpc_atomic_fixed8_rd(ident_t *id_ref, int gtid, kmp_int64 *loc); +kmp_real32 __kmpc_atomic_float4_rd(ident_t *id_ref, int gtid, kmp_real32 *loc); +kmp_real64 __kmpc_atomic_float8_rd(ident_t *id_ref, int gtid, kmp_real64 *loc); +long double __kmpc_atomic_float10_rd(ident_t *id_ref, int gtid, + long double *loc); +#if KMP_HAVE_QUAD +QUAD_LEGACY __kmpc_atomic_float16_rd(ident_t *id_ref, int gtid, + QUAD_LEGACY *loc); +#endif +// Fix for CQ220361: cmplx4 READ will return void on Windows* OS; read value +// will be returned through an additional parameter +#if (KMP_OS_WINDOWS) +void __kmpc_atomic_cmplx4_rd(kmp_cmplx32 *out, ident_t *id_ref, int gtid, + kmp_cmplx32 *loc); +#else +kmp_cmplx32 __kmpc_atomic_cmplx4_rd(ident_t *id_ref, int gtid, + kmp_cmplx32 *loc); +#endif +kmp_cmplx64 __kmpc_atomic_cmplx8_rd(ident_t *id_ref, int gtid, + kmp_cmplx64 *loc); +kmp_cmplx80 __kmpc_atomic_cmplx10_rd(ident_t *id_ref, int gtid, + kmp_cmplx80 *loc); +#if KMP_HAVE_QUAD +CPLX128_LEG __kmpc_atomic_cmplx16_rd(ident_t *id_ref, int gtid, + CPLX128_LEG *loc); +#if (KMP_ARCH_X86) +// Routines with 16-byte arguments aligned to 16-byte boundary +Quad_a16_t __kmpc_atomic_float16_a16_rd(ident_t *id_ref, int gtid, + Quad_a16_t *loc); +kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_a16_rd(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *loc); +#endif +#endif + +// Below routines for atomic WRITE are listed +void __kmpc_atomic_fixed1_wr(ident_t *id_ref, int gtid, char *lhs, char rhs); +void __kmpc_atomic_fixed2_wr(ident_t *id_ref, int gtid, short *lhs, short rhs); +void __kmpc_atomic_fixed4_wr(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +void __kmpc_atomic_fixed8_wr(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +void __kmpc_atomic_float4_wr(ident_t *id_ref, int gtid, kmp_real32 *lhs, + kmp_real32 rhs); +void __kmpc_atomic_float8_wr(ident_t *id_ref, int gtid, kmp_real64 *lhs, + kmp_real64 rhs); +void __kmpc_atomic_float10_wr(ident_t *id_ref, int gtid, long double *lhs, + long double rhs); +#if KMP_HAVE_QUAD +void __kmpc_atomic_float16_wr(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs, + QUAD_LEGACY rhs); +#endif +void __kmpc_atomic_cmplx4_wr(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs); +void __kmpc_atomic_cmplx8_wr(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs, + kmp_cmplx64 rhs); +void __kmpc_atomic_cmplx10_wr(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs, + kmp_cmplx80 rhs); +#if KMP_HAVE_QUAD +void __kmpc_atomic_cmplx16_wr(ident_t *id_ref, int gtid, CPLX128_LEG *lhs, + CPLX128_LEG rhs); +#if (KMP_ARCH_X86) +// Routines with 16-byte arguments aligned to 16-byte boundary +void __kmpc_atomic_float16_a16_wr(ident_t *id_ref, int gtid, Quad_a16_t *lhs, + Quad_a16_t rhs); +void __kmpc_atomic_cmplx16_a16_wr(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +#endif +#endif + +// Below routines for atomic CAPTURE are listed + +// 1-byte +char __kmpc_atomic_fixed1_add_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_andb_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_div_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +unsigned char 
__kmpc_atomic_fixed1u_div_cpt(ident_t *id_ref, int gtid, + unsigned char *lhs, + unsigned char rhs, int flag); +char __kmpc_atomic_fixed1_mul_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_orb_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_shl_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_shr_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +unsigned char __kmpc_atomic_fixed1u_shr_cpt(ident_t *id_ref, int gtid, + unsigned char *lhs, + unsigned char rhs, int flag); +char __kmpc_atomic_fixed1_sub_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_xor_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +// 2-byte +short __kmpc_atomic_fixed2_add_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_andb_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_div_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +unsigned short __kmpc_atomic_fixed2u_div_cpt(ident_t *id_ref, int gtid, + unsigned short *lhs, + unsigned short rhs, int flag); +short __kmpc_atomic_fixed2_mul_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_orb_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_shl_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_shr_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +unsigned short __kmpc_atomic_fixed2u_shr_cpt(ident_t *id_ref, int gtid, + unsigned short *lhs, + unsigned short rhs, int flag); +short __kmpc_atomic_fixed2_sub_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_xor_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +// 4-byte add / sub fixed +kmp_int32 __kmpc_atomic_fixed4_add_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_sub_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +// 4-byte add / sub float +kmp_real32 __kmpc_atomic_float4_add_cpt(ident_t *id_ref, int gtid, + kmp_real32 *lhs, kmp_real32 rhs, + int flag); +kmp_real32 __kmpc_atomic_float4_sub_cpt(ident_t *id_ref, int gtid, + kmp_real32 *lhs, kmp_real32 rhs, + int flag); +// 8-byte add / sub fixed +kmp_int64 __kmpc_atomic_fixed8_add_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_sub_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +// 8-byte add / sub float +kmp_real64 __kmpc_atomic_float8_add_cpt(ident_t *id_ref, int gtid, + kmp_real64 *lhs, kmp_real64 rhs, + int flag); +kmp_real64 __kmpc_atomic_float8_sub_cpt(ident_t *id_ref, int gtid, + kmp_real64 *lhs, kmp_real64 rhs, + int flag); +// 4-byte fixed +kmp_int32 __kmpc_atomic_fixed4_andb_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_div_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_uint32 __kmpc_atomic_fixed4u_div_cpt(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, kmp_uint32 rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_mul_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_orb_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 
rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_shl_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_shr_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_uint32 __kmpc_atomic_fixed4u_shr_cpt(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, kmp_uint32 rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_xor_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +// 8-byte fixed +kmp_int64 __kmpc_atomic_fixed8_andb_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_div_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_uint64 __kmpc_atomic_fixed8u_div_cpt(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, kmp_uint64 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_mul_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_orb_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_shl_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_shr_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_uint64 __kmpc_atomic_fixed8u_shr_cpt(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, kmp_uint64 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_xor_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +// 4-byte float +kmp_real32 __kmpc_atomic_float4_div_cpt(ident_t *id_ref, int gtid, + kmp_real32 *lhs, kmp_real32 rhs, + int flag); +kmp_real32 __kmpc_atomic_float4_mul_cpt(ident_t *id_ref, int gtid, + kmp_real32 *lhs, kmp_real32 rhs, + int flag); +// 8-byte float +kmp_real64 __kmpc_atomic_float8_div_cpt(ident_t *id_ref, int gtid, + kmp_real64 *lhs, kmp_real64 rhs, + int flag); +kmp_real64 __kmpc_atomic_float8_mul_cpt(ident_t *id_ref, int gtid, + kmp_real64 *lhs, kmp_real64 rhs, + int flag); +// 1-, 2-, 4-, 8-byte logical (&&, ||) +char __kmpc_atomic_fixed1_andl_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_orl_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +short __kmpc_atomic_fixed2_andl_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_orl_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_andl_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_orl_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_andl_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_orl_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +// MIN / MAX +char __kmpc_atomic_fixed1_max_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_min_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +short __kmpc_atomic_fixed2_max_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_min_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_max_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_min_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_max_cpt(ident_t 
*id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_min_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +kmp_real32 __kmpc_atomic_float4_max_cpt(ident_t *id_ref, int gtid, + kmp_real32 *lhs, kmp_real32 rhs, + int flag); +kmp_real32 __kmpc_atomic_float4_min_cpt(ident_t *id_ref, int gtid, + kmp_real32 *lhs, kmp_real32 rhs, + int flag); +kmp_real64 __kmpc_atomic_float8_max_cpt(ident_t *id_ref, int gtid, + kmp_real64 *lhs, kmp_real64 rhs, + int flag); +kmp_real64 __kmpc_atomic_float8_min_cpt(ident_t *id_ref, int gtid, + kmp_real64 *lhs, kmp_real64 rhs, + int flag); +long double __kmpc_atomic_float10_max_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +long double __kmpc_atomic_float10_min_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +#if KMP_HAVE_QUAD +QUAD_LEGACY __kmpc_atomic_float16_max_cpt(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +QUAD_LEGACY __kmpc_atomic_float16_min_cpt(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +#endif +// .NEQV. (same as xor) +char __kmpc_atomic_fixed1_neqv_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +short __kmpc_atomic_fixed2_neqv_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_neqv_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_neqv_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, + int flag); +// .EQV. (same as ~xor) +char __kmpc_atomic_fixed1_eqv_cpt(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +short __kmpc_atomic_fixed2_eqv_cpt(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_eqv_cpt(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_eqv_cpt(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, int flag); +// long double type +long double __kmpc_atomic_float10_add_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +long double __kmpc_atomic_float10_sub_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +long double __kmpc_atomic_float10_mul_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +long double __kmpc_atomic_float10_div_cpt(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +#if KMP_HAVE_QUAD +// _Quad type +QUAD_LEGACY __kmpc_atomic_float16_add_cpt(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +QUAD_LEGACY __kmpc_atomic_float16_sub_cpt(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +QUAD_LEGACY __kmpc_atomic_float16_mul_cpt(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +QUAD_LEGACY __kmpc_atomic_float16_div_cpt(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +#endif +// routines for complex types +// Workaround for cmplx4 routines - return void; captured value is returned via +// the argument +void __kmpc_atomic_cmplx4_add_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag); +void __kmpc_atomic_cmplx4_sub_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag); +void __kmpc_atomic_cmplx4_mul_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs, 
kmp_cmplx32 *out, int flag); +void __kmpc_atomic_cmplx4_div_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag); +kmp_cmplx64 __kmpc_atomic_cmplx8_add_cpt(ident_t *id_ref, int gtid, + kmp_cmplx64 *lhs, kmp_cmplx64 rhs, + int flag); +kmp_cmplx64 __kmpc_atomic_cmplx8_sub_cpt(ident_t *id_ref, int gtid, + kmp_cmplx64 *lhs, kmp_cmplx64 rhs, + int flag); +kmp_cmplx64 __kmpc_atomic_cmplx8_mul_cpt(ident_t *id_ref, int gtid, + kmp_cmplx64 *lhs, kmp_cmplx64 rhs, + int flag); +kmp_cmplx64 __kmpc_atomic_cmplx8_div_cpt(ident_t *id_ref, int gtid, + kmp_cmplx64 *lhs, kmp_cmplx64 rhs, + int flag); +kmp_cmplx80 __kmpc_atomic_cmplx10_add_cpt(ident_t *id_ref, int gtid, + kmp_cmplx80 *lhs, kmp_cmplx80 rhs, + int flag); +kmp_cmplx80 __kmpc_atomic_cmplx10_sub_cpt(ident_t *id_ref, int gtid, + kmp_cmplx80 *lhs, kmp_cmplx80 rhs, + int flag); +kmp_cmplx80 __kmpc_atomic_cmplx10_mul_cpt(ident_t *id_ref, int gtid, + kmp_cmplx80 *lhs, kmp_cmplx80 rhs, + int flag); +kmp_cmplx80 __kmpc_atomic_cmplx10_div_cpt(ident_t *id_ref, int gtid, + kmp_cmplx80 *lhs, kmp_cmplx80 rhs, + int flag); +#if KMP_HAVE_QUAD +CPLX128_LEG __kmpc_atomic_cmplx16_add_cpt(ident_t *id_ref, int gtid, + CPLX128_LEG *lhs, CPLX128_LEG rhs, + int flag); +CPLX128_LEG __kmpc_atomic_cmplx16_sub_cpt(ident_t *id_ref, int gtid, + CPLX128_LEG *lhs, CPLX128_LEG rhs, + int flag); +CPLX128_LEG __kmpc_atomic_cmplx16_mul_cpt(ident_t *id_ref, int gtid, + CPLX128_LEG *lhs, CPLX128_LEG rhs, + int flag); +CPLX128_LEG __kmpc_atomic_cmplx16_div_cpt(ident_t *id_ref, int gtid, + CPLX128_LEG *lhs, CPLX128_LEG rhs, + int flag); +#if (KMP_ARCH_X86) +// Routines with 16-byte arguments aligned to 16-byte boundary +Quad_a16_t __kmpc_atomic_float16_add_a16_cpt(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs, + int flag); +Quad_a16_t __kmpc_atomic_float16_sub_a16_cpt(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs, + int flag); +Quad_a16_t __kmpc_atomic_float16_mul_a16_cpt(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs, + int flag); +Quad_a16_t __kmpc_atomic_float16_div_a16_cpt(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs, + int flag); +Quad_a16_t __kmpc_atomic_float16_max_a16_cpt(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs, + int flag); +Quad_a16_t __kmpc_atomic_float16_min_a16_cpt(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs, + int flag); +kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_add_a16_cpt(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs, + int flag); +kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_sub_a16_cpt(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs, + int flag); +kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_mul_a16_cpt(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs, + int flag); +kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_div_a16_cpt(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs, + int flag); +#endif +#endif + +void __kmpc_atomic_start(void); +void __kmpc_atomic_end(void); + +// OpenMP 4.0: v = x = expr binop x; { v = x; x = expr binop x; } { x = expr +// binop x; v = x; } for non-commutative operations. 
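The capture-reverse ("_cpt_rev") entry points declared in the block that follows exist so a compiler can lower #pragma omp atomic capture when the operator is non-commutative and the shared variable sits on the right-hand side of the update, i.e. x = expr binop x. A minimal sketch of the source-level construct, with a hypothetical runtime call shown in a comment (loc, gtid and flag are placeholders for the ident_t pointer, global thread id and capture-form selector a compiler would supply; this is not actual compiler output):

static int x = 100; // shared variable updated atomically
static int v;       // captured value

void capture_reverse_example(void) {
  // OpenMP 4.0 atomic capture with a non-commutative operator and the shared
  // variable on the right: performs { v = x; x = 7 - x; } as one atomic step.
#pragma omp atomic capture
  {
    v = x;
    x = 7 - x;
  }
  // Roughly the same update phrased against the runtime:
  //   v = __kmpc_atomic_fixed4_sub_cpt_rev(loc, gtid, &x, 7, flag);
  // where flag chooses between the two bracketed capture forms described in
  // the comment above (capture before vs. after the update).
}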
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +char __kmpc_atomic_fixed1_sub_cpt_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_div_cpt_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +unsigned char __kmpc_atomic_fixed1u_div_cpt_rev(ident_t *id_ref, int gtid, + unsigned char *lhs, + unsigned char rhs, int flag); +char __kmpc_atomic_fixed1_shl_cpt_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +char __kmpc_atomic_fixed1_shr_cpt_rev(ident_t *id_ref, int gtid, char *lhs, + char rhs, int flag); +unsigned char __kmpc_atomic_fixed1u_shr_cpt_rev(ident_t *id_ref, int gtid, + unsigned char *lhs, + unsigned char rhs, int flag); +short __kmpc_atomic_fixed2_sub_cpt_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_div_cpt_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +unsigned short __kmpc_atomic_fixed2u_div_cpt_rev(ident_t *id_ref, int gtid, + unsigned short *lhs, + unsigned short rhs, int flag); +short __kmpc_atomic_fixed2_shl_cpt_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +short __kmpc_atomic_fixed2_shr_cpt_rev(ident_t *id_ref, int gtid, short *lhs, + short rhs, int flag); +unsigned short __kmpc_atomic_fixed2u_shr_cpt_rev(ident_t *id_ref, int gtid, + unsigned short *lhs, + unsigned short rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_sub_cpt_rev(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_div_cpt_rev(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, + int flag); +kmp_uint32 __kmpc_atomic_fixed4u_div_cpt_rev(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, kmp_uint32 rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_shl_cpt_rev(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_shr_cpt_rev(ident_t *id_ref, int gtid, + kmp_int32 *lhs, kmp_int32 rhs, + int flag); +kmp_uint32 __kmpc_atomic_fixed4u_shr_cpt_rev(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, kmp_uint32 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_sub_cpt_rev(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_div_cpt_rev(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, + int flag); +kmp_uint64 __kmpc_atomic_fixed8u_div_cpt_rev(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, kmp_uint64 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_shl_cpt_rev(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_shr_cpt_rev(ident_t *id_ref, int gtid, + kmp_int64 *lhs, kmp_int64 rhs, + int flag); +kmp_uint64 __kmpc_atomic_fixed8u_shr_cpt_rev(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, kmp_uint64 rhs, + int flag); +float __kmpc_atomic_float4_sub_cpt_rev(ident_t *id_ref, int gtid, float *lhs, + float rhs, int flag); +float __kmpc_atomic_float4_div_cpt_rev(ident_t *id_ref, int gtid, float *lhs, + float rhs, int flag); +double __kmpc_atomic_float8_sub_cpt_rev(ident_t *id_ref, int gtid, double *lhs, + double rhs, int flag); +double __kmpc_atomic_float8_div_cpt_rev(ident_t *id_ref, int gtid, double *lhs, + double rhs, int flag); +long double __kmpc_atomic_float10_sub_cpt_rev(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +long double __kmpc_atomic_float10_div_cpt_rev(ident_t *id_ref, int gtid, + long double *lhs, long double rhs, + int flag); +#if KMP_HAVE_QUAD +QUAD_LEGACY __kmpc_atomic_float16_sub_cpt_rev(ident_t *id_ref, int gtid, 
+ QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +QUAD_LEGACY __kmpc_atomic_float16_div_cpt_rev(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs, + int flag); +#endif +// Workaround for cmplx4 routines - return void; captured value is returned via +// the argument +void __kmpc_atomic_cmplx4_sub_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx32 *lhs, kmp_cmplx32 rhs, + kmp_cmplx32 *out, int flag); +void __kmpc_atomic_cmplx4_div_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx32 *lhs, kmp_cmplx32 rhs, + kmp_cmplx32 *out, int flag); +kmp_cmplx64 __kmpc_atomic_cmplx8_sub_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx64 *lhs, kmp_cmplx64 rhs, + int flag); +kmp_cmplx64 __kmpc_atomic_cmplx8_div_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx64 *lhs, kmp_cmplx64 rhs, + int flag); +kmp_cmplx80 __kmpc_atomic_cmplx10_sub_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx80 *lhs, kmp_cmplx80 rhs, + int flag); +kmp_cmplx80 __kmpc_atomic_cmplx10_div_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx80 *lhs, kmp_cmplx80 rhs, + int flag); +#if KMP_HAVE_QUAD +CPLX128_LEG __kmpc_atomic_cmplx16_sub_cpt_rev(ident_t *id_ref, int gtid, + CPLX128_LEG *lhs, CPLX128_LEG rhs, + int flag); +CPLX128_LEG __kmpc_atomic_cmplx16_div_cpt_rev(ident_t *id_ref, int gtid, + CPLX128_LEG *lhs, CPLX128_LEG rhs, + int flag); +#if (KMP_ARCH_X86) +Quad_a16_t __kmpc_atomic_float16_sub_a16_cpt_rev(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, + Quad_a16_t rhs, int flag); +Quad_a16_t __kmpc_atomic_float16_div_a16_cpt_rev(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, + Quad_a16_t rhs, int flag); +kmp_cmplx128_a16_t +__kmpc_atomic_cmplx16_sub_a16_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs, int flag); +kmp_cmplx128_a16_t +__kmpc_atomic_cmplx16_div_a16_cpt_rev(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs, int flag); +#endif +#endif + +// OpenMP 4.0 Capture-write (swap): {v = x; x = expr;} +char __kmpc_atomic_fixed1_swp(ident_t *id_ref, int gtid, char *lhs, char rhs); +short __kmpc_atomic_fixed2_swp(ident_t *id_ref, int gtid, short *lhs, + short rhs); +kmp_int32 __kmpc_atomic_fixed4_swp(ident_t *id_ref, int gtid, kmp_int32 *lhs, + kmp_int32 rhs); +kmp_int64 __kmpc_atomic_fixed8_swp(ident_t *id_ref, int gtid, kmp_int64 *lhs, + kmp_int64 rhs); +float __kmpc_atomic_float4_swp(ident_t *id_ref, int gtid, float *lhs, + float rhs); +double __kmpc_atomic_float8_swp(ident_t *id_ref, int gtid, double *lhs, + double rhs); +long double __kmpc_atomic_float10_swp(ident_t *id_ref, int gtid, + long double *lhs, long double rhs); +#if KMP_HAVE_QUAD +QUAD_LEGACY __kmpc_atomic_float16_swp(ident_t *id_ref, int gtid, + QUAD_LEGACY *lhs, QUAD_LEGACY rhs); +#endif +// !!! 
TODO: check if we need a workaround here +void __kmpc_atomic_cmplx4_swp(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs, + kmp_cmplx32 rhs, kmp_cmplx32 *out); +// kmp_cmplx32 __kmpc_atomic_cmplx4_swp( ident_t *id_ref, int gtid, +// kmp_cmplx32 * lhs, kmp_cmplx32 rhs ); + +kmp_cmplx64 __kmpc_atomic_cmplx8_swp(ident_t *id_ref, int gtid, + kmp_cmplx64 *lhs, kmp_cmplx64 rhs); +kmp_cmplx80 __kmpc_atomic_cmplx10_swp(ident_t *id_ref, int gtid, + kmp_cmplx80 *lhs, kmp_cmplx80 rhs); +#if KMP_HAVE_QUAD +CPLX128_LEG __kmpc_atomic_cmplx16_swp(ident_t *id_ref, int gtid, + CPLX128_LEG *lhs, CPLX128_LEG rhs); +#if (KMP_ARCH_X86) +Quad_a16_t __kmpc_atomic_float16_a16_swp(ident_t *id_ref, int gtid, + Quad_a16_t *lhs, Quad_a16_t rhs); +kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_a16_swp(ident_t *id_ref, int gtid, + kmp_cmplx128_a16_t *lhs, + kmp_cmplx128_a16_t rhs); +#endif +#endif + +// Capture routines for mixed types (RHS=float16) +#if KMP_HAVE_QUAD + +char __kmpc_atomic_fixed1_add_cpt_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs, int flag); +char __kmpc_atomic_fixed1_sub_cpt_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs, int flag); +char __kmpc_atomic_fixed1_mul_cpt_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs, int flag); +char __kmpc_atomic_fixed1_div_cpt_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs, int flag); +unsigned char __kmpc_atomic_fixed1u_add_cpt_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, _Quad rhs, + int flag); +unsigned char __kmpc_atomic_fixed1u_sub_cpt_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, _Quad rhs, + int flag); +unsigned char __kmpc_atomic_fixed1u_mul_cpt_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, _Quad rhs, + int flag); +unsigned char __kmpc_atomic_fixed1u_div_cpt_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, _Quad rhs, + int flag); + +short __kmpc_atomic_fixed2_add_cpt_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs, int flag); +short __kmpc_atomic_fixed2_sub_cpt_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs, int flag); +short __kmpc_atomic_fixed2_mul_cpt_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs, int flag); +short __kmpc_atomic_fixed2_div_cpt_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs, int flag); +unsigned short __kmpc_atomic_fixed2u_add_cpt_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs, + int flag); +unsigned short __kmpc_atomic_fixed2u_sub_cpt_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs, + int flag); +unsigned short __kmpc_atomic_fixed2u_mul_cpt_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs, + int flag); +unsigned short __kmpc_atomic_fixed2u_div_cpt_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, _Quad rhs, + int flag); + +kmp_int32 __kmpc_atomic_fixed4_add_cpt_fp(ident_t *id_ref, int gtid, + kmp_int32 *lhs, _Quad rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_sub_cpt_fp(ident_t *id_ref, int gtid, + kmp_int32 *lhs, _Quad rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_mul_cpt_fp(ident_t *id_ref, int gtid, + kmp_int32 *lhs, _Quad rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_div_cpt_fp(ident_t *id_ref, int gtid, + kmp_int32 *lhs, _Quad rhs, int flag); +kmp_uint32 __kmpc_atomic_fixed4u_add_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs, + int flag); +kmp_uint32 __kmpc_atomic_fixed4u_sub_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs, + int flag); +kmp_uint32 __kmpc_atomic_fixed4u_mul_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs, + int flag); +kmp_uint32 
__kmpc_atomic_fixed4u_div_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs, + int flag); + +kmp_int64 __kmpc_atomic_fixed8_add_cpt_fp(ident_t *id_ref, int gtid, + kmp_int64 *lhs, _Quad rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_sub_cpt_fp(ident_t *id_ref, int gtid, + kmp_int64 *lhs, _Quad rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_mul_cpt_fp(ident_t *id_ref, int gtid, + kmp_int64 *lhs, _Quad rhs, int flag); +kmp_int64 __kmpc_atomic_fixed8_div_cpt_fp(ident_t *id_ref, int gtid, + kmp_int64 *lhs, _Quad rhs, int flag); +kmp_uint64 __kmpc_atomic_fixed8u_add_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs, + int flag); +kmp_uint64 __kmpc_atomic_fixed8u_sub_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs, + int flag); +kmp_uint64 __kmpc_atomic_fixed8u_mul_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs, + int flag); +kmp_uint64 __kmpc_atomic_fixed8u_div_cpt_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs, + int flag); + +float __kmpc_atomic_float4_add_cpt_fp(ident_t *id_ref, int gtid, + kmp_real32 *lhs, _Quad rhs, int flag); +float __kmpc_atomic_float4_sub_cpt_fp(ident_t *id_ref, int gtid, + kmp_real32 *lhs, _Quad rhs, int flag); +float __kmpc_atomic_float4_mul_cpt_fp(ident_t *id_ref, int gtid, + kmp_real32 *lhs, _Quad rhs, int flag); +float __kmpc_atomic_float4_div_cpt_fp(ident_t *id_ref, int gtid, + kmp_real32 *lhs, _Quad rhs, int flag); + +double __kmpc_atomic_float8_add_cpt_fp(ident_t *id_ref, int gtid, + kmp_real64 *lhs, _Quad rhs, int flag); +double __kmpc_atomic_float8_sub_cpt_fp(ident_t *id_ref, int gtid, + kmp_real64 *lhs, _Quad rhs, int flag); +double __kmpc_atomic_float8_mul_cpt_fp(ident_t *id_ref, int gtid, + kmp_real64 *lhs, _Quad rhs, int flag); +double __kmpc_atomic_float8_div_cpt_fp(ident_t *id_ref, int gtid, + kmp_real64 *lhs, _Quad rhs, int flag); + +long double __kmpc_atomic_float10_add_cpt_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs, + int flag); +long double __kmpc_atomic_float10_sub_cpt_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs, + int flag); +long double __kmpc_atomic_float10_mul_cpt_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs, + int flag); +long double __kmpc_atomic_float10_div_cpt_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs, + int flag); + +char __kmpc_atomic_fixed1_sub_cpt_rev_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs, int flag); +unsigned char __kmpc_atomic_fixed1u_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, + _Quad rhs, int flag); +char __kmpc_atomic_fixed1_div_cpt_rev_fp(ident_t *id_ref, int gtid, char *lhs, + _Quad rhs, int flag); +unsigned char __kmpc_atomic_fixed1u_div_cpt_rev_fp(ident_t *id_ref, int gtid, + unsigned char *lhs, + _Quad rhs, int flag); +short __kmpc_atomic_fixed2_sub_cpt_rev_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs, int flag); +unsigned short __kmpc_atomic_fixed2u_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, + _Quad rhs, int flag); +short __kmpc_atomic_fixed2_div_cpt_rev_fp(ident_t *id_ref, int gtid, short *lhs, + _Quad rhs, int flag); +unsigned short __kmpc_atomic_fixed2u_div_cpt_rev_fp(ident_t *id_ref, int gtid, + unsigned short *lhs, + _Quad rhs, int flag); +kmp_int32 __kmpc_atomic_fixed4_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + kmp_int32 *lhs, _Quad rhs, + int flag); +kmp_uint32 __kmpc_atomic_fixed4u_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs, + int flag); +kmp_int32 __kmpc_atomic_fixed4_div_cpt_rev_fp(ident_t 
*id_ref, int gtid, + kmp_int32 *lhs, _Quad rhs, + int flag); +kmp_uint32 __kmpc_atomic_fixed4u_div_cpt_rev_fp(ident_t *id_ref, int gtid, + kmp_uint32 *lhs, _Quad rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + kmp_int64 *lhs, _Quad rhs, + int flag); +kmp_uint64 __kmpc_atomic_fixed8u_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs, + int flag); +kmp_int64 __kmpc_atomic_fixed8_div_cpt_rev_fp(ident_t *id_ref, int gtid, + kmp_int64 *lhs, _Quad rhs, + int flag); +kmp_uint64 __kmpc_atomic_fixed8u_div_cpt_rev_fp(ident_t *id_ref, int gtid, + kmp_uint64 *lhs, _Quad rhs, + int flag); +float __kmpc_atomic_float4_sub_cpt_rev_fp(ident_t *id_ref, int gtid, float *lhs, + _Quad rhs, int flag); +float __kmpc_atomic_float4_div_cpt_rev_fp(ident_t *id_ref, int gtid, float *lhs, + _Quad rhs, int flag); +double __kmpc_atomic_float8_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + double *lhs, _Quad rhs, int flag); +double __kmpc_atomic_float8_div_cpt_rev_fp(ident_t *id_ref, int gtid, + double *lhs, _Quad rhs, int flag); +long double __kmpc_atomic_float10_sub_cpt_rev_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs, + int flag); +long double __kmpc_atomic_float10_div_cpt_rev_fp(ident_t *id_ref, int gtid, + long double *lhs, _Quad rhs, + int flag); + +#endif // KMP_HAVE_QUAD + +// End of OpenMP 4.0 capture + +// OpenMP 5.1 compare and swap +/* + __kmpc_atomic_bool_1_cas + __kmpc_atomic_bool_2_cas + __kmpc_atomic_bool_4_cas + __kmpc_atomic_bool_8_cas + __kmpc_atomic_val_1_cas + __kmpc_atomic_val_2_cas + __kmpc_atomic_val_4_cas + __kmpc_atomic_val_8_cas + __kmpc_atomic_bool_1_cas_cpt + __kmpc_atomic_bool_2_cas_cpt + __kmpc_atomic_bool_4_cas_cpt + __kmpc_atomic_bool_8_cas_cpt + __kmpc_atomic_val_1_cas_cpt + __kmpc_atomic_val_2_cas_cpt + __kmpc_atomic_val_4_cas_cpt + __kmpc_atomic_val_8_cas_cpt +*/ +// In all interfaces of CAS (Compare And Swap): +// r is the boolean result of comparison +// x is memory location to operate on +// e is expected (old) value +// d is desired (new) value +// pv is pointer to captured value v whose location may coincide with e + +// { r = x == e; if(r) { x = d; } } +// functions return result of comparison +bool __kmpc_atomic_bool_1_cas(ident_t *loc, int gtid, char *x, char e, char d); +bool __kmpc_atomic_bool_2_cas(ident_t *loc, int gtid, short *x, short e, + short d); +bool __kmpc_atomic_bool_4_cas(ident_t *loc, int gtid, kmp_int32 *x, kmp_int32 e, + kmp_int32 d); +bool __kmpc_atomic_bool_8_cas(ident_t *loc, int gtid, kmp_int64 *x, kmp_int64 e, + kmp_int64 d); + +// { v = x; if (x == e) { x = d; } } +// functions return old value +char __kmpc_atomic_val_1_cas(ident_t *loc, int gtid, char *x, char e, char d); +short __kmpc_atomic_val_2_cas(ident_t *loc, int gtid, short *x, short e, + short d); +kmp_int32 __kmpc_atomic_val_4_cas(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d); +kmp_int64 __kmpc_atomic_val_8_cas(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d); + +// { r = x == e; if(r) { x = d; } else { v = x; } } +// v gets old value if comparison failed, untouched otherwise +// functions return result of comparison +bool __kmpc_atomic_bool_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv); +bool __kmpc_atomic_bool_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv); +bool __kmpc_atomic_bool_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv); +bool __kmpc_atomic_bool_8_cas_cpt(ident_t *loc, int 
gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv); + +// { if (x == e) { x = d; }; v = x; } +// v gets old value if comparison failed, new value otherwise +// functions return old value +char __kmpc_atomic_val_1_cas_cpt(ident_t *loc, int gtid, char *x, char e, + char d, char *pv); +short __kmpc_atomic_val_2_cas_cpt(ident_t *loc, int gtid, short *x, short e, + short d, short *pv); +kmp_int32 __kmpc_atomic_val_4_cas_cpt(ident_t *loc, int gtid, kmp_int32 *x, + kmp_int32 e, kmp_int32 d, kmp_int32 *pv); +kmp_int64 __kmpc_atomic_val_8_cas_cpt(ident_t *loc, int gtid, kmp_int64 *x, + kmp_int64 e, kmp_int64 d, kmp_int64 *pv); + +// End OpenMP 5.1 compare + capture + +#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64 + +/* ------------------------------------------------------------------------ */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* KMP_ATOMIC_H */ + +// end of file diff --git a/third_party/openmp/kmp_barrier.cpp b/third_party/openmp/kmp_barrier.cpp new file mode 100644 index 000000000..e9ab15f17 --- /dev/null +++ b/third_party/openmp/kmp_barrier.cpp @@ -0,0 +1,2670 @@ +/* + * kmp_barrier.cpp + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_wait_release.h" +#include "kmp_barrier.h" +#include "kmp_itt.h" +#include "kmp_os.h" +#include "kmp_stats.h" +#include "ompt-specific.h" +// for distributed barrier +#include "kmp_affinity.h" + +#if KMP_MIC +#include <immintrin.h> +#define USE_NGO_STORES 1 +#endif // KMP_MIC + +#if KMP_MIC && USE_NGO_STORES +// ICV copying +#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) +#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) +#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) +#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory") +#else +#define ngo_load(src) ((void)0) +#define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) +#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) +#define ngo_sync() ((void)0) +#endif /* KMP_MIC && USE_NGO_STORES */ + +void __kmp_print_structure(void); // Forward declaration + +// ---------------------------- Barrier Algorithms ---------------------------- +// Distributed barrier + +// Compute how many threads to have polling each cache-line. +// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
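// For intuition, a worked example of the grouping arithmetic below, taking the
// no-topology path and supposing (hypothetically) that computeGo() has already
// fixed threads_per_go at 8, with n = 40 threads:
//   num_gos           = ceil(40 / 8) = 5   (one go flag per 8 threads)
//   num_groups        = ceil(5 / 2)  = 3
//   gos_per_group     = ceil(5 / 3)  = 2
//   threads_per_group = 8 * 2        = 16
// So at most threads_per_go threads ever poll the same go cache line, and the
// release fans out from the primary thread to the group leaders and then to
// the remaining go flags within each group.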
+void distributedBarrier::computeVarsForN(size_t n) { + int nsockets = 1; + if (__kmp_topology) { + int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET); + int core_level = __kmp_topology->get_level(KMP_HW_CORE); + int ncores_per_socket = + __kmp_topology->calculate_ratio(core_level, socket_level); + nsockets = __kmp_topology->get_count(socket_level); + + if (nsockets <= 0) + nsockets = 1; + if (ncores_per_socket <= 0) + ncores_per_socket = 1; + + threads_per_go = ncores_per_socket >> 1; + if (!fix_threads_per_go) { + // Minimize num_gos + if (threads_per_go > 4) { + if (KMP_OPTIMIZE_FOR_REDUCTIONS) { + threads_per_go = threads_per_go >> 1; + } + if (threads_per_go > 4 && nsockets == 1) + threads_per_go = threads_per_go >> 1; + } + } + if (threads_per_go == 0) + threads_per_go = 1; + fix_threads_per_go = true; + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + if (nsockets == 1 || num_gos == 1) + num_groups = 1; + else { + num_groups = num_gos / nsockets; + if (num_gos % nsockets) + num_groups++; + } + if (num_groups <= 0) + num_groups = 1; + gos_per_group = num_gos / num_groups; + if (num_gos % num_groups) + gos_per_group++; + threads_per_group = threads_per_go * gos_per_group; + } else { + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + if (num_gos == 1) + num_groups = 1; + else { + num_groups = num_gos / 2; + if (num_gos % 2) + num_groups++; + } + gos_per_group = num_gos / num_groups; + if (num_gos % num_groups) + gos_per_group++; + threads_per_group = threads_per_go * gos_per_group; + } +} + +void distributedBarrier::computeGo(size_t n) { + // Minimize num_gos + for (num_gos = 1;; num_gos++) + if (IDEAL_CONTENTION * num_gos >= n) + break; + threads_per_go = n / num_gos; + if (n % num_gos) + threads_per_go++; + while (num_gos > MAX_GOS) { + threads_per_go++; + num_gos = n / threads_per_go; + if (n % threads_per_go) + num_gos++; + } + computeVarsForN(n); +} + +// This function is to resize the barrier arrays when the new number of threads +// exceeds max_threads, which is the current size of all the arrays +void distributedBarrier::resize(size_t nthr) { + KMP_DEBUG_ASSERT(nthr > max_threads); + + // expand to requested size * 2 + max_threads = nthr * 2; + + // allocate arrays to new max threads + for (int i = 0; i < MAX_ITERS; ++i) { + if (flags[i]) + flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i], + max_threads * sizeof(flags_s)); + else + flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s)); + } + + if (go) + go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s)); + else + go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s)); + + if (iter) + iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s)); + else + iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s)); + + if (sleep) + sleep = + (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s)); + else + sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s)); +} + +// This function is to set all the go flags that threads might be waiting +// on, and when blocktime is not infinite, it should be followed by a wake-up +// call to each thread +kmp_uint64 distributedBarrier::go_release() { + kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS; + for (size_t j = 0; j < num_gos; j++) { + go[j].go.store(next_go); + } + return next_go; +} + +void distributedBarrier::go_reset() { + for (size_t j = 0; j < max_threads; ++j) { + for (size_t i = 0; i < distributedBarrier::MAX_ITERS; 
++i) { + flags[i][j].stillNeed = 1; + } + go[j].go.store(0); + iter[j].iter = 0; + } +} + +// This function inits/re-inits the distributed barrier for a particular number +// of threads. If a resize of arrays is needed, it calls the resize function. +void distributedBarrier::init(size_t nthr) { + size_t old_max = max_threads; + if (nthr > max_threads) { // need more space in arrays + resize(nthr); + } + + for (size_t i = 0; i < max_threads; i++) { + for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) { + flags[j][i].stillNeed = 1; + } + go[i].go.store(0); + iter[i].iter = 0; + if (i >= old_max) + sleep[i].sleep = false; + } + + // Recalculate num_gos, etc. based on new nthr + computeVarsForN(nthr); + + num_threads = nthr; + + if (team_icvs == NULL) + team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t)); +} + +// This function is used only when KMP_BLOCKTIME is not infinite. +// static +void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team, + size_t start, size_t stop, size_t inc, + size_t tid) { + KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME); + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + kmp_info_t **other_threads = team->t.t_threads; + for (size_t thr = start; thr < stop; thr += inc) { + KMP_DEBUG_ASSERT(other_threads[thr]); + int gtid = other_threads[thr]->th.th_info.ds.ds_gtid; + // Wake up worker regardless of if it appears to be sleeping or not + __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL); + } +} + +static void __kmp_dist_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather); + kmp_team_t *team; + distributedBarrier *b; + kmp_info_t **other_threads; + kmp_uint64 my_current_iter, my_next_iter; + kmp_uint32 nproc; + bool group_leader; + + team = this_thr->th.th_team; + nproc = this_thr->th.th_team_nproc; + other_threads = team->t.t_threads; + b = team->t.b; + my_current_iter = b->iter[tid].iter; + my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS; + group_leader = ((tid % b->threads_per_group) == 0); + + KA_TRACE(20, + ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } +#endif + + if (group_leader) { + // Start from the thread after the group leader + size_t group_start = tid + 1; + size_t group_end = tid + b->threads_per_group; + size_t threads_pending = 0; + + if (group_end > nproc) + group_end = nproc; + do { // wait for threads in my group + threads_pending = 0; + // Check all the flags every time to avoid branch misspredict + for (size_t thr = group_start; thr < group_end; thr++) { + // Each thread uses a different cache line + threads_pending += b->flags[my_current_iter][thr].stillNeed; + } + // Execute tasks here + if (__kmp_tasking_mode != tskm_immediate_exec) { + kmp_task_team_t *task_team = this_thr->th.th_task_team; + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + int tasks_completed = FALSE; + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } 
else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } // if + } + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } else if (__kmp_tasking_mode != tskm_immediate_exec && + this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + } + } while (threads_pending > 0); + + if (reduce) { // Perform reduction if needed + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + // Group leader reduces all threads in group + for (size_t thr = group_start; thr < group_end; thr++) { + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[thr]->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + + // Set flag for next iteration + b->flags[my_next_iter][tid].stillNeed = 1; + // Each thread uses a different cache line; resets stillNeed to 0 to + // indicate it has reached the barrier + b->flags[my_current_iter][tid].stillNeed = 0; + + do { // wait for all group leaders + threads_pending = 0; + for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) { + threads_pending += b->flags[my_current_iter][thr].stillNeed; + } + // Execute tasks here + if (__kmp_tasking_mode != tskm_immediate_exec) { + kmp_task_team_t *task_team = this_thr->th.th_task_team; + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + int tasks_completed = FALSE; + __kmp_atomic_execute_tasks_64( + this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } // if + } + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } else if (__kmp_tasking_mode != tskm_immediate_exec && + this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + } + } while (threads_pending > 0); + + if (reduce) { // Perform reduction if needed + if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + for (size_t thr = b->threads_per_group; thr < nproc; + thr += b->threads_per_group) { + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[thr]->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + } + } else { + // Set flag for next iteration + b->flags[my_next_iter][tid].stillNeed = 1; + // Each thread uses a different cache line; resets stillNeed to 0 to + // indicate it has reached the barrier + b->flags[my_current_iter][tid].stillNeed = 0; + } + + KMP_MFENCE(); + + KA_TRACE(20, + ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void __kmp_dist_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release); + kmp_team_t *team; + distributedBarrier *b; + kmp_bstate_t *thr_bar; + kmp_uint64 my_current_iter, next_go; + size_t my_go_index; + bool group_leader; + + KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n", + gtid, tid, bt)); + + thr_bar = &this_thr->th.th_bar[bt].bb; + + if (!KMP_MASTER_TID(tid)) { + // workers and non-master group leaders need to check their presence in team + do { + if (this_thr->th.th_used_in_team.load() != 1 && + 
this_thr->th.th_used_in_team.load() != 3) { + // Thread is not in use in a team. Wait on location in tid's thread + // struct. The 0 value tells anyone looking that this thread is spinning + // or sleeping until this location becomes 3 again; 3 is the transition + // state to get to 1 which is waiting on go and being in the team + kmp_flag_32 my_flag(&(this_thr->th.th_used_in_team), 3); + if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, + 0) || + this_thr->th.th_used_in_team.load() == 0) { + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + } +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably + itt_sync_obj = + __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + } + if (this_thr->th.th_used_in_team.load() != 1 && + this_thr->th.th_used_in_team.load() != 3) // spurious wake-up? + continue; + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // At this point, the thread thinks it is in use in a team, or in + // transition to be used in a team, but it might have reached this barrier + // before it was marked unused by the team. Unused threads are awoken and + // shifted to wait on local thread struct elsewhere. It also might reach + // this point by being picked up for use by a different team. Either way, + // we need to update the tid. + tid = __kmp_tid_from_gtid(gtid); + team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(tid >= 0); + KMP_DEBUG_ASSERT(team); + b = team->t.b; + my_current_iter = b->iter[tid].iter; + next_go = my_current_iter + distributedBarrier::MAX_ITERS; + my_go_index = tid / b->threads_per_go; + if (this_thr->th.th_used_in_team.load() == 3) { + KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1); + } + // Check if go flag is set + if (b->go[my_go_index].go.load() != next_go) { + // Wait on go flag on team + kmp_atomic_flag_64 my_flag( + &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep)); + my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj)); + KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter || + b->iter[tid].iter == 0); + KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false); + } + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + // At this point, the thread's go location was set. This means the primary + // thread is safely in the barrier, and so this thread's data is + // up-to-date, but we should check again that this thread is really in + // use in the team, as it could have been woken up for the purpose of + // changing team size, or reaping threads at shutdown. + if (this_thr->th.th_used_in_team.load() == 1) + break; + } while (1); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + group_leader = ((tid % b->threads_per_group) == 0); + if (group_leader) { + // Tell all the threads in my group they can go! 
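      // The leader's own go flag was already set by the primary thread and
      // observed in the wait above; here the leader propagates next_go to the
      // rest of its group's go flags, each of which releases the (up to)
      // threads_per_go workers polling that cache line.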
+ for (size_t go_idx = my_go_index + 1; + go_idx < my_go_index + b->gos_per_group; go_idx++) { + b->go[go_idx].go.store(next_go); + } + // Fence added so that workers can see changes to go. sfence inadequate. + KMP_MFENCE(); + } + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { // copy ICVs to final dest + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, + tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + (kmp_internal_control_t *)team->t.b->team_icvs); + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) { + // This thread is now awake and participating in the barrier; + // wake up the other threads in the group + size_t nproc = this_thr->th.th_team_nproc; + size_t group_end = tid + b->threads_per_group; + if (nproc < group_end) + group_end = nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); + } + } else { // Primary thread + team = this_thr->th.th_team; + b = team->t.b; + my_current_iter = b->iter[tid].iter; + next_go = my_current_iter + distributedBarrier::MAX_ITERS; +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + // primary thread has ICVs in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + // Tell all the group leaders they can go! + for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) { + b->go[go_idx].go.store(next_go); + } + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Wake-up the group leaders + size_t nproc = this_thr->th.th_team_nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc, + b->threads_per_group, tid); + } + + // Tell all the threads in my group they can go! + for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) { + b->go[go_idx].go.store(next_go); + } + + // Fence added so that workers can see changes to go. sfence inadequate. 
+ KMP_MFENCE(); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Wake-up the other threads in my group + size_t nproc = this_thr->th.th_team_nproc; + size_t group_end = tid + b->threads_per_group; + if (nproc < group_end) + group_end = nproc; + __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid); + } + } + // Update to next iteration + KMP_ASSERT(my_current_iter == b->iter[tid].iter); + b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS; + + KA_TRACE( + 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// Linear Barrier +template +static bool __kmp_linear_barrier_gather_template( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); + kmp_team_t *team = this_thr->th.th_team; + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_info_t **other_threads = team->t.t_threads; + + KA_TRACE( + 20, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } +#endif + // We now perform a linear reduction to signal that all of the threads have + // arrived. + if (!KMP_MASTER_TID(tid)) { + KA_TRACE(20, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" + "arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), + team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to primary thread + /* After performing this write, a worker thread may not assume that the team + is valid any more - it could be deallocated by the primary thread at any + time. */ + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); + flag.release(); + } else { + kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; + int nproc = this_thr->th.th_team_nproc; + int i; + // Don't have to worry about sleep bit here or atomic since team setting + kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; + + // Collect all the worker team member threads. + for (i = 1; i < nproc; ++i) { +#if KMP_CACHE_MANAGE + // Prefetch next thread's arrived count + if (i + 1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), + team->t.t_id, i, + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); + + // Wait for worker thread to arrive + if (cancellable) { + kmp_flag_64 flag( + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); + if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) + return true; + } else { + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, + new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + } +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and the other thread + // time to the thread. 
+ if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN( + this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), + team->t.t_id, i)); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[i]->th.th_local.reduce_data); + OMPT_REDUCTION_END; + } + } + // Don't have to worry about sleep bit here or atomic since team setting + team_bar->b_arrived = new_state; + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, + new_state)); + } + KA_TRACE( + 20, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + return false; +} + +template +static bool __kmp_linear_barrier_release_template( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_team_t *team; + + if (KMP_MASTER_TID(tid)) { + unsigned int i; + kmp_uint32 nproc = this_thr->th.th_team_nproc; + kmp_info_t **other_threads; + + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + other_threads = team->t.t_threads; + + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + + if (nproc > 1) { +#if KMP_BARRIER_ICV_PUSH + { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); + if (propagate_icvs) { + ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); + for (i = 1; i < nproc; ++i) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], + team, i, FALSE); + ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); + } + ngo_sync(); + } + } +#endif // KMP_BARRIER_ICV_PUSH + + // Now, release all of the worker threads + for (i = 1; i < nproc; ++i) { +#if KMP_CACHE_MANAGE + // Prefetch next thread's go flag + if (i + 1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE( + 20, + ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " + "go(%p): %u => %u\n", + gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, + team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); + kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]); + flag.release(); + } + } + } else { // Wait for the PRIMARY thread to release us + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + if (cancellable) { + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) + return true; + } else { + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + } +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is + // disabled) + itt_sync_obj = 
__kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return false; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return false; +// The worker thread may now assume that the team is valid. +#ifdef KMP_DEBUG + tid = __kmp_tid_from_gtid(gtid); + team = __kmp_threads[gtid]->th.th_team; +#endif + KMP_DEBUG_ASSERT(team != NULL); + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, + ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + KA_TRACE( + 20, + ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + return false; +} + +static void __kmp_linear_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + __kmp_linear_barrier_gather_template( + bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); +} + +static bool __kmp_linear_barrier_gather_cancellable( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + return __kmp_linear_barrier_gather_template( + bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); +} + +static void __kmp_linear_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + __kmp_linear_barrier_release_template( + bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); +} + +static bool __kmp_linear_barrier_release_cancellable( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + return __kmp_linear_barrier_release_template( + bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); +} + +// Tree barrier +static void __kmp_tree_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); + kmp_team_t *team = this_thr->th.th_team; + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_info_t **other_threads = team->t.t_threads; + kmp_uint32 nproc = this_thr->th.th_team_nproc; + kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; + kmp_uint32 branch_factor = 1 << branch_bits; + kmp_uint32 child; + kmp_uint32 child_tid; + kmp_uint64 new_state = 0; + + KA_TRACE( + 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } 
+#endif + // Perform tree gather to wait until all threads have arrived; reduce any + // required data as we go + child_tid = (tid << branch_bits) + 1; + if (child_tid < nproc) { + // Parent threads wait for all their children to arrive + new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + child = 1; + do { + kmp_info_t *child_thr = other_threads[child_tid]; + kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + // Prefetch next thread's arrived count + if (child + 1 <= branch_factor && child_tid + 1 < nproc) + KMP_CACHE_PREFETCH( + &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and a child time to + // the thread. + if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; + } + child++; + child_tid++; + } while (child <= branch_factor && child_tid < nproc); + } + + if (!KMP_MASTER_TID(tid)) { // Worker threads + kmp_int32 parent_tid = (tid - 1) >> branch_bits; + + KA_TRACE(20, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), + team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + + // Mark arrival to parent thread + /* After performing this write, a worker thread may not assume that the team + is valid any more - it could be deallocated by the primary thread at any + time. 
*/ + kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); + flag.release(); + } else { + // Need to update the team arrived pointer if we are the primary thread + if (nproc > 1) // New value was already computed above + team->t.t_bar[bt].b_arrived = new_state; + else + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE(20, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void __kmp_tree_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); + kmp_team_t *team; + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_uint32 nproc; + kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; + kmp_uint32 branch_factor = 1 << branch_bits; + kmp_uint32 child; + kmp_uint32 child_tid; + + // Perform a tree release for all of the threads that have been gathered + if (!KMP_MASTER_TID( + tid)) { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, + &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably (or + // ITTNOTIFY is disabled) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, + ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid, + team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. 
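
Editor's note: the tree gather/release pair above is driven purely by index arithmetic. With branch_factor = 1 << branch_bits, thread tid waits on children (tid << branch_bits) + 1 through (tid << branch_bits) + branch_factor (capped at nproc) and reports to parent (tid - 1) >> branch_bits. The standalone sketch below just prints that numbering; the helper names are invented and it is not taken from the runtime.

#include <cstdio>
#include <vector>

// Illustrative helpers (names invented for this sketch).
static int tree_parent(int tid, unsigned branch_bits) {
  return tid == 0 ? -1 : (tid - 1) >> branch_bits;
}

static std::vector<int> tree_children(int tid, unsigned branch_bits, int nproc) {
  unsigned branch_factor = 1u << branch_bits;
  std::vector<int> kids;
  int first = (tid << branch_bits) + 1;
  for (unsigned c = 1; c <= branch_factor && first + (int)c - 1 < nproc; ++c)
    kids.push_back(first + (int)c - 1);
  return kids;
}

int main() {
  // With branch_bits = 2 (branch_factor = 4) and 10 threads, thread 1's
  // children are 5..8 and its parent is 0, matching the loops above.
  const unsigned branch_bits = 2;
  const int nproc = 10;
  for (int tid = 0; tid < nproc; ++tid) {
    printf("tid %d: parent %d, children:", tid, tree_parent(tid, branch_bits));
    for (int c : tree_children(tid, branch_bits, nproc))
      printf(" %d", c);
    printf("\n");
  }
}
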
+ } else { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } + nproc = this_thr->th.th_team_nproc; + child_tid = (tid << branch_bits) + 1; + + if (child_tid < nproc) { + kmp_info_t **other_threads = team->t.t_threads; + child = 1; + // Parent threads release all their children + do { + kmp_info_t *child_thr = other_threads[child_tid]; + kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + // Prefetch next thread's go count + if (child + 1 <= branch_factor && child_tid + 1 < nproc) + KMP_CACHE_PREFETCH( + &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + +#if KMP_BARRIER_ICV_PUSH + { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, + team->t.t_threads[child_tid], team, + child_tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); + } + } +#endif // KMP_BARRIER_ICV_PUSH + KA_TRACE(20, + ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + kmp_flag_64<> flag(&child_bar->b_go, child_thr); + flag.release(); + child++; + child_tid++; + } while (child <= branch_factor && child_tid < nproc); + } + KA_TRACE( + 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// Hyper Barrier +static void __kmp_hyper_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); + kmp_team_t *team = this_thr->th.th_team; + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_info_t **other_threads = team->t.t_threads; + kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; + kmp_uint32 num_threads = this_thr->th.th_team_nproc; + kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; + kmp_uint32 branch_factor = 1 << branch_bits; + kmp_uint32 offset; + kmp_uint32 level; + + KA_TRACE( + 20, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } +#endif + /* Perform a hypercube-embedded tree gather to wait until all of the threads + have arrived, and reduce any required data as we go. */ + kmp_flag_64<> p_flag(&thr_bar->b_arrived); + for (level = 0, offset = 1; offset < num_threads; + level += branch_bits, offset <<= branch_bits) { + kmp_uint32 child; + kmp_uint32 child_tid; + + if (((tid >> level) & (branch_factor - 1)) != 0) { + kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); + + KMP_MB(); // Synchronize parent and child threads. 
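
Editor's note: the hypercube gather above picks partners per level with bit tests only. At a level where offset = 1 << level, a thread whose group bits ((tid >> level) & (branch_factor - 1)) are non-zero signals the thread obtained by clearing its low (level + branch_bits) bits and then drops out; threads whose group bits are zero stay on as parents for that level. The standalone sketch below only prints this schedule; it is illustrative, not part of the patch, and the helper name is invented.

#include <cstdio>

// Print who signals whom in the hypercube-embedded tree gather, following the
// same level/offset arithmetic as __kmp_hyper_barrier_gather (sketch only).
static void hyper_gather_schedule(unsigned num_threads, unsigned branch_bits) {
  unsigned branch_factor = 1u << branch_bits;
  for (unsigned tid = 0; tid < num_threads; ++tid) {
    for (unsigned level = 0, offset = 1; offset < num_threads;
         level += branch_bits, offset <<= branch_bits) {
      if (((tid >> level) & (branch_factor - 1)) != 0) {
        // This thread is a "child" at this level: it signals its parent and
        // stops participating in higher levels.
        unsigned parent = tid & ~((1u << (level + branch_bits)) - 1);
        printf("level %u: T#%u -> parent T#%u\n", level, tid, parent);
        break;
      }
      // Otherwise the thread stays in as a parent and waits for children
      // tid + (1 << level), tid + 2*(1 << level), ... at this level.
    }
  }
}

int main() {
  // branch_bits = 1 gives the classic pairwise hypercube: 1->0 and 3->2 at
  // level 0, then 2->0 at level 1, and so on.
  hyper_gather_schedule(8, 1);
}
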
+ KA_TRACE(20, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), + team->t.t_id, parent_tid, &thr_bar->b_arrived, + thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to parent thread + /* After performing this write (in the last iteration of the enclosing for + loop), a worker thread may not assume that the team is valid any more + - it could be deallocated by the primary thread at any time. */ + p_flag.set_waiter(other_threads[parent_tid]); + p_flag.release(); + break; + } + + // Parent threads wait for children to arrive + if (new_state == KMP_BARRIER_UNUSED_STATE) + new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + for (child = 1, child_tid = tid + (1 << level); + child < branch_factor && child_tid < num_threads; + child++, child_tid += (1 << level)) { + kmp_info_t *child_thr = other_threads[child_tid]; + kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + kmp_uint32 next_child_tid = child_tid + (1 << level); + // Prefetch next thread's arrived count + if (child + 1 < branch_factor && next_child_tid < num_threads) + KMP_CACHE_PREFETCH( + &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); + c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + KMP_MB(); // Synchronize parent and child threads. +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and a child time to + // the thread. 
+ if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; + } + } + } + + if (KMP_MASTER_TID(tid)) { + // Need to update the team arrived pointer if we are the primary thread + if (new_state == KMP_BARRIER_UNUSED_STATE) + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + else + team->t.t_bar[bt].b_arrived = new_state; + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE( + 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// The reverse versions seem to beat the forward versions overall +#define KMP_REVERSE_HYPER_BAR +static void __kmp_hyper_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); + kmp_team_t *team; + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_info_t **other_threads; + kmp_uint32 num_threads; + kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; + kmp_uint32 branch_factor = 1 << branch_bits; + kmp_uint32 child; + kmp_uint32 child_tid; + kmp_uint32 offset; + kmp_uint32 level; + + /* Perform a hypercube-embedded tree release for all of the threads that have + been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads + are released in the reverse order of the corresponding gather, otherwise + threads are released in the same order. */ + if (KMP_MASTER_TID(tid)) { // primary thread + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { // primary already has ICVs in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + } else { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, + &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... 
+ __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, + ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + num_threads = this_thr->th.th_team_nproc; + other_threads = team->t.t_threads; + +#ifdef KMP_REVERSE_HYPER_BAR + // Count up to correct level for parent + for (level = 0, offset = 1; + offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); + level += branch_bits, offset <<= branch_bits) + ; + + // Now go down from there + for (level -= branch_bits, offset >>= branch_bits; offset != 0; + level -= branch_bits, offset >>= branch_bits) +#else + // Go down the tree, level by level + for (level = 0, offset = 1; offset < num_threads; + level += branch_bits, offset <<= branch_bits) +#endif // KMP_REVERSE_HYPER_BAR + { +#ifdef KMP_REVERSE_HYPER_BAR + /* Now go in reverse order through the children, highest to lowest. + Initial setting of child is conservative here. */ + child = num_threads >> ((level == 0) ? level : level - 1); + for (child = (child < branch_factor - 1) ? 
child : branch_factor - 1, + child_tid = tid + (child << level); + child >= 1; child--, child_tid -= (1 << level)) +#else + if (((tid >> level) & (branch_factor - 1)) != 0) + // No need to go lower than this, since this is the level parent would be + // notified + break; + // Iterate through children on this level of the tree + for (child = 1, child_tid = tid + (1 << level); + child < branch_factor && child_tid < num_threads; + child++, child_tid += (1 << level)) +#endif // KMP_REVERSE_HYPER_BAR + { + if (child_tid >= num_threads) + continue; // Child doesn't exist so keep going + else { + kmp_info_t *child_thr = other_threads[child_tid]; + kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + kmp_uint32 next_child_tid = child_tid - (1 << level); +// Prefetch next thread's go count +#ifdef KMP_REVERSE_HYPER_BAR + if (child - 1 >= 1 && next_child_tid < num_threads) +#else + if (child + 1 < branch_factor && next_child_tid < num_threads) +#endif // KMP_REVERSE_HYPER_BAR + KMP_CACHE_PREFETCH( + &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) // push my fixed ICVs to my child + copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); +#endif // KMP_BARRIER_ICV_PUSH + + KA_TRACE( + 20, + ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + kmp_flag_64<> flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + } +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs && + !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, + FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->th_fixed_icvs); + } +#endif + KA_TRACE( + 20, + ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// Hierarchical Barrier + +// Initialize thread barrier data +/* Initializes/re-initializes the hierarchical barrier data stored on a thread. + Performs the minimum amount of initialization required based on how the team + has changed. Returns true if leaf children will require both on-core and + traditional wake-up mechanisms. For example, if the team size increases, + threads already in the team will respond to on-core wakeup on their parent + thread, but threads newly added to the team will only be listening on the + their local b_go. 
*/ +static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, + kmp_bstate_t *thr_bar, + kmp_uint32 nproc, int gtid, + int tid, kmp_team_t *team) { + // Checks to determine if (re-)initialization is needed + bool uninitialized = thr_bar->team == NULL; + bool team_changed = team != thr_bar->team; + bool team_sz_changed = nproc != thr_bar->nproc; + bool tid_changed = tid != thr_bar->old_tid; + bool retval = false; + + if (uninitialized || team_sz_changed) { + __kmp_get_hierarchy(nproc, thr_bar); + } + + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->my_level = thr_bar->depth - 1; // default for primary thread + thr_bar->parent_tid = -1; // default for primary thread + if (!KMP_MASTER_TID(tid)) { + // if not primary thread, find parent thread in hierarchy + kmp_uint32 d = 0; + while (d < thr_bar->depth) { // find parent based on level of thread in + // hierarchy, and note level + kmp_uint32 rem; + if (d == thr_bar->depth - 2) { // reached level right below the primary + thr_bar->parent_tid = 0; + thr_bar->my_level = d; + break; + } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) { + // TODO: can we make the above op faster? + // thread is not a subtree root at next level, so this is max + thr_bar->parent_tid = tid - rem; + thr_bar->my_level = d; + break; + } + ++d; + } + } + __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) / + (thr_bar->skip_per_level[thr_bar->my_level])), + &(thr_bar->offset)); + thr_bar->old_tid = tid; + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + thr_bar->team = team; + thr_bar->parent_bar = + &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; + } + if (uninitialized || team_changed || tid_changed) { + thr_bar->team = team; + thr_bar->parent_bar = + &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; + retval = true; + } + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->nproc = nproc; + thr_bar->leaf_kids = thr_bar->base_leaf_kids; + if (thr_bar->my_level == 0) + thr_bar->leaf_kids = 0; + if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) + __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids)); + thr_bar->leaf_state = 0; + for (int i = 0; i < thr_bar->leaf_kids; ++i) + ((char *)&(thr_bar->leaf_state))[7 - i] = 1; + } + return retval; +} + +static void __kmp_hierarchical_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); + kmp_team_t *team = this_thr->th.th_team; + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_uint32 nproc = this_thr->th.th_team_nproc; + kmp_info_t **other_threads = team->t.t_threads; + kmp_uint64 new_state = 0; + + int level = team->t.t_level; + if (other_threads[0] + ->th.th_teams_microtask) // are we inside the teams construct? 
+ if (this_thr->th.th_teams_size.nteams > 1) + ++level; // level was not increased in teams construct for team_of_masters + if (level == 1) + thr_bar->use_oncore_barrier = 1; + else + thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested + + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); + } +#endif + + (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, + team); + + if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) + kmp_int32 child_tid; + new_state = + (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + thr_bar->use_oncore_barrier) { + if (thr_bar->leaf_kids) { + // First, wait for leaf children to check-in on my b_arrived flag + kmp_uint64 leaf_state = + KMP_MASTER_TID(tid) + ? thr_bar->b_arrived | thr_bar->leaf_state + : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " + "for leaf kids\n", + gtid, team->t.t_id, tid)); + kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (reduce) { + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; + for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; + ++child_tid) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " + "T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[child_tid]->th.th_local.reduce_data); + } + OMPT_REDUCTION_END; + } + // clear leaf_state bits + KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)); + } + // Next, wait for higher level children on each child's b_arrived flag + for (kmp_uint32 d = 1; d < thr_bar->my_level; + ++d) { // gather lowest level threads first, but skip 0 + kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], + skip = thr_bar->skip_per_level[d]; + if (last > nproc) + last = nproc; + for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { + kmp_info_t *child_thr = other_threads[child_tid]; + kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " + "T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " + "T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + } + } + } else { // Blocktime is not infinite + for (kmp_uint32 d = 0; d < thr_bar->my_level; + ++d) { // Gather lowest level threads first + kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], + skip = thr_bar->skip_per_level[d]; + if (last > nproc) + last = nproc; + for 
(child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { + kmp_info_t *child_thr = other_threads[child_tid]; + kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " + "T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64<> flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " + "T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + } + } + } + } + } + // All subordinates are gathered; now release parent if not primary thread + + if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" + " T#%d(%d:%d) arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, + thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + /* Mark arrival to parent: After performing this write, a worker thread may + not assume that the team is valid any more - it could be deallocated by + the primary thread at any time. */ + if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || + !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived + // flag; release it + kmp_flag_64<> flag(&thr_bar->b_arrived, + other_threads[thr_bar->parent_tid]); + flag.release(); + } else { + // Leaf does special release on "offset" bits of parent's b_arrived flag + thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, + thr_bar->offset + 1); + flag.set_waiter(other_threads[thr_bar->parent_tid]); + flag.release(); + } + } else { // Primary thread needs to update the team's b_arrived value + team->t.t_bar[bt].b_arrived = new_state; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + // Is the team access below unsafe or just technically invalid? 
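
Editor's note: in the infinite-blocktime/on-core case above, leaf check-in is plain byte flags packed into the parent's 64-bit b_arrived word. Each leaf owns one byte (index 7 - i, the same layout used for leaf_state and thr_bar->offset); the parent waits until all of its leaf bytes are set and then clears them atomically, which is what the kmp_flag_64 wait followed by KMP_TEST_THEN_AND64 does. The sketch below shows only the encoding, using std::atomic rather than the runtime's flag classes; it is illustrative and not part of this patch.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Sketch of the leaf_state byte-flag idea used by the hierarchical barrier:
// leaf i of a parent sets byte (7 - i) of the parent's 64-bit arrived word.
static std::atomic<uint64_t> parent_arrived{0};

static uint64_t leaf_mask(int num_leaf_kids) {
  uint64_t mask = 0;
  for (int i = 0; i < num_leaf_kids; ++i)
    ((unsigned char *)&mask)[7 - i] = 1; // same byte layout as leaf_state
  return mask;
}

static void leaf_checkin(int leaf_index) {
  // Each leaf touches only its own byte, so check-ins never collide.
  uint64_t mine = 0;
  ((unsigned char *)&mine)[7 - leaf_index] = 1;
  parent_arrived.fetch_or(mine, std::memory_order_release);
}

int main() {
  const int kids = 3;
  const uint64_t expected = leaf_mask(kids);
  for (int i = 0; i < kids; ++i)
    leaf_checkin(i);
  // Parent side: wait until every leaf byte is set, then clear them,
  // mirroring the kmp_flag_64 wait + KMP_TEST_THEN_AND64 pair above.
  while ((parent_arrived.load(std::memory_order_acquire) & expected) !=
         expected) {
  }
  parent_arrived.fetch_and(~expected, std::memory_order_acq_rel);
  printf("all %d leaves checked in; arrived word reset to %llu\n", kids,
         (unsigned long long)parent_arrived.load());
}
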
+ KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void __kmp_hierarchical_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); + kmp_team_t *team; + kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + kmp_uint32 nproc; + bool team_change = false; // indicates on-core barrier shouldn't be used + + if (KMP_MASTER_TID(tid)) { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " + "entered barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } else { // Worker threads + // Wait for parent thread to release me + if (!thr_bar->use_oncore_barrier || + __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || + thr_bar->team == NULL) { + // Use traditional method of waiting on my own b_go flag + thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; + kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + TCW_8(thr_bar->b_go, + KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } else { // Thread barrier data is initialized, this is a leaf, blocktime is + // infinite, not nested + // Wait on my "offset" bits on parent's b_go flag + thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, + thr_bar->offset + 1, bt, + this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); + flag.wait(this_thr, TRUE); + if (thr_bar->wait_flag == + KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go + TCW_8(thr_bar->b_go, + KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } else { // Reset my bits on parent's b_go flag + (RCAST(volatile char *, + &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; + } + } + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + KA_TRACE( + 20, + ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + + nproc = this_thr->th.th_team_nproc; + int level = team->t.t_level; + if (team->t.t_threads[0] + ->th.th_teams_microtask) { // are we inside the teams construct? + if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && + this_thr->th.th_teams_level == level) + ++level; // level was not increased in teams construct for team_of_workers + if (this_thr->th.th_teams_size.nteams > 1) + ++level; // level was not increased in teams construct for team_of_masters + } + if (level == 1) + thr_bar->use_oncore_barrier = 1; + else + thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested + + // If the team size has increased, we still communicate with old leaves via + // oncore barrier. 
+ unsigned short int old_leaf_kids = thr_bar->leaf_kids; + kmp_uint64 old_leaf_state = thr_bar->leaf_state; + team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, + tid, team); + // But if the entire team changes, we won't use oncore barrier at all + if (team_change) + old_leaf_kids = 0; + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, + FALSE); + if (KMP_MASTER_TID( + tid)) { // primary already has copy in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + thr_bar->use_oncore_barrier) { // optimization for inf blocktime + if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) + // leaves (on-core children) pull parent's fixed ICVs directly to local + // ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); + // non-leaves will get ICVs piggybacked with b_go via NGO store + } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs + if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can + // access + copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); + else // leaves copy parent's fixed ICVs directly to local ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); + } + } +#endif // KMP_BARRIER_ICV_PUSH + + // Now, release my children + if (thr_bar->my_level) { // not a leaf + kmp_int32 child_tid; + kmp_uint32 last; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + thr_bar->use_oncore_barrier) { + if (KMP_MASTER_TID(tid)) { // do a flat release + // Set local b_go to bump children via NGO store of the cache line + // containing IVCs and b_go. + thr_bar->b_go = KMP_BARRIER_STATE_BUMP; + // Use ngo stores if available; b_go piggybacks in the last 8 bytes of + // the cache line + ngo_load(&thr_bar->th_fixed_icvs); + // This loops over all the threads skipping only the leaf nodes in the + // hierarchy + for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; + child_tid += thr_bar->skip_per_level[1]) { + kmp_bstate_t *child_bar = + &team->t.t_threads[child_tid]->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " + "releasing T#%d(%d:%d)" + " go(%p): %u => %u\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Use ngo store (if available) to both store ICVs and release child + // via child's b_go + ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); + } + ngo_sync(); + } + TCW_8(thr_bar->b_go, + KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + // Now, release leaf children + if (thr_bar->leaf_kids) { // if there are any + // We test team_change on the off-chance that the level 1 team changed. 
+ if (team_change ||
+ old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
+ if (old_leaf_kids) { // release old leaf kids
+ thr_bar->b_go |= old_leaf_state;
+ }
+ // Release new leaf kids
+ last = tid + thr_bar->skip_per_level[1];
+ if (last > nproc)
+ last = nproc;
+ for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
+ ++child_tid) { // skip_per_level[0]=1
+ kmp_info_t *child_thr = team->t.t_threads[child_tid];
+ kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+ KA_TRACE(
+ 20,
+ ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
+ " T#%d(%d:%d) go(%p): %u => %u\n",
+ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+ team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
+ child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+ // Release child using child's b_go flag
+ kmp_flag_64<> flag(&child_bar->b_go, child_thr);
+ flag.release();
+ }
+ } else { // Release all children at once with leaf_state bits on my own
+ // b_go flag
+ thr_bar->b_go |= thr_bar->leaf_state;
+ }
+ }
+ } else { // Blocktime is not infinite; do a simple hierarchical release
+ for (int d = thr_bar->my_level - 1; d >= 0;
+ --d) { // Release highest level threads first
+ last = tid + thr_bar->skip_per_level[d + 1];
+ kmp_uint32 skip = thr_bar->skip_per_level[d];
+ if (last > nproc)
+ last = nproc;
+ for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
+ kmp_info_t *child_thr = team->t.t_threads[child_tid];
+ kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+ KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
+ "releasing T#%d(%d:%d) go(%p): %u => %u\n",
+ gtid, team->t.t_id, tid,
+ __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+ child_tid, &child_bar->b_go, child_bar->b_go,
+ child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+ // Release child using child's b_go flag
+ kmp_flag_64<> flag(&child_bar->b_go, child_thr);
+ flag.release();
+ }
+ }
+ }
+#if KMP_BARRIER_ICV_PUSH
+ if (propagate_icvs && !KMP_MASTER_TID(tid))
+ // non-leaves copy ICVs from fixed ICVs to local dest
+ copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+ &thr_bar->th_fixed_icvs);
+#endif // KMP_BARRIER_ICV_PUSH
+ }
+ KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
+ "barrier type %d\n",
+ gtid, team->t.t_id, tid, bt));
+}
+
+// End of Barrier Algorithms
+
+// type traits for cancellable value
+// if cancellable is true, then is_cancellable is a normal boolean variable
+// if cancellable is false, then is_cancellable is a compile time constant
+template <bool cancellable> struct is_cancellable {};
+template <> struct is_cancellable<true> {
+ bool value;
+ is_cancellable() : value(false) {}
+ is_cancellable(bool b) : value(b) {}
+ is_cancellable &operator=(bool b) {
+ value = b;
+ return *this;
+ }
+ operator bool() const { return value; }
+};
+template <> struct is_cancellable<false> {
+ is_cancellable &operator=(bool b) { return *this; }
+ constexpr operator bool() const { return false; }
+};
+
+// Internal function to do a barrier.
+/* If is_split is true, do a split barrier, otherwise, do a plain barrier
+ If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
+ barrier
+ When cancellable = false,
+ Returns 0 if primary thread, 1 if worker thread.
+ When cancellable = true
+ Returns 0 if not cancelled, 1 if cancelled.
+*/
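
Editor's note: the is_cancellable specializations above are a compile-time switch. The true case stores a real bool, while the false case converts to a constant false, so in the default instantiation of __kmp_barrier_template every test of `cancelled` folds away and the cancellation bookkeeping costs nothing. A minimal standalone illustration of the same pattern follows, with hypothetical names that are not part of the runtime.

#include <cstdio>

// Same idea as is_cancellable<true>/<false>: a bool that only exists when the
// template parameter says it can ever be set.
template <bool enabled> struct maybe_flag {};
template <> struct maybe_flag<true> {
  bool value = false;
  maybe_flag &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct maybe_flag<false> {
  maybe_flag &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};

template <bool cancellable = false> static int wait_at_barrier(bool request) {
  maybe_flag<cancellable> cancelled;
  cancelled = request; // a no-op store in the <false> instantiation
  if (cancelled)       // constant-folded to false when cancellable == false
    return 1;
  return 0;
}

int main() {
  printf("non-cancellable: %d\n", wait_at_barrier(true));       // always 0
  printf("cancellable:     %d\n", wait_at_barrier<true>(true)); // 1
}
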
+template <bool cancellable = false>
+static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
+ size_t reduce_size, void *reduce_data,
+ void (*reduce)(void *, void *)) {
+ KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
+ KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
+ int tid = __kmp_tid_from_gtid(gtid);
+ kmp_info_t *this_thr = __kmp_threads[gtid];
+ kmp_team_t *team = this_thr->th.th_team;
+ int status = 0;
+ is_cancellable<cancellable> cancelled;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+ ompt_data_t *my_task_data;
+ ompt_data_t *my_parallel_data;
+ void *return_address;
+ ompt_sync_region_t barrier_kind;
+#endif
+
+ KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
+ __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+ my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+ my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+ return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+ barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
+ if (ompt_enabled.ompt_callback_sync_region) {
+ ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+ barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
+ return_address);
+ }
+ if (ompt_enabled.ompt_callback_sync_region_wait) {
+ ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+ barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
+ return_address);
+ }
+#endif
+ // It is OK to report the barrier state after the barrier begin callback.
+ // According to the OMPT specification, a compliant implementation may
+ // even delay reporting this state until the barrier begins to wait.
+ this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+ }
+#endif
+
+ if (!team->t.t_serialized) {
+#if USE_ITT_BUILD
+ // This value will be used in itt notify events below.
+ void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+ if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+ itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
+#endif
+#endif /* USE_ITT_BUILD */
+ if (__kmp_tasking_mode == tskm_extra_barrier) {
+ __kmp_tasking_barrier(team, this_thr, gtid);
+ KA_TRACE(15,
+ ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
+ __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
+ }
+
+ /* Copy the blocktime info to the thread, where __kmp_wait_template() can
+ access it when the team struct is not guaranteed to exist. */
+ // See note about the corresponding code in __kmp_join_barrier() being
+ // performance-critical.
+ if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+#if KMP_USE_MONITOR
+ this_thr->th.th_team_bt_intervals =
+ team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
+ this_thr->th.th_team_bt_set =
+ team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
+#else
+ this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
+#endif
+ }
+
+#if USE_ITT_BUILD
+ if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+ __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+#if USE_DEBUGGER
+ // Let the debugger know: the thread arrived to the barrier and waiting.
+ if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct + team->t.t_bar[bt].b_master_arrived += 1; + } else { + this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; + } // if +#endif /* USE_DEBUGGER */ + if (reduce != NULL) { + // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 + this_thr->th.th_local.reduce_data = reduce_data; + } + + if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) + // use 0 to only setup the current team if nthreads > 1 + __kmp_task_team_setup(this_thr, team, 0); + + if (cancellable) { + cancelled = __kmp_linear_barrier_gather_cancellable( + bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + } else { + switch (__kmp_barrier_gather_pattern[bt]) { + case bp_dist_bar: { + __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hyper_bar: { + // don't set branch bits to 0; use linear + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); + __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather( + bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + // don't set branch bits to 0; use linear + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); + __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } + } + + KMP_MB(); + + if (KMP_MASTER_TID(tid)) { + status = 0; + if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { + __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); + } +#if USE_DEBUGGER + // Let the debugger know: All threads are arrived and starting leaving the + // barrier. + team->t.t_bar[bt].b_team_arrived += 1; +#endif + + if (__kmp_omp_cancellation) { + kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); + // Reset cancellation flag for worksharing constructs + if (cancel_request == cancel_loop || + cancel_request == cancel_sections) { + KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); + } + } +#if USE_ITT_BUILD + /* TODO: In case of split reduction barrier, primary thread may send + acquired event early, before the final summation into the shared + variable is done (final summation can be a long operation for array + reductions). 
*/ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier - report frame end (only if active_level == 1) + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode && + (this_thr->th.th_teams_microtask == NULL || // either not in teams + this_thr->th.th_teams_size.nteams == 1) && // or inside single team + team->t.t_active_level == 1) { + ident_t *loc = __kmp_threads[gtid]->th.th_ident; + kmp_uint64 cur_time = __itt_get_timestamp(); + kmp_info_t **other_threads = team->t.t_threads; + int nproc = this_thr->th.th_team_nproc; + int i; + switch (__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + case 2: // AC 2015-01-19: currently does not work for hierarchical (to + // be fixed) + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, + 1, loc, nproc); + break; + case 3: + if (__itt_metadata_add_ptr) { + // Initialize with primary thread's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + // Set arrive time to zero to be able to check it in + // __kmp_invoke_task(); the same is done inside the loop below + this_thr->th.th_bar_arrive_time = 0; + for (i = 1; i < nproc; ++i) { + delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); + other_threads[i]->th.th_bar_arrive_time = 0; + } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, + cur_time, delta, + (kmp_uint64)(reduce != NULL)); + } + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } + } +#endif /* USE_ITT_BUILD */ + } else { + status = 1; +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + } + if ((status == 1 || !is_split) && !cancelled) { + if (cancellable) { + cancelled = __kmp_linear_barrier_release_cancellable( + bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + } else { + switch (__kmp_barrier_release_pattern[bt]) { + case bp_dist_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_dist_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release( + bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_tree_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } + } + if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { + __kmp_task_team_sync(this_thr, team); + } + } + +#if USE_ITT_BUILD + /* GEH: TODO: Move this under if-condition above and also include in + __kmp_end_split_barrier(). This will more accurately represent the actual + release time of the threads for split barriers. */ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + } else { // Team is serialized. 
+ status = 0;
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ if (this_thr->th.th_task_team != NULL) {
+#if USE_ITT_NOTIFY
+ void *itt_sync_obj = NULL;
+ if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+ itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
+ __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+ }
+#endif
+
+ KMP_DEBUG_ASSERT(
+ this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
+ this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
+ TRUE);
+ __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
+ __kmp_task_team_setup(this_thr, team, 0);
+
+#if USE_ITT_BUILD
+ if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+ __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+ }
+ }
+ }
+ KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
+ gtid, __kmp_team_from_gtid(gtid)->t.t_id,
+ __kmp_tid_from_gtid(gtid), status));
+
+#if OMPT_SUPPORT
+ if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+ if (ompt_enabled.ompt_callback_sync_region_wait) {
+ ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+ barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
+ return_address);
+ }
+ if (ompt_enabled.ompt_callback_sync_region) {
+ ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+ barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
+ return_address);
+ }
+#endif
+ this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+ }
+#endif
+
+ if (cancellable)
+ return (int)cancelled;
+ return status;
+}
+
+// Returns 0 if primary thread, 1 if worker thread.
+int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
+ size_t reduce_size, void *reduce_data,
+ void (*reduce)(void *, void *)) {
+ return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
+ reduce);
+}
+
+#if defined(KMP_GOMP_COMPAT)
+// Returns 1 if cancelled, 0 otherwise
+int __kmp_barrier_gomp_cancel(int gtid) {
+ if (__kmp_omp_cancellation) {
+ int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
+ 0, NULL, NULL);
+ if (cancelled) {
+ int tid = __kmp_tid_from_gtid(gtid);
+ kmp_info_t *this_thr = __kmp_threads[gtid];
+ if (KMP_MASTER_TID(tid)) {
+ // Primary thread does not need to revert anything
+ } else {
+ // Workers need to revert their private b_arrived flag
+ this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
+ KMP_BARRIER_STATE_BUMP;
+ }
+ }
+ return cancelled;
+ }
+ __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+ return FALSE;
+}
+#endif
+
+void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
+ KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
+ KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
+ KMP_DEBUG_ASSERT(bt < bs_last_barrier);
+ int tid = __kmp_tid_from_gtid(gtid);
+ kmp_info_t *this_thr = __kmp_threads[gtid];
+ kmp_team_t *team = this_thr->th.th_team;
+
+ if (!team->t.t_serialized) {
+ if (KMP_MASTER_GTID(gtid)) {
+ switch (__kmp_barrier_release_pattern[bt]) {
+ case bp_dist_bar: {
+ __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
+ FALSE USE_ITT_BUILD_ARG(NULL));
+ break;
+ }
+ case bp_hyper_bar: {
+ KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+ __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
+ FALSE USE_ITT_BUILD_ARG(NULL));
+ break;
+ }
+ case bp_hierarchical_bar: {
+ __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
+ FALSE USE_ITT_BUILD_ARG(NULL));
+ break;
+ }
+ case bp_tree_bar: {
+ KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+ __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + } + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } // if + } + } +} + +void __kmp_join_barrier(int gtid) { + KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); + KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); + + KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); + + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team; + int tid; +#ifdef KMP_DEBUG + int team_id; +#endif /* KMP_DEBUG */ +#if USE_ITT_BUILD + void *itt_sync_obj = NULL; +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need + // Get object created at fork_barrier + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); +#endif +#endif /* USE_ITT_BUILD */ +#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG) + int nproc = this_thr->th.th_team_nproc; +#endif + KMP_MB(); + + // Get current info + team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(nproc == team->t.t_nproc); + tid = __kmp_tid_from_gtid(gtid); +#ifdef KMP_DEBUG + team_id = team->t.t_id; + kmp_info_t *master_thread = this_thr->th.th_team_master; + if (master_thread != team->t.t_threads[0]) { + __kmp_print_structure(); + } +#endif /* KMP_DEBUG */ + KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); + KMP_MB(); + + // Verify state + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); + KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", + gtid, team_id, tid)); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { +#if OMPT_OPTIONAL + ompt_data_t *my_task_data; + ompt_data_t *my_parallel_data; + void *codeptr = NULL; + int ds_tid = this_thr->th.th_info.ds.ds_tid; + if (KMP_MASTER_TID(ds_tid) && + (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || + ompt_callbacks.ompt_callback(ompt_callback_sync_region))) + codeptr = team->t.ompt_team_info.master_return_address; + my_task_data = OMPT_CUR_TASK_DATA(this_thr); + my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); + if (ompt_enabled.ompt_callback_sync_region) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, + my_task_data, codeptr); + } + if (ompt_enabled.ompt_callback_sync_region_wait) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, + my_task_data, codeptr); + } + if (!KMP_MASTER_TID(ds_tid)) + this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); +#endif + this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit; + } +#endif + + if (__kmp_tasking_mode == tskm_extra_barrier) { + __kmp_tasking_barrier(team, this_thr, gtid); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", + gtid, team_id, tid)); + } +#ifdef KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec) { + KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " + "%p, th_task_team = %p\n", + __kmp_gtid_from_thread(this_thr), team_id, + team->t.t_task_team[this_thr->th.th_task_state], + this_thr->th.th_task_team)); + if (this_thr->th.th_task_team) + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == + team->t.t_task_team[this_thr->th.th_task_state]); + } +#endif /* KMP_DEBUG */ + + /* Copy the blocktime info to 
the thread, where __kmp_wait_template() can + access it when the team struct is not guaranteed to exist. Doing these + loads causes a cache miss slows down EPCC parallel by 2x. As a workaround, + we do not perform the copy if blocktime=infinite, since the values are not + used by __kmp_wait_template() in that case. */ + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { +#if KMP_USE_MONITOR + this_thr->th.th_team_bt_intervals = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; +#else + this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); +#endif + } + +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_starting(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + + switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { + case bp_dist_bar: { + __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } + + /* From this point on, the team data structure may be deallocated at any time + by the primary thread - it is unsafe to reference it in any of the worker + threads. Any per-team data items that need to be referenced before the + end of the barrier should be moved to the kmp_task_team_t structs. */ + if (KMP_MASTER_TID(tid)) { + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); + } + if (__kmp_display_affinity) { + KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); + } +#if KMP_STATS_ENABLED + // Have primary thread flag the workers to indicate they are now waiting for + // next parallel region, Also wake them up so they switch their timers to + // idle. 
+ for (int i = 0; i < team->t.t_nproc; ++i) { + kmp_info_t *team_thread = team->t.t_threads[i]; + if (team_thread == this_thr) + continue; + team_thread->th.th_stats->setIdleFlag(); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && + team_thread->th.th_sleep_loc != NULL) + __kmp_null_resume_wrapper(team_thread); + } +#endif +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Join barrier - report frame end + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode && + (this_thr->th.th_teams_microtask == NULL || // either not in teams + this_thr->th.th_teams_size.nteams == 1) && // or inside single team + team->t.t_active_level == 1) { + kmp_uint64 cur_time = __itt_get_timestamp(); + ident_t *loc = team->t.t_ident; + kmp_info_t **other_threads = team->t.t_threads; + switch (__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + break; + case 2: + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, + loc, nproc); + break; + case 3: + if (__itt_metadata_add_ptr) { + // Initialize with primary thread's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + // Set arrive time to zero to be able to check it in + // __kmp_invoke_task(); the same is done inside the loop below + this_thr->th.th_bar_arrive_time = 0; + for (int i = 1; i < nproc; ++i) { + delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); + other_threads[i]->th.th_bar_arrive_time = 0; + } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, + cur_time, delta, 0); + } + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } + } +#endif /* USE_ITT_BUILD */ + } +#if USE_ITT_BUILD + else { + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); + } +#endif /* USE_ITT_BUILD */ + +#if KMP_DEBUG + if (KMP_MASTER_TID(tid)) { + KA_TRACE( + 15, + ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", + gtid, team_id, tid, nproc)); + } +#endif /* KMP_DEBUG */ + + // TODO now, mark worker threads as done so they may be disbanded + KMP_MB(); // Flush all pending memory write invalidates. + KA_TRACE(10, + ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); + +} + +// TODO release worker threads' fork barriers as we are ready instead of all at +// once +void __kmp_fork_barrier(int gtid, int tid) { + KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); + KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; +#if USE_ITT_BUILD + void *itt_sync_obj = NULL; +#endif /* USE_ITT_BUILD */ +#ifdef KMP_DEBUG + if (team) + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, + (team != NULL) ? 
team->t.t_id : -1, tid)); +#endif + // th_team pointer only valid for primary thread here + if (KMP_MASTER_TID(tid)) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + // Create itt barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); + __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + +#ifdef KMP_DEBUG + KMP_DEBUG_ASSERT(team); + kmp_info_t **other_threads = team->t.t_threads; + int i; + + // Verify state + KMP_MB(); + + for (i = 1; i < team->t.t_nproc; ++i) { + KA_TRACE(500, + ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " + "== %u.\n", + gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, + team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, + other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); + KMP_DEBUG_ASSERT( + (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & + ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); + KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); + } +#endif + + if (__kmp_tasking_mode != tskm_immediate_exec) { + // 0 indicates setup current task team if nthreads > 1 + __kmp_task_team_setup(this_thr, team, 0); + } + + /* The primary thread may have changed its blocktime between join barrier + and fork barrier. Copy the blocktime info to the thread, where + __kmp_wait_template() can access it when the team struct is not + guaranteed to exist. */ + // See note about the corresponding code in __kmp_join_barrier() being + // performance-critical + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { +#if KMP_USE_MONITOR + this_thr->th.th_team_bt_intervals = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; +#else + this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); +#endif + } + } // primary thread + + switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { + case bp_dist_bar: { + __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(NULL)); + break; + } + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled && + this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { + int ds_tid = this_thr->th.th_info.ds.ds_tid; + ompt_data_t *task_data = (team) + ? OMPT_CUR_TASK_DATA(this_thr) + : &(this_thr->th.ompt_thread_info.task_data); + this_thr->th.ompt_thread_info.state = ompt_state_overhead; +#if OMPT_OPTIONAL + void *codeptr = NULL; + if (KMP_MASTER_TID(ds_tid) && + (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || + ompt_callbacks.ompt_callback(ompt_callback_sync_region))) + codeptr = team ? 
team->t.ompt_team_info.master_return_address : NULL; + if (ompt_enabled.ompt_callback_sync_region_wait) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, + codeptr); + } + if (ompt_enabled.ompt_callback_sync_region) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, + codeptr); + } +#endif + if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, task_data, 0, ds_tid, + ompt_task_implicit); // TODO: Can this be ompt_task_initial? + } + } +#endif + + // Early exit for reaping threads releasing forkjoin barrier + if (TCR_4(__kmp_global.g.g_done)) { + this_thr->th.th_task_team = NULL; + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); + } + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); + return; + } + + /* We can now assume that a valid team structure has been allocated by the + primary thread and propagated to all worker threads. The current thread, + however, may not be part of the team, so we can't blindly assume that the + team pointer is non-null. */ + team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + +#if KMP_BARRIER_ICV_PULL + /* Primary thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's + implicit task has this data before this function is called. We cannot + modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's + thread struct, because it is not always the case that the threads arrays + have been allocated when __kmp_fork_call() is executed. */ + { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); + if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs + // Copy the initial ICVs from the primary thread's thread struct to the + // implicit task for this tid. 
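// Editor's note (not part of this patch): under KMP_BARRIER_ICV_PULL the new
// ICVs were published once, into thread 0's
// th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, by __kmp_setup_icv_copy()
// later in this file; each worker copies them into its own implicit task
// here, after it has been released from the fork barrier, instead of the
// primary thread pushing a copy to every worker.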
+ KA_TRACE(10, + ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, + tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &team->t.t_threads[0] + ->th.th_bar[bs_forkjoin_barrier] + .bb.th_fixed_icvs); + } + } +#endif // KMP_BARRIER_ICV_PULL + + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } + +#if KMP_AFFINITY_SUPPORTED + kmp_proc_bind_t proc_bind = team->t.t_proc_bind; + if (proc_bind == proc_bind_intel) { + // Call dynamic affinity settings + if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) { + __kmp_balanced_affinity(this_thr, team->t.t_nproc); + } + } else if (proc_bind != proc_bind_false) { + if (this_thr->th.th_new_place == this_thr->th.th_current_place) { + KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", + __kmp_gtid_from_thread(this_thr), + this_thr->th.th_current_place)); + } else { + __kmp_affinity_bind_place(gtid); + } + } +#endif // KMP_AFFINITY_SUPPORTED + // Perform the display affinity functionality + if (__kmp_display_affinity) { + if (team->t.t_display_affinity +#if KMP_AFFINITY_SUPPORTED + || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) +#endif + ) { + // NULL means use the affinity-format-var ICV + __kmp_aux_display_affinity(gtid, NULL); + this_thr->th.th_prev_num_threads = team->t.t_nproc; + this_thr->th.th_prev_level = team->t.t_level; + } + } + if (!KMP_MASTER_TID(tid)) + KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + // Get correct barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired + } // (prepare called inside barrier_release) + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, + team->t.t_id, tid)); +} + +void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, + kmp_internal_control_t *new_icvs, ident_t *loc) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); + + KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); + KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); + +/* Primary thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's + implicit task has this data before this function is called. */ +#if KMP_BARRIER_ICV_PULL + /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which + remains untouched), where all of the worker threads can access them and + make their own copies after the barrier. */ + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be + // allocated at this point + copy_icvs( + &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, + new_icvs); + KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, + team->t.t_threads[0], team)); +#elif KMP_BARRIER_ICV_PUSH + // The ICVs will be propagated in the fork barrier, so nothing needs to be + // done here. + KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, + team->t.t_threads[0], team)); +#else + // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) + // time. 
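// Editor's note (not part of this patch): this is the fallback strategy. The
// KMP_BARRIER_ICV_PULL branch above publishes the ICVs once into
// th_fixed_icvs and lets the workers copy them after the fork barrier, the
// KMP_BARRIER_ICV_PUSH branch defers propagation to the fork-barrier release
// path, and only this branch has the primary thread write every worker's
// implicit task directly, which is why it is the one that costs O(nthreads)
// here.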
+ ngo_load(new_icvs); + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be + // allocated at this point + for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread + // TODO: GEH - pass in better source location info since usually NULL here + KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", + f, team->t.t_threads[f], team)); + __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); + ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); + KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", + f, team->t.t_threads[f], team)); + } + ngo_sync(); +#endif // KMP_BARRIER_ICV_PULL +} diff --git a/third_party/openmp/kmp_barrier.h b/third_party/openmp/kmp_barrier.h new file mode 100644 index 000000000..ae9b8d62f --- /dev/null +++ b/third_party/openmp/kmp_barrier.h @@ -0,0 +1,144 @@ +/* + * kmp_barrier.h + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_BARRIER_H +#define KMP_BARRIER_H + +#include "kmp.h" +#include "kmp_i18n.h" + +#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC +#include +#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment) +#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr) +#elif KMP_HAVE_ALIGNED_ALLOC +#define KMP_ALGIN_UP(val, alignment) \ + (((val) + (alignment)-1) / (alignment) * (alignment)) +#define KMP_ALIGNED_ALLOCATE(size, alignment) \ + aligned_alloc(alignment, KMP_ALGIN_UP(size, alignment)) +#define KMP_ALIGNED_FREE(ptr) free(ptr) +#elif KMP_HAVE_POSIX_MEMALIGN +static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) { + void *ptr; + int n = posix_memalign(&ptr, alignment, size); + if (n != 0) { + if (ptr) + free(ptr); + return nullptr; + } + return ptr; +} +#define KMP_ALIGNED_FREE(ptr) free(ptr) +#elif KMP_HAVE__ALIGNED_MALLOC +#include +#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment) +#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size) +#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr) +#endif + +// Use four cache lines: MLC tends to prefetch the next or previous cache line +// creating a possible fake conflict between cores, so this is the only way to +// guarantee that no such prefetch can happen. 
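Editor's illustration (not part of this patch): a minimal standalone sketch of the padding idea behind the KMP_FOURLINE_ALIGN_CACHE macro defined just below, assuming a 64-byte cache line; the PaddedFlag name and the fixed line size are inventions of this sketch only. It also hints at why the allocation ladder above rounds the size up in the aligned_alloc branch: aligned_alloc traditionally requires the size to be a multiple of the alignment, which is what KMP_ALGIN_UP provides.

#include <cstddef>
#include <new>

constexpr std::size_t kCacheLine = 64;        // assumed line size
constexpr std::size_t kPad = 4 * kCacheLine;  // same idea as KMP_FOURLINE_ALIGN_CACHE

struct alignas(kPad) PaddedFlag {
  volatile unsigned value;
  // alignas(256) also pads sizeof(PaddedFlag) to 256, so consecutive array
  // elements never share a cache line, nor the adjacent lines that the
  // hardware prefetcher may pull in alongside them.
};
static_assert(sizeof(PaddedFlag) % kPad == 0, "elements stay four lines apart");

int main() {
  // C++17 over-aligned new honours alignas(256); the runtime keeps its own
  // portability ladder (_mm_malloc / aligned_alloc / posix_memalign / ...)
  // because it cannot rely on that everywhere.
  PaddedFlag *flags = new PaddedFlag[8]();
  flags[3].value = 1;
  delete[] flags;
  return 0;
}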
+#ifndef KMP_FOURLINE_ALIGN_CACHE +#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE) +#endif + +#define KMP_OPTIMIZE_FOR_REDUCTIONS 0 + +class distributedBarrier { + struct flags_s { + kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed; + }; + + struct go_s { + std::atomic KMP_FOURLINE_ALIGN_CACHE go; + }; + + struct iter_s { + kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter; + }; + + struct sleep_s { + std::atomic KMP_FOURLINE_ALIGN_CACHE sleep; + }; + + void init(size_t nthr); + void resize(size_t nthr); + void computeGo(size_t n); + void computeVarsForN(size_t n); + +public: + enum { + MAX_ITERS = 3, + MAX_GOS = 8, + IDEAL_GOS = 4, + IDEAL_CONTENTION = 16, + }; + + flags_s *flags[MAX_ITERS]; + go_s *go; + iter_s *iter; + sleep_s *sleep; + + size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier + size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure + // number of go signals each requiring one write per iteration + size_t KMP_ALIGN_CACHE num_gos; + // number of groups of gos + size_t KMP_ALIGN_CACHE num_groups; + // threads per go signal + size_t KMP_ALIGN_CACHE threads_per_go; + bool KMP_ALIGN_CACHE fix_threads_per_go; + // threads per group + size_t KMP_ALIGN_CACHE threads_per_group; + // number of go signals in a group + size_t KMP_ALIGN_CACHE gos_per_group; + void *team_icvs; + + distributedBarrier() = delete; + ~distributedBarrier() = delete; + + // Used instead of constructor to create aligned data + static distributedBarrier *allocate(int nThreads) { + distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE( + sizeof(distributedBarrier), 4 * CACHE_LINE); + if (!d) { + KMP_FATAL(MemoryAllocFailed); + } + d->num_threads = 0; + d->max_threads = 0; + for (int i = 0; i < MAX_ITERS; ++i) + d->flags[i] = NULL; + d->go = NULL; + d->iter = NULL; + d->sleep = NULL; + d->team_icvs = NULL; + d->fix_threads_per_go = false; + // calculate gos and groups ONCE on base size + d->computeGo(nThreads); + d->init(nThreads); + return d; + } + + static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); } + + void update_num_threads(size_t nthr) { init(nthr); } + + bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); } + size_t get_num_threads() { return num_threads; } + kmp_uint64 go_release(); + void go_reset(); +}; + +#endif // KMP_BARRIER_H diff --git a/third_party/openmp/kmp_cancel.cpp b/third_party/openmp/kmp_cancel.cpp new file mode 100644 index 000000000..d1290493d --- /dev/null +++ b/third_party/openmp/kmp_cancel.cpp @@ -0,0 +1,331 @@ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_i18n.h" +#include "kmp_io.h" +#include "kmp_str.h" +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +/*! +@ingroup CANCELLATION +@param loc_ref location of the original task directive +@param gtid Global thread ID of encountering thread +@param cncl_kind Cancellation kind (parallel, for, sections, taskgroup) + +@return returns true if the cancellation request has been activated and the +execution thread needs to proceed to the end of the canceled region. + +Request cancellation of the binding OpenMP region. 
+*/ +kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + + KC_TRACE(10, ("__kmpc_cancel: T#%d request %d OMP_CANCELLATION=%d\n", gtid, + cncl_kind, __kmp_omp_cancellation)); + + KMP_DEBUG_ASSERT(cncl_kind != cancel_noreq); + KMP_DEBUG_ASSERT(cncl_kind == cancel_parallel || cncl_kind == cancel_loop || + cncl_kind == cancel_sections || + cncl_kind == cancel_taskgroup); + KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid); + + if (__kmp_omp_cancellation) { + switch (cncl_kind) { + case cancel_parallel: + case cancel_loop: + case cancel_sections: + // cancellation requests for parallel and worksharing constructs + // are handled through the team structure + { + kmp_team_t *this_team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(this_team); + kmp_int32 old = cancel_noreq; + this_team->t.t_cancel_request.compare_exchange_strong(old, cncl_kind); + if (old == cancel_noreq || old == cncl_kind) { +// we do not have a cancellation request in this team or we do have +// one that matches the current request -> cancel +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_cancel) { + ompt_data_t *task_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, + NULL); + ompt_cancel_flag_t type = ompt_cancel_parallel; + if (cncl_kind == cancel_parallel) + type = ompt_cancel_parallel; + else if (cncl_kind == cancel_loop) + type = ompt_cancel_loop; + else if (cncl_kind == cancel_sections) + type = ompt_cancel_sections; + ompt_callbacks.ompt_callback(ompt_callback_cancel)( + task_data, type | ompt_cancel_activated, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL + return 1 /* true */; + } + break; + } + case cancel_taskgroup: + // cancellation requests for a task group + // are handled through the taskgroup structure + { + kmp_taskdata_t *task; + kmp_taskgroup_t *taskgroup; + + task = this_thr->th.th_current_task; + KMP_DEBUG_ASSERT(task); + + taskgroup = task->td_taskgroup; + if (taskgroup) { + kmp_int32 old = cancel_noreq; + taskgroup->cancel_request.compare_exchange_strong(old, cncl_kind); + if (old == cancel_noreq || old == cncl_kind) { +// we do not have a cancellation request in this taskgroup or we do +// have one that matches the current request -> cancel +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_cancel) { + ompt_data_t *task_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, + NULL); + ompt_callbacks.ompt_callback(ompt_callback_cancel)( + task_data, ompt_cancel_taskgroup | ompt_cancel_activated, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + return 1 /* true */; + } + } else { + // TODO: what needs to happen here? + // the specification disallows cancellation w/o taskgroups + // so we might do anything here, let's abort for now + KMP_ASSERT(0 /* false */); + } + } + break; + default: + KMP_ASSERT(0 /* false */); + } + } + + // ICV OMP_CANCELLATION=false, so we ignored this cancel request + KMP_DEBUG_ASSERT(!__kmp_omp_cancellation); + return 0 /* false */; +} + +/*! +@ingroup CANCELLATION +@param loc_ref location of the original task directive +@param gtid Global thread ID of encountering thread +@param cncl_kind Cancellation kind (parallel, for, sections, taskgroup) + +@return returns true if a matching cancellation request has been flagged in the +RTL and the encountering thread has to cancel.. + +Cancellation point for the encountering thread. 
+*/ +kmp_int32 __kmpc_cancellationpoint(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 cncl_kind) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + + KC_TRACE(10, + ("__kmpc_cancellationpoint: T#%d request %d OMP_CANCELLATION=%d\n", + gtid, cncl_kind, __kmp_omp_cancellation)); + + KMP_DEBUG_ASSERT(cncl_kind != cancel_noreq); + KMP_DEBUG_ASSERT(cncl_kind == cancel_parallel || cncl_kind == cancel_loop || + cncl_kind == cancel_sections || + cncl_kind == cancel_taskgroup); + KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid); + + if (__kmp_omp_cancellation) { + switch (cncl_kind) { + case cancel_parallel: + case cancel_loop: + case cancel_sections: + // cancellation requests for parallel and worksharing constructs + // are handled through the team structure + { + kmp_team_t *this_team = this_thr->th.th_team; + KMP_DEBUG_ASSERT(this_team); + if (this_team->t.t_cancel_request) { + if (cncl_kind == this_team->t.t_cancel_request) { +// the request in the team structure matches the type of +// cancellation point so we can cancel +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_cancel) { + ompt_data_t *task_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, + NULL); + ompt_cancel_flag_t type = ompt_cancel_parallel; + if (cncl_kind == cancel_parallel) + type = ompt_cancel_parallel; + else if (cncl_kind == cancel_loop) + type = ompt_cancel_loop; + else if (cncl_kind == cancel_sections) + type = ompt_cancel_sections; + ompt_callbacks.ompt_callback(ompt_callback_cancel)( + task_data, type | ompt_cancel_detected, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + return 1 /* true */; + } + KMP_ASSERT(0 /* false */); + } else { + // we do not have a cancellation request pending, so we just + // ignore this cancellation point + return 0; + } + break; + } + case cancel_taskgroup: + // cancellation requests for a task group + // are handled through the taskgroup structure + { + kmp_taskdata_t *task; + kmp_taskgroup_t *taskgroup; + + task = this_thr->th.th_current_task; + KMP_DEBUG_ASSERT(task); + + taskgroup = task->td_taskgroup; + if (taskgroup) { +// return the current status of cancellation for the taskgroup +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_cancel && + !!taskgroup->cancel_request) { + ompt_data_t *task_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, + NULL); + ompt_callbacks.ompt_callback(ompt_callback_cancel)( + task_data, ompt_cancel_taskgroup | ompt_cancel_detected, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + return !!taskgroup->cancel_request; + } else { + // if a cancellation point is encountered by a task that does not + // belong to a taskgroup, it is OK to ignore it + return 0 /* false */; + } + } + default: + KMP_ASSERT(0 /* false */); + } + } + + // ICV OMP_CANCELLATION=false, so we ignore the cancellation point + KMP_DEBUG_ASSERT(!__kmp_omp_cancellation); + return 0 /* false */; +} + +/*! +@ingroup CANCELLATION +@param loc_ref location of the original task directive +@param gtid Global thread ID of encountering thread + +@return returns true if a matching cancellation request has been flagged in the +RTL and the encountering thread has to cancel.. + +Barrier with cancellation point to send threads from the barrier to the +end of the parallel region. Needs a special code pattern as documented +in the design document for the cancellation feature. 
+*/ +kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32 gtid) { + int ret = 0 /* false */; + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *this_team = this_thr->th.th_team; + + KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid); + + // call into the standard barrier + __kmpc_barrier(loc, gtid); + + // if cancellation is active, check cancellation flag + if (__kmp_omp_cancellation) { + // depending on which construct to cancel, check the flag and + // reset the flag + switch (KMP_ATOMIC_LD_RLX(&(this_team->t.t_cancel_request))) { + case cancel_parallel: + ret = 1; + // ensure that threads have checked the flag, when + // leaving the above barrier + __kmpc_barrier(loc, gtid); + this_team->t.t_cancel_request = cancel_noreq; + // the next barrier is the fork/join barrier, which + // synchronizes the threads leaving here + break; + case cancel_loop: + case cancel_sections: + ret = 1; + // ensure that threads have checked the flag, when + // leaving the above barrier + __kmpc_barrier(loc, gtid); + this_team->t.t_cancel_request = cancel_noreq; + // synchronize the threads again to make sure we do not have any run-away + // threads that cause a race on the cancellation flag + __kmpc_barrier(loc, gtid); + break; + case cancel_taskgroup: + // this case should not occur + KMP_ASSERT(0 /* false */); + break; + case cancel_noreq: + // do nothing + break; + default: + KMP_ASSERT(0 /* false */); + } + } + + return ret; +} + +/*! +@ingroup CANCELLATION +@param loc_ref location of the original task directive +@param gtid Global thread ID of encountering thread + +@return returns true if a matching cancellation request has been flagged in the +RTL and the encountering thread has to cancel.. + +Query function to query the current status of cancellation requests. +Can be used to implement the following pattern: + +if (kmp_get_cancellation_status(kmp_cancel_parallel)) { + perform_cleanup(); + #pragma omp cancellation point parallel +} +*/ +int __kmp_get_cancellation_status(int cancel_kind) { + if (__kmp_omp_cancellation) { + kmp_info_t *this_thr = __kmp_entry_thread(); + + switch (cancel_kind) { + case cancel_parallel: + case cancel_loop: + case cancel_sections: { + kmp_team_t *this_team = this_thr->th.th_team; + return this_team->t.t_cancel_request == cancel_kind; + } + case cancel_taskgroup: { + kmp_taskdata_t *task; + kmp_taskgroup_t *taskgroup; + task = this_thr->th.th_current_task; + taskgroup = task->td_taskgroup; + return taskgroup && taskgroup->cancel_request; + } + } + } + + return 0 /* false */; +} diff --git a/third_party/openmp/kmp_collapse.cpp b/third_party/openmp/kmp_collapse.cpp new file mode 100644 index 000000000..2c410ca9b --- /dev/null +++ b/third_party/openmp/kmp_collapse.cpp @@ -0,0 +1,1475 @@ +/* + * kmp_collapse.cpp -- loop collapse feature + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" +#include "kmp_stats.h" +#include "kmp_str.h" +#include "kmp_collapse.h" + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +// OMPTODO: different style of comments (see kmp_sched) +// OMPTODO: OMPT/OMPD + +// avoid inadevertently using a library based abs +template T __kmp_abs(const T val) { + return (val < 0) ? -val : val; +} +kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; } +kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; } + +//---------------------------------------------------------------------------- +// Common functions for working with rectangular and non-rectangular loops +//---------------------------------------------------------------------------- + +template int __kmp_sign(T val) { + return (T(0) < val) - (val < T(0)); +} + +template class CollapseAllocator { + typedef T *pT; + +private: + static const size_t allocaSize = 32; // size limit for stack allocations + // (8 bytes x 4 nested loops) + char stackAlloc[allocaSize]; + static constexpr size_t maxElemCount = allocaSize / sizeof(T); + pT pTAlloc; + +public: + CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast(stackAlloc)) { + if (n > maxElemCount) { + pTAlloc = reinterpret_cast(__kmp_allocate(n * sizeof(T))); + } + } + ~CollapseAllocator() { + if (pTAlloc != reinterpret_cast(stackAlloc)) { + __kmp_free(pTAlloc); + } + } + T &operator[](int index) { return pTAlloc[index]; } + operator const pT() { return pTAlloc; } +}; + +//----------Loop canonicalization--------------------------------------------- + +// For loop nest (any shape): +// convert != to < or >; +// switch from using < or > to <= or >=. +// "bounds" array has to be allocated per thread. +// All other internal functions will work only with canonicalized loops. +template +void kmp_canonicalize_one_loop_XX( + ident_t *loc, + /*in/out*/ bounds_infoXX_template *bounds) { + + if (__kmp_env_consistency_check) { + if (bounds->step == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, + loc); + } + } + + if (bounds->comparison == comparison_t::comp_not_eq) { + // We can convert this to < or >, depends on the sign of the step: + if (bounds->step > 0) { + bounds->comparison = comparison_t::comp_less; + } else { + bounds->comparison = comparison_t::comp_greater; + } + } + + if (bounds->comparison == comparison_t::comp_less) { + // Note: ub0 can be unsigned. Should be Ok to hit overflow here, + // because ub0 + ub1*j should be still positive (otherwise loop was not + // well formed) + bounds->ub0 -= 1; + bounds->comparison = comparison_t::comp_less_or_eq; + } else if (bounds->comparison == comparison_t::comp_greater) { + bounds->ub0 += 1; + bounds->comparison = comparison_t::comp_greater_or_eq; + } +} + +// Canonicalize loop nest. original_bounds_nest is an array of length n. 
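Editor's illustration (not part of this patch): a worked example of what kmp_canonicalize_one_loop_XX above does to a single "<" loop, together with the trip-count formula applied further down; SimpleBounds, canonicalize and trip_count are simplified stand-ins invented for this sketch, not the real bounds_infoXX_template API.

#include <cassert>
#include <cstdint>

// Simplified stand-in for one loop level (the real bounds_infoXX_template
// lives in kmp_collapse.h and carries more fields).
struct SimpleBounds {
  int64_t lb0, ub0, step;
  enum { kLess, kLessOrEq } cmp;
};

// Same rewrite as kmp_canonicalize_one_loop_XX for the "<" case with a
// positive step: i < ub becomes i <= ub - 1.
static void canonicalize(SimpleBounds &b) {
  if (b.cmp == SimpleBounds::kLess) {
    b.ub0 -= 1;
    b.cmp = SimpleBounds::kLessOrEq;
  }
}

// Same formula as kmp_calculate_trip_count_XX for the "<=" case.
static uint64_t trip_count(const SimpleBounds &b) {
  if (b.ub0 < b.lb0) return 0;
  return (uint64_t)(b.ub0 - b.lb0) / (uint64_t)b.step + 1;
}

int main() {
  // for (int i = 0; i < 10; i += 3)  ->  lb0 = 0, ub0 = 9, step = 3, "<="
  SimpleBounds b{0, 10, 3, SimpleBounds::kLess};
  canonicalize(b);
  assert(b.ub0 == 9 && b.cmp == SimpleBounds::kLessOrEq);
  assert(trip_count(b) == 4); // i = 0, 3, 6, 9
  return 0;
}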
+void kmp_canonicalize_loop_nest(ident_t *loc, + /*in/out*/ bounds_info_t *original_bounds_nest, + kmp_index_t n) { + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint32: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_int64: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint64: + kmp_canonicalize_one_loop_XX( + loc, + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + default: + KMP_ASSERT(false); + } + } +} + +//----------Calculating trip count on one level------------------------------- + +// Calculate trip count on this loop level. +// We do this either for a rectangular loop nest, +// or after an adjustment bringing the loops to a parallelepiped shape. +// This number should not depend on the value of outer IV +// even if the formular has lb1 and ub1. +// Note: for non-rectangular loops don't use span for this, it's too big. + +template +kmp_loop_nest_iv_t kmp_calculate_trip_count_XX( + /*in/out*/ bounds_infoXX_template *bounds) { + + if (bounds->comparison == comparison_t::comp_less_or_eq) { + if (bounds->ub0 < bounds->lb0) { + // Note: after this we don't need to calculate inner loops, + // but that should be an edge case: + bounds->trip_count = 0; + } else { + // ub - lb may exceed signed type range; we need to cast to + // kmp_loop_nest_iv_t anyway + bounds->trip_count = + static_cast(bounds->ub0 - bounds->lb0) / + __kmp_abs(bounds->step) + + 1; + } + } else if (bounds->comparison == comparison_t::comp_greater_or_eq) { + if (bounds->lb0 < bounds->ub0) { + // Note: after this we don't need to calculate inner loops, + // but that should be an edge case: + bounds->trip_count = 0; + } else { + // lb - ub may exceed signed type range; we need to cast to + // kmp_loop_nest_iv_t anyway + bounds->trip_count = + static_cast(bounds->lb0 - bounds->ub0) / + __kmp_abs(bounds->step) + + 1; + } + } else { + KMP_ASSERT(false); + } + return bounds->trip_count; +} + +// Calculate trip count on this loop level. +kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) { + + kmp_loop_nest_iv_t trip_count = 0; + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint32: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_int64: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + case loop_type_t::loop_type_uint64: + trip_count = kmp_calculate_trip_count_XX( + /*in/out*/ (bounds_infoXX_template *)(bounds)); + break; + default: + KMP_ASSERT(false); + } + + return trip_count; +} + +//----------Trim original iv according to its type---------------------------- + +// Trim original iv according to its type. +// Return kmp_uint64 value which can be easily used in all internal calculations +// And can be statically cast back to original type in user code. 
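Editor's illustration (not part of this patch): the widen-then-narrow round trip that kmp_fix_iv, defined just below, relies on, shown for the int8 case with plain standard types on a two's-complement target.

#include <cassert>
#include <cstdint>

int main() {
  // loop_type_int8 case of the helper below: narrow to the original type,
  // then sign-extend into the common kmp_uint64 carrier.
  int8_t original = -3;
  uint64_t carrier = (uint64_t)(int64_t)original;  // 0xFFFFFFFFFFFFFFFD
  // User code recovers the value with a plain narrowing cast.
  assert((int8_t)carrier == -3);
  // Arithmetic done on the carrier still lands on the right value after the
  // final narrowing cast, e.g. one more downward step:
  carrier -= 1;
  assert((int8_t)carrier == -4);
  return 0;
}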
+kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) { + kmp_uint64 res = 0; + + switch (loop_iv_type) { + case loop_type_t::loop_type_int8: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint8: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_int16: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint16: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_int32: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint32: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_int64: + res = static_cast(static_cast(original_iv)); + break; + case loop_type_t::loop_type_uint64: + res = static_cast(original_iv); + break; + default: + KMP_ASSERT(false); + } + + return res; +} + +//----------Compare two IVs (remember they have a type)----------------------- + +bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1, + kmp_uint64 original_iv2) { + bool res = false; + + switch (loop_iv_type) { + case loop_type_t::loop_type_int8: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint8: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_int16: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint16: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_int32: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint32: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_int64: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + case loop_type_t::loop_type_uint64: + res = static_cast(original_iv1) == + static_cast(original_iv2); + break; + default: + KMP_ASSERT(false); + } + + return res; +} + +//----------Calculate original iv on one level-------------------------------- + +// Return true if the point fits into upper bounds on this level, +// false otherwise +template +bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template *bounds, + const kmp_point_t original_ivs, + kmp_index_t ind) { + + T iv = static_cast(original_ivs[ind]); + T outer_iv = static_cast(original_ivs[bounds->outer_iv]); + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) { + // The calculated point is outside of loop upper boundary: + return false; + } + + return true; +} + +// Calculate one iv corresponding to iteration on the level ind. 
+// Return true if it fits into lower-upper bounds on this level +// (if not, we need to re-calculate) +template +bool kmp_calc_one_iv_XX(const bounds_infoXX_template *bounds, + /*in/out*/ kmp_point_t original_ivs, + const kmp_iterations_t iterations, kmp_index_t ind, + bool start_with_lower_bound, bool checkBounds) { + + kmp_uint64 temp = 0; + T outer_iv = static_cast(original_ivs[bounds->outer_iv]); + + if (start_with_lower_bound) { + // we moved to the next iteration on one of outer loops, should start + // with the lower bound here: + temp = bounds->lb0 + bounds->lb1 * outer_iv; + } else { + auto iteration = iterations[ind]; + temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step; + } + + // Now trim original iv according to its type: + original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); + + if (checkBounds) { + return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind); + } else { + return true; + } +} + +bool kmp_calc_one_iv(const bounds_info_t *bounds, + /*in/out*/ kmp_point_t original_ivs, + const kmp_iterations_t iterations, kmp_index_t ind, + bool start_with_lower_bound, bool checkBounds) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_one_iv_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, + checkBounds); + break; + default: + KMP_ASSERT(false); + return false; + } +} + +//----------Calculate original iv on one level for rectangular loop nest------ + +// Calculate one iv corresponding to iteration on the level ind. 
+// Return true if it fits into lower-upper bounds on this level +// (if not, we need to re-calculate) +template +void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template *bounds, + /*in/out*/ kmp_uint64 *original_ivs, + const kmp_iterations_t iterations, + kmp_index_t ind) { + + auto iteration = iterations[ind]; + + kmp_uint64 temp = + bounds->lb0 + + bounds->lb1 * static_cast(original_ivs[bounds->outer_iv]) + + iteration * bounds->step; + + // Now trim original iv according to its type: + original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); +} + +void kmp_calc_one_iv_rectang(const bounds_info_t *bounds, + /*in/out*/ kmp_uint64 *original_ivs, + const kmp_iterations_t iterations, + kmp_index_t ind) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + case loop_type_t::loop_type_uint32: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + case loop_type_t::loop_type_int64: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + case loop_type_t::loop_type_uint64: + kmp_calc_one_iv_rectang_XX( + (bounds_infoXX_template *)(bounds), + /*in/out*/ original_ivs, iterations, ind); + break; + default: + KMP_ASSERT(false); + } +} + +//---------------------------------------------------------------------------- +// Rectangular loop nest +//---------------------------------------------------------------------------- + +//----------Canonicalize loop nest and calculate trip count------------------- + +// Canonicalize loop nest and calculate overall trip count. +// "bounds_nest" has to be allocated per thread. +// API will modify original bounds_nest array to bring it to a canonical form +// (only <= and >=, no !=, <, >). If the original loop nest was already in a +// canonical form there will be no changes to bounds in bounds_nest array +// (only trip counts will be calculated). +// Returns trip count of overall space. +extern "C" kmp_loop_nest_iv_t +__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid, + /*in/out*/ bounds_info_t *original_bounds_nest, + kmp_index_t n) { + + kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n); + + kmp_loop_nest_iv_t total = 1; + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + + kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds); + total *= trip_count; + } + + return total; +} + +//----------Calculate old induction variables--------------------------------- + +// Calculate old induction variables corresponding to overall new_iv. +// Note: original IV will be returned as if it had kmp_uint64 type, +// will have to be converted to original type in user code. +// Note: trip counts should be already calculated by +// __kmpc_process_loop_nest_rectang. 
+// OMPTODO: special case 2, 3 nested loops: either do different +// interface without array or possibly template this over n +extern "C" void +__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, + const bounds_info_t *original_bounds_nest, + /*out*/ kmp_uint64 *original_ivs, + kmp_index_t n) { + + CollapseAllocator iterations(n); + + // First, calc corresponding iteration in every original loop: + for (kmp_index_t ind = n; ind > 0;) { + --ind; + auto bounds = &(original_bounds_nest[ind]); + + // should be optimized to OPDIVREM: + auto temp = new_iv / bounds->trip_count; + auto iteration = new_iv % bounds->trip_count; + new_iv = temp; + + iterations[ind] = iteration; + } + KMP_ASSERT(new_iv == 0); + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + + kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind); + } +} + +//---------------------------------------------------------------------------- +// Non-rectangular loop nest +//---------------------------------------------------------------------------- + +//----------Calculate maximum possible span of iv values on one level--------- + +// Calculate span for IV on this loop level for "<=" case. +// Note: it's for <= on this loop nest level, so lower bound should be smallest +// value, upper bound should be the biggest value. If the loop won't execute, +// 'smallest' may be bigger than 'biggest', but we'd better not switch them +// around. +template +void kmp_calc_span_lessoreq_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ bounds_info_internal_t *bounds_nest) { + + typedef typename traits_t::unsigned_t UT; + // typedef typename traits_t::signed_t ST; + + // typedef typename big_span_t span_t; + typedef T span_t; + + auto &bbounds = bounds->b; + + if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) { + // This dimention depends on one of previous ones; can't be the outermost + // one. + bounds_info_internalXX_template *previous = + reinterpret_cast *>( + &(bounds_nest[bbounds.outer_iv])); + + // OMPTODO: assert that T is compatible with loop variable type on + // 'previous' loop + + { + span_t bound_candidate1 = + bbounds.lb0 + bbounds.lb1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.lb0 + bbounds.lb1 * previous->span_biggest; + if (bound_candidate1 < bound_candidate2) { + bounds->span_smallest = bound_candidate1; + } else { + bounds->span_smallest = bound_candidate2; + } + } + + { + // We can't adjust the upper bound with respect to step, because + // lower bound might be off after adjustments + + span_t bound_candidate1 = + bbounds.ub0 + bbounds.ub1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.ub0 + bbounds.ub1 * previous->span_biggest; + if (bound_candidate1 < bound_candidate2) { + bounds->span_biggest = bound_candidate2; + } else { + bounds->span_biggest = bound_candidate1; + } + } + } else { + // Rectangular: + bounds->span_smallest = bbounds.lb0; + bounds->span_biggest = bbounds.ub0; + } + if (!bounds->loop_bounds_adjusted) { + // Here it's safe to reduce the space to the multiply of step. + // OMPTODO: check if the formular is correct. + // Also check if it would be safe to do this if we didn't adjust left side. + bounds->span_biggest -= + (static_cast(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs? + } +} + +// Calculate span for IV on this loop level for ">=" case. 
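Editor's illustration (not part of this patch): the span computation of kmp_calc_span_lessoreq_XX above applied by hand to a small parallelogram nest; plain int64_t values stand in for the runtime's own types.

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // Parallelogram nest: for (i = 0; i <= 9; ++i) for (j = i; j <= i + 2; ++j)
  // Inner level: lb0 = 0, lb1 = 1, ub0 = 2, ub1 = 1; the outer IV has already
  // been given span_smallest = 0 and span_biggest = 9.
  int64_t lb0 = 0, lb1 = 1, ub0 = 2, ub1 = 1;
  int64_t outer_smallest = 0, outer_biggest = 9;

  // Same candidate min/max as the dependent branch of
  // kmp_calc_span_lessoreq_XX above:
  int64_t span_smallest =
      std::min(lb0 + lb1 * outer_smallest, lb0 + lb1 * outer_biggest);
  int64_t span_biggest =
      std::max(ub0 + ub1 * outer_smallest, ub0 + ub1 * outer_biggest);

  assert(span_smallest == 0);  // smallest j is 0, reached when i == 0
  assert(span_biggest == 11);  // largest j is 9 + 2, reached when i == 9
  // With step == 1 the final "reduce to a multiple of the step" adjustment in
  // the real routine is a no-op.
  return 0;
}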
+template +void kmp_calc_span_greateroreq_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ bounds_info_internal_t *bounds_nest) { + + typedef typename traits_t::unsigned_t UT; + // typedef typename traits_t::signed_t ST; + + // typedef typename big_span_t span_t; + typedef T span_t; + + auto &bbounds = bounds->b; + + if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) { + // This dimention depends on one of previous ones; can't be the outermost + // one. + bounds_info_internalXX_template *previous = + reinterpret_cast *>( + &(bounds_nest[bbounds.outer_iv])); + + // OMPTODO: assert that T is compatible with loop variable type on + // 'previous' loop + + { + span_t bound_candidate1 = + bbounds.lb0 + bbounds.lb1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.lb0 + bbounds.lb1 * previous->span_biggest; + if (bound_candidate1 >= bound_candidate2) { + bounds->span_smallest = bound_candidate1; + } else { + bounds->span_smallest = bound_candidate2; + } + } + + { + // We can't adjust the upper bound with respect to step, because + // lower bound might be off after adjustments + + span_t bound_candidate1 = + bbounds.ub0 + bbounds.ub1 * previous->span_smallest; + span_t bound_candidate2 = + bbounds.ub0 + bbounds.ub1 * previous->span_biggest; + if (bound_candidate1 >= bound_candidate2) { + bounds->span_biggest = bound_candidate2; + } else { + bounds->span_biggest = bound_candidate1; + } + } + + } else { + // Rectangular: + bounds->span_biggest = bbounds.lb0; + bounds->span_smallest = bbounds.ub0; + } + if (!bounds->loop_bounds_adjusted) { + // Here it's safe to reduce the space to the multiply of step. + // OMPTODO: check if the formular is correct. + // Also check if it would be safe to do this if we didn't adjust left side. + bounds->span_biggest -= + (static_cast(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs? + } +} + +// Calculate maximum possible span for IV on this loop level. +template +void kmp_calc_span_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ bounds_info_internal_t *bounds_nest) { + + if (bounds->b.comparison == comparison_t::comp_less_or_eq) { + kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + } else { + KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq); + kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + } +} + +//----------All initial processing of the loop nest--------------------------- + +// Calculate new bounds for this loop level. +// To be able to work with the nest we need to get it to a parallelepiped shape. +// We need to stay in the original range of values, so that there will be no +// overflow, for that we'll adjust both upper and lower bounds as needed. +template +void kmp_calc_new_bounds_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /* in/out*/ bounds_info_internal_t *bounds_nest) { + + auto &bbounds = bounds->b; + + if (bbounds.lb1 == bbounds.ub1) { + // Already parallel, no need to adjust: + bounds->loop_bounds_adjusted = false; + } else { + bounds->loop_bounds_adjusted = true; + + T old_lb1 = bbounds.lb1; + T old_ub1 = bbounds.ub1; + + if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) { + // With this shape we can adjust to a rectangle: + bbounds.lb1 = 0; + bbounds.ub1 = 0; + } else { + // get upper and lower bounds to be parallel + // with values in the old range. + // Note: abs didn't work here. 
+ if (((old_lb1 < 0) && (old_lb1 < old_ub1)) || + ((old_lb1 > 0) && (old_lb1 > old_ub1))) { + bbounds.lb1 = old_ub1; + } else { + bbounds.ub1 = old_lb1; + } + } + + // Now need to adjust lb0, ub0, otherwise in some cases space will shrink. + // The idea here that for this IV we are now getting the same span + // irrespective of the previous IV value. + bounds_info_internalXX_template *previous = + reinterpret_cast *>( + &bounds_nest[bbounds.outer_iv]); + + if (bbounds.comparison == comparison_t::comp_less_or_eq) { + if (old_lb1 < bbounds.lb1) { + KMP_ASSERT(old_lb1 < 0); + // The length is good on outer_iv biggest number, + // can use it to find where to move the lower bound: + + T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest; + bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space? + // e.g. it was 0?? (same below) + } else if (old_lb1 > bbounds.lb1) { + // still need to move lower bound: + T add = (old_lb1 - bbounds.lb1) * previous->span_smallest; + bbounds.lb0 += add; + } + + if (old_ub1 > bbounds.ub1) { + KMP_ASSERT(old_ub1 > 0); + // The length is good on outer_iv biggest number, + // can use it to find where to move upper bound: + + T add = (old_ub1 - bbounds.ub1) * previous->span_biggest; + bbounds.ub0 += add; + } else if (old_ub1 < bbounds.ub1) { + // still need to move upper bound: + T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest; + bbounds.ub0 -= sub; + } + } else { + KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq); + if (old_lb1 < bbounds.lb1) { + KMP_ASSERT(old_lb1 < 0); + T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest; + bbounds.lb0 -= sub; + } else if (old_lb1 > bbounds.lb1) { + T add = (old_lb1 - bbounds.lb1) * previous->span_biggest; + bbounds.lb0 += add; + } + + if (old_ub1 > bbounds.ub1) { + KMP_ASSERT(old_ub1 > 0); + T add = (old_ub1 - bbounds.ub1) * previous->span_smallest; + bbounds.ub0 += add; + } else if (old_ub1 < bbounds.ub1) { + T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest; + bbounds.ub0 -= sub; + } + } + } +} + +// Do all processing for one canonicalized loop in the nest +// (assuming that outer loops already were processed): +template +kmp_loop_nest_iv_t kmp_process_one_loop_XX( + /* in/out*/ bounds_info_internalXX_template *bounds, + /*in/out*/ bounds_info_internal_t *bounds_nest) { + + kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); + return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b)); +} + +// Non-rectangular loop nest, canonicalized to use <= or >=. +// Process loop nest to have a parallelepiped shape, +// calculate biggest spans for IV's on all levels and calculate overall trip +// count. "bounds_nest" has to be allocated per thread. +// Returns overall trip count (for adjusted space). 
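Editor's illustration (not part of this patch): a tiny numeric example of the space expansion that kmp_calc_new_bounds_XX above performs and that the driver defined just below sums into the expanded trip count.

#include <cassert>
#include <cstdint>

int main() {
  // Triangular nest: for (i = 0; i <= 3; ++i) for (j = 0; j <= i; ++j)
  // The inner bound depends on i (lb1 == 0, ub1 == 1), their signs differ, so
  // kmp_calc_new_bounds_XX flattens lb1/ub1 to 0 and widens the inner loop to
  // the full range j in [0, 3]: the nest is expanded to a 4 x 4 rectangle.
  uint64_t actual = 0;
  for (int i = 0; i <= 3; ++i)
    for (int j = 0; j <= i; ++j)
      ++actual;

  uint64_t expanded = 4 * 4; // product of trip counts of the adjusted space

  assert(actual == 10);
  assert(expanded == 16);
  // Chunks are carved out of the 16-point expanded space and then mapped back
  // into the original space, which is why a chunk may end up shorter than its
  // nominal share, but never empty: it always keeps its starting point.
  return 0;
}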
+kmp_loop_nest_iv_t kmp_process_loop_nest( + /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) { + + kmp_loop_nest_iv_t total = 1; + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(bounds_nest[ind]); + kmp_loop_nest_iv_t trip_count = 0; + + switch (bounds->b.loop_type) { + case loop_type_t::loop_type_int32: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_uint32: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_int64: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + case loop_type_t::loop_type_uint64: + trip_count = kmp_process_one_loop_XX( + /*in/out*/ (bounds_info_internalXX_template *)(bounds), + /*in/out*/ bounds_nest); + break; + default: + KMP_ASSERT(false); + } + total *= trip_count; + } + + return total; +} + +//----------Calculate iterations (in the original or updated space)----------- + +// Calculate number of iterations in original or updated space resulting in +// original_ivs[ind] (only on this level, non-negative) +// (not counting initial iteration) +template +kmp_loop_nest_iv_t +kmp_calc_number_of_iterations_XX(const bounds_infoXX_template *bounds, + const kmp_point_t original_ivs, + kmp_index_t ind) { + + kmp_loop_nest_iv_t iterations = 0; + + if (bounds->comparison == comparison_t::comp_less_or_eq) { + iterations = + (static_cast(original_ivs[ind]) - bounds->lb0 - + bounds->lb1 * static_cast(original_ivs[bounds->outer_iv])) / + __kmp_abs(bounds->step); + } else { + KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq); + iterations = (bounds->lb0 + + bounds->lb1 * static_cast(original_ivs[bounds->outer_iv]) - + static_cast(original_ivs[ind])) / + __kmp_abs(bounds->step); + } + + return iterations; +} + +// Calculate number of iterations in the original or updated space resulting in +// original_ivs[ind] (only on this level, non-negative) +kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds, + const kmp_point_t original_ivs, + kmp_index_t ind) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_number_of_iterations_XX( + (bounds_infoXX_template *)(bounds), original_ivs, ind); + break; + default: + KMP_ASSERT(false); + return 0; + } +} + +//----------Calculate new iv corresponding to original ivs-------------------- + +// We got a point in the original loop nest. +// Take updated bounds and calculate what new_iv will correspond to this point. +// When we are getting original IVs from new_iv, we have to adjust to fit into +// original loops bounds. Getting new_iv for the adjusted original IVs will help +// with making more chunks non-empty. 
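Editor's illustration (not part of this patch): the mixed-radix accumulation performed by the routine defined just below, and its inverse, the div/mod walk used by __kmpc_calc_original_ivs_rectang above, on a 4 x 3 x 5 rectangular nest.

#include <cassert>
#include <cstdint>

int main() {
  // Three canonicalized loops with trip counts 4, 3 and 5, and a point that
  // sits at iteration (2, 1, 4) of the nest, outermost first.
  uint64_t trip[3] = {4, 3, 5};
  uint64_t iter[3] = {2, 1, 4};

  // Same accumulation as kmp_calc_new_iv_from_original_ivs below:
  uint64_t new_iv = 0;
  for (int ind = 0; ind < 3; ++ind)
    new_iv = new_iv * trip[ind] + iter[ind];

  assert(new_iv == (2 * 3 + 1) * 5 + 4); // == 39

  // The inverse walks from the innermost level outwards with div/mod:
  assert(new_iv % 5 == 4 && (new_iv / 5) % 3 == 1 && new_iv / 15 == 2);
  return 0;
}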
+kmp_loop_nest_iv_t +kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest, + const kmp_point_t original_ivs, + kmp_index_t n) { + + kmp_loop_nest_iv_t new_iv = 0; + + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(bounds_nest[ind].b); + + new_iv = new_iv * bounds->trip_count + + kmp_calc_number_of_iterations(bounds, original_ivs, ind); + } + + return new_iv; +} + +//----------Calculate original ivs for provided iterations-------------------- + +// Calculate original IVs for provided iterations, assuming iterations are +// calculated in the original space. +// Loop nest is in canonical form (with <= / >=). +bool kmp_calc_original_ivs_from_iterations( + const bounds_info_t *original_bounds_nest, kmp_index_t n, + /*in/out*/ kmp_point_t original_ivs, + /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) { + + kmp_index_t lengthened_ind = n; + + for (; ind < n;) { + auto bounds = &(original_bounds_nest[ind]); + bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations, + ind, (lengthened_ind < ind), true); + + if (!good) { + // The calculated iv value is too big (or too small for >=): + if (ind == 0) { + // Space is empty: + return false; + } else { + // Go to next iteration on the outer loop: + --ind; + ++iterations[ind]; + lengthened_ind = ind; + for (kmp_index_t i = ind + 1; i < n; ++i) { + iterations[i] = 0; + } + continue; + } + } + ++ind; + } + + return true; +} + +//----------Calculate original ivs for the beginning of the loop nest--------- + +// Calculate IVs for the beginning of the loop nest. +// Note: lower bounds of all loops may not work - +// if on some of the iterations of the outer loops inner loops are empty. +// Loop nest is in canonical form (with <= / >=). +bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest, + kmp_index_t n, + /*out*/ kmp_point_t original_ivs) { + + // Iterations in the original space, multiplied by step: + CollapseAllocator iterations(n); + for (kmp_index_t ind = n; ind > 0;) { + --ind; + iterations[ind] = 0; + } + + // Now calculate the point: + bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n, + /*in/out*/ original_ivs, + /*in/out*/ iterations, 0); + return b; +} + +//----------Calculate next point in the original loop space------------------- + +// From current set of original IVs calculate next point. +// Return false if there is no next point in the loop bounds. 
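Editor's illustration (not part of this patch): an intentionally simplified, rectangular-only version of the increment-and-carry walk that the routine defined just below performs through kmp_calc_original_ivs_from_iterations; the real code also recomputes dependent bounds after every carry and honours arbitrary steps. The next_point helper is invented for this sketch.

#include <cassert>

// Advance an inclusive-bound, step-1 odometer; return false once the nest is
// exhausted.
static bool next_point(const int lb[], const int ub[], int n, int iv[]) {
  for (int ind = n - 1; ind >= 0; --ind) {
    if (iv[ind] < ub[ind]) {   // room left on this level: step it
      ++iv[ind];
      for (int i = ind + 1; i < n; ++i)
        iv[i] = lb[i];         // reset every level inside it
      return true;
    }
  }
  return false;                // carried past the outermost loop: done
}

int main() {
  int lb[2] = {0, 0}, ub[2] = {1, 2};
  int iv[2] = {0, 2};          // last iteration of the inner loop
  assert(next_point(lb, ub, 2, iv) && iv[0] == 1 && iv[1] == 0);
  iv[0] = 1; iv[1] = 2;        // very last point of the nest
  assert(!next_point(lb, ub, 2, iv));
  return 0;
}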
+bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest, + kmp_index_t n, const kmp_point_t original_ivs, + /*out*/ kmp_point_t next_original_ivs) { + // Iterations in the original space, multiplied by step (so can be negative): + CollapseAllocator iterations(n); + // First, calc corresponding iteration in every original loop: + for (kmp_index_t ind = 0; ind < n; ++ind) { + auto bounds = &(original_bounds_nest[ind]); + iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind); + } + + for (kmp_index_t ind = 0; ind < n; ++ind) { + next_original_ivs[ind] = original_ivs[ind]; + } + + // Next add one step to the iterations on the inner-most level, and see if we + // need to move up the nest: + kmp_index_t ind = n - 1; + ++iterations[ind]; + + bool b = kmp_calc_original_ivs_from_iterations( + original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind); + + return b; +} + +//----------Calculate chunk end in the original loop space-------------------- + +// For one level calculate old induction variable corresponding to overall +// new_iv for the chunk end. +// Return true if it fits into upper bound on this level +// (if not, we need to re-calculate) +template +bool kmp_calc_one_iv_for_chunk_end_XX( + const bounds_infoXX_template *bounds, + const bounds_infoXX_template *updated_bounds, + /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations, + kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start, + const kmp_point_t original_ivs_start) { + + // typedef std::conditional::value, kmp_int64, kmp_uint64> + // big_span_t; + + // OMPTODO: is it good enough, or do we need ST or do we need big_span_t? + T temp = 0; + + T outer_iv = static_cast(original_ivs[bounds->outer_iv]); + + if (start_with_lower_bound) { + // we moved to the next iteration on one of outer loops, may as well use + // the lower bound here: + temp = bounds->lb0 + bounds->lb1 * outer_iv; + } else { + // Start in expanded space, but: + // - we need to hit original space lower bound, so need to account for + // that + // - we have to go into original space, even if that means adding more + // iterations than was planned + // - we have to go past (or equal to) previous point (which is the chunk + // starting point) + + auto iteration = iterations[ind]; + + auto step = bounds->step; + + // In case of >= it's negative: + auto accountForStep = + ((bounds->lb0 + bounds->lb1 * outer_iv) - + (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) % + step; + + temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv + + accountForStep + iteration * step; + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) { + // Too small (or too big), didn't reach the original lower bound. Use + // heuristic: + temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step; + } + + if (compare_with_start) { + + T start = static_cast(original_ivs_start[ind]); + + temp = kmp_fix_iv(bounds->loop_iv_type, temp); + + // On all previous levels start of the chunk is same as the end, need to + // be really careful here: + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (temp < start)) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (temp > start))) { + // End of the chunk can't be smaller (for >= bigger) than it's start. 
+ // Use heuristic: + temp = start + iteration / 4 * step; + } + } + } + + original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp); + + if (((bounds->comparison == comparison_t::comp_less_or_eq) && + (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) || + ((bounds->comparison == comparison_t::comp_greater_or_eq) && + (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) { + // Too big (or too small for >=). + return false; + } + + return true; +} + +// For one level calculate old induction variable corresponding to overall +// new_iv for the chunk end. +bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds, + const bounds_info_t *updated_bounds, + /*in/out*/ kmp_point_t original_ivs, + const kmp_iterations_t iterations, + kmp_index_t ind, bool start_with_lower_bound, + bool compare_with_start, + const kmp_point_t original_ivs_start) { + + switch (bounds->loop_type) { + case loop_type_t::loop_type_int32: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + case loop_type_t::loop_type_uint32: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + case loop_type_t::loop_type_int64: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + case loop_type_t::loop_type_uint64: + return kmp_calc_one_iv_for_chunk_end_XX( + (bounds_infoXX_template *)(bounds), + (bounds_infoXX_template *)(updated_bounds), + /*in/out*/ + original_ivs, iterations, ind, start_with_lower_bound, + compare_with_start, original_ivs_start); + break; + default: + KMP_ASSERT(false); + return false; + } +} + +// Calculate old induction variables corresponding to overall new_iv for the +// chunk end. If due to space extension we are getting old IVs outside of the +// boundaries, bring them into the boundaries. Need to do this in the runtime, +// esp. on the lower bounds side. When getting result need to make sure that the +// new chunk starts at next position to old chunk, not overlaps with it (this is +// done elsewhere), and need to make sure end of the chunk is further than the +// beginning of the chunk. We don't need an exact ending point here, just +// something more-or-less close to the desired chunk length, bigger is fine +// (smaller would be fine, but we risk going into infinite loop, so do smaller +// only at the very end of the space). result: false if could not find the +// ending point in the original loop space. In this case the caller can use +// original upper bounds as the end of the chunk. Chunk won't be empty, because +// it'll have at least the starting point, which is by construction in the +// original space. 
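Editor's illustration (not part of this patch): why a chunk end computed in the expanded space has to be pulled back into the original space, shown on the same triangular nest used earlier; the even two-way split is an assumption made only for this sketch.

#include <cassert>

int main() {
  // Triangular nest for (i = 0; i <= 3; ++i) for (j = 0; j <= i; ++j),
  // expanded by kmp_process_loop_nest to a 4 x 4 rectangle of 16 points.
  // Splitting the expanded space evenly between two threads puts the end of
  // the first chunk at expanded new_iv == 7.
  unsigned new_iv = 7;
  unsigned i_it = new_iv / 4;  // iteration on the outer level
  unsigned j_it = new_iv % 4;  // iteration on the inner level
  assert(i_it == 1 && j_it == 3);

  // (i, j) == (1, 3) violates the original bound j <= i, so the routine below
  // has to walk to a nearby point that lies inside the triangle and is not
  // before the chunk's starting point; only then is the chunk handed out.
  assert(!(j_it <= i_it));
  return 0;
}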
+bool kmp_calc_original_ivs_for_chunk_end(
+    const bounds_info_t *original_bounds_nest, kmp_index_t n,
+    const bounds_info_internal_t *updated_bounds_nest,
+    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
+    /*out*/ kmp_point_t original_ivs) {
+
+  // Iterations in the expanded space:
+  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
+  // First, calc corresponding iteration in every modified loop:
+  for (kmp_index_t ind = n; ind > 0;) {
+    --ind;
+    auto &updated_bounds = updated_bounds_nest[ind];
+
+    // should be optimized to OPDIVREM:
+    auto new_ind = new_iv / updated_bounds.b.trip_count;
+    auto iteration = new_iv % updated_bounds.b.trip_count;
+
+    new_iv = new_ind;
+    iterations[ind] = iteration;
+  }
+  KMP_DEBUG_ASSERT(new_iv == 0);
+
+  kmp_index_t lengthened_ind = n;
+  kmp_index_t equal_ind = -1;
+
+  // Next calculate the point, but in original loop nest.
+  for (kmp_index_t ind = 0; ind < n;) {
+    auto bounds = &(original_bounds_nest[ind]);
+    auto updated_bounds = &(updated_bounds_nest[ind].b);
+
+    bool good = kmp_calc_one_iv_for_chunk_end(
+        bounds, updated_bounds,
+        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
+        (equal_ind >= ind - 1), original_ivs_start);
+
+    if (!good) {
+      // Too big (or too small for >=).
+      if (ind == 0) {
+        // Need to reduce to the end.
+        return false;
+      } else {
+        // Go to next iteration on outer loop:
+        --ind;
+        ++(iterations[ind]);
+        lengthened_ind = ind;
+        if (equal_ind >= lengthened_ind) {
+          // We've changed the number of iterations here,
+          // can't be same anymore:
+          equal_ind = lengthened_ind - 1;
+        }
+        for (kmp_index_t i = ind + 1; i < n; ++i) {
+          iterations[i] = 0;
+        }
+        continue;
+      }
+    }
+
+    if ((equal_ind == ind - 1) &&
+        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
+                    original_ivs_start[ind]))) {
+      equal_ind = ind;
+    } else if ((equal_ind > ind - 1) &&
+               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
+                            original_ivs_start[ind]))) {
+      equal_ind = ind - 1;
+    }
+    ++ind;
+  }
+
+  return true;
+}
+
+//----------Calculate upper bounds for the last chunk-------------------------
+
+// Calculate one upper bound for the end.
+template <typename T>
+void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
+                            /*in/out*/ kmp_point_t original_ivs,
+                            kmp_index_t ind) {
+
+  T temp = bounds->ub0 +
+           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
+
+  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
+}
+
+void kmp_calc_one_iv_end(const bounds_info_t *bounds,
+                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
+
+  switch (bounds->loop_type) {
+  default:
+    KMP_ASSERT(false);
+    break;
+  case loop_type_t::loop_type_int32:
+    kmp_calc_one_iv_end_XX<kmp_int32>(
+        (bounds_infoXX_template<kmp_int32> *)(bounds),
+        /*in/out*/ original_ivs, ind);
+    break;
+  case loop_type_t::loop_type_uint32:
+    kmp_calc_one_iv_end_XX<kmp_uint32>(
+        (bounds_infoXX_template<kmp_uint32> *)(bounds),
+        /*in/out*/ original_ivs, ind);
+    break;
+  case loop_type_t::loop_type_int64:
+    kmp_calc_one_iv_end_XX<kmp_int64>(
+        (bounds_infoXX_template<kmp_int64> *)(bounds),
+        /*in/out*/ original_ivs, ind);
+    break;
+  case loop_type_t::loop_type_uint64:
+    kmp_calc_one_iv_end_XX<kmp_uint64>(
+        (bounds_infoXX_template<kmp_uint64> *)(bounds),
+        /*in/out*/ original_ivs, ind);
+    break;
+  }
+}
+
+// Calculate upper bounds for the last loop iteration. Just use original upper
+// bounds (adjusted when canonicalized to use <= / >=). No need to check that
+// this point is in the original space (it's likely not)
+void kmp_calc_original_ivs_for_end(
+    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
+    /*out*/ kmp_point_t original_ivs) {
+  for (kmp_index_t ind = 0; ind < n; ++ind) {
+    auto bounds = &(original_bounds_nest[ind]);
+    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
+  }
+}
+
+//----------Init API for non-rectangular loops--------------------------------
+
+// Init API for collapsed loops (static, no chunks defined).
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). If the original loop nest was already in a
+// canonical form there will be no changes to bounds in bounds_nest array
+// (only trip counts will be calculated). Internally API will expand the space
+// to parallelogram/parallelepiped, calculate total, calculate bounds for the
+// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
+// important on the left side, to hit the lower bounds and not step over), and
+// pick the correct chunk for this thread (so it will calculate chunks up to the
+// needed one). It could be optimized to calculate just this chunk, potentially
+// a bit less well distributed among threads. It is designed to make sure that
+// threads will receive predictable chunks, deterministically (so that next nest
+// of loops with similar characteristics will get exactly same chunks on same
+// threads).
+// Current contract: chunk_bounds_nest has only lb0 and ub0,
+// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
+extern "C" kmp_int32
+__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
+                          /*in/out*/ bounds_info_t *original_bounds_nest,
+                          /*out*/ bounds_info_t *chunk_bounds_nest,
+                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
+
+  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
+  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
+
+  if (__kmp_env_consistency_check) {
+    __kmp_push_workshare(gtid, ct_pdo, loc);
+  }
+
+  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
+
+  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
+
+  for (kmp_index_t i = 0; i < n; ++i) {
+    updated_bounds_nest[i].b = original_bounds_nest[i];
+  }
+
+  kmp_loop_nest_iv_t total =
+      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
+
+  if (plastiter != NULL) {
+    *plastiter = FALSE;
+  }
+
+  if (total == 0) {
+    // Loop won't execute:
+    return FALSE;
+  }
+
+  // OMPTODO: DISTRIBUTE is not supported yet
+  __kmp_assert_valid_gtid(gtid);
+  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
+
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+  kmp_uint32 nth = team->t.t_nproc; // Number of threads
+
+  KMP_DEBUG_ASSERT(tid < nth);
+
+  CollapseAllocator<kmp_uint64> original_ivs_start(n);
+
+  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
+                                       /*out*/ original_ivs_start)) {
+    // Loop won't execute:
+    return FALSE;
+  }
+
+  // Not doing this optimization for one thread:
+  // (1) more to test
+  // (2) without it, current contract that chunk_bounds_nest has only lb0 and
+  //     ub0, lb1 and ub1 are set to 0 and can be ignored.
+  // if (nth == 1) {
+  //   // One thread:
+  //   // Copy all info from original_bounds_nest, it'll be good enough.
+
+  //   for (kmp_index_t i = 0; i < n; ++i) {
+  //     chunk_bounds_nest[i] = original_bounds_nest[i];
+  //   }
+
+  //   if (plastiter != NULL) {
+  //     *plastiter = TRUE;
+  //   }
+  //   return TRUE;
+  //}
+
+  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
+      updated_bounds_nest, original_ivs_start, n);
+
+  bool last_iter = false;
+
+  for (; nth > 0;) {
+    // We could calculate chunk size once, but this is to compensate that the
+    // original space is not parallelepiped and some threads can be left
+    // without work:
+    KMP_DEBUG_ASSERT(total >= new_iv);
+
+    kmp_loop_nest_iv_t total_left = total - new_iv;
+    kmp_loop_nest_iv_t chunk_size = total_left / nth;
+    kmp_loop_nest_iv_t remainder = total_left % nth;
+
+    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
+
+    if (remainder > 0) {
+      ++curr_chunk_size;
+      --remainder;
+    }
+
+#if defined(KMP_DEBUG)
+    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
+#endif
+
+    if (curr_chunk_size > 1) {
+      new_iv += curr_chunk_size - 1;
+    }
+
+    CollapseAllocator<kmp_uint64> original_ivs_end(n);
+    if ((nth == 1) || (new_iv >= total - 1)) {
+      // Do this one till the end - just in case we miscalculated
+      // and either too much is left to process or new_iv is a bit too big:
+      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
+                                    /*out*/ original_ivs_end);
+
+      last_iter = true;
+    } else {
+      // Note: here we make sure it's past (or equal to) the previous point.
+      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
+                                               updated_bounds_nest,
+                                               original_ivs_start, new_iv,
+                                               /*out*/ original_ivs_end)) {
+        // We could not find the ending point, use the original upper bounds:
+        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
+                                      /*out*/ original_ivs_end);
+
+        last_iter = true;
+      }
+    }
+
+#if defined(KMP_DEBUG)
+    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
+        updated_bounds_nest, original_ivs_end, n);
+    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
+#endif
+
+    if (last_iter && (tid != 0)) {
+      // We are done, this was last chunk, but no chunk for current thread was
+      // found:
+      return FALSE;
+    }
+
+    if (tid == 0) {
+      // We found the chunk for this thread, now we need to check if it's the
+      // last chunk or not:
+
+      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
+      if (last_iter ||
+          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
+                                      /*out*/ original_ivs_next_start)) {
+        // no more loop iterations left to process,
+        // this means that currently found chunk is the last chunk:
+        if (plastiter != NULL) {
+          *plastiter = TRUE;
+        }
+      }
+
+      // Fill in chunk bounds:
+      for (kmp_index_t i = 0; i < n; ++i) {
+        chunk_bounds_nest[i] =
+            original_bounds_nest[i]; // To fill in types, etc. - optional
+        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
+        chunk_bounds_nest[i].lb1_u64 = 0;
+
+        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
+        chunk_bounds_nest[i].ub1_u64 = 0;
+      }
+
+      return TRUE;
+    }
+
+    --tid;
+    --nth;
+
+    bool next_chunk = kmp_calc_next_original_ivs(
+        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
+    if (!next_chunk) {
+      // no more loop iterations to process,
+      // the previous chunk was the last chunk
+      break;
+    }
+
+    // original_ivs_start is next to previous chunk original_ivs_end,
+    // we need to start new chunk here, so chunks will be one after another
+    // without any gap or overlap:
+    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
+                                               original_ivs_start, n);
+  }
+
+  return FALSE;
+}
diff --git a/third_party/openmp/kmp_collapse.h b/third_party/openmp/kmp_collapse.h
new file mode 100644
index 000000000..e48701856
--- /dev/null
+++ b/third_party/openmp/kmp_collapse.h
@@ -0,0 +1,240 @@
+/*
+ * kmp_collapse.h -- header for loop collapse feature
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_COLLAPSE_H
+#define KMP_COLLAPSE_H
+
+#include <type_traits>
+
+// Type of the index into the loop nest structures
+// (with values from 0 to less than n from collapse(n))
+typedef kmp_int32 kmp_index_t;
+
+// Type for combined loop nest space IV:
+typedef kmp_uint64 kmp_loop_nest_iv_t;
+
+// Loop has <, <=, etc. as a comparison:
+enum comparison_t : kmp_int32 {
+  comp_less_or_eq = 0,
+  comp_greater_or_eq = 1,
+  comp_not_eq = 2,
+  comp_less = 3,
+  comp_greater = 4
+};
+
+// Type of loop IV.
+// Type of bounds and step, after usual promotions
+// are a subset of these types (32 & 64 only):
+enum loop_type_t : kmp_int32 {
+  loop_type_uint8 = 0,
+  loop_type_int8 = 1,
+  loop_type_uint16 = 2,
+  loop_type_int16 = 3,
+  loop_type_uint32 = 4,
+  loop_type_int32 = 5,
+  loop_type_uint64 = 6,
+  loop_type_int64 = 7
+};
+
+/*!
+ @ingroup WORK_SHARING
+ * Describes the structure for rectangular nested loops.
+ */
+template <typename T> struct bounds_infoXX_template {
+
+  // typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+
+  loop_type_t loop_type; // The differentiator
+  loop_type_t loop_iv_type;
+  comparison_t comparison;
+  // outer_iv should be 0 (or any other less than the number of dimensions)
+  // if loop doesn't depend on it (lb1 and ub1 will be 0).
+  // This way we can do multiplication without a check.
+  kmp_index_t outer_iv;
+
+  // unions to keep the size constant:
+  union {
+    T lb0;
+    kmp_uint64 lb0_u64; // real type can be signed
+  };
+
+  union {
+    T lb1;
+    kmp_uint64 lb1_u64; // real type can be signed
+  };
+
+  union {
+    T ub0;
+    kmp_uint64 ub0_u64; // real type can be signed
+  };
+
+  union {
+    T ub1;
+    kmp_uint64 ub1_u64; // real type can be signed
+  };
+
+  union {
+    ST step; // signed even if bounds type is unsigned
+    kmp_int64 step_64; // signed
+  };
+
+  kmp_loop_nest_iv_t trip_count;
+};
+
+/*!
+ @ingroup WORK_SHARING
+ * Interface struct for rectangular nested loops.
+ * Same size as bounds_infoXX_template.
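+ *
+ * Editorial illustration (not upstream documentation): a canonicalized loop
+ * `for (kmp_int32 i = 0; i <= 9; i += 2)` would be described here with
+ * loop_type == loop_type_int32, comparison == comp_less_or_eq, lb0_u64 == 0,
+ * ub0_u64 == 9, step_64 == 2, lb1_u64 == ub1_u64 == 0 (no dependence on an
+ * outer IV), and trip_count == 5.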
+ */
+struct bounds_info_t {
+
+  loop_type_t loop_type; // The differentiator
+  loop_type_t loop_iv_type;
+  comparison_t comparison;
+  // outer_iv should be 0 (or any other less than the number of dimensions)
+  // if loop doesn't depend on it (lb1 and ub1 will be 0).
+  // This way we can do multiplication without a check.
+  kmp_index_t outer_iv;
+
+  kmp_uint64 lb0_u64; // real type can be signed
+  kmp_uint64 lb1_u64; // real type can be signed
+  kmp_uint64 ub0_u64; // real type can be signed
+  kmp_uint64 ub1_u64; // real type can be signed
+  kmp_int64 step_64; // signed
+
+  // This is internal, but it's the only internal thing we need
+  // in rectangular case, so let's expose it here:
+  kmp_loop_nest_iv_t trip_count;
+};
+
+//-------------------------------------------------------------------------
+// Additional types for internal representation:
+
+// Array for a point in the loop space, in the original space.
+// It's represented in kmp_uint64, but each dimension is calculated in
+// that loop IV type. Also dimensions have to be converted to those types
+// when used in generated code.
+typedef kmp_uint64 *kmp_point_t;
+
+// Array: Number of loop iterations on each nesting level to achieve some point,
+// in expanded space or in original space.
+// OMPTODO: move from using iterations to using offsets (iterations multiplied
+// by steps). For those we need to be careful with the types, as step can be
+// negative, but it'll remove multiplications and divisions in several places.
+typedef kmp_loop_nest_iv_t *kmp_iterations_t;
+
+// Internal struct with additional info:
+template <typename T> struct bounds_info_internalXX_template {
+
+  // OMPTODO: should span have type T or should it better be
+  // kmp_uint64/kmp_int64 depending on T sign? (if kmp_uint64/kmp_int64 then
+  // updated bounds should probably also be kmp_uint64/kmp_int64). I'd like to
+  // use big_span_t, if it can be resolved at compile time.
+  typedef
+      typename std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
+          big_span_t;
+
+  // typedef typename big_span_t span_t;
+  typedef T span_t;
+
+  bounds_infoXX_template<T> b; // possibly adjusted bounds
+
+  // Leaving this as a union in case we'll switch to span_t with different sizes
+  // (depending on T)
+  union {
+    // Smallest possible value of iv (may be smaller than actually possible)
+    span_t span_smallest;
+    kmp_uint64 span_smallest_u64;
+  };
+
+  // Leaving this as a union in case we'll switch to span_t with different sizes
+  // (depending on T)
+  union {
+    // Biggest possible value of iv (may be bigger than actually possible)
+    span_t span_biggest;
+    kmp_uint64 span_biggest_u64;
+  };
+
+  // Did we adjust loop bounds (not counting canonicalization)?
+  bool loop_bounds_adjusted;
+};
+
+// Internal struct with additional info:
+struct bounds_info_internal_t {
+
+  bounds_info_t b; // possibly adjusted bounds
+
+  // Smallest possible value of iv (may be smaller than actually possible)
+  kmp_uint64 span_smallest_u64;
+
+  // Biggest possible value of iv (may be bigger than actually possible)
+  kmp_uint64 span_biggest_u64;
+
+  // Did we adjust loop bounds (not counting canonicalization)?
+  bool loop_bounds_adjusted;
+};
+
+//----------APIs for rectangular loop nests--------------------------------
+
+// Canonicalize loop nest and calculate overall trip count.
+// "bounds_nest" has to be allocated per thread.
+// API will modify original bounds_nest array to bring it to a canonical form
+// (only <= and >=, no !=, <, >). 
If the original loop nest was already in a +// canonical form there will be no changes to bounds in bounds_nest array +// (only trip counts will be calculated). +// Returns trip count of overall space. +extern "C" kmp_loop_nest_iv_t +__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid, + /*in/out*/ bounds_info_t *original_bounds_nest, + kmp_index_t n); + +// Calculate old induction variables corresponding to overall new_iv. +// Note: original IV will be returned as if it had kmp_uint64 type, +// will have to be converted to original type in user code. +// Note: trip counts should be already calculated by +// __kmpc_process_loop_nest_rectang. +// OMPTODO: special case 2, 3 nested loops - if it'll be possible to inline +// that into user code. +extern "C" void +__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, + const bounds_info_t *original_bounds_nest, + /*out*/ kmp_uint64 *original_ivs, + kmp_index_t n); + +//----------Init API for non-rectangular loops-------------------------------- + +// Init API for collapsed loops (static, no chunks defined). +// "bounds_nest" has to be allocated per thread. +// API will modify original bounds_nest array to bring it to a canonical form +// (only <= and >=, no !=, <, >). If the original loop nest was already in a +// canonical form there will be no changes to bounds in bounds_nest array +// (only trip counts will be calculated). Internally API will expand the space +// to parallelogram/parallelepiped, calculate total, calculate bounds for the +// chunks in terms of the new IV, re-calc them in terms of old IVs (especially +// important on the left side, to hit the lower bounds and not step over), and +// pick the correct chunk for this thread (so it will calculate chunks up to the +// needed one). It could be optimized to calculate just this chunk, potentially +// a bit less well distributed among threads. It is designed to make sure that +// threads will receive predictable chunks, deterministically (so that next nest +// of loops with similar characteristics will get exactly same chunks on same +// threads). +// Current contract: chunk_bounds_nest has only lb0 and ub0, +// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future). +extern "C" kmp_int32 +__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, + /*in/out*/ bounds_info_t *original_bounds_nest, + /*out*/ bounds_info_t *chunk_bounds_nest, + kmp_index_t n, + /*out*/ kmp_int32 *plastiter); + +#endif // KMP_COLLAPSE_H diff --git a/third_party/openmp/kmp_config.h b/third_party/openmp/kmp_config.h new file mode 100644 index 000000000..feb27a500 --- /dev/null +++ b/third_party/openmp/kmp_config.h @@ -0,0 +1,176 @@ +/* + * kmp_config.h -- Feature macros + */ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef KMP_CONFIG_H +#define KMP_CONFIG_H + +#include "libc/dce.h" +#include "kmp_platform.h" + +#ifndef __ASSEMBLER__ +#include "libc/stdio/syscall.h" +#endif + +#define KMP_USE_FUTEX 0 +#define KMP_FTN_ENTRIES KMP_FTN_PLAIN +#define syscall {{openmp_shall_not_use_syscall}} + +#define DEBUG_BUILD IsModeDbg() +#define RELWITHDEBINFO_BUILD (IsOptimized() && !IsTiny()) +#define LIBOMP_USE_ITT_NOTIFY 0 +#define USE_ITT_NOTIFY LIBOMP_USE_ITT_NOTIFY +#if ! LIBOMP_USE_ITT_NOTIFY +# define INTEL_NO_ITTNOTIFY_API +#endif +#define LIBOMP_USE_VERSION_SYMBOLS 0 +#if LIBOMP_USE_VERSION_SYMBOLS +# define KMP_USE_VERSION_SYMBOLS +#endif +#define LIBOMP_HAVE_WEAK_ATTRIBUTE 1 +#define KMP_HAVE_WEAK_ATTRIBUTE LIBOMP_HAVE_WEAK_ATTRIBUTE +#define LIBOMP_HAVE_PSAPI 0 +#define KMP_HAVE_PSAPI LIBOMP_HAVE_PSAPI +#define LIBOMP_STATS 0 +#define KMP_STATS_ENABLED LIBOMP_STATS +#ifdef __x86_64__ +#define LIBOMP_HAVE_X86INTRIN_H 1 +#else +#define LIBOMP_HAVE_X86INTRIN_H 0 +#endif +#define KMP_HAVE_X86INTRIN_H LIBOMP_HAVE_X86INTRIN_H +#define LIBOMP_HAVE___BUILTIN_READCYCLECOUNTER 0 +#define KMP_HAVE___BUILTIN_READCYCLECOUNTER LIBOMP_HAVE___BUILTIN_READCYCLECOUNTER +#define LIBOMP_HAVE___RDTSC 1 +#define KMP_HAVE___RDTSC LIBOMP_HAVE___RDTSC +#define LIBOMP_USE_DEBUGGER 0 +#define USE_DEBUGGER LIBOMP_USE_DEBUGGER +#define LIBOMP_OMPT_DEBUG 0 +#define OMPT_DEBUG LIBOMP_OMPT_DEBUG +#define LIBOMP_OMPT_SUPPORT 1 +#define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT +#define LIBOMP_OMPD_SUPPORT 0 +#define OMPD_SUPPORT LIBOMP_OMPD_SUPPORT +#define LIBOMP_OMPX_TASKGRAPH 0 +#define OMPX_TASKGRAPH LIBOMP_OMPX_TASKGRAPH +#define LIBOMP_PROFILING_SUPPORT 0 +#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT +#define LIBOMP_OMPT_OPTIONAL 0 +#define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL +#define LIBOMP_USE_ADAPTIVE_LOCKS 0 +#define KMP_USE_ADAPTIVE_LOCKS LIBOMP_USE_ADAPTIVE_LOCKS +#define KMP_DEBUG_ADAPTIVE_LOCKS 0 +#define LIBOMP_USE_INTERNODE_ALIGNMENT 0 +#define KMP_USE_INTERNODE_ALIGNMENT LIBOMP_USE_INTERNODE_ALIGNMENT +#define LIBOMP_ENABLE_ASSERTIONS 0 +#define KMP_USE_ASSERT LIBOMP_ENABLE_ASSERTIONS +#define LIBOMP_USE_HIER_SCHED 0 +#define KMP_USE_HIER_SCHED LIBOMP_USE_HIER_SCHED +#define STUBS_LIBRARY 0 +#define LIBOMP_USE_HWLOC 0 +#define KMP_USE_HWLOC LIBOMP_USE_HWLOC +#define LIBOMP_ENABLE_SHARED 0 +#define KMP_DYNAMIC_LIB LIBOMP_ENABLE_SHARED +#define KMP_ARCH_STR "@LIBOMP_LEGAL_ARCH@" +#define KMP_LIBRARY_FILE "@LIBOMP_LIB_FILE@" +#define KMP_VERSION_MAJOR 5 +#define KMP_VERSION_MINOR 0 +#define MSVC 0 +#define KMP_MSVC_COMPAT MSVC +// #define LIBOMP_HAVE_WAITPKG_INTRINSICS +#define KMP_HAVE_WAITPKG_INTRINSICS LIBOMP_HAVE_WAITPKG_INTRINSICS +// #define LIBOMP_HAVE_RTM_INTRINSICS +#define KMP_HAVE_RTM_INTRINSICS LIBOMP_HAVE_RTM_INTRINSICS +#ifdef __x86_64__ +#define LIBOMP_HAVE_IMMINTRIN_H 1 +#else +#define LIBOMP_HAVE_IMMINTRIN_H 0 +#endif +#define KMP_HAVE_IMMINTRIN_H LIBOMP_HAVE_IMMINTRIN_H +#define LIBOMP_HAVE_INTRIN_H 0 +#define KMP_HAVE_INTRIN_H LIBOMP_HAVE_INTRIN_H +#define LIBOMP_HAVE_ATTRIBUTE_WAITPKG 0 +#define KMP_HAVE_ATTRIBUTE_WAITPKG LIBOMP_HAVE_ATTRIBUTE_WAITPKG +#define LIBOMP_HAVE_ATTRIBUTE_RTM 0 +#define KMP_HAVE_ATTRIBUTE_RTM LIBOMP_HAVE_ATTRIBUTE_RTM +#define LIBOMP_ARCH_AARCH64_A64FX 0 +#define KMP_ARCH_AARCH64_A64FX LIBOMP_ARCH_AARCH64_A64FX +#ifdef __x86_64__ +#define LIBOMP_HAVE_XMMINTRIN_H 1 +#else +#define LIBOMP_HAVE_XMMINTRIN_H 0 +#endif +#define 
KMP_HAVE_XMMINTRIN_H LIBOMP_HAVE_XMMINTRIN_H +#ifdef __x86_64__ +#define LIBOMP_HAVE__MM_MALLOC 1 +#else +#define LIBOMP_HAVE__MM_MALLOC 0 +#endif +#define KMP_HAVE__MM_MALLOC LIBOMP_HAVE__MM_MALLOC +#define LIBOMP_HAVE_ALIGNED_ALLOC 1 +#define KMP_HAVE_ALIGNED_ALLOC LIBOMP_HAVE_ALIGNED_ALLOC +#define LIBOMP_HAVE_POSIX_MEMALIGN 1 +#define KMP_HAVE_POSIX_MEMALIGN LIBOMP_HAVE_POSIX_MEMALIGN +#define LIBOMP_HAVE__ALIGNED_MALLOC 0 +#define KMP_HAVE__ALIGNED_MALLOC LIBOMP_HAVE__ALIGNED_MALLOC +#define OPENMP_ENABLE_LIBOMPTARGET 0 +#define ENABLE_LIBOMPTARGET OPENMP_ENABLE_LIBOMPTARGET + +// Configured cache line based on architecture +#if KMP_ARCH_PPC64 || KMP_ARCH_PPC +# define CACHE_LINE 128 +#elif KMP_ARCH_AARCH64_A64FX +# define CACHE_LINE 256 +#elif KMP_ARCH_S390X +# define CACHE_LINE 256 +#else +# define CACHE_LINE 64 +#endif + +#if ! KMP_32_BIT_ARCH +# define BUILD_I8 1 +#endif + +#define KMP_NESTED_HOT_TEAMS 1 +#define KMP_ADJUST_BLOCKTIME 1 +#define BUILD_PARALLEL_ORDERED 1 +#define KMP_ASM_INTRINS 1 +#define USE_ITT_BUILD LIBOMP_USE_ITT_NOTIFY +#define INTEL_ITTNOTIFY_PREFIX __kmp_itt_ +#if ! KMP_MIC +# define USE_LOAD_BALANCE 1 +#endif +#if ! (KMP_OS_WINDOWS || KMP_OS_DARWIN) +# define KMP_TDATA_GTID 1 +#endif +#if STUBS_LIBRARY +# define KMP_STUB 1 +#endif +#if DEBUG_BUILD || RELWITHDEBINFO_BUILD +# define KMP_DEBUG 1 +#endif + +#if KMP_OS_WINDOWS +# define KMP_WIN_CDECL +#else +# define BUILD_TV +# define KMP_GOMP_COMPAT +#endif + +// use shared memory with dynamic library (except Android, where shm_* +// functions don't exist). +#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !__ANDROID__ +#define KMP_USE_SHM +#endif + +#ifdef __COSMOPOLITAN__ +#define KMP_USE_SHM +#endif + +#endif // KMP_CONFIG_H diff --git a/third_party/openmp/kmp_csupport.cpp b/third_party/openmp/kmp_csupport.cpp new file mode 100644 index 000000000..9eeaeb88f --- /dev/null +++ b/third_party/openmp/kmp_csupport.cpp @@ -0,0 +1,4569 @@ +/* + * kmp_csupport.cpp -- kfront linkage support for OpenMP. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define __KMP_IMP +#include "omp.h" /* extern "C" declarations of user-visible routines */ +#include "kmp.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" +#include "kmp_lock.h" +#include "kmp_stats.h" +#include "ompt-specific.h" + +#define MAX_MESSAGE 512 + +// flags will be used in future, e.g. to implement openmp_strict library +// restrictions + +/*! + * @ingroup STARTUP_SHUTDOWN + * @param loc in source location information + * @param flags in for future use (currently ignored) + * + * Initialize the runtime library. This call is optional; if it is not made then + * it will be implicitly called by attempts to use other library functions. + */ +void __kmpc_begin(ident_t *loc, kmp_int32 flags) { + // By default __kmpc_begin() is no-op. + char *env; + if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL && + __kmp_str_match_true(env)) { + __kmp_middle_initialize(); + __kmp_assign_root_init_mask(); + KC_TRACE(10, ("__kmpc_begin: middle initialization called\n")); + } else if (__kmp_ignore_mppbeg() == FALSE) { + // By default __kmp_ignore_mppbeg() returns TRUE. 
+ __kmp_internal_begin(); + KC_TRACE(10, ("__kmpc_begin: called\n")); + } +} + +/*! + * @ingroup STARTUP_SHUTDOWN + * @param loc source location information + * + * Shutdown the runtime library. This is also optional, and even if called will + * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to + * zero. + */ +void __kmpc_end(ident_t *loc) { + // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() + // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND + // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() + // returns FALSE and __kmpc_end() will unregister this root (it can cause + // library shut down). + if (__kmp_ignore_mppend() == FALSE) { + KC_TRACE(10, ("__kmpc_end: called\n")); + KA_TRACE(30, ("__kmpc_end\n")); + + __kmp_internal_end_thread(-1); + } +#if KMP_OS_WINDOWS && OMPT_SUPPORT + // Normal exit process on Windows does not allow worker threads of the final + // parallel region to finish reporting their events, so shutting down the + // library here fixes the issue at least for the cases where __kmpc_end() is + // placed properly. + if (ompt_enabled.enabled) + __kmp_internal_end_library(__kmp_gtid_get_specific()); +#endif +} + +/*! +@ingroup THREAD_STATES +@param loc Source location information. +@return The global thread index of the active thread. + +This function can be called in any context. + +If the runtime has ony been entered at the outermost level from a +single (necessarily non-OpenMP*) thread, then the thread number is +that which would be returned by omp_get_thread_num() in the outermost +active parallel construct. (Or zero if there is no active parallel +construct, since the primary thread is necessarily thread zero). + +If multiple non-OpenMP threads all enter an OpenMP construct then this +will be a unique thread identifier among all the threads created by +the OpenMP runtime (but the value cannot be defined in terms of +OpenMP thread ids returned by omp_get_thread_num()). +*/ +kmp_int32 __kmpc_global_thread_num(ident_t *loc) { + kmp_int32 gtid = __kmp_entry_gtid(); + + KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid)); + + return gtid; +} + +/*! +@ingroup THREAD_STATES +@param loc Source location information. +@return The number of threads under control of the OpenMP* runtime + +This function can be called in any context. +It returns the total number of threads under the control of the OpenMP runtime. +That is not a number that can be determined by any OpenMP standard calls, since +the library may be called from more than one non-OpenMP thread, and this +reflects the total over all such calls. Similarly the runtime maintains +underlying threads even when they are not active (since the cost of creating +and destroying OS threads is high), this call counts all such threads even if +they are not waiting for work. +*/ +kmp_int32 __kmpc_global_num_threads(ident_t *loc) { + KC_TRACE(10, + ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth)); + + return TCR_4(__kmp_all_nth); +} + +/*! +@ingroup THREAD_STATES +@param loc Source location information. +@return The thread number of the calling thread in the innermost active parallel +construct. +*/ +kmp_int32 __kmpc_bound_thread_num(ident_t *loc) { + KC_TRACE(10, ("__kmpc_bound_thread_num: called\n")); + return __kmp_tid_from_gtid(__kmp_entry_gtid()); +} + +/*! +@ingroup THREAD_STATES +@param loc Source location information. +@return The number of threads in the innermost active parallel construct. 
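+
+For illustration only (not from the upstream docs): called from inside a
+parallel region that is running with four threads,
+@code
+kmp_int32 nth = __kmpc_bound_num_threads(&loc);
+@endcode
+returns 4, matching what omp_get_num_threads() reports at that point.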
+*/ +kmp_int32 __kmpc_bound_num_threads(ident_t *loc) { + KC_TRACE(10, ("__kmpc_bound_num_threads: called\n")); + + return __kmp_entry_thread()->th.th_team->t.t_nproc; +} + +/*! + * @ingroup DEPRECATED + * @param loc location description + * + * This function need not be called. It always returns TRUE. + */ +kmp_int32 __kmpc_ok_to_fork(ident_t *loc) { +#ifndef KMP_DEBUG + + return TRUE; + +#else + + const char *semi2; + const char *semi3; + int line_no; + + if (__kmp_par_range == 0) { + return TRUE; + } + semi2 = loc->psource; + if (semi2 == NULL) { + return TRUE; + } + semi2 = strchr(semi2, ';'); + if (semi2 == NULL) { + return TRUE; + } + semi2 = strchr(semi2 + 1, ';'); + if (semi2 == NULL) { + return TRUE; + } + if (__kmp_par_range_filename[0]) { + const char *name = semi2 - 1; + while ((name > loc->psource) && (*name != '/') && (*name != ';')) { + name--; + } + if ((*name == '/') || (*name == ';')) { + name++; + } + if (strncmp(__kmp_par_range_filename, name, semi2 - name)) { + return __kmp_par_range < 0; + } + } + semi3 = strchr(semi2 + 1, ';'); + if (__kmp_par_range_routine[0]) { + if ((semi3 != NULL) && (semi3 > semi2) && + (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) { + return __kmp_par_range < 0; + } + } + if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) { + if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) { + return __kmp_par_range > 0; + } + return __kmp_par_range < 0; + } + return TRUE; + +#endif /* KMP_DEBUG */ +} + +/*! +@ingroup THREAD_STATES +@param loc Source location information. +@return 1 if this thread is executing inside an active parallel region, zero if +not. +*/ +kmp_int32 __kmpc_in_parallel(ident_t *loc) { + return __kmp_entry_thread()->th.th_root->r.r_active; +} + +/*! +@ingroup PARALLEL +@param loc source location information +@param global_tid global thread number +@param num_threads number of threads requested for this parallel construct + +Set the number of threads to be used by the next fork spawned by this thread. +This call is only required if the parallel construct has a `num_threads` clause. +*/ +void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, + kmp_int32 num_threads) { + KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n", + global_tid, num_threads)); + __kmp_assert_valid_gtid(global_tid); + __kmp_push_num_threads(loc, global_tid, num_threads); +} + +void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) { + KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n")); + /* the num_threads are automatically popped */ +} + +void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, + kmp_int32 proc_bind) { + KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid, + proc_bind)); + __kmp_assert_valid_gtid(global_tid); + __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind); +} + +/*! +@ingroup PARALLEL +@param loc source location information +@param argc total number of arguments in the ellipsis +@param microtask pointer to callback routine consisting of outlined parallel +construct +@param ... pointers to shared variables that aren't global + +Do the actual fork and call the microtask in the relevant number of threads. +*/ +void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 
{ + int gtid = __kmp_entry_gtid(); + +#if (KMP_STATS_ENABLED) + // If we were in a serial region, then stop the serial timer, record + // the event, and start parallel region timer + stats_state_e previous_state = KMP_GET_THREAD_STATE(); + if (previous_state == stats_state_e::SERIAL_REGION) { + KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead); + } else { + KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead); + } + int inParallel = __kmpc_in_parallel(loc); + if (inParallel) { + KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL); + } else { + KMP_COUNT_BLOCK(OMP_PARALLEL); + } +#endif + + // maybe to save thr_state is enough here + { + va_list ap; + va_start(ap, microtask); + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + kmp_info_t *master_th = __kmp_threads[gtid]; + ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame; + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + +#if INCLUDE_SSC_MARKS + SSC_MARK_FORKING(); +#endif + __kmp_fork_call(loc, gtid, fork_context_intel, argc, + VOLATILE_CAST(microtask_t) microtask, // "wrapped" task + VOLATILE_CAST(launch_t) __kmp_invoke_task_func, + kmp_va_addr_of(ap)); +#if INCLUDE_SSC_MARKS + SSC_MARK_JOINING(); +#endif + __kmp_join_call(loc, gtid +#if OMPT_SUPPORT + , + fork_context_intel +#endif + ); + + va_end(ap); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + } + +#if KMP_STATS_ENABLED + if (previous_state == stats_state_e::SERIAL_REGION) { + KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); + KMP_SET_THREAD_STATE(previous_state); + } else { + KMP_POP_PARTITIONED_TIMER(); + } +#endif // KMP_STATS_ENABLED +} + +/*! +@ingroup PARALLEL +@param loc source location information +@param microtask pointer to callback routine consisting of outlined parallel +construct +@param cond condition for running in parallel +@param args struct of pointers to shared variables that aren't global + +Perform a fork only if the condition is true. +*/ +void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, + kmp_int32 cond, void *args) { + int gtid = __kmp_entry_gtid(); + if (cond) { + if (args) + __kmpc_fork_call(loc, argc, microtask, args); + else + __kmpc_fork_call(loc, argc, microtask); + } else { + __kmpc_serialized_parallel(loc, gtid); + +#if OMPT_SUPPORT + void *exit_frame_ptr; +#endif + + if (args) + __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid, + /*npr=*/0, + /*argc=*/1, &args +#if OMPT_SUPPORT + , + &exit_frame_ptr +#endif + ); + else + __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid, + /*npr=*/0, + /*argc=*/0, + /*args=*/nullptr +#if OMPT_SUPPORT + , + &exit_frame_ptr +#endif + ); + + __kmpc_end_serialized_parallel(loc, gtid); + } +} + +/*! +@ingroup PARALLEL +@param loc source location information +@param global_tid global thread number +@param num_teams number of teams requested for the teams construct +@param num_threads number of threads per team requested for the teams construct + +Set the number of teams to be used by the teams construct. +This call is only required if the teams construct has a `num_teams` clause +or a `thread_limit` clause (or both). 
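+
+Illustrative sketch (not normative): for a construct such as
+@code
+#pragma omp teams num_teams(4) thread_limit(8)
+@endcode
+a compiler would typically emit `__kmpc_push_num_teams(&loc, gtid, 4, 8);`
+immediately before the corresponding `__kmpc_fork_teams(...)` call.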
+*/
+void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
+                           kmp_int32 num_teams, kmp_int32 num_threads) {
+  KA_TRACE(20,
+           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
+            global_tid, num_teams, num_threads));
+  __kmp_assert_valid_gtid(global_tid);
+  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param thread_limit limit on number of threads which can be created within the
+current task
+
+Set the thread_limit for the current task.
+This call is there to support the `thread_limit` clause on the `target` construct.
+*/
+void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
+                             kmp_int32 thread_limit) {
+  __kmp_assert_valid_gtid(global_tid);
+  kmp_info_t *thread = __kmp_threads[global_tid];
+  if (thread_limit > 0)
+    thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_teams_lb lower bound on number of teams requested for the teams
+construct
+@param num_teams_ub upper bound on number of teams requested for the teams
+construct
+@param num_threads number of threads per team requested for the teams construct
+
+Set the number of teams to be used by the teams construct. The number of initial
+teams created will be greater than or equal to the lower bound and less than or
+equal to the upper bound.
+This call is only required if the teams construct has a `num_teams` clause
+or a `thread_limit` clause (or both).
+*/
+void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
+                              kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
+                              kmp_int32 num_threads) {
+  KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
+                " num_teams_ub=%d num_threads=%d\n",
+                global_tid, num_teams_lb, num_teams_ub, num_threads));
+  __kmp_assert_valid_gtid(global_tid);
+  __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
+                          num_threads);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param argc total number of arguments in the ellipsis
+@param microtask pointer to callback routine consisting of outlined teams
+construct
+@param ... pointers to shared variables that aren't global
+
+Do the actual fork and call the microtask in the relevant number of threads.
+*/
+void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
+                       ...)
{ + int gtid = __kmp_entry_gtid(); + kmp_info_t *this_thr = __kmp_threads[gtid]; + va_list ap; + va_start(ap, microtask); + +#if KMP_STATS_ENABLED + KMP_COUNT_BLOCK(OMP_TEAMS); + stats_state_e previous_state = KMP_GET_THREAD_STATE(); + if (previous_state == stats_state_e::SERIAL_REGION) { + KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead); + } else { + KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead); + } +#endif + + // remember teams entry point and nesting level + this_thr->th.th_teams_microtask = microtask; + this_thr->th.th_teams_level = + this_thr->th.th_team->t.t_level; // AC: can be >0 on host + +#if OMPT_SUPPORT + kmp_team_t *parent_team = this_thr->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + if (ompt_enabled.enabled) { + parent_team->t.t_implicit_task_taskdata[tid] + .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + // check if __kmpc_push_num_teams called, set default number of teams + // otherwise + if (this_thr->th.th_teams_size.nteams == 0) { + __kmp_push_num_teams(loc, gtid, 0, 0); + } + KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1); + KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); + KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); + + __kmp_fork_call( + loc, gtid, fork_context_intel, argc, + VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task + VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap)); + __kmp_join_call(loc, gtid +#if OMPT_SUPPORT + , + fork_context_intel +#endif + ); + + // Pop current CG root off list + KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots); + kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; + this_thr->th.th_cg_roots = tmp->up; + KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up" + " to node %p. cg_nthreads was %d\n", + this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads)); + KMP_DEBUG_ASSERT(tmp->cg_nthreads); + int i = tmp->cg_nthreads--; + if (i == 1) { // check is we are the last thread in CG (not always the case) + __kmp_free(tmp); + } + // Restore current task's thread_limit from CG root + KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots); + this_thr->th.th_current_task->td_icvs.thread_limit = + this_thr->th.th_cg_roots->cg_thread_limit; + + this_thr->th.th_teams_microtask = NULL; + this_thr->th.th_teams_level = 0; + *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L; + va_end(ap); +#if KMP_STATS_ENABLED + if (previous_state == stats_state_e::SERIAL_REGION) { + KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial); + KMP_SET_THREAD_STATE(previous_state); + } else { + KMP_POP_PARTITIONED_TIMER(); + } +#endif // KMP_STATS_ENABLED +} + +// I don't think this function should ever have been exported. +// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated +// openmp code ever called it, but it's been exported from the RTL for so +// long that I'm afraid to remove the definition. +int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); } + +/*! +@ingroup PARALLEL +@param loc source location information +@param global_tid global thread number + +Enter a serialized parallel construct. This interface is used to handle a +conditional parallel region, like this, +@code +#pragma omp parallel if (condition) +@endcode +when the condition is false. +*/ +void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { + // The implementation is now in kmp_runtime.cpp so that it can share static + // functions with kmp_fork_call since the tasks to be done are similar in + // each case. 
+ __kmp_assert_valid_gtid(global_tid); +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif + __kmp_serialized_parallel(loc, global_tid); +} + +/*! +@ingroup PARALLEL +@param loc source location information +@param global_tid global thread number + +Leave a serialized parallel construct. +*/ +void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { + kmp_internal_control_t *top; + kmp_info_t *this_thr; + kmp_team_t *serial_team; + + KC_TRACE(10, + ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid)); + + /* skip all this code for autopar serialized loops since it results in + unacceptable overhead */ + if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) + return; + + // Not autopar code + __kmp_assert_valid_gtid(global_tid); + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + + this_thr = __kmp_threads[global_tid]; + serial_team = this_thr->th.th_serial_team; + + kmp_task_team_t *task_team = this_thr->th.th_task_team; + // we need to wait for the proxy tasks before finishing the thread + if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)) + __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL)); + + KMP_MB(); + KMP_DEBUG_ASSERT(serial_team); + KMP_ASSERT(serial_team->t.t_serialized); + KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); + KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team); + KMP_DEBUG_ASSERT(serial_team->t.t_threads); + KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled && + this_thr->th.ompt_thread_info.state != ompt_state_overhead) { + OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1, + OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit); + } + + // reset clear the task id only after unlinking the task + ompt_data_t *parent_task_data; + __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL); + + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + &(serial_team->t.ompt_team_info.parallel_data), parent_task_data, + ompt_parallel_invoker_program | ompt_parallel_team, + OMPT_LOAD_RETURN_ADDRESS(global_tid)); + } + __ompt_lw_taskteam_unlink(this_thr); + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + + /* If necessary, pop the internal control stack values and replace the team + * values */ + top = serial_team->t.t_control_stack_top; + if (top && top->serial_nesting_level == serial_team->t.t_serialized) { + copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top); + serial_team->t.t_control_stack_top = top->next; + __kmp_free(top); + } + + /* pop dispatch buffers stack */ + KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); + { + dispatch_private_info_t *disp_buffer = + serial_team->t.t_dispatch->th_disp_buffer; + serial_team->t.t_dispatch->th_disp_buffer = + serial_team->t.t_dispatch->th_disp_buffer->next; + __kmp_free(disp_buffer); + } + this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore + + --serial_team->t.t_serialized; + if (serial_team->t.t_serialized == 0) { + + /* return to the parallel section */ + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_inherit_fp_control && 
serial_team->t.t_fp_control_saved) { + __kmp_clear_x87_fpu_status_word(); + __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word); + __kmp_load_mxcsr(&serial_team->t.t_mxcsr); + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + + __kmp_pop_current_task_from_thread(this_thr); +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_parallel_end(); +#endif + + this_thr->th.th_team = serial_team->t.t_parent; + this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid; + + /* restore values cached in the thread */ + this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */ + this_thr->th.th_team_master = + serial_team->t.t_parent->t.t_threads[0]; /* JPH */ + this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized; + + /* TODO the below shouldn't need to be adjusted for serialized teams */ + this_thr->th.th_dispatch = + &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid]; + + KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0); + this_thr->th.th_current_task->td_flags.executing = 1; + + if (__kmp_tasking_mode != tskm_immediate_exec) { + // Copy the task team from the new child / old parent team to the thread. + this_thr->th.th_task_team = + this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; + KA_TRACE(20, + ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / " + "team %p\n", + global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); + } +#if KMP_AFFINITY_SUPPORTED + if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) { + __kmp_reset_root_init_mask(global_tid); + } +#endif + } else { + if (__kmp_tasking_mode != tskm_immediate_exec) { + KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting " + "depth of serial team %p to %d\n", + global_tid, serial_team, serial_team->t.t_serialized)); + } + } + + serial_team->t.t_level--; + if (__kmp_env_consistency_check) + __kmp_pop_parallel(global_tid, NULL); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) + this_thr->th.ompt_thread_info.state = + ((this_thr->th.th_team_serialized) ? ompt_state_work_serial + : ompt_state_work_parallel); +#endif +} + +/*! +@ingroup SYNCHRONIZATION +@param loc source location information. + +Execute flush. This is implemented as a full memory fence. (Though +depending on the memory ordering convention obeyed by the compiler +even that may not be necessary). +*/ +void __kmpc_flush(ident_t *loc) { + KC_TRACE(10, ("__kmpc_flush: called\n")); + + /* need explicit __mf() here since use volatile instead in library */ + KMP_MFENCE(); /* Flush all pending memory write invalidates. */ + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_flush) { + ompt_callbacks.ompt_callback(ompt_callback_flush)( + __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0)); + } +#endif +} + +/* -------------------------------------------------------------------------- */ +/*! +@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid thread id. + +Execute a barrier. +*/ +void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) { + KMP_COUNT_BLOCK(OMP_BARRIER); + KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + + if (__kmp_env_consistency_check) { + if (loc == 0) { + KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user? 
+ } + __kmp_check_barrier(global_tid, ct_barrier, loc); + } + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif + __kmp_threads[global_tid]->th.th_ident = loc; + // TODO: explicit barrier_wait_id: + // this function is called when 'barrier' directive is present or + // implicit barrier at the end of a worksharing construct. + // 1) better to add a per-thread barrier counter to a thread data structure + // 2) set to 0 when a new team is created + // 4) no sync is required + + __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif +} + +/* The BARRIER for a MASTER section is always explicit */ +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param global_tid global thread number . +@return 1 if this thread should execute the master block, 0 otherwise. +*/ +kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { + int status = 0; + + KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + + if (KMP_MASTER_GTID(global_tid)) { + KMP_COUNT_BLOCK(OMP_MASTER); + KMP_PUSH_PARTITIONED_TIMER(OMP_master); + status = 1; + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (status) { + if (ompt_enabled.ompt_callback_masked) { + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + + int tid = __kmp_tid_from_gtid(global_tid); + ompt_callbacks.ompt_callback(ompt_callback_masked)( + ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + OMPT_GET_RETURN_ADDRESS(0)); + } + } +#endif + + if (__kmp_env_consistency_check) { +#if KMP_USE_DYNAMIC_LOCK + if (status) + __kmp_push_sync(global_tid, ct_master, loc, NULL, 0); + else + __kmp_check_sync(global_tid, ct_master, loc, NULL, 0); +#else + if (status) + __kmp_push_sync(global_tid, ct_master, loc, NULL); + else + __kmp_check_sync(global_tid, ct_master, loc, NULL); +#endif + } + + return status; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param global_tid global thread number . + +Mark the end of a master region. This should only be called by the +thread that executes the master region. +*/ +void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { + KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid)); + KMP_POP_PARTITIONED_TIMER(); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + if (ompt_enabled.ompt_callback_masked) { + int tid = __kmp_tid_from_gtid(global_tid); + ompt_callbacks.ompt_callback(ompt_callback_masked)( + ompt_scope_end, &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + + if (__kmp_env_consistency_check) { + if (KMP_MASTER_GTID(global_tid)) + __kmp_pop_sync(global_tid, ct_master, loc); + } +} + +/*! +@ingroup WORK_SHARING +@param loc source location information. 
+@param global_tid global thread number. +@param filter result of evaluating filter clause on thread global_tid, or zero +if no filter clause present +@return 1 if this thread should execute the masked block, 0 otherwise. +*/ +kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) { + int status = 0; + int tid; + KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + + tid = __kmp_tid_from_gtid(global_tid); + if (tid == filter) { + KMP_COUNT_BLOCK(OMP_MASKED); + KMP_PUSH_PARTITIONED_TIMER(OMP_masked); + status = 1; + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (status) { + if (ompt_enabled.ompt_callback_masked) { + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + ompt_callbacks.ompt_callback(ompt_callback_masked)( + ompt_scope_begin, &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + OMPT_GET_RETURN_ADDRESS(0)); + } + } +#endif + + if (__kmp_env_consistency_check) { +#if KMP_USE_DYNAMIC_LOCK + if (status) + __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0); + else + __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0); +#else + if (status) + __kmp_push_sync(global_tid, ct_masked, loc, NULL); + else + __kmp_check_sync(global_tid, ct_masked, loc, NULL); +#endif + } + + return status; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param global_tid global thread number . + +Mark the end of a masked region. This should only be called by the +thread that executes the masked region. +*/ +void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) { + KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + KMP_POP_PARTITIONED_TIMER(); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + if (ompt_enabled.ompt_callback_masked) { + int tid = __kmp_tid_from_gtid(global_tid); + ompt_callbacks.ompt_callback(ompt_callback_masked)( + ompt_scope_end, &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + + if (__kmp_env_consistency_check) { + __kmp_pop_sync(global_tid, ct_masked, loc); + } +} + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param gtid global thread number. + +Start execution of an ordered construct. 
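+
+Illustrative sketch (not from the upstream docs): for a loop such as
+@code
+#pragma omp for ordered
+for (int i = 0; i < n; i++) {
+#pragma omp ordered
+  { body(i); }
+}
+@endcode
+the compiler brackets the ordered block with __kmpc_ordered(&loc, gtid) and
+__kmpc_end_ordered(&loc, gtid), so the blocks run in iteration order.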
+*/
+void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
+  int cid = 0;
+  kmp_info_t *th;
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
+  __kmp_assert_valid_gtid(gtid);
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+#if USE_ITT_BUILD
+  __kmp_itt_ordered_prep(gtid);
+// TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+  th = __kmp_threads[gtid];
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_team_t *team;
+  ompt_wait_id_t lck;
+  void *codeptr_ra;
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+  if (ompt_enabled.enabled) {
+    team = __kmp_team_from_gtid(gtid);
+    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
+    /* OMPT state update */
+    th->th.ompt_thread_info.wait_id = lck;
+    th->th.ompt_thread_info.state = ompt_state_wait_ordered;
+
+    /* OMPT event callback */
+    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (ompt_enabled.ompt_callback_mutex_acquire) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
+          codeptr_ra);
+    }
+  }
+#endif
+
+  if (th->th.th_dispatch->th_deo_fcn != 0)
+    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
+  else
+    __kmp_parallel_deo(&gtid, &cid, loc);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    /* OMPT state update */
+    th->th.ompt_thread_info.state = ompt_state_work_parallel;
+    th->th.ompt_thread_info.wait_id = 0;
+
+    /* OMPT event callback */
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
+    }
+  }
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_ordered_start(gtid);
+#endif /* USE_ITT_BUILD */
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param gtid global thread number.
+
+End execution of an ordered construct.
+*/
+void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
+  int cid = 0;
+  kmp_info_t *th;
+
+  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
+  __kmp_assert_valid_gtid(gtid);
+
+#if USE_ITT_BUILD
+  __kmp_itt_ordered_end(gtid);
+// TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+  th = __kmp_threads[gtid];
+
+  if (th->th.th_dispatch->th_dxo_fcn != 0)
+    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
+  else
+    __kmp_parallel_dxo(&gtid, &cid, loc);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_ordered,
+        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
+            ->t.t_ordered.dt.t_value,
+        OMPT_LOAD_RETURN_ADDRESS(gtid));
+  }
+#endif
+}
+
+#if KMP_USE_DYNAMIC_LOCK
+
+static __forceinline void
+__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
+                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
+  // Pointer to the allocated indirect lock is written to crit, while indexing
+  // is ignored.
+  void *idx;
+  kmp_indirect_lock_t **lck;
+  lck = (kmp_indirect_lock_t **)crit;
+  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
+  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
+  KMP_SET_I_LOCK_LOCATION(ilk, loc);
+  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
+  KA_TRACE(20,
+           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
+#if USE_ITT_BUILD
+  __kmp_itt_critical_creating(ilk->lock, loc);
+#endif
+  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
+  if (status == 0) {
+#if USE_ITT_BUILD
+    __kmp_itt_critical_destroyed(ilk->lock);
+#endif
+    // We don't really need to destroy the unclaimed lock here since it will be
+    // cleaned up at program exit.
+    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
+  }
+  KMP_DEBUG_ASSERT(*lck != NULL);
+}
+
+// Fast-path acquire tas lock
+#define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
+  { \
+    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
+    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
+        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
+      kmp_uint32 spins; \
+      KMP_FSYNC_PREPARE(l); \
+      KMP_INIT_YIELD(spins); \
+      kmp_backoff_t backoff = __kmp_spin_backoff_params; \
+      do { \
+        if (TCR_4(__kmp_nth) > \
+            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
+          KMP_YIELD(TRUE); \
+        } else { \
+          KMP_YIELD_SPIN(spins); \
+        } \
+        __kmp_spin_backoff(&backoff); \
+      } while ( \
+          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
+          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \
+    } \
+    KMP_FSYNC_ACQUIRED(l); \
+  }
+
+// Fast-path test tas lock
+#define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
+  { \
+    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
+    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
+         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
+  }
+
+// Fast-path release tas lock
+#define KMP_RELEASE_TAS_LOCK(lock, gtid) \
+  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
+
+#if KMP_USE_FUTEX
+
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifndef FUTEX_WAIT
+#define FUTEX_WAIT 0
+#endif
+#ifndef FUTEX_WAKE
+#define FUTEX_WAKE 1
+#endif
+
+// Fast-path acquire futex lock
+#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
+  { \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
+    kmp_int32 gtid_code = (gtid + 1) << 1; \
+    KMP_MB(); \
+    KMP_FSYNC_PREPARE(ftx); \
+    kmp_int32 poll_val; \
+    while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
+                &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
+                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
+      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
+      if (!cond) { \
+        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
+                                         poll_val | \
+                                             KMP_LOCK_BUSY(1, futex))) { \
+          continue; \
+        } \
+        poll_val |= KMP_LOCK_BUSY(1, futex); \
+      } \
+      kmp_int32 rc; \
+      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
+                        NULL, NULL, 0)) != 0) { \
+        continue; \
+      } \
+      gtid_code |= 1; \
+    } \
+    KMP_FSYNC_ACQUIRED(ftx); \
+  }
+
+// Fast-path test futex lock
+#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
+  { \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
+    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
+                                    KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \
+      KMP_FSYNC_ACQUIRED(ftx); \
+      rc = TRUE; \
+    } else { \
+      rc = FALSE; \
+    } \
+  }
+
+// Fast-path release futex
lock +#define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \ + { \ + kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ + KMP_MB(); \ + KMP_FSYNC_RELEASING(ftx); \ + kmp_int32 poll_val = \ + KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ + if (KMP_LOCK_STRIP(poll_val) & 1) { \ + syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \ + KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ + } \ + KMP_MB(); \ + KMP_YIELD_OVERSUB(); \ + } + +#endif // KMP_USE_FUTEX + +#else // KMP_USE_DYNAMIC_LOCK + +static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit, + ident_t const *loc, + kmp_int32 gtid) { + kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; + + // Because of the double-check, the following load doesn't need to be volatile + kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); + + if (lck == NULL) { + void *idx; + + // Allocate & initialize the lock. + // Remember alloc'ed locks in table in order to free them in __kmp_cleanup() + lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section); + __kmp_init_user_lock_with_checks(lck); + __kmp_set_user_lock_location(lck, loc); +#if USE_ITT_BUILD + __kmp_itt_critical_creating(lck); +// __kmp_itt_critical_creating() should be called *before* the first usage +// of underlying lock. It is the only place where we can guarantee it. There +// are chances the lock will destroyed with no usage, but it is not a +// problem, because this is not real event seen by user but rather setting +// name for object (lock). See more details in kmp_itt.h. +#endif /* USE_ITT_BUILD */ + + // Use a cmpxchg instruction to slam the start of the critical section with + // the lock pointer. If another thread beat us to it, deallocate the lock, + // and use the lock that the other thread allocated. + int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck); + + if (status == 0) { +// Deallocate the lock and reload the value. +#if USE_ITT_BUILD + __kmp_itt_critical_destroyed(lck); +// Let ITT know the lock is destroyed and the same memory location may be reused +// for another purpose. +#endif /* USE_ITT_BUILD */ + __kmp_destroy_user_lock_with_checks(lck); + __kmp_user_lock_free(&idx, gtid, lck); + lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); + KMP_DEBUG_ASSERT(lck != NULL); + } + } + return lck; +} + +#endif // KMP_USE_DYNAMIC_LOCK + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param global_tid global thread number. +@param crit identity of the critical section. This could be a pointer to a lock +associated with the critical section, or some other suitably unique value. + +Enter code protected by a `critical` construct. +This function blocks until the executing thread can enter the critical section. 
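+
+As an illustrative sketch (not taken from the upstream sources), this entry
+point and @ref __kmpc_end_critical are what the compiler emits around an OpenMP
+`critical` region such as:
+@code
+#include <stdio.h>
+int main(void) {
+  int sum = 0;
+  #pragma omp parallel for
+  for (int i = 0; i < 100; ++i) {
+    // The compiler wraps the update below in __kmpc_critical() and
+    // __kmpc_end_critical(), keyed by a static kmp_critical_name object.
+    #pragma omp critical
+    sum += i;
+  }
+  printf("sum=%d\n", sum);
+  return 0;
+}
+@endcode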
+*/ +void __kmpc_critical(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *crit) { +#if KMP_USE_DYNAMIC_LOCK +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif // OMPT_SUPPORT + __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); +#else + KMP_COUNT_BLOCK(OMP_CRITICAL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_state_t prev_state = ompt_state_undefined; + ompt_thread_info_t ti; +#endif + kmp_user_lock_p lck; + + KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + // TODO: add THR_OVHD_STATE + + KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); + KMP_CHECK_USER_LOCK_INIT(); + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { + lck = (kmp_user_lock_p)crit; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { + lck = (kmp_user_lock_p)crit; + } +#endif + else { // ticket, queuing or drdpa + lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); + } + + if (__kmp_env_consistency_check) + __kmp_push_sync(global_tid, ct_critical, loc, lck); + + // since the critical directive binds to all threads, not just the current + // team we have to check this even if we are in a serialized team. + // also, even if we are the uber thread, we still have to conduct the lock, + // as we have to contend with sibling threads. + +#if USE_ITT_BUILD + __kmp_itt_critical_acquiring(lck); +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); + void *codeptr_ra = NULL; + if (ompt_enabled.enabled) { + ti = __kmp_threads[global_tid]->th.ompt_thread_info; + /* OMPT state update */ + prev_state = ti.state; + ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck; + ti.state = ompt_state_wait_critical; + + /* OMPT event callback */ + codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(), + (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra); + } + } +#endif + // Value of 'crit' should be good for using as a critical_id of the critical + // section directive. 
+ __kmp_acquire_user_lock_with_checks(lck, global_tid); + +#if USE_ITT_BUILD + __kmp_itt_critical_acquired(lck); +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + /* OMPT state update */ + ti.state = prev_state; + ti.wait_id = 0; + + /* OMPT event callback */ + if (ompt_enabled.ompt_callback_mutex_acquired) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra); + } + } +#endif + KMP_POP_PARTITIONED_TIMER(); + + KMP_PUSH_PARTITIONED_TIMER(OMP_critical); + KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); +#endif // KMP_USE_DYNAMIC_LOCK +} + +#if KMP_USE_DYNAMIC_LOCK + +// Converts the given hint to an internal lock implementation +static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { +#if KMP_USE_TSX +#define KMP_TSX_LOCK(seq) lockseq_##seq +#else +#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq +#endif + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm) +#else +#define KMP_CPUINFO_RTM 0 +#endif + + // Hints that do not require further logic + if (hint & kmp_lock_hint_hle) + return KMP_TSX_LOCK(hle); + if (hint & kmp_lock_hint_rtm) + return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq; + if (hint & kmp_lock_hint_adaptive) + return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq; + + // Rule out conflicting hints first by returning the default lock + if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended)) + return __kmp_user_lock_seq; + if ((hint & omp_lock_hint_speculative) && + (hint & omp_lock_hint_nonspeculative)) + return __kmp_user_lock_seq; + + // Do not even consider speculation when it appears to be contended + if (hint & omp_lock_hint_contended) + return lockseq_queuing; + + // Uncontended lock without speculation + if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative)) + return lockseq_tas; + + // Use RTM lock for speculation + if (hint & omp_lock_hint_speculative) + return KMP_CPUINFO_RTM ? 
KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq; + + return __kmp_user_lock_seq; +} + +#if OMPT_SUPPORT && OMPT_OPTIONAL +#if KMP_USE_DYNAMIC_LOCK +static kmp_mutex_impl_t +__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) { + if (user_lock) { + switch (KMP_EXTRACT_D_TAG(user_lock)) { + case 0: + break; +#if KMP_USE_FUTEX + case locktag_futex: + return kmp_mutex_impl_queuing; +#endif + case locktag_tas: + return kmp_mutex_impl_spin; +#if KMP_USE_TSX + case locktag_hle: + case locktag_rtm_spin: + return kmp_mutex_impl_speculative; +#endif + default: + return kmp_mutex_impl_none; + } + ilock = KMP_LOOKUP_I_LOCK(user_lock); + } + KMP_ASSERT(ilock); + switch (ilock->type) { +#if KMP_USE_TSX + case locktag_adaptive: + case locktag_rtm_queuing: + return kmp_mutex_impl_speculative; +#endif + case locktag_nested_tas: + return kmp_mutex_impl_spin; +#if KMP_USE_FUTEX + case locktag_nested_futex: +#endif + case locktag_ticket: + case locktag_queuing: + case locktag_drdpa: + case locktag_nested_ticket: + case locktag_nested_queuing: + case locktag_nested_drdpa: + return kmp_mutex_impl_queuing; + default: + return kmp_mutex_impl_none; + } +} +#else +// For locks without dynamic binding +static kmp_mutex_impl_t __ompt_get_mutex_impl_type() { + switch (__kmp_user_lock_kind) { + case lk_tas: + return kmp_mutex_impl_spin; +#if KMP_USE_FUTEX + case lk_futex: +#endif + case lk_ticket: + case lk_queuing: + case lk_drdpa: + return kmp_mutex_impl_queuing; +#if KMP_USE_TSX + case lk_hle: + case lk_rtm_queuing: + case lk_rtm_spin: + case lk_adaptive: + return kmp_mutex_impl_speculative; +#endif + default: + return kmp_mutex_impl_none; + } +} +#endif // KMP_USE_DYNAMIC_LOCK +#endif // OMPT_SUPPORT && OMPT_OPTIONAL + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param global_tid global thread number. +@param crit identity of the critical section. This could be a pointer to a lock +associated with the critical section, or some other suitably unique value. +@param hint the lock hint. + +Enter code protected by a `critical` construct with a hint. The hint value is +used to suggest a lock implementation. This function blocks until the executing +thread can enter the critical section unless the hint suggests use of +speculative execution and the hardware supports it. +*/ +void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *crit, uint32_t hint) { + KMP_COUNT_BLOCK(OMP_CRITICAL); + kmp_user_lock_p lck; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_state_t prev_state = ompt_state_undefined; + ompt_thread_info_t ti; + // This is the case, if called from __kmpc_critical: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); +#endif + + KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; + // Check if it is initialized. + KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); + kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint); + if (*lk == 0) { + if (KMP_IS_D_LOCK(lockseq)) { + KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, + KMP_GET_D_TAG(lockseq)); + } else { + __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq)); + } + } + // Branch for accessing the actual lock object and set operation. This + // branching is inevitable since this lock initialization does not follow the + // normal dispatch path (lock table is not used). 
+ if (KMP_EXTRACT_D_TAG(lk) != 0) { + lck = (kmp_user_lock_p)lk; + if (__kmp_env_consistency_check) { + __kmp_push_sync(global_tid, ct_critical, loc, lck, + __kmp_map_hint_to_lock(hint)); + } +#if USE_ITT_BUILD + __kmp_itt_critical_acquiring(lck); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ti = __kmp_threads[global_tid]->th.ompt_thread_info; + /* OMPT state update */ + prev_state = ti.state; + ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck; + ti.state = ompt_state_wait_critical; + + /* OMPT event callback */ + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_critical, (unsigned int)hint, + __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck, + codeptr); + } + } +#endif +#if KMP_USE_INLINED_TAS + if (lockseq == lockseq_tas && !__kmp_env_consistency_check) { + KMP_ACQUIRE_TAS_LOCK(lck, global_tid); + } else +#elif KMP_USE_INLINED_FUTEX + if (lockseq == lockseq_futex && !__kmp_env_consistency_check) { + KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid); + } else +#endif + { + KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); + } + } else { + kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); + lck = ilk->lock; + if (__kmp_env_consistency_check) { + __kmp_push_sync(global_tid, ct_critical, loc, lck, + __kmp_map_hint_to_lock(hint)); + } +#if USE_ITT_BUILD + __kmp_itt_critical_acquiring(lck); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ti = __kmp_threads[global_tid]->th.ompt_thread_info; + /* OMPT state update */ + prev_state = ti.state; + ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck; + ti.state = ompt_state_wait_critical; + + /* OMPT event callback */ + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_critical, (unsigned int)hint, + __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck, + codeptr); + } + } +#endif + KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); + } + KMP_POP_PARTITIONED_TIMER(); + +#if USE_ITT_BUILD + __kmp_itt_critical_acquired(lck); +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + /* OMPT state update */ + ti.state = prev_state; + ti.wait_id = 0; + + /* OMPT event callback */ + if (ompt_enabled.ompt_callback_mutex_acquired) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } +#endif + + KMP_PUSH_PARTITIONED_TIMER(OMP_critical); + KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); +} // __kmpc_critical_with_hint + +#endif // KMP_USE_DYNAMIC_LOCK + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param global_tid global thread number . +@param crit identity of the critical section. This could be a pointer to a lock +associated with the critical section, or some other suitably unique value. + +Leave a critical section, releasing any lock that was held during its execution. 
+*/ +void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *crit) { + kmp_user_lock_p lck; + + KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid)); + +#if KMP_USE_DYNAMIC_LOCK + int locktag = KMP_EXTRACT_D_TAG(crit); + if (locktag) { + lck = (kmp_user_lock_p)crit; + KMP_ASSERT(lck != NULL); + if (__kmp_env_consistency_check) { + __kmp_pop_sync(global_tid, ct_critical, loc); + } +#if USE_ITT_BUILD + __kmp_itt_critical_releasing(lck); +#endif +#if KMP_USE_INLINED_TAS + if (locktag == locktag_tas && !__kmp_env_consistency_check) { + KMP_RELEASE_TAS_LOCK(lck, global_tid); + } else +#elif KMP_USE_INLINED_FUTEX + if (locktag == locktag_futex && !__kmp_env_consistency_check) { + KMP_RELEASE_FUTEX_LOCK(lck, global_tid); + } else +#endif + { + KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); + } + } else { + kmp_indirect_lock_t *ilk = + (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); + KMP_ASSERT(ilk != NULL); + lck = ilk->lock; + if (__kmp_env_consistency_check) { + __kmp_pop_sync(global_tid, ct_critical, loc); + } +#if USE_ITT_BUILD + __kmp_itt_critical_releasing(lck); +#endif + KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid); + } + +#else // KMP_USE_DYNAMIC_LOCK + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { + lck = (kmp_user_lock_p)crit; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { + lck = (kmp_user_lock_p)crit; + } +#endif + else { // ticket, queuing or drdpa + lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit)); + } + + KMP_ASSERT(lck != NULL); + + if (__kmp_env_consistency_check) + __kmp_pop_sync(global_tid, ct_critical, loc); + +#if USE_ITT_BUILD + __kmp_itt_critical_releasing(lck); +#endif /* USE_ITT_BUILD */ + // Value of 'crit' should be good for using as a critical_id of the critical + // section directive. + __kmp_release_user_lock_with_checks(lck, global_tid); + +#endif // KMP_USE_DYNAMIC_LOCK + +#if OMPT_SUPPORT && OMPT_OPTIONAL + /* OMPT release event triggers after lock is released; place here to trigger + * for all #if branches */ + OMPT_STORE_RETURN_ADDRESS(global_tid); + if (ompt_enabled.ompt_callback_mutex_released) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, + OMPT_LOAD_RETURN_ADDRESS(0)); + } +#endif + + KMP_POP_PARTITIONED_TIMER(); + KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid)); +} + +/*! +@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid thread id. +@return one if the thread should execute the master block, zero otherwise + +Start execution of a combined barrier and master. The barrier is executed inside +this function. 
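+
+A minimal sketch of the expected call pattern (illustrative only; the loc and
+gtid setup is elided and do_master_only_work() is a placeholder, not a runtime
+function):
+@code
+// Every thread in the team calls __kmpc_barrier_master(); the single thread
+// that receives 1 performs the master-only work and then releases the team
+// by calling __kmpc_end_barrier_master().
+if (__kmpc_barrier_master(&loc, gtid)) {
+  do_master_only_work();
+  __kmpc_end_barrier_master(&loc, gtid);
+}
+@endcode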
+*/ +kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) { + int status; + KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + + if (__kmp_env_consistency_check) + __kmp_check_barrier(global_tid, ct_barrier, loc); + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif + status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + return (status != 0) ? 0 : 1; +} + +/*! +@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid thread id. + +Complete the execution of a combined barrier and master. This function should +only be called at the completion of the master code. Other threads will +still be waiting at the barrier and this call releases them. +*/ +void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) { + KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + __kmp_end_split_barrier(bs_plain_barrier, global_tid); +} + +/*! +@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid thread id. +@return one if the thread should execute the master block, zero otherwise + +Start execution of a combined barrier and master(nowait) construct. +The barrier is executed inside this function. +There is no equivalent "end" function, since the +*/ +kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) { + kmp_int32 ret; + KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + + if (__kmp_env_consistency_check) { + if (loc == 0) { + KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user? + } + __kmp_check_barrier(global_tid, ct_barrier, loc); + } + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif + __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + ret = __kmpc_master(loc, global_tid); + + if (__kmp_env_consistency_check) { + /* there's no __kmpc_end_master called; so the (stats) */ + /* actions of __kmpc_end_master are done here */ + if (ret) { + /* only one thread should do the pop since only */ + /* one did the push (see __kmpc_master()) */ + __kmp_pop_sync(global_tid, ct_master, loc); + } + } + + return (ret); +} + +/* The BARRIER for a SINGLE process section is always explicit */ +/*! 
+@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number +@return One if this thread should execute the single construct, zero otherwise. + +Test whether to execute a single construct. +There are no implicit barriers in the two "single" calls, rather the compiler +should introduce an explicit barrier if it is required. +*/ + +kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { + __kmp_assert_valid_gtid(global_tid); + kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE); + + if (rc) { + // We are going to execute the single statement, so we should count it. + KMP_COUNT_BLOCK(OMP_SINGLE); + KMP_PUSH_PARTITIONED_TIMER(OMP_single); + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + int tid = __kmp_tid_from_gtid(global_tid); + + if (ompt_enabled.enabled) { + if (rc) { + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_single_executor, ompt_scope_begin, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + 1, OMPT_GET_RETURN_ADDRESS(0)); + } + } else { + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_single_other, ompt_scope_begin, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + 1, OMPT_GET_RETURN_ADDRESS(0)); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_single_other, ompt_scope_end, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + 1, OMPT_GET_RETURN_ADDRESS(0)); + } + } + } +#endif + + return rc; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number + +Mark the end of a single construct. This function should +only be called by the thread that executed the block of code protected +by the `single` construct. +*/ +void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { + __kmp_assert_valid_gtid(global_tid); + __kmp_exit_single(global_tid); + KMP_POP_PARTITIONED_TIMER(); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + kmp_info_t *this_thr = __kmp_threads[global_tid]; + kmp_team_t *team = this_thr->th.th_team; + int tid = __kmp_tid_from_gtid(global_tid); + + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_single_executor, ompt_scope_end, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif +} + +/*! +@ingroup WORK_SHARING +@param loc Source location +@param global_tid Global thread id + +Mark the end of a statically scheduled loop. 
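+
+For illustration (not from the upstream sources), this call is emitted by the
+compiler as the epilogue of a statically scheduled worksharing loop, paired
+with the matching __kmpc_for_static_init_* call, as in:
+@code
+void scale(int n, const double *src, double *dst) {
+  // Around the chunked loop below the compiler emits
+  // __kmpc_for_static_init_*() before the first iteration and
+  // __kmpc_for_static_fini() after the last one.
+  #pragma omp parallel for schedule(static)
+  for (int i = 0; i < n; ++i)
+    dst[i] = 2.0 * src[i];
+}
+@endcode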
+*/ +void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) { + KMP_POP_PARTITIONED_TIMER(); + KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_work_t ompt_work_type = ompt_work_loop; + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + // Determine workshare type + if (loc != NULL) { + if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { + ompt_work_type = ompt_work_loop; + } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { + ompt_work_type = ompt_work_sections; + } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { + ompt_work_type = ompt_work_distribute; + } else { + // use default set above. + // a warning about this case is provided in __kmpc_for_static_init + } + KMP_DEBUG_ASSERT(ompt_work_type); + } + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_type, ompt_scope_end, &(team_info->parallel_data), + &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + if (__kmp_env_consistency_check) + __kmp_pop_workshare(global_tid, ct_pdo, loc); +} + +// User routines which take C-style arguments (call by value) +// different from the Fortran equivalent routines + +void ompc_set_num_threads(int arg) { + // !!!!! TODO: check the per-task binding + __kmp_set_num_threads(arg, __kmp_entry_gtid()); +} + +void ompc_set_dynamic(int flag) { + kmp_info_t *thread; + + /* For the thread-private implementation of the internal controls */ + thread = __kmp_entry_thread(); + + __kmp_save_internal_controls(thread); + + set__dynamic(thread, flag ? true : false); +} + +void ompc_set_nested(int flag) { + kmp_info_t *thread; + + /* For the thread-private internal controls implementation */ + thread = __kmp_entry_thread(); + + __kmp_save_internal_controls(thread); + + set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1); +} + +void ompc_set_max_active_levels(int max_active_levels) { + /* TO DO */ + /* we want per-task implementation of this internal control */ + + /* For the per-thread internal controls implementation */ + __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels); +} + +void ompc_set_schedule(omp_sched_t kind, int modifier) { + // !!!!! 
TODO: check the per-task binding + __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier); +} + +int ompc_get_ancestor_thread_num(int level) { + return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level); +} + +int ompc_get_team_size(int level) { + return __kmp_get_team_size(__kmp_entry_gtid(), level); +} + +/* OpenMP 5.0 Affinity Format API */ +void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) { + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, + format, KMP_STRLEN(format) + 1); +} + +size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) { + size_t format_size; + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + format_size = KMP_STRLEN(__kmp_affinity_format); + if (buffer && size) { + __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format, + format_size + 1); + } + return format_size; +} + +void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) { + int gtid; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && + __kmp_affinity.flags.reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif + __kmp_aux_display_affinity(gtid, format); +} + +size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size, + char const *format) { + int gtid; + size_t num_required; + kmp_str_buf_t capture_buf; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && + __kmp_affinity.flags.reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif + __kmp_str_buf_init(&capture_buf); + num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf); + if (buffer && buf_size) { + __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str, + capture_buf.used + 1); + } + __kmp_str_buf_free(&capture_buf); + return num_required; +} + +void kmpc_set_stacksize(int arg) { + // __kmp_aux_set_stacksize initializes the library if needed + __kmp_aux_set_stacksize(arg); +} + +void kmpc_set_stacksize_s(size_t arg) { + // __kmp_aux_set_stacksize initializes the library if needed + __kmp_aux_set_stacksize(arg); +} + +void kmpc_set_blocktime(int arg) { + int gtid, tid, bt = arg; + kmp_info_t *thread; + + gtid = __kmp_entry_gtid(); + tid = __kmp_tid_from_gtid(gtid); + thread = __kmp_thread_from_gtid(gtid); + + __kmp_aux_convert_blocktime(&bt); + __kmp_aux_set_blocktime(bt, thread, tid); +} + +void kmpc_set_library(int arg) { + // __kmp_user_set_library initializes the library if needed + __kmp_user_set_library((enum library_type)arg); +} + +void kmpc_set_defaults(char const *str) { + // __kmp_aux_set_defaults initializes the library if needed + __kmp_aux_set_defaults(str, KMP_STRLEN(str)); +} + +void kmpc_set_disp_num_buffers(int arg) { + // ignore after initialization because some teams have already + // allocated dispatch buffers + if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF && + arg <= KMP_MAX_DISP_NUM_BUFF) { + __kmp_dispatch_num_buffers = arg; + } +} + +int kmpc_set_affinity_mask_proc(int proc, void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return 
__kmp_aux_set_affinity_mask_proc(proc, mask); +#endif +} + +int kmpc_unset_affinity_mask_proc(int proc, void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return __kmp_aux_unset_affinity_mask_proc(proc, mask); +#endif +} + +int kmpc_get_affinity_mask_proc(int proc, void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return __kmp_aux_get_affinity_mask_proc(proc, mask); +#endif +} + +/* -------------------------------------------------------------------------- */ +/*! +@ingroup THREADPRIVATE +@param loc source location information +@param gtid global thread number +@param cpy_size size of the cpy_data buffer +@param cpy_data pointer to data to be copied +@param cpy_func helper function to call for copying data +@param didit flag variable: 1=single thread; 0=not single thread + +__kmpc_copyprivate implements the interface for the private data broadcast +needed for the copyprivate clause associated with a single region in an +OpenMP* program (both C and Fortran). +All threads participating in the parallel region call this routine. +One of the threads (called the single thread) should have the didit +variable set to 1 and all other threads should have that variable set to 0. +All threads pass a pointer to a data buffer (cpy_data) that they have built. + +The OpenMP specification forbids the use of nowait on the single region when a +copyprivate clause is present. However, @ref __kmpc_copyprivate implements a +barrier internally to avoid race conditions, so the code generation for the +single region should avoid generating a barrier after the call to @ref +__kmpc_copyprivate. + +The gtid parameter is the global thread id for the current thread. +The loc parameter is a pointer to source location information. + +Internal implementation: The single thread will first copy its descriptor +address (cpy_data) to a team-private location, then the other threads will each +call the function pointed to by the parameter cpy_func, which carries out the +copy by copying the data using the cpy_data buffer. + +The cpy_func routine used for the copy and the contents of the data area defined +by cpy_data and cpy_size may be built in any fashion that will allow the copy +to be done. For instance, the cpy_data buffer can hold the actual data to be +copied or it may hold a list of pointers to the data. The cpy_func routine must +interpret the cpy_data buffer appropriately. + +The interface to cpy_func is as follows: +@code +void cpy_func( void *destination, void *source ) +@endcode +where void *destination is the cpy_data pointer for the thread being copied to +and void *source is the cpy_data pointer for the thread being copied from. 
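+
+For illustration only (not part of the upstream sources), the user-level
+construct this routine implements is a `single` region with a `copyprivate`
+clause:
+@code
+#include <stdio.h>
+int main(void) {
+  int token;
+  #pragma omp parallel private(token)
+  {
+    // One thread assigns token; copyprivate broadcasts it to the others.
+    // The compiler performs the broadcast with __kmpc_copyprivate(), passing
+    // a small helper (the cpy_func described above) that copies 'token'.
+    #pragma omp single copyprivate(token)
+    token = 42;
+    printf("thread sees %d\n", token);
+  }
+  return 0;
+}
+@endcode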
+*/ +void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, + void *cpy_data, void (*cpy_func)(void *, void *), + kmp_int32 didit) { + void **data_ptr; + KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid)); + __kmp_assert_valid_gtid(gtid); + + KMP_MB(); + + data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; + + if (__kmp_env_consistency_check) { + if (loc == 0) { + KMP_WARNING(ConstructIdentInvalid); + } + } + + // ToDo: Optimize the following two barriers into some kind of split barrier + + if (didit) + *data_ptr = cpy_data; + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif +/* This barrier is not a barrier region boundary */ +#if USE_ITT_NOTIFY + __kmp_threads[gtid]->th.th_ident = loc; +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + + if (!didit) + (*cpy_func)(cpy_data, *data_ptr); + + // Consider next barrier a user-visible barrier for barrier region boundaries + // Nesting checks are already handled by the single construct checks + { +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif +#if USE_ITT_NOTIFY + __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. +// tasks can overwrite the location) +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + } +} + +/* --------------------------------------------------------------------------*/ +/*! +@ingroup THREADPRIVATE +@param loc source location information +@param gtid global thread number +@param cpy_data pointer to the data to be saved/copied or 0 +@return the saved pointer to the data + +__kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate: +__kmpc_copyprivate_light only saves the pointer it's given (if it's not 0, so +coming from single), and returns that pointer in all calls (for single thread +it's not needed). This version doesn't do any actual data copying. Data copying +has to be done somewhere else, e.g. inline in the generated code. Due to this, +this function doesn't have any barrier at the end of the function, like +__kmpc_copyprivate does, so generated code needs barrier after copying of all +data was done. 
+*/ +void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) { + void **data_ptr; + + KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid)); + + KMP_MB(); + + data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data; + + if (__kmp_env_consistency_check) { + if (loc == 0) { + KMP_WARNING(ConstructIdentInvalid); + } + } + + // ToDo: Optimize the following barrier + + if (cpy_data) + *data_ptr = cpy_data; + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + OMPT_STORE_RETURN_ADDRESS(gtid); + } +#endif +/* This barrier is not a barrier region boundary */ +#if USE_ITT_NOTIFY + __kmp_threads[gtid]->th.th_ident = loc; +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + + return *data_ptr; +} + +/* -------------------------------------------------------------------------- */ + +#define INIT_LOCK __kmp_init_user_lock_with_checks +#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks +#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks +#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed +#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks +#define ACQUIRE_NESTED_LOCK_TIMED \ + __kmp_acquire_nested_user_lock_with_checks_timed +#define RELEASE_LOCK __kmp_release_user_lock_with_checks +#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks +#define TEST_LOCK __kmp_test_user_lock_with_checks +#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks +#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks +#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks + +// TODO: Make check abort messages use location info & pass it into +// with_checks routines + +#if KMP_USE_DYNAMIC_LOCK + +// internal lock initializer +static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, + kmp_dyna_lockseq_t seq) { + if (KMP_IS_D_LOCK(seq)) { + KMP_INIT_D_LOCK(lock, seq); +#if USE_ITT_BUILD + __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); +#endif + } else { + KMP_INIT_I_LOCK(lock, seq); +#if USE_ITT_BUILD + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __kmp_itt_lock_creating(ilk->lock, loc); +#endif + } +} + +// internal nest lock initializer +static __forceinline void +__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, + kmp_dyna_lockseq_t seq) { +#if KMP_USE_TSX + // Don't have nested lock implementation for speculative locks + if (seq == lockseq_hle || seq == lockseq_rtm_queuing || + seq == lockseq_rtm_spin || seq == lockseq_adaptive) + seq = __kmp_user_lock_seq; +#endif + switch (seq) { + case lockseq_tas: + seq = lockseq_nested_tas; + break; +#if KMP_USE_FUTEX + case lockseq_futex: + seq = lockseq_nested_futex; + break; +#endif + case lockseq_ticket: + seq = lockseq_nested_ticket; + break; + case lockseq_queuing: + seq = lockseq_nested_queuing; + break; + case lockseq_drdpa: + seq = lockseq_nested_drdpa; + break; + default: + seq = lockseq_nested_queuing; + } + KMP_INIT_I_LOCK(lock, seq); +#if USE_ITT_BUILD + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __kmp_itt_lock_creating(ilk->lock, loc); +#endif +} + +/* initialize the lock with a hint */ +void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, + uintptr_t hint) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + if (__kmp_env_consistency_check && 
user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); + } + + __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_init) { + ompt_callbacks.ompt_callback(ompt_callback_lock_init)( + ompt_mutex_lock, (omp_lock_hint_t)hint, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif +} + +/* initialize the lock with a hint */ +void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, + void **user_lock, uintptr_t hint) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + if (__kmp_env_consistency_check && user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); + } + + __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_init) { + ompt_callbacks.ompt_callback(ompt_callback_lock_init)( + ompt_mutex_nest_lock, (omp_lock_hint_t)hint, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif +} + +#endif // KMP_USE_DYNAMIC_LOCK + +/* initialize the lock */ +void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + + KMP_DEBUG_ASSERT(__kmp_init_serial); + if (__kmp_env_consistency_check && user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, "omp_init_lock"); + } + __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_init) { + ompt_callbacks.ompt_callback(ompt_callback_lock_init)( + ompt_mutex_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#else // KMP_USE_DYNAMIC_LOCK + + static char const *const func = "omp_init_lock"; + kmp_user_lock_p lck; + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (__kmp_env_consistency_check) { + if (user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, func); + } + } + + KMP_CHECK_USER_LOCK_INIT(); + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_user_lock_allocate(user_lock, gtid, 0); + } + INIT_LOCK(lck); + __kmp_set_user_lock_location(lck, loc); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_init) { + ompt_callbacks.ompt_callback(ompt_callback_lock_init)( + ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#if USE_ITT_BUILD + __kmp_itt_lock_creating(lck); +#endif /* USE_ITT_BUILD 
*/ + +#endif // KMP_USE_DYNAMIC_LOCK +} // __kmpc_init_lock + +/* initialize the lock */ +void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + + KMP_DEBUG_ASSERT(__kmp_init_serial); + if (__kmp_env_consistency_check && user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); + } + __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_init) { + ompt_callbacks.ompt_callback(ompt_callback_lock_init)( + ompt_mutex_nest_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#else // KMP_USE_DYNAMIC_LOCK + + static char const *const func = "omp_init_nest_lock"; + kmp_user_lock_p lck; + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (__kmp_env_consistency_check) { + if (user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, func); + } + } + + KMP_CHECK_USER_LOCK_INIT(); + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_user_lock_allocate(user_lock, gtid, 0); + } + + INIT_NESTED_LOCK(lck); + __kmp_set_user_lock_location(lck, loc); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_init) { + ompt_callbacks.ompt_callback(ompt_callback_lock_init)( + ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#if USE_ITT_BUILD + __kmp_itt_lock_creating(lck); +#endif /* USE_ITT_BUILD */ + +#endif // KMP_USE_DYNAMIC_LOCK +} // __kmpc_init_nest_lock + +void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + +#if USE_ITT_BUILD + kmp_user_lock_p lck; + if (KMP_EXTRACT_D_TAG(user_lock) == 0) { + lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; + } else { + lck = (kmp_user_lock_p)user_lock; + } + __kmp_itt_lock_destroyed(lck); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_destroy) { + ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( + ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); +#else + kmp_user_lock_p lck; + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + 
// This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_destroy) { + ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( + ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#if USE_ITT_BUILD + __kmp_itt_lock_destroyed(lck); +#endif /* USE_ITT_BUILD */ + DESTROY_LOCK(lck); + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { + ; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { + ; + } +#endif + else { + __kmp_user_lock_free(user_lock, gtid, lck); + } +#endif // KMP_USE_DYNAMIC_LOCK +} // __kmpc_destroy_lock + +/* destroy the lock */ +void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + +#if USE_ITT_BUILD + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); + __kmp_itt_lock_destroyed(ilk->lock); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_destroy) { + ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( + ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); + +#else // KMP_USE_DYNAMIC_LOCK + + kmp_user_lock_p lck; + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_lock_destroy) { + ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( + ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#if USE_ITT_BUILD + __kmp_itt_lock_destroyed(lck); +#endif /* USE_ITT_BUILD */ + + DESTROY_NESTED_LOCK(lck); + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + ; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + ; + } +#endif + else { + __kmp_user_lock_free(user_lock, gtid, lck); + } +#endif // KMP_USE_DYNAMIC_LOCK +} // __kmpc_destroy_nest_lock + +void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { + KMP_COUNT_BLOCK(OMP_set_lock); +#if KMP_USE_DYNAMIC_LOCK + int tag = KMP_EXTRACT_D_TAG(user_lock); +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring( + (kmp_user_lock_p) + user_lock); // itt function will get to the right lock object. 
+#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif +#if KMP_USE_INLINED_TAS + if (tag == locktag_tas && !__kmp_env_consistency_check) { + KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); + } else +#elif KMP_USE_INLINED_FUTEX + if (tag == locktag_futex && !__kmp_env_consistency_check) { + KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); + } else +#endif + { + __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); + } +#if USE_ITT_BUILD + __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_mutex_acquired) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#else // KMP_USE_DYNAMIC_LOCK + + kmp_user_lock_p lck; + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); + } + +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring(lck); +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), + (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } +#endif + + ACQUIRE_LOCK(lck, gtid); + +#if USE_ITT_BUILD + __kmp_itt_lock_acquired(lck); +#endif /* USE_ITT_BUILD */ + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_mutex_acquired) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } +#endif + +#endif // KMP_USE_DYNAMIC_LOCK +} + +void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.enabled) { + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_nest_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } + } +#endif + int acquire_status = + KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); + (void)acquire_status; +#if USE_ITT_BUILD + __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); +#endif + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { + if (ompt_enabled.ompt_callback_mutex_acquired) { + // lock_first + 
ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, + codeptr); + } + } else { + if (ompt_enabled.ompt_callback_nest_lock) { + // lock_next + ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( + ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } + } + } +#endif + +#else // KMP_USE_DYNAMIC_LOCK + int acquire_status; + kmp_user_lock_p lck; + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); + } + +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring(lck); +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.enabled) { + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_nest_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, + codeptr); + } + } +#endif + + ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); + +#if USE_ITT_BUILD + __kmp_itt_lock_acquired(lck); +#endif /* USE_ITT_BUILD */ + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { + if (ompt_enabled.ompt_callback_mutex_acquired) { + // lock_first + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } else { + if (ompt_enabled.ompt_callback_nest_lock) { + // lock_next + ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( + ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } + } +#endif + +#endif // KMP_USE_DYNAMIC_LOCK +} + +void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + + int tag = KMP_EXTRACT_D_TAG(user_lock); +#if USE_ITT_BUILD + __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); +#endif +#if KMP_USE_INLINED_TAS + if (tag == locktag_tas && !__kmp_env_consistency_check) { + KMP_RELEASE_TAS_LOCK(user_lock, gtid); + } else +#elif KMP_USE_INLINED_FUTEX + if (tag == locktag_futex && !__kmp_env_consistency_check) { + KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); + } else +#endif + { + __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_released) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + +#else // KMP_USE_DYNAMIC_LOCK + + kmp_user_lock_p lck; + + /* Can't use serial interval since not block structured */ + /* release the lock */ + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { +#if KMP_OS_LINUX && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) +// "fast" path implemented to fix customer performance issue +#if 
USE_ITT_BUILD + __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); +#endif /* USE_ITT_BUILD */ + TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); + KMP_MB(); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_released) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } +#endif + + return; +#else + lck = (kmp_user_lock_p)user_lock; +#endif + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); + } + +#if USE_ITT_BUILD + __kmp_itt_lock_releasing(lck); +#endif /* USE_ITT_BUILD */ + + RELEASE_LOCK(lck, gtid); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_released) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } +#endif + +#endif // KMP_USE_DYNAMIC_LOCK +} + +/* release the lock */ +void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + +#if USE_ITT_BUILD + __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); +#endif + int release_status = + KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); + (void)release_status; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.enabled) { + if (release_status == KMP_LOCK_RELEASED) { + if (ompt_enabled.ompt_callback_mutex_released) { + // release_lock_last + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, + codeptr); + } + } else if (ompt_enabled.ompt_callback_nest_lock) { + // release_lock_prev + ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( + ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } + } +#endif + +#else // KMP_USE_DYNAMIC_LOCK + + kmp_user_lock_p lck; + + /* Can't use serial interval since not block structured */ + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { +#if KMP_OS_LINUX && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) + // "fast" path implemented to fix customer performance issue + kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; +#if USE_ITT_BUILD + __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); +#endif /* USE_ITT_BUILD */ + +#if OMPT_SUPPORT && OMPT_OPTIONAL + int release_status = KMP_LOCK_STILL_HELD; +#endif + + if (--(tl->lk.depth_locked) == 0) { + TCW_4(tl->lk.poll, 0); +#if OMPT_SUPPORT && OMPT_OPTIONAL + release_status = KMP_LOCK_RELEASED; +#endif + } + KMP_MB(); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.enabled) { + if (release_status == KMP_LOCK_RELEASED) 
{ + if (ompt_enabled.ompt_callback_mutex_released) { + // release_lock_last + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } else if (ompt_enabled.ompt_callback_nest_lock) { + // release_lock_previous + ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( + ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } +#endif + + return; +#else + lck = (kmp_user_lock_p)user_lock; +#endif + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); + } + +#if USE_ITT_BUILD + __kmp_itt_lock_releasing(lck); +#endif /* USE_ITT_BUILD */ + + int release_status; + release_status = RELEASE_NESTED_LOCK(lck, gtid); +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.enabled) { + if (release_status == KMP_LOCK_RELEASED) { + if (ompt_enabled.ompt_callback_mutex_released) { + // release_lock_last + ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( + ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } else if (ompt_enabled.ompt_callback_nest_lock) { + // release_lock_previous + ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( + ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } +#endif + +#endif // KMP_USE_DYNAMIC_LOCK +} + +/* try to acquire the lock */ +int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { + KMP_COUNT_BLOCK(OMP_test_lock); + +#if KMP_USE_DYNAMIC_LOCK + int rc; + int tag = KMP_EXTRACT_D_TAG(user_lock); +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_test_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif +#if KMP_USE_INLINED_TAS + if (tag == locktag_tas && !__kmp_env_consistency_check) { + KMP_TEST_TAS_LOCK(user_lock, gtid, rc); + } else +#elif KMP_USE_INLINED_FUTEX + if (tag == locktag_futex && !__kmp_env_consistency_check) { + KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); + } else +#endif + { + rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); + } + if (rc) { +#if USE_ITT_BUILD + __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_mutex_acquired) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + return FTN_TRUE; + } else { +#if USE_ITT_BUILD + __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); +#endif + return FTN_FALSE; + } + +#else // KMP_USE_DYNAMIC_LOCK + + kmp_user_lock_p lck; + int rc; + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + 
(sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); + } + +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring(lck); +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_test_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), + (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } +#endif + + rc = TEST_LOCK(lck, gtid); +#if USE_ITT_BUILD + if (rc) { + __kmp_itt_lock_acquired(lck); + } else { + __kmp_itt_lock_cancelled(lck); + } +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (rc && ompt_enabled.ompt_callback_mutex_acquired) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } +#endif + + return (rc ? FTN_TRUE : FTN_FALSE); + + /* Can't use serial interval since not block structured */ + +#endif // KMP_USE_DYNAMIC_LOCK +} + +/* try to acquire the lock */ +int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { +#if KMP_USE_DYNAMIC_LOCK + int rc; +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if (ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_test_nest_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(user_lock), + (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } +#endif + rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); +#if USE_ITT_BUILD + if (rc) { + __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); + } else { + __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); + } +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && rc) { + if (rc == 1) { + if (ompt_enabled.ompt_callback_mutex_acquired) { + // lock_first + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, + codeptr); + } + } else { + if (ompt_enabled.ompt_callback_nest_lock) { + // lock_next + ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( + ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); + } + } + } +#endif + return rc; + +#else // KMP_USE_DYNAMIC_LOCK + + kmp_user_lock_p lck; + int rc; + + if ((__kmp_user_lock_kind == lk_tas) && + (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#if KMP_USE_FUTEX + else if ((__kmp_user_lock_kind == lk_futex) && + (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= + OMP_NEST_LOCK_T_SIZE)) { + lck = (kmp_user_lock_p)user_lock; + } +#endif + else { + lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); + } + +#if USE_ITT_BUILD + __kmp_itt_lock_acquiring(lck); +#endif /* USE_ITT_BUILD */ + +#if OMPT_SUPPORT && OMPT_OPTIONAL + // This is the case, if called from omp_init_lock_with_hint: + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + if 
(ompt_enabled.enabled) && + ompt_enabled.ompt_callback_mutex_acquire) { + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( + ompt_mutex_test_nest_lock, omp_lock_hint_none, + __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, + codeptr); + } +#endif + + rc = TEST_NESTED_LOCK(lck, gtid); +#if USE_ITT_BUILD + if (rc) { + __kmp_itt_lock_acquired(lck); + } else { + __kmp_itt_lock_cancelled(lck); + } +#endif /* USE_ITT_BUILD */ +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && rc) { + if (rc == 1) { + if (ompt_enabled.ompt_callback_mutex_acquired) { + // lock_first + ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( + ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } else { + if (ompt_enabled.ompt_callback_nest_lock) { + // lock_next + ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( + ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr); + } + } + } +#endif + return rc; + + /* Can't use serial interval since not block structured */ + +#endif // KMP_USE_DYNAMIC_LOCK +} + +// Interface to fast scalable reduce methods routines + +// keep the selected method in a thread local structure for cross-function +// usage: will be used in __kmpc_end_reduce* functions; +// another solution: to re-determine the method one more time in +// __kmpc_end_reduce* functions (new prototype required then) +// AT: which solution is better? +#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \ + ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod)) + +#define __KMP_GET_REDUCTION_METHOD(gtid) \ + (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) + +// description of the packed_reduction_method variable: look at the macros in +// kmp.h + +// used in a critical section reduce block +static __forceinline void +__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *crit) { + + // this lock was visible to a customer and to the threading profile tool as a + // serial overhead span (although it's used for an internal purpose only) + // why was it visible in previous implementation? + // should we keep it visible in new reduce block? + kmp_user_lock_p lck; + +#if KMP_USE_DYNAMIC_LOCK + + kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit; + // Check if it is initialized. + if (*lk == 0) { + if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { + KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, + KMP_GET_D_TAG(__kmp_user_lock_seq)); + } else { + __kmp_init_indirect_csptr(crit, loc, global_tid, + KMP_GET_I_TAG(__kmp_user_lock_seq)); + } + } + // Branch for accessing the actual lock object and set operation. This + // branching is inevitable since this lock initialization does not follow the + // normal dispatch path (lock table is not used). + if (KMP_EXTRACT_D_TAG(lk) != 0) { + lck = (kmp_user_lock_p)lk; + KMP_DEBUG_ASSERT(lck != NULL); + if (__kmp_env_consistency_check) { + __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); + } + KMP_D_LOCK_FUNC(lk, set)(lk, global_tid); + } else { + kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk); + lck = ilk->lock; + KMP_DEBUG_ASSERT(lck != NULL); + if (__kmp_env_consistency_check) { + __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq); + } + KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid); + } + +#else // KMP_USE_DYNAMIC_LOCK + + // We know that the fast reduction code is only emitted by Intel compilers + // with 32 byte critical sections. 
If there isn't enough space, then we + // have to use a pointer. + if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) { + lck = (kmp_user_lock_p)crit; + } else { + lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); + } + KMP_DEBUG_ASSERT(lck != NULL); + + if (__kmp_env_consistency_check) + __kmp_push_sync(global_tid, ct_critical, loc, lck); + + __kmp_acquire_user_lock_with_checks(lck, global_tid); + +#endif // KMP_USE_DYNAMIC_LOCK +} + +// used in a critical section reduce block +static __forceinline void +__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *crit) { + + kmp_user_lock_p lck; + +#if KMP_USE_DYNAMIC_LOCK + + if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) { + lck = (kmp_user_lock_p)crit; + if (__kmp_env_consistency_check) + __kmp_pop_sync(global_tid, ct_critical, loc); + KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid); + } else { + kmp_indirect_lock_t *ilk = + (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit)); + if (__kmp_env_consistency_check) + __kmp_pop_sync(global_tid, ct_critical, loc); + KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid); + } + +#else // KMP_USE_DYNAMIC_LOCK + + // We know that the fast reduction code is only emitted by Intel compilers + // with 32 byte critical sections. If there isn't enough space, then we have + // to use a pointer. + if (__kmp_base_user_lock_size > 32) { + lck = *((kmp_user_lock_p *)crit); + KMP_ASSERT(lck != NULL); + } else { + lck = (kmp_user_lock_p)crit; + } + + if (__kmp_env_consistency_check) + __kmp_pop_sync(global_tid, ct_critical, loc); + + __kmp_release_user_lock_with_checks(lck, global_tid); + +#endif // KMP_USE_DYNAMIC_LOCK +} // __kmp_end_critical_section_reduce_block + +static __forceinline int +__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p, + int *task_state) { + kmp_team_t *team; + + // Check if we are inside the teams construct? + if (th->th.th_teams_microtask) { + *team_p = team = th->th.th_team; + if (team->t.t_level == th->th.th_teams_level) { + // This is reduction at teams construct. + KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 + // Let's swap teams temporarily for the reduction. + th->th.th_info.ds.ds_tid = team->t.t_master_tid; + th->th.th_team = team->t.t_parent; + th->th.th_team_nproc = th->th.th_team->t.t_nproc; + th->th.th_task_team = th->th.th_team->t.t_task_team[0]; + *task_state = th->th.th_task_state; + th->th.th_task_state = 0; + + return 1; + } + } + return 0; +} + +static __forceinline void +__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) { + // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction. + th->th.th_info.ds.ds_tid = 0; + th->th.th_team = team; + th->th.th_team_nproc = team->t.t_nproc; + th->th.th_task_team = team->t.t_task_team[task_state]; + __kmp_type_convert(task_state, &(th->th.th_task_state)); +} + +/* 2.a.i. Reduce Block without a terminating barrier */ +/*! 
+@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid global thread number +@param num_vars number of items (variables) to be reduced +@param reduce_size size of data in bytes to be reduced +@param reduce_data pointer to data to be reduced +@param reduce_func callback function providing reduction operation on two +operands and returning result of reduction in lhs_data +@param lck pointer to the unique lock data structure +@result 1 for the primary thread, 0 for all other team threads, 2 for all team +threads if atomic reduction needed + +The nowait version is used for a reduce clause with the nowait argument. +*/ +kmp_int32 +__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, + size_t reduce_size, void *reduce_data, + void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck) { + + KMP_COUNT_BLOCK(REDUCE_nowait); + int retval = 0; + PACKED_REDUCTION_METHOD_T packed_reduction_method; + kmp_info_t *th; + kmp_team_t *team; + int teams_swapped = 0, task_state; + KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + // why do we need this initialization here at all? + // Reduction clause can not be used as a stand-alone directive. + + // do not call __kmp_serial_initialize(), it will be called by + // __kmp_parallel_initialize() if needed + // possible detection of false-positive race by the threadchecker ??? + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + +// check correctness of reduce block nesting +#if KMP_USE_DYNAMIC_LOCK + if (__kmp_env_consistency_check) + __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); +#else + if (__kmp_env_consistency_check) + __kmp_push_sync(global_tid, ct_reduce, loc, NULL); +#endif + + th = __kmp_thread_from_gtid(global_tid); + teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); + + // packed_reduction_method value will be reused by __kmp_end_reduce* function, + // the value should be kept in a variable + // the variable should be either a construct-specific or thread-specific + // property, not a team specific property + // (a thread can reach the next reduce block on the next construct, reduce + // method may differ on the next construct) + // an ident_t "loc" parameter could be used as a construct-specific property + // (what if loc == 0?) 
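+  // (annotation, not upstream code) For reference, the contract documented in
+  // the doxygen block above means a compiler is expected to emit roughly the
+  // following for a `reduction(+:sum) nowait` clause; the names my_add_func,
+  // crit, local_sum and shared_sum are hypothetical:
+  //
+  //   static void my_add_func(void *lhs, void *rhs) {
+  //     *(double *)lhs += *(double *)rhs;      // combine two partial sums
+  //   }
+  //   static kmp_critical_name crit;           // zero-initialized lock word
+  //   ...
+  //   double local_sum = ...;                  // this thread's partial result
+  //   switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(double), &local_sum,
+  //                                my_add_func, &crit)) {
+  //   case 1:                                  // combine directly, then end
+  //     shared_sum += local_sum;
+  //     __kmpc_end_reduce_nowait(loc, gtid, &crit);
+  //     break;
+  //   case 2:                                  // combine via an atomic add;
+  //     break;                                 //  no end call in the nowait case
+  //   default:                                 // 0: contribution already merged
+  //     break;                                 //  by the tree-reduce barrier
+  //   }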
+ // (if both construct-specific and team-specific variables were shared, + // then unness extra syncs should be needed) + // a thread-specific variable is better regarding two issues above (next + // construct and extra syncs) + // a thread-specific "th_local.reduction_method" variable is used currently + // each thread executes 'determine' and 'set' lines (no need to execute by one + // thread, to avoid unness extra syncs) + + packed_reduction_method = __kmp_determine_reduction_method( + loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); + __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + + OMPT_REDUCTION_DECL(th, global_tid); + if (packed_reduction_method == critical_reduce_block) { + + OMPT_REDUCTION_BEGIN; + + __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); + retval = 1; + + } else if (packed_reduction_method == empty_reduce_block) { + + OMPT_REDUCTION_BEGIN; + + // usage: if team size == 1, no synchronization is required ( Intel + // platforms only ) + retval = 1; + + } else if (packed_reduction_method == atomic_reduce_block) { + + retval = 2; + + // all threads should do this pop here (because __kmpc_end_reduce_nowait() + // won't be called by the code gen) + // (it's not quite good, because the checking block has been closed by + // this 'pop', + // but atomic operation has not been executed yet, will be executed + // slightly later, literally on next instruction) + if (__kmp_env_consistency_check) + __kmp_pop_sync(global_tid, ct_reduce, loc); + + } else if (TEST_REDUCTION_METHOD(packed_reduction_method, + tree_reduce_block)) { + +// AT: performance issue: a real barrier here +// AT: (if primary thread is slow, other threads are blocked here waiting for +// the primary thread to come and release them) +// AT: (it's not what a customer might expect specifying NOWAIT clause) +// AT: (specifying NOWAIT won't result in improvement of performance, it'll +// be confusing to a customer) +// AT: another implementation of *barrier_gather*nowait() (or some other design) +// might go faster and be more in line with sense of NOWAIT +// AT: TO DO: do epcc test and compare times + +// this barrier should be invisible to a customer and to the threading profile +// tool (it's neither a terminating barrier nor customer's code, it's +// used for an internal purpose) +#if OMPT_SUPPORT + // JP: can this barrier potentially leed to task scheduling? + // JP: as long as there is a barrier in the implementation, OMPT should and + // will provide the barrier events + // so we set-up the necessary frame/return addresses. + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif + retval = + __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), + global_tid, FALSE, reduce_size, reduce_data, reduce_func); + retval = (retval != 0) ? 
(0) : (1); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + // all other workers except primary thread should do this pop here + // ( none of other workers will get to __kmpc_end_reduce_nowait() ) + if (__kmp_env_consistency_check) { + if (retval == 0) { + __kmp_pop_sync(global_tid, ct_reduce, loc); + } + } + + } else { + + // should never reach this block + KMP_ASSERT(0); // "unexpected method" + } + if (teams_swapped) { + __kmp_restore_swapped_teams(th, team, task_state); + } + KA_TRACE( + 10, + ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", + global_tid, packed_reduction_method, retval)); + + return retval; +} + +/*! +@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid global thread id. +@param lck pointer to the unique lock data structure + +Finish the execution of a reduce nowait. +*/ +void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *lck) { + + PACKED_REDUCTION_METHOD_T packed_reduction_method; + + KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); + + OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid); + + if (packed_reduction_method == critical_reduce_block) { + + __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + OMPT_REDUCTION_END; + + } else if (packed_reduction_method == empty_reduce_block) { + + // usage: if team size == 1, no synchronization is required ( on Intel + // platforms only ) + + OMPT_REDUCTION_END; + + } else if (packed_reduction_method == atomic_reduce_block) { + + // neither primary thread nor other workers should get here + // (code gen does not generate this call in case 2: atomic reduce block) + // actually it's better to remove this elseif at all; + // after removal this value will checked by the 'else' and will assert + + } else if (TEST_REDUCTION_METHOD(packed_reduction_method, + tree_reduce_block)) { + + // only primary thread gets here + // OMPT: tree reduction is annotated in the barrier code + + } else { + + // should never reach this block + KMP_ASSERT(0); // "unexpected method" + } + + if (__kmp_env_consistency_check) + __kmp_pop_sync(global_tid, ct_reduce, loc); + + KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", + global_tid, packed_reduction_method)); + + return; +} + +/* 2.a.ii. Reduce Block with a terminating barrier */ + +/*! +@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid global thread number +@param num_vars number of items (variables) to be reduced +@param reduce_size size of data in bytes to be reduced +@param reduce_data pointer to data to be reduced +@param reduce_func callback function providing reduction operation on two +operands and returning result of reduction in lhs_data +@param lck pointer to the unique lock data structure +@result 1 for the primary thread, 0 for all other team threads, 2 for all team +threads if atomic reduction needed + +A blocking reduce that includes an implicit barrier. 
+*/ +kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, + size_t reduce_size, void *reduce_data, + void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck) { + KMP_COUNT_BLOCK(REDUCE_wait); + int retval = 0; + PACKED_REDUCTION_METHOD_T packed_reduction_method; + kmp_info_t *th; + kmp_team_t *team; + int teams_swapped = 0, task_state; + + KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + // why do we need this initialization here at all? + // Reduction clause can not be a stand-alone directive. + + // do not call __kmp_serial_initialize(), it will be called by + // __kmp_parallel_initialize() if needed + // possible detection of false-positive race by the threadchecker ??? + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + +// check correctness of reduce block nesting +#if KMP_USE_DYNAMIC_LOCK + if (__kmp_env_consistency_check) + __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0); +#else + if (__kmp_env_consistency_check) + __kmp_push_sync(global_tid, ct_reduce, loc, NULL); +#endif + + th = __kmp_thread_from_gtid(global_tid); + teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); + + packed_reduction_method = __kmp_determine_reduction_method( + loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); + __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + + OMPT_REDUCTION_DECL(th, global_tid); + + if (packed_reduction_method == critical_reduce_block) { + + OMPT_REDUCTION_BEGIN; + __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); + retval = 1; + + } else if (packed_reduction_method == empty_reduce_block) { + + OMPT_REDUCTION_BEGIN; + // usage: if team size == 1, no synchronization is required ( Intel + // platforms only ) + retval = 1; + + } else if (packed_reduction_method == atomic_reduce_block) { + + retval = 2; + + } else if (TEST_REDUCTION_METHOD(packed_reduction_method, + tree_reduce_block)) { + +// case tree_reduce_block: +// this barrier should be visible to a customer and to the threading profile +// tool (it's a terminating barrier on constructs if NOWAIT not specified) +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = + loc; // needed for correct notification of frames +#endif + retval = + __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), + global_tid, TRUE, reduce_size, reduce_data, reduce_func); + retval = (retval != 0) ? 
(0) : (1); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + // all other workers except primary thread should do this pop here + // (none of other workers except primary will enter __kmpc_end_reduce()) + if (__kmp_env_consistency_check) { + if (retval == 0) { // 0: all other workers; 1: primary thread + __kmp_pop_sync(global_tid, ct_reduce, loc); + } + } + + } else { + + // should never reach this block + KMP_ASSERT(0); // "unexpected method" + } + if (teams_swapped) { + __kmp_restore_swapped_teams(th, team, task_state); + } + + KA_TRACE(10, + ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", + global_tid, packed_reduction_method, retval)); + return retval; +} + +/*! +@ingroup SYNCHRONIZATION +@param loc source location information +@param global_tid global thread id. +@param lck pointer to the unique lock data structure + +Finish the execution of a blocking reduce. +The lck pointer must be the same as that used in the corresponding +start function. +*/ +void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name *lck) { + + PACKED_REDUCTION_METHOD_T packed_reduction_method; + kmp_info_t *th; + kmp_team_t *team; + int teams_swapped = 0, task_state; + + KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid)); + __kmp_assert_valid_gtid(global_tid); + + th = __kmp_thread_from_gtid(global_tid); + teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state); + + packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); + + // this barrier should be visible to a customer and to the threading profile + // tool (it's a terminating barrier on constructs if NOWAIT not specified) + OMPT_REDUCTION_DECL(th, global_tid); + + if (packed_reduction_method == critical_reduce_block) { + __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + + OMPT_REDUCTION_END; + +// TODO: implicit barrier: should be exposed +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif + __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + } else if (packed_reduction_method == empty_reduce_block) { + + OMPT_REDUCTION_END; + +// usage: if team size==1, no synchronization is required (Intel platforms only) + +// TODO: implicit barrier: should be exposed +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif + __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + } else if (packed_reduction_method == atomic_reduce_block) { + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, 
NULL); + if (ompt_frame->enter_frame.ptr == NULL) + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(global_tid); +#endif +// TODO: implicit barrier: should be exposed +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif + __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + } else if (TEST_REDUCTION_METHOD(packed_reduction_method, + tree_reduce_block)) { + + // only primary thread executes here (primary releases all other workers) + __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method), + global_tid); + + } else { + + // should never reach this block + KMP_ASSERT(0); // "unexpected method" + } + if (teams_swapped) { + __kmp_restore_swapped_teams(th, team, task_state); + } + + if (__kmp_env_consistency_check) + __kmp_pop_sync(global_tid, ct_reduce, loc); + + KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n", + global_tid, packed_reduction_method)); + + return; +} + +#undef __KMP_GET_REDUCTION_METHOD +#undef __KMP_SET_REDUCTION_METHOD + +/* end of interface to fast scalable reduce routines */ + +kmp_uint64 __kmpc_get_taskid() { + + kmp_int32 gtid; + kmp_info_t *thread; + + gtid = __kmp_get_gtid(); + if (gtid < 0) { + return 0; + } + thread = __kmp_thread_from_gtid(gtid); + return thread->th.th_current_task->td_task_id; + +} // __kmpc_get_taskid + +kmp_uint64 __kmpc_get_parent_taskid() { + + kmp_int32 gtid; + kmp_info_t *thread; + kmp_taskdata_t *parent_task; + + gtid = __kmp_get_gtid(); + if (gtid < 0) { + return 0; + } + thread = __kmp_thread_from_gtid(gtid); + parent_task = thread->th.th_current_task->td_parent; + return (parent_task == NULL ? 0 : parent_task->td_task_id); + +} // __kmpc_get_parent_taskid + +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param gtid global thread number. +@param num_dims number of associated doacross loops. +@param dims info on loops bounds. + +Initialize doacross loop information. +Expect compiler send us inclusive bounds, +e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. 
+*/ +void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, + const struct kmp_dim *dims) { + __kmp_assert_valid_gtid(gtid); + int j, idx; + kmp_int64 last, trace_count; + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_uint32 *flags; + kmp_disp_t *pr_buf = th->th.th_dispatch; + dispatch_shared_info_t *sh_buf; + + KA_TRACE( + 20, + ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", + gtid, num_dims, !team->t.t_serialized)); + KMP_DEBUG_ASSERT(dims != NULL); + KMP_DEBUG_ASSERT(num_dims > 0); + + if (team->t.t_serialized) { + KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n")); + return; // no dependencies if team is serialized + } + KMP_DEBUG_ASSERT(team->t.t_nproc > 1); + idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for + // the next loop + sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; + + // Save bounds info into allocated private buffer + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); + pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc( + th, sizeof(kmp_int64) * (4 * num_dims + 1)); + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); + pr_buf->th_doacross_info[0] = + (kmp_int64)num_dims; // first element is number of dimensions + // Save also address of num_done in order to access it later without knowing + // the buffer index + pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; + pr_buf->th_doacross_info[2] = dims[0].lo; + pr_buf->th_doacross_info[3] = dims[0].up; + pr_buf->th_doacross_info[4] = dims[0].st; + last = 5; + for (j = 1; j < num_dims; ++j) { + kmp_int64 + range_length; // To keep ranges of all dimensions but the first dims[0] + if (dims[j].st == 1) { // most common case + // AC: should we care of ranges bigger than LLONG_MAX? (not for now) + range_length = dims[j].up - dims[j].lo + 1; + } else { + if (dims[j].st > 0) { + KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); + range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; + } else { // negative increment + KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); + range_length = + (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; + } + } + pr_buf->th_doacross_info[last++] = range_length; + pr_buf->th_doacross_info[last++] = dims[j].lo; + pr_buf->th_doacross_info[last++] = dims[j].up; + pr_buf->th_doacross_info[last++] = dims[j].st; + } + + // Compute total trip count. + // Start with range of dims[0] which we don't need to keep in the buffer. + if (dims[0].st == 1) { // most common case + trace_count = dims[0].up - dims[0].lo + 1; + } else if (dims[0].st > 0) { + KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); + trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; + } else { // negative increment + KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); + trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; + } + for (j = 1; j < num_dims; ++j) { + trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges + } + KMP_DEBUG_ASSERT(trace_count > 0); + + // Check if shared buffer is not occupied by other loop (idx - + // __kmp_dispatch_num_buffers) + if (idx != sh_buf->doacross_buf_idx) { + // Shared buffer is occupied, wait for it to be free + __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, + __kmp_eq_4, NULL); + } +#if KMP_32_BIT_ARCH + // Check if we are the first thread. After the CAS the first thread gets 0, + // others get 1 if initialization is in progress, allocated pointer otherwise. 
+ // Treat pointer as volatile integer (value 0 or 1) until memory is allocated. + flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32( + (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1); +#else + flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64( + (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL); +#endif + if (flags == NULL) { + // we are the first thread, allocate the array of flags + size_t size = + (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration + flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1); + KMP_MB(); + sh_buf->doacross_flags = flags; + } else if (flags == (kmp_uint32 *)1) { +#if KMP_32_BIT_ARCH + // initialization is still in progress, need to wait + while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1) +#else + while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL) +#endif + KMP_YIELD(TRUE); + KMP_MB(); + } else { + KMP_MB(); + } + KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value + pr_buf->th_doacross_flags = + sh_buf->doacross_flags; // save private copy in order to not + // touch shared buffer on each iteration + KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid)); +} + +void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) { + __kmp_assert_valid_gtid(gtid); + kmp_int64 shft; + size_t num_dims, i; + kmp_uint32 flag; + kmp_int64 iter_number; // iteration number of "collapsed" loop nest + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_disp_t *pr_buf; + kmp_int64 lo, up, st; + + KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); + if (team->t.t_serialized) { + KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n")); + return; // no dependencies if team is serialized + } + + // calculate sequential iteration number and check out-of-bounds condition + pr_buf = th->th.th_dispatch; + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); + num_dims = (size_t)pr_buf->th_doacross_info[0]; + lo = pr_buf->th_doacross_info[2]; + up = pr_buf->th_doacross_info[3]; + st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif + if (st == 1) { // most common case + if (vec[0] < lo || vec[0] > up) { + KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " + "bounds [%lld,%lld]\n", + gtid, vec[0], lo, up)); + return; + } + iter_number = vec[0] - lo; + } else if (st > 0) { + if (vec[0] < lo || vec[0] > up) { + KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " + "bounds [%lld,%lld]\n", + gtid, vec[0], lo, up)); + return; + } + iter_number = (kmp_uint64)(vec[0] - lo) / st; + } else { // negative increment + if (vec[0] > lo || vec[0] < up) { + KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " + "bounds [%lld,%lld]\n", + gtid, vec[0], lo, up)); + return; + } + iter_number = (kmp_uint64)(lo - vec[0]) / (-st); + } +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_sink; +#endif + for (i = 1; i < num_dims; ++i) { + kmp_int64 iter, ln; + size_t j = i * 4; + ln = pr_buf->th_doacross_info[j + 1]; + lo = pr_buf->th_doacross_info[j + 2]; + up = pr_buf->th_doacross_info[j + 3]; + st = pr_buf->th_doacross_info[j + 4]; + if (st == 1) { + if (vec[i] < lo || vec[i] > up) { + KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " + "bounds [%lld,%lld]\n", + gtid, vec[i], lo, up)); + return; + } + iter = vec[i] - lo; + } else if (st > 0) { + if 
(vec[i] < lo || vec[i] > up) { + KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " + "bounds [%lld,%lld]\n", + gtid, vec[i], lo, up)); + return; + } + iter = (kmp_uint64)(vec[i] - lo) / st; + } else { // st < 0 + if (vec[i] > lo || vec[i] < up) { + KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of " + "bounds [%lld,%lld]\n", + gtid, vec[i], lo, up)); + return; + } + iter = (kmp_uint64)(lo - vec[i]) / (-st); + } + iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_sink; +#endif + } + shft = iter_number % 32; // use 32-bit granularity + iter_number >>= 5; // divided by 32 + flag = 1 << shft; + while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) { + KMP_YIELD(TRUE); + } + KMP_MB(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims); + } +#endif + KA_TRACE(20, + ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", + gtid, (iter_number << 5) + shft)); +} + +void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) { + __kmp_assert_valid_gtid(gtid); + kmp_int64 shft; + size_t num_dims, i; + kmp_uint32 flag; + kmp_int64 iter_number; // iteration number of "collapsed" loop nest + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_disp_t *pr_buf; + kmp_int64 lo, st; + + KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid)); + if (team->t.t_serialized) { + KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n")); + return; // no dependencies if team is serialized + } + + // calculate sequential iteration number (same as in "wait" but no + // out-of-bounds checks) + pr_buf = th->th.th_dispatch; + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); + num_dims = (size_t)pr_buf->th_doacross_info[0]; + lo = pr_buf->th_doacross_info[2]; + st = pr_buf->th_doacross_info[4]; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_dependence_t deps[num_dims]; +#endif + if (st == 1) { // most common case + iter_number = vec[0] - lo; + } else if (st > 0) { + iter_number = (kmp_uint64)(vec[0] - lo) / st; + } else { // negative increment + iter_number = (kmp_uint64)(lo - vec[0]) / (-st); + } +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[0].variable.value = iter_number; + deps[0].dependence_type = ompt_dependence_type_source; +#endif + for (i = 1; i < num_dims; ++i) { + kmp_int64 iter, ln; + size_t j = i * 4; + ln = pr_buf->th_doacross_info[j + 1]; + lo = pr_buf->th_doacross_info[j + 2]; + st = pr_buf->th_doacross_info[j + 4]; + if (st == 1) { + iter = vec[i] - lo; + } else if (st > 0) { + iter = (kmp_uint64)(vec[i] - lo) / st; + } else { // st < 0 + iter = (kmp_uint64)(lo - vec[i]) / (-st); + } + iter_number = iter + ln * iter_number; +#if OMPT_SUPPORT && OMPT_OPTIONAL + deps[i].variable.value = iter; + deps[i].dependence_type = ompt_dependence_type_source; +#endif + } +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dependences) { + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims); + } +#endif + shft = iter_number % 32; // use 32-bit granularity + iter_number >>= 5; // divided by 32 + flag = 1 << shft; + KMP_MB(); + if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) + KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag); + 
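+  // (annotation, not upstream code) Taken together, the doacross entry points
+  // are driven by the compiler roughly as follows for a one-dimensional
+  // `#pragma omp for ordered(1)` loop such as for (i = 2; i < 9; i += 2):
+  //
+  //   struct kmp_dim dim = {/*lo=*/2, /*up=*/8, /*st=*/2};  // inclusive bounds
+  //   __kmpc_doacross_init(loc, gtid, 1, &dim);
+  //   for (each iteration i assigned to this thread) {
+  //     kmp_int64 sink = i - 2;
+  //     __kmpc_doacross_wait(loc, gtid, &sink);  // ordered depend(sink: i-2)
+  //     ... loop body ...
+  //     kmp_int64 src = i;
+  //     __kmpc_doacross_post(loc, gtid, &src);   // ordered depend(source)
+  //   }
+  //   __kmpc_doacross_fini(loc, gtid);
+  //
+  // Completion is tracked in a shared bitmap with one bit per (collapsed)
+  // iteration: iteration number N lives in 32-bit word N >> 5, bit N & 31,
+  // which is what the shft/flag arithmetic above computes.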
KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid, + (iter_number << 5) + shft)); +} + +void __kmpc_doacross_fini(ident_t *loc, int gtid) { + __kmp_assert_valid_gtid(gtid); + kmp_int32 num_done; + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_disp_t *pr_buf = th->th.th_dispatch; + + KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); + if (team->t.t_serialized) { + KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team)); + return; // nothing to do + } + num_done = + KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1; + if (num_done == th->th.th_team_nproc) { + // we are the last thread, need to free shared resources + int idx = pr_buf->th_doacross_buf_idx - 1; + dispatch_shared_info_t *sh_buf = + &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers]; + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == + (kmp_int64)&sh_buf->doacross_num_done); + KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done); + KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); + __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags)); + sh_buf->doacross_flags = NULL; + sh_buf->doacross_num_done = 0; + sh_buf->doacross_buf_idx += + __kmp_dispatch_num_buffers; // free buffer for future re-use + } + // free private resources (need to keep buffer index forever) + pr_buf->th_doacross_flags = NULL; + __kmp_thread_free(th, (void *)pr_buf->th_doacross_info); + pr_buf->th_doacross_info = NULL; + KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid)); +} + +/* OpenMP 5.1 Memory Management routines */ +void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { + return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator); +} + +void *omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t allocator) { + return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator); +} + +void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) { + return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator); +} + +void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size, + omp_allocator_handle_t allocator) { + return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator); +} + +void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator) { + return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator, + free_allocator); +} + +void omp_free(void *ptr, omp_allocator_handle_t allocator) { + ___kmpc_free(__kmp_entry_gtid(), ptr, allocator); +} +/* end of OpenMP 5.1 Memory Management routines */ + +int __kmpc_get_target_offload(void) { + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return __kmp_target_offload; +} + +int __kmpc_pause_resource(kmp_pause_status_t level) { + if (!__kmp_init_serial) { + return 1; // Can't pause if runtime is not initialized + } + return __kmp_pause_resource(level); +} + +void __kmpc_error(ident_t *loc, int severity, const char *message) { + if (!__kmp_init_serial) + __kmp_serial_initialize(); + + KMP_ASSERT(severity == severity_warning || severity == severity_fatal); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) { + ompt_callbacks.ompt_callback(ompt_callback_error)( + (ompt_severity_t)severity, message, KMP_STRLEN(message), + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT + + char *src_loc; + if (loc && loc->psource) { + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false); + src_loc = + 
__kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col); + __kmp_str_loc_free(&str_loc); + } else { + src_loc = __kmp_str_format("unknown"); + } + + if (severity == severity_warning) + KMP_WARNING(UserDirectedWarning, src_loc, message); + else + KMP_FATAL(UserDirectedError, src_loc, message); + + __kmp_str_free(&src_loc); +} + +// Mark begin of scope directive. +void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) { +// reserved is for extension of scope directive and not used. +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) { + kmp_team_t *team = __kmp_threads[gtid]->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_scope, ompt_scope_begin, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL +} + +// Mark end of scope directive +void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) { +// reserved is for extension of scope directive and not used. +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) { + kmp_team_t *team = __kmp_threads[gtid]->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_scope, ompt_scope_end, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1, + OMPT_GET_RETURN_ADDRESS(0)); + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL +} + +#ifdef KMP_USE_VERSION_SYMBOLS +// For GOMP compatibility there are two versions of each omp_* API. +// One is the plain C symbol and one is the Fortran symbol with an appended +// underscore. When we implement a specific ompc_* version of an omp_* +// function, we want the plain GOMP versioned symbol to alias the ompc_* version +// instead of the Fortran versions in kmp_ftn_entry.h +extern "C" { +// Have to undef these from omp.h so they aren't translated into +// their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below +#ifdef omp_set_affinity_format +#undef omp_set_affinity_format +#endif +#ifdef omp_get_affinity_format +#undef omp_get_affinity_format +#endif +#ifdef omp_display_affinity +#undef omp_display_affinity +#endif +#ifdef omp_capture_affinity +#undef omp_capture_affinity +#endif +KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50, + "OMP_5.0"); +KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50, + "OMP_5.0"); +KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50, + "OMP_5.0"); +KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50, + "OMP_5.0"); +} // extern "C" +#endif diff --git a/third_party/openmp/kmp_debug.cpp b/third_party/openmp/kmp_debug.cpp new file mode 100644 index 000000000..6c397c5d8 --- /dev/null +++ b/third_party/openmp/kmp_debug.cpp @@ -0,0 +1,131 @@ +/* + * kmp_debug.cpp -- debug utilities for the Guide library + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_debug.h" /* really necessary? 
*/ +#include "kmp_i18n.h" +#include "kmp_io.h" + +#ifdef KMP_DEBUG +void __kmp_debug_printf_stdout(char const *format, ...) { + va_list ap; + va_start(ap, format); + + __kmp_vprintf(kmp_out, format, ap); + + va_end(ap); +} +#endif + +void __kmp_debug_printf(char const *format, ...) { + va_list ap; + va_start(ap, format); + + __kmp_vprintf(kmp_err, format, ap); + + va_end(ap); +} + +#ifdef KMP_USE_ASSERT +int __kmp_debug_assert(char const *msg, char const *file, int line) { + + if (file == NULL) { + file = KMP_I18N_STR(UnknownFile); + } else { + // Remove directories from path, leave only file name. File name is enough, + // there is no need in bothering developers and customers with full paths. + char const *slash = strrchr(file, '/'); + if (slash != NULL) { + file = slash + 1; + } + } + +#ifdef KMP_DEBUG + __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); + __kmp_debug_printf("Assertion failure at %s(%d): %s.\n", file, line, msg); + __kmp_release_bootstrap_lock(&__kmp_stdio_lock); +#ifdef USE_ASSERT_BREAK +#if KMP_OS_WINDOWS + DebugBreak(); +#endif +#endif // USE_ASSERT_BREAK +#ifdef USE_ASSERT_STALL + /* __kmp_infinite_loop(); */ + for (;;) + ; +#endif // USE_ASSERT_STALL +#ifdef USE_ASSERT_SEG + { + int volatile *ZERO = (int *)0; + ++(*ZERO); + } +#endif // USE_ASSERT_SEG +#endif + + __kmp_fatal(KMP_MSG(AssertionFailure, file, line), KMP_HNT(SubmitBugReport), + __kmp_msg_null); + + return 0; + +} // __kmp_debug_assert + +#endif // KMP_USE_ASSERT + +/* Dump debugging buffer to stderr */ +void __kmp_dump_debug_buffer(void) { + if (__kmp_debug_buffer != NULL) { + int i; + int dc = __kmp_debug_count; + char *db = &__kmp_debug_buffer[(dc % __kmp_debug_buf_lines) * + __kmp_debug_buf_chars]; + char *db_end = + &__kmp_debug_buffer[__kmp_debug_buf_lines * __kmp_debug_buf_chars]; + char *db2; + + __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); + __kmp_printf_no_lock("\nStart dump of debugging buffer (entry=%d):\n", + dc % __kmp_debug_buf_lines); + + for (i = 0; i < __kmp_debug_buf_lines; i++) { + + if (*db != '\0') { + /* Fix up where no carriage return before string termination char */ + for (db2 = db + 1; db2 < db + __kmp_debug_buf_chars - 1; db2++) { + if (*db2 == '\0') { + if (*(db2 - 1) != '\n') { + *db2 = '\n'; + *(db2 + 1) = '\0'; + } + break; + } + } + /* Handle case at end by shortening the printed message by one char if + * necessary */ + if (db2 == db + __kmp_debug_buf_chars - 1 && *db2 == '\0' && + *(db2 - 1) != '\n') { + *(db2 - 1) = '\n'; + } + + __kmp_printf_no_lock("%4d: %.*s", i, __kmp_debug_buf_chars, db); + *db = '\0'; /* only let it print once! */ + } + + db += __kmp_debug_buf_chars; + if (db >= db_end) + db = __kmp_debug_buffer; + } + + __kmp_printf_no_lock("End dump of debugging buffer (entry=%d).\n\n", + (dc + i - 1) % __kmp_debug_buf_lines); + __kmp_release_bootstrap_lock(&__kmp_stdio_lock); + } +} diff --git a/third_party/openmp/kmp_debug.h b/third_party/openmp/kmp_debug.h new file mode 100644 index 000000000..08d52cc04 --- /dev/null +++ b/third_party/openmp/kmp_debug.h @@ -0,0 +1,179 @@ +/* + * kmp_debug.h -- debug / assertion code for Assure library + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_DEBUG_H +#define KMP_DEBUG_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// ----------------------------------------------------------------------------- +// Build-time assertion. + +// New C++11 style build assert +#define KMP_BUILD_ASSERT(expr) static_assert(expr, "Build condition error") + +// ----------------------------------------------------------------------------- +// Run-time assertions. + +extern void __kmp_dump_debug_buffer(void); + +#ifdef KMP_USE_ASSERT +extern int __kmp_debug_assert(char const *expr, char const *file, int line); +#ifdef KMP_DEBUG +#define KMP_ASSERT(cond) \ + if (!(cond)) { \ + __kmp_debug_assert(#cond, __FILE__, __LINE__); \ + } +#define KMP_ASSERT2(cond, msg) \ + if (!(cond)) { \ + __kmp_debug_assert((msg), __FILE__, __LINE__); \ + } +#define KMP_DEBUG_ASSERT(cond) KMP_ASSERT(cond) +#define KMP_DEBUG_ASSERT2(cond, msg) KMP_ASSERT2(cond, msg) +#define KMP_DEBUG_USE_VAR(x) /* Nothing (it is used!) */ +#else +// Do not expose condition in release build. Use "assertion failure". +#define KMP_ASSERT(cond) \ + if (!(cond)) { \ + __kmp_debug_assert("assertion failure", __FILE__, __LINE__); \ + } +#define KMP_ASSERT2(cond, msg) KMP_ASSERT(cond) +#define KMP_DEBUG_ASSERT(cond) /* Nothing */ +#define KMP_DEBUG_ASSERT2(cond, msg) /* Nothing */ +#define KMP_DEBUG_USE_VAR(x) ((void)(x)) +#endif // KMP_DEBUG +#else +#define KMP_ASSERT(cond) /* Nothing */ +#define KMP_ASSERT2(cond, msg) /* Nothing */ +#define KMP_DEBUG_ASSERT(cond) /* Nothing */ +#define KMP_DEBUG_ASSERT2(cond, msg) /* Nothing */ +#define KMP_DEBUG_USE_VAR(x) ((void)(x)) +#endif // KMP_USE_ASSERT + +#ifdef KMP_DEBUG +extern void __kmp_debug_printf_stdout(char const *format, ...); +#endif +extern void __kmp_debug_printf(char const *format, ...); + +#ifdef KMP_DEBUG + +extern int kmp_a_debug; +extern int kmp_b_debug; +extern int kmp_c_debug; +extern int kmp_d_debug; +extern int kmp_e_debug; +extern int kmp_f_debug; +extern int kmp_diag; + +#define KA_TRACE(d, x) \ + if (kmp_a_debug >= d) { \ + __kmp_debug_printf x; \ + } +#define KB_TRACE(d, x) \ + if (kmp_b_debug >= d) { \ + __kmp_debug_printf x; \ + } +#define KC_TRACE(d, x) \ + if (kmp_c_debug >= d) { \ + __kmp_debug_printf x; \ + } +#define KD_TRACE(d, x) \ + if (kmp_d_debug >= d) { \ + __kmp_debug_printf x; \ + } +#define KE_TRACE(d, x) \ + if (kmp_e_debug >= d) { \ + __kmp_debug_printf x; \ + } +#define KF_TRACE(d, x) \ + if (kmp_f_debug >= d) { \ + __kmp_debug_printf x; \ + } +#define K_DIAG(d, x) \ + { \ + if (kmp_diag == d) { \ + __kmp_debug_printf_stdout x; \ + } \ + } + +#define KA_DUMP(d, x) \ + if (kmp_a_debug >= d) { \ + int ks; \ + __kmp_disable(&ks); \ + (x); \ + __kmp_enable(ks); \ + } +#define KB_DUMP(d, x) \ + if (kmp_b_debug >= d) { \ + int ks; \ + __kmp_disable(&ks); \ + (x); \ + __kmp_enable(ks); \ + } +#define KC_DUMP(d, x) \ + if (kmp_c_debug >= d) { \ + int ks; \ + __kmp_disable(&ks); \ + (x); \ + __kmp_enable(ks); \ + } +#define KD_DUMP(d, x) \ + if (kmp_d_debug >= d) { \ + int ks; \ + __kmp_disable(&ks); \ + (x); \ + __kmp_enable(ks); \ + } +#define KE_DUMP(d, x) \ + if (kmp_e_debug >= d) { \ + int ks; \ + __kmp_disable(&ks); \ + (x); \ + __kmp_enable(ks); \ + } +#define KF_DUMP(d, x) \ + if (kmp_f_debug >= d) { \ + int ks; \ + __kmp_disable(&ks); \ + (x); \ + __kmp_enable(ks); \ + } + +#else + +#define KA_TRACE(d, x) /* nothing to do */ 
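+// (annotation, not upstream code) Usage note for the trace macros above: the
+// second argument is a complete, parenthesized printf argument list, because
+// the KMP_DEBUG expansion is `__kmp_debug_printf x`. For example:
+//
+//   KA_TRACE(20, ("__kmp_example: T#%d saw n=%d\n", gtid, n));
+//
+// prints to the debug stream only when kmp_a_debug >= 20 in a KMP_DEBUG
+// build, and expands to nothing in this non-debug branch.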
+#define KB_TRACE(d, x) /* nothing to do */ +#define KC_TRACE(d, x) /* nothing to do */ +#define KD_TRACE(d, x) /* nothing to do */ +#define KE_TRACE(d, x) /* nothing to do */ +#define KF_TRACE(d, x) /* nothing to do */ +#define K_DIAG(d, x) \ + {} /* nothing to do */ + +#define KA_DUMP(d, x) /* nothing to do */ +#define KB_DUMP(d, x) /* nothing to do */ +#define KC_DUMP(d, x) /* nothing to do */ +#define KD_DUMP(d, x) /* nothing to do */ +#define KE_DUMP(d, x) /* nothing to do */ +#define KF_DUMP(d, x) /* nothing to do */ + +#endif // KMP_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif /* KMP_DEBUG_H */ diff --git a/third_party/openmp/kmp_debugger.cpp b/third_party/openmp/kmp_debugger.cpp new file mode 100644 index 000000000..0abdb0f44 --- /dev/null +++ b/third_party/openmp/kmp_debugger.cpp @@ -0,0 +1,286 @@ +#include "kmp_config.h" + +#if USE_DEBUGGER +/* + * kmp_debugger.cpp -- debugger support. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_lock.h" +#include "kmp_omp.h" +#include "kmp_str.h" + +// NOTE: All variable names are known to the debugger, do not change! + +#ifdef __cplusplus +extern "C" { +extern kmp_omp_struct_info_t __kmp_omp_debug_struct_info; +} // extern "C" +#endif // __cplusplus + +int __kmp_debugging = FALSE; // Boolean whether currently debugging OpenMP RTL. + +#define offset_and_size_of(structure, field) \ + { offsetof(structure, field), sizeof(((structure *)NULL)->field) } + +#define offset_and_size_not_available \ + { -1, -1 } + +#define addr_and_size_of(var) \ + { (kmp_uint64)(&var), sizeof(var) } + +#define nthr_buffer_size 1024 +static kmp_int32 kmp_omp_nthr_info_buffer[nthr_buffer_size] = { + nthr_buffer_size * sizeof(kmp_int32)}; + +/* TODO: Check punctuation for various platforms here */ +static char func_microtask[] = "__kmp_invoke_microtask"; +static char func_fork[] = "__kmpc_fork_call"; +static char func_fork_teams[] = "__kmpc_fork_teams"; + +// Various info about runtime structures: addresses, field offsets, sizes, etc. +kmp_omp_struct_info_t __kmp_omp_debug_struct_info = { + + /* Change this only if you make a fundamental data structure change here */ + KMP_OMP_VERSION, + + /* sanity check. Only should be checked if versions are identical + * This is also used for backward compatibility to get the runtime + * structure size if it the runtime is older than the interface */ + sizeof(kmp_omp_struct_info_t), + + /* OpenMP RTL version info. */ + addr_and_size_of(__kmp_version_major), + addr_and_size_of(__kmp_version_minor), + addr_and_size_of(__kmp_version_build), + addr_and_size_of(__kmp_openmp_version), + {(kmp_uint64)(__kmp_copyright) + KMP_VERSION_MAGIC_LEN, + 0}, // Skip magic prefix. + + /* Various globals. 
*/ + addr_and_size_of(__kmp_threads), + addr_and_size_of(__kmp_root), + addr_and_size_of(__kmp_threads_capacity), +#if KMP_USE_MONITOR + addr_and_size_of(__kmp_monitor), +#endif +#if !KMP_USE_DYNAMIC_LOCK + addr_and_size_of(__kmp_user_lock_table), +#endif + addr_and_size_of(func_microtask), + addr_and_size_of(func_fork), + addr_and_size_of(func_fork_teams), + addr_and_size_of(__kmp_team_counter), + addr_and_size_of(__kmp_task_counter), + addr_and_size_of(kmp_omp_nthr_info_buffer), + sizeof(void *), + OMP_LOCK_T_SIZE < sizeof(void *), + bs_last_barrier, + INITIAL_TASK_DEQUE_SIZE, + + // thread structure information + sizeof(kmp_base_info_t), + offset_and_size_of(kmp_base_info_t, th_info), + offset_and_size_of(kmp_base_info_t, th_team), + offset_and_size_of(kmp_base_info_t, th_root), + offset_and_size_of(kmp_base_info_t, th_serial_team), + offset_and_size_of(kmp_base_info_t, th_ident), + offset_and_size_of(kmp_base_info_t, th_spin_here), + offset_and_size_of(kmp_base_info_t, th_next_waiting), + offset_and_size_of(kmp_base_info_t, th_task_team), + offset_and_size_of(kmp_base_info_t, th_current_task), + offset_and_size_of(kmp_base_info_t, th_task_state), + offset_and_size_of(kmp_base_info_t, th_bar), + offset_and_size_of(kmp_bstate_t, b_worker_arrived), + + // teams information + offset_and_size_of(kmp_base_info_t, th_teams_microtask), + offset_and_size_of(kmp_base_info_t, th_teams_level), + offset_and_size_of(kmp_teams_size_t, nteams), + offset_and_size_of(kmp_teams_size_t, nth), + + // kmp_desc structure (for info field above) + sizeof(kmp_desc_base_t), + offset_and_size_of(kmp_desc_base_t, ds_tid), + offset_and_size_of(kmp_desc_base_t, ds_gtid), +// On Windows* OS, ds_thread contains a thread /handle/, which is not usable, +// while thread /id/ is in ds_thread_id. 
+#if KMP_OS_WINDOWS + offset_and_size_of(kmp_desc_base_t, ds_thread_id), +#else + offset_and_size_of(kmp_desc_base_t, ds_thread), +#endif + + // team structure information + sizeof(kmp_base_team_t), + offset_and_size_of(kmp_base_team_t, t_master_tid), + offset_and_size_of(kmp_base_team_t, t_ident), + offset_and_size_of(kmp_base_team_t, t_parent), + offset_and_size_of(kmp_base_team_t, t_nproc), + offset_and_size_of(kmp_base_team_t, t_threads), + offset_and_size_of(kmp_base_team_t, t_serialized), + offset_and_size_of(kmp_base_team_t, t_id), + offset_and_size_of(kmp_base_team_t, t_pkfn), + offset_and_size_of(kmp_base_team_t, t_task_team), + offset_and_size_of(kmp_base_team_t, t_implicit_task_taskdata), + offset_and_size_of(kmp_base_team_t, t_cancel_request), + offset_and_size_of(kmp_base_team_t, t_bar), + offset_and_size_of(kmp_balign_team_t, b_master_arrived), + offset_and_size_of(kmp_balign_team_t, b_team_arrived), + + // root structure information + sizeof(kmp_base_root_t), + offset_and_size_of(kmp_base_root_t, r_root_team), + offset_and_size_of(kmp_base_root_t, r_hot_team), + offset_and_size_of(kmp_base_root_t, r_uber_thread), + offset_and_size_not_available, + + // ident structure information + sizeof(ident_t), + offset_and_size_of(ident_t, psource), + offset_and_size_of(ident_t, flags), + + // lock structure information + sizeof(kmp_base_queuing_lock_t), + offset_and_size_of(kmp_base_queuing_lock_t, initialized), + offset_and_size_of(kmp_base_queuing_lock_t, location), + offset_and_size_of(kmp_base_queuing_lock_t, tail_id), + offset_and_size_of(kmp_base_queuing_lock_t, head_id), + offset_and_size_of(kmp_base_queuing_lock_t, next_ticket), + offset_and_size_of(kmp_base_queuing_lock_t, now_serving), + offset_and_size_of(kmp_base_queuing_lock_t, owner_id), + offset_and_size_of(kmp_base_queuing_lock_t, depth_locked), + offset_and_size_of(kmp_base_queuing_lock_t, flags), + +#if !KMP_USE_DYNAMIC_LOCK + /* Lock table. */ + sizeof(kmp_lock_table_t), + offset_and_size_of(kmp_lock_table_t, used), + offset_and_size_of(kmp_lock_table_t, allocated), + offset_and_size_of(kmp_lock_table_t, table), +#endif + + // Task team structure information. + sizeof(kmp_base_task_team_t), + offset_and_size_of(kmp_base_task_team_t, tt_threads_data), + offset_and_size_of(kmp_base_task_team_t, tt_found_tasks), + offset_and_size_of(kmp_base_task_team_t, tt_nproc), + offset_and_size_of(kmp_base_task_team_t, tt_unfinished_threads), + offset_and_size_of(kmp_base_task_team_t, tt_active), + + // task_data_t. 
+ sizeof(kmp_taskdata_t), + offset_and_size_of(kmp_taskdata_t, td_task_id), + offset_and_size_of(kmp_taskdata_t, td_flags), + offset_and_size_of(kmp_taskdata_t, td_team), + offset_and_size_of(kmp_taskdata_t, td_parent), + offset_and_size_of(kmp_taskdata_t, td_level), + offset_and_size_of(kmp_taskdata_t, td_ident), + offset_and_size_of(kmp_taskdata_t, td_allocated_child_tasks), + offset_and_size_of(kmp_taskdata_t, td_incomplete_child_tasks), + + offset_and_size_of(kmp_taskdata_t, td_taskwait_ident), + offset_and_size_of(kmp_taskdata_t, td_taskwait_counter), + offset_and_size_of(kmp_taskdata_t, td_taskwait_thread), + + offset_and_size_of(kmp_taskdata_t, td_taskgroup), + offset_and_size_of(kmp_taskgroup_t, count), + offset_and_size_of(kmp_taskgroup_t, cancel_request), + + offset_and_size_of(kmp_taskdata_t, td_depnode), + offset_and_size_of(kmp_depnode_list_t, node), + offset_and_size_of(kmp_depnode_list_t, next), + offset_and_size_of(kmp_base_depnode_t, successors), + offset_and_size_of(kmp_base_depnode_t, task), + offset_and_size_of(kmp_base_depnode_t, npredecessors), + offset_and_size_of(kmp_base_depnode_t, nrefs), + offset_and_size_of(kmp_task_t, routine), + + // thread_data_t. + sizeof(kmp_thread_data_t), + offset_and_size_of(kmp_base_thread_data_t, td_deque), + offset_and_size_of(kmp_base_thread_data_t, td_deque_size), + offset_and_size_of(kmp_base_thread_data_t, td_deque_head), + offset_and_size_of(kmp_base_thread_data_t, td_deque_tail), + offset_and_size_of(kmp_base_thread_data_t, td_deque_ntasks), + offset_and_size_of(kmp_base_thread_data_t, td_deque_last_stolen), + + // The last field. + KMP_OMP_VERSION, + +}; // __kmp_omp_debug_struct_info + +#undef offset_and_size_of +#undef addr_and_size_of + +/* Intel compiler on IA-32 architecture issues a warning "conversion + from "unsigned long long" to "char *" may lose significant bits" + when 64-bit value is assigned to 32-bit pointer. Use this function + to suppress the warning. */ +static inline void *__kmp_convert_to_ptr(kmp_uint64 addr) { +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX +#pragma warning(push) +#pragma warning(disable : 810) // conversion from "unsigned long long" to "char +// *" may lose significant bits +#pragma warning(disable : 1195) // conversion from integer to smaller pointer +#endif // KMP_COMPILER_ICC || KMP_COMPILER_ICX + return (void *)addr; +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX +#pragma warning(pop) +#endif // KMP_COMPILER_ICC || KMP_COMPILER_ICX +} // __kmp_convert_to_ptr + +static int kmp_location_match(kmp_str_loc_t *loc, kmp_omp_nthr_item_t *item) { + + int file_match = 0; + int func_match = 0; + int line_match = 0; + + char *file = (char *)__kmp_convert_to_ptr(item->file); + char *func = (char *)__kmp_convert_to_ptr(item->func); + file_match = __kmp_str_fname_match(&loc->fname, file); + func_match = + item->func == 0 // If item->func is NULL, it allows any func name. + || strcmp(func, "*") == 0 || + (loc->func != NULL && strcmp(loc->func, func) == 0); + line_match = + item->begin <= loc->line && + (item->end <= 0 || + loc->line <= item->end); // if item->end <= 0, it means "end of file". 
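+  // A location matches only when all three criteria (file name, function
+  // name, line range) hold, each with the wildcard handling above; the
+  // debugger-supplied nthreads entry is applied only in that case.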
+ + return (file_match && func_match && line_match); + +} // kmp_location_match + +int __kmp_omp_num_threads(ident_t const *ident) { + + int num_threads = 0; + + kmp_omp_nthr_info_t *info = (kmp_omp_nthr_info_t *)__kmp_convert_to_ptr( + __kmp_omp_debug_struct_info.nthr_info.addr); + if (info->num > 0 && info->array != 0) { + kmp_omp_nthr_item_t *items = + (kmp_omp_nthr_item_t *)__kmp_convert_to_ptr(info->array); + kmp_str_loc_t loc = __kmp_str_loc_init(ident->psource, true); + int i; + for (i = 0; i < info->num; ++i) { + if (kmp_location_match(&loc, &items[i])) { + num_threads = items[i].num_threads; + } + } + __kmp_str_loc_free(&loc); + } + + return num_threads; + ; + +} // __kmp_omp_num_threads +#endif /* USE_DEBUGGER */ diff --git a/third_party/openmp/kmp_debugger.h b/third_party/openmp/kmp_debugger.h new file mode 100644 index 000000000..7ec74287d --- /dev/null +++ b/third_party/openmp/kmp_debugger.h @@ -0,0 +1,48 @@ +#if USE_DEBUGGER +/* + * kmp_debugger.h -- debugger support. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_DEBUGGER_H +#define KMP_DEBUGGER_H + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* This external variable can be set by any debugger to flag to the runtime + that we are currently executing inside a debugger. This will allow the + debugger to override the number of threads spawned in a parallel region by + using __kmp_omp_num_threads() (below). + * When __kmp_debugging is TRUE, each team and each task gets a unique integer + identifier that can be used by debugger to conveniently identify teams and + tasks. + * The debugger has access to __kmp_omp_debug_struct_info which contains + information about the OpenMP library's important internal structures. This + access will allow the debugger to read detailed information from the typical + OpenMP constructs (teams, threads, tasking, etc. ) during a debugging + session and offer detailed and useful information which the user can probe + about the OpenMP portion of their code. */ +extern int __kmp_debugging; /* Boolean whether currently debugging OpenMP RTL */ +// Return number of threads specified by the debugger for given parallel region. +/* The ident field, which represents a source file location, is used to check if + the debugger has changed the number of threads for the parallel region at + source file location ident. This way, specific parallel regions' number of + threads can be changed at the debugger's request. */ +int __kmp_omp_num_threads(ident_t const *ident); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // KMP_DEBUGGER_H + +#endif // USE_DEBUGGER diff --git a/third_party/openmp/kmp_dispatch.cpp b/third_party/openmp/kmp_dispatch.cpp new file mode 100644 index 000000000..ac85b2b3f --- /dev/null +++ b/third_party/openmp/kmp_dispatch.cpp @@ -0,0 +1,3142 @@ +/* + * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/* Dynamic scheduling initialization and dispatch. + * + * NOTE: __kmp_nth is a constant inside of any dispatch loop, however + * it may change values between parallel regions. __kmp_max_nth + * is the largest value __kmp_nth may take, 1 is the smallest. + */ + +#include "kmp.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" +#include "kmp_stats.h" +#include "kmp_str.h" +#if KMP_USE_X87CONTROL +#include +#endif +#include "kmp_lock.h" +#include "kmp_dispatch.h" +#if KMP_USE_HIER_SCHED +#include "kmp_dispatch_hier.h" +#endif + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +/* ------------------------------------------------------------------------ */ +/* ------------------------------------------------------------------------ */ + +void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { + kmp_info_t *th; + + KMP_DEBUG_ASSERT(gtid_ref); + + if (__kmp_env_consistency_check) { + th = __kmp_threads[*gtid_ref]; + if (th->th.th_root->r.r_active && + (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { +#if KMP_USE_DYNAMIC_LOCK + __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0); +#else + __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL); +#endif + } + } +} + +void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { + kmp_info_t *th; + + if (__kmp_env_consistency_check) { + th = __kmp_threads[*gtid_ref]; + if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { + __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); + } + } +} + +// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC +static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, + bool use_hier = false) { + // Pick up the nonmonotonic/monotonic bits from the scheduling type + // Nonmonotonic as default for dynamic schedule when no modifier is specified + int monotonicity = SCHEDULE_NONMONOTONIC; + + // Let default be monotonic for executables + // compiled with OpenMP* 4.5 or less compilers + if (loc != NULL && loc->get_openmp_version() < 50) + monotonicity = SCHEDULE_MONOTONIC; + + if (use_hier || __kmp_force_monotonic) + monotonicity = SCHEDULE_MONOTONIC; + else if (SCHEDULE_HAS_NONMONOTONIC(schedule)) + monotonicity = SCHEDULE_NONMONOTONIC; + else if (SCHEDULE_HAS_MONOTONIC(schedule)) + monotonicity = SCHEDULE_MONOTONIC; + + return monotonicity; +} + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +// Return floating point number rounded to two decimal points +static inline float __kmp_round_2decimal_val(float num) { + return (float)(static_cast(num * 100 + 0.5)) / 100; +} +static inline int __kmp_get_round_val(float num) { + return static_cast(num < 0 ? 
num - 0.5 : num + 0.5); +} +#endif + +template +inline void +__kmp_initialize_self_buffer(kmp_team_t *team, T id, + dispatch_private_info_template *pr, + typename traits_t::unsigned_t nchunks, T nproc, + typename traits_t::unsigned_t &init, + T &small_chunk, T &extras, T &p_extra) { + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + if (pr->flags.use_hybrid) { + kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)]; + kmp_hw_core_type_t type = + (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type; + T pchunks = pr->u.p.pchunks; + T echunks = nchunks - pchunks; + T num_procs_with_pcore = pr->u.p.num_procs_with_pcore; + T num_procs_with_ecore = nproc - num_procs_with_pcore; + T first_thread_with_ecore = pr->u.p.first_thread_with_ecore; + T big_chunk = + pchunks / num_procs_with_pcore; // chunks per thread with p-core + small_chunk = + echunks / num_procs_with_ecore; // chunks per thread with e-core + + extras = + (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore); + + p_extra = (big_chunk - small_chunk); + + if (type == KMP_HW_CORE_TYPE_CORE) { + if (id < first_thread_with_ecore) { + init = id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + (id - num_procs_with_ecore) * p_extra + + (id < extras ? id : extras); + } + } else { + if (id == first_thread_with_ecore) { + init = id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + first_thread_with_ecore * p_extra + + (id < extras ? id : extras); + } + } + p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0; + return; + } +#endif + + small_chunk = nchunks / nproc; // chunks per thread + extras = nchunks % nproc; + p_extra = 0; + init = id * small_chunk + (id < extras ? id : extras); +} + +#if KMP_STATIC_STEAL_ENABLED +enum { // values for steal_flag (possible states of private per-loop buffer) + UNUSED = 0, + CLAIMED = 1, // owner thread started initialization + READY = 2, // available for stealing + THIEF = 3 // finished by owner, or claimed by thief + // possible state changes: + // 0 -> 1 owner only, sync + // 0 -> 3 thief only, sync + // 1 -> 2 owner only, async + // 2 -> 3 owner only, async + // 3 -> 2 owner only, async + // 3 -> 0 last thread finishing the loop, async +}; +#endif + +// Initialize a dispatch_private_info_template buffer for a particular +// type of schedule,chunk. The loop description is found in lb (lower bound), +// ub (upper bound), and st (stride). nproc is the number of threads relevant +// to the scheduling (often the number of threads in a team, but not always if +// hierarchical scheduling is used). tid is the id of the thread calling +// the function within the group of nproc threads. It will have a value +// between 0 and nproc - 1. This is often just the thread id within a team, but +// is not necessarily the case when using hierarchical scheduling. 
+// loc is the source file location of the corresponding loop +// gtid is the global thread id +template +void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, + dispatch_private_info_template *pr, + enum sched_type schedule, T lb, T ub, + typename traits_t::signed_t st, +#if USE_ITT_BUILD + kmp_uint64 *cur_chunk, +#endif + typename traits_t::signed_t chunk, + T nproc, T tid) { + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::floating_t DBL; + + int active; + T tc; + kmp_info_t *th; + kmp_team_t *team; + int monotonicity; + bool use_hier; + +#ifdef KMP_DEBUG + typedef typename traits_t::signed_t ST; + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " + "pr:%%p lb:%%%s ub:%%%s st:%%%s " + "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid)); + __kmp_str_free(&buff); + } +#endif + /* setup data */ + th = __kmp_threads[gtid]; + team = th->th.th_team; + active = !team->t.t_serialized; + +#if USE_ITT_BUILD + int itt_need_metadata_reporting = + __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && + KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && + team->t.t_active_level == 1; +#endif + +#if KMP_USE_HIER_SCHED + use_hier = pr->flags.use_hier; +#else + use_hier = false; +#endif + + /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ + monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + + /* Pick up the nomerge/ordered bits from the scheduling type */ + if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { + pr->flags.nomerge = TRUE; + schedule = + (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); + } else { + pr->flags.nomerge = FALSE; + } + pr->type_size = traits_t::type_size; // remember the size of variables + if (kmp_ord_lower & schedule) { + pr->flags.ordered = TRUE; + schedule = + (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); + } else { + pr->flags.ordered = FALSE; + } + // Ordered overrides nonmonotonic + if (pr->flags.ordered) { + monotonicity = SCHEDULE_MONOTONIC; + } + + if (schedule == kmp_sch_static) { + schedule = __kmp_static; + } else { + if (schedule == kmp_sch_runtime) { + // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if + // not specified) + schedule = team->t.t_sched.r_sched_type; + monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + if (pr->flags.ordered) // correct monotonicity for ordered loop if needed + monotonicity = SCHEDULE_MONOTONIC; + // Detail the schedule if needed (global controls are differentiated + // appropriately) + if (schedule == kmp_sch_guided_chunked) { + schedule = __kmp_guided; + } else if (schedule == kmp_sch_static) { + schedule = __kmp_static; + } + // Use the chunk size specified by OMP_SCHEDULE (or default if not + // specified) + chunk = team->t.t_sched.chunk; +#if USE_ITT_BUILD + if (cur_chunk) + *cur_chunk = chunk; +#endif +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " + "schedule:%%d chunk:%%%s\n", + traits_t::spec); + KD_TRACE(10, (buff, gtid, schedule, chunk)); + __kmp_str_free(&buff); + } +#endif + } 
else { + if (schedule == kmp_sch_guided_chunked) { + schedule = __kmp_guided; + } + if (chunk <= 0) { + chunk = KMP_DEFAULT_CHUNK; + } + } + + if (schedule == kmp_sch_auto) { + // mapping and differentiation: in the __kmp_do_serial_initialize() + schedule = __kmp_auto; +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " + "schedule:%%d chunk:%%%s\n", + traits_t::spec); + KD_TRACE(10, (buff, gtid, schedule, chunk)); + __kmp_str_free(&buff); + } +#endif + } +#if KMP_STATIC_STEAL_ENABLED + // map nonmonotonic:dynamic to static steal + if (schedule == kmp_sch_dynamic_chunked) { + if (monotonicity == SCHEDULE_NONMONOTONIC) + schedule = kmp_sch_static_steal; + } +#endif + /* guided analytical not safe for too many threads */ + if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { + schedule = kmp_sch_guided_iterative_chunked; + KMP_WARNING(DispatchManyThreads); + } + if (schedule == kmp_sch_runtime_simd) { + // compiler provides simd_width in the chunk parameter + schedule = team->t.t_sched.r_sched_type; + monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + // Detail the schedule if needed (global controls are differentiated + // appropriately) + if (schedule == kmp_sch_static || schedule == kmp_sch_auto || + schedule == __kmp_static) { + schedule = kmp_sch_static_balanced_chunked; + } else { + if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { + schedule = kmp_sch_guided_simd; + } + chunk = team->t.t_sched.chunk * chunk; + } +#if USE_ITT_BUILD + if (cur_chunk) + *cur_chunk = chunk; +#endif +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" + " chunk:%%%s\n", + traits_t::spec); + KD_TRACE(10, (buff, gtid, schedule, chunk)); + __kmp_str_free(&buff); + } +#endif + } + pr->u.p.parm1 = chunk; + } + KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), + "unknown scheduling type"); + + pr->u.p.count = 0; + + if (__kmp_env_consistency_check) { + if (st == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, + (pr->flags.ordered ? 
ct_pdo_ordered : ct_pdo), loc); + } + } + // compute trip count + if (st == 1) { // most common case + if (ub >= lb) { + tc = ub - lb + 1; + } else { // ub < lb + tc = 0; // zero-trip + } + } else if (st < 0) { + if (lb >= ub) { + // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), + // where the division needs to be unsigned regardless of the result type + tc = (UT)(lb - ub) / (-st) + 1; + } else { // lb < ub + tc = 0; // zero-trip + } + } else { // st > 0 + if (ub >= lb) { + // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), + // where the division needs to be unsigned regardless of the result type + tc = (UT)(ub - lb) / st + 1; + } else { // ub < lb + tc = 0; // zero-trip + } + } + +#if KMP_STATS_ENABLED + if (KMP_MASTER_GTID(gtid)) { + KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc); + } +#endif + + pr->u.p.lb = lb; + pr->u.p.ub = ub; + pr->u.p.st = st; + pr->u.p.tc = tc; + +#if KMP_OS_WINDOWS + pr->u.p.last_upper = ub + st; +#endif /* KMP_OS_WINDOWS */ + + /* NOTE: only the active parallel region(s) has active ordered sections */ + + if (active) { + if (pr->flags.ordered) { + pr->ordered_bumped = 0; + pr->u.p.ordered_lower = 1; + pr->u.p.ordered_upper = 0; + } + } + + switch (schedule) { +#if KMP_STATIC_STEAL_ENABLED + case kmp_sch_static_steal: { + T ntc, init = 0; + + KD_TRACE(100, + ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", + gtid)); + + ntc = (tc % chunk ? 1 : 0) + tc / chunk; + if (nproc > 1 && ntc >= nproc) { + KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); + T id = tid; + T small_chunk, extras, p_extra = 0; + kmp_uint32 old = UNUSED; + int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED); + if (traits_t::type_size > 4) { + // AC: TODO: check if 16-byte CAS available and use it to + // improve performance (probably wait for explicit request + // before spending time on this). + // For now use dynamically allocated per-private-buffer lock, + // free memory in __kmp_dispatch_next when status==0. 
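+        // The per-buffer lock is used because the (count, ub) pair of an
+        // 8-byte induction variable cannot be updated with the single 8-byte
+        // CAS that the 4-byte case relies on.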
+ pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); + __kmp_init_lock(pr->u.p.steal_lock); + } + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + // Iterations are divided in a 60/40 skewed distribution among CORE and + // ATOM processors for hybrid systems + bool use_hybrid = false; + kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN; + T first_thread_with_ecore = 0; + T num_procs_with_pcore = 0; + T num_procs_with_ecore = 0; + T p_ntc = 0, e_ntc = 0; + if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none && + __kmp_affinity.type != affinity_explicit) { + use_hybrid = true; + core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type; + if (core_type != KMP_HW_CORE_TYPE_UNKNOWN && + __kmp_first_osid_with_ecore > -1) { + for (int i = 0; i < team->t.t_nproc; ++i) { + kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i] + ->th.th_topology_attrs.core_type; + int id = team->t.t_threads[i]->th.th_topology_ids.os_id; + if (id == __kmp_first_osid_with_ecore) { + first_thread_with_ecore = + team->t.t_threads[i]->th.th_info.ds.ds_tid; + } + if (type == KMP_HW_CORE_TYPE_CORE) { + num_procs_with_pcore++; + } else if (type == KMP_HW_CORE_TYPE_ATOM) { + num_procs_with_ecore++; + } else { + use_hybrid = false; + break; + } + } + } + if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) { + float multiplier = 60.0 / 40.0; + float p_ratio = (float)num_procs_with_pcore / nproc; + float e_ratio = (float)num_procs_with_ecore / nproc; + float e_multiplier = + (float)1 / + (((multiplier * num_procs_with_pcore) / nproc) + e_ratio); + float p_multiplier = multiplier * e_multiplier; + p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier); + if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier)) + e_ntc = + (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier)); + else + e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier); + KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc); + + // Use regular static steal if not enough chunks for skewed + // distribution + use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore && + e_ntc >= num_procs_with_ecore) + ? true + : false); + } else { + use_hybrid = false; + } + } + pr->flags.use_hybrid = use_hybrid; + pr->u.p.pchunks = p_ntc; + pr->u.p.num_procs_with_pcore = num_procs_with_pcore; + pr->u.p.first_thread_with_ecore = first_thread_with_ecore; + + if (use_hybrid) { + KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore); + T big_chunk = p_ntc / num_procs_with_pcore; + small_chunk = e_ntc / num_procs_with_ecore; + + extras = + (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore); + + p_extra = (big_chunk - small_chunk); + + if (core_type == KMP_HW_CORE_TYPE_CORE) { + if (id < first_thread_with_ecore) { + init = + id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + (id - num_procs_with_ecore) * p_extra + + (id < extras ? id : extras); + } + } else { + if (id == first_thread_with_ecore) { + init = + id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + first_thread_with_ecore * p_extra + + (id < extras ? id : extras); + } + } + p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0; + } else +#endif + { + small_chunk = ntc / nproc; + extras = ntc % nproc; + init = id * small_chunk + (id < extras ? id : extras); + p_extra = 0; + } + pr->u.p.count = init; + if (claimed) { // are we succeeded in claiming own buffer? + pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 
1 : 0); + // Other threads will inspect steal_flag when searching for a victim. + // READY means other threads may steal from this thread from now on. + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + } else { + // other thread has stolen whole our range + KMP_DEBUG_ASSERT(pr->steal_flag == THIEF); + pr->u.p.ub = init; // mark there is no iterations to work on + } + pr->u.p.parm2 = ntc; // save number of chunks + // parm3 is the number of times to attempt stealing which is + // nproc (just a heuristics, could be optimized later on). + pr->u.p.parm3 = nproc; + pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid + break; + } else { + /* too few chunks: switching to kmp_sch_dynamic_chunked */ + schedule = kmp_sch_dynamic_chunked; + KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to " + "kmp_sch_dynamic_chunked\n", + gtid)); + goto dynamic_init; + break; + } // if + } // case +#endif + case kmp_sch_static_balanced: { + T init, limit; + + KD_TRACE( + 100, + ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n", + gtid)); + + if (nproc > 1) { + T id = tid; + + if (tc < nproc) { + if (id < tc) { + init = id; + limit = id; + pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ + } else { + pr->u.p.count = 1; /* means no more chunks to execute */ + pr->u.p.parm1 = FALSE; + break; + } + } else { + T small_chunk = tc / nproc; + T extras = tc % nproc; + init = id * small_chunk + (id < extras ? id : extras); + limit = init + small_chunk - (id < extras ? 0 : 1); + pr->u.p.parm1 = (id == nproc - 1); + } + } else { + if (tc > 0) { + init = 0; + limit = tc - 1; + pr->u.p.parm1 = TRUE; + } else { + // zero trip count + pr->u.p.count = 1; /* means no more chunks to execute */ + pr->u.p.parm1 = FALSE; + break; + } + } +#if USE_ITT_BUILD + // Calculate chunk for metadata report + if (itt_need_metadata_reporting) + if (cur_chunk) + *cur_chunk = limit - init + 1; +#endif + if (st == 1) { + pr->u.p.lb = lb + init; + pr->u.p.ub = lb + limit; + } else { + // calculated upper bound, "ub" is user-defined upper bound + T ub_tmp = lb + limit * st; + pr->u.p.lb = lb + init * st; + // adjust upper bound to "ub" if needed, so that MS lastprivate will match + // it exactly + if (st > 0) { + pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); + } else { + pr->u.p.ub = (ub_tmp + st < ub ? 
ub : ub_tmp); + } + } + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; + } + break; + } // case + case kmp_sch_static_balanced_chunked: { + // similar to balanced, but chunk adjusted to multiple of simd width + T nth = nproc; + KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" + " -> falling-through to static_greedy\n", + gtid)); + schedule = kmp_sch_static_greedy; + if (nth > 1) + pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); + else + pr->u.p.parm1 = tc; + break; + } // case + case kmp_sch_guided_simd: + case kmp_sch_guided_iterative_chunked: { + KD_TRACE( + 100, + ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" + " case\n", + gtid)); + + if (nproc > 1) { + if ((2L * chunk + 1) * nproc >= tc) { + /* chunk size too large, switch to dynamic */ + schedule = kmp_sch_dynamic_chunked; + goto dynamic_init; + } else { + // when remaining iters become less than parm2 - switch to dynamic + pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); + *(double *)&pr->u.p.parm3 = + guided_flt_param / (double)nproc; // may occupy parm3 and parm4 + } + } else { + KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " + "kmp_sch_static_greedy\n", + gtid)); + schedule = kmp_sch_static_greedy; + /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ + KD_TRACE( + 100, + ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", + gtid)); + pr->u.p.parm1 = tc; + } // if + } // case + break; + case kmp_sch_guided_analytical_chunked: { + KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " + "kmp_sch_guided_analytical_chunked case\n", + gtid)); + + if (nproc > 1) { + if ((2L * chunk + 1) * nproc >= tc) { + /* chunk size too large, switch to dynamic */ + schedule = kmp_sch_dynamic_chunked; + goto dynamic_init; + } else { + /* commonly used term: (2 nproc - 1)/(2 nproc) */ + DBL x; + +#if KMP_USE_X87CONTROL + /* Linux* OS already has 64-bit computation by default for long double, + and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On + Windows* OS on IA-32 architecture, we need to set precision to 64-bit + instead of the default 53-bit. Even though long double doesn't work + on Windows* OS on Intel(R) 64, the resulting lack of precision is not + expected to impact the correctness of the algorithm, but this has not + been mathematically proven. 
*/ + // save original FPCW and set precision to 64-bit, as + // Windows* OS on IA-32 architecture defaults to 53-bit + unsigned int oldFpcw = _control87(0, 0); + _control87(_PC_64, _MCW_PC); // 0,0x30000 +#endif + /* value used for comparison in solver for cross-over point */ + KMP_ASSERT(tc > 0); + long double target = ((long double)chunk * 2 + 1) * nproc / tc; + + /* crossover point--chunk indexes equal to or greater than + this point switch to dynamic-style scheduling */ + UT cross; + + /* commonly used term: (2 nproc - 1)/(2 nproc) */ + x = 1.0 - 0.5 / (double)nproc; + +#ifdef KMP_DEBUG + { // test natural alignment + struct _test_a { + char a; + union { + char b; + DBL d; + }; + } t; + ptrdiff_t natural_alignment = + (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; + //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long + // long)natural_alignment ); + KMP_DEBUG_ASSERT( + (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0); + } +#endif // KMP_DEBUG + + /* save the term in thread private dispatch structure */ + *(DBL *)&pr->u.p.parm3 = x; + + /* solve for the crossover point to the nearest integer i for which C_i + <= chunk */ + { + UT left, right, mid; + long double p; + + /* estimate initial upper and lower bound */ + + /* doesn't matter what value right is as long as it is positive, but + it affects performance of the solver */ + right = 229; + p = __kmp_pow(x, right); + if (p > target) { + do { + p *= p; + right <<= 1; + } while (p > target && right < (1 << 27)); + /* lower bound is previous (failed) estimate of upper bound */ + left = right >> 1; + } else { + left = 0; + } + + /* bisection root-finding method */ + while (left + 1 < right) { + mid = (left + right) / 2; + if (__kmp_pow(x, mid) > target) { + left = mid; + } else { + right = mid; + } + } // while + cross = right; + } + /* assert sanity of computed crossover point */ + KMP_ASSERT(cross && __kmp_pow(x, cross - 1) > target && + __kmp_pow(x, cross) <= target); + + /* save the crossover point in thread private dispatch structure */ + pr->u.p.parm2 = cross; + +// C75803 +#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8)) +#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3) +#else +#define GUIDED_ANALYTICAL_WORKAROUND (x) +#endif + /* dynamic-style scheduling offset */ + pr->u.p.count = tc - + __kmp_dispatch_guided_remaining( + tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - + cross * chunk; +#if KMP_USE_X87CONTROL + // restore FPCW + _control87(oldFpcw, _MCW_PC); +#endif + } // if + } else { + KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to " + "kmp_sch_static_greedy\n", + gtid)); + schedule = kmp_sch_static_greedy; + /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ + pr->u.p.parm1 = tc; + } // if + } // case + break; + case kmp_sch_static_greedy: + KD_TRACE( + 100, + ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n", + gtid)); + pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc; + break; + case kmp_sch_static_chunked: + case kmp_sch_dynamic_chunked: + dynamic_init: + if (tc == 0) + break; + if (pr->u.p.parm1 <= 0) + pr->u.p.parm1 = KMP_DEFAULT_CHUNK; + else if (pr->u.p.parm1 > tc) + pr->u.p.parm1 = tc; + // Store the total number of chunks to prevent integer overflow during + // bounds calculations in the get next chunk routine. + pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 
1 : 0); + KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d " + "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", + gtid)); + break; + case kmp_sch_trapezoidal: { + /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ + + T parm1, parm2, parm3, parm4; + KD_TRACE(100, + ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n", + gtid)); + + parm1 = chunk; + + /* F : size of the first cycle */ + parm2 = (tc / (2 * nproc)); + + if (parm2 < 1) { + parm2 = 1; + } + + /* L : size of the last cycle. Make sure the last cycle is not larger + than the first cycle. */ + if (parm1 < 1) { + parm1 = 1; + } else if (parm1 > parm2) { + parm1 = parm2; + } + + /* N : number of cycles */ + parm3 = (parm2 + parm1); + parm3 = (2 * tc + parm3 - 1) / parm3; + + if (parm3 < 2) { + parm3 = 2; + } + + /* sigma : decreasing incr of the trapezoid */ + parm4 = (parm3 - 1); + parm4 = (parm2 - parm1) / parm4; + + // pointless check, because parm4 >= 0 always + // if ( parm4 < 0 ) { + // parm4 = 0; + //} + + pr->u.p.parm1 = parm1; + pr->u.p.parm2 = parm2; + pr->u.p.parm3 = parm3; + pr->u.p.parm4 = parm4; + } // case + break; + + default: { + __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message + KMP_HNT(GetNewerLibrary), // Hint + __kmp_msg_null // Variadic argument list terminator + ); + } break; + } // switch + pr->schedule = schedule; +} + +#if KMP_USE_HIER_SCHED +template +inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub, + typename traits_t::signed_t st); +template <> +inline void +__kmp_dispatch_init_hier_runtime(ident_t *loc, kmp_int32 lb, + kmp_int32 ub, kmp_int32 st) { + __kmp_dispatch_init_hierarchy( + loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, + __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); +} +template <> +inline void +__kmp_dispatch_init_hier_runtime(ident_t *loc, kmp_uint32 lb, + kmp_uint32 ub, kmp_int32 st) { + __kmp_dispatch_init_hierarchy( + loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, + __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st); +} +template <> +inline void +__kmp_dispatch_init_hier_runtime(ident_t *loc, kmp_int64 lb, + kmp_int64 ub, kmp_int64 st) { + __kmp_dispatch_init_hierarchy( + loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, + __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); +} +template <> +inline void +__kmp_dispatch_init_hier_runtime(ident_t *loc, kmp_uint64 lb, + kmp_uint64 ub, kmp_int64 st) { + __kmp_dispatch_init_hierarchy( + loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers, + __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st); +} + +// free all the hierarchy scheduling memory associated with the team +void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { + int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; + for (int i = 0; i < num_disp_buff; ++i) { + // type does not matter here so use kmp_int32 + auto sh = + reinterpret_cast volatile *>( + &team->t.t_disp_buffer[i]); + if (sh->hier) { + sh->hier->deallocate(); + __kmp_free(sh->hier); + } + } +} +#endif + +// UT - unsigned flavor of T, ST - signed flavor of T, +// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8 +template +static void +__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, + T ub, typename traits_t::signed_t st, + typename traits_t::signed_t chunk, int push_ws) { + typedef typename traits_t::unsigned_t UT; + + int active; + kmp_info_t *th; + kmp_team_t *team; + kmp_uint32 my_buffer_index; + dispatch_private_info_template *pr; + dispatch_shared_info_template volatile *sh; + + KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template) == + sizeof(dispatch_private_info)); + KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template) == + sizeof(dispatch_shared_info)); + __kmp_assert_valid_gtid(gtid); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + __kmp_resume_if_soft_paused(); + +#if INCLUDE_SSC_MARKS + SSC_MARK_DISPATCH_INIT(); +#endif +#ifdef KMP_DEBUG + typedef typename traits_t::signed_t ST; + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d " + "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n", + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st)); + __kmp_str_free(&buff); + } +#endif + /* setup data */ + th = __kmp_threads[gtid]; + team = th->th.th_team; + active = !team->t.t_serialized; + th->th.th_ident = loc; + + // Any half-decent optimizer will remove this test when the blocks are empty + // since the macros expand to nothing + // when statistics are disabled. + if (schedule == __kmp_static) { + KMP_COUNT_BLOCK(OMP_LOOP_STATIC); + } else { + KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC); + } + +#if KMP_USE_HIER_SCHED + // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable + // Hierarchical scheduling does not work with ordered, so if ordered is + // detected, then revert back to threaded scheduling. + bool ordered; + enum sched_type my_sched = schedule; + my_buffer_index = th->th.th_dispatch->th_disp_index; + pr = reinterpret_cast *>( + &th->th.th_dispatch + ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); + my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched); + if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper)) + my_sched = + (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower)); + ordered = (kmp_ord_lower & my_sched); + if (pr->flags.use_hier) { + if (ordered) { + KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
" + "Disabling hierarchical scheduling.\n", + gtid)); + pr->flags.use_hier = FALSE; + } + } + if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) { + // Don't use hierarchical for ordered parallel loops and don't + // use the runtime hierarchy if one was specified in the program + if (!ordered && !pr->flags.use_hier) + __kmp_dispatch_init_hier_runtime(loc, lb, ub, st); + } +#endif // KMP_USE_HIER_SCHED + +#if USE_ITT_BUILD + kmp_uint64 cur_chunk = chunk; + int itt_need_metadata_reporting = + __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && + KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && + team->t.t_active_level == 1; +#endif + if (!active) { + pr = reinterpret_cast *>( + th->th.th_dispatch->th_disp_buffer); /* top of the stack */ + } else { + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + my_buffer_index = th->th.th_dispatch->th_disp_index++; + + /* What happens when number of threads changes, need to resize buffer? */ + pr = reinterpret_cast *>( + &th->th.th_dispatch + ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); + sh = reinterpret_cast volatile *>( + &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); + KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, + my_buffer_index)); + if (sh->buffer_index != my_buffer_index) { // too many loops in progress? + KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d" + " sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + __kmp_wait(&sh->buffer_index, my_buffer_index, + __kmp_eq USE_ITT_BUILD_ARG(NULL)); + // Note: KMP_WAIT() cannot be used there: buffer index and + // my_buffer_index are *always* 32-bit integers. + KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " + "sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + } + } + + __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, +#if USE_ITT_BUILD + &cur_chunk, +#endif + chunk, (T)th->th.th_team_nproc, + (T)th->th.th_info.ds.ds_tid); + if (active) { + if (pr->flags.ordered == 0) { + th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; + th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; + } else { + th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo; + th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo; + } + th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; + th->th.th_dispatch->th_dispatch_sh_current = + CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); +#if USE_ITT_BUILD + if (pr->flags.ordered) { + __kmp_itt_ordered_init(gtid); + } + // Report loop metadata + if (itt_need_metadata_reporting) { + // Only report metadata by primary thread of active team at level 1 + kmp_uint64 schedtype = 0; + switch (schedule) { + case kmp_sch_static_chunked: + case kmp_sch_static_balanced: // Chunk is calculated in the switch above + break; + case kmp_sch_static_greedy: + cur_chunk = pr->u.p.parm1; + break; + case kmp_sch_dynamic_chunked: + schedtype = 1; + break; + case kmp_sch_guided_iterative_chunked: + case kmp_sch_guided_analytical_chunked: + case kmp_sch_guided_simd: + schedtype = 2; + break; + default: + // Should we put this case under "static"? 
+ // case kmp_sch_static_steal: + schedtype = 3; + break; + } + __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk); + } +#if KMP_USE_HIER_SCHED + if (pr->flags.use_hier) { + pr->u.p.count = 0; + pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0; + } +#endif // KMP_USER_HIER_SCHED +#endif /* USE_ITT_BUILD */ + } + +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s " + "lb:%%%s ub:%%%s" + " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" + " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n", + traits_t::spec, traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb, + pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count, + pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1, + pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4)); + __kmp_str_free(&buff); + } +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid)); + } +#endif + KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic); +} + +/* For ordered loops, either __kmp_dispatch_finish() should be called after + * every iteration, or __kmp_dispatch_finish_chunk() should be called after + * every chunk of iterations. If the ordered section(s) were not executed + * for this iteration (or every iteration in this chunk), we need to set the + * ordered iteration counters so that the next thread can proceed. */ +template +static void __kmp_dispatch_finish(int gtid, ident_t *loc) { + typedef typename traits_t::signed_t ST; + __kmp_assert_valid_gtid(gtid); + kmp_info_t *th = __kmp_threads[gtid]; + + KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid)); + if (!th->th.th_team->t.t_serialized) { + + dispatch_private_info_template *pr = + reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_pr_current); + dispatch_shared_info_template volatile *sh = + reinterpret_cast volatile *>( + th->th.th_dispatch->th_dispatch_sh_current); + KMP_DEBUG_ASSERT(pr); + KMP_DEBUG_ASSERT(sh); + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + if (pr->ordered_bumped) { + KD_TRACE( + 1000, + ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", + gtid)); + pr->ordered_bumped = 0; + } else { + UT lower = pr->u.p.ordered_lower; + +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: " + "ordered_iteration:%%%s lower:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); + __kmp_str_free(&buff); + } +#endif + + __kmp_wait(&sh->u.s.ordered_iteration, lower, + __kmp_ge USE_ITT_BUILD_ARG(NULL)); + KMP_MB(); /* is this necessary? 
*/ +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: " + "ordered_iteration:%%%s lower:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); + __kmp_str_free(&buff); + } +#endif + + test_then_inc((volatile ST *)&sh->u.s.ordered_iteration); + } // if + } // if + KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid)); +} + +#ifdef KMP_GOMP_COMPAT + +template +static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) { + typedef typename traits_t::signed_t ST; + __kmp_assert_valid_gtid(gtid); + kmp_info_t *th = __kmp_threads[gtid]; + + KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid)); + if (!th->th.th_team->t.t_serialized) { + dispatch_private_info_template *pr = + reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_pr_current); + dispatch_shared_info_template volatile *sh = + reinterpret_cast volatile *>( + th->th.th_dispatch->th_dispatch_sh_current); + KMP_DEBUG_ASSERT(pr); + KMP_DEBUG_ASSERT(sh); + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + UT lower = pr->u.p.ordered_lower; + UT upper = pr->u.p.ordered_upper; + UT inc = upper - lower + 1; + + if (pr->ordered_bumped == inc) { + KD_TRACE( + 1000, + ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n", + gtid)); + pr->ordered_bumped = 0; + } else { + inc -= pr->ordered_bumped; + +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_finish_chunk: T#%%d before wait: " + "ordered_iteration:%%%s lower:%%%s upper:%%%s\n", + traits_t::spec, traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper)); + __kmp_str_free(&buff); + } +#endif + + __kmp_wait(&sh->u.s.ordered_iteration, lower, + __kmp_ge USE_ITT_BUILD_ARG(NULL)); + + KMP_MB(); /* is this necessary? */ + KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting " + "ordered_bumped to zero\n", + gtid)); + pr->ordered_bumped = 0; +//!!!!! TODO check if the inc should be unsigned, or signed??? 
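+      // At this point inc holds the number of iterations in this chunk that
+      // did not pass through the ordered section themselves; the shared
+      // ordered_iteration counter is advanced by that amount below so the
+      // next thread can proceed.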
+#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_finish_chunk: T#%%d after wait: " + "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n", + traits_t::spec, traits_t::spec, traits_t::spec, + traits_t::spec); + KD_TRACE(1000, + (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper)); + __kmp_str_free(&buff); + } +#endif + + test_then_add((volatile ST *)&sh->u.s.ordered_iteration, inc); + } + // } + } + KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid)); +} + +#endif /* KMP_GOMP_COMPAT */ + +template +int __kmp_dispatch_next_algorithm(int gtid, + dispatch_private_info_template *pr, + dispatch_shared_info_template volatile *sh, + kmp_int32 *p_last, T *p_lb, T *p_ub, + typename traits_t::signed_t *p_st, T nproc, + T tid) { + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + typedef typename traits_t::floating_t DBL; + int status = 0; + bool last = false; + T start; + ST incr; + UT limit, trip, init; + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + KMP_DEBUG_ASSERT(pr); + KMP_DEBUG_ASSERT(sh); + KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = + __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " + "sh:%%p nproc:%%%s tid:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid)); + __kmp_str_free(&buff); + } +#endif + + // zero trip count + if (pr->u.p.tc == 0) { + KD_TRACE(10, + ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " + "zero status:%d\n", + gtid, status)); + return 0; + } + + switch (pr->schedule) { +#if KMP_STATIC_STEAL_ENABLED + case kmp_sch_static_steal: { + T chunk = pr->u.p.parm1; + UT nchunks = pr->u.p.parm2; + KD_TRACE(100, + ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", + gtid)); + + trip = pr->u.p.tc - 1; + + if (traits_t::type_size > 4) { + // use lock for 8-byte induction variable. + // TODO (optional): check presence and use 16-byte CAS + kmp_lock_t *lck = pr->u.p.steal_lock; + KMP_DEBUG_ASSERT(lck != NULL); + if (pr->u.p.count < (UT)pr->u.p.ub) { + KMP_DEBUG_ASSERT(pr->steal_flag == READY); + __kmp_acquire_lock(lck, gtid); + // try to get own chunk of iterations + init = (pr->u.p.count)++; + status = (init < (UT)pr->u.p.ub); + __kmp_release_lock(lck, gtid); + } else { + status = 0; // no own chunks + } + if (!status) { // try to steal + kmp_lock_t *lckv; // victim buffer's lock + T while_limit = pr->u.p.parm3; + T while_index = 0; + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop + KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive + while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template *v; + T remaining; + T victimId = pr->u.p.parm4; + T oldVictimId = victimId ? 
victimId - 1 : nproc - 1; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && + oldVictimId != victimId) { + victimId = (victimId + 1) % nproc; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + } + if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { + continue; // try once more (nproc attempts in total) + } + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { + kmp_uint32 old = UNUSED; + // try to steal whole range from inactive victim + status = v->steal_flag.compare_exchange_strong(old, THIEF); + if (status) { + // initialize self buffer with victim's whole range of chunks + T id = victimId; + T small_chunk = 0, extras = 0, p_extra = 0; + __kmp_initialize_self_buffer(team, id, pr, nchunks, nproc, + init, small_chunk, extras, + p_extra); + __kmp_acquire_lock(lck, gtid); + pr->u.p.count = init + 1; // exclude one we execute immediately + pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0); + __kmp_release_lock(lck, gtid); + pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid + // no need to reinitialize other thread invariants: lb, st, etc. +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " + "stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); + __kmp_str_free(&buff); + } +#endif + // activate non-empty buffer and let others steal from us + if (pr->u.p.count < (UT)pr->u.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + break; + } + } + if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY || + v->u.p.count >= (UT)v->u.p.ub) { + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid + continue; // no chunks to steal, try next victim + } + lckv = v->u.p.steal_lock; + KMP_ASSERT(lckv != NULL); + __kmp_acquire_lock(lckv, gtid); + limit = v->u.p.ub; // keep initial ub + if (v->u.p.count >= limit) { + __kmp_release_lock(lckv, gtid); + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid + continue; // no chunks to steal, try next victim + } + + // stealing succeded, reduce victim's ub by 1/4 of undone chunks + // TODO: is this heuristics good enough?? 
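The TODO above asks whether the steal fraction is a good heuristic; what the surrounding code does is shave a quarter of the victim's remaining chunks off the tail of its range when more than seven remain, and a single chunk otherwise. A simplified sketch of just that arithmetic (locking and the victim search are omitted); the names range and steal_from are illustrative and do not exist in the runtime:

#include <cstdio>

struct range { unsigned long count, ub; };  // victim's chunk cursor and (exclusive) bound

static unsigned long steal_from(range &victim) {
  unsigned long remaining = victim.ub - victim.count;
  if (remaining == 0) return 0;                  // nothing left to steal
  unsigned long stolen = remaining > 7 ? remaining >> 2 : 1;
  victim.ub -= stolen;                           // thief takes the tail of the range
  return stolen;
}

int main() {
  range v{10, 110};                              // 100 chunks still undone
  unsigned long stolen = steal_from(v);
  std::printf("stole %lu, victim keeps [%lu, %lu)\n", stolen, v.count, v.ub);
  // prints: stole 25, victim keeps [10, 85)
}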
+ remaining = limit - v->u.p.count; + if (remaining > 7) { + // steal 1/4 of remaining + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); + init = (v->u.p.ub -= (remaining >> 2)); + } else { + // steal 1 chunk of 1..7 remaining + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); + init = (v->u.p.ub -= 1); + } + __kmp_release_lock(lckv, gtid); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, victimId, init, limit)); + __kmp_str_free(&buff); + } +#endif + KMP_DEBUG_ASSERT(init + 1 <= limit); + pr->u.p.parm4 = victimId; // remember victim to steal from + status = 1; + // now update own count and ub with stolen range excluding init chunk + __kmp_acquire_lock(lck, gtid); + pr->u.p.count = init + 1; + pr->u.p.ub = limit; + __kmp_release_lock(lck, gtid); + // activate non-empty buffer and let others steal from us + if (init + 1 < limit) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + } // while (search for victim) + } // if (try to find victim and steal) + } else { + // 4-byte induction variable, use 8-byte CAS for pair (count, ub) + // as all operations on pair (count, ub) must be done atomically + typedef union { + struct { + UT count; + T ub; + } p; + kmp_int64 b; + } union_i4; + union_i4 vold, vnew; + if (pr->u.p.count < (UT)pr->u.p.ub) { + KMP_DEBUG_ASSERT(pr->steal_flag == READY); + vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); + vnew.b = vold.b; + vnew.p.count++; // get chunk from head of self range + while (!KMP_COMPARE_AND_STORE_REL64( + (volatile kmp_int64 *)&pr->u.p.count, + *VOLATILE_CAST(kmp_int64 *) & vold.b, + *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { + KMP_CPU_PAUSE(); + vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); + vnew.b = vold.b; + vnew.p.count++; + } + init = vold.p.count; + status = (init < (UT)vold.p.ub); + } else { + status = 0; // no own chunks + } + if (!status) { // try to steal + T while_limit = pr->u.p.parm3; + T while_index = 0; + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // note: victim thread can potentially execute another loop + KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive + while ((!status) && (while_limit != ++while_index)) { + dispatch_private_info_template *v; + T remaining; + T victimId = pr->u.p.parm4; + T oldVictimId = victimId ? 
victimId - 1 : nproc - 1; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && + oldVictimId != victimId) { + victimId = (victimId + 1) % nproc; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + } + if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { + continue; // try once more (nproc attempts in total) + } + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { + kmp_uint32 old = UNUSED; + // try to steal whole range from inactive victim + status = v->steal_flag.compare_exchange_strong(old, THIEF); + if (status) { + // initialize self buffer with victim's whole range of chunks + T id = victimId; + T small_chunk = 0, extras = 0, p_extra = 0; + __kmp_initialize_self_buffer(team, id, pr, nchunks, nproc, + init, small_chunk, extras, + p_extra); + vnew.p.count = init + 1; + vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0); + // write pair (count, ub) at once atomically +#if KMP_ARCH_X86 + KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b); +#else + *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b; +#endif + pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid + // no need to initialize other thread invariants: lb, st, etc. +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " + "stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); + __kmp_str_free(&buff); + } +#endif + // activate non-empty buffer and let others steal from us + if (pr->u.p.count < (UT)pr->u.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + break; + } + } + while (1) { // CAS loop with check if victim still has enough chunks + // many threads may be stealing concurrently from same victim + vold.b = *(volatile kmp_int64 *)(&v->u.p.count); + if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY || + vold.p.count >= (UT)vold.p.ub) { + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id + break; // no chunks to steal, try next victim + } + vnew.b = vold.b; + remaining = vold.p.ub - vold.p.count; + // try to steal 1/4 of remaining + // TODO: is this heuristics good enough?? 
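For the 4-byte induction variable the code above avoids a per-steal lock by packing the pair (count, ub) into one 64-bit word (the union_i4 type) and updating both fields with a single compare-and-swap. A standalone approximation of that packing, using std::atomic<uint64_t> instead of KMP_COMPARE_AND_STORE_REL64; the type punning through the union mirrors what the runtime itself does and relies on the same compiler tolerance, and the names are illustrative:

#include <atomic>
#include <cstdint>
#include <cstdio>

union packed_range {
  struct { std::uint32_t count, ub; } p;  // chunk cursor and upper bound
  std::uint64_t b;                        // same bits, updatable with one CAS
};

static std::atomic<std::uint64_t> shared_range;

// thief side: atomically shave `steal` chunks off the tail, if still available
static bool steal_tail(std::uint32_t steal) {
  packed_range oldv, newv;
  oldv.b = shared_range.load(std::memory_order_acquire);
  do {
    if (oldv.p.ub - oldv.p.count < steal) return false;  // too few chunks left
    newv = oldv;
    newv.p.ub -= steal;
  } while (!shared_range.compare_exchange_weak(oldv.b, newv.b,
                                               std::memory_order_acq_rel));
  return true;
}

int main() {
  packed_range r;
  r.p.count = 0;
  r.p.ub = 100;
  shared_range.store(r.b);
  std::printf("steal 25 chunks: %s\n", steal_tail(25) ? "ok" : "failed");
  r.b = shared_range.load();
  std::printf("range is now [%u, %u)\n", r.p.count, r.p.ub);
}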
+ if (remaining > 7) { + vnew.p.ub -= remaining >> 2; // steal from tail of victim's range + } else { + vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining + } + KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip); + if (KMP_COMPARE_AND_STORE_REL64( + (volatile kmp_int64 *)&v->u.p.count, + *VOLATILE_CAST(kmp_int64 *) & vold.b, + *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { + // stealing succedded +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub)); + __kmp_str_free(&buff); + } +#endif + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, + vold.p.ub - vnew.p.ub); + status = 1; + pr->u.p.parm4 = victimId; // keep victim id + // now update own count and ub + init = vnew.p.ub; + vold.p.count = init + 1; +#if KMP_ARCH_X86 + KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b); +#else + *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; +#endif + // activate non-empty buffer and let others steal from us + if (vold.p.count < (UT)vold.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + break; + } // if (check CAS result) + KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt + } // while (try to steal from particular victim) + } // while (search for victim) + } // if (try to find victim and steal) + } // if (4-byte induction variable) + if (!status) { + *p_lb = 0; + *p_ub = 0; + if (p_st != NULL) + *p_st = 0; + } else { + start = pr->u.p.lb; + init *= chunk; + limit = chunk + init - 1; + incr = pr->u.p.st; + KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); + + KMP_DEBUG_ASSERT(init <= trip); + // keep track of done chunks for possible early exit from stealing + // TODO: count executed chunks locally with rare update of shared location + // test_then_inc((volatile ST *)&sh->u.s.iteration); + if ((last = (limit >= trip)) != 0) + limit = trip; + if (p_st != NULL) + *p_st = incr; + + if (incr == 1) { + *p_lb = start + init; + *p_ub = start + limit; + } else { + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + } + } // if + break; + } // case +#endif // KMP_STATIC_STEAL_ENABLED + case kmp_sch_static_balanced: { + KD_TRACE( + 10, + ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n", + gtid)); + /* check if thread has any iteration to do */ + if ((status = !pr->u.p.count) != 0) { + pr->u.p.count = 1; + *p_lb = pr->u.p.lb; + *p_ub = pr->u.p.ub; + last = (pr->u.p.parm1 != 0); + if (p_st != NULL) + *p_st = pr->u.p.st; + } else { /* no iterations to do */ + pr->u.p.lb = pr->u.p.ub + pr->u.p.st; + } + } // case + break; + case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was + merged here */ + case kmp_sch_static_chunked: { + T parm1; + + KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " + "kmp_sch_static_[affinity|chunked] case\n", + gtid)); + parm1 = pr->u.p.parm1; + + trip = pr->u.p.tc - 1; + init = parm1 * (pr->u.p.count + tid); + + if ((status = (init <= trip)) != 0) { + start = pr->u.p.lb; + incr = pr->u.p.st; + limit = parm1 + init - 1; + + if ((last = (limit >= trip)) != 0) + limit = trip; + + if (p_st != NULL) + *p_st = incr; + + pr->u.p.count += nproc; + + if (incr == 1) { + *p_lb = start + init; + *p_ub = start + limit; + } else { + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + } + + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; + } 
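The static_chunked case above and the dynamic_chunked case below turn a chunk number into loop bounds with the same arithmetic: the chunk index is scaled by the chunk size to get first and last iteration indices, the final chunk is clipped to the trip count, and the indices are then scaled by the stride into user-visible lb/ub. A small sketch of that mapping; chunk_bounds and its parameter names are illustrative, not runtime symbols:

#include <cstdio>

struct bounds { long lb, ub; };

static bounds chunk_bounds(unsigned long chunk_number, unsigned long chunk_size,
                           unsigned long trip,  // trip count minus one (last index)
                           long start, long incr) {
  unsigned long init = chunk_size * chunk_number;  // first iteration index of the chunk
  unsigned long limit = init + chunk_size - 1;     // last iteration index of the chunk
  if (limit > trip) limit = trip;                  // clip the final, possibly short chunk
  return {start + (long)init * incr, start + (long)limit * incr};
}

int main() {
  // for (i = 7; i <= 27; i += 2) has 11 iterations (indices 0..10); with a
  // chunk size of 4, chunk 2 covers indices 8..10, i.e. i = 23, 25, 27
  bounds b = chunk_bounds(2, 4, 10, 7, 2);
  std::printf("chunk 2: lb=%ld ub=%ld\n", b.lb, b.ub);  // lb=23 ub=27
}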
// if + } // if + } // case + break; + + case kmp_sch_dynamic_chunked: { + UT chunk_number; + UT chunk_size = pr->u.p.parm1; + UT nchunks = pr->u.p.parm2; + + KD_TRACE( + 100, + ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n", + gtid)); + + chunk_number = test_then_inc_acq((volatile ST *)&sh->u.s.iteration); + status = (chunk_number < nchunks); + if (!status) { + *p_lb = 0; + *p_ub = 0; + if (p_st != NULL) + *p_st = 0; + } else { + init = chunk_size * chunk_number; + trip = pr->u.p.tc - 1; + start = pr->u.p.lb; + incr = pr->u.p.st; + + if ((last = (trip - init < (UT)chunk_size))) + limit = trip; + else + limit = chunk_size + init - 1; + + if (p_st != NULL) + *p_st = incr; + + if (incr == 1) { + *p_lb = start + init; + *p_ub = start + limit; + } else { + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + } + + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; + } // if + } // if + } // case + break; + + case kmp_sch_guided_iterative_chunked: { + T chunkspec = pr->u.p.parm1; + KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " + "iterative case\n", + gtid)); + trip = pr->u.p.tc; + // Start atomic part of calculations + while (1) { + ST remaining; // signed, because can be < 0 + init = sh->u.s.iteration; // shared value + remaining = trip - init; + if (remaining <= 0) { // AC: need to compare with 0 first + // nothing to do, don't try atomic op + status = 0; + break; + } + if ((T)remaining < + pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default + // use dynamic-style schedule + // atomically increment iterations, get old value + init = test_then_add(RCAST(volatile ST *, &sh->u.s.iteration), + (ST)chunkspec); + remaining = trip - init; + if (remaining <= 0) { + status = 0; // all iterations got by other threads + } else { + // got some iterations to work on + status = 1; + if ((T)remaining > chunkspec) { + limit = init + chunkspec - 1; + } else { + last = true; // the last chunk + limit = init + remaining - 1; + } // if + } // if + break; + } // if + limit = init + (UT)((double)remaining * + *(double *)&pr->u.p.parm3); // divide by K*nproc + if (compare_and_swap(RCAST(volatile ST *, &sh->u.s.iteration), + (ST)init, (ST)limit)) { + // CAS was successful, chunk obtained + status = 1; + --limit; + break; + } // if + } // while + if (status != 0) { + start = pr->u.p.lb; + incr = pr->u.p.st; + if (p_st != NULL) + *p_st = incr; + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; + } // if + } else { + *p_lb = 0; + *p_ub = 0; + if (p_st != NULL) + *p_st = 0; + } // if + } // case + break; + + case kmp_sch_guided_simd: { + // same as iterative but curr-chunk adjusted to be multiple of given + // chunk + T chunk = pr->u.p.parm1; + KD_TRACE(100, + ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n", + gtid)); + trip = pr->u.p.tc; + // Start atomic part of calculations + while (1) { + ST remaining; // signed, because can be < 0 + init = sh->u.s.iteration; // shared value + remaining = trip - init; + if (remaining <= 0) { // AC: need to compare with 0 first + status = 0; // nothing to do, don't try atomic op + break; + } + KMP_DEBUG_ASSERT(chunk && init % chunk == 0); + // compare with K*nproc*(chunk+1), K=2 by default + if ((T)remaining < pr->u.p.parm2) { + // use dynamic-style schedule + // atomically increment iterations, get old value + init = test_then_add(RCAST(volatile 
ST *, &sh->u.s.iteration), + (ST)chunk); + remaining = trip - init; + if (remaining <= 0) { + status = 0; // all iterations got by other threads + } else { + // got some iterations to work on + status = 1; + if ((T)remaining > chunk) { + limit = init + chunk - 1; + } else { + last = true; // the last chunk + limit = init + remaining - 1; + } // if + } // if + break; + } // if + // divide by K*nproc + UT span; + __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), + &span); + UT rem = span % chunk; + if (rem) // adjust so that span%chunk == 0 + span += chunk - rem; + limit = init + span; + if (compare_and_swap(RCAST(volatile ST *, &sh->u.s.iteration), + (ST)init, (ST)limit)) { + // CAS was successful, chunk obtained + status = 1; + --limit; + break; + } // if + } // while + if (status != 0) { + start = pr->u.p.lb; + incr = pr->u.p.st; + if (p_st != NULL) + *p_st = incr; + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; + } // if + } else { + *p_lb = 0; + *p_ub = 0; + if (p_st != NULL) + *p_st = 0; + } // if + } // case + break; + + case kmp_sch_guided_analytical_chunked: { + T chunkspec = pr->u.p.parm1; + UT chunkIdx; +#if KMP_USE_X87CONTROL + /* for storing original FPCW value for Windows* OS on + IA-32 architecture 8-byte version */ + unsigned int oldFpcw; + unsigned int fpcwSet = 0; +#endif + KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d " + "kmp_sch_guided_analytical_chunked case\n", + gtid)); + + trip = pr->u.p.tc; + + KMP_DEBUG_ASSERT(nproc > 1); + KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip); + + while (1) { /* this while loop is a safeguard against unexpected zero + chunk sizes */ + chunkIdx = test_then_inc_acq((volatile ST *)&sh->u.s.iteration); + if (chunkIdx >= (UT)pr->u.p.parm2) { + --trip; + /* use dynamic-style scheduling */ + init = chunkIdx * chunkspec + pr->u.p.count; + /* need to verify init > 0 in case of overflow in the above + * calculation */ + if ((status = (init > 0 && init <= trip)) != 0) { + limit = init + chunkspec - 1; + + if ((last = (limit >= trip)) != 0) + limit = trip; + } + break; + } else { +/* use exponential-style scheduling */ +/* The following check is to workaround the lack of long double precision on + Windows* OS. + This check works around the possible effect that init != 0 for chunkIdx == 0. 
+ */ +#if KMP_USE_X87CONTROL + /* If we haven't already done so, save original + FPCW and set precision to 64-bit, as Windows* OS + on IA-32 architecture defaults to 53-bit */ + if (!fpcwSet) { + oldFpcw = _control87(0, 0); + _control87(_PC_64, _MCW_PC); + fpcwSet = 0x30000; + } +#endif + if (chunkIdx) { + init = __kmp_dispatch_guided_remaining( + trip, *(DBL *)&pr->u.p.parm3, chunkIdx); + KMP_DEBUG_ASSERT(init); + init = trip - init; + } else + init = 0; + limit = trip - __kmp_dispatch_guided_remaining( + trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); + KMP_ASSERT(init <= limit); + if (init < limit) { + KMP_DEBUG_ASSERT(limit <= trip); + --limit; + status = 1; + break; + } // if + } // if + } // while (1) +#if KMP_USE_X87CONTROL + /* restore FPCW if necessary + AC: check fpcwSet flag first because oldFpcw can be uninitialized here + */ + if (fpcwSet && (oldFpcw & fpcwSet)) + _control87(oldFpcw, _MCW_PC); +#endif + if (status != 0) { + start = pr->u.p.lb; + incr = pr->u.p.st; + if (p_st != NULL) + *p_st = incr; + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; + } + } else { + *p_lb = 0; + *p_ub = 0; + if (p_st != NULL) + *p_st = 0; + } + } // case + break; + + case kmp_sch_trapezoidal: { + UT index; + T parm2 = pr->u.p.parm2; + T parm3 = pr->u.p.parm3; + T parm4 = pr->u.p.parm4; + KD_TRACE(100, + ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n", + gtid)); + + index = test_then_inc((volatile ST *)&sh->u.s.iteration); + + init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; + trip = pr->u.p.tc - 1; + + if ((status = ((T)index < parm3 && init <= trip)) == 0) { + *p_lb = 0; + *p_ub = 0; + if (p_st != NULL) + *p_st = 0; + } else { + start = pr->u.p.lb; + limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; + incr = pr->u.p.st; + + if ((last = (limit >= trip)) != 0) + limit = trip; + + if (p_st != NULL) + *p_st = incr; + + if (incr == 1) { + *p_lb = start + init; + *p_ub = start + limit; + } else { + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + } + + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; + } // if + } // if + } // case + break; + default: { + status = 0; // to avoid complaints on uninitialized variable use + __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message + KMP_HNT(GetNewerLibrary), // Hint + __kmp_msg_null // Variadic argument list terminator + ); + } break; + } // switch + if (p_last) + *p_last = last; +#ifdef KMP_DEBUG + if (pr->flags.ordered) { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " + "ordered_lower:%%%s ordered_upper:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper)); + __kmp_str_free(&buff); + } + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " + "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", + traits_t::spec, traits_t::spec, traits_t::spec); + KMP_DEBUG_ASSERT(p_last); + KMP_DEBUG_ASSERT(p_st); + KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st)); + __kmp_str_free(&buff); + } +#endif + return status; +} + +/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more + work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() + is not called. 
*/ +#if OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_LOOP_END \ + if (status == 0) { \ + if (ompt_enabled.ompt_callback_work) { \ + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ + ompt_callbacks.ompt_callback(ompt_callback_work)( \ + ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \ + &(task_info->task_data), 0, codeptr); \ + } \ + } +#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \ + if (ompt_enabled.ompt_callback_dispatch && status) { \ + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \ + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \ + ompt_dispatch_chunk_t chunk; \ + ompt_data_t instance = ompt_data_none; \ + OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \ + instance.ptr = &chunk; \ + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \ + &(team_info->parallel_data), &(task_info->task_data), \ + ompt_dispatch_ws_loop_chunk, instance); \ + } +// TODO: implement count +#else +#define OMPT_LOOP_END // no-op +#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op +#endif + +#if KMP_STATS_ENABLED +#define KMP_STATS_LOOP_END \ + { \ + kmp_int64 u, l, t, i; \ + l = (kmp_int64)(*p_lb); \ + u = (kmp_int64)(*p_ub); \ + i = (kmp_int64)(pr->u.p.st); \ + if (status == 0) { \ + t = 0; \ + KMP_POP_PARTITIONED_TIMER(); \ + } else if (i == 1) { \ + if (u >= l) \ + t = u - l + 1; \ + else \ + t = 0; \ + } else if (i < 0) { \ + if (l >= u) \ + t = (l - u) / (-i) + 1; \ + else \ + t = 0; \ + } else { \ + if (u >= l) \ + t = (u - l) / i + 1; \ + else \ + t = 0; \ + } \ + KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \ + } +#else +#define KMP_STATS_LOOP_END /* Nothing */ +#endif + +template +static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, + T *p_lb, T *p_ub, + typename traits_t::signed_t *p_st +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + void *codeptr +#endif +) { + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + // This is potentially slightly misleading, schedule(runtime) will appear here + // even if the actual runtime schedule is static. (Which points out a + // disadvantage of schedule(runtime): even when static scheduling is used it + // costs more than a compile time choice to use static scheduling would.) 
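The KMP_STATS_LOOP_END macro above counts the iterations in the chunk just handed out, given the returned bounds l, u and stride i. The same case analysis written as a plain function with a few worked checks; chunk_trip_count is an illustrative name, and stride zero is left unhandled just as in the macro:

#include <cassert>
#include <cstdint>

static std::int64_t chunk_trip_count(std::int64_t l, std::int64_t u, std::int64_t i) {
  if (i == 1) return u >= l ? u - l + 1 : 0;           // common unit-stride case
  if (i < 0)  return l >= u ? (l - u) / (-i) + 1 : 0;  // downward-counting loop
  return u >= l ? (u - l) / i + 1 : 0;                 // general upward loop (i > 1)
}

int main() {
  assert(chunk_trip_count(0, 9, 1) == 10);   // 0..9 step 1
  assert(chunk_trip_count(23, 27, 2) == 3);  // 23, 25, 27
  assert(chunk_trip_count(10, 1, -3) == 4);  // 10, 7, 4, 1
  assert(chunk_trip_count(5, 4, 1) == 0);    // empty chunk
  return 0;
}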
+ KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling); + + int status; + dispatch_private_info_template *pr; + __kmp_assert_valid_gtid(gtid); + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + + KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL + KD_TRACE( + 1000, + ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n", + gtid, p_lb, p_ub, p_st, p_last)); + + if (team->t.t_serialized) { + /* NOTE: serialize this dispatch because we are not at the active level */ + pr = reinterpret_cast *>( + th->th.th_dispatch->th_disp_buffer); /* top of the stack */ + KMP_DEBUG_ASSERT(pr); + + if ((status = (pr->u.p.tc != 0)) == 0) { + *p_lb = 0; + *p_ub = 0; + // if ( p_last != NULL ) + // *p_last = 0; + if (p_st != NULL) + *p_st = 0; + if (__kmp_env_consistency_check) { + if (pr->pushed_ws != ct_none) { + pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); + } + } + } else if (pr->flags.nomerge) { + kmp_int32 last; + T start; + UT limit, trip, init; + ST incr; + T chunk = pr->u.p.parm1; + + KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", + gtid)); + + init = chunk * pr->u.p.count++; + trip = pr->u.p.tc - 1; + + if ((status = (init <= trip)) == 0) { + *p_lb = 0; + *p_ub = 0; + // if ( p_last != NULL ) + // *p_last = 0; + if (p_st != NULL) + *p_st = 0; + if (__kmp_env_consistency_check) { + if (pr->pushed_ws != ct_none) { + pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); + } + } + } else { + start = pr->u.p.lb; + limit = chunk + init - 1; + incr = pr->u.p.st; + + if ((last = (limit >= trip)) != 0) { + limit = trip; +#if KMP_OS_WINDOWS + pr->u.p.last_upper = pr->u.p.ub; +#endif /* KMP_OS_WINDOWS */ + } + if (p_last != NULL) + *p_last = last; + if (p_st != NULL) + *p_st = incr; + if (incr == 1) { + *p_lb = start + init; + *p_ub = start + limit; + } else { + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + } + + if (pr->flags.ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " + "ordered_lower:%%%s ordered_upper:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, + pr->u.p.ordered_upper)); + __kmp_str_free(&buff); + } +#endif + } // if + } // if + } else { + pr->u.p.tc = 0; + *p_lb = pr->u.p.lb; + *p_ub = pr->u.p.ub; +#if KMP_OS_WINDOWS + pr->u.p.last_upper = *p_ub; +#endif /* KMP_OS_WINDOWS */ + if (p_last != NULL) + *p_last = TRUE; + if (p_st != NULL) + *p_st = pr->u.p.st; + } // if +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " + "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", + traits_t::spec, traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, + (p_last ? 
*p_last : 0), status)); + __kmp_str_free(&buff); + } +#endif +#if INCLUDE_SSC_MARKS + SSC_MARK_DISPATCH_NEXT(); +#endif + OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status); + OMPT_LOOP_END; + KMP_STATS_LOOP_END; + return status; + } else { + kmp_int32 last = 0; + dispatch_shared_info_template volatile *sh; + + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + pr = reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_pr_current); + KMP_DEBUG_ASSERT(pr); + sh = reinterpret_cast volatile *>( + th->th.th_dispatch->th_dispatch_sh_current); + KMP_DEBUG_ASSERT(sh); + +#if KMP_USE_HIER_SCHED + if (pr->flags.use_hier) + status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); + else +#endif // KMP_USE_HIER_SCHED + status = __kmp_dispatch_next_algorithm(gtid, pr, sh, &last, p_lb, p_ub, + p_st, th->th.th_team_nproc, + th->th.th_info.ds.ds_tid); + // status == 0: no more iterations to execute + if (status == 0) { + ST num_done; + num_done = test_then_inc(&sh->u.s.num_done); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", + traits_t::spec); + KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); + __kmp_str_free(&buff); + } +#endif + +#if KMP_USE_HIER_SCHED + pr->flags.use_hier = FALSE; +#endif + if (num_done == th->th.th_team_nproc - 1) { +#if KMP_STATIC_STEAL_ENABLED + if (pr->schedule == kmp_sch_static_steal) { + int i; + int idx = (th->th.th_dispatch->th_disp_index - 1) % + __kmp_dispatch_num_buffers; // current loop index + // loop complete, safe to destroy locks used for stealing + for (i = 0; i < th->th.th_team_nproc; ++i) { + dispatch_private_info_template *buf = + reinterpret_cast *>( + &team->t.t_dispatch[i].th_disp_buffer[idx]); + KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive + KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED); + if (traits_t::type_size > 4) { + // destroy locks used for stealing + kmp_lock_t *lck = buf->u.p.steal_lock; + KMP_ASSERT(lck != NULL); + __kmp_destroy_lock(lck); + __kmp_free(lck); + buf->u.p.steal_lock = NULL; + } + } + } +#endif + /* NOTE: release shared buffer to be reused */ + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + sh->u.s.num_done = 0; + sh->u.s.iteration = 0; + + /* TODO replace with general release procedure? */ + if (pr->flags.ordered) { + sh->u.s.ordered_iteration = 0; + } + + sh->buffer_index += __kmp_dispatch_num_buffers; + KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", + gtid, sh->buffer_index)); + + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + + } // if + if (__kmp_env_consistency_check) { + if (pr->pushed_ws != ct_none) { + pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); + } + } + + th->th.th_dispatch->th_deo_fcn = NULL; + th->th.th_dispatch->th_dxo_fcn = NULL; + th->th.th_dispatch->th_dispatch_sh_current = NULL; + th->th.th_dispatch->th_dispatch_pr_current = NULL; + } // if (status == 0) +#if KMP_OS_WINDOWS + else if (last) { + pr->u.p.last_upper = pr->u.p.ub; + } +#endif /* KMP_OS_WINDOWS */ + if (p_last != NULL && status != 0) + *p_last = last; + } // if + +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d normal case: " + "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", + traits_t::spec, traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, + (p_last ? *p_last : 0), status)); + __kmp_str_free(&buff); + } +#endif +#if INCLUDE_SSC_MARKS + SSC_MARK_DISPATCH_NEXT(); +#endif + OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status); + OMPT_LOOP_END; + KMP_STATS_LOOP_END; + return status; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number +@return Zero if the parallel region is not active and this thread should execute +all sections, non-zero otherwise. + +Beginning of sections construct. +There are no implicit barriers in the "sections" calls, rather the compiler +should introduce an explicit barrier if it is required. + +This implementation is based on __kmp_dispatch_init, using same constructs for +shared data (we can't have sections nested directly in omp for loop, there +should be a parallel region in between) +*/ +kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) { + + int active; + kmp_info_t *th; + kmp_team_t *team; + kmp_uint32 my_buffer_index; + dispatch_shared_info_template volatile *sh; + + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + /* setup data */ + th = __kmp_threads[gtid]; + team = th->th.th_team; + active = !team->t.t_serialized; + th->th.th_ident = loc; + + KMP_COUNT_BLOCK(OMP_SECTIONS); + KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid)); + + if (active) { + // Setup sections in the same way as dynamic scheduled loops. + // We need one shared data: which section is to execute next. + // (in case parallel is not active, all sections will be executed on the + // same thread) + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + my_buffer_index = th->th.th_dispatch->th_disp_index++; + + // reuse shared data structures from dynamic sched loops: + sh = reinterpret_cast volatile *>( + &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); + KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid, + my_buffer_index)); + + th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; + th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; + + KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d " + "sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + __kmp_wait(&sh->buffer_index, my_buffer_index, + __kmp_eq USE_ITT_BUILD_ARG(NULL)); + // Note: KMP_WAIT() cannot be used there: buffer index and + // my_buffer_index are *always* 32-bit integers. 
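The wait just above is one half of the dispatch-buffer rotation: each workshare takes a monotonically increasing ticket (my_buffer_index), maps it onto one of __kmp_dispatch_num_buffers shared slots, and spins until that slot's buffer_index equals its ticket; when a workshare completes, the last thread adds the buffer count to buffer_index, releasing the slot to the workshare that will reuse it. A single-threaded sketch of that protocol, assuming the usual default of seven buffers and collapsing the "last thread releases" step into one caller; all names are illustrative:

#include <atomic>
#include <cstdio>
#include <thread>

constexpr unsigned kNumBuffers = 7;  // assumed default of __kmp_dispatch_num_buffers

struct shared_buffer {
  std::atomic<unsigned> buffer_index{0};  // ticket currently allowed to use this slot
};

static shared_buffer buffers[kNumBuffers];

static void enter_workshare(unsigned my_buffer_index) {
  shared_buffer &sh = buffers[my_buffer_index % kNumBuffers];
  // earlier users of this slot may still be draining; wait for our turn
  while (sh.buffer_index.load(std::memory_order_acquire) != my_buffer_index)
    std::this_thread::yield();
  std::printf("workshare %u owns slot %u\n", my_buffer_index,
              my_buffer_index % kNumBuffers);
}

static void leave_workshare(unsigned my_buffer_index) {
  shared_buffer &sh = buffers[my_buffer_index % kNumBuffers];
  // hand the slot to the workshare that reuses it kNumBuffers tickets later
  sh.buffer_index.fetch_add(kNumBuffers, std::memory_order_release);
}

int main() {
  for (unsigned i = 0; i < kNumBuffers; ++i)
    buffers[i].buffer_index.store(i);      // slot i first serves ticket i
  for (unsigned ws = 0; ws < 15; ++ws) {   // more workshares than slots
    enter_workshare(ws);
    leave_workshare(ws);
  }
}

Rotating over a small fixed pool this way lets several workshares be in flight without allocating a fresh shared buffer per loop or sections construct.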
+ KMP_MB(); + KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d " + "sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + + th->th.th_dispatch->th_dispatch_pr_current = + nullptr; // sections construct doesn't need private data + th->th.th_dispatch->th_dispatch_sh_current = + CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + KMP_PUSH_PARTITIONED_TIMER(OMP_sections); + + return active; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number +@param numberOfSections number of sections in the 'sections' construct +@return unsigned [from 0 to n) - number (id) of the section to execute next on +this thread. n (or any other number not in range) - nothing to execute on this +thread +*/ + +kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid, + kmp_int32 numberOfSections) { + + KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead); + + kmp_info_t *th = __kmp_threads[gtid]; +#ifdef KMP_DEBUG + kmp_team_t *team = th->th.th_team; +#endif + + KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid, + numberOfSections)); + + // For serialized case we should not call this function: + KMP_DEBUG_ASSERT(!team->t.t_serialized); + + dispatch_shared_info_template volatile *sh; + + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + + KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current)); + sh = reinterpret_cast volatile *>( + th->th.th_dispatch->th_dispatch_sh_current); + KMP_DEBUG_ASSERT(sh); + + kmp_int32 sectionIndex = 0; + bool moreSectionsToExecute = true; + + // Find section to execute: + sectionIndex = test_then_inc((kmp_int32 *)&sh->u.s.iteration); + if (sectionIndex >= numberOfSections) { + moreSectionsToExecute = false; + } + + // status == 0: no more sections to execute; + // OMPTODO: __kmpc_end_sections could be bypassed? + if (!moreSectionsToExecute) { + kmp_int32 num_done; + + num_done = test_then_inc((kmp_int32 *)(&sh->u.s.num_done)); + + if (num_done == th->th.th_team_nproc - 1) { + /* NOTE: release this buffer to be reused */ + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + sh->u.s.num_done = 0; + sh->u.s.iteration = 0; + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + sh->buffer_index += __kmp_dispatch_num_buffers; + KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid, + sh->buffer_index)); + + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + + } // if + + th->th.th_dispatch->th_deo_fcn = NULL; + th->th.th_dispatch->th_dxo_fcn = NULL; + th->th.th_dispatch->th_dispatch_sh_current = NULL; + th->th.th_dispatch->th_dispatch_pr_current = NULL; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dispatch) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + ompt_data_t instance = ompt_data_none; + instance.ptr = OMPT_GET_RETURN_ADDRESS(0); + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(task_info->task_data), + ompt_dispatch_section, instance); + } +#endif + } + + return sectionIndex; +} + +/*! +@ingroup WORK_SHARING +@param loc source location information +@param global_tid global thread number + +End of "sections" construct. +Don't need to wait here: barrier is added separately when needed. +*/ +void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) { + + kmp_info_t *th = __kmp_threads[gtid]; + int active = !th->th.th_team->t.t_serialized; + + KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid)); + + if (!active) { + // In active case call finalization is done in __kmpc_next_section +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_sections, ompt_scope_end, &(team_info->parallel_data), + &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + } + + KMP_POP_PARTITIONED_TIMER(); + KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid)); +} + +template +static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, + kmp_int32 *plastiter, T *plower, T *pupper, + typename traits_t::signed_t incr) { + typedef typename traits_t::unsigned_t UT; + kmp_uint32 team_id; + kmp_uint32 nteams; + UT trip_count; + kmp_team_t *team; + kmp_info_t *th; + + KMP_DEBUG_ASSERT(plastiter && plower && pupper); + KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); +#ifdef KMP_DEBUG + typedef typename traits_t::signed_t ST; + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " + "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr)); + __kmp_str_free(&buff); + } +#endif + + if (__kmp_env_consistency_check) { + if (incr == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, + loc); + } + if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { + // The loop is illegal. 
+ // Some zero-trip loops maintained by compiler, e.g.: + // for(i=10;i<0;++i) // lower >= upper - run-time check + // for(i=0;i>10;--i) // lower <= upper - run-time check + // for(i=0;i>10;++i) // incr > 0 - compile-time check + // for(i=10;i<0;--i) // incr < 0 - compile-time check + // Compiler does not check the following illegal loops: + // for(i=0;i<10;i+=incr) // where incr<0 + // for(i=10;i>0;i-=incr) // where incr<0 + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); + } + } + __kmp_assert_valid_gtid(gtid); + th = __kmp_threads[gtid]; + team = th->th.th_team; + KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct + nteams = th->th.th_teams_size.nteams; + team_id = team->t.t_master_tid; + KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); + + // compute global trip count + if (incr == 1) { + trip_count = *pupper - *plower + 1; + } else if (incr == -1) { + trip_count = *plower - *pupper + 1; + } else if (incr > 0) { + // upper-lower can exceed the limit of signed type + trip_count = (UT)(*pupper - *plower) / incr + 1; + } else { + trip_count = (UT)(*plower - *pupper) / (-incr) + 1; + } + + if (trip_count <= nteams) { + KMP_DEBUG_ASSERT( + __kmp_static == kmp_sch_static_greedy || + __kmp_static == + kmp_sch_static_balanced); // Unknown static scheduling type. + // only some teams get single iteration, others get nothing + if (team_id < trip_count) { + *pupper = *plower = *plower + team_id * incr; + } else { + *plower = *pupper + incr; // zero-trip loop + } + if (plastiter != NULL) + *plastiter = (team_id == trip_count - 1); + } else { + if (__kmp_static == kmp_sch_static_balanced) { + UT chunk = trip_count / nteams; + UT extras = trip_count % nteams; + *plower += + incr * (team_id * chunk + (team_id < extras ? team_id : extras)); + *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); + if (plastiter != NULL) + *plastiter = (team_id == nteams - 1); + } else { + T chunk_inc_count = + (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; + T upper = *pupper; + KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); + // Unknown static scheduling type. + *plower += team_id * chunk_inc_count; + *pupper = *plower + chunk_inc_count - incr; + // Check/correct bounds if needed + if (incr > 0) { + if (*pupper < *plower) + *pupper = traits_t::max_value; + if (plastiter != NULL) + *plastiter = *plower <= upper && *pupper > upper - incr; + if (*pupper > upper) + *pupper = upper; // tracker C73258 + } else { + if (*pupper > *plower) + *pupper = traits_t::min_value; + if (plastiter != NULL) + *plastiter = *plower >= upper && *pupper < upper - incr; + if (*pupper < upper) + *pupper = upper; // tracker C73258 + } + } + } +} + +//----------------------------------------------------------------------------- +// Dispatch routines +// Transfer call to template< type T > +// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, +// T lb, T ub, ST st, ST chunk ) +extern "C" { + +/*! +@ingroup WORK_SHARING +@{ +@param loc Source location +@param gtid Global thread id +@param schedule Schedule type +@param lb Lower bound +@param ub Upper bound +@param st Step (or increment if you prefer) +@param chunk The chunk size to block with + +This function prepares the runtime to start a dynamically scheduled for loop, +saving the loop arguments. +These functions are all identical apart from the types of the arguments. 
+*/ + +void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 lb, + kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} +/*! +See @ref __kmpc_dispatch_init_4 +*/ +void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint32 lb, + kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} + +/*! +See @ref __kmpc_dispatch_init_4 +*/ +void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int64 lb, + kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} + +/*! +See @ref __kmpc_dispatch_init_4 +*/ +void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint64 lb, + kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} + +/*! +See @ref __kmpc_dispatch_init_4 + +Difference from __kmpc_dispatch_init set of functions is these functions +are called for composite distribute parallel for construct. Thus before +regular iterations dispatching we need to calc per-team iteration space. + +These functions are all identical apart from the types of the arguments. +*/ +void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 *p_last, + kmp_int32 lb, kmp_int32 ub, kmp_int32 st, + kmp_int32 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dist_get_bounds(loc, gtid, p_last, &lb, &ub, st); + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} + +void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 *p_last, + kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, + kmp_int32 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dist_get_bounds(loc, gtid, p_last, &lb, &ub, st); + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} + +void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 *p_last, + kmp_int64 lb, kmp_int64 ub, kmp_int64 st, + kmp_int64 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dist_get_bounds(loc, gtid, p_last, &lb, &ub, st); + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} + +void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 *p_last, + kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, + kmp_int64 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_dist_get_bounds(loc, gtid, p_last, &lb, &ub, st); + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, true); +} + +/*! 
+@param loc Source code location +@param gtid Global thread id +@param p_last Pointer to a flag set to one if this is the last chunk or zero +otherwise +@param p_lb Pointer to the lower bound for the next chunk of work +@param p_ub Pointer to the upper bound for the next chunk of work +@param p_st Pointer to the stride for the next chunk of work +@return one if there is work to be done, zero otherwise + +Get the next dynamically allocated chunk of work for this thread. +If there is no more work, then the lb,ub and stride need not be modified. +*/ +int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmp_dispatch_next(loc, gtid, p_last, p_lb, p_ub, p_st +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_LOAD_RETURN_ADDRESS(gtid) +#endif + ); +} + +/*! +See @ref __kmpc_dispatch_next_4 +*/ +int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_uint32 *p_lb, kmp_uint32 *p_ub, + kmp_int32 *p_st) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmp_dispatch_next(loc, gtid, p_last, p_lb, p_ub, p_st +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_LOAD_RETURN_ADDRESS(gtid) +#endif + ); +} + +/*! +See @ref __kmpc_dispatch_next_4 +*/ +int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmp_dispatch_next(loc, gtid, p_last, p_lb, p_ub, p_st +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_LOAD_RETURN_ADDRESS(gtid) +#endif + ); +} + +/*! +See @ref __kmpc_dispatch_next_4 +*/ +int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_uint64 *p_lb, kmp_uint64 *p_ub, + kmp_int64 *p_st) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmp_dispatch_next(loc, gtid, p_last, p_lb, p_ub, p_st +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_LOAD_RETURN_ADDRESS(gtid) +#endif + ); +} + +/*! +@param loc Source code location +@param gtid Global thread id + +Mark the end of a dynamic loop. +*/ +void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { + __kmp_dispatch_finish(gtid, loc); +} + +/*! +See @ref __kmpc_dispatch_fini_4 +*/ +void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { + __kmp_dispatch_finish(gtid, loc); +} + +/*! +See @ref __kmpc_dispatch_fini_4 +*/ +void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { + __kmp_dispatch_finish(gtid, loc); +} + +/*! +See @ref __kmpc_dispatch_fini_4 +*/ +void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { + __kmp_dispatch_finish(gtid, loc); +} +/*! 
@} */ + +//----------------------------------------------------------------------------- +// Non-template routines from kmp_dispatch.cpp used in other sources + +kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { + return value == checker; +} + +kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { + return value != checker; +} + +kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { + return value < checker; +} + +kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { + return value >= checker; +} + +kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { + return value <= checker; +} + +kmp_uint32 +__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, + kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), + void *obj // Higher-level synchronization object, or NULL. +) { + // note: we may not belong to a team at this point + volatile kmp_uint32 *spin = spinner; + kmp_uint32 check = checker; + kmp_uint32 spins; + kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; + kmp_uint32 r; + kmp_uint64 time; + + KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin)); + KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); + // main wait spin loop + while (!f(r = TCR_4(*spin), check)) { + KMP_FSYNC_SPIN_PREPARE(obj); + /* GEH - remove this since it was accidentally introduced when kmp_wait was + split. It causes problems with infinite recursion because of exit lock */ + /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) + __kmp_abort_thread(); */ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); + } + KMP_FSYNC_SPIN_ACQUIRED(obj); + return r; +} + +void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, + kmp_uint32 (*pred)(void *, kmp_uint32), + void *obj // Higher-level synchronization object, or NULL. +) { + // note: we may not belong to a team at this point + void *spin = spinner; + kmp_uint32 check = checker; + kmp_uint32 spins; + kmp_uint32 (*f)(void *, kmp_uint32) = pred; + kmp_uint64 time; + + KMP_FSYNC_SPIN_INIT(obj, spin); + KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); + // main wait spin loop + while (!f(spin, check)) { + KMP_FSYNC_SPIN_PREPARE(obj); + /* if we have waited a bit, or are noversubscribed, yield */ + /* pause is in the following code */ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); + } + KMP_FSYNC_SPIN_ACQUIRED(obj); +} + +} // extern "C" + +#ifdef KMP_GOMP_COMPAT + +void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int32 lb, + kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, + int push_ws) { + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, + push_ws); +} + +void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint32 lb, + kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, + int push_ws) { + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, + push_ws); +} + +void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_int64 lb, + kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, + int push_ws) { + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, + push_ws); +} + +void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, + enum sched_type schedule, kmp_uint64 lb, + kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, + int push_ws) { + __kmp_dispatch_init(loc, gtid, schedule, lb, ub, st, chunk, + push_ws); +} + +void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { + __kmp_dispatch_finish_chunk(gtid, loc); +} + +void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { + 
__kmp_dispatch_finish_chunk(gtid, loc); +} + +void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { + __kmp_dispatch_finish_chunk(gtid, loc); +} + +void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { + __kmp_dispatch_finish_chunk(gtid, loc); +} + +#endif /* KMP_GOMP_COMPAT */ + +/* ------------------------------------------------------------------------ */ diff --git a/third_party/openmp/kmp_dispatch.h b/third_party/openmp/kmp_dispatch.h new file mode 100644 index 000000000..cf19eb526 --- /dev/null +++ b/third_party/openmp/kmp_dispatch.h @@ -0,0 +1,513 @@ +/* + * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_DISPATCH_H +#define KMP_DISPATCH_H + +/* ------------------------------------------------------------------------ */ +/* ------------------------------------------------------------------------ */ + +#include "kmp.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" +#include "kmp_stats.h" +#include "kmp_str.h" +#if KMP_OS_WINDOWS && KMP_ARCH_X86 +#include +#endif + +#if OMPT_SUPPORT +#include "ompt-internal.h" +#include "ompt-specific.h" +#endif + +/* ------------------------------------------------------------------------ */ +/* ------------------------------------------------------------------------ */ +#if KMP_USE_HIER_SCHED +// Forward declarations of some hierarchical scheduling data structures +template struct kmp_hier_t; +template struct kmp_hier_top_unit_t; +#endif // KMP_USE_HIER_SCHED + +template struct dispatch_shared_info_template; +template struct dispatch_private_info_template; + +template +extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, + dispatch_private_info_template *pr, + enum sched_type schedule, T lb, T ub, + typename traits_t::signed_t st, +#if USE_ITT_BUILD + kmp_uint64 *cur_chunk, +#endif + typename traits_t::signed_t chunk, + T nproc, T unit_id); +template +extern int __kmp_dispatch_next_algorithm( + int gtid, dispatch_private_info_template *pr, + dispatch_shared_info_template volatile *sh, kmp_int32 *p_last, T *p_lb, + T *p_ub, typename traits_t::signed_t *p_st, T nproc, T unit_id); + +void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref); +void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref); + +#if KMP_STATIC_STEAL_ENABLED + +// replaces dispatch_private_info{32,64} structures and +// dispatch_private_info{32,64}_t types +template struct dispatch_private_infoXX_template { + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + UT count; // unsigned + T ub; + /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ + T lb; + ST st; // signed + UT tc; // unsigned + kmp_lock_t *steal_lock; // lock used for chunk stealing + + UT ordered_lower; // unsigned + UT ordered_upper; // unsigned + + /* parm[1-4] are used in different ways by different scheduling algorithms */ + + // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on ) + // a) parm3 is properly aligned and + // b) all parm1-4 are in the same cache line. 
+ // Because of parm1-4 are used together, performance seems to be better + // if they are in the same line (not measured though). + struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4 + T parm1; + T parm2; + T parm3; + T parm4; + }; + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + UT pchunks; // total number of chunks for processes with p-core + UT num_procs_with_pcore; // number of threads with p-core + T first_thread_with_ecore; +#endif +#if KMP_OS_WINDOWS + T last_upper; +#endif /* KMP_OS_WINDOWS */ +}; + +#else /* KMP_STATIC_STEAL_ENABLED */ + +// replaces dispatch_private_info{32,64} structures and +// dispatch_private_info{32,64}_t types +template struct dispatch_private_infoXX_template { + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + T lb; + T ub; + ST st; // signed + UT tc; // unsigned + + T parm1; + T parm2; + T parm3; + T parm4; + + UT count; // unsigned + + UT ordered_lower; // unsigned + UT ordered_upper; // unsigned +#if KMP_OS_WINDOWS + T last_upper; +#endif /* KMP_OS_WINDOWS */ +}; +#endif /* KMP_STATIC_STEAL_ENABLED */ + +template struct KMP_ALIGN_CACHE dispatch_private_info_template { + // duplicate alignment here, otherwise size of structure is not correct in our + // compiler + union KMP_ALIGN_CACHE private_info_tmpl { + dispatch_private_infoXX_template p; + dispatch_private_info64_t p64; + } u; + enum sched_type schedule; /* scheduling algorithm */ + kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */ + std::atomic steal_flag; // static_steal only, state of a buffer + kmp_uint32 ordered_bumped; + dispatch_private_info *next; /* stack of buffers for nest of serial regions */ + kmp_uint32 type_size; +#if KMP_USE_HIER_SCHED + kmp_int32 hier_id; + kmp_hier_top_unit_t *hier_parent; + // member functions + kmp_int32 get_hier_id() const { return hier_id; } + kmp_hier_top_unit_t *get_parent() { return hier_parent; } +#endif + enum cons_type pushed_ws; +}; + +// replaces dispatch_shared_info{32,64} structures and +// dispatch_shared_info{32,64}_t types +template struct dispatch_shared_infoXX_template { + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + /* chunk index under dynamic, number of idle threads under static-steal; + iteration index otherwise */ + volatile UT iteration; + volatile ST num_done; + volatile UT ordered_iteration; + // to retain the structure size making ordered_iteration scalar + UT ordered_dummy[KMP_MAX_ORDERED - 3]; +}; + +// replaces dispatch_shared_info structure and dispatch_shared_info_t type +template struct dispatch_shared_info_template { + typedef typename traits_t::unsigned_t UT; + // we need union here to keep the structure size + union shared_info_tmpl { + dispatch_shared_infoXX_template s; + dispatch_shared_info64_t s64; + } u; + volatile kmp_uint32 buffer_index; + volatile kmp_int32 doacross_buf_idx; // teamwise index + kmp_uint32 *doacross_flags; // array of iteration flags (0/1) + kmp_int32 doacross_num_done; // count finished threads +#if KMP_USE_HIER_SCHED + kmp_hier_t *hier; +#endif +#if KMP_USE_HWLOC + // When linking with libhwloc, the ORDERED EPCC test slowsdown on big + // machines (> 48 cores). Performance analysis showed that a cache thrash + // was occurring and this padding helps alleviate the problem. 
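The padding member that follows is a false-sharing workaround: it keeps independently updated dispatch data on separate cache lines so concurrent writers do not thrash a single line. A generic illustration of the same idea using alignas rather than an explicit pad array; the 64-byte line size and the counter workload are assumptions for the sketch (C++17 is assumed for over-aligned allocation in std::vector), not something taken from the runtime:

#include <atomic>
#include <cstddef>
#include <thread>
#include <vector>

constexpr std::size_t kCacheLine = 64;  // assumed line size

// one counter per cache line, so concurrent writers never share a line;
// the runtime achieves the same with KMP_ALIGN_CACHE and the pad above
struct alignas(kCacheLine) padded_counter {
  std::atomic<unsigned long> value{0};
};

int main() {
  std::vector<padded_counter> counters(4);
  std::vector<std::thread> pool;
  for (int t = 0; t < 4; ++t)
    pool.emplace_back([&counters, t] {
      for (int i = 0; i < 1000000; ++i)
        counters[(std::size_t)t].value.fetch_add(1, std::memory_order_relaxed);
    });
  for (auto &th : pool) th.join();
}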
+ char padding[64]; +#endif +}; + +/* ------------------------------------------------------------------------ */ +/* ------------------------------------------------------------------------ */ + +#undef USE_TEST_LOCKS + +// test_then_add template (general template should NOT be used) +template static __forceinline T test_then_add(volatile T *p, T d); + +template <> +__forceinline kmp_int32 test_then_add(volatile kmp_int32 *p, + kmp_int32 d) { + kmp_int32 r; + r = KMP_TEST_THEN_ADD32(p, d); + return r; +} + +template <> +__forceinline kmp_int64 test_then_add(volatile kmp_int64 *p, + kmp_int64 d) { + kmp_int64 r; + r = KMP_TEST_THEN_ADD64(p, d); + return r; +} + +// test_then_inc_acq template (general template should NOT be used) +template static __forceinline T test_then_inc_acq(volatile T *p); + +template <> +__forceinline kmp_int32 test_then_inc_acq(volatile kmp_int32 *p) { + kmp_int32 r; + r = KMP_TEST_THEN_INC_ACQ32(p); + return r; +} + +template <> +__forceinline kmp_int64 test_then_inc_acq(volatile kmp_int64 *p) { + kmp_int64 r; + r = KMP_TEST_THEN_INC_ACQ64(p); + return r; +} + +// test_then_inc template (general template should NOT be used) +template static __forceinline T test_then_inc(volatile T *p); + +template <> +__forceinline kmp_int32 test_then_inc(volatile kmp_int32 *p) { + kmp_int32 r; + r = KMP_TEST_THEN_INC32(p); + return r; +} + +template <> +__forceinline kmp_int64 test_then_inc(volatile kmp_int64 *p) { + kmp_int64 r; + r = KMP_TEST_THEN_INC64(p); + return r; +} + +// compare_and_swap template (general template should NOT be used) +template +static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s); + +template <> +__forceinline kmp_int32 compare_and_swap(volatile kmp_int32 *p, + kmp_int32 c, kmp_int32 s) { + return KMP_COMPARE_AND_STORE_REL32(p, c, s); +} + +template <> +__forceinline kmp_int32 compare_and_swap(volatile kmp_int64 *p, + kmp_int64 c, kmp_int64 s) { + return KMP_COMPARE_AND_STORE_REL64(p, c, s); +} + +template kmp_uint32 __kmp_ge(T value, T checker) { + return value >= checker; +} +template kmp_uint32 __kmp_eq(T value, T checker) { + return value == checker; +} + +/* + Spin wait loop that pauses between checks. + Waits until function returns non-zero when called with *spinner and check. + Does NOT put threads to sleep. + Arguments: + UT is unsigned 4- or 8-byte type + spinner - memory location to check value + checker - value which spinner is >, <, ==, etc. + pred - predicate function to perform binary comparison of some sort +#if USE_ITT_BUILD + obj -- is higher-level synchronization object to report to ittnotify. It + is used to report locks consistently. For example, if lock is acquired + immediately, its address is reported to ittnotify via + KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately + and lock routine calls to KMP_WAIT(), the later should report the + same address, not an address of low-level spinner. 
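The test_then_add, test_then_inc* and compare_and_swap specializations above are thin typed shims over the KMP_TEST_THEN_* and KMP_COMPARE_AND_STORE_REL* primitives. For readers who do not have those macros in their head, a rough functional equivalent in terms of std::atomic looks like the sketch below; it only illustrates the semantics and does not reproduce the exact memory-ordering choices of the real macros:

#include <atomic>
#include <cstdint>

// Fetch-and-add: returns the value held before the addition, which is what
// KMP_TEST_THEN_ADD32/64 hand to the typed wrappers above.
template <typename T> T test_then_add_equiv(std::atomic<T> *p, T d) {
  return p->fetch_add(d, std::memory_order_acq_rel);
}

// Compare-and-store: nonzero iff *p equalled c and was replaced by s,
// mirroring the boolean result of KMP_COMPARE_AND_STORE_REL32/64.
template <typename T> int compare_and_swap_equiv(std::atomic<T> *p, T c, T s) {
  return p->compare_exchange_strong(c, s, std::memory_order_release,
                                    std::memory_order_relaxed)
             ? 1
             : 0;
}

int main() {
  std::atomic<std::int32_t> counter{5};
  std::int32_t old = test_then_add_equiv<std::int32_t>(&counter, 3); // old == 5
  int swapped = compare_and_swap_equiv<std::int32_t>(&counter, 8, 42);
  return (old == 5 && swapped == 1 && counter.load() == 42) ? 0 : 1;
}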
+#endif // USE_ITT_BUILD + TODO: make inline function (move to header file for icl) +*/ +template +static UT __kmp_wait(volatile UT *spinner, UT checker, + kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) { + // note: we may not belong to a team at this point + volatile UT *spin = spinner; + UT check = checker; + kmp_uint32 spins; + kmp_uint32 (*f)(UT, UT) = pred; + kmp_uint64 time; + UT r; + + KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); + KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); + // main wait spin loop + while (!f(r = *spin, check)) { + KMP_FSYNC_SPIN_PREPARE(obj); + /* GEH - remove this since it was accidentally introduced when kmp_wait was + split. + It causes problems with infinite recursion because of exit lock */ + /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) + __kmp_abort_thread(); */ + // If oversubscribed, or have waited a bit then yield. + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); + } + KMP_FSYNC_SPIN_ACQUIRED(obj); + return r; +} + +/* ------------------------------------------------------------------------ */ +/* ------------------------------------------------------------------------ */ + +template +void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { + dispatch_private_info_template *pr; + + int gtid = *gtid_ref; + // int cid = *cid_ref; + kmp_info_t *th = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(th->th.th_dispatch); + + KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid)); + if (__kmp_env_consistency_check) { + pr = reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_pr_current); + if (pr->pushed_ws != ct_none) { +#if KMP_USE_DYNAMIC_LOCK + __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0); +#else + __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL); +#endif + } + } + + if (!th->th.th_team->t.t_serialized) { + dispatch_shared_info_template *sh = + reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_sh_current); + UT lower; + + if (!__kmp_env_consistency_check) { + pr = reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_pr_current); + } + lower = pr->u.p.ordered_lower; + +#if !defined(KMP_GOMP_COMPAT) + if (__kmp_env_consistency_check) { + if (pr->ordered_bumped) { + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, + ct_ordered_in_pdo, loc_ref, + &p->stack_data[p->w_top]); + } + } +#endif /* !defined(KMP_GOMP_COMPAT) */ + + KMP_MB(); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: " + "ordered_iter:%%%s lower:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); + __kmp_str_free(&buff); + } +#endif + __kmp_wait(&sh->u.s.ordered_iteration, lower, + __kmp_ge USE_ITT_BUILD_ARG(NULL)); + KMP_MB(); /* is this necessary? 
*/ +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: " + "ordered_iter:%%%s lower:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); + __kmp_str_free(&buff); + } +#endif + } + KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid)); +} + +template +void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { + typedef typename traits_t::signed_t ST; + dispatch_private_info_template *pr; + + int gtid = *gtid_ref; + // int cid = *cid_ref; + kmp_info_t *th = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(th->th.th_dispatch); + + KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid)); + if (__kmp_env_consistency_check) { + pr = reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_pr_current); + if (pr->pushed_ws != ct_none) { + __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref); + } + } + + if (!th->th.th_team->t.t_serialized) { + dispatch_shared_info_template *sh = + reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_sh_current); + + if (!__kmp_env_consistency_check) { + pr = reinterpret_cast *>( + th->th.th_dispatch->th_dispatch_pr_current); + } + + KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration)); +#if !defined(KMP_GOMP_COMPAT) + if (__kmp_env_consistency_check) { + if (pr->ordered_bumped != 0) { + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + /* How to test it? - OM */ + __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, + ct_ordered_in_pdo, loc_ref, + &p->stack_data[p->w_top]); + } + } +#endif /* !defined(KMP_GOMP_COMPAT) */ + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + pr->ordered_bumped += 1; + + KD_TRACE(1000, + ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", + gtid, pr->ordered_bumped)); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + /* TODO use general release procedure? */ + test_then_inc((volatile ST *)&sh->u.s.ordered_iteration); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + } + KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid)); +} + +/* Computes and returns x to the power of y, where y must a non-negative integer + */ +template +static __forceinline long double __kmp_pow(long double x, UT y) { + long double s = 1.0L; + + KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); + // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned + while (y) { + if (y & 1) + s *= x; + x *= x; + y >>= 1; + } + return s; +} + +/* Computes and returns the number of unassigned iterations after idx chunks + have been assigned + (the total number of unassigned iterations in chunks with index greater than + or equal to idx). + __forceinline seems to be broken so that if we __forceinline this function, + the behavior is wrong + (one of the unit tests, sch_guided_analytical_basic.cpp, fails) +*/ +template +static __inline typename traits_t::unsigned_t +__kmp_dispatch_guided_remaining(T tc, typename traits_t::floating_t base, + typename traits_t::unsigned_t idx) { + /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at + least for ICL 8.1, long double arithmetic may not really have + long double precision, even with /Qlong_double. Currently, we + workaround that in the caller code, by manipulating the FPCW for + Windows* OS on IA-32 architecture. The lack of precision is not + expected to be a correctness issue, though. 
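__kmp_pow above computes base^idx by binary exponentiation, and __kmp_dispatch_guided_remaining, whose body follows this comment, evaluates ceil(tc * base^idx). A standalone rendering of the same arithmetic; the trip count, thread count and shrink factor below are sample values chosen only for illustration, not the exact expression the runtime derives:

#include <cstdint>
#include <cstdio>

// Exponentiation by squaring, as in __kmp_pow: O(log idx) multiplications.
static long double pow_by_squaring(long double x, std::uint64_t y) {
  long double s = 1.0L;
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

// Iterations still unassigned after idx guided chunks have been handed out:
// ceil(tc * base^idx) for a per-chunk shrink factor base in (0, 1).
static std::uint64_t guided_remaining(std::uint64_t tc, long double base,
                                      std::uint64_t idx) {
  long double x = tc * pow_by_squaring(base, idx);
  std::uint64_t r = (std::uint64_t)x;
  return (x == (long double)r) ? r : r + 1;
}

int main() {
  const std::uint64_t tc = 1000; // sample trip count
  const int nproc = 4;           // sample thread count
  // Sample shrink factor in the spirit of guided_int_param == 2.
  const long double base = 1.0L - 1.0L / (2.0L * nproc);
  for (std::uint64_t idx = 0; idx <= 4; ++idx)
    std::printf("after %llu chunks: %llu iterations left\n",
                (unsigned long long)idx,
                (unsigned long long)guided_remaining(tc, base, idx));
}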
+ */ + typedef typename traits_t::unsigned_t UT; + + long double x = tc * __kmp_pow(base, idx); + UT r = (UT)x; + if (x == r) + return r; + return r + 1; +} + +// Parameters of the guided-iterative algorithm: +// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic +// p3 = 1 / ( n * nproc ) // remaining iterations multiplier +// by default n = 2. For example with n = 3 the chunks distribution will be more +// flat. +// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. +static const int guided_int_param = 2; +static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param; +#endif // KMP_DISPATCH_H diff --git a/third_party/openmp/kmp_dispatch_hier.h b/third_party/openmp/kmp_dispatch_hier.h new file mode 100644 index 000000000..dbea088ff --- /dev/null +++ b/third_party/openmp/kmp_dispatch_hier.h @@ -0,0 +1,1112 @@ +/* + * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_DISPATCH_HIER_H +#define KMP_DISPATCH_HIER_H +#include "kmp.h" +#include "kmp_dispatch.h" + +// Layer type for scheduling hierarchy +enum kmp_hier_layer_e { + LAYER_THREAD = -1, + LAYER_L1, + LAYER_L2, + LAYER_L3, + LAYER_NUMA, + LAYER_LOOP, + LAYER_LAST +}; + +// Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string +static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) { + switch (type) { + case kmp_hier_layer_e::LAYER_THREAD: + return "THREAD"; + case kmp_hier_layer_e::LAYER_L1: + return "L1"; + case kmp_hier_layer_e::LAYER_L2: + return "L2"; + case kmp_hier_layer_e::LAYER_L3: + return "L3"; + case kmp_hier_layer_e::LAYER_NUMA: + return "NUMA"; + case kmp_hier_layer_e::LAYER_LOOP: + return "WHOLE_LOOP"; + case kmp_hier_layer_e::LAYER_LAST: + return "LAST"; + } + KMP_ASSERT(0); + // Appease compilers, should never get here + return "ERROR"; +} + +// Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy +typedef struct kmp_hier_sched_env_t { + int size; + int capacity; + enum sched_type *scheds; + kmp_int32 *small_chunks; + kmp_int64 *large_chunks; + kmp_hier_layer_e *layers; + // Append a level of the hierarchy + void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) { + if (capacity == 0) { + scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) * + kmp_hier_layer_e::LAYER_LAST); + small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) * + kmp_hier_layer_e::LAYER_LAST); + large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) * + kmp_hier_layer_e::LAYER_LAST); + layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) * + kmp_hier_layer_e::LAYER_LAST); + capacity = kmp_hier_layer_e::LAYER_LAST; + } + int current_size = size; + KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST); + scheds[current_size] = sched; + layers[current_size] = layer; + small_chunks[current_size] = chunk; + large_chunks[current_size] = (kmp_int64)chunk; + size++; + } + // Sort the hierarchy using selection sort, size will always be small + // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm + void sort() { + if (size <= 1) + return; + for (int i 
= 0; i < size; ++i) { + int switch_index = i; + for (int j = i + 1; j < size; ++j) { + if (layers[j] < layers[switch_index]) + switch_index = j; + } + if (switch_index != i) { + kmp_hier_layer_e temp1 = layers[i]; + enum sched_type temp2 = scheds[i]; + kmp_int32 temp3 = small_chunks[i]; + kmp_int64 temp4 = large_chunks[i]; + layers[i] = layers[switch_index]; + scheds[i] = scheds[switch_index]; + small_chunks[i] = small_chunks[switch_index]; + large_chunks[i] = large_chunks[switch_index]; + layers[switch_index] = temp1; + scheds[switch_index] = temp2; + small_chunks[switch_index] = temp3; + large_chunks[switch_index] = temp4; + } + } + } + // Free all memory + void deallocate() { + if (capacity > 0) { + __kmp_free(scheds); + __kmp_free(layers); + __kmp_free(small_chunks); + __kmp_free(large_chunks); + scheds = NULL; + layers = NULL; + small_chunks = NULL; + large_chunks = NULL; + } + size = 0; + capacity = 0; + } +} kmp_hier_sched_env_t; + +extern int __kmp_dispatch_hand_threading; +extern kmp_hier_sched_env_t __kmp_hier_scheds; + +// Sizes of layer arrays bounded by max number of detected L1s, L2s, etc. +extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1]; +extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1]; + +extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type); +extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type); +extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, + kmp_hier_layer_e t2); +extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team); + +template struct kmp_hier_shared_bdata_t { + typedef typename traits_t::signed_t ST; + volatile kmp_uint64 val[2]; + kmp_int32 status[2]; + T lb[2]; + T ub[2]; + ST st[2]; + dispatch_shared_info_template sh[2]; + void zero() { + val[0] = val[1] = 0; + status[0] = status[1] = 0; + lb[0] = lb[1] = 0; + ub[0] = ub[1] = 0; + st[0] = st[1] = 0; + sh[0].u.s.iteration = sh[1].u.s.iteration = 0; + } + void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus, + kmp_uint64 index) { + lb[1 - index] = nlb; + ub[1 - index] = nub; + st[1 - index] = nst; + status[1 - index] = nstatus; + } + void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) { + lb[1 - index] = nlb; + ub[1 - index] = nub; + st[1 - index] = nst; + status[1 - index] = nstatus; + sh[1 - index].u.s.iteration = 0; + } + + kmp_int32 get_next_status(kmp_uint64 index) const { + return status[1 - index]; + } + T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; } + T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; } + ST get_next_st(kmp_uint64 index) const { return st[1 - index]; } + dispatch_shared_info_template volatile *get_next_sh(kmp_uint64 index) { + return &(sh[1 - index]); + } + + kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; } + T get_curr_lb(kmp_uint64 index) const { return lb[index]; } + T get_curr_ub(kmp_uint64 index) const { return ub[index]; } + ST get_curr_st(kmp_uint64 index) const { return st[index]; } + dispatch_shared_info_template volatile *get_curr_sh(kmp_uint64 index) { + return &(sh[index]); + } +}; + +/* + * In the barrier implementations, num_active is the number of threads that are + * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy. + * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t + * structure. tdata is the thread private data that resides on the thread + * data structure. 
+ * + * The reset_shared() method is used to initialize the barrier data on the + * kmp_hier_top_unit_t hierarchy structure + * + * The reset_private() method is used to initialize the barrier data on the + * thread's private dispatch buffer structure + * + * The barrier() method takes an id, which is that thread's id for the + * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait + * inside barrier() until all fellow threads who are attached to that + * kmp_hier_top_unit_t structure have arrived. + */ + +// Core barrier implementation +// Can be used in a unit with between 2 to 8 threads +template class core_barrier_impl { + static inline kmp_uint64 get_wait_val(int num_active) { + kmp_uint64 wait_val = 0LL; + switch (num_active) { + case 2: + wait_val = 0x0101LL; + break; + case 3: + wait_val = 0x010101LL; + break; + case 4: + wait_val = 0x01010101LL; + break; + case 5: + wait_val = 0x0101010101LL; + break; + case 6: + wait_val = 0x010101010101LL; + break; + case 7: + wait_val = 0x01010101010101LL; + break; + case 8: + wait_val = 0x0101010101010101LL; + break; + default: + // don't use the core_barrier_impl for more than 8 threads + KMP_ASSERT(0); + } + return wait_val; + } + +public: + static void reset_private(kmp_int32 num_active, + kmp_hier_private_bdata_t *tdata); + static void reset_shared(kmp_int32 num_active, + kmp_hier_shared_bdata_t *bdata); + static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t *bdata, + kmp_hier_private_bdata_t *tdata); +}; + +template +void core_barrier_impl::reset_private(kmp_int32 num_active, + kmp_hier_private_bdata_t *tdata) { + tdata->num_active = num_active; + tdata->index = 0; + tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active); +} +template +void core_barrier_impl::reset_shared(kmp_int32 num_active, + kmp_hier_shared_bdata_t *bdata) { + bdata->val[0] = bdata->val[1] = 0LL; + bdata->status[0] = bdata->status[1] = 0LL; +} +template +void core_barrier_impl::barrier(kmp_int32 id, + kmp_hier_shared_bdata_t *bdata, + kmp_hier_private_bdata_t *tdata) { + kmp_uint64 current_index = tdata->index; + kmp_uint64 next_index = 1 - current_index; + kmp_uint64 current_wait_value = tdata->wait_val[current_index]; + kmp_uint64 next_wait_value = + (current_wait_value ? 0 : get_wait_val(tdata->num_active)); + KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu " + "next_index:%llu curr_wait:%llu next_wait:%llu\n", + __kmp_get_gtid(), current_index, next_index, current_wait_value, + next_wait_value)); + char v = (current_wait_value ? 
'\1' : '\0'); + (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v; + __kmp_wait(&(bdata->val[current_index]), current_wait_value, + __kmp_eq USE_ITT_BUILD_ARG(NULL)); + tdata->wait_val[current_index] = next_wait_value; + tdata->index = next_index; +} + +// Counter barrier implementation +// Can be used in a unit with arbitrary number of active threads +template class counter_barrier_impl { +public: + static void reset_private(kmp_int32 num_active, + kmp_hier_private_bdata_t *tdata); + static void reset_shared(kmp_int32 num_active, + kmp_hier_shared_bdata_t *bdata); + static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t *bdata, + kmp_hier_private_bdata_t *tdata); +}; + +template +void counter_barrier_impl::reset_private(kmp_int32 num_active, + kmp_hier_private_bdata_t *tdata) { + tdata->num_active = num_active; + tdata->index = 0; + tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active; +} +template +void counter_barrier_impl::reset_shared(kmp_int32 num_active, + kmp_hier_shared_bdata_t *bdata) { + bdata->val[0] = bdata->val[1] = 0LL; + bdata->status[0] = bdata->status[1] = 0LL; +} +template +void counter_barrier_impl::barrier(kmp_int32 id, + kmp_hier_shared_bdata_t *bdata, + kmp_hier_private_bdata_t *tdata) { + volatile kmp_int64 *val; + kmp_uint64 current_index = tdata->index; + kmp_uint64 next_index = 1 - current_index; + kmp_uint64 current_wait_value = tdata->wait_val[current_index]; + kmp_uint64 next_wait_value = current_wait_value + tdata->num_active; + + KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu " + "next_index:%llu curr_wait:%llu next_wait:%llu\n", + __kmp_get_gtid(), current_index, next_index, current_wait_value, + next_wait_value)); + val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index])); + KMP_TEST_THEN_INC64(val); + __kmp_wait(&(bdata->val[current_index]), current_wait_value, + __kmp_ge USE_ITT_BUILD_ARG(NULL)); + tdata->wait_val[current_index] = next_wait_value; + tdata->index = next_index; +} + +// Data associated with topology unit within a layer +// For example, one kmp_hier_top_unit_t corresponds to one L1 cache +template struct kmp_hier_top_unit_t { + typedef typename traits_t::signed_t ST; + typedef typename traits_t::unsigned_t UT; + kmp_int32 active; // number of topology units that communicate with this unit + // chunk information (lower/upper bound, stride, etc.) 
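core_barrier_impl above packs one flag byte per thread into a single 64-bit word: each arriving thread writes its byte and then spins until the word equals the pattern from get_wait_val() (0x0101 for two threads, 0x010101 for three, and so on). A self-contained sketch of that idea using std::atomic and std::thread; the real code alternates 0x01 and 0x00 phases across its two buffers and waits through __kmp_wait rather than a bare spin loop:

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Expected word once num_active threads have each set their flag byte.
static std::uint64_t wait_val(int num_active) {
  std::uint64_t v = 0;
  for (int i = 0; i < num_active; ++i)
    v |= (std::uint64_t)0x01 << (8 * i); // 2 -> 0x0101, 3 -> 0x010101, ...
  return v;
}

static void byte_barrier(std::atomic<std::uint64_t> &word, int id, int n) {
  // Publish this thread's arrival by setting its byte of the word.
  word.fetch_or((std::uint64_t)0x01 << (8 * id), std::memory_order_acq_rel);
  // Spin until every participant's byte is set.
  const std::uint64_t expect = wait_val(n);
  while (word.load(std::memory_order_acquire) != expect)
    std::this_thread::yield();
}

int main() {
  const int n = 4; // core_barrier_impl is used for units of 2..8 threads
  std::atomic<std::uint64_t> word{0};
  std::vector<std::thread> threads;
  for (int id = 0; id < n; ++id)
    threads.emplace_back([&word, id, n] {
      byte_barrier(word, id, n);
      std::printf("thread %d released\n", id);
    });
  for (auto &t : threads)
    t.join();
}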
+ dispatch_private_info_template hier_pr; + kmp_hier_top_unit_t *hier_parent; // pointer to parent unit + kmp_hier_shared_bdata_t hier_barrier; // shared barrier data for this unit + + kmp_int32 get_hier_id() const { return hier_pr.hier_id; } + void reset_shared_barrier() { + KMP_DEBUG_ASSERT(active > 0); + if (active == 1) + return; + hier_barrier.zero(); + if (active >= 2 && active <= 8) { + core_barrier_impl::reset_shared(active, &hier_barrier); + } else { + counter_barrier_impl::reset_shared(active, &hier_barrier); + } + } + void reset_private_barrier(kmp_hier_private_bdata_t *tdata) { + KMP_DEBUG_ASSERT(tdata); + KMP_DEBUG_ASSERT(active > 0); + if (active == 1) + return; + if (active >= 2 && active <= 8) { + core_barrier_impl::reset_private(active, tdata); + } else { + counter_barrier_impl::reset_private(active, tdata); + } + } + void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) { + KMP_DEBUG_ASSERT(tdata); + KMP_DEBUG_ASSERT(active > 0); + KMP_DEBUG_ASSERT(id >= 0 && id < active); + if (active == 1) { + tdata->index = 1 - tdata->index; + return; + } + if (active >= 2 && active <= 8) { + core_barrier_impl::barrier(id, &hier_barrier, tdata); + } else { + counter_barrier_impl::barrier(id, &hier_barrier, tdata); + } + } + + kmp_int32 get_next_status(kmp_uint64 index) const { + return hier_barrier.get_next_status(index); + } + T get_next_lb(kmp_uint64 index) const { + return hier_barrier.get_next_lb(index); + } + T get_next_ub(kmp_uint64 index) const { + return hier_barrier.get_next_ub(index); + } + ST get_next_st(kmp_uint64 index) const { + return hier_barrier.get_next_st(index); + } + dispatch_shared_info_template volatile *get_next_sh(kmp_uint64 index) { + return hier_barrier.get_next_sh(index); + } + + kmp_int32 get_curr_status(kmp_uint64 index) const { + return hier_barrier.get_curr_status(index); + } + T get_curr_lb(kmp_uint64 index) const { + return hier_barrier.get_curr_lb(index); + } + T get_curr_ub(kmp_uint64 index) const { + return hier_barrier.get_curr_ub(index); + } + ST get_curr_st(kmp_uint64 index) const { + return hier_barrier.get_curr_st(index); + } + dispatch_shared_info_template volatile *get_curr_sh(kmp_uint64 index) { + return hier_barrier.get_curr_sh(index); + } + + void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status, + kmp_uint64 index) { + hier_barrier.set_next_hand_thread(lb, ub, st, status, index); + } + void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) { + hier_barrier.set_next(lb, ub, st, status, index); + } + dispatch_private_info_template *get_my_pr() { return &hier_pr; } + kmp_hier_top_unit_t *get_parent() { return hier_parent; } + dispatch_private_info_template *get_parent_pr() { + return &(hier_parent->hier_pr); + } + + kmp_int32 is_active() const { return active; } + kmp_int32 get_num_active() const { return active; } +#ifdef KMP_DEBUG + void print() { + KD_TRACE( + 10, + (" kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n", + active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st, + hier_pr.u.p.tc)); + } +#endif +}; + +// Information regarding a single layer within the scheduling hierarchy +template struct kmp_hier_layer_info_t { + int num_active; // number of threads active in this level + kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc. + enum sched_type sched; // static, dynamic, guided, etc. 
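The get_curr_*/get_next_* accessors above implement a two-slot hand-off: producers stage new bounds in slot 1 - index while consumers read slot index, and each thread flips its private index once the unit's barrier has been passed. A reduced sketch of that double-buffering pattern (type and function names invented for the example):

#include <cstdint>
#include <cstdio>

// Two-slot hand-off: writers fill the "next" slot (1 - index) while readers
// consume the "current" slot (index); flipping index publishes the new data.
struct two_slot_range_t {
  std::int64_t lb[2] = {0, 0};
  std::int64_t ub[2] = {0, 0};
  void set_next(std::int64_t nlb, std::int64_t nub, std::uint64_t index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
  }
  std::int64_t curr_lb(std::uint64_t index) const { return lb[index]; }
  std::int64_t curr_ub(std::uint64_t index) const { return ub[index]; }
};

int main() {
  two_slot_range_t r;
  std::uint64_t index = 0;  // this thread's private phase index
  r.set_next(0, 99, index); // stage the next chunk in the other slot
  index = 1 - index;        // in the runtime, a barrier sits around this flip
  std::printf("now working on [%lld, %lld]\n",
              (long long)r.curr_lb(index), (long long)r.curr_ub(index));
}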
+ typename traits_t::signed_t chunk; // chunk size associated with schedule + int length; // length of the kmp_hier_top_unit_t array + +#ifdef KMP_DEBUG + // Print this layer's information + void print() { + const char *t = __kmp_get_hier_str(type); + KD_TRACE( + 10, + (" kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d " + "length:%d\n", + num_active, t, sched, chunk, length)); + } +#endif +}; + +/* + * Structure to implement entire hierarchy + * + * The hierarchy is kept as an array of arrays to represent the different + * layers. Layer 0 is the lowest layer to layer num_layers - 1 which is the + * highest layer. + * Example: + * [ 2 ] -> [ L3 | L3 ] + * [ 1 ] -> [ L2 | L2 | L2 | L2 ] + * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ] + * There is also an array of layer_info_t which has information regarding + * each layer + */ +template struct kmp_hier_t { +public: + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + +private: + int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t *current, + kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st, + kmp_int32 previous_id, int hier_level) { + int status; + kmp_info_t *th = __kmp_threads[gtid]; + auto parent = current->get_parent(); + bool last_layer = (hier_level == get_num_layers() - 1); + KMP_DEBUG_ASSERT(th); + kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]); + KMP_DEBUG_ASSERT(current); + KMP_DEBUG_ASSERT(hier_level >= 0); + KMP_DEBUG_ASSERT(hier_level < get_num_layers()); + KMP_DEBUG_ASSERT(tdata); + KMP_DEBUG_ASSERT(parent || last_layer); + + KD_TRACE( + 1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level)); + + T hier_id = (T)current->get_hier_id(); + // Attempt to grab next iteration range for this level + if (previous_id == 0) { + KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n", + gtid, hier_level)); + kmp_int32 contains_last; + T my_lb, my_ub; + ST my_st; + T nproc; + dispatch_shared_info_template volatile *my_sh; + dispatch_private_info_template *my_pr; + if (last_layer) { + // last layer below the very top uses the single shared buffer + // from the team struct. 
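The kmp_hier_t comment above pictures the hierarchy as an array of per-layer arrays, with __kmp_dispatch_get_index mapping a thread to its unit in each layer. Purely as an illustration of that layout, a toy mapping for the 8-thread machine drawn in the comment might look like this (the threads-per-unit numbers are hypothetical, not queried from any real topology):

#include <cstdio>

int main() {
  // Toy machine matching the picture above: 8 threads, one thread per L1,
  // two threads per L2, four threads per L3.
  const char *layer_name[] = {"L1", "L2", "L3"};
  const int threads_per_unit[] = {1, 2, 4};
  const int num_layers = 3;
  const int num_threads = 8;
  for (int tid = 0; tid < num_threads; ++tid) {
    std::printf("thread %d:", tid);
    for (int layer = 0; layer < num_layers; ++layer)
      std::printf("  %s unit %d", layer_name[layer],
                  tid / threads_per_unit[layer]);
    std::printf("\n");
  }
}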
+ KD_TRACE(10, + ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n", + gtid, hier_level)); + my_sh = reinterpret_cast volatile *>( + th->th.th_dispatch->th_dispatch_sh_current); + nproc = (T)get_top_level_nproc(); + } else { + // middle layers use the shared buffer inside the kmp_hier_top_unit_t + // structure + KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n", + gtid, hier_level)); + my_sh = + parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index); + nproc = (T)parent->get_num_active(); + } + my_pr = current->get_my_pr(); + KMP_DEBUG_ASSERT(my_sh); + KMP_DEBUG_ASSERT(my_pr); + enum sched_type schedule = get_sched(hier_level); + ST chunk = (ST)get_chunk(hier_level); + status = __kmp_dispatch_next_algorithm(gtid, my_pr, my_sh, + &contains_last, &my_lb, &my_ub, + &my_st, nproc, hier_id); + KD_TRACE( + 10, + ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n", + gtid, hier_level, status)); + // When no iterations are found (status == 0) and this is not the last + // layer, attempt to go up the hierarchy for more iterations + if (status == 0 && !last_layer) { + kmp_int32 hid; + __kmp_type_convert(hier_id, &hid); + status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub, + &my_st, hid, hier_level + 1); + KD_TRACE( + 10, + ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n", + gtid, hier_level, status)); + if (status == 1) { + kmp_hier_private_bdata_t *upper_tdata = + &(th->th.th_hier_bar_data[hier_level + 1]); + my_sh = parent->get_curr_sh(upper_tdata->index); + KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n", + gtid, hier_level)); + __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule, + parent->get_curr_lb(upper_tdata->index), + parent->get_curr_ub(upper_tdata->index), + parent->get_curr_st(upper_tdata->index), +#if USE_ITT_BUILD + NULL, +#endif + chunk, nproc, hier_id); + status = __kmp_dispatch_next_algorithm( + gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc, + hier_id); + if (!status) { + KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 " + "setting to 2!\n", + gtid, hier_level)); + status = 2; + } + } + } + current->set_next(my_lb, my_ub, my_st, status, tdata->index); + // Propagate whether a unit holds the actual global last iteration + // The contains_last attribute is sent downwards from the top to the + // bottom of the hierarchy via the contains_last flag inside the + // private dispatch buffers in the hierarchy's middle layers + if (contains_last) { + // If the next_algorithm() method returns 1 for p_last and it is the + // last layer or our parent contains the last serial chunk, then the + // chunk must contain the last serial iteration. 
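next_recurse above follows a simple escalation rule: try to take a chunk at the current level; if the level is dry and there is a parent, pull a fresh block from the parent, restock, and retry. A heavily reduced model of that control flow, with a bare counter standing in for the real bounds, shared buffers and barriers:

#include <cstddef>
#include <cstdio>
#include <vector>

// Reduced model of next_recurse(): each level holds some locally staged
// iterations; when a level runs dry it asks its parent for a fresh block,
// restocks, and tries again.
struct level_t {
  long remaining = 0; // iterations currently staged at this level
  long refill = 0;    // block size pulled from the parent on each climb
};

static bool next_chunk(std::vector<level_t> &levels, std::size_t lvl,
                       long chunk, long *out) {
  level_t &L = levels[lvl];
  if (L.remaining > 0) { // fast path: serve from this level
    long give = L.remaining < chunk ? L.remaining : chunk;
    L.remaining -= give;
    *out = give;
    return true;
  }
  if (lvl + 1 == levels.size()) // top level exhausted: the loop is done
    return false;
  long block = 0; // climb up for a fresh block
  if (!next_chunk(levels, lvl + 1, L.refill, &block))
    return false;
  L.remaining = block; // restock, then retry locally
  return next_chunk(levels, lvl, chunk, out);
}

int main() {
  // Hypothetical 3-level hierarchy: the core level pulls 4 iterations at a
  // time from its cache level, which pulls 16 at a time from a 100-iteration
  // whole-loop level.
  std::vector<level_t> levels(3);
  levels[0].refill = 4;
  levels[1].refill = 16;
  levels[2].remaining = 100;
  long got = 0, total = 0;
  while (next_chunk(levels, 0, 2, &got))
    total += got;
  std::printf("handed out %ld of 100 iterations in chunks of <= 2\n", total);
}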
+ if (last_layer || parent->hier_pr.flags.contains_last) { + KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr " + "to contain last.\n", + gtid, hier_level)); + current->hier_pr.flags.contains_last = contains_last; + } + if (!current->hier_pr.flags.contains_last) + contains_last = FALSE; + } + if (p_last) + *p_last = contains_last; + } // if primary thread of this unit + if (hier_level > 0 || !__kmp_dispatch_hand_threading) { + KD_TRACE(10, + ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n", + gtid, hier_level)); + current->barrier(previous_id, tdata); + KD_TRACE(10, + ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n", + gtid, hier_level, current->get_curr_status(tdata->index))); + } else { + KMP_DEBUG_ASSERT(previous_id == 0); + return status; + } + return current->get_curr_status(tdata->index); + } + +public: + int top_level_nproc; + int num_layers; + bool valid; + int type_size; + kmp_hier_layer_info_t *info; + kmp_hier_top_unit_t **layers; + // Deallocate all memory from this hierarchy + void deallocate() { + for (int i = 0; i < num_layers; ++i) + if (layers[i] != NULL) { + __kmp_free(layers[i]); + } + if (layers != NULL) { + __kmp_free(layers); + layers = NULL; + } + if (info != NULL) { + __kmp_free(info); + info = NULL; + } + num_layers = 0; + valid = false; + } + // Returns true if reallocation is needed else false + bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers, + const enum sched_type *new_scheds, + const ST *new_chunks) const { + if (!valid || layers == NULL || info == NULL || + traits_t::type_size != type_size || n != num_layers) + return true; + for (int i = 0; i < n; ++i) { + if (info[i].type != new_layers[i]) + return true; + if (info[i].sched != new_scheds[i]) + return true; + if (info[i].chunk != new_chunks[i]) + return true; + } + return false; + } + // A single thread should call this function while the other threads wait + // create a new scheduling hierarchy consisting of new_layers, new_scheds + // and new_chunks. These should come pre-sorted according to + // kmp_hier_layer_e value. 
This function will try to avoid reallocation + // if it can + void allocate_hier(int n, const kmp_hier_layer_e *new_layers, + const enum sched_type *new_scheds, const ST *new_chunks) { + top_level_nproc = 0; + if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) { + KD_TRACE( + 10, + ("kmp_hier_t::allocate_hier: T#0 do not need to reallocate\n")); + for (int i = 0; i < n; ++i) { + info[i].num_active = 0; + for (int j = 0; j < get_length(i); ++j) + layers[i][j].active = 0; + } + return; + } + KD_TRACE(10, ("kmp_hier_t::allocate_hier: T#0 full alloc\n")); + deallocate(); + type_size = traits_t::type_size; + num_layers = n; + info = (kmp_hier_layer_info_t *)__kmp_allocate( + sizeof(kmp_hier_layer_info_t) * n); + layers = (kmp_hier_top_unit_t **)__kmp_allocate( + sizeof(kmp_hier_top_unit_t *) * n); + for (int i = 0; i < n; ++i) { + int max = 0; + kmp_hier_layer_e layer = new_layers[i]; + info[i].num_active = 0; + info[i].type = layer; + info[i].sched = new_scheds[i]; + info[i].chunk = new_chunks[i]; + max = __kmp_hier_max_units[layer + 1]; + if (max == 0) { + valid = false; + KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer)); + deallocate(); + return; + } + info[i].length = max; + layers[i] = (kmp_hier_top_unit_t *)__kmp_allocate( + sizeof(kmp_hier_top_unit_t) * max); + for (int j = 0; j < max; ++j) { + layers[i][j].active = 0; + layers[i][j].hier_pr.flags.use_hier = TRUE; + } + } + valid = true; + } + // loc - source file location + // gtid - global thread identifier + // pr - this thread's private dispatch buffer (corresponding with gtid) + // p_last (return value) - pointer to flag indicating this set of iterations + // contains last + // iteration + // p_lb (return value) - lower bound for this chunk of iterations + // p_ub (return value) - upper bound for this chunk of iterations + // p_st (return value) - stride for this chunk of iterations + // + // Returns 1 if there are more iterations to perform, 0 otherwise + int next(ident_t *loc, int gtid, dispatch_private_info_template *pr, + kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) { + int status; + kmp_int32 contains_last = 0; + kmp_info_t *th = __kmp_threads[gtid]; + kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]); + auto parent = pr->get_parent(); + KMP_DEBUG_ASSERT(parent); + KMP_DEBUG_ASSERT(th); + KMP_DEBUG_ASSERT(tdata); + KMP_DEBUG_ASSERT(parent); + T nproc = (T)parent->get_num_active(); + T unit_id = (T)pr->get_hier_id(); + KD_TRACE( + 10, + ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n", + gtid, nproc, unit_id)); + // Handthreading implementation + // Each iteration is performed by all threads on last unit (typically + // cores/tiles) + // e.g., threads 0,1,2,3 all execute iteration 0 + // threads 0,1,2,3 all execute iteration 1 + // threads 4,5,6,7 all execute iteration 2 + // threads 4,5,6,7 all execute iteration 3 + // ... etc. + if (__kmp_dispatch_hand_threading) { + KD_TRACE(10, + ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n", + gtid)); + if (unit_id == 0) { + // For hand threading, the sh buffer on the lowest level is only ever + // modified and read by the primary thread on that level. Because of + // this, we can always use the first sh buffer. 
+ auto sh = &(parent->hier_barrier.sh[0]); + KMP_DEBUG_ASSERT(sh); + status = __kmp_dispatch_next_algorithm( + gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); + if (!status) { + bool done = false; + while (!done) { + done = true; + kmp_int32 uid; + __kmp_type_convert(unit_id, &uid); + status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, + p_st, uid, 0); + if (status == 1) { + __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, + parent->get_next_lb(tdata->index), + parent->get_next_ub(tdata->index), + parent->get_next_st(tdata->index), +#if USE_ITT_BUILD + NULL, +#endif + pr->u.p.parm1, nproc, unit_id); + sh->u.s.iteration = 0; + status = __kmp_dispatch_next_algorithm( + gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, + unit_id); + if (!status) { + KD_TRACE(10, + ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 " + "after next_pr_sh()" + "trying again.\n", + gtid)); + done = false; + } + } else if (status == 2) { + KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 " + "trying again.\n", + gtid)); + done = false; + } + } + } + parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index); + } // if primary thread of lowest unit level + parent->barrier(pr->get_hier_id(), tdata); + if (unit_id != 0) { + *p_lb = parent->get_curr_lb(tdata->index); + *p_ub = parent->get_curr_ub(tdata->index); + *p_st = parent->get_curr_st(tdata->index); + status = parent->get_curr_status(tdata->index); + } + } else { + // Normal implementation + // Each thread grabs an iteration chunk and executes it (no cooperation) + auto sh = parent->get_curr_sh(tdata->index); + KMP_DEBUG_ASSERT(sh); + status = __kmp_dispatch_next_algorithm( + gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); + KD_TRACE(10, + ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d " + "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n", + gtid, status, contains_last, *p_lb, *p_ub, *p_st)); + if (!status) { + bool done = false; + while (!done) { + done = true; + kmp_int32 uid; + __kmp_type_convert(unit_id, &uid); + status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, + p_st, uid, 0); + if (status == 1) { + sh = parent->get_curr_sh(tdata->index); + __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, + parent->get_curr_lb(tdata->index), + parent->get_curr_ub(tdata->index), + parent->get_curr_st(tdata->index), +#if USE_ITT_BUILD + NULL, +#endif + pr->u.p.parm1, nproc, unit_id); + status = __kmp_dispatch_next_algorithm( + gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); + if (!status) { + KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 " + "after next_pr_sh()" + "trying again.\n", + gtid)); + done = false; + } + } else if (status == 2) { + KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 " + "trying again.\n", + gtid)); + done = false; + } + } + } + } + if (contains_last && !parent->hier_pr.flags.contains_last) { + KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting " + "contains_last to FALSE\n", + gtid)); + contains_last = FALSE; + } + if (p_last) + *p_last = contains_last; + KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid, + status)); + return status; + } + // These functions probe the layer info structure + // Returns the type of topology unit given level + kmp_hier_layer_e get_type(int level) const { + KMP_DEBUG_ASSERT(level >= 0); + KMP_DEBUG_ASSERT(level < num_layers); + return info[level].type; + } + // Returns the schedule type at given level + 
enum sched_type get_sched(int level) const { + KMP_DEBUG_ASSERT(level >= 0); + KMP_DEBUG_ASSERT(level < num_layers); + return info[level].sched; + } + // Returns the chunk size at given level + ST get_chunk(int level) const { + KMP_DEBUG_ASSERT(level >= 0); + KMP_DEBUG_ASSERT(level < num_layers); + return info[level].chunk; + } + // Returns the number of active threads at given level + int get_num_active(int level) const { + KMP_DEBUG_ASSERT(level >= 0); + KMP_DEBUG_ASSERT(level < num_layers); + return info[level].num_active; + } + // Returns the length of topology unit array at given level + int get_length(int level) const { + KMP_DEBUG_ASSERT(level >= 0); + KMP_DEBUG_ASSERT(level < num_layers); + return info[level].length; + } + // Returns the topology unit given the level and index + kmp_hier_top_unit_t *get_unit(int level, int index) { + KMP_DEBUG_ASSERT(level >= 0); + KMP_DEBUG_ASSERT(level < num_layers); + KMP_DEBUG_ASSERT(index >= 0); + KMP_DEBUG_ASSERT(index < get_length(level)); + return &(layers[level][index]); + } + // Returns the number of layers in the hierarchy + int get_num_layers() const { return num_layers; } + // Returns the number of threads in the top layer + // This is necessary because we don't store a topology unit as + // the very top level and the scheduling algorithms need this information + int get_top_level_nproc() const { return top_level_nproc; } + // Return whether this hierarchy is valid or not + bool is_valid() const { return valid; } +#ifdef KMP_DEBUG + // Print the hierarchy + void print() { + KD_TRACE(10, ("kmp_hier_t:\n")); + for (int i = num_layers - 1; i >= 0; --i) { + KD_TRACE(10, ("Info[%d] = ", i)); + info[i].print(); + } + for (int i = num_layers - 1; i >= 0; --i) { + KD_TRACE(10, ("Layer[%d] =\n", i)); + for (int j = 0; j < info[i].length; ++j) { + layers[i][j].print(); + } + } + } +#endif +}; + +template +void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, + kmp_hier_layer_e *new_layers, + enum sched_type *new_scheds, + typename traits_t::signed_t *new_chunks, + T lb, T ub, + typename traits_t::signed_t st) { + int tid, gtid, num_hw_threads, num_threads_per_layer1, active; + unsigned int my_buffer_index; + kmp_info_t *th; + kmp_team_t *team; + dispatch_private_info_template *pr; + dispatch_shared_info_template volatile *sh; + gtid = __kmp_entry_gtid(); + tid = __kmp_tid_from_gtid(gtid); +#ifdef KMP_DEBUG + KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n", + gtid, n)); + for (int i = 0; i < n; ++i) { + const char *layer = __kmp_get_hier_str(new_layers[i]); + KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, " + "new_scheds[%d] = %d, new_chunks[%d] = %u\n", + gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i])); + } +#endif // KMP_DEBUG + KMP_DEBUG_ASSERT(n > 0); + KMP_DEBUG_ASSERT(new_layers); + KMP_DEBUG_ASSERT(new_scheds); + KMP_DEBUG_ASSERT(new_chunks); + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + th = __kmp_threads[gtid]; + team = th->th.th_team; + active = !team->t.t_serialized; + th->th.th_ident = loc; + num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; + KMP_DEBUG_ASSERT(th->th.th_dispatch == + &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); + my_buffer_index = th->th.th_dispatch->th_disp_index; + pr = reinterpret_cast *>( + &th->th.th_dispatch + ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); + sh = reinterpret_cast volatile *>( + &team->t.t_disp_buffer[my_buffer_index % 
__kmp_dispatch_num_buffers]); + if (!active) { + KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. " + "Using normal dispatch functions.\n", + gtid)); + KMP_DEBUG_ASSERT(pr); + pr->flags.use_hier = FALSE; + pr->flags.contains_last = FALSE; + return; + } + KMP_DEBUG_ASSERT(pr); + KMP_DEBUG_ASSERT(sh); + pr->flags.use_hier = TRUE; + pr->u.p.tc = 0; + // Have primary thread allocate the hierarchy + if (__kmp_tid_from_gtid(gtid) == 0) { + KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating " + "hierarchy\n", + gtid, pr, sh)); + if (sh->hier == NULL) { + sh->hier = (kmp_hier_t *)__kmp_allocate(sizeof(kmp_hier_t)); + } + sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks); + sh->u.s.iteration = 0; + } + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + // Check to make sure the hierarchy is valid + kmp_hier_t *hier = sh->hier; + if (!sh->hier->is_valid()) { + pr->flags.use_hier = FALSE; + return; + } + // Have threads allocate their thread-private barrier data if it hasn't + // already been allocated + if (th->th.th_hier_bar_data == NULL) { + th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate( + sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST); + } + // Have threads "register" themselves by modifying the active count for each + // level they are involved in. The active count will act as nthreads for that + // level regarding the scheduling algorithms + for (int i = 0; i < n; ++i) { + int index = __kmp_dispatch_get_index(tid, hier->get_type(i)); + kmp_hier_top_unit_t *my_unit = hier->get_unit(i, index); + // Setup the thread's private dispatch buffer's hierarchy pointers + if (i == 0) + pr->hier_parent = my_unit; + // If this unit is already active, then increment active count and wait + if (my_unit->is_active()) { + KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) " + "is already active (%d)\n", + gtid, my_unit, my_unit->active)); + KMP_TEST_THEN_INC32(&(my_unit->active)); + break; + } + // Flag that this unit is active + if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) { + // Do not setup parent pointer for top level unit since it has no parent + if (i < n - 1) { + // Setup middle layer pointers to parents + my_unit->get_my_pr()->hier_id = + index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i), + hier->get_type(i + 1)); + int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1)); + my_unit->hier_parent = hier->get_unit(i + 1, parent_index); + } else { + // Setup top layer information (no parent pointers are set) + my_unit->get_my_pr()->hier_id = + index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i), + kmp_hier_layer_e::LAYER_LOOP); + KMP_TEST_THEN_INC32(&(hier->top_level_nproc)); + my_unit->hier_parent = nullptr; + } + // Set trip count to 0 so that next() operation will initially climb up + // the hierarchy to get more iterations (early exit in next() for tc == 0) + my_unit->get_my_pr()->u.p.tc = 0; + // Increment this layer's number of active units + KMP_TEST_THEN_INC32(&(hier->info[i].num_active)); + KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) " + "incrementing num_active\n", + gtid, my_unit)); + } else { + KMP_TEST_THEN_INC32(&(my_unit->active)); + break; + } + } + // Set this thread's id + num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2( + kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0)); + pr->hier_id = tid % num_threads_per_layer1; + // For oversubscribed threads, increment their index within the lowest unit + // 
This is done to prevent having two or more threads with id 0, id 1, etc. + if (tid >= num_hw_threads) + pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1); + KD_TRACE( + 10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n", + gtid, pr->hier_id)); + + pr->flags.contains_last = FALSE; + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + + // Now that the number of active threads at each level is determined, + // the barrier data for each unit can be initialized and the last layer's + // loop information can be initialized. + int prev_id = pr->get_hier_id(); + for (int i = 0; i < n; ++i) { + if (prev_id != 0) + break; + int index = __kmp_dispatch_get_index(tid, hier->get_type(i)); + kmp_hier_top_unit_t *my_unit = hier->get_unit(i, index); + // Only primary threads of this unit within the hierarchy do initialization + KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n", + gtid, i)); + my_unit->reset_shared_barrier(); + my_unit->hier_pr.flags.contains_last = FALSE; + // Last layer, initialize the private buffers with entire loop information + // Now the next next_algorithm() call will get the first chunk of + // iterations properly + if (i == n - 1) { + __kmp_dispatch_init_algorithm( + loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st, +#if USE_ITT_BUILD + NULL, +#endif + hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id()); + } + prev_id = my_unit->get_hier_id(); + } + // Initialize each layer of the thread's private barrier data + kmp_hier_top_unit_t *unit = pr->hier_parent; + for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) { + kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]); + unit->reset_private_barrier(tdata); + } + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + +#ifdef KMP_DEBUG + if (__kmp_tid_from_gtid(gtid) == 0) { + for (int i = 0; i < n; ++i) { + KD_TRACE(10, + ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n", + gtid, i, hier->get_num_active(i))); + } + hier->print(); + } + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); +#endif // KMP_DEBUG +} +#endif diff --git a/third_party/openmp/kmp_environment.cpp b/third_party/openmp/kmp_environment.cpp new file mode 100644 index 000000000..4def6ea9a --- /dev/null +++ b/third_party/openmp/kmp_environment.cpp @@ -0,0 +1,500 @@ +/* + * kmp_environment.cpp -- Handle environment variables OS-independently. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/* We use GetEnvironmentVariable for Windows* OS instead of getenv because the + act of loading a DLL on Windows* OS makes any user-set environment variables + (i.e. with putenv()) unavailable. getenv() apparently gets a clean copy of + the env variables as they existed at the start of the run. JH 12/23/2002 + + On Windows* OS, there are two environments (at least, see below): + + 1. Environment maintained by Windows* OS on IA-32 architecture. Accessible + through GetEnvironmentVariable(), SetEnvironmentVariable(), and + GetEnvironmentStrings(). + + 2. Environment maintained by C RTL. Accessible through getenv(), putenv(). + + putenv() function updates both C and Windows* OS on IA-32 architecture. 
+ getenv() function search for variables in C RTL environment only. + Windows* OS on IA-32 architecture functions work *only* with Windows* OS on + IA-32 architecture. + + Windows* OS on IA-32 architecture maintained by OS, so there is always only + one Windows* OS on IA-32 architecture per process. Changes in Windows* OS on + IA-32 architecture are process-visible. + + C environment maintained by C RTL. Multiple copies of C RTL may be present + in the process, and each C RTL maintains its own environment. :-( + + Thus, proper way to work with environment on Windows* OS is: + + 1. Set variables with putenv() function -- both C and Windows* OS on IA-32 + architecture are being updated. Windows* OS on IA-32 architecture may be + considered primary target, while updating C RTL environment is free bonus. + + 2. Get variables with GetEnvironmentVariable() -- getenv() does not + search Windows* OS on IA-32 architecture, and can not see variables + set with SetEnvironmentVariable(). + + 2007-04-05 -- lev +*/ + +#include "kmp_environment.h" + +#include "kmp.h" // +#include "kmp_i18n.h" +#include "kmp_os.h" // KMP_OS_*. +#include "kmp_str.h" // __kmp_str_*(). + +#if KMP_OS_UNIX +#include // getenv, setenv, unsetenv. +#include // strlen, strcpy. +#if KMP_OS_DARWIN +#include +#define environ (*_NSGetEnviron()) +#else +extern char **environ; +#endif +#elif KMP_OS_WINDOWS +#include // GetEnvironmentVariable, SetEnvironmentVariable, +// GetLastError. +#else +#error Unknown or unsupported OS. +#endif + +// TODO: Eliminate direct memory allocations, use string operations instead. + +static inline void *allocate(size_t size) { + void *ptr = KMP_INTERNAL_MALLOC(size); + if (ptr == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + return ptr; +} // allocate + +char *__kmp_env_get(char const *name) { + + char *result = NULL; + +#if KMP_OS_UNIX + char const *value = getenv(name); + if (value != NULL) { + size_t len = KMP_STRLEN(value) + 1; + result = (char *)KMP_INTERNAL_MALLOC(len); + if (result == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + KMP_STRNCPY_S(result, len, value, len); + } +#elif KMP_OS_WINDOWS + /* We use GetEnvironmentVariable for Windows* OS instead of getenv because the + act of loading a DLL on Windows* OS makes any user-set environment + variables (i.e. with putenv()) unavailable. getenv() apparently gets a + clean copy of the env variables as they existed at the start of the run. + JH 12/23/2002 */ + DWORD rc; + rc = GetEnvironmentVariable(name, NULL, 0); + if (!rc) { + DWORD error = GetLastError(); + if (error != ERROR_ENVVAR_NOT_FOUND) { + __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error), __kmp_msg_null); + } + // Variable is not found, it's ok, just continue. + } else { + DWORD len = rc; + result = (char *)KMP_INTERNAL_MALLOC(len); + if (result == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + rc = GetEnvironmentVariable(name, result, len); + if (!rc) { + // GetEnvironmentVariable() may return 0 if variable is empty. + // In such a case GetLastError() returns ERROR_SUCCESS. + DWORD error = GetLastError(); + if (error != ERROR_SUCCESS) { + // Unexpected error. The variable should be in the environment, + // and buffer should be large enough. + __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error), + __kmp_msg_null); + KMP_INTERNAL_FREE((void *)result); + result = NULL; + } + } + } +#else +#error Unknown or unsupported OS. +#endif + + return result; + +} // func __kmp_env_get + +// TODO: Find and replace all regular free() with __kmp_env_free(). 
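__kmp_env_get hands back a heap copy of the value that the caller must release with __kmp_env_free, which is what the TODO above is pointing at. A hedged sketch of the same copy-out pattern in portable C++; env_get_copy is a made-up stand-in, not the runtime function:

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical stand-in for __kmp_env_get(): return a malloc'ed copy of the
// variable's value, or nullptr if it is unset; the caller owns the copy.
static char *env_get_copy(const char *name) {
  const char *value = std::getenv(name);
  if (!value)
    return nullptr;
  std::size_t len = std::strlen(value) + 1;
  char *copy = static_cast<char *>(std::malloc(len));
  if (copy)
    std::memcpy(copy, value, len);
  return copy;
}

int main() {
  char *threads = env_get_copy("OMP_NUM_THREADS");
  if (threads) {
    std::printf("OMP_NUM_THREADS=%s\n", threads);
    std::free(threads); // pair every copy with exactly one release
  } else {
    std::printf("OMP_NUM_THREADS is not set\n");
  }
}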
+ +void __kmp_env_free(char const **value) { + + KMP_DEBUG_ASSERT(value != NULL); + KMP_INTERNAL_FREE(CCAST(char *, *value)); + *value = NULL; + +} // func __kmp_env_free + +int __kmp_env_exists(char const *name) { + +#if KMP_OS_UNIX + char const *value = getenv(name); + return ((value == NULL) ? (0) : (1)); +#elif KMP_OS_WINDOWS + DWORD rc; + rc = GetEnvironmentVariable(name, NULL, 0); + if (rc == 0) { + DWORD error = GetLastError(); + if (error != ERROR_ENVVAR_NOT_FOUND) { + __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error), __kmp_msg_null); + } + return 0; + } + return 1; +#else +#error Unknown or unsupported OS. +#endif + +} // func __kmp_env_exists + +void __kmp_env_set(char const *name, char const *value, int overwrite) { + +#if KMP_OS_UNIX + int rc = setenv(name, value, overwrite); + if (rc != 0) { + // Dead code. I tried to put too many variables into Linux* OS + // environment on IA-32 architecture. When application consumes + // more than ~2.5 GB of memory, entire system feels bad. Sometimes + // application is killed (by OS?), sometimes system stops + // responding... But this error message never appears. --ln + __kmp_fatal(KMP_MSG(CantSetEnvVar, name), KMP_HNT(NotEnoughMemory), + __kmp_msg_null); + } +#elif KMP_OS_WINDOWS + BOOL rc; + if (!overwrite) { + rc = GetEnvironmentVariable(name, NULL, 0); + if (rc) { + // Variable exists, do not overwrite. + return; + } + DWORD error = GetLastError(); + if (error != ERROR_ENVVAR_NOT_FOUND) { + __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error), __kmp_msg_null); + } + } + rc = SetEnvironmentVariable(name, value); + if (!rc) { + DWORD error = GetLastError(); + __kmp_fatal(KMP_MSG(CantSetEnvVar, name), KMP_ERR(error), __kmp_msg_null); + } +#else +#error Unknown or unsupported OS. +#endif + +} // func __kmp_env_set + +void __kmp_env_unset(char const *name) { + +#if KMP_OS_UNIX + unsetenv(name); +#elif KMP_OS_WINDOWS + BOOL rc = SetEnvironmentVariable(name, NULL); + if (!rc) { + DWORD error = GetLastError(); + __kmp_fatal(KMP_MSG(CantSetEnvVar, name), KMP_ERR(error), __kmp_msg_null); + } +#else +#error Unknown or unsupported OS. +#endif + +} // func __kmp_env_unset + +/* Intel OpenMP RTL string representation of environment: just a string of + characters, variables are separated with vertical bars, e. g.: + + "KMP_WARNINGS=0|KMP_AFFINITY=compact|" + + Empty variables are allowed and ignored: + + "||KMP_WARNINGS=1||" +*/ + +static void +___kmp_env_blk_parse_string(kmp_env_blk_t *block, // M: Env block to fill. + char const *env // I: String to parse. +) { + + char const chr_delimiter = '|'; + char const str_delimiter[] = {chr_delimiter, 0}; + + char *bulk = NULL; + kmp_env_var_t *vars = NULL; + int count = 0; // Number of used elements in vars array. + int delimiters = 0; // Number of delimiters in input string. + + // Copy original string, we will modify the copy. + bulk = __kmp_str_format("%s", env); + + // Loop thru all the vars in environment block. Count delimiters (maximum + // number of variables is number of delimiters plus one). + { + char const *ptr = bulk; + for (;;) { + ptr = strchr(ptr, chr_delimiter); + if (ptr == NULL) { + break; + } + ++delimiters; + ptr += 1; + } + } + + // Allocate vars array. + vars = (kmp_env_var_t *)allocate((delimiters + 1) * sizeof(kmp_env_var_t)); + + // Loop thru all the variables. + { + char *var; // Pointer to variable (both name and value). + char *name; // Pointer to name of variable. + char *value; // Pointer to value. + char *buf; // Buffer for __kmp_str_token() function. 
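___kmp_env_blk_parse_string below splits a "NAME=VALUE|NAME=VALUE|" string in place: size the array from the delimiter count, tokenize on '|', then split each token at '='. A standalone sketch of the same splitting, using POSIX strtok_r and strchr in place of the __kmp_str_* helpers:

#include <cstdio>
#include <cstring>
#include <vector>

struct env_var { const char *name; const char *value; };

// Parse "A=1|B=2||C=3|" into (name, value) pairs; empty segments are skipped,
// matching the "empty variables are allowed and ignored" rule quoted above.
static std::vector<env_var> parse_block(char *bulk) {
  std::vector<env_var> vars;
  char *save = nullptr;
  for (char *tok = strtok_r(bulk, "|", &save); tok;
       tok = strtok_r(nullptr, "|", &save)) {
    char *eq = std::strchr(tok, '=');
    if (eq) {
      *eq = '\0'; // split each token in place, as __kmp_str_split does
      vars.push_back({tok, eq + 1});
    } else {
      vars.push_back({tok, ""}); // a name with no '=' gets an empty value here
    }
  }
  return vars;
}

int main() {
  char bulk[] = "KMP_WARNINGS=0|KMP_AFFINITY=compact|";
  for (const env_var &v : parse_block(bulk))
    std::printf("%s -> %s\n", v.name, v.value);
}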
+ var = __kmp_str_token(bulk, str_delimiter, &buf); // Get the first var. + while (var != NULL) { + // Save found variable in vars array. + __kmp_str_split(var, '=', &name, &value); + KMP_DEBUG_ASSERT(count < delimiters + 1); + vars[count].name = name; + vars[count].value = value; + ++count; + // Get the next var. + var = __kmp_str_token(NULL, str_delimiter, &buf); + } + } + + // Fill out result. + block->bulk = bulk; + block->vars = vars; + block->count = count; +} + +/* Windows* OS (actually, DOS) environment block is a piece of memory with + environment variables. Each variable is terminated with zero byte, entire + block is terminated with one extra zero byte, so we have two zero bytes at + the end of environment block, e. g.: + + "HOME=C:\\users\\lev\x00OS=Windows_NT\x00\x00" + + It is not clear how empty environment is represented. "\x00\x00"? +*/ + +#if KMP_OS_WINDOWS +static void ___kmp_env_blk_parse_windows( + kmp_env_blk_t *block, // M: Env block to fill. + char const *env // I: Pointer to Windows* OS (DOS) environment block. +) { + + char *bulk = NULL; + kmp_env_var_t *vars = NULL; + int count = 0; // Number of used elements in vars array. + int size = 0; // Size of bulk. + + char *name; // Pointer to name of variable. + char *value; // Pointer to value. + + if (env != NULL) { + + // Loop thru all the vars in environment block. Count variables, find size + // of block. + { + char const *var; // Pointer to beginning of var. + int len; // Length of variable. + count = 0; + var = + env; // The first variable starts and beginning of environment block. + len = KMP_STRLEN(var); + while (len != 0) { + ++count; + size = size + len + 1; + var = var + len + + 1; // Move pointer to the beginning of the next variable. + len = KMP_STRLEN(var); + } + size = + size + 1; // Total size of env block, including terminating zero byte. + } + + // Copy original block to bulk, we will modify bulk, not original block. + bulk = (char *)allocate(size); + KMP_MEMCPY_S(bulk, size, env, size); + // Allocate vars array. + vars = (kmp_env_var_t *)allocate(count * sizeof(kmp_env_var_t)); + + // Loop thru all the vars, now in bulk. + { + char *var; // Pointer to beginning of var. + int len; // Length of variable. + count = 0; + var = bulk; + len = KMP_STRLEN(var); + while (len != 0) { + // Save variable in vars array. + __kmp_str_split(var, '=', &name, &value); + vars[count].name = name; + vars[count].value = value; + ++count; + // Get the next var. + var = var + len + 1; + len = KMP_STRLEN(var); + } + } + } + + // Fill out result. + block->bulk = bulk; + block->vars = vars; + block->count = count; +} +#endif + +/* Unix environment block is a array of pointers to variables, last pointer in + array is NULL: + + { "HOME=/home/lev", "TERM=xterm", NULL } +*/ + +#if KMP_OS_UNIX +static void +___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill. + char **env // I: Unix environment to parse. +) { + char *bulk = NULL; + kmp_env_var_t *vars = NULL; + int count = 0; + size_t size = 0; // Size of bulk. + + // Count number of variables and length of required bulk. + { + while (env[count] != NULL) { + size += KMP_STRLEN(env[count]) + 1; + ++count; + } + } + + // Allocate memory. + bulk = (char *)allocate(size); + vars = (kmp_env_var_t *)allocate(count * sizeof(kmp_env_var_t)); + + // Loop thru all the vars. + { + char *var; // Pointer to beginning of var. + char *name; // Pointer to name of variable. + char *value; // Pointer to value. + size_t len; // Length of variable. 
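The Windows parser above walks a block of the form "VAR=VAL\0VAR=VAL\0\0", hopping by strlen(entry) + 1 and stopping at the empty entry. The walk itself is easy to reproduce in isolation:

#include <cstdio>
#include <cstring>

// Walk a DOS/Windows-style environment block: entries are NUL-terminated and
// the block ends with an extra NUL (an empty entry). Returns the entry count.
static int walk_env_block(const char *block) {
  int count = 0;
  for (const char *p = block; *p != '\0'; p += std::strlen(p) + 1) {
    std::printf("entry %d: %s\n", count, p);
    ++count;
  }
  return count;
}

int main() {
  // Same shape as the block quoted in the comment above; the string literal's
  // implicit terminator supplies the second of the two trailing NULs.
  const char block[] = "HOME=C:\\users\\lev\0OS=Windows_NT\0";
  std::printf("%d variables\n", walk_env_block(block));
}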
+ int i; + var = bulk; + for (i = 0; i < count; ++i) { + KMP_ASSERT(var < bulk + size); + [[maybe_unused]] size_t ssize = size - (var - bulk); + // Copy variable to bulk. + len = KMP_STRLEN(env[i]); + KMP_MEMCPY_S(var, ssize, env[i], len + 1); + // Save found variable in vars array. + __kmp_str_split(var, '=', &name, &value); + vars[i].name = name; + vars[i].value = value; + // Move pointer. + var += len + 1; + } + } + + // Fill out result. + block->bulk = bulk; + block->vars = vars; + block->count = count; +} +#endif + +void __kmp_env_blk_init(kmp_env_blk_t *block, // M: Block to initialize. + char const *bulk // I: Initialization string, or NULL. +) { + + if (bulk != NULL) { + ___kmp_env_blk_parse_string(block, bulk); + } else { +#if KMP_OS_UNIX + ___kmp_env_blk_parse_unix(block, environ); +#elif KMP_OS_WINDOWS + { + char *mem = GetEnvironmentStrings(); + if (mem == NULL) { + DWORD error = GetLastError(); + __kmp_fatal(KMP_MSG(CantGetEnvironment), KMP_ERR(error), + __kmp_msg_null); + } + ___kmp_env_blk_parse_windows(block, mem); + FreeEnvironmentStrings(mem); + } +#else +#error Unknown or unsupported OS. +#endif + } + +} // __kmp_env_blk_init + +static int ___kmp_env_var_cmp( // Comparison function for qsort(). + kmp_env_var_t const *lhs, kmp_env_var_t const *rhs) { + return strcmp(lhs->name, rhs->name); +} + +void __kmp_env_blk_sort( + kmp_env_blk_t *block // M: Block of environment variables to sort. +) { + + qsort(CCAST(kmp_env_var_t *, block->vars), block->count, + sizeof(kmp_env_var_t), + (int (*)(void const *, void const *)) & ___kmp_env_var_cmp); + +} // __kmp_env_block_sort + +void __kmp_env_blk_free( + kmp_env_blk_t *block // M: Block of environment variables to free. +) { + + KMP_INTERNAL_FREE(CCAST(kmp_env_var_t *, block->vars)); + __kmp_str_free(&(block->bulk)); + + block->count = 0; + block->vars = NULL; + +} // __kmp_env_blk_free + +char const * // R: Value of variable or NULL if variable does not exist. +__kmp_env_blk_var(kmp_env_blk_t *block, // I: Block of environment variables. + char const *name // I: Name of variable to find. +) { + + int i; + for (i = 0; i < block->count; ++i) { + if (strcmp(block->vars[i].name, name) == 0) { + return block->vars[i].value; + } + } + return NULL; + +} // __kmp_env_block_var + +// end of file // diff --git a/third_party/openmp/kmp_environment.h b/third_party/openmp/kmp_environment.h new file mode 100644 index 000000000..a7ea9e955 --- /dev/null +++ b/third_party/openmp/kmp_environment.h @@ -0,0 +1,77 @@ +/* + * kmp_environment.h -- Handle environment variables OS-independently. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_ENVIRONMENT_H +#define KMP_ENVIRONMENT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Return a copy of the value of environment variable or NULL if the variable +// does not exist. +// *Note*: Returned pointed *must* be freed after use with __kmp_env_free(). +char *__kmp_env_get(char const *name); +void __kmp_env_free(char const **value); + +// Return 1 if the environment variable exists or 0 if does not exist. +int __kmp_env_exists(char const *name); + +// Set the environment variable. 
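/* Illustrative sketch, not part of the upstream patch: intended use of the
   env-block helpers defined above, mirroring the usage comment in
   kmp_environment.h. It assumes the internal runtime symbols are linkable;
   example_env_blk_usage is a hypothetical name. */
#include "kmp_environment.h"

static void example_env_blk_usage(void) {
  kmp_env_blk_t block;
  /* Parse the "|"-separated string form, e.g. "KMP_WARNINGS=0|KMP_AFFINITY=compact|";
     empty entries between bars are ignored. Passing NULL here would parse the
     process environment instead. */
  __kmp_env_blk_init(&block, "KMP_WARNINGS=0|KMP_AFFINITY=compact|");
  __kmp_env_blk_sort(&block);  /* optional: sort entries by name */
  for (int i = 0; i < block.count; ++i) {
    /* block.vars[i].name and block.vars[i].value both point into block.bulk */
  }
  char const *value = __kmp_env_blk_var(&block, "KMP_AFFINITY");  /* "compact" */
  (void)value;
  __kmp_env_blk_free(&block);  /* frees the vars array and the bulk copy */
}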
+void __kmp_env_set(char const *name, char const *value, int overwrite); + +// Unset (remove) environment variable. +void __kmp_env_unset(char const *name); + +// ----------------------------------------------------------------------------- +// Working with environment blocks. + +/* kmp_env_blk_t is read-only collection of environment variables (or + environment-like). Usage: + +kmp_env_blk_t block; +__kmp_env_blk_init( & block, NULL ); // Initialize block from process + // environment. +// or +__kmp_env_blk_init( & block, "KMP_WARNING=1|KMP_AFFINITY=none" ); // from string +__kmp_env_blk_sort( & block ); // Optionally, sort list. +for ( i = 0; i < block.count; ++ i ) { + // Process block.vars[ i ].name and block.vars[ i ].value... +} +__kmp_env_block_free( & block ); +*/ + +struct __kmp_env_var { + char *name; + char *value; +}; +typedef struct __kmp_env_var kmp_env_var_t; + +struct __kmp_env_blk { + char *bulk; + kmp_env_var_t *vars; + int count; +}; +typedef struct __kmp_env_blk kmp_env_blk_t; + +void __kmp_env_blk_init(kmp_env_blk_t *block, char const *bulk); +void __kmp_env_blk_free(kmp_env_blk_t *block); +void __kmp_env_blk_sort(kmp_env_blk_t *block); +char const *__kmp_env_blk_var(kmp_env_blk_t *block, char const *name); + +#ifdef __cplusplus +} +#endif + +#endif // KMP_ENVIRONMENT_H + +// end of file // diff --git a/third_party/openmp/kmp_error.cpp b/third_party/openmp/kmp_error.cpp new file mode 100644 index 000000000..cf5749dfd --- /dev/null +++ b/third_party/openmp/kmp_error.cpp @@ -0,0 +1,451 @@ +/* + * kmp_error.cpp -- KPTS functions for error checking at runtime + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_str.h" + +/* ------------------------------------------------------------------------ */ + +#define MIN_STACK 100 + +static char const *cons_text_c[] = { + "(none)", + "\"parallel\"", + "work-sharing", /* this is not called "for" + because of lowering of + "sections" pragmas */ + "\"ordered\" work-sharing", /* this is not called "for ordered" because of + lowering of "sections" pragmas */ + "\"sections\"", + "work-sharing", /* this is not called "single" because of lowering of + "sections" pragmas */ + "\"critical\"", + "\"ordered\"", /* in PARALLEL */ + "\"ordered\"", /* in PDO */ + "\"master\"", + "\"reduce\"", + "\"barrier\"", + "\"masked\""}; + +#define get_src(ident) ((ident) == NULL ? NULL : (ident)->psource) + +#define PUSH_MSG(ct, ident) \ + "\tpushing on stack: %s (%s)\n", cons_text_c[(ct)], get_src((ident)) +#define POP_MSG(p) \ + "\tpopping off stack: %s (%s)\n", cons_text_c[(p)->stack_data[tos].type], \ + get_src((p)->stack_data[tos].ident) + +static int const cons_text_c_num = sizeof(cons_text_c) / sizeof(char const *); + +/* --------------- START OF STATIC LOCAL ROUTINES ------------------------- */ + +static void __kmp_check_null_func(void) { /* nothing to do */ +} + +static void __kmp_expand_cons_stack(int gtid, struct cons_header *p) { + int i; + struct cons_data *d; + + /* TODO for monitor perhaps? 
*/ + if (gtid < 0) + __kmp_check_null_func(); + + KE_TRACE(10, ("expand cons_stack (%d %d)\n", gtid, __kmp_get_gtid())); + + d = p->stack_data; + + p->stack_size = (p->stack_size * 2) + 100; + + /* TODO free the old data */ + p->stack_data = (struct cons_data *)__kmp_allocate(sizeof(struct cons_data) * + (p->stack_size + 1)); + + for (i = p->stack_top; i >= 0; --i) + p->stack_data[i] = d[i]; + + /* NOTE: we do not free the old stack_data */ +} + +// NOTE: Function returns allocated memory, caller must free it! +static char *__kmp_pragma(int ct, ident_t const *ident) { + char const *cons = NULL; // Construct name. + char *file = NULL; // File name. + char *func = NULL; // Function (routine) name. + char *line = NULL; // Line number. + kmp_str_buf_t buffer; + kmp_msg_t prgm; + __kmp_str_buf_init(&buffer); + if (0 < ct && ct < cons_text_c_num) { + cons = cons_text_c[ct]; + } else { + KMP_DEBUG_ASSERT(0); + } + if (ident != NULL && ident->psource != NULL) { + char *tail = NULL; + __kmp_str_buf_print(&buffer, "%s", + ident->psource); // Copy source to buffer. + // Split string in buffer to file, func, and line. + tail = buffer.str; + __kmp_str_split(tail, ';', NULL, &tail); + __kmp_str_split(tail, ';', &file, &tail); + __kmp_str_split(tail, ';', &func, &tail); + __kmp_str_split(tail, ';', &line, &tail); + } + prgm = __kmp_msg_format(kmp_i18n_fmt_Pragma, cons, file, func, line); + __kmp_str_buf_free(&buffer); + return prgm.str; +} // __kmp_pragma + +/* ----------------- END OF STATIC LOCAL ROUTINES ------------------------- */ + +void __kmp_error_construct(kmp_i18n_id_t id, // Message identifier. + enum cons_type ct, // Construct type. + ident_t const *ident // Construct ident. +) { + char *construct = __kmp_pragma(ct, ident); + __kmp_fatal(__kmp_msg_format(id, construct), __kmp_msg_null); + KMP_INTERNAL_FREE(construct); +} + +void __kmp_error_construct2(kmp_i18n_id_t id, // Message identifier. + enum cons_type ct, // First construct type. + ident_t const *ident, // First construct ident. + struct cons_data const *cons // Second construct. +) { + char *construct1 = __kmp_pragma(ct, ident); + char *construct2 = __kmp_pragma(cons->type, cons->ident); + __kmp_fatal(__kmp_msg_format(id, construct1, construct2), __kmp_msg_null); + KMP_INTERNAL_FREE(construct1); + KMP_INTERNAL_FREE(construct2); +} + +struct cons_header *__kmp_allocate_cons_stack(int gtid) { + struct cons_header *p; + + /* TODO for monitor perhaps? 
*/ + if (gtid < 0) { + __kmp_check_null_func(); + } + KE_TRACE(10, ("allocate cons_stack (%d)\n", gtid)); + p = (struct cons_header *)__kmp_allocate(sizeof(struct cons_header)); + p->p_top = p->w_top = p->s_top = 0; + p->stack_data = (struct cons_data *)__kmp_allocate(sizeof(struct cons_data) * + (MIN_STACK + 1)); + p->stack_size = MIN_STACK; + p->stack_top = 0; + p->stack_data[0].type = ct_none; + p->stack_data[0].prev = 0; + p->stack_data[0].ident = NULL; + return p; +} + +void __kmp_free_cons_stack(void *ptr) { + struct cons_header *p = (struct cons_header *)ptr; + if (p != NULL) { + if (p->stack_data != NULL) { + __kmp_free(p->stack_data); + p->stack_data = NULL; + } + __kmp_free(p); + } +} + +#if KMP_DEBUG +static void dump_cons_stack(int gtid, struct cons_header *p) { + int i; + int tos = p->stack_top; + kmp_str_buf_t buffer; + __kmp_str_buf_init(&buffer); + __kmp_str_buf_print( + &buffer, + "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n"); + __kmp_str_buf_print(&buffer, + "Begin construct stack with %d items for thread %d\n", + tos, gtid); + __kmp_str_buf_print(&buffer, " stack_top=%d { P=%d, W=%d, S=%d }\n", tos, + p->p_top, p->w_top, p->s_top); + for (i = tos; i > 0; i--) { + struct cons_data *c = &(p->stack_data[i]); + __kmp_str_buf_print( + &buffer, " stack_data[%2d] = { %s (%s) %d %p }\n", i, + cons_text_c[c->type], get_src(c->ident), c->prev, c->name); + } + __kmp_str_buf_print(&buffer, "End construct stack for thread %d\n", gtid); + __kmp_str_buf_print( + &buffer, + "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n"); + __kmp_debug_printf("%s", buffer.str); + __kmp_str_buf_free(&buffer); +} +#endif + +void __kmp_push_parallel(int gtid, ident_t const *ident) { + int tos; + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + + KMP_DEBUG_ASSERT(__kmp_threads[gtid]->th.th_cons); + KE_TRACE(10, ("__kmp_push_parallel (%d %d)\n", gtid, __kmp_get_gtid())); + KE_TRACE(100, (PUSH_MSG(ct_parallel, ident))); + if (p->stack_top >= p->stack_size) { + __kmp_expand_cons_stack(gtid, p); + } + tos = ++p->stack_top; + p->stack_data[tos].type = ct_parallel; + p->stack_data[tos].prev = p->p_top; + p->stack_data[tos].ident = ident; + p->stack_data[tos].name = NULL; + p->p_top = tos; + KE_DUMP(1000, dump_cons_stack(gtid, p)); +} + +void __kmp_check_workshare(int gtid, enum cons_type ct, ident_t const *ident) { + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + + KMP_DEBUG_ASSERT(__kmp_threads[gtid]->th.th_cons); + KE_TRACE(10, ("__kmp_check_workshare (%d %d)\n", gtid, __kmp_get_gtid())); + + if (p->stack_top >= p->stack_size) { + __kmp_expand_cons_stack(gtid, p); + } + if (p->w_top > p->p_top) { + // We are already in a WORKSHARE construct for this PARALLEL region. + __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, + &p->stack_data[p->w_top]); + } + if (p->s_top > p->p_top) { + // We are already in a SYNC construct for this PARALLEL region. 
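/* Illustrative sketch, not part of the upstream patch: the idea behind the
   cons_header bookkeeping above -- one stack_data array holds every pushed
   construct, while p_top/w_top/s_top plus a per-entry "prev" index thread
   three logical stacks (parallel, worksharing, sync) through that single
   array. The mini_* names are hypothetical simplifications, not runtime API;
   only the worksharing sub-stack is shown. */
#include <assert.h>

enum mini_type { MINI_NONE, MINI_PARALLEL, MINI_WORK, MINI_SYNC };
struct mini_entry { enum mini_type type; int prev; };
struct mini_cons {
  struct mini_entry data[16];
  int top;   /* index of the most recently pushed entry of any kind */
  int w_top; /* head of the worksharing sub-stack (0 means empty) */
};

static void mini_init(struct mini_cons *c) {
  c->data[0].type = MINI_NONE; /* index 0 is a sentinel, like stack_data[0] */
  c->data[0].prev = 0;
  c->top = c->w_top = 0;
}

static void mini_push_work(struct mini_cons *c) {
  int tos = ++c->top;
  assert(tos < 16);
  c->data[tos].type = MINI_WORK;
  c->data[tos].prev = c->w_top; /* remember the previous worksharing entry */
  c->w_top = tos;               /* this entry is now the worksharing head */
}

static void mini_pop_work(struct mini_cons *c) {
  int tos = c->top;
  assert(tos == c->w_top && c->data[tos].type == MINI_WORK);
  c->w_top = c->data[tos].prev; /* unlink from the worksharing sub-stack */
  c->data[tos].type = MINI_NONE;
  c->top = tos - 1;
}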
+ __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, + &p->stack_data[p->s_top]); + } +} + +void __kmp_push_workshare(int gtid, enum cons_type ct, ident_t const *ident) { + int tos; + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + KE_TRACE(10, ("__kmp_push_workshare (%d %d)\n", gtid, __kmp_get_gtid())); + __kmp_check_workshare(gtid, ct, ident); + KE_TRACE(100, (PUSH_MSG(ct, ident))); + tos = ++p->stack_top; + p->stack_data[tos].type = ct; + p->stack_data[tos].prev = p->w_top; + p->stack_data[tos].ident = ident; + p->stack_data[tos].name = NULL; + p->w_top = tos; + KE_DUMP(1000, dump_cons_stack(gtid, p)); +} + +void +#if KMP_USE_DYNAMIC_LOCK +__kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck, kmp_uint32 seq ) +#else +__kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck ) +#endif +{ + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + + KE_TRACE(10, ("__kmp_check_sync (gtid=%d)\n", __kmp_get_gtid())); + + if (p->stack_top >= p->stack_size) + __kmp_expand_cons_stack(gtid, p); + + if (ct == ct_ordered_in_parallel || ct == ct_ordered_in_pdo) { + if (p->w_top <= p->p_top) { +/* we are not in a worksharing construct */ +#ifdef BUILD_PARALLEL_ORDERED + /* do not report error messages for PARALLEL ORDERED */ + KMP_ASSERT(ct == ct_ordered_in_parallel); +#else + __kmp_error_construct(kmp_i18n_msg_CnsBoundToWorksharing, ct, ident); +#endif /* BUILD_PARALLEL_ORDERED */ + } else { + /* inside a WORKSHARING construct for this PARALLEL region */ + if (!IS_CONS_TYPE_ORDERED(p->stack_data[p->w_top].type)) { + __kmp_error_construct2(kmp_i18n_msg_CnsNoOrderedClause, ct, ident, + &p->stack_data[p->w_top]); + } + } + if (p->s_top > p->p_top && p->s_top > p->w_top) { + /* inside a sync construct which is inside a worksharing construct */ + int index = p->s_top; + enum cons_type stack_type; + + stack_type = p->stack_data[index].type; + + if (stack_type == ct_critical || + ((stack_type == ct_ordered_in_parallel || + stack_type == ct_ordered_in_pdo) && + /* C doesn't allow named ordered; ordered in ordered gets error */ + p->stack_data[index].ident != NULL && + (p->stack_data[index].ident->flags & KMP_IDENT_KMPC))) { + /* we are in ORDERED which is inside an ORDERED or CRITICAL construct */ + __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, + &p->stack_data[index]); + } + } + } else if (ct == ct_critical) { +#if KMP_USE_DYNAMIC_LOCK + if (lck != NULL && + __kmp_get_user_lock_owner(lck, seq) == + gtid) { /* this thread already has lock for this critical section */ +#else + if (lck != NULL && + __kmp_get_user_lock_owner(lck) == + gtid) { /* this thread already has lock for this critical section */ +#endif + int index = p->s_top; + struct cons_data cons = {NULL, ct_critical, 0, NULL}; + /* walk up construct stack and try to find critical with matching name */ + while (index != 0 && p->stack_data[index].name != lck) { + index = p->stack_data[index].prev; + } + if (index != 0) { + /* found match on the stack (may not always because of interleaved + * critical for Fortran) */ + cons = p->stack_data[index]; + } + /* we are in CRITICAL which is inside a CRITICAL construct of same name */ + __kmp_error_construct2(kmp_i18n_msg_CnsNestingSameName, ct, ident, &cons); + } + } else if (ct == ct_master || ct == ct_masked || ct == ct_reduce) { + if (p->w_top > p->p_top) { + /* inside a WORKSHARING construct for this PARALLEL region */ + __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, + 
&p->stack_data[p->w_top]); + } + if (ct == ct_reduce && p->s_top > p->p_top) { + /* inside a another SYNC construct for this PARALLEL region */ + __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, + &p->stack_data[p->s_top]); + } + } +} + +void +#if KMP_USE_DYNAMIC_LOCK +__kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck, kmp_uint32 seq ) +#else +__kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck ) +#endif +{ + int tos; + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + + KMP_ASSERT(gtid == __kmp_get_gtid()); + KE_TRACE(10, ("__kmp_push_sync (gtid=%d)\n", gtid)); +#if KMP_USE_DYNAMIC_LOCK + __kmp_check_sync(gtid, ct, ident, lck, seq); +#else + __kmp_check_sync(gtid, ct, ident, lck); +#endif + KE_TRACE(100, (PUSH_MSG(ct, ident))); + tos = ++p->stack_top; + p->stack_data[tos].type = ct; + p->stack_data[tos].prev = p->s_top; + p->stack_data[tos].ident = ident; + p->stack_data[tos].name = lck; + p->s_top = tos; + KE_DUMP(1000, dump_cons_stack(gtid, p)); +} + +/* ------------------------------------------------------------------------ */ + +void __kmp_pop_parallel(int gtid, ident_t const *ident) { + int tos; + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + tos = p->stack_top; + KE_TRACE(10, ("__kmp_pop_parallel (%d %d)\n", gtid, __kmp_get_gtid())); + if (tos == 0 || p->p_top == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsDetectedEnd, ct_parallel, ident); + } + if (tos != p->p_top || p->stack_data[tos].type != ct_parallel) { + __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct_parallel, ident, + &p->stack_data[tos]); + } + KE_TRACE(100, (POP_MSG(p))); + p->p_top = p->stack_data[tos].prev; + p->stack_data[tos].type = ct_none; + p->stack_data[tos].ident = NULL; + p->stack_top = tos - 1; + KE_DUMP(1000, dump_cons_stack(gtid, p)); +} + +enum cons_type __kmp_pop_workshare(int gtid, enum cons_type ct, + ident_t const *ident) { + int tos; + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + + tos = p->stack_top; + KE_TRACE(10, ("__kmp_pop_workshare (%d %d)\n", gtid, __kmp_get_gtid())); + if (tos == 0 || p->w_top == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsDetectedEnd, ct, ident); + } + + if (tos != p->w_top || + (p->stack_data[tos].type != ct && + // below is the exception to the rule that construct types must match + !(p->stack_data[tos].type == ct_pdo_ordered && ct == ct_pdo))) { + __kmp_check_null_func(); + __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct, ident, + &p->stack_data[tos]); + } + KE_TRACE(100, (POP_MSG(p))); + p->w_top = p->stack_data[tos].prev; + p->stack_data[tos].type = ct_none; + p->stack_data[tos].ident = NULL; + p->stack_top = tos - 1; + KE_DUMP(1000, dump_cons_stack(gtid, p)); + return p->stack_data[p->w_top].type; +} + +void __kmp_pop_sync(int gtid, enum cons_type ct, ident_t const *ident) { + int tos; + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + tos = p->stack_top; + KE_TRACE(10, ("__kmp_pop_sync (%d %d)\n", gtid, __kmp_get_gtid())); + if (tos == 0 || p->s_top == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsDetectedEnd, ct, ident); + } + if (tos != p->s_top || p->stack_data[tos].type != ct) { + __kmp_check_null_func(); + __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct, ident, + &p->stack_data[tos]); + } + KE_TRACE(100, (POP_MSG(p))); + p->s_top = p->stack_data[tos].prev; + p->stack_data[tos].type = ct_none; + p->stack_data[tos].ident = NULL; + p->stack_top = tos - 1; + KE_DUMP(1000, dump_cons_stack(gtid, p)); +} + +/* 
------------------------------------------------------------------------ */ + +void __kmp_check_barrier(int gtid, enum cons_type ct, ident_t const *ident) { + struct cons_header *p = __kmp_threads[gtid]->th.th_cons; + KE_TRACE(10, ("__kmp_check_barrier (loc: %p, gtid: %d %d)\n", ident, gtid, + __kmp_get_gtid())); + if (ident != 0) { + __kmp_check_null_func(); + } + if (p->w_top > p->p_top) { + /* we are already in a WORKSHARING construct for this PARALLEL region */ + __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, + &p->stack_data[p->w_top]); + } + if (p->s_top > p->p_top) { + /* we are already in a SYNC construct for this PARALLEL region */ + __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident, + &p->stack_data[p->s_top]); + } +} diff --git a/third_party/openmp/kmp_error.h b/third_party/openmp/kmp_error.h new file mode 100644 index 000000000..fe6fd3429 --- /dev/null +++ b/third_party/openmp/kmp_error.h @@ -0,0 +1,60 @@ +/* + * kmp_error.h -- PTS functions for error checking at runtime. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_ERROR_H +#define KMP_ERROR_H + +#include "kmp_i18n.h" + +/* ------------------------------------------------------------------------ */ +#ifdef __cplusplus +extern "C" { +#endif + +void __kmp_error_construct(kmp_i18n_id_t id, enum cons_type ct, + ident_t const *ident); +void __kmp_error_construct2(kmp_i18n_id_t id, enum cons_type ct, + ident_t const *ident, struct cons_data const *cons); + +struct cons_header *__kmp_allocate_cons_stack(int gtid); +void __kmp_free_cons_stack(void *ptr); + +void __kmp_push_parallel(int gtid, ident_t const *ident); +void __kmp_push_workshare(int gtid, enum cons_type ct, ident_t const *ident); +#if KMP_USE_DYNAMIC_LOCK +void __kmp_push_sync(int gtid, enum cons_type ct, ident_t const *ident, + kmp_user_lock_p name, kmp_uint32); +#else +void __kmp_push_sync(int gtid, enum cons_type ct, ident_t const *ident, + kmp_user_lock_p name); +#endif + +void __kmp_check_workshare(int gtid, enum cons_type ct, ident_t const *ident); +#if KMP_USE_DYNAMIC_LOCK +void __kmp_check_sync(int gtid, enum cons_type ct, ident_t const *ident, + kmp_user_lock_p name, kmp_uint32); +#else +void __kmp_check_sync(int gtid, enum cons_type ct, ident_t const *ident, + kmp_user_lock_p name); +#endif + +void __kmp_pop_parallel(int gtid, ident_t const *ident); +enum cons_type __kmp_pop_workshare(int gtid, enum cons_type ct, + ident_t const *ident); +void __kmp_pop_sync(int gtid, enum cons_type ct, ident_t const *ident); +void __kmp_check_barrier(int gtid, enum cons_type ct, ident_t const *ident); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // KMP_ERROR_H diff --git a/third_party/openmp/kmp_ftn_cdecl.cpp b/third_party/openmp/kmp_ftn_cdecl.cpp new file mode 100644 index 000000000..cf1d429a9 --- /dev/null +++ b/third_party/openmp/kmp_ftn_cdecl.cpp @@ -0,0 +1,34 @@ +/* + * kmp_ftn_cdecl.cpp -- Fortran __cdecl linkage support for OpenMP. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_affinity.h" + +#if KMP_OS_WINDOWS +#if defined KMP_WIN_CDECL || !KMP_DYNAMIC_LIB +#define KMP_FTN_ENTRIES KMP_FTN_UPPER +#endif +#elif KMP_OS_UNIX +#define KMP_FTN_ENTRIES KMP_FTN_PLAIN +#endif + +// Note: This string is not printed when KMP_VERSION=1. +char const __kmp_version_ftncdecl[] = + KMP_VERSION_PREFIX "Fortran __cdecl OMP support: " +#ifdef KMP_FTN_ENTRIES + "yes"; +#define FTN_STDCALL /* no stdcall */ +#include "kmp_ftn_os.h" +#include "kmp_ftn_entry.h" +#else + "no"; +#endif /* KMP_FTN_ENTRIES */ diff --git a/third_party/openmp/kmp_ftn_entry.h b/third_party/openmp/kmp_ftn_entry.h new file mode 100644 index 000000000..713561734 --- /dev/null +++ b/third_party/openmp/kmp_ftn_entry.h @@ -0,0 +1,1783 @@ +/* + * kmp_ftn_entry.h -- Fortran entry linkage support for OpenMP. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FTN_STDCALL +#error The support file kmp_ftn_entry.h should not be compiled by itself. +#endif + +#ifdef KMP_STUB +#include "kmp_stub.h" +#endif + +#include "kmp_i18n.h" + +// For affinity format functions +#include "kmp_io.h" +#include "kmp_str.h" + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* For compatibility with the Gnu/MS Open MP codegen, omp_set_num_threads(), + * omp_set_nested(), and omp_set_dynamic() [in lowercase on MS, and w/o + * a trailing underscore on Linux* OS] take call by value integer arguments. + * + omp_set_max_active_levels() + * + omp_set_schedule() + * + * For backward compatibility with 9.1 and previous Intel compiler, these + * entry points take call by reference integer arguments. */ +#ifdef KMP_GOMP_COMPAT +#if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_UPPER) +#define PASS_ARGS_BY_VALUE 1 +#endif +#endif +#if KMP_OS_WINDOWS +#if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_APPEND) +#define PASS_ARGS_BY_VALUE 1 +#endif +#endif + +// This macro helps to reduce code duplication. +#ifdef PASS_ARGS_BY_VALUE +#define KMP_DEREF +#else +#define KMP_DEREF * +#endif + +// For API with specific C vs. Fortran interfaces (ompc_* exists in +// kmp_csupport.cpp), only create GOMP versioned symbols of the API for the +// APPEND Fortran entries in this file. The GOMP versioned symbols of the C API +// will take place where the ompc_* functions are defined. 
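/* Illustrative sketch, not part of the upstream patch: how the KMP_DEREF
   macro defined above lets one function body serve both Fortran linkages --
   call by value (GOMP/MS style) and call by reference (classic Intel style).
   MY_BY_VALUE, MY_DEREF and ftn_example_set are hypothetical stand-ins for
   PASS_ARGS_BY_VALUE, KMP_DEREF and the FTN_* entry points. */
#ifdef MY_BY_VALUE
#define MY_DEREF   /* argument arrives by value: use it directly */
#else
#define MY_DEREF * /* argument arrives by reference: dereference it */
#endif

static int g_example_icv;

static void ftn_example_set(int MY_DEREF arg) {
  g_example_icv = MY_DEREF arg; /* expands to either "arg" or "*arg" */
}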
+#if KMP_FTN_ENTRIES == KMP_FTN_APPEND +#define KMP_EXPAND_NAME_IF_APPEND(name) KMP_EXPAND_NAME(name) +#else +#define KMP_EXPAND_NAME_IF_APPEND(name) name +#endif + +void FTN_STDCALL FTN_SET_STACKSIZE(int KMP_DEREF arg) { +#ifdef KMP_STUB + __kmps_set_stacksize(KMP_DEREF arg); +#else + // __kmp_aux_set_stacksize initializes the library if needed + __kmp_aux_set_stacksize((size_t)KMP_DEREF arg); +#endif +} + +void FTN_STDCALL FTN_SET_STACKSIZE_S(size_t KMP_DEREF arg) { +#ifdef KMP_STUB + __kmps_set_stacksize(KMP_DEREF arg); +#else + // __kmp_aux_set_stacksize initializes the library if needed + __kmp_aux_set_stacksize(KMP_DEREF arg); +#endif +} + +int FTN_STDCALL FTN_GET_STACKSIZE(void) { +#ifdef KMP_STUB + return (int)__kmps_get_stacksize(); +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return (int)__kmp_stksize; +#endif +} + +size_t FTN_STDCALL FTN_GET_STACKSIZE_S(void) { +#ifdef KMP_STUB + return __kmps_get_stacksize(); +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return __kmp_stksize; +#endif +} + +void FTN_STDCALL FTN_SET_BLOCKTIME(int KMP_DEREF arg) { +#ifdef KMP_STUB + __kmps_set_blocktime(KMP_DEREF arg); +#else + int gtid, tid, bt = (KMP_DEREF arg); + kmp_info_t *thread; + + gtid = __kmp_entry_gtid(); + tid = __kmp_tid_from_gtid(gtid); + thread = __kmp_thread_from_gtid(gtid); + + __kmp_aux_convert_blocktime(&bt); + __kmp_aux_set_blocktime(bt, thread, tid); +#endif +} + +// Gets blocktime in units used for KMP_BLOCKTIME, ms otherwise +int FTN_STDCALL FTN_GET_BLOCKTIME(void) { +#ifdef KMP_STUB + return __kmps_get_blocktime(); +#else + int gtid, tid; + kmp_team_p *team; + + gtid = __kmp_entry_gtid(); + tid = __kmp_tid_from_gtid(gtid); + team = __kmp_threads[gtid]->th.th_team; + + /* These must match the settings used in __kmp_wait_sleep() */ + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { + KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid, + team->t.t_id, tid, KMP_MAX_BLOCKTIME, __kmp_blocktime_units)); + return KMP_MAX_BLOCKTIME; + } +#ifdef KMP_ADJUST_BLOCKTIME + else if (__kmp_zero_bt && !get__bt_set(team, tid)) { + KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid, + team->t.t_id, tid, 0, __kmp_blocktime_units)); + return 0; + } +#endif /* KMP_ADJUST_BLOCKTIME */ + else { + int bt = get__blocktime(team, tid); + if (__kmp_blocktime_units == 'm') + bt = bt / 1000; + KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid, + team->t.t_id, tid, bt, __kmp_blocktime_units)); + return bt; + } +#endif +} + +void FTN_STDCALL FTN_SET_LIBRARY_SERIAL(void) { +#ifdef KMP_STUB + __kmps_set_library(library_serial); +#else + // __kmp_user_set_library initializes the library if needed + __kmp_user_set_library(library_serial); +#endif +} + +void FTN_STDCALL FTN_SET_LIBRARY_TURNAROUND(void) { +#ifdef KMP_STUB + __kmps_set_library(library_turnaround); +#else + // __kmp_user_set_library initializes the library if needed + __kmp_user_set_library(library_turnaround); +#endif +} + +void FTN_STDCALL FTN_SET_LIBRARY_THROUGHPUT(void) { +#ifdef KMP_STUB + __kmps_set_library(library_throughput); +#else + // __kmp_user_set_library initializes the library if needed + __kmp_user_set_library(library_throughput); +#endif +} + +void FTN_STDCALL FTN_SET_LIBRARY(int KMP_DEREF arg) { +#ifdef KMP_STUB + __kmps_set_library(KMP_DEREF arg); +#else + enum library_type lib; + lib = (enum library_type)KMP_DEREF arg; + // __kmp_user_set_library initializes the library if needed + __kmp_user_set_library(lib); +#endif 
+} + +int FTN_STDCALL FTN_GET_LIBRARY(void) { +#ifdef KMP_STUB + return __kmps_get_library(); +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return ((int)__kmp_library); +#endif +} + +void FTN_STDCALL FTN_SET_DISP_NUM_BUFFERS(int KMP_DEREF arg) { +#ifdef KMP_STUB + ; // empty routine +#else + // ignore after initialization because some teams have already + // allocated dispatch buffers + int num_buffers = KMP_DEREF arg; + if (__kmp_init_serial == FALSE && num_buffers >= KMP_MIN_DISP_NUM_BUFF && + num_buffers <= KMP_MAX_DISP_NUM_BUFF) { + __kmp_dispatch_num_buffers = num_buffers; + } +#endif +} + +int FTN_STDCALL FTN_SET_AFFINITY(void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return __kmp_aux_set_affinity(mask); +#endif +} + +int FTN_STDCALL FTN_GET_AFFINITY(void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + int gtid = __kmp_get_gtid(); + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && + __kmp_affinity.flags.reset) { + __kmp_reset_root_init_mask(gtid); + } + return __kmp_aux_get_affinity(mask); +#endif +} + +int FTN_STDCALL FTN_GET_AFFINITY_MAX_PROC(void) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return 0; +#else + // We really only NEED serial initialization here. + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return __kmp_aux_get_affinity_max_proc(); +#endif +} + +void FTN_STDCALL FTN_CREATE_AFFINITY_MASK(void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + *mask = NULL; +#else + // We really only NEED serial initialization here. + kmp_affin_mask_t *mask_internals; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + mask_internals = __kmp_affinity_dispatch->allocate_mask(); + KMP_CPU_ZERO(mask_internals); + *mask = mask_internals; +#endif +} + +void FTN_STDCALL FTN_DESTROY_AFFINITY_MASK(void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED +// Nothing +#else + // We really only NEED serial initialization here. 
+ kmp_affin_mask_t *mask_internals; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + if (__kmp_env_consistency_check) { + if (*mask == NULL) { + KMP_FATAL(AffinityInvalidMask, "kmp_destroy_affinity_mask"); + } + } + mask_internals = (kmp_affin_mask_t *)(*mask); + __kmp_affinity_dispatch->deallocate_mask(mask_internals); + *mask = NULL; +#endif +} + +int FTN_STDCALL FTN_SET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return __kmp_aux_set_affinity_mask_proc(KMP_DEREF proc, mask); +#endif +} + +int FTN_STDCALL FTN_UNSET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return __kmp_aux_unset_affinity_mask_proc(KMP_DEREF proc, mask); +#endif +} + +int FTN_STDCALL FTN_GET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + return __kmp_aux_get_affinity_mask_proc(KMP_DEREF proc, mask); +#endif +} + +/* ------------------------------------------------------------------------ */ + +/* sets the requested number of threads for the next parallel region */ +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NUM_THREADS)(int KMP_DEREF arg) { +#ifdef KMP_STUB +// Nothing. +#else + __kmp_set_num_threads(KMP_DEREF arg, __kmp_entry_gtid()); +#endif +} + +/* returns the number of threads in current team */ +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_THREADS)(void) { +#ifdef KMP_STUB + return 1; +#else + // __kmpc_bound_num_threads initializes the library if needed + return __kmpc_bound_num_threads(NULL); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_THREADS)(void) { +#ifdef KMP_STUB + return 1; +#else + int gtid; + kmp_info_t *thread; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + gtid = __kmp_entry_gtid(); + thread = __kmp_threads[gtid]; +#if KMP_AFFINITY_SUPPORTED + if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) { + __kmp_assign_root_init_mask(); + } +#endif + // return thread -> th.th_team -> t.t_current_task[ + // thread->th.th_info.ds.ds_tid ] -> icvs.nproc; + return thread->th.th_current_task->td_icvs.nproc; +#endif +} + +int FTN_STDCALL FTN_CONTROL_TOOL(int command, int modifier, void *arg) { +#if defined(KMP_STUB) || !OMPT_SUPPORT + return -2; +#else + OMPT_STORE_RETURN_ADDRESS(__kmp_entry_gtid()); + if (!TCR_4(__kmp_init_middle)) { + return -2; + } + kmp_info_t *this_thr = __kmp_threads[__kmp_entry_gtid()]; + ompt_task_info_t *parent_task_info = OMPT_CUR_TASK_INFO(this_thr); + parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + int ret = __kmp_control_tool(command, modifier, arg); + parent_task_info->frame.enter_frame.ptr = 0; + return ret; +#endif +} + +/* OpenMP 5.0 Memory Management support */ +omp_allocator_handle_t FTN_STDCALL +FTN_INIT_ALLOCATOR(omp_memspace_handle_t KMP_DEREF m, int KMP_DEREF ntraits, + omp_alloctrait_t tr[]) { +#ifdef KMP_STUB + return NULL; +#else + return __kmpc_init_allocator(__kmp_entry_gtid(), KMP_DEREF m, + KMP_DEREF ntraits, tr); +#endif +} + +void FTN_STDCALL FTN_DESTROY_ALLOCATOR(omp_allocator_handle_t al) { +#ifndef 
KMP_STUB + __kmpc_destroy_allocator(__kmp_entry_gtid(), al); +#endif +} +void FTN_STDCALL FTN_SET_DEFAULT_ALLOCATOR(omp_allocator_handle_t al) { +#ifndef KMP_STUB + __kmpc_set_default_allocator(__kmp_entry_gtid(), al); +#endif +} +omp_allocator_handle_t FTN_STDCALL FTN_GET_DEFAULT_ALLOCATOR(void) { +#ifdef KMP_STUB + return NULL; +#else + return __kmpc_get_default_allocator(__kmp_entry_gtid()); +#endif +} + +/* OpenMP 5.0 affinity format support */ +#ifndef KMP_STUB +static void __kmp_fortran_strncpy_truncate(char *buffer, size_t buf_size, + char const *csrc, size_t csrc_size) { + size_t capped_src_size = csrc_size; + if (csrc_size >= buf_size) { + capped_src_size = buf_size - 1; + } + KMP_STRNCPY_S(buffer, buf_size, csrc, capped_src_size); + if (csrc_size >= buf_size) { + KMP_DEBUG_ASSERT(buffer[buf_size - 1] == '\0'); + buffer[buf_size - 1] = csrc[buf_size - 1]; + } else { + for (size_t i = csrc_size; i < buf_size; ++i) + buffer[i] = ' '; + } +} + +// Convert a Fortran string to a C string by adding null byte +class ConvertedString { + char *buf; + kmp_info_t *th; + +public: + ConvertedString(char const *fortran_str, size_t size) { + th = __kmp_get_thread(); + buf = (char *)__kmp_thread_malloc(th, size + 1); + KMP_STRNCPY_S(buf, size + 1, fortran_str, size); + buf[size] = '\0'; + } + ~ConvertedString() { __kmp_thread_free(th, buf); } + const char *get() const { return buf; } +}; +#endif // KMP_STUB + +/* + * Set the value of the affinity-format-var ICV on the current device to the + * format specified in the argument. + */ +void FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_SET_AFFINITY_FORMAT)( + char const *format, size_t size) { +#ifdef KMP_STUB + return; +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + ConvertedString cformat(format, size); + // Since the __kmp_affinity_format variable is a C string, do not + // use the fortran strncpy function + __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, + cformat.get(), KMP_STRLEN(cformat.get())); +#endif +} + +/* + * Returns the number of characters required to hold the entire affinity format + * specification (not including null byte character) and writes the value of the + * affinity-format-var ICV on the current device to buffer. If the return value + * is larger than size, the affinity format specification is truncated. + */ +size_t FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_GET_AFFINITY_FORMAT)( + char *buffer, size_t size) { +#ifdef KMP_STUB + return 0; +#else + size_t format_size; + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + format_size = KMP_STRLEN(__kmp_affinity_format); + if (buffer && size) { + __kmp_fortran_strncpy_truncate(buffer, size, __kmp_affinity_format, + format_size); + } + return format_size; +#endif +} + +/* + * Prints the thread affinity information of the current thread in the format + * specified by the format argument. If the format is NULL or a zero-length + * string, the value of the affinity-format-var ICV is used. 
+ */ +void FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_DISPLAY_AFFINITY)( + char const *format, size_t size) { +#ifdef KMP_STUB + return; +#else + int gtid; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && + __kmp_affinity.flags.reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif + ConvertedString cformat(format, size); + __kmp_aux_display_affinity(gtid, cformat.get()); +#endif +} + +/* + * Returns the number of characters required to hold the entire affinity format + * specification (not including null byte) and prints the thread affinity + * information of the current thread into the character string buffer with the + * size of size in the format specified by the format argument. If the format is + * NULL or a zero-length string, the value of the affinity-format-var ICV is + * used. The buffer must be allocated prior to calling the routine. If the + * return value is larger than size, the affinity format specification is + * truncated. + */ +size_t FTN_STDCALL KMP_EXPAND_NAME_IF_APPEND(FTN_CAPTURE_AFFINITY)( + char *buffer, char const *format, size_t buf_size, size_t for_size) { +#if defined(KMP_STUB) + return 0; +#else + int gtid; + size_t num_required; + kmp_str_buf_t capture_buf; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + __kmp_assign_root_init_mask(); + gtid = __kmp_get_gtid(); +#if KMP_AFFINITY_SUPPORTED + if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 && + __kmp_affinity.flags.reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif + __kmp_str_buf_init(&capture_buf); + ConvertedString cformat(format, for_size); + num_required = __kmp_aux_capture_affinity(gtid, cformat.get(), &capture_buf); + if (buffer && buf_size) { + __kmp_fortran_strncpy_truncate(buffer, buf_size, capture_buf.str, + capture_buf.used); + } + __kmp_str_buf_free(&capture_buf); + return num_required; +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) { +#ifdef KMP_STUB + return 0; +#else + int gtid; + +#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX + gtid = __kmp_entry_gtid(); +#elif KMP_OS_WINDOWS + if (!__kmp_init_parallel || + (gtid = (int)((kmp_intptr_t)TlsGetValue(__kmp_gtid_threadprivate_key))) == + 0) { + // Either library isn't initialized or thread is not registered + // 0 is the correct TID in this case + return 0; + } + --gtid; // We keep (gtid+1) in TLS +#elif KMP_OS_LINUX || KMP_OS_WASI +#ifdef KMP_TDATA_GTID + if (__kmp_gtid_mode >= 3) { + if ((gtid = __kmp_gtid) == KMP_GTID_DNE) { + return 0; + } + } else { +#endif + if (!__kmp_init_parallel || + (gtid = (int)((kmp_intptr_t)( + pthread_getspecific(__kmp_gtid_threadprivate_key)))) == 0) { + return 0; + } + --gtid; +#ifdef KMP_TDATA_GTID + } +#endif +#else +#error Unknown or unsupported OS +#endif + + return __kmp_tid_from_gtid(gtid); +#endif +} + +int FTN_STDCALL FTN_GET_NUM_KNOWN_THREADS(void) { +#ifdef KMP_STUB + return 1; +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + /* NOTE: this is not syncronized, so it can change at any moment */ + /* NOTE: this number also includes threads preallocated in hot-teams */ + return TCR_4(__kmp_nth); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PROCS)(void) { +#ifdef KMP_STUB + return 1; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } +#if 
KMP_AFFINITY_SUPPORTED + if (!__kmp_affinity.flags.reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } +#endif + return __kmp_avail_proc; +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NESTED)(int KMP_DEREF flag) { +#ifdef KMP_STUB + __kmps_set_nested(KMP_DEREF flag); +#else + kmp_info_t *thread; + /* For the thread-private internal controls implementation */ + thread = __kmp_entry_thread(); + KMP_INFORM(APIDeprecated, "omp_set_nested", "omp_set_max_active_levels"); + __kmp_save_internal_controls(thread); + // Somewhat arbitrarily decide where to get a value for max_active_levels + int max_active_levels = get__max_active_levels(thread); + if (max_active_levels == 1) + max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; + set__max_active_levels(thread, (KMP_DEREF flag) ? max_active_levels : 1); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NESTED)(void) { +#ifdef KMP_STUB + return __kmps_get_nested(); +#else + kmp_info_t *thread; + thread = __kmp_entry_thread(); + KMP_INFORM(APIDeprecated, "omp_get_nested", "omp_get_max_active_levels"); + return get__max_active_levels(thread) > 1; +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DYNAMIC)(int KMP_DEREF flag) { +#ifdef KMP_STUB + __kmps_set_dynamic(KMP_DEREF flag ? TRUE : FALSE); +#else + kmp_info_t *thread; + /* For the thread-private implementation of the internal controls */ + thread = __kmp_entry_thread(); + // !!! What if foreign thread calls it? + __kmp_save_internal_controls(thread); + set__dynamic(thread, KMP_DEREF flag ? true : false); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_DYNAMIC)(void) { +#ifdef KMP_STUB + return __kmps_get_dynamic(); +#else + kmp_info_t *thread; + thread = __kmp_entry_thread(); + return get__dynamic(thread); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_IN_PARALLEL)(void) { +#ifdef KMP_STUB + return 0; +#else + kmp_info_t *th = __kmp_entry_thread(); + if (th->th.th_teams_microtask) { + // AC: r_in_parallel does not work inside teams construct where real + // parallel is inactive, but all threads have same root, so setting it in + // one team affects other teams. + // The solution is to use per-team nesting level + return (th->th.th_team->t.t_active_level ? 1 : 0); + } else + return (th->th.th_root->r.r_in_parallel ? FTN_TRUE : FTN_FALSE); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_SCHEDULE)(kmp_sched_t KMP_DEREF kind, + int KMP_DEREF modifier) { +#ifdef KMP_STUB + __kmps_set_schedule(KMP_DEREF kind, KMP_DEREF modifier); +#else + /* TO DO: For the per-task implementation of the internal controls */ + __kmp_set_schedule(__kmp_entry_gtid(), KMP_DEREF kind, KMP_DEREF modifier); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_SCHEDULE)(kmp_sched_t *kind, + int *modifier) { +#ifdef KMP_STUB + __kmps_get_schedule(kind, modifier); +#else + /* TO DO: For the per-task implementation of the internal controls */ + __kmp_get_schedule(__kmp_entry_gtid(), kind, modifier); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_MAX_ACTIVE_LEVELS)(int KMP_DEREF arg) { +#ifdef KMP_STUB +// Nothing. 
+#else + /* TO DO: We want per-task implementation of this internal control */ + __kmp_set_max_active_levels(__kmp_entry_gtid(), KMP_DEREF arg); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_ACTIVE_LEVELS)(void) { +#ifdef KMP_STUB + return 0; +#else + /* TO DO: We want per-task implementation of this internal control */ + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + return __kmp_get_max_active_levels(__kmp_entry_gtid()); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_ACTIVE_LEVEL)(void) { +#ifdef KMP_STUB + return 0; // returns 0 if it is called from the sequential part of the program +#else + /* TO DO: For the per-task implementation of the internal controls */ + return __kmp_entry_thread()->th.th_team->t.t_active_level; +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_LEVEL)(void) { +#ifdef KMP_STUB + return 0; // returns 0 if it is called from the sequential part of the program +#else + /* TO DO: For the per-task implementation of the internal controls */ + return __kmp_entry_thread()->th.th_team->t.t_level; +#endif +} + +int FTN_STDCALL +KMP_EXPAND_NAME(FTN_GET_ANCESTOR_THREAD_NUM)(int KMP_DEREF level) { +#ifdef KMP_STUB + return (KMP_DEREF level) ? (-1) : (0); +#else + return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), KMP_DEREF level); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_TEAM_SIZE)(int KMP_DEREF level) { +#ifdef KMP_STUB + return (KMP_DEREF level) ? (-1) : (1); +#else + return __kmp_get_team_size(__kmp_entry_gtid(), KMP_DEREF level); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) { +#ifdef KMP_STUB + return 1; // TO DO: clarify whether it returns 1 or 0? +#else + int gtid; + kmp_info_t *thread; + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + + gtid = __kmp_entry_gtid(); + thread = __kmp_threads[gtid]; + // If thread_limit for the target task is defined, return that instead of the + // regular task thread_limit + if (int thread_limit = thread->th.th_current_task->td_icvs.task_thread_limit) + return thread_limit; + return thread->th.th_current_task->td_icvs.thread_limit; +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_IN_FINAL)(void) { +#ifdef KMP_STUB + return 0; // TO DO: clarify whether it returns 1 or 0? 
+#else + if (!TCR_4(__kmp_init_parallel)) { + return 0; + } + return __kmp_entry_thread()->th.th_current_task->td_flags.final; +#endif +} + +kmp_proc_bind_t FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PROC_BIND)(void) { +#ifdef KMP_STUB + return __kmps_get_proc_bind(); +#else + return get__proc_bind(__kmp_entry_thread()); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return 0; +#else + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + if (!KMP_AFFINITY_CAPABLE()) + return 0; + if (!__kmp_affinity.flags.reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } + return __kmp_affinity.num_masks; +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return 0; +#else + int i; + int retval = 0; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + if (!KMP_AFFINITY_CAPABLE()) + return 0; + if (!__kmp_affinity.flags.reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } + if (place_num < 0 || place_num >= (int)__kmp_affinity.num_masks) + return 0; + kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity.masks, place_num); + KMP_CPU_SET_ITERATE(i, mask) { + if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) || + (!KMP_CPU_ISSET(i, mask))) { + continue; + } + ++retval; + } + return retval; +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num, + int *ids) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED +// Nothing. 
+#else + int i, j; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + if (!KMP_AFFINITY_CAPABLE()) + return; + if (!__kmp_affinity.flags.reset) { + // only bind root here if its affinity reset is not requested + int gtid = __kmp_entry_gtid(); + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread->th.th_team->t.t_level == 0) { + __kmp_assign_root_init_mask(); + } + } + if (place_num < 0 || place_num >= (int)__kmp_affinity.num_masks) + return; + kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity.masks, place_num); + j = 0; + KMP_CPU_SET_ITERATE(i, mask) { + if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) || + (!KMP_CPU_ISSET(i, mask))) { + continue; + } + ids[j++] = i; + } +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM)(void) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return -1; +#else + int gtid; + kmp_info_t *thread; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + if (!KMP_AFFINITY_CAPABLE()) + return -1; + gtid = __kmp_entry_gtid(); + thread = __kmp_thread_from_gtid(gtid); + if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) { + __kmp_assign_root_init_mask(); + } + if (thread->th.th_current_place < 0) + return -1; + return thread->th.th_current_place; +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED + return 0; +#else + int gtid, num_places, first_place, last_place; + kmp_info_t *thread; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + if (!KMP_AFFINITY_CAPABLE()) + return 0; + gtid = __kmp_entry_gtid(); + thread = __kmp_thread_from_gtid(gtid); + if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) { + __kmp_assign_root_init_mask(); + } + first_place = thread->th.th_first_place; + last_place = thread->th.th_last_place; + if (first_place < 0 || last_place < 0) + return 0; + if (first_place <= last_place) + num_places = last_place - first_place + 1; + else + num_places = __kmp_affinity.num_masks - first_place + last_place + 1; + return num_places; +#endif +} + +void FTN_STDCALL +KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) { +#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED +// Nothing. 
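/* Illustrative sketch, not part of the upstream patch: the wrap-around count
   computed by omp_get_partition_num_places above. With 8 places in total, a
   partition spanning first_place=6 .. last_place=2 covers {6, 7, 0, 1, 2},
   i.e. num_masks - first_place + last_place + 1 = 8 - 6 + 2 + 1 = 5.
   partition_num_places is a hypothetical standalone restatement. */
static int partition_num_places(int num_masks, int first_place, int last_place) {
  if (first_place < 0 || last_place < 0)
    return 0;
  if (first_place <= last_place)
    return last_place - first_place + 1;             /* contiguous range */
  return num_masks - first_place + last_place + 1;   /* range wraps past the end */
}
/* partition_num_places(8, 6, 2) == 5 */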
+#else + int i, gtid, place_num, first_place, last_place, start, end; + kmp_info_t *thread; + if (!TCR_4(__kmp_init_middle)) { + __kmp_middle_initialize(); + } + if (!KMP_AFFINITY_CAPABLE()) + return; + gtid = __kmp_entry_gtid(); + thread = __kmp_thread_from_gtid(gtid); + if (thread->th.th_team->t.t_level == 0 && !__kmp_affinity.flags.reset) { + __kmp_assign_root_init_mask(); + } + first_place = thread->th.th_first_place; + last_place = thread->th.th_last_place; + if (first_place < 0 || last_place < 0) + return; + if (first_place <= last_place) { + start = first_place; + end = last_place; + } else { + start = last_place; + end = first_place; + } + for (i = 0, place_num = start; place_num <= end; ++place_num, ++i) { + place_nums[i] = place_num; + } +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_TEAMS)(void) { +#ifdef KMP_STUB + return 1; +#else + return __kmp_aux_get_num_teams(); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_TEAM_NUM)(void) { +#ifdef KMP_STUB + return 0; +#else + return __kmp_aux_get_team_num(); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_DEFAULT_DEVICE)(void) { +#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB) + return 0; +#else + return __kmp_entry_thread()->th.th_current_task->td_icvs.default_device; +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) { +#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB) +// Nothing. +#else + __kmp_entry_thread()->th.th_current_task->td_icvs.default_device = + KMP_DEREF arg; +#endif +} + +// Get number of NON-HOST devices. +// libomptarget, if loaded, provides this function in api.cpp. +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) + KMP_WEAK_ATTRIBUTE_EXTERNAL; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) { +#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return 0; +#else + int (*fptr)(); + if ((*(void **)(&fptr) = KMP_DLSYM("__tgt_get_num_devices"))) { + return (*fptr)(); + } else if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_num_devices"))) { + return (*fptr)(); + } else if ((*(void **)(&fptr) = KMP_DLSYM("_Offload_number_of_devices"))) { + return (*fptr)(); + } else { // liboffload & libomptarget don't exist + return 0; + } +#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) +} + +// This function always returns true when called on host device. +// Compiler/libomptarget should handle when it is called inside target region. 
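/* Illustrative sketch, not part of the upstream patch: the probe-and-fall-back
   pattern used by omp_get_num_devices above, expressed with plain POSIX
   dlsym() instead of the KMP_DLSYM/KMP_DLSYM_NEXT wrappers. If no offload
   runtime (libomptarget) is loaded, the lookups fail and the host-only default
   of 0 devices is returned. query_num_devices is a hypothetical name. */
#define _GNU_SOURCE /* for RTLD_DEFAULT/RTLD_NEXT on glibc */
#include <dlfcn.h>

static int query_num_devices(void) {
  int (*fptr)(void);
  /* Search symbols already loaded anywhere in the process. */
  if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "__tgt_get_num_devices")))
    return fptr();
  /* Look past the current object, e.g. for libomptarget's omp_get_num_devices. */
  if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_num_devices")))
    return fptr();
  return 0; /* no offload runtime present: host only */
}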
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) + KMP_WEAK_ATTRIBUTE_EXTERNAL; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) { + return 1; // This is the host +} + +// libomptarget, if loaded, provides this function +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)(void) + KMP_WEAK_ATTRIBUTE_EXTERNAL; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)(void) { + // same as omp_get_num_devices() + return KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(); +} + +#if defined(KMP_STUB) +// Entries for stubs library +// As all *target* functions are C-only parameters always passed by value +void *FTN_STDCALL FTN_TARGET_ALLOC(size_t size, int device_num) { return 0; } + +void FTN_STDCALL FTN_TARGET_FREE(void *device_ptr, int device_num) {} + +int FTN_STDCALL FTN_TARGET_IS_PRESENT(void *ptr, int device_num) { return 0; } + +int FTN_STDCALL FTN_TARGET_MEMCPY(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device) { + return -1; +} + +int FTN_STDCALL FTN_TARGET_MEMCPY_RECT( + void *dst, void *src, size_t element_size, int num_dims, + const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets, + const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device, + int src_device) { + return -1; +} + +int FTN_STDCALL FTN_TARGET_ASSOCIATE_PTR(void *host_ptr, void *device_ptr, + size_t size, size_t device_offset, + int device_num) { + return -1; +} + +int FTN_STDCALL FTN_TARGET_DISASSOCIATE_PTR(void *host_ptr, int device_num) { + return -1; +} +#endif // defined(KMP_STUB) + +#ifdef KMP_STUB +typedef enum { UNINIT = -1, UNLOCKED, LOCKED } kmp_stub_lock_t; +#endif /* KMP_STUB */ + +#if KMP_USE_DYNAMIC_LOCK +void FTN_STDCALL FTN_INIT_LOCK_WITH_HINT(void **user_lock, + uintptr_t KMP_DEREF hint) { +#ifdef KMP_STUB + *((kmp_stub_lock_t *)user_lock) = UNLOCKED; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_init_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint); +#endif +} + +void FTN_STDCALL FTN_INIT_NEST_LOCK_WITH_HINT(void **user_lock, + uintptr_t KMP_DEREF hint) { +#ifdef KMP_STUB + *((kmp_stub_lock_t *)user_lock) = UNLOCKED; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_init_nest_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint); +#endif +} +#endif + +/* initialize the lock */ +void FTN_STDCALL KMP_EXPAND_NAME(FTN_INIT_LOCK)(void **user_lock) { +#ifdef KMP_STUB + *((kmp_stub_lock_t *)user_lock) = UNLOCKED; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_init_lock(NULL, gtid, user_lock); +#endif +} + +/* initialize the lock */ +void FTN_STDCALL KMP_EXPAND_NAME(FTN_INIT_NEST_LOCK)(void **user_lock) { +#ifdef KMP_STUB + *((kmp_stub_lock_t *)user_lock) = UNLOCKED; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_init_nest_lock(NULL, gtid, user_lock); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_DESTROY_LOCK)(void **user_lock) { +#ifdef KMP_STUB + *((kmp_stub_lock_t *)user_lock) = UNINIT; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_destroy_lock(NULL, gtid, user_lock); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_DESTROY_NEST_LOCK)(void **user_lock) { +#ifdef KMP_STUB + *((kmp_stub_lock_t 
*)user_lock) = UNINIT; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_destroy_nest_lock(NULL, gtid, user_lock); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_LOCK)(void **user_lock) { +#ifdef KMP_STUB + if (*((kmp_stub_lock_t *)user_lock) == UNINIT) { + // TODO: Issue an error. + } + if (*((kmp_stub_lock_t *)user_lock) != UNLOCKED) { + // TODO: Issue an error. + } + *((kmp_stub_lock_t *)user_lock) = LOCKED; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_set_lock(NULL, gtid, user_lock); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NEST_LOCK)(void **user_lock) { +#ifdef KMP_STUB + if (*((kmp_stub_lock_t *)user_lock) == UNINIT) { + // TODO: Issue an error. + } + (*((int *)user_lock))++; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_set_nest_lock(NULL, gtid, user_lock); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_UNSET_LOCK)(void **user_lock) { +#ifdef KMP_STUB + if (*((kmp_stub_lock_t *)user_lock) == UNINIT) { + // TODO: Issue an error. + } + if (*((kmp_stub_lock_t *)user_lock) == UNLOCKED) { + // TODO: Issue an error. + } + *((kmp_stub_lock_t *)user_lock) = UNLOCKED; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_unset_lock(NULL, gtid, user_lock); +#endif +} + +void FTN_STDCALL KMP_EXPAND_NAME(FTN_UNSET_NEST_LOCK)(void **user_lock) { +#ifdef KMP_STUB + if (*((kmp_stub_lock_t *)user_lock) == UNINIT) { + // TODO: Issue an error. + } + if (*((kmp_stub_lock_t *)user_lock) == UNLOCKED) { + // TODO: Issue an error. + } + (*((int *)user_lock))--; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_unset_nest_lock(NULL, gtid, user_lock); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_TEST_LOCK)(void **user_lock) { +#ifdef KMP_STUB + if (*((kmp_stub_lock_t *)user_lock) == UNINIT) { + // TODO: Issue an error. + } + if (*((kmp_stub_lock_t *)user_lock) == LOCKED) { + return 0; + } + *((kmp_stub_lock_t *)user_lock) = LOCKED; + return 1; +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmpc_test_lock(NULL, gtid, user_lock); +#endif +} + +int FTN_STDCALL KMP_EXPAND_NAME(FTN_TEST_NEST_LOCK)(void **user_lock) { +#ifdef KMP_STUB + if (*((kmp_stub_lock_t *)user_lock) == UNINIT) { + // TODO: Issue an error. + } + return ++(*((int *)user_lock)); +#else + int gtid = __kmp_entry_gtid(); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmpc_test_nest_lock(NULL, gtid, user_lock); +#endif +} + +double FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_WTIME)(void) { +#ifdef KMP_STUB + return __kmps_get_wtime(); +#else + double data; +#if !KMP_OS_LINUX + // We don't need library initialization to get the time on Linux* OS. 
The + // routine can be used to measure library initialization time on Linux* OS now + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } +#endif + __kmp_elapsed(&data); + return data; +#endif +} + +double FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_WTICK)(void) { +#ifdef KMP_STUB + return __kmps_get_wtick(); +#else + double data; + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + __kmp_elapsed_tick(&data); + return data; +#endif +} + +/* ------------------------------------------------------------------------ */ + +void *FTN_STDCALL FTN_MALLOC(size_t KMP_DEREF size) { + // kmpc_malloc initializes the library if needed + return kmpc_malloc(KMP_DEREF size); +} + +void *FTN_STDCALL FTN_ALIGNED_MALLOC(size_t KMP_DEREF size, + size_t KMP_DEREF alignment) { + // kmpc_aligned_malloc initializes the library if needed + return kmpc_aligned_malloc(KMP_DEREF size, KMP_DEREF alignment); +} + +void *FTN_STDCALL FTN_CALLOC(size_t KMP_DEREF nelem, size_t KMP_DEREF elsize) { + // kmpc_calloc initializes the library if needed + return kmpc_calloc(KMP_DEREF nelem, KMP_DEREF elsize); +} + +void *FTN_STDCALL FTN_REALLOC(void *KMP_DEREF ptr, size_t KMP_DEREF size) { + // kmpc_realloc initializes the library if needed + return kmpc_realloc(KMP_DEREF ptr, KMP_DEREF size); +} + +void FTN_STDCALL FTN_KFREE(void *KMP_DEREF ptr) { + // does nothing if the library is not initialized + kmpc_free(KMP_DEREF ptr); +} + +void FTN_STDCALL FTN_SET_WARNINGS_ON(void) { +#ifndef KMP_STUB + __kmp_generate_warnings = kmp_warnings_explicit; +#endif +} + +void FTN_STDCALL FTN_SET_WARNINGS_OFF(void) { +#ifndef KMP_STUB + __kmp_generate_warnings = FALSE; +#endif +} + +void FTN_STDCALL FTN_SET_DEFAULTS(char const *str +#ifndef PASS_ARGS_BY_VALUE + , + int len +#endif +) { +#ifndef KMP_STUB +#ifdef PASS_ARGS_BY_VALUE + int len = (int)KMP_STRLEN(str); +#endif + __kmp_aux_set_defaults(str, len); +#endif +} + +/* ------------------------------------------------------------------------ */ + +/* returns the status of cancellation */ +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_CANCELLATION)(void) { +#ifdef KMP_STUB + return 0 /* false */; +#else + // initialize the library if needed + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return __kmp_omp_cancellation; +#endif +} + +int FTN_STDCALL FTN_GET_CANCELLATION_STATUS(int cancel_kind) { +#ifdef KMP_STUB + return 0 /* false */; +#else + return __kmp_get_cancellation_status(cancel_kind); +#endif +} + +/* returns the maximum allowed task priority */ +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) { +#ifdef KMP_STUB + return 0; +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return __kmp_max_task_priority; +#endif +} + +// This function will be defined in libomptarget. When libomptarget is not +// loaded, we assume we are on the host and return KMP_HOST_DEVICE. +// Compiler/libomptarget will handle this if called inside target. 
+int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; +int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { + return KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)(); +} + +// Compiler will ensure that this is only called from host in sequential region +int FTN_STDCALL KMP_EXPAND_NAME(FTN_PAUSE_RESOURCE)(kmp_pause_status_t kind, + int device_num) { +#ifdef KMP_STUB + return 1; // just fail +#else + if (device_num == KMP_EXPAND_NAME(FTN_GET_INITIAL_DEVICE)()) + return __kmpc_pause_resource(kind); + else { + int (*fptr)(kmp_pause_status_t, int); + if ((*(void **)(&fptr) = KMP_DLSYM("tgt_pause_resource"))) + return (*fptr)(kind, device_num); + else + return 1; // just fail if there is no libomptarget + } +#endif +} + +// Compiler will ensure that this is only called from host in sequential region +int FTN_STDCALL + KMP_EXPAND_NAME(FTN_PAUSE_RESOURCE_ALL)(kmp_pause_status_t kind) { +#ifdef KMP_STUB + return 1; // just fail +#else + int fails = 0; + int (*fptr)(kmp_pause_status_t, int); + if ((*(void **)(&fptr) = KMP_DLSYM("tgt_pause_resource"))) + fails = (*fptr)(kind, KMP_DEVICE_ALL); // pause devices + fails += __kmpc_pause_resource(kind); // pause host + return fails; +#endif +} + +// Returns the maximum number of nesting levels supported by implementation +int FTN_STDCALL FTN_GET_SUPPORTED_ACTIVE_LEVELS(void) { +#ifdef KMP_STUB + return 1; +#else + return KMP_MAX_ACTIVE_LEVELS_LIMIT; +#endif +} + +void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) { +#ifndef KMP_STUB + __kmp_fulfill_event(event); +#endif +} + +// nteams-var per-device ICV +void FTN_STDCALL FTN_SET_NUM_TEAMS(int KMP_DEREF num_teams) { +#ifdef KMP_STUB +// Nothing. +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + __kmp_set_num_teams(KMP_DEREF num_teams); +#endif +} +int FTN_STDCALL FTN_GET_MAX_TEAMS(void) { +#ifdef KMP_STUB + return 1; +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return __kmp_get_max_teams(); +#endif +} +// teams-thread-limit-var per-device ICV +void FTN_STDCALL FTN_SET_TEAMS_THREAD_LIMIT(int KMP_DEREF limit) { +#ifdef KMP_STUB +// Nothing. 
+#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + __kmp_set_teams_thread_limit(KMP_DEREF limit); +#endif +} +int FTN_STDCALL FTN_GET_TEAMS_THREAD_LIMIT(void) { +#ifdef KMP_STUB + return 1; +#else + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + return __kmp_get_teams_thread_limit(); +#endif +} + +/// TODO: Include the `omp.h` of the current build +/* OpenMP 5.1 interop */ +typedef intptr_t omp_intptr_t; + +/* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined + * properties */ +typedef enum omp_interop_property { + omp_ipr_fr_id = -1, + omp_ipr_fr_name = -2, + omp_ipr_vendor = -3, + omp_ipr_vendor_name = -4, + omp_ipr_device_num = -5, + omp_ipr_platform = -6, + omp_ipr_device = -7, + omp_ipr_device_context = -8, + omp_ipr_targetsync = -9, + omp_ipr_first = -9 +} omp_interop_property_t; + +#define omp_interop_none 0 + +typedef enum omp_interop_rc { + omp_irc_no_value = 1, + omp_irc_success = 0, + omp_irc_empty = -1, + omp_irc_out_of_range = -2, + omp_irc_type_int = -3, + omp_irc_type_ptr = -4, + omp_irc_type_str = -5, + omp_irc_other = -6 +} omp_interop_rc_t; + +typedef enum omp_interop_fr { + omp_ifr_cuda = 1, + omp_ifr_cuda_driver = 2, + omp_ifr_opencl = 3, + omp_ifr_sycl = 4, + omp_ifr_hip = 5, + omp_ifr_level_zero = 6, + omp_ifr_last = 7 +} omp_interop_fr_t; + +typedef void *omp_interop_t; + +// libomptarget, if loaded, provides this function +int FTN_STDCALL FTN_GET_NUM_INTEROP_PROPERTIES(const omp_interop_t interop) { +#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return 0; +#else + int (*fptr)(const omp_interop_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_num_interop_properties"))) + return (*fptr)(interop); + return 0; +#endif +} + +/// TODO Convert FTN_GET_INTEROP_XXX functions into a macro like interop.cpp +// libomptarget, if loaded, provides this function +intptr_t FTN_STDCALL FTN_GET_INTEROP_INT(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { +#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return 0; +#else + intptr_t (*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_int"))) + return (*fptr)(interop, property_id, err); + return 0; +#endif +} + +// libomptarget, if loaded, provides this function +void *FTN_STDCALL FTN_GET_INTEROP_PTR(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { +#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return nullptr; +#else + void *(*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_ptr"))) + return (*fptr)(interop, property_id, err); + return nullptr; +#endif +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_STR(const omp_interop_t interop, + omp_interop_property_t property_id, + int *err) { +#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return nullptr; +#else + const char *(*fptr)(const omp_interop_t, omp_interop_property_t, int *); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_str"))) + return (*fptr)(interop, property_id, err); + return nullptr; +#endif +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_NAME( + const omp_interop_t interop, omp_interop_property_t property_id) { +#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return nullptr; +#else + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void 
**)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_name"))) + return (*fptr)(interop, property_id); + return nullptr; +#endif +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_TYPE_DESC( + const omp_interop_t interop, omp_interop_property_t property_id) { +#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return nullptr; +#else + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_type_desc"))) + return (*fptr)(interop, property_id); + return nullptr; +#endif +} + +// libomptarget, if loaded, provides this function +const char *FTN_STDCALL FTN_GET_INTEROP_RC_DESC( + const omp_interop_t interop, omp_interop_property_t property_id) { +#if KMP_OS_DARWIN || KMP_OS_WASI || defined(KMP_STUB) + return nullptr; +#else + const char *(*fptr)(const omp_interop_t, omp_interop_property_t); + if ((*(void **)(&fptr) = KMP_DLSYM_NEXT("omp_get_interop_rec_desc"))) + return (*fptr)(interop, property_id); + return nullptr; +#endif +} + +// display environment variables when requested +void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) { +#ifndef KMP_STUB + __kmp_omp_display_env(verbose); +#endif +} + +int FTN_STDCALL FTN_IN_EXPLICIT_TASK(void) { +#ifdef KMP_STUB + return 0; +#else + int gtid = __kmp_entry_gtid(); + return __kmp_thread_from_gtid(gtid)->th.th_current_task->td_flags.tasktype; +#endif +} + +// GCC compatibility (versioned symbols) +#ifdef KMP_USE_VERSION_SYMBOLS + +/* These following sections create versioned symbols for the + omp_* routines. The KMP_VERSION_SYMBOL macro expands the API name and + then maps it to a versioned symbol. + libgomp ``versions'' its symbols (OMP_1.0, OMP_2.0, OMP_3.0, ...) while also + retaining the default version which libomp uses: VERSION (defined in + exports_so.txt). 
If you want to see the versioned symbols for libgomp.so.1 + then just type: + + objdump -T /path/to/libgomp.so.1 | grep omp_ + + Example: + Step 1) Create __kmp_api_omp_set_num_threads_10_alias which is alias of + __kmp_api_omp_set_num_threads + Step 2) Set __kmp_api_omp_set_num_threads_10_alias to version: + omp_set_num_threads@OMP_1.0 + Step 2B) Set __kmp_api_omp_set_num_threads to default version: + omp_set_num_threads@@VERSION +*/ + +// OMP_1.0 versioned symbols +KMP_VERSION_SYMBOL(FTN_SET_NUM_THREADS, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_GET_NUM_THREADS, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_GET_MAX_THREADS, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_GET_THREAD_NUM, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_GET_NUM_PROCS, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_IN_PARALLEL, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_SET_DYNAMIC, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_GET_DYNAMIC, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_SET_NESTED, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_GET_NESTED, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_INIT_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_INIT_NEST_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_DESTROY_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_DESTROY_NEST_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_SET_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_SET_NEST_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_UNSET_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_UNSET_NEST_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_TEST_LOCK, 10, "OMP_1.0"); +KMP_VERSION_SYMBOL(FTN_TEST_NEST_LOCK, 10, "OMP_1.0"); + +// OMP_2.0 versioned symbols +KMP_VERSION_SYMBOL(FTN_GET_WTICK, 20, "OMP_2.0"); +KMP_VERSION_SYMBOL(FTN_GET_WTIME, 20, "OMP_2.0"); + +// OMP_3.0 versioned symbols +KMP_VERSION_SYMBOL(FTN_SET_SCHEDULE, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_GET_SCHEDULE, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_GET_THREAD_LIMIT, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_SET_MAX_ACTIVE_LEVELS, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_GET_MAX_ACTIVE_LEVELS, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_GET_ANCESTOR_THREAD_NUM, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_GET_LEVEL, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_GET_TEAM_SIZE, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_GET_ACTIVE_LEVEL, 30, "OMP_3.0"); + +// the lock routines have a 1.0 and 3.0 version +KMP_VERSION_SYMBOL(FTN_INIT_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_INIT_NEST_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_DESTROY_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_DESTROY_NEST_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_SET_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_SET_NEST_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_UNSET_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_UNSET_NEST_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_TEST_LOCK, 30, "OMP_3.0"); +KMP_VERSION_SYMBOL(FTN_TEST_NEST_LOCK, 30, "OMP_3.0"); + +// OMP_3.1 versioned symbol +KMP_VERSION_SYMBOL(FTN_IN_FINAL, 31, "OMP_3.1"); + +// OMP_4.0 versioned symbols +KMP_VERSION_SYMBOL(FTN_GET_PROC_BIND, 40, "OMP_4.0"); +KMP_VERSION_SYMBOL(FTN_GET_NUM_TEAMS, 40, "OMP_4.0"); +KMP_VERSION_SYMBOL(FTN_GET_TEAM_NUM, 40, "OMP_4.0"); +KMP_VERSION_SYMBOL(FTN_GET_CANCELLATION, 40, "OMP_4.0"); +KMP_VERSION_SYMBOL(FTN_GET_DEFAULT_DEVICE, 40, "OMP_4.0"); +KMP_VERSION_SYMBOL(FTN_SET_DEFAULT_DEVICE, 40, "OMP_4.0"); +KMP_VERSION_SYMBOL(FTN_IS_INITIAL_DEVICE, 40, "OMP_4.0"); +KMP_VERSION_SYMBOL(FTN_GET_NUM_DEVICES, 40, "OMP_4.0"); + +// OMP_4.5 versioned symbols +KMP_VERSION_SYMBOL(FTN_GET_MAX_TASK_PRIORITY, 45, "OMP_4.5"); +KMP_VERSION_SYMBOL(FTN_GET_NUM_PLACES, 45, 
"OMP_4.5"); +KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM_PROCS, 45, "OMP_4.5"); +KMP_VERSION_SYMBOL(FTN_GET_PLACE_PROC_IDS, 45, "OMP_4.5"); +KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM, 45, "OMP_4.5"); +KMP_VERSION_SYMBOL(FTN_GET_PARTITION_NUM_PLACES, 45, "OMP_4.5"); +KMP_VERSION_SYMBOL(FTN_GET_PARTITION_PLACE_NUMS, 45, "OMP_4.5"); +KMP_VERSION_SYMBOL(FTN_GET_INITIAL_DEVICE, 45, "OMP_4.5"); + +// OMP_5.0 versioned symbols +// KMP_VERSION_SYMBOL(FTN_GET_DEVICE_NUM, 50, "OMP_5.0"); +KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE, 50, "OMP_5.0"); +KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE_ALL, 50, "OMP_5.0"); +// The C versions (KMP_FTN_PLAIN) of these symbols are in kmp_csupport.c +#if KMP_FTN_ENTRIES == KMP_FTN_APPEND +KMP_VERSION_SYMBOL(FTN_CAPTURE_AFFINITY, 50, "OMP_5.0"); +KMP_VERSION_SYMBOL(FTN_DISPLAY_AFFINITY, 50, "OMP_5.0"); +KMP_VERSION_SYMBOL(FTN_GET_AFFINITY_FORMAT, 50, "OMP_5.0"); +KMP_VERSION_SYMBOL(FTN_SET_AFFINITY_FORMAT, 50, "OMP_5.0"); +#endif +// KMP_VERSION_SYMBOL(FTN_GET_SUPPORTED_ACTIVE_LEVELS, 50, "OMP_5.0"); +// KMP_VERSION_SYMBOL(FTN_FULFILL_EVENT, 50, "OMP_5.0"); + +#endif // KMP_USE_VERSION_SYMBOLS + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +// end of file // diff --git a/third_party/openmp/kmp_ftn_os.h b/third_party/openmp/kmp_ftn_os.h new file mode 100644 index 000000000..7d595b947 --- /dev/null +++ b/third_party/openmp/kmp_ftn_os.h @@ -0,0 +1,755 @@ +/* + * kmp_ftn_os.h -- KPTS Fortran defines header file. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_FTN_OS_H +#define KMP_FTN_OS_H + +// KMP_FNT_ENTRIES may be one of: KMP_FTN_PLAIN, KMP_FTN_UPPER, KMP_FTN_APPEND, +// KMP_FTN_UAPPEND. 
+ +/* -------------------------- External definitions ------------------------ */ + +#if KMP_FTN_ENTRIES == KMP_FTN_PLAIN + +#define FTN_SET_STACKSIZE kmp_set_stacksize +#define FTN_SET_STACKSIZE_S kmp_set_stacksize_s +#define FTN_GET_STACKSIZE kmp_get_stacksize +#define FTN_GET_STACKSIZE_S kmp_get_stacksize_s +#define FTN_SET_BLOCKTIME kmp_set_blocktime +#define FTN_GET_BLOCKTIME kmp_get_blocktime +#define FTN_SET_LIBRARY_SERIAL kmp_set_library_serial +#define FTN_SET_LIBRARY_TURNAROUND kmp_set_library_turnaround +#define FTN_SET_LIBRARY_THROUGHPUT kmp_set_library_throughput +#define FTN_SET_LIBRARY kmp_set_library +#define FTN_GET_LIBRARY kmp_get_library +#define FTN_SET_DEFAULTS kmp_set_defaults +#define FTN_SET_DISP_NUM_BUFFERS kmp_set_disp_num_buffers +#define FTN_SET_AFFINITY kmp_set_affinity +#define FTN_GET_AFFINITY kmp_get_affinity +#define FTN_GET_AFFINITY_MAX_PROC kmp_get_affinity_max_proc +#define FTN_CREATE_AFFINITY_MASK kmp_create_affinity_mask +#define FTN_DESTROY_AFFINITY_MASK kmp_destroy_affinity_mask +#define FTN_SET_AFFINITY_MASK_PROC kmp_set_affinity_mask_proc +#define FTN_UNSET_AFFINITY_MASK_PROC kmp_unset_affinity_mask_proc +#define FTN_GET_AFFINITY_MASK_PROC kmp_get_affinity_mask_proc + +#define FTN_MALLOC kmp_malloc +#define FTN_ALIGNED_MALLOC kmp_aligned_malloc +#define FTN_CALLOC kmp_calloc +#define FTN_REALLOC kmp_realloc +#define FTN_KFREE kmp_free + +#define FTN_GET_NUM_KNOWN_THREADS kmp_get_num_known_threads + +#define FTN_SET_NUM_THREADS omp_set_num_threads +#define FTN_GET_NUM_THREADS omp_get_num_threads +#define FTN_GET_MAX_THREADS omp_get_max_threads +#define FTN_GET_THREAD_NUM omp_get_thread_num +#define FTN_GET_NUM_PROCS omp_get_num_procs +#define FTN_SET_DYNAMIC omp_set_dynamic +#define FTN_GET_DYNAMIC omp_get_dynamic +#define FTN_SET_NESTED omp_set_nested +#define FTN_GET_NESTED omp_get_nested +#define FTN_IN_PARALLEL omp_in_parallel +#define FTN_GET_THREAD_LIMIT omp_get_thread_limit +#define FTN_SET_SCHEDULE omp_set_schedule +#define FTN_GET_SCHEDULE omp_get_schedule +#define FTN_SET_MAX_ACTIVE_LEVELS omp_set_max_active_levels +#define FTN_GET_MAX_ACTIVE_LEVELS omp_get_max_active_levels +#define FTN_GET_ACTIVE_LEVEL omp_get_active_level +#define FTN_GET_LEVEL omp_get_level +#define FTN_GET_ANCESTOR_THREAD_NUM omp_get_ancestor_thread_num +#define FTN_GET_TEAM_SIZE omp_get_team_size +#define FTN_IN_FINAL omp_in_final +#define FTN_GET_PROC_BIND omp_get_proc_bind +#define FTN_GET_NUM_TEAMS omp_get_num_teams +#define FTN_GET_TEAM_NUM omp_get_team_num +#define FTN_INIT_LOCK omp_init_lock +#if KMP_USE_DYNAMIC_LOCK +#define FTN_INIT_LOCK_WITH_HINT omp_init_lock_with_hint +#define FTN_INIT_NEST_LOCK_WITH_HINT omp_init_nest_lock_with_hint +#endif +#define FTN_DESTROY_LOCK omp_destroy_lock +#define FTN_SET_LOCK omp_set_lock +#define FTN_UNSET_LOCK omp_unset_lock +#define FTN_TEST_LOCK omp_test_lock +#define FTN_INIT_NEST_LOCK omp_init_nest_lock +#define FTN_DESTROY_NEST_LOCK omp_destroy_nest_lock +#define FTN_SET_NEST_LOCK omp_set_nest_lock +#define FTN_UNSET_NEST_LOCK omp_unset_nest_lock +#define FTN_TEST_NEST_LOCK omp_test_nest_lock + +#define FTN_SET_WARNINGS_ON kmp_set_warnings_on +#define FTN_SET_WARNINGS_OFF kmp_set_warnings_off + +#define FTN_GET_WTIME omp_get_wtime +#define FTN_GET_WTICK omp_get_wtick + +#define FTN_GET_NUM_DEVICES omp_get_num_devices +#define FTN_GET_DEFAULT_DEVICE omp_get_default_device +#define FTN_SET_DEFAULT_DEVICE omp_set_default_device +#define FTN_IS_INITIAL_DEVICE omp_is_initial_device + +#define FTN_GET_CANCELLATION 
omp_get_cancellation +#define FTN_GET_CANCELLATION_STATUS kmp_get_cancellation_status + +#define FTN_GET_MAX_TASK_PRIORITY omp_get_max_task_priority +#define FTN_GET_NUM_PLACES omp_get_num_places +#define FTN_GET_PLACE_NUM_PROCS omp_get_place_num_procs +#define FTN_GET_PLACE_PROC_IDS omp_get_place_proc_ids +#define FTN_GET_PLACE_NUM omp_get_place_num +#define FTN_GET_PARTITION_NUM_PLACES omp_get_partition_num_places +#define FTN_GET_PARTITION_PLACE_NUMS omp_get_partition_place_nums +#define FTN_GET_INITIAL_DEVICE omp_get_initial_device +#ifdef KMP_STUB +#define FTN_TARGET_ALLOC omp_target_alloc +#define FTN_TARGET_FREE omp_target_free +#define FTN_TARGET_IS_PRESENT omp_target_is_present +#define FTN_TARGET_MEMCPY omp_target_memcpy +#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect +#define FTN_TARGET_MEMSET omp_target_memset +#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async +#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr +#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr +#endif + +#define FTN_CONTROL_TOOL omp_control_tool +#define FTN_INIT_ALLOCATOR omp_init_allocator +#define FTN_DESTROY_ALLOCATOR omp_destroy_allocator +#define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator +#define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator +#define FTN_GET_DEVICE_NUM omp_get_device_num +#define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format +#define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format +#define FTN_DISPLAY_AFFINITY omp_display_affinity +#define FTN_CAPTURE_AFFINITY omp_capture_affinity +#define FTN_PAUSE_RESOURCE omp_pause_resource +#define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all +#define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels +#define FTN_DISPLAY_ENV omp_display_env +#define FTN_IN_EXPLICIT_TASK omp_in_explicit_task +#define FTN_FULFILL_EVENT omp_fulfill_event +#define FTN_SET_NUM_TEAMS omp_set_num_teams +#define FTN_GET_MAX_TEAMS omp_get_max_teams +#define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit +#define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit + +#define FTN_GET_NUM_INTEROP_PROPERTIES omp_get_num_interop_properties +#define FTN_GET_INTEROP_INT omp_get_interop_int +#define FTN_GET_INTEROP_PTR omp_get_interop_ptr +#define FTN_GET_INTEROP_STR omp_get_interop_str +#define FTN_GET_INTEROP_NAME omp_get_interop_name +#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc +#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc + +#endif /* KMP_FTN_PLAIN */ + +/* ------------------------------------------------------------------------ */ + +#if KMP_FTN_ENTRIES == KMP_FTN_APPEND + +#define FTN_SET_STACKSIZE kmp_set_stacksize_ +#define FTN_SET_STACKSIZE_S kmp_set_stacksize_s_ +#define FTN_GET_STACKSIZE kmp_get_stacksize_ +#define FTN_GET_STACKSIZE_S kmp_get_stacksize_s_ +#define FTN_SET_BLOCKTIME kmp_set_blocktime_ +#define FTN_GET_BLOCKTIME kmp_get_blocktime_ +#define FTN_SET_LIBRARY_SERIAL kmp_set_library_serial_ +#define FTN_SET_LIBRARY_TURNAROUND kmp_set_library_turnaround_ +#define FTN_SET_LIBRARY_THROUGHPUT kmp_set_library_throughput_ +#define FTN_SET_LIBRARY kmp_set_library_ +#define FTN_GET_LIBRARY kmp_get_library_ +#define FTN_SET_DEFAULTS kmp_set_defaults_ +#define FTN_SET_DISP_NUM_BUFFERS kmp_set_disp_num_buffers_ +#define FTN_SET_AFFINITY kmp_set_affinity_ +#define FTN_GET_AFFINITY kmp_get_affinity_ +#define FTN_GET_AFFINITY_MAX_PROC kmp_get_affinity_max_proc_ +#define FTN_CREATE_AFFINITY_MASK kmp_create_affinity_mask_ +#define FTN_DESTROY_AFFINITY_MASK 
kmp_destroy_affinity_mask_ +#define FTN_SET_AFFINITY_MASK_PROC kmp_set_affinity_mask_proc_ +#define FTN_UNSET_AFFINITY_MASK_PROC kmp_unset_affinity_mask_proc_ +#define FTN_GET_AFFINITY_MASK_PROC kmp_get_affinity_mask_proc_ + +#define FTN_MALLOC kmp_malloc_ +#define FTN_ALIGNED_MALLOC kmp_aligned_malloc_ +#define FTN_CALLOC kmp_calloc_ +#define FTN_REALLOC kmp_realloc_ +#define FTN_KFREE kmp_free_ + +#define FTN_GET_NUM_KNOWN_THREADS kmp_get_num_known_threads_ + +#define FTN_SET_NUM_THREADS omp_set_num_threads_ +#define FTN_GET_NUM_THREADS omp_get_num_threads_ +#define FTN_GET_MAX_THREADS omp_get_max_threads_ +#define FTN_GET_THREAD_NUM omp_get_thread_num_ +#define FTN_GET_NUM_PROCS omp_get_num_procs_ +#define FTN_SET_DYNAMIC omp_set_dynamic_ +#define FTN_GET_DYNAMIC omp_get_dynamic_ +#define FTN_SET_NESTED omp_set_nested_ +#define FTN_GET_NESTED omp_get_nested_ +#define FTN_IN_PARALLEL omp_in_parallel_ +#define FTN_GET_THREAD_LIMIT omp_get_thread_limit_ +#define FTN_SET_SCHEDULE omp_set_schedule_ +#define FTN_GET_SCHEDULE omp_get_schedule_ +#define FTN_SET_MAX_ACTIVE_LEVELS omp_set_max_active_levels_ +#define FTN_GET_MAX_ACTIVE_LEVELS omp_get_max_active_levels_ +#define FTN_GET_ACTIVE_LEVEL omp_get_active_level_ +#define FTN_GET_LEVEL omp_get_level_ +#define FTN_GET_ANCESTOR_THREAD_NUM omp_get_ancestor_thread_num_ +#define FTN_GET_TEAM_SIZE omp_get_team_size_ +#define FTN_IN_FINAL omp_in_final_ +#define FTN_GET_PROC_BIND omp_get_proc_bind_ +#define FTN_GET_NUM_TEAMS omp_get_num_teams_ +#define FTN_GET_TEAM_NUM omp_get_team_num_ +#define FTN_INIT_LOCK omp_init_lock_ +#if KMP_USE_DYNAMIC_LOCK +#define FTN_INIT_LOCK_WITH_HINT omp_init_lock_with_hint_ +#define FTN_INIT_NEST_LOCK_WITH_HINT omp_init_nest_lock_with_hint_ +#endif +#define FTN_DESTROY_LOCK omp_destroy_lock_ +#define FTN_SET_LOCK omp_set_lock_ +#define FTN_UNSET_LOCK omp_unset_lock_ +#define FTN_TEST_LOCK omp_test_lock_ +#define FTN_INIT_NEST_LOCK omp_init_nest_lock_ +#define FTN_DESTROY_NEST_LOCK omp_destroy_nest_lock_ +#define FTN_SET_NEST_LOCK omp_set_nest_lock_ +#define FTN_UNSET_NEST_LOCK omp_unset_nest_lock_ +#define FTN_TEST_NEST_LOCK omp_test_nest_lock_ + +#define FTN_SET_WARNINGS_ON kmp_set_warnings_on_ +#define FTN_SET_WARNINGS_OFF kmp_set_warnings_off_ + +#define FTN_GET_WTIME omp_get_wtime_ +#define FTN_GET_WTICK omp_get_wtick_ + +#define FTN_GET_NUM_DEVICES omp_get_num_devices_ +#define FTN_GET_DEFAULT_DEVICE omp_get_default_device_ +#define FTN_SET_DEFAULT_DEVICE omp_set_default_device_ +#define FTN_IS_INITIAL_DEVICE omp_is_initial_device_ + +#define FTN_GET_CANCELLATION omp_get_cancellation_ +#define FTN_GET_CANCELLATION_STATUS kmp_get_cancellation_status_ + +#define FTN_GET_MAX_TASK_PRIORITY omp_get_max_task_priority_ +#define FTN_GET_NUM_PLACES omp_get_num_places_ +#define FTN_GET_PLACE_NUM_PROCS omp_get_place_num_procs_ +#define FTN_GET_PLACE_PROC_IDS omp_get_place_proc_ids_ +#define FTN_GET_PLACE_NUM omp_get_place_num_ +#define FTN_GET_PARTITION_NUM_PLACES omp_get_partition_num_places_ +#define FTN_GET_PARTITION_PLACE_NUMS omp_get_partition_place_nums_ +#define FTN_GET_INITIAL_DEVICE omp_get_initial_device_ +#ifdef KMP_STUB +#define FTN_TARGET_ALLOC omp_target_alloc_ +#define FTN_TARGET_FREE omp_target_free_ +#define FTN_TARGET_IS_PRESENT omp_target_is_present_ +#define FTN_TARGET_MEMCPY omp_target_memcpy_ +#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect_ +#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr_ +#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr_ +#endif + +#define 
FTN_CONTROL_TOOL omp_control_tool_ +#define FTN_INIT_ALLOCATOR omp_init_allocator_ +#define FTN_DESTROY_ALLOCATOR omp_destroy_allocator_ +#define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator_ +#define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator_ +#define FTN_ALLOC omp_alloc_ +#define FTN_FREE omp_free_ +#define FTN_GET_DEVICE_NUM omp_get_device_num_ +#define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format_ +#define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format_ +#define FTN_DISPLAY_AFFINITY omp_display_affinity_ +#define FTN_CAPTURE_AFFINITY omp_capture_affinity_ +#define FTN_PAUSE_RESOURCE omp_pause_resource_ +#define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all_ +#define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_ +#define FTN_DISPLAY_ENV omp_display_env_ +#define FTN_IN_EXPLICIT_TASK omp_in_explicit_task_ +#define FTN_FULFILL_EVENT omp_fulfill_event_ +#define FTN_SET_NUM_TEAMS omp_set_num_teams_ +#define FTN_GET_MAX_TEAMS omp_get_max_teams_ +#define FTN_SET_TEAMS_THREAD_LIMIT omp_set_teams_thread_limit_ +#define FTN_GET_TEAMS_THREAD_LIMIT omp_get_teams_thread_limit_ + +#define FTN_GET_NUM_INTEROP_PROPERTIES omp_get_num_interop_properties_ +#define FTN_GET_INTEROP_INT omp_get_interop_int_ +#define FTN_GET_INTEROP_PTR omp_get_interop_ptr_ +#define FTN_GET_INTEROP_STR omp_get_interop_str_ +#define FTN_GET_INTEROP_NAME omp_get_interop_name_ +#define FTN_GET_INTEROP_TYPE_DESC omp_get_interop_type_desc_ +#define FTN_GET_INTEROP_RC_DESC omp_get_interop_rc_desc_ + +#endif /* KMP_FTN_APPEND */ + +/* ------------------------------------------------------------------------ */ + +#if KMP_FTN_ENTRIES == KMP_FTN_UPPER + +#define FTN_SET_STACKSIZE KMP_SET_STACKSIZE +#define FTN_SET_STACKSIZE_S KMP_SET_STACKSIZE_S +#define FTN_GET_STACKSIZE KMP_GET_STACKSIZE +#define FTN_GET_STACKSIZE_S KMP_GET_STACKSIZE_S +#define FTN_SET_BLOCKTIME KMP_SET_BLOCKTIME +#define FTN_GET_BLOCKTIME KMP_GET_BLOCKTIME +#define FTN_SET_LIBRARY_SERIAL KMP_SET_LIBRARY_SERIAL +#define FTN_SET_LIBRARY_TURNAROUND KMP_SET_LIBRARY_TURNAROUND +#define FTN_SET_LIBRARY_THROUGHPUT KMP_SET_LIBRARY_THROUGHPUT +#define FTN_SET_LIBRARY KMP_SET_LIBRARY +#define FTN_GET_LIBRARY KMP_GET_LIBRARY +#define FTN_SET_DEFAULTS KMP_SET_DEFAULTS +#define FTN_SET_DISP_NUM_BUFFERS KMP_SET_DISP_NUM_BUFFERS +#define FTN_SET_AFFINITY KMP_SET_AFFINITY +#define FTN_GET_AFFINITY KMP_GET_AFFINITY +#define FTN_GET_AFFINITY_MAX_PROC KMP_GET_AFFINITY_MAX_PROC +#define FTN_CREATE_AFFINITY_MASK KMP_CREATE_AFFINITY_MASK +#define FTN_DESTROY_AFFINITY_MASK KMP_DESTROY_AFFINITY_MASK +#define FTN_SET_AFFINITY_MASK_PROC KMP_SET_AFFINITY_MASK_PROC +#define FTN_UNSET_AFFINITY_MASK_PROC KMP_UNSET_AFFINITY_MASK_PROC +#define FTN_GET_AFFINITY_MASK_PROC KMP_GET_AFFINITY_MASK_PROC + +#define FTN_MALLOC KMP_MALLOC +#define FTN_ALIGNED_MALLOC KMP_ALIGNED_MALLOC +#define FTN_CALLOC KMP_CALLOC +#define FTN_REALLOC KMP_REALLOC +#define FTN_KFREE KMP_FREE + +#define FTN_GET_NUM_KNOWN_THREADS KMP_GET_NUM_KNOWN_THREADS + +#define FTN_SET_NUM_THREADS OMP_SET_NUM_THREADS +#define FTN_GET_NUM_THREADS OMP_GET_NUM_THREADS +#define FTN_GET_MAX_THREADS OMP_GET_MAX_THREADS +#define FTN_GET_THREAD_NUM OMP_GET_THREAD_NUM +#define FTN_GET_NUM_PROCS OMP_GET_NUM_PROCS +#define FTN_SET_DYNAMIC OMP_SET_DYNAMIC +#define FTN_GET_DYNAMIC OMP_GET_DYNAMIC +#define FTN_SET_NESTED OMP_SET_NESTED +#define FTN_GET_NESTED OMP_GET_NESTED +#define FTN_IN_PARALLEL OMP_IN_PARALLEL +#define FTN_GET_THREAD_LIMIT OMP_GET_THREAD_LIMIT +#define FTN_SET_SCHEDULE OMP_SET_SCHEDULE 
+#define FTN_GET_SCHEDULE OMP_GET_SCHEDULE +#define FTN_SET_MAX_ACTIVE_LEVELS OMP_SET_MAX_ACTIVE_LEVELS +#define FTN_GET_MAX_ACTIVE_LEVELS OMP_GET_MAX_ACTIVE_LEVELS +#define FTN_GET_ACTIVE_LEVEL OMP_GET_ACTIVE_LEVEL +#define FTN_GET_LEVEL OMP_GET_LEVEL +#define FTN_GET_ANCESTOR_THREAD_NUM OMP_GET_ANCESTOR_THREAD_NUM +#define FTN_GET_TEAM_SIZE OMP_GET_TEAM_SIZE +#define FTN_IN_FINAL OMP_IN_FINAL +#define FTN_GET_PROC_BIND OMP_GET_PROC_BIND +#define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS +#define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM +#define FTN_INIT_LOCK OMP_INIT_LOCK +#if KMP_USE_DYNAMIC_LOCK +#define FTN_INIT_LOCK_WITH_HINT OMP_INIT_LOCK_WITH_HINT +#define FTN_INIT_NEST_LOCK_WITH_HINT OMP_INIT_NEST_LOCK_WITH_HINT +#endif +#define FTN_DESTROY_LOCK OMP_DESTROY_LOCK +#define FTN_SET_LOCK OMP_SET_LOCK +#define FTN_UNSET_LOCK OMP_UNSET_LOCK +#define FTN_TEST_LOCK OMP_TEST_LOCK +#define FTN_INIT_NEST_LOCK OMP_INIT_NEST_LOCK +#define FTN_DESTROY_NEST_LOCK OMP_DESTROY_NEST_LOCK +#define FTN_SET_NEST_LOCK OMP_SET_NEST_LOCK +#define FTN_UNSET_NEST_LOCK OMP_UNSET_NEST_LOCK +#define FTN_TEST_NEST_LOCK OMP_TEST_NEST_LOCK + +#define FTN_SET_WARNINGS_ON KMP_SET_WARNINGS_ON +#define FTN_SET_WARNINGS_OFF KMP_SET_WARNINGS_OFF + +#define FTN_GET_WTIME OMP_GET_WTIME +#define FTN_GET_WTICK OMP_GET_WTICK + +#define FTN_GET_NUM_DEVICES OMP_GET_NUM_DEVICES +#define FTN_GET_DEFAULT_DEVICE OMP_GET_DEFAULT_DEVICE +#define FTN_SET_DEFAULT_DEVICE OMP_SET_DEFAULT_DEVICE +#define FTN_IS_INITIAL_DEVICE OMP_IS_INITIAL_DEVICE + +#define FTN_GET_CANCELLATION OMP_GET_CANCELLATION +#define FTN_GET_CANCELLATION_STATUS KMP_GET_CANCELLATION_STATUS + +#define FTN_GET_MAX_TASK_PRIORITY OMP_GET_MAX_TASK_PRIORITY +#define FTN_GET_NUM_PLACES OMP_GET_NUM_PLACES +#define FTN_GET_PLACE_NUM_PROCS OMP_GET_PLACE_NUM_PROCS +#define FTN_GET_PLACE_PROC_IDS OMP_GET_PLACE_PROC_IDS +#define FTN_GET_PLACE_NUM OMP_GET_PLACE_NUM +#define FTN_GET_PARTITION_NUM_PLACES OMP_GET_PARTITION_NUM_PLACES +#define FTN_GET_PARTITION_PLACE_NUMS OMP_GET_PARTITION_PLACE_NUMS +#define FTN_GET_INITIAL_DEVICE OMP_GET_INITIAL_DEVICE +#ifdef KMP_STUB +#define FTN_TARGET_ALLOC OMP_TARGET_ALLOC +#define FTN_TARGET_FREE OMP_TARGET_FREE +#define FTN_TARGET_IS_PRESENT OMP_TARGET_IS_PRESENT +#define FTN_TARGET_MEMCPY OMP_TARGET_MEMCPY +#define FTN_TARGET_MEMCPY_RECT OMP_TARGET_MEMCPY_RECT +#define FTN_TARGET_ASSOCIATE_PTR OMP_TARGET_ASSOCIATE_PTR +#define FTN_TARGET_DISASSOCIATE_PTR OMP_TARGET_DISASSOCIATE_PTR +#endif + +#define FTN_CONTROL_TOOL OMP_CONTROL_TOOL +#define FTN_INIT_ALLOCATOR OMP_INIT_ALLOCATOR +#define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR +#define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR +#define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR +#define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM +#define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT +#define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT +#define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY +#define FTN_CAPTURE_AFFINITY OMP_CAPTURE_AFFINITY +#define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE +#define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL +#define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV +#define FTN_IN_EXPLICIT_TASK OMP_IN_EXPLICIT_TASK +#define FTN_FULFILL_EVENT OMP_FULFILL_EVENT +#define FTN_SET_NUM_TEAMS OMP_SET_NUM_TEAMS +#define FTN_GET_MAX_TEAMS OMP_GET_MAX_TEAMS +#define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT +#define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT + +#define 
FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES +#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT +#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR +#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR +#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME +#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC +#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC + +#endif /* KMP_FTN_UPPER */ + +/* ------------------------------------------------------------------------ */ + +#if KMP_FTN_ENTRIES == KMP_FTN_UAPPEND + +#define FTN_SET_STACKSIZE KMP_SET_STACKSIZE_ +#define FTN_SET_STACKSIZE_S KMP_SET_STACKSIZE_S_ +#define FTN_GET_STACKSIZE KMP_GET_STACKSIZE_ +#define FTN_GET_STACKSIZE_S KMP_GET_STACKSIZE_S_ +#define FTN_SET_BLOCKTIME KMP_SET_BLOCKTIME_ +#define FTN_GET_BLOCKTIME KMP_GET_BLOCKTIME_ +#define FTN_SET_LIBRARY_SERIAL KMP_SET_LIBRARY_SERIAL_ +#define FTN_SET_LIBRARY_TURNAROUND KMP_SET_LIBRARY_TURNAROUND_ +#define FTN_SET_LIBRARY_THROUGHPUT KMP_SET_LIBRARY_THROUGHPUT_ +#define FTN_SET_LIBRARY KMP_SET_LIBRARY_ +#define FTN_GET_LIBRARY KMP_GET_LIBRARY_ +#define FTN_SET_DEFAULTS KMP_SET_DEFAULTS_ +#define FTN_SET_DISP_NUM_BUFFERS KMP_SET_DISP_NUM_BUFFERS_ +#define FTN_SET_AFFINITY KMP_SET_AFFINITY_ +#define FTN_GET_AFFINITY KMP_GET_AFFINITY_ +#define FTN_GET_AFFINITY_MAX_PROC KMP_GET_AFFINITY_MAX_PROC_ +#define FTN_CREATE_AFFINITY_MASK KMP_CREATE_AFFINITY_MASK_ +#define FTN_DESTROY_AFFINITY_MASK KMP_DESTROY_AFFINITY_MASK_ +#define FTN_SET_AFFINITY_MASK_PROC KMP_SET_AFFINITY_MASK_PROC_ +#define FTN_UNSET_AFFINITY_MASK_PROC KMP_UNSET_AFFINITY_MASK_PROC_ +#define FTN_GET_AFFINITY_MASK_PROC KMP_GET_AFFINITY_MASK_PROC_ + +#define FTN_MALLOC KMP_MALLOC_ +#define FTN_ALIGNED_MALLOC KMP_ALIGNED_MALLOC_ +#define FTN_CALLOC KMP_CALLOC_ +#define FTN_REALLOC KMP_REALLOC_ +#define FTN_KFREE KMP_FREE_ + +#define FTN_GET_NUM_KNOWN_THREADS KMP_GET_NUM_KNOWN_THREADS_ + +#define FTN_SET_NUM_THREADS OMP_SET_NUM_THREADS_ +#define FTN_GET_NUM_THREADS OMP_GET_NUM_THREADS_ +#define FTN_GET_MAX_THREADS OMP_GET_MAX_THREADS_ +#define FTN_GET_THREAD_NUM OMP_GET_THREAD_NUM_ +#define FTN_GET_NUM_PROCS OMP_GET_NUM_PROCS_ +#define FTN_SET_DYNAMIC OMP_SET_DYNAMIC_ +#define FTN_GET_DYNAMIC OMP_GET_DYNAMIC_ +#define FTN_SET_NESTED OMP_SET_NESTED_ +#define FTN_GET_NESTED OMP_GET_NESTED_ +#define FTN_IN_PARALLEL OMP_IN_PARALLEL_ +#define FTN_GET_THREAD_LIMIT OMP_GET_THREAD_LIMIT_ +#define FTN_SET_SCHEDULE OMP_SET_SCHEDULE_ +#define FTN_GET_SCHEDULE OMP_GET_SCHEDULE_ +#define FTN_SET_MAX_ACTIVE_LEVELS OMP_SET_MAX_ACTIVE_LEVELS_ +#define FTN_GET_MAX_ACTIVE_LEVELS OMP_GET_MAX_ACTIVE_LEVELS_ +#define FTN_GET_ACTIVE_LEVEL OMP_GET_ACTIVE_LEVEL_ +#define FTN_GET_LEVEL OMP_GET_LEVEL_ +#define FTN_GET_ANCESTOR_THREAD_NUM OMP_GET_ANCESTOR_THREAD_NUM_ +#define FTN_GET_TEAM_SIZE OMP_GET_TEAM_SIZE_ +#define FTN_IN_FINAL OMP_IN_FINAL_ +#define FTN_GET_PROC_BIND OMP_GET_PROC_BIND_ +#define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS_ +#define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM_ +#define FTN_INIT_LOCK OMP_INIT_LOCK_ +#if KMP_USE_DYNAMIC_LOCK +#define FTN_INIT_LOCK_WITH_HINT OMP_INIT_LOCK_WITH_HINT_ +#define FTN_INIT_NEST_LOCK_WITH_HINT OMP_INIT_NEST_LOCK_WITH_HINT_ +#endif +#define FTN_DESTROY_LOCK OMP_DESTROY_LOCK_ +#define FTN_SET_LOCK OMP_SET_LOCK_ +#define FTN_UNSET_LOCK OMP_UNSET_LOCK_ +#define FTN_TEST_LOCK OMP_TEST_LOCK_ +#define FTN_INIT_NEST_LOCK OMP_INIT_NEST_LOCK_ +#define FTN_DESTROY_NEST_LOCK OMP_DESTROY_NEST_LOCK_ +#define FTN_SET_NEST_LOCK OMP_SET_NEST_LOCK_ +#define FTN_UNSET_NEST_LOCK OMP_UNSET_NEST_LOCK_ +#define 
FTN_TEST_NEST_LOCK OMP_TEST_NEST_LOCK_ + +#define FTN_SET_WARNINGS_ON KMP_SET_WARNINGS_ON_ +#define FTN_SET_WARNINGS_OFF KMP_SET_WARNINGS_OFF_ + +#define FTN_GET_WTIME OMP_GET_WTIME_ +#define FTN_GET_WTICK OMP_GET_WTICK_ + +#define FTN_GET_NUM_DEVICES OMP_GET_NUM_DEVICES_ +#define FTN_GET_DEFAULT_DEVICE OMP_GET_DEFAULT_DEVICE_ +#define FTN_SET_DEFAULT_DEVICE OMP_SET_DEFAULT_DEVICE_ +#define FTN_IS_INITIAL_DEVICE OMP_IS_INITIAL_DEVICE_ + +#define FTN_GET_CANCELLATION OMP_GET_CANCELLATION_ +#define FTN_GET_CANCELLATION_STATUS KMP_GET_CANCELLATION_STATUS_ + +#define FTN_GET_MAX_TASK_PRIORITY OMP_GET_MAX_TASK_PRIORITY_ +#define FTN_GET_NUM_PLACES OMP_GET_NUM_PLACES_ +#define FTN_GET_PLACE_NUM_PROCS OMP_GET_PLACE_NUM_PROCS_ +#define FTN_GET_PLACE_PROC_IDS OMP_GET_PLACE_PROC_IDS_ +#define FTN_GET_PLACE_NUM OMP_GET_PLACE_NUM_ +#define FTN_GET_PARTITION_NUM_PLACES OMP_GET_PARTITION_NUM_PLACES_ +#define FTN_GET_PARTITION_PLACE_NUMS OMP_GET_PARTITION_PLACE_NUMS_ +#define FTN_GET_INITIAL_DEVICE OMP_GET_INITIAL_DEVICE_ +#ifdef KMP_STUB +#define FTN_TARGET_ALLOC OMP_TARGET_ALLOC_ +#define FTN_TARGET_FREE OMP_TARGET_FREE_ +#define FTN_TARGET_IS_PRESENT OMP_TARGET_IS_PRESENT_ +#define FTN_TARGET_MEMCPY OMP_TARGET_MEMCPY_ +#define FTN_TARGET_MEMCPY_RECT OMP_TARGET_MEMCPY_RECT_ +#define FTN_TARGET_ASSOCIATE_PTR OMP_TARGET_ASSOCIATE_PTR_ +#define FTN_TARGET_DISASSOCIATE_PTR OMP_TARGET_DISASSOCIATE_PTR_ +#endif + +#define FTN_CONTROL_TOOL OMP_CONTROL_TOOL_ +#define FTN_INIT_ALLOCATOR OMP_INIT_ALLOCATOR_ +#define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR_ +#define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR_ +#define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR_ +#define FTN_ALLOC OMP_ALLOC_ +#define FTN_FREE OMP_FREE_ +#define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM_ +#define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT_ +#define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT_ +#define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY_ +#define FTN_CAPTURE_AFFINITY OMP_CAPTURE_AFFINITY_ +#define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE_ +#define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL_ +#define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS_ +#define FTN_DISPLAY_ENV OMP_DISPLAY_ENV_ +#define FTN_IN_EXPLICIT_TASK OMP_IN_EXPLICIT_TASK_ +#define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_ +#define FTN_SET_NUM_TEAMS OMP_SET_NUM_TEAMS_ +#define FTN_GET_MAX_TEAMS OMP_GET_MAX_TEAMS_ +#define FTN_SET_TEAMS_THREAD_LIMIT OMP_SET_TEAMS_THREAD_LIMIT_ +#define FTN_GET_TEAMS_THREAD_LIMIT OMP_GET_TEAMS_THREAD_LIMIT_ + +#define FTN_GET_NUM_INTEROP_PROPERTIES OMP_GET_NUM_INTEROP_PROPERTIES_ +#define FTN_GET_INTEROP_INT OMP_GET_INTEROP_INT_ +#define FTN_GET_INTEROP_PTR OMP_GET_INTEROP_PTR_ +#define FTN_GET_INTEROP_STR OMP_GET_INTEROP_STR_ +#define FTN_GET_INTEROP_NAME OMP_GET_INTEROP_NAME_ +#define FTN_GET_INTEROP_TYPE_DESC OMP_GET_INTEROP_TYPE_DESC_ +#define FTN_GET_INTEROP_RC_DESC OMP_GET_INTEROP_RC_DESC_ + +#endif /* KMP_FTN_UAPPEND */ + +/* -------------------------- GOMP API NAMES ------------------------ */ +// All GOMP_1.0 symbols +#define KMP_API_NAME_GOMP_ATOMIC_END GOMP_atomic_end +#define KMP_API_NAME_GOMP_ATOMIC_START GOMP_atomic_start +#define KMP_API_NAME_GOMP_BARRIER GOMP_barrier +#define KMP_API_NAME_GOMP_CRITICAL_END GOMP_critical_end +#define KMP_API_NAME_GOMP_CRITICAL_NAME_END GOMP_critical_name_end +#define KMP_API_NAME_GOMP_CRITICAL_NAME_START GOMP_critical_name_start +#define KMP_API_NAME_GOMP_CRITICAL_START GOMP_critical_start +#define KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT 
GOMP_loop_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_DYNAMIC_START GOMP_loop_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_END GOMP_loop_end +#define KMP_API_NAME_GOMP_LOOP_END_NOWAIT GOMP_loop_end_nowait +#define KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT GOMP_loop_guided_next +#define KMP_API_NAME_GOMP_LOOP_GUIDED_START GOMP_loop_guided_start +#define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT \ + GOMP_loop_ordered_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START \ + GOMP_loop_ordered_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT GOMP_loop_ordered_guided_next +#define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START \ + GOMP_loop_ordered_guided_start +#define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT \ + GOMP_loop_ordered_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START \ + GOMP_loop_ordered_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT GOMP_loop_ordered_static_next +#define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START \ + GOMP_loop_ordered_static_start +#define KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT GOMP_loop_runtime_next +#define KMP_API_NAME_GOMP_LOOP_RUNTIME_START GOMP_loop_runtime_start +#define KMP_API_NAME_GOMP_LOOP_STATIC_NEXT GOMP_loop_static_next +#define KMP_API_NAME_GOMP_LOOP_STATIC_START GOMP_loop_static_start +#define KMP_API_NAME_GOMP_ORDERED_END GOMP_ordered_end +#define KMP_API_NAME_GOMP_ORDERED_START GOMP_ordered_start +#define KMP_API_NAME_GOMP_PARALLEL_END GOMP_parallel_end +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START \ + GOMP_parallel_loop_dynamic_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START \ + GOMP_parallel_loop_guided_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START \ + GOMP_parallel_loop_runtime_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START \ + GOMP_parallel_loop_static_start +#define KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START GOMP_parallel_sections_start +#define KMP_API_NAME_GOMP_PARALLEL_START GOMP_parallel_start +#define KMP_API_NAME_GOMP_SECTIONS_END GOMP_sections_end +#define KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT GOMP_sections_end_nowait +#define KMP_API_NAME_GOMP_SECTIONS_NEXT GOMP_sections_next +#define KMP_API_NAME_GOMP_SECTIONS_START GOMP_sections_start +#define KMP_API_NAME_GOMP_SINGLE_COPY_END GOMP_single_copy_end +#define KMP_API_NAME_GOMP_SINGLE_COPY_START GOMP_single_copy_start +#define KMP_API_NAME_GOMP_SINGLE_START GOMP_single_start + +// All GOMP_2.0 symbols +#define KMP_API_NAME_GOMP_TASK GOMP_task +#define KMP_API_NAME_GOMP_TASKWAIT GOMP_taskwait +#define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT GOMP_loop_ull_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START GOMP_loop_ull_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT GOMP_loop_ull_guided_next +#define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START GOMP_loop_ull_guided_start +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT \ + GOMP_loop_ull_ordered_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START \ + GOMP_loop_ull_ordered_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT \ + GOMP_loop_ull_ordered_guided_next +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START \ + GOMP_loop_ull_ordered_guided_start +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT \ + GOMP_loop_ull_ordered_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START \ + GOMP_loop_ull_ordered_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT \ + GOMP_loop_ull_ordered_static_next +#define 
KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START \ + GOMP_loop_ull_ordered_static_start +#define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT GOMP_loop_ull_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START GOMP_loop_ull_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT GOMP_loop_ull_static_next +#define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START GOMP_loop_ull_static_start + +// All GOMP_3.0 symbols +#define KMP_API_NAME_GOMP_TASKYIELD GOMP_taskyield + +// All GOMP_4.0 symbols +#define KMP_API_NAME_GOMP_BARRIER_CANCEL GOMP_barrier_cancel +#define KMP_API_NAME_GOMP_CANCEL GOMP_cancel +#define KMP_API_NAME_GOMP_CANCELLATION_POINT GOMP_cancellation_point +#define KMP_API_NAME_GOMP_LOOP_END_CANCEL GOMP_loop_end_cancel +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC GOMP_parallel_loop_dynamic +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED GOMP_parallel_loop_guided +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME GOMP_parallel_loop_runtime +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC GOMP_parallel_loop_static +#define KMP_API_NAME_GOMP_PARALLEL_SECTIONS GOMP_parallel_sections +#define KMP_API_NAME_GOMP_PARALLEL GOMP_parallel +#define KMP_API_NAME_GOMP_SECTIONS_END_CANCEL GOMP_sections_end_cancel +#define KMP_API_NAME_GOMP_TASKGROUP_START GOMP_taskgroup_start +#define KMP_API_NAME_GOMP_TASKGROUP_END GOMP_taskgroup_end +/* Target functions should be taken care of by liboffload */ +#define KMP_API_NAME_GOMP_TARGET GOMP_target +#define KMP_API_NAME_GOMP_TARGET_DATA GOMP_target_data +#define KMP_API_NAME_GOMP_TARGET_END_DATA GOMP_target_end_data +#define KMP_API_NAME_GOMP_TARGET_UPDATE GOMP_target_update +#define KMP_API_NAME_GOMP_TEAMS GOMP_teams + +// All GOMP_4.5 symbols +#define KMP_API_NAME_GOMP_TASKLOOP GOMP_taskloop +#define KMP_API_NAME_GOMP_TASKLOOP_ULL GOMP_taskloop_ull +#define KMP_API_NAME_GOMP_DOACROSS_POST GOMP_doacross_post +#define KMP_API_NAME_GOMP_DOACROSS_WAIT GOMP_doacross_wait +#define KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START \ + GOMP_loop_doacross_static_start +#define KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START \ + GOMP_loop_doacross_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START \ + GOMP_loop_doacross_guided_start +#define KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START \ + GOMP_loop_doacross_runtime_start +#define KMP_API_NAME_GOMP_DOACROSS_ULL_POST GOMP_doacross_ull_post +#define KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT GOMP_doacross_ull_wait +#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START \ + GOMP_loop_ull_doacross_static_start +#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START \ + GOMP_loop_ull_doacross_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START \ + GOMP_loop_ull_doacross_guided_start +#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START \ + GOMP_loop_ull_doacross_runtime_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_NEXT \ + GOMP_loop_nonmonotonic_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START \ + GOMP_loop_nonmonotonic_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT \ + GOMP_loop_nonmonotonic_guided_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START \ + GOMP_loop_nonmonotonic_guided_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_NEXT \ + GOMP_loop_ull_nonmonotonic_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START \ + GOMP_loop_ull_nonmonotonic_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_NEXT \ + GOMP_loop_ull_nonmonotonic_guided_next +#define 
KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START \ + GOMP_loop_ull_nonmonotonic_guided_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC \ + GOMP_parallel_loop_nonmonotonic_dynamic +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED \ + GOMP_parallel_loop_nonmonotonic_guided + +// All GOMP_5.0 symbols +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_maybe_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT \ + GOMP_loop_ull_nonmonotonic_runtime_next +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START \ + GOMP_loop_ull_nonmonotonic_runtime_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_nonmonotonic_runtime +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME \ + GOMP_parallel_loop_maybe_nonmonotonic_runtime +#define KMP_API_NAME_GOMP_TEAMS_REG GOMP_teams_reg +#define KMP_API_NAME_GOMP_TASKWAIT_DEPEND GOMP_taskwait_depend +#define KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER \ + GOMP_taskgroup_reduction_register +#define KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER \ + GOMP_taskgroup_reduction_unregister +#define KMP_API_NAME_GOMP_TASK_REDUCTION_REMAP GOMP_task_reduction_remap +#define KMP_API_NAME_GOMP_PARALLEL_REDUCTIONS GOMP_parallel_reductions +#define KMP_API_NAME_GOMP_LOOP_START GOMP_loop_start +#define KMP_API_NAME_GOMP_LOOP_ULL_START GOMP_loop_ull_start +#define KMP_API_NAME_GOMP_LOOP_DOACROSS_START GOMP_loop_doacross_start +#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_START GOMP_loop_ull_doacross_start +#define KMP_API_NAME_GOMP_LOOP_ORDERED_START GOMP_loop_ordered_start +#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START GOMP_loop_ull_ordered_start +#define KMP_API_NAME_GOMP_SECTIONS2_START GOMP_sections2_start +#define KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER \ + GOMP_workshare_task_reduction_unregister +#define KMP_API_NAME_GOMP_ALLOC GOMP_alloc +#define KMP_API_NAME_GOMP_FREE GOMP_free +#endif /* KMP_FTN_OS_H */ diff --git a/third_party/openmp/kmp_ftn_stdcall.cpp b/third_party/openmp/kmp_ftn_stdcall.cpp new file mode 100644 index 000000000..174c21973 --- /dev/null +++ b/third_party/openmp/kmp_ftn_stdcall.cpp @@ -0,0 +1,32 @@ +/* + * kmp_ftn_stdcall.cpp -- Fortran __stdcall linkage support for OpenMP. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" + +// Note: This string is not printed when KMP_VERSION=1. 
+char const __kmp_version_ftnstdcall[] = + KMP_VERSION_PREFIX "Fortran __stdcall OMP support: " +#ifdef USE_FTN_STDCALL + "yes"; +#else + "no"; +#endif + +#ifdef USE_FTN_STDCALL + +#define FTN_STDCALL KMP_STDCALL +#define KMP_FTN_ENTRIES USE_FTN_STDCALL + +#include "kmp_ftn_entry.h" +#include "kmp_ftn_os.h" + +#endif /* USE_FTN_STDCALL */ diff --git a/third_party/openmp/kmp_global.cpp b/third_party/openmp/kmp_global.cpp new file mode 100644 index 000000000..5017cd3de --- /dev/null +++ b/third_party/openmp/kmp_global.cpp @@ -0,0 +1,578 @@ +/* + * kmp_global.cpp -- KPTS global variables for runtime support library + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_affinity.h" +#if KMP_USE_HIER_SCHED +#include "kmp_dispatch_hier.h" +#endif + +kmp_key_t __kmp_gtid_threadprivate_key; + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +kmp_cpuinfo_t __kmp_cpuinfo = {0}; // Not initialized +#endif + +#if KMP_STATS_ENABLED +#include "kmp_stats.h" +// lock for modifying the global __kmp_stats_list +kmp_tas_lock_t __kmp_stats_lock; + +// global list of per thread stats, the head is a sentinel node which +// accumulates all stats produced before __kmp_create_worker is called. +kmp_stats_list *__kmp_stats_list; + +// thread local pointer to stats node within list +KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr = NULL; + +// gives reference tick for all events (considered the 0 tick) +tsc_tick_count __kmp_stats_start_time; +#endif + +/* ----------------------------------------------------- */ +/* INITIALIZATION VARIABLES */ +/* they are syncronized to write during init, but read anytime */ +volatile int __kmp_init_serial = FALSE; +volatile int __kmp_init_gtid = FALSE; +volatile int __kmp_init_common = FALSE; +volatile int __kmp_need_register_serial = TRUE; +volatile int __kmp_init_middle = FALSE; +volatile int __kmp_init_parallel = FALSE; +volatile int __kmp_init_hidden_helper = FALSE; +volatile int __kmp_init_hidden_helper_threads = FALSE; +volatile int __kmp_hidden_helper_team_done = FALSE; +#if KMP_USE_MONITOR +volatile int __kmp_init_monitor = + 0; /* 1 - launched, 2 - actually started (Windows* OS only) */ +#endif +volatile int __kmp_init_user_locks = FALSE; + +/* list of address of allocated caches for commons */ +kmp_cached_addr_t *__kmp_threadpriv_cache_list = NULL; + +int __kmp_init_counter = 0; +int __kmp_root_counter = 0; +int __kmp_version = 0; + +std::atomic __kmp_team_counter = 0; +std::atomic __kmp_task_counter = 0; + +size_t __kmp_stksize = KMP_DEFAULT_STKSIZE; +#if KMP_USE_MONITOR +size_t __kmp_monitor_stksize = 0; // auto adjust +#endif +size_t __kmp_stkoffset = KMP_DEFAULT_STKOFFSET; +int __kmp_stkpadding = KMP_MIN_STKPADDING; + +size_t __kmp_malloc_pool_incr = KMP_DEFAULT_MALLOC_POOL_INCR; + +// Barrier method defaults, settings, and strings. 
+// branch factor = 2^branch_bits (only relevant for tree & hyper barrier types) +kmp_uint32 __kmp_barrier_gather_bb_dflt = 2; +/* branch_factor = 4 */ /* hyper2: C78980 */ +kmp_uint32 __kmp_barrier_release_bb_dflt = 2; +/* branch_factor = 4 */ /* hyper2: C78980 */ + +kmp_bar_pat_e __kmp_barrier_gather_pat_dflt = bp_hyper_bar; +/* hyper2: C78980 */ +kmp_bar_pat_e __kmp_barrier_release_pat_dflt = bp_hyper_bar; +/* hyper2: C78980 */ + +kmp_uint32 __kmp_barrier_gather_branch_bits[bs_last_barrier] = {0}; +kmp_uint32 __kmp_barrier_release_branch_bits[bs_last_barrier] = {0}; +kmp_bar_pat_e __kmp_barrier_gather_pattern[bs_last_barrier] = {bp_linear_bar}; +kmp_bar_pat_e __kmp_barrier_release_pattern[bs_last_barrier] = {bp_linear_bar}; +char const *__kmp_barrier_branch_bit_env_name[bs_last_barrier] = { + "KMP_PLAIN_BARRIER", "KMP_FORKJOIN_BARRIER" +#if KMP_FAST_REDUCTION_BARRIER + , + "KMP_REDUCTION_BARRIER" +#endif // KMP_FAST_REDUCTION_BARRIER +}; +char const *__kmp_barrier_pattern_env_name[bs_last_barrier] = { + "KMP_PLAIN_BARRIER_PATTERN", "KMP_FORKJOIN_BARRIER_PATTERN" +#if KMP_FAST_REDUCTION_BARRIER + , + "KMP_REDUCTION_BARRIER_PATTERN" +#endif // KMP_FAST_REDUCTION_BARRIER +}; +char const *__kmp_barrier_type_name[bs_last_barrier] = {"plain", "forkjoin" +#if KMP_FAST_REDUCTION_BARRIER + , + "reduction" +#endif // KMP_FAST_REDUCTION_BARRIER +}; +char const *__kmp_barrier_pattern_name[bp_last_bar] = { + "linear", "tree", "hyper", "hierarchical", "dist"}; + +int __kmp_allThreadsSpecified = 0; +size_t __kmp_align_alloc = CACHE_LINE; + +int __kmp_generate_warnings = kmp_warnings_low; +int __kmp_reserve_warn = 0; +int __kmp_xproc = 0; +int __kmp_avail_proc = 0; +size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE; +int __kmp_sys_max_nth = KMP_MAX_NTH; +int __kmp_max_nth = 0; +int __kmp_cg_max_nth = 0; +int __kmp_task_max_nth = 0; +int __kmp_teams_max_nth = 0; +int __kmp_threads_capacity = 0; +int __kmp_dflt_team_nth = 0; +int __kmp_dflt_team_nth_ub = 0; +int __kmp_tp_capacity = 0; +int __kmp_tp_cached = 0; +int __kmp_dispatch_num_buffers = KMP_DFLT_DISP_NUM_BUFF; +int __kmp_dflt_max_active_levels = 1; // Nesting off by default +bool __kmp_dflt_max_active_levels_set = false; // Don't override set value +#if KMP_NESTED_HOT_TEAMS +int __kmp_hot_teams_mode = 0; /* 0 - free extra threads when reduced */ +/* 1 - keep extra threads when reduced */ +int __kmp_hot_teams_max_level = 1; /* nesting level of hot teams */ +#endif +enum library_type __kmp_library = library_none; +enum sched_type __kmp_sched = + kmp_sch_default; /* scheduling method for runtime scheduling */ +enum sched_type __kmp_static = + kmp_sch_static_greedy; /* default static scheduling method */ +enum sched_type __kmp_guided = + kmp_sch_guided_iterative_chunked; /* default guided scheduling method */ +enum sched_type __kmp_auto = + kmp_sch_guided_analytical_chunked; /* default auto scheduling method */ +#if KMP_USE_HIER_SCHED +int __kmp_dispatch_hand_threading = 0; +int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1]; +int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1]; +kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL}; +#endif +int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // in microseconds +char __kmp_blocktime_units = 'm'; // Units specified in KMP_BLOCKTIME +bool __kmp_wpolicy_passive = false; +#if KMP_USE_MONITOR +int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS; +int __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(KMP_DEFAULT_BLOCKTIME, + KMP_MIN_MONITOR_WAKEUPS); +#endif +#ifdef 
KMP_ADJUST_BLOCKTIME +int __kmp_zero_bt = FALSE; +#endif /* KMP_ADJUST_BLOCKTIME */ +#ifdef KMP_DFLT_NTH_CORES +int __kmp_ncores = 0; +#endif +int __kmp_chunk = 0; +int __kmp_force_monotonic = 0; +int __kmp_abort_delay = 0; +#if (KMP_OS_LINUX || KMP_OS_AIX) && defined(KMP_TDATA_GTID) +int __kmp_gtid_mode = 3; /* use __declspec(thread) TLS to store gtid */ +int __kmp_adjust_gtid_mode = FALSE; +#elif KMP_OS_WINDOWS +int __kmp_gtid_mode = 2; /* use TLS functions to store gtid */ +int __kmp_adjust_gtid_mode = FALSE; +#else +int __kmp_gtid_mode = 0; /* select method to get gtid based on #threads */ +int __kmp_adjust_gtid_mode = TRUE; +#endif /* KMP_OS_LINUX && defined(KMP_TDATA_GTID) */ +#ifdef KMP_TDATA_GTID +KMP_THREAD_LOCAL int __kmp_gtid = KMP_GTID_DNE; +#endif /* KMP_TDATA_GTID */ +int __kmp_tls_gtid_min = INT_MAX; +int __kmp_foreign_tp = TRUE; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +int __kmp_inherit_fp_control = TRUE; +kmp_int16 __kmp_init_x87_fpu_control_word = 0; +kmp_uint32 __kmp_init_mxcsr = 0; +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#ifdef USE_LOAD_BALANCE +double __kmp_load_balance_interval = 1.0; +#endif /* USE_LOAD_BALANCE */ + +kmp_nested_nthreads_t __kmp_nested_nth = {NULL, 0, 0}; + +#if KMP_USE_ADAPTIVE_LOCKS + +kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params = { + 1, 1024}; // TODO: tune it! + +#if KMP_DEBUG_ADAPTIVE_LOCKS +const char *__kmp_speculative_statsfile = "-"; +#endif + +#endif // KMP_USE_ADAPTIVE_LOCKS + +int __kmp_display_env = FALSE; +int __kmp_display_env_verbose = FALSE; +int __kmp_omp_cancellation = FALSE; +int __kmp_nteams = 0; +int __kmp_teams_thread_limit = 0; + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +int __kmp_user_level_mwait = FALSE; +int __kmp_umwait_enabled = FALSE; +int __kmp_mwait_enabled = FALSE; +int __kmp_mwait_hints = 0; +#endif + +#if KMP_HAVE_UMWAIT +int __kmp_waitpkg_enabled = 0; +int __kmp_tpause_state = 0; +int __kmp_tpause_hint = 1; +int __kmp_tpause_enabled = 0; +#endif + +/* map OMP 3.0 schedule types with our internal schedule types */ +enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + + kmp_sched_upper_std - kmp_sched_lower - 2] = { + kmp_sch_static_chunked, // ==> kmp_sched_static = 1 + kmp_sch_dynamic_chunked, // ==> kmp_sched_dynamic = 2 + kmp_sch_guided_chunked, // ==> kmp_sched_guided = 3 + kmp_sch_auto, // ==> kmp_sched_auto = 4 + kmp_sch_trapezoidal // ==> kmp_sched_trapezoidal = 101 + // will likely not be used, introduced here just to debug the code + // of public intel extension schedules +}; + +#if KMP_OS_LINUX +enum clock_function_type __kmp_clock_function; +int __kmp_clock_function_param; +#endif /* KMP_OS_LINUX */ + +#if KMP_MIC_SUPPORTED +enum mic_type __kmp_mic_type = non_mic; +#endif + +#if KMP_AFFINITY_SUPPORTED + +KMPAffinity *__kmp_affinity_dispatch = NULL; + +#if KMP_USE_HWLOC +int __kmp_hwloc_error = FALSE; +hwloc_topology_t __kmp_hwloc_topology = NULL; +#endif + +#if KMP_OS_WINDOWS +#if KMP_GROUP_AFFINITY +int __kmp_num_proc_groups = 1; +#endif /* KMP_GROUP_AFFINITY */ +kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount = NULL; +kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount = NULL; +kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity = NULL; +kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity = NULL; +#endif /* KMP_OS_WINDOWS */ + +size_t __kmp_affin_mask_size = 0; +enum affinity_top_method __kmp_affinity_top_method = + affinity_top_method_default; + +// Regular thread affinity settings from KMP_AFFINITY +kmp_affinity_t 
__kmp_affinity = KMP_AFFINITY_INIT("KMP_AFFINITY"); +// Hidden helper thread affinity settings from KMP_HIDDEN_HELPER_AFFINITY +kmp_affinity_t __kmp_hh_affinity = + KMP_AFFINITY_INIT("KMP_HIDDEN_HELPER_AFFINITY"); +kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity}; + +char *__kmp_cpuinfo_file = NULL; +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +int __kmp_first_osid_with_ecore = -1; +#endif + +#endif /* KMP_AFFINITY_SUPPORTED */ + +kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0}; +kmp_proc_bind_t __kmp_teams_proc_bind = proc_bind_spread; +int __kmp_affinity_num_places = 0; +int __kmp_display_affinity = FALSE; +char *__kmp_affinity_format = NULL; + +kmp_int32 __kmp_default_device = 0; + +kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams; +kmp_int32 __kmp_max_task_priority = 0; +kmp_uint64 __kmp_taskloop_min_tasks = 0; + +int __kmp_memkind_available = 0; +omp_allocator_handle_t const omp_null_allocator = NULL; +omp_allocator_handle_t const omp_default_mem_alloc = + (omp_allocator_handle_t const)1; +omp_allocator_handle_t const omp_large_cap_mem_alloc = + (omp_allocator_handle_t const)2; +omp_allocator_handle_t const omp_const_mem_alloc = + (omp_allocator_handle_t const)3; +omp_allocator_handle_t const omp_high_bw_mem_alloc = + (omp_allocator_handle_t const)4; +omp_allocator_handle_t const omp_low_lat_mem_alloc = + (omp_allocator_handle_t const)5; +omp_allocator_handle_t const omp_cgroup_mem_alloc = + (omp_allocator_handle_t const)6; +omp_allocator_handle_t const omp_pteam_mem_alloc = + (omp_allocator_handle_t const)7; +omp_allocator_handle_t const omp_thread_mem_alloc = + (omp_allocator_handle_t const)8; +omp_allocator_handle_t const llvm_omp_target_host_mem_alloc = + (omp_allocator_handle_t const)100; +omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc = + (omp_allocator_handle_t const)101; +omp_allocator_handle_t const llvm_omp_target_device_mem_alloc = + (omp_allocator_handle_t const)102; +omp_allocator_handle_t const kmp_max_mem_alloc = + (omp_allocator_handle_t const)1024; +omp_allocator_handle_t __kmp_def_allocator = omp_default_mem_alloc; + +omp_memspace_handle_t const omp_default_mem_space = + (omp_memspace_handle_t const)0; +omp_memspace_handle_t const omp_large_cap_mem_space = + (omp_memspace_handle_t const)1; +omp_memspace_handle_t const omp_const_mem_space = + (omp_memspace_handle_t const)2; +omp_memspace_handle_t const omp_high_bw_mem_space = + (omp_memspace_handle_t const)3; +omp_memspace_handle_t const omp_low_lat_mem_space = + (omp_memspace_handle_t const)4; +omp_memspace_handle_t const llvm_omp_target_host_mem_space = + (omp_memspace_handle_t const)100; +omp_memspace_handle_t const llvm_omp_target_shared_mem_space = + (omp_memspace_handle_t const)101; +omp_memspace_handle_t const llvm_omp_target_device_mem_space = + (omp_memspace_handle_t const)102; + +/* This check ensures that the compiler is passing the correct data type for the + flags formal parameter of the function kmpc_omp_task_alloc(). If the type is + not a 4-byte type, then give an error message about a non-positive length + array pointing here. If that happens, the kmp_tasking_flags_t structure must + be redefined to have exactly 32 bits. 
*/ +KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4); + +int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */ +int __kmp_enable_task_throttling = 1; + +#ifdef DEBUG_SUSPEND +int __kmp_suspend_count = 0; +#endif + +int __kmp_settings = FALSE; +int __kmp_duplicate_library_ok = 0; +#if USE_ITT_BUILD +int __kmp_forkjoin_frames = 1; +int __kmp_forkjoin_frames_mode = 3; +#endif +PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method = + reduction_method_not_defined; +int __kmp_determ_red = FALSE; + +#ifdef KMP_DEBUG +int kmp_a_debug = 0; +int kmp_b_debug = 0; +int kmp_c_debug = 0; +int kmp_d_debug = 0; +int kmp_e_debug = 0; +int kmp_f_debug = 0; +int kmp_diag = 0; +#endif + +/* For debug information logging using rotating buffer */ +int __kmp_debug_buf = + FALSE; /* TRUE means use buffer, FALSE means print to stderr */ +int __kmp_debug_buf_lines = + KMP_DEBUG_BUF_LINES_INIT; /* Lines of debug stored in buffer */ +int __kmp_debug_buf_chars = + KMP_DEBUG_BUF_CHARS_INIT; /* Characters allowed per line in buffer */ +int __kmp_debug_buf_atomic = + FALSE; /* TRUE means use atomic update of buffer entry pointer */ + +char *__kmp_debug_buffer = NULL; /* Debug buffer itself */ +std::atomic __kmp_debug_count = + 0; /* number of lines printed in buffer so far */ +int __kmp_debug_buf_warn_chars = + 0; /* Keep track of char increase recommended in warnings */ +/* end rotating debug buffer */ + +#ifdef KMP_DEBUG +int __kmp_par_range; /* +1 => only go par for constructs in range */ +/* -1 => only go par for constructs outside range */ +char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN] = {'\0'}; +char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN] = {'\0'}; +int __kmp_par_range_lb = 0; +int __kmp_par_range_ub = INT_MAX; +#endif /* KMP_DEBUG */ + +/* For printing out dynamic storage map for threads and teams */ +int __kmp_storage_map = + FALSE; /* True means print storage map for threads and teams */ +int __kmp_storage_map_verbose = + FALSE; /* True means storage map includes placement info */ +int __kmp_storage_map_verbose_specified = FALSE; +/* Initialize the library data structures when we fork a child process, defaults + * to TRUE */ +int __kmp_need_register_atfork = + TRUE; /* At initialization, call pthread_atfork to install fork handler */ +int __kmp_need_register_atfork_specified = TRUE; + +int __kmp_env_stksize = FALSE; /* KMP_STACKSIZE specified? */ +int __kmp_env_blocktime = FALSE; /* KMP_BLOCKTIME specified? */ +int __kmp_env_checks = FALSE; /* KMP_CHECKS specified? */ +int __kmp_env_consistency_check = FALSE; /* KMP_CONSISTENCY_CHECK specified? */ + +// From KMP_USE_YIELD: +// 0 = never yield; +// 1 = always yield (default); +// 2 = yield only if oversubscribed +#if KMP_OS_DARWIN && KMP_ARCH_AARCH64 +// Set to 0 for environments where yield is slower +kmp_int32 __kmp_use_yield = 0; +#else +kmp_int32 __kmp_use_yield = 1; +#endif + +// This will be 1 if KMP_USE_YIELD environment variable was set explicitly +kmp_int32 __kmp_use_yield_exp_set = 0; + +kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT; +kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT; +kmp_uint64 __kmp_pause_init = 1; // for tpause + +/* ------------------------------------------------------ */ +/* STATE mostly syncronized with global lock */ +/* data written to rarely by primary threads, read often by workers */ +/* TODO: None of this global padding stuff works consistently because the order + of declaration is not necessarily correlated to storage order. 
To fix this, + all the important globals must be put in a big structure instead. */ +KMP_ALIGN_CACHE +kmp_info_t **__kmp_threads = NULL; +kmp_root_t **__kmp_root = NULL; +kmp_old_threads_list_t *__kmp_old_threads_list = NULL; + +/* data read/written to often by primary threads */ +KMP_ALIGN_CACHE +volatile int __kmp_nth = 0; +volatile int __kmp_all_nth = 0; +volatile kmp_info_t *__kmp_thread_pool = NULL; +volatile kmp_team_t *__kmp_team_pool = NULL; + +KMP_ALIGN_CACHE +std::atomic __kmp_thread_pool_active_nth = 0; + +/* ------------------------------------------------- + * GLOBAL/ROOT STATE */ +KMP_ALIGN_CACHE +kmp_global_t __kmp_global; + +/* ----------------------------------------------- */ +/* GLOBAL SYNCHRONIZATION LOCKS */ +/* TODO verify the need for these locks and if they need to be global */ + +#if KMP_USE_INTERNODE_ALIGNMENT +/* Multinode systems have larger cache line granularity which can cause + * false sharing if the alignment is not large enough for these locks */ +KMP_ALIGN_CACHE_INTERNODE + +KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */ +KMP_ALIGN_CACHE_INTERNODE +KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */ +KMP_ALIGN_CACHE_INTERNODE +KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */ +#if KMP_USE_MONITOR +/* control monitor thread creation */ +KMP_ALIGN_CACHE_INTERNODE +KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock); +#endif +/* used for the hack to allow threadprivate cache and __kmp_threads expansion + to co-exist */ +KMP_ALIGN_CACHE_INTERNODE +KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock); + +KMP_ALIGN_CACHE_INTERNODE +KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */ +KMP_ALIGN_CACHE_INTERNODE +kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ +KMP_ALIGN_CACHE_INTERNODE +KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */ +#else +KMP_ALIGN_CACHE + +KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */ +KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */ +KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */ +#if KMP_USE_MONITOR +/* control monitor thread creation */ +KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock); +#endif +/* used for the hack to allow threadprivate cache and __kmp_threads expansion + to co-exist */ +KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock); + +KMP_ALIGN(128) +KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */ +KMP_ALIGN(128) +kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ +KMP_ALIGN(128) +KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */ +#endif + +/* ----------------------------------------------- */ + +#if KMP_HANDLE_SIGNALS +/* Signal handling is disabled by default, because it confuses users: In case of + sigsegv (or other trouble) in user code signal handler catches the signal, + which then "appears" in the monitor thread (when the monitor executes raise() + function). Users see signal in the monitor thread and blame OpenMP RTL. + + Grant said signal handling required on some older OSes (Irix?) supported by + KAI, because bad applications hung but not aborted. Currently it is not a + problem for Linux* OS, OS X* and Windows* OS. + + Grant: Found new hangs for EL4, EL5, and a Fedora Core machine. So I'm + putting the default back for now to see if that fixes hangs on those + machines. + + 2010-04013 Lev: It was a bug in Fortran RTL. 
Fortran RTL prints a kind of + stack backtrace when program is aborting, but the code is not signal-safe. + When multiple signals raised at the same time (which occurs in dynamic + negative tests because all the worker threads detects the same error), + Fortran RTL may hang. The bug finally fixed in Fortran RTL library provided + by Steve R., and will be available soon. */ +int __kmp_handle_signals = FALSE; +#endif + +#ifdef DEBUG_SUSPEND +int get_suspend_count_(void) { + int count = __kmp_suspend_count; + __kmp_suspend_count = 0; + return count; +} +void set_suspend_count_(int *value) { __kmp_suspend_count = *value; } +#endif + +kmp_target_offload_kind_t __kmp_target_offload = tgt_default; + +// OMP Pause Resources +kmp_pause_status_t __kmp_pause_status = kmp_not_paused; + +// Nesting mode +int __kmp_nesting_mode = 0; +int __kmp_nesting_mode_nlevels = 1; +int *__kmp_nesting_nth_level; + +#if OMPX_TASKGRAPH +// TDG record & replay +int __kmp_tdg_dot = 0; +kmp_int32 __kmp_max_tdgs = 100; +kmp_tdg_info_t **__kmp_global_tdgs = NULL; +kmp_int32 __kmp_curr_tdg_idx = + 0; // Id of the current TDG being recorded or executed +kmp_int32 __kmp_num_tdg = 0; +kmp_int32 __kmp_successors_size = 10; // Initial succesor size list for + // recording +std::atomic __kmp_tdg_task_id = 0; +#endif +// end of file // + diff --git a/third_party/openmp/kmp_gsupport.cpp b/third_party/openmp/kmp_gsupport.cpp new file mode 100644 index 000000000..88189659a --- /dev/null +++ b/third_party/openmp/kmp_gsupport.cpp @@ -0,0 +1,2706 @@ +/* + * kmp_gsupport.cpp + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_atomic.h" +#include "kmp_utils.h" + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +enum { + KMP_GOMP_TASK_UNTIED_FLAG = 1, + KMP_GOMP_TASK_FINAL_FLAG = 2, + KMP_GOMP_TASK_DEPENDS_FLAG = 8 +}; + +enum { + KMP_GOMP_DEPOBJ_IN = 1, + KMP_GOMP_DEPOBJ_OUT = 2, + KMP_GOMP_DEPOBJ_INOUT = 3, + KMP_GOMP_DEPOBJ_MTXINOUTSET = 4 +}; + +// This class helps convert gomp dependency info into +// kmp_depend_info_t structures +class kmp_gomp_depends_info_t { + void **depend; + kmp_int32 num_deps; + size_t num_out, num_mutexinout, num_in, num_depobj; + size_t offset; + +public: + kmp_gomp_depends_info_t(void **depend) : depend(depend) { + size_t ndeps = (kmp_intptr_t)depend[0]; + // GOMP taskdep structure: + // if depend[0] != 0: + // depend = [ ndeps | nout | &out | ... | &out | &in | ... | &in ] + // + // if depend[0] == 0: + // depend = [ 0 | ndeps | nout | nmtx | nin | &out | ... | &out | &mtx | + // ... | &mtx | &in | ... | &in | &depobj | ... 
| &depobj ] + if (ndeps) { + num_out = (kmp_intptr_t)depend[1]; + num_in = ndeps - num_out; + num_mutexinout = num_depobj = 0; + offset = 2; + } else { + ndeps = (kmp_intptr_t)depend[1]; + num_out = (kmp_intptr_t)depend[2]; + num_mutexinout = (kmp_intptr_t)depend[3]; + num_in = (kmp_intptr_t)depend[4]; + num_depobj = ndeps - num_out - num_mutexinout - num_in; + KMP_ASSERT(num_depobj <= ndeps); + offset = 5; + } + num_deps = static_cast(ndeps); + } + kmp_int32 get_num_deps() const { return num_deps; } + kmp_depend_info_t get_kmp_depend(size_t index) const { + kmp_depend_info_t retval; + memset(&retval, '\0', sizeof(retval)); + KMP_ASSERT(index < (size_t)num_deps); + retval.len = 0; + // Because inout and out are logically equivalent, + // use inout and in dependency flags. GOMP does not provide a + // way to distinguish if user specified out vs. inout. + if (index < num_out) { + retval.flags.in = 1; + retval.flags.out = 1; + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + } else if (index >= num_out && index < (num_out + num_mutexinout)) { + retval.flags.mtx = 1; + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + } else if (index >= (num_out + num_mutexinout) && + index < (num_out + num_mutexinout + num_in)) { + retval.flags.in = 1; + retval.base_addr = (kmp_intptr_t)depend[offset + index]; + } else { + // depobj is a two element array (size of elements are size of pointer) + // depobj[0] = base_addr + // depobj[1] = type (in, out, inout, mutexinoutset, etc.) + kmp_intptr_t *depobj = (kmp_intptr_t *)depend[offset + index]; + retval.base_addr = depobj[0]; + switch (depobj[1]) { + case KMP_GOMP_DEPOBJ_IN: + retval.flags.in = 1; + break; + case KMP_GOMP_DEPOBJ_OUT: + retval.flags.out = 1; + break; + case KMP_GOMP_DEPOBJ_INOUT: + retval.flags.in = 1; + retval.flags.out = 1; + break; + case KMP_GOMP_DEPOBJ_MTXINOUTSET: + retval.flags.mtx = 1; + break; + default: + KMP_FATAL(GompFeatureNotSupported, "Unknown depobj type"); + } + } + return retval; + } +}; + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define MKLOC(loc, routine) \ + static ident_t loc = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"}; + +#include "kmp_ftn_os.h" + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_barrier"); + KA_TRACE(20, ("GOMP_barrier: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_barrier(&loc, gtid); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif +} + +// Mutual exclusion + +// The symbol that icc/ifort generates for unnamed for unnamed critical sections +// - .gomp_critical_user_ - is defined using .comm in any objects reference it. +// We can't reference it directly here in C code, as the symbol contains a ".". +// +// The RTL contains an assembly language definition of .gomp_critical_user_ +// with another symbol __kmp_unnamed_critical_addr initialized with it's +// address. 
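+// Illustrative sketch (editor's note, not part of the upstream sources): for
+// an unnamed critical region, gcc emits calls to the GOMP entry points below,
+// which this shim forwards to __kmpc_critical()/__kmpc_end_critical() keyed on
+// __kmp_unnamed_critical_addr. Roughly:
+//
+//   #pragma omp critical            // compiled by gcc as:
+//   { shared_counter++; }           //   GOMP_critical_start();
+//                                   //   shared_counter++;
+//                                   //   GOMP_critical_end();
+//
+// shared_counter is a hypothetical variable used only for illustration.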
+extern kmp_critical_name *__kmp_unnamed_critical_addr; + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_START)(void) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_critical_start"); + KA_TRACE(20, ("GOMP_critical_start: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_critical(&loc, gtid, __kmp_unnamed_critical_addr); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_END)(void) { + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_critical_end"); + KA_TRACE(20, ("GOMP_critical_end: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_end_critical(&loc, gtid, __kmp_unnamed_critical_addr); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_NAME_START)(void **pptr) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_critical_name_start"); + KA_TRACE(20, ("GOMP_critical_name_start: T#%d\n", gtid)); + __kmpc_critical(&loc, gtid, (kmp_critical_name *)pptr); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_NAME_END)(void **pptr) { + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_critical_name_end"); + KA_TRACE(20, ("GOMP_critical_name_end: T#%d\n", gtid)); + __kmpc_end_critical(&loc, gtid, (kmp_critical_name *)pptr); +} + +// The Gnu codegen tries to use locked operations to perform atomic updates +// inline. If it can't, then it calls GOMP_atomic_start() before performing +// the update and GOMP_atomic_end() afterward, regardless of the data type. +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ATOMIC_START)(void) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid)); + +#if OMPT_SUPPORT + __ompt_thread_assign_wait_id(0); +#endif + + __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ATOMIC_END)(void) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("GOMP_atomic_end: T#%d\n", gtid)); + __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); +} + +int KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_START)(void) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_single_start"); + KA_TRACE(20, ("GOMP_single_start: T#%d\n", gtid)); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + // 3rd parameter == FALSE prevents kmp_enter_single from pushing a + // workshare when USE_CHECKS is defined. We need to avoid the push, + // as there is no corresponding GOMP_single_end() call. 
+ kmp_int32 rc = __kmp_enter_single(gtid, &loc, FALSE); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + + if (ompt_enabled.enabled) { + if (rc) { + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_single_executor, ompt_scope_begin, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + 1, OMPT_GET_RETURN_ADDRESS(0)); + } + } else { + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_single_other, ompt_scope_begin, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + 1, OMPT_GET_RETURN_ADDRESS(0)); + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_single_other, ompt_scope_end, + &(team->t.ompt_team_info.parallel_data), + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), + 1, OMPT_GET_RETURN_ADDRESS(0)); + } + } + } +#endif + + return rc; +} + +void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_START)(void) { + void *retval; + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_single_copy_start"); + KA_TRACE(20, ("GOMP_single_copy_start: T#%d\n", gtid)); + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + // If this is the first thread to enter, return NULL. The generated code will + // then call GOMP_single_copy_end() for this thread only, with the + // copyprivate data pointer as an argument. + if (__kmp_enter_single(gtid, &loc, FALSE)) + return NULL; + + // Wait for the first thread to set the copyprivate data pointer, + // and for all other threads to reach this point. + +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + + // Retrieve the value of the copyprivate data point, and wait for all + // threads to do likewise, then return. + retval = __kmp_team_from_gtid(gtid)->t.t_copypriv_data; + { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + } +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + return retval; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_END)(void *data) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("GOMP_single_copy_end: T#%d\n", gtid)); + + // Set the copyprivate data pointer fo the team, then hit the barrier so that + // the other threads will continue on and read it. Hit another barrier before + // continuing, so that the know that the copyprivate data pointer has been + // propagated to all threads before trying to reuse the t_copypriv_data field. 
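+  // Illustrative sketch (editor's note, not from the upstream sources): the
+  // compiler pairs this call with GOMP_single_copy_start() roughly as
+  //
+  //   void *p = GOMP_single_copy_start();
+  //   if (p == NULL) {                  // this thread executes the single
+  //     init(&payload);                 //   block and publishes its data
+  //     GOMP_single_copy_end(&payload);
+  //   } else {
+  //     use_copy(p);                    // other threads read the pointer
+  //   }
+  //
+  // payload, init, and use_copy are hypothetical names used only to show the
+  // shape of the generated code; the two barriers below implement the
+  // publish/consume ordering described above.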
+ __kmp_team_from_gtid(gtid)->t.t_copypriv_data = data; +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); + } +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_START)(void) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_ordered_start"); + KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_ordered(&loc, gtid); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) { + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_ordered_end"); + KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_end_ordered(&loc, gtid); +} + +// Dispatch macro defs +// +// They come in two flavors: 64-bit unsigned, and either 32-bit signed +// (IA-32 architecture) or 64-bit signed (Intel(R) 64). + +#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \ + KMP_ARCH_PPC +#define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_4 +#define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_4 +#define KMP_DISPATCH_NEXT __kmpc_dispatch_next_4 +#else +#define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_8 +#define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_8 +#define KMP_DISPATCH_NEXT __kmpc_dispatch_next_8 +#endif /* KMP_ARCH_X86 */ + +#define KMP_DISPATCH_INIT_ULL __kmp_aux_dispatch_init_8u +#define KMP_DISPATCH_FINI_CHUNK_ULL __kmp_aux_dispatch_fini_chunk_8u +#define KMP_DISPATCH_NEXT_ULL __kmpc_dispatch_next_8u + +// The parallel construct + +#ifndef KMP_DEBUG +static +#endif /* KMP_DEBUG */ + void + __kmp_GOMP_microtask_wrapper(int *gtid, int *npr, void (*task)(void *), + void *data) { +#if OMPT_SUPPORT + kmp_info_t *thr; + ompt_frame_t *ompt_frame; + ompt_state_t enclosing_state; + + if (ompt_enabled.enabled) { + // get pointer to thread data structure + thr = __kmp_threads[*gtid]; + + // save enclosing task state; set current state for task + enclosing_state = thr->th.ompt_thread_info.state; + thr->th.ompt_thread_info.state = ompt_state_work_parallel; + + // set task frame + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + + task(data); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + // clear task frame + ompt_frame->exit_frame = ompt_data_none; + + // restore enclosing state + thr->th.ompt_thread_info.state = enclosing_state; + } +#endif +} + +#ifndef KMP_DEBUG +static +#endif /* KMP_DEBUG */ + void + __kmp_GOMP_parallel_microtask_wrapper(int *gtid, int *npr, + void (*task)(void *), void *data, + unsigned num_threads, ident_t *loc, + enum sched_type schedule, long start, + long end, long incr, + long chunk_size) { + // Initialize the loop worksharing construct. 
+ + KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size, + schedule != kmp_sch_static); + +#if OMPT_SUPPORT + kmp_info_t *thr; + ompt_frame_t *ompt_frame; + ompt_state_t enclosing_state; + + if (ompt_enabled.enabled) { + thr = __kmp_threads[*gtid]; + // save enclosing task state; set current state for task + enclosing_state = thr->th.ompt_thread_info.state; + thr->th.ompt_thread_info.state = ompt_state_work_parallel; + + // set task frame + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + + // Now invoke the microtask. + task(data); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + // clear task frame + ompt_frame->exit_frame = ompt_data_none; + + // reset enclosing state + thr->th.ompt_thread_info.state = enclosing_state; + } +#endif +} + +static void __kmp_GOMP_fork_call(ident_t *loc, int gtid, unsigned num_threads, + unsigned flags, void (*unwrapped_task)(void *), + microtask_t wrapper, int argc, ...) { + int rc; + kmp_info_t *thr = __kmp_threads[gtid]; + kmp_team_t *team = thr->th.th_team; + int tid = __kmp_tid_from_gtid(gtid); + + va_list ap; + va_start(ap, argc); + + if (num_threads != 0) + __kmp_push_num_threads(loc, gtid, num_threads); + if (flags != 0) + __kmp_push_proc_bind(loc, gtid, (kmp_proc_bind_t)flags); + rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper, + __kmp_invoke_task_func, kmp_va_addr_of(ap)); + + va_end(ap); + + if (rc) { + __kmp_run_before_invoked_task(gtid, tid, thr, team); + } + +#if OMPT_SUPPORT + int ompt_team_size; + if (ompt_enabled.enabled) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + + // implicit task callback + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_team_size = __kmp_team_from_gtid(gtid)->t.t_nproc; + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), ompt_team_size, __kmp_tid_from_gtid(gtid), + ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
+ task_info->thread_num = __kmp_tid_from_gtid(gtid); + } + thr->th.ompt_thread_info.state = ompt_state_work_parallel; + } +#endif +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), + void *data, + unsigned num_threads) { + int gtid = __kmp_entry_gtid(); + +#if OMPT_SUPPORT + ompt_frame_t *parent_frame, *frame; + + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); + parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + MKLOC(loc, "GOMP_parallel_start"); + KA_TRACE(20, ("GOMP_parallel_start: T#%d\n", gtid)); + __kmp_GOMP_fork_call(&loc, gtid, num_threads, 0u, task, + (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, + data); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &frame, NULL, NULL); + frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_parallel_begin(); +#endif +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) { + int gtid = __kmp_get_gtid(); + kmp_info_t *thr; + + thr = __kmp_threads[gtid]; + + MKLOC(loc, "GOMP_parallel_end"); + KA_TRACE(20, ("GOMP_parallel_end: T#%d\n", gtid)); + + if (!thr->th.th_team->t.t_serialized) { + __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr, + thr->th.th_team); + } +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + // Implicit task is finished here, in the barrier we might schedule + // deferred tasks, + // these don't see the implicit task on the stack + OMPT_CUR_TASK_INFO(thr)->frame.exit_frame = ompt_data_none; + } +#endif + + __kmp_join_call(&loc, gtid +#if OMPT_SUPPORT + , + fork_context_gnu +#endif + ); +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_parallel_end(); +#endif +} + +// Loop worksharing constructs + +// The Gnu codegen passes in an exclusive upper bound for the overall range, +// but the libguide dispatch code expects an inclusive upper bound, hence the +// "end - incr" 5th argument to KMP_DISPATCH_INIT (and the " ub - str" 11th +// argument to __kmp_GOMP_fork_call). +// +// Conversely, KMP_DISPATCH_NEXT returns and inclusive upper bound in *p_ub, +// but the Gnu codegen expects an exclusive upper bound, so the adjustment +// "*p_ub += stride" compensates for the discrepancy. +// +// Correction: the gnu codegen always adjusts the upper bound by +-1, not the +// stride value. We adjust the dispatch parameters accordingly (by +-1), but +// we still adjust p_ub by the actual stride value. +// +// The "runtime" versions do not take a chunk_sz parameter. +// +// The profile lib cannot support construct checking of unordered loops that +// are predetermined by the compiler to be statically scheduled, as the gcc +// codegen will not always emit calls to GOMP_loop_static_next() to get the +// next iteration. Instead, it emits inline code to call omp_get_thread_num() +// num and calculate the iteration space using the result. It doesn't do this +// with ordered static loop, so they can be checked. 
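+// Worked example (editor's note, not from the upstream sources): for a gcc
+// loop with lb = 0, exclusive ub = 10, and str = 1, KMP_DISPATCH_INIT below is
+// given the inclusive bound ub - 1 = 9. If KMP_DISPATCH_NEXT then hands back
+// the chunk [0, 4] (inclusive), the "*p_ub += 1" adjustment reports it to the
+// gcc caller as [0, 5), i.e. the exclusive-bound convention the codegen
+// expects. For a negative stride the same adjustments are applied with the
+// opposite sign.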
+ +#if OMPT_SUPPORT +#define IF_OMPT_SUPPORT(code) code +#else +#define IF_OMPT_SUPPORT(code) +#endif + +#define LOOP_START(func, schedule) \ + int func(long lb, long ub, long str, long chunk_sz, long *p_lb, \ + long *p_ub) { \ + int status; \ + long stride; \ + int gtid = __kmp_entry_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \ + gtid, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? (lb < ub) : (lb > ub)) { \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + } \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ + (kmp_int *)p_ub, (kmp_int *)&stride); \ + } \ + if (status) { \ + KMP_DEBUG_ASSERT(stride == str); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + return status; \ + } + +#define LOOP_RUNTIME_START(func, schedule) \ + int func(long lb, long ub, long str, long *p_lb, long *p_ub) { \ + int status; \ + long stride; \ + long chunk_sz = 0; \ + int gtid = __kmp_entry_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + KA_TRACE( \ + 20, \ + (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n", \ + gtid, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? (lb < ub) : (lb > ub)) { \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ + TRUE); \ + } \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ + (kmp_int *)p_ub, (kmp_int *)&stride); \ + } \ + if (status) { \ + KMP_DEBUG_ASSERT(stride == str); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + return status; \ + } + +#define KMP_DOACROSS_FINI(status, gtid) \ + if (!status && __kmp_threads[gtid]->th.th_dispatch->th_doacross_flags) { \ + __kmpc_doacross_fini(NULL, gtid); \ + } + +#define LOOP_NEXT(func, fini_code) \ + int func(long *p_lb, long *p_ub) { \ + int status; \ + long stride; \ + int gtid = __kmp_get_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid)); \ + \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + fini_code status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ + (kmp_int *)p_ub, (kmp_int *)&stride); \ + if (status) { \ + *p_ub += (stride > 0) ? 
1 : -1; \ + } \ + KMP_DOACROSS_FINI(status, gtid) \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR(func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, stride 0x%lx, " \ + "returning %d\n", \ + gtid, *p_lb, *p_ub, stride, status)); \ + return status; \ + } + +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_START), kmp_sch_static) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT), {}) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START), + kmp_sch_dynamic_chunked) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START), + kmp_sch_dynamic_chunked) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_NEXT), {}) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_START), + kmp_sch_guided_chunked) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START), + kmp_sch_guided_chunked) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT), {}) +LOOP_RUNTIME_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT), {}) + +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START), + kmp_ord_static) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT), + { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START), + kmp_ord_dynamic_chunked) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT), + { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START), + kmp_ord_guided_chunked) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT), + { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) +LOOP_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START), + kmp_ord_runtime) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT), + { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) + +#define LOOP_DOACROSS_START(func, schedule) \ + bool func(unsigned ncounts, long *counts, long chunk_sz, long *p_lb, \ + long *p_ub) { \ + int status; \ + long stride, lb, ub, str; \ + int gtid = __kmp_entry_gtid(); \ + struct kmp_dim *dims = \ + (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ + MKLOC(loc, KMP_STR(func)); \ + for (unsigned i = 0; i < ncounts; ++i) { \ + dims[i].lo = 0; \ + dims[i].up = counts[i] - 1; \ + dims[i].st = 1; \ + } \ + __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ + lb = 0; \ + ub = counts[0]; \ + str = 1; \ + KA_TRACE(20, (KMP_STR(func) ": T#%d, ncounts %u, lb 0x%lx, ub 0x%lx, str " \ + "0x%lx, chunk_sz " \ + "0x%lx\n", \ + gtid, ncounts, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? (lb < ub) : (lb > ub)) { \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ + (kmp_int *)p_ub, (kmp_int *)&stride); \ + if (status) { \ + KMP_DEBUG_ASSERT(stride == str); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + KMP_DOACROSS_FINI(status, gtid); \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + __kmp_free(dims); \ + return status; \ + } + +#define LOOP_DOACROSS_RUNTIME_START(func, schedule) \ + int func(unsigned ncounts, long *counts, long *p_lb, long *p_ub) { \ + int status; \ + long stride, lb, ub, str; \ + long chunk_sz = 0; \ + int gtid = __kmp_entry_gtid(); \ + struct kmp_dim *dims = \ + (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ + MKLOC(loc, KMP_STR(func)); \ + for (unsigned i = 0; i < ncounts; ++i) { \ + dims[i].lo = 0; \ + dims[i].up = counts[i] - 1; \ + dims[i].st = 1; \ + } \ + __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ + lb = 0; \ + ub = counts[0]; \ + str = 1; \ + KA_TRACE( \ + 20, \ + (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n", \ + gtid, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? (lb < ub) : (lb > ub)) { \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \ + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ + (kmp_int *)p_ub, (kmp_int *)&stride); \ + if (status) { \ + KMP_DEBUG_ASSERT(stride == str); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + KMP_DOACROSS_FINI(status, gtid); \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + __kmp_free(dims); \ + return status; \ + } + +LOOP_DOACROSS_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START), + kmp_sch_static) +LOOP_DOACROSS_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START), + kmp_sch_dynamic_chunked) +LOOP_DOACROSS_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START), + kmp_sch_guided_chunked) +LOOP_DOACROSS_RUNTIME_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START), + kmp_sch_runtime) + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END)(void) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("GOMP_loop_end: T#%d\n", gtid)) + +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + OMPT_STORE_RETURN_ADDRESS(gtid); + } +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + KA_TRACE(20, ("GOMP_loop_end exit: T#%d\n", gtid)) +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END_NOWAIT)(void) { + KA_TRACE(20, ("GOMP_loop_end_nowait: T#%d\n", __kmp_get_gtid())) +} + +// Unsigned long long loop worksharing constructs +// +// These are new with gcc 4.4 + +#define LOOP_START_ULL(func, schedule) \ + int func(int up, unsigned long long lb, unsigned long long ub, \ + unsigned long long str, unsigned long long chunk_sz, \ + unsigned long long *p_lb, unsigned long long *p_ub) { \ + int status; \ + long long str2 = up ? 
((long long)str) : -((long long)str); \ + long long stride; \ + int gtid = __kmp_entry_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + \ + KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str " \ + "0x%llx, chunk_sz 0x%llx\n", \ + gtid, up, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? (lb < ub) : (lb > ub)) { \ + KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ + (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz, \ + (schedule) != kmp_sch_static); \ + status = \ + KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ + (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ + if (status) { \ + KMP_DEBUG_ASSERT(stride == str2); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + return status; \ + } + +#define LOOP_RUNTIME_START_ULL(func, schedule) \ + int func(int up, unsigned long long lb, unsigned long long ub, \ + unsigned long long str, unsigned long long *p_lb, \ + unsigned long long *p_ub) { \ + int status; \ + long long str2 = up ? ((long long)str) : -((long long)str); \ + unsigned long long stride; \ + unsigned long long chunk_sz = 0; \ + int gtid = __kmp_entry_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + \ + KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str " \ + "0x%llx, chunk_sz 0x%llx\n", \ + gtid, up, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? (lb < ub) : (lb > ub)) { \ + KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ + (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz, \ + TRUE); \ + status = \ + KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ + (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ + if (status) { \ + KMP_DEBUG_ASSERT((long long)stride == str2); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + return status; \ + } + +#define LOOP_NEXT_ULL(func, fini_code) \ + int func(unsigned long long *p_lb, unsigned long long *p_ub) { \ + int status; \ + long long stride; \ + int gtid = __kmp_get_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid)); \ + \ + fini_code status = \ + KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ + (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ + if (status) { \ + *p_ub += (stride > 0) ? 
1 : -1; \ + } \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, stride 0x%llx, " \ + "returning %d\n", \ + gtid, *p_lb, *p_ub, stride, status)); \ + return status; \ + } + +LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START), + kmp_sch_static) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT), {}) +LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START), + kmp_sch_dynamic_chunked) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT), {}) +LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START), + kmp_sch_guided_chunked) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT), {}) +LOOP_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START), + kmp_sch_dynamic_chunked) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_NEXT), {}) +LOOP_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START), + kmp_sch_guided_chunked) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_NEXT), {}) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START), kmp_sch_runtime) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT), {}) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME( + KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START), + kmp_sch_runtime) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT), + {}) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT), {}) + +LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START), + kmp_ord_static) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT), + { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) +LOOP_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START), + kmp_ord_dynamic_chunked) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT), + { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) +LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START), + kmp_ord_guided_chunked) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT), + { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) +LOOP_RUNTIME_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START), + kmp_ord_runtime) +LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT), + { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) + +#define LOOP_DOACROSS_START_ULL(func, schedule) \ + int func(unsigned ncounts, unsigned long long *counts, \ + unsigned long long chunk_sz, unsigned long long *p_lb, \ + unsigned long long *p_ub) { \ + int status; \ + long long stride, str, lb, ub; \ + int gtid = __kmp_entry_gtid(); \ + struct kmp_dim *dims = \ + (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ + MKLOC(loc, KMP_STR(func)); \ + for (unsigned i = 0; i < ncounts; ++i) { \ + dims[i].lo = 0; \ + dims[i].up = counts[i] - 1; \ + dims[i].st = 1; \ + } \ + __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ + lb = 0; \ + ub = counts[0]; \ + str = 1; \ + \ + KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str " \ + "0x%llx, chunk_sz 0x%llx\n", \ + gtid, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? 
(lb < ub) : (lb > ub)) { \ + KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + status = \ + KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ + (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ + if (status) { \ + KMP_DEBUG_ASSERT(stride == str); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + KMP_DOACROSS_FINI(status, gtid); \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + __kmp_free(dims); \ + return status; \ + } + +#define LOOP_DOACROSS_RUNTIME_START_ULL(func, schedule) \ + int func(unsigned ncounts, unsigned long long *counts, \ + unsigned long long *p_lb, unsigned long long *p_ub) { \ + int status; \ + unsigned long long stride, str, lb, ub; \ + unsigned long long chunk_sz = 0; \ + int gtid = __kmp_entry_gtid(); \ + struct kmp_dim *dims = \ + (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ + MKLOC(loc, KMP_STR(func)); \ + for (unsigned i = 0; i < ncounts; ++i) { \ + dims[i].lo = 0; \ + dims[i].up = counts[i] - 1; \ + dims[i].st = 1; \ + } \ + __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ + lb = 0; \ + ub = counts[0]; \ + str = 1; \ + KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str " \ + "0x%llx, chunk_sz 0x%llx\n", \ + gtid, lb, ub, str, chunk_sz)); \ + \ + if ((str > 0) ? (lb < ub) : (lb > ub)) { \ + KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ + TRUE); \ + status = \ + KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ + (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ + if (status) { \ + KMP_DEBUG_ASSERT(stride == str); \ + *p_ub += (str > 0) ? 1 : -1; \ + } \ + } else { \ + status = 0; \ + } \ + KMP_DOACROSS_FINI(status, gtid); \ + \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ + gtid, *p_lb, *p_ub, status)); \ + __kmp_free(dims); \ + return status; \ + } + +LOOP_DOACROSS_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START), + kmp_sch_static) +LOOP_DOACROSS_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START), + kmp_sch_dynamic_chunked) +LOOP_DOACROSS_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START), + kmp_sch_guided_chunked) +LOOP_DOACROSS_RUNTIME_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START), + kmp_sch_runtime) + +// Combined parallel / loop worksharing constructs +// +// There are no ull versions (yet). + +#define PARALLEL_LOOP_START(func, schedule, ompt_pre, ompt_post) \ + void func(void (*task)(void *), void *data, unsigned num_threads, long lb, \ + long ub, long str, long chunk_sz) { \ + int gtid = __kmp_entry_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \ + gtid, lb, ub, str, chunk_sz)); \ + \ + ompt_pre(); \ + \ + __kmp_GOMP_fork_call(&loc, gtid, num_threads, 0u, task, \ + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ + 9, task, data, num_threads, &loc, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid)); \ + \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + \ + ompt_post(); \ + \ + KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid)); \ + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + +#define OMPT_LOOP_PRE() \ + ompt_frame_t *parent_frame; \ + if (ompt_enabled.enabled) { \ + __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); \ + parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); \ + OMPT_STORE_RETURN_ADDRESS(gtid); \ + } + +#define OMPT_LOOP_POST() \ + if (ompt_enabled.enabled) { \ + parent_frame->enter_frame = ompt_data_none; \ + } + +#else + +#define OMPT_LOOP_PRE() + +#define OMPT_LOOP_POST() + +#endif + +PARALLEL_LOOP_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START), + kmp_sch_static, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START), + kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START), + kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP_START( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) + +// Tasking constructs + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, + void (*copy_func)(void *, void *), + long arg_size, long arg_align, + bool if_cond, unsigned gomp_flags, + void **depend) { + MKLOC(loc, "GOMP_task"); + int gtid = __kmp_entry_gtid(); + kmp_int32 flags = 0; + kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; + + KA_TRACE(20, ("GOMP_task: T#%d\n", gtid)); + + // The low-order bit is the "untied" flag + if (!(gomp_flags & KMP_GOMP_TASK_UNTIED_FLAG)) { + input_flags->tiedness = TASK_TIED; + } + // The second low-order bit is the "final" flag + if (gomp_flags & KMP_GOMP_TASK_FINAL_FLAG) { + input_flags->final = 1; + } + input_flags->native = 1; + // __kmp_task_alloc() sets up all other flags + + if (!if_cond) { + arg_size = 0; + } + + kmp_task_t *task = __kmp_task_alloc( + &loc, gtid, input_flags, sizeof(kmp_task_t), + arg_size ? arg_size + arg_align - 1 : 0, (kmp_routine_entry_t)func); + + if (arg_size > 0) { + if (arg_align > 0) { + task->shareds = (void *)((((size_t)task->shareds) + arg_align - 1) / + arg_align * arg_align); + } + // else error?? 
+ + if (copy_func) { + (*copy_func)(task->shareds, data); + } else { + KMP_MEMCPY(task->shareds, data, arg_size); + } + } + +#if OMPT_SUPPORT + kmp_taskdata_t *current_task; + if (ompt_enabled.enabled) { + current_task = __kmp_threads[gtid]->th.th_current_task; + current_task->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + if (if_cond) { + if (gomp_flags & KMP_GOMP_TASK_DEPENDS_FLAG) { + KMP_ASSERT(depend); + kmp_gomp_depends_info_t gomp_depends(depend); + kmp_int32 ndeps = gomp_depends.get_num_deps(); + SimpleVLA<kmp_depend_info_t> dep_list(ndeps); + for (kmp_int32 i = 0; i < ndeps; i++) + dep_list[i] = gomp_depends.get_kmp_depend(i); + kmp_int32 ndeps_cnv; + __kmp_type_convert(ndeps, &ndeps_cnv); + __kmpc_omp_task_with_deps(&loc, gtid, task, ndeps_cnv, dep_list, 0, NULL); + } else { + __kmpc_omp_task(&loc, gtid, task); + } + } else { +#if OMPT_SUPPORT + ompt_thread_info_t oldInfo; + kmp_info_t *thread; + kmp_taskdata_t *taskdata; + if (ompt_enabled.enabled) { + // Store the threads states and restore them after the task + thread = __kmp_threads[gtid]; + taskdata = KMP_TASK_TO_TASKDATA(task); + oldInfo = thread->th.ompt_thread_info; + thread->th.ompt_thread_info.wait_id = 0; + thread->th.ompt_thread_info.state = ompt_state_work_parallel; + taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + if (gomp_flags & KMP_GOMP_TASK_DEPENDS_FLAG) { + KMP_ASSERT(depend); + kmp_gomp_depends_info_t gomp_depends(depend); + kmp_int32 ndeps = gomp_depends.get_num_deps(); + SimpleVLA<kmp_depend_info_t> dep_list(ndeps); + for (kmp_int32 i = 0; i < ndeps; i++) + dep_list[i] = gomp_depends.get_kmp_depend(i); + __kmpc_omp_wait_deps(&loc, gtid, ndeps, dep_list, 0, NULL); + } + + __kmpc_omp_task_begin_if0(&loc, gtid, task); + func(data); + __kmpc_omp_task_complete_if0(&loc, gtid, task); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + thread->th.ompt_thread_info = oldInfo; + taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; + } +#endif + } +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + current_task->ompt_task_info.frame.enter_frame = ompt_data_none; + } +#endif + + KA_TRACE(20, ("GOMP_task exit: T#%d\n", gtid)); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT)(void) { + MKLOC(loc, "GOMP_taskwait"); + int gtid = __kmp_entry_gtid(); + +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + KA_TRACE(20, ("GOMP_taskwait: T#%d\n", gtid)); + + __kmpc_omp_taskwait(&loc, gtid); + + KA_TRACE(20, ("GOMP_taskwait exit: T#%d\n", gtid)); +} + +// Sections worksharing constructs +// +// For the sections construct, we initialize a dynamically scheduled loop +// worksharing construct with lb 1 and stride 1, and use the iteration #'s +// that it returns as section ids. +// +// There are no special entry points for ordered sections, so we always use +// the dynamically scheduled workshare, even if the sections aren't ordered.
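+ // Illustrative sketch (not part of the upstream runtime): a GOMP-compatible
+ // compiler is assumed to lower
+ //
+ //   #pragma omp sections
+ //   {
+ //   #pragma omp section
+ //     A();
+ //   #pragma omp section
+ //     B();
+ //   }
+ //
+ // into roughly the following calls against the entry points below, where
+ // each nonzero id handed back by the dynamic workshare selects one section:
+ //
+ //   for (unsigned id = GOMP_sections_start(2); id != 0;
+ //        id = GOMP_sections_next()) {
+ //     switch (id) {
+ //     case 1: A(); break;
+ //     case 2: B(); break;
+ //     }
+ //   }
+ //   GOMP_sections_end();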
+ +unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_START)(unsigned count) { + int status; + kmp_int lb, ub, stride; + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_sections_start"); + KA_TRACE(20, ("GOMP_sections_start: T#%d\n", gtid)); + + KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); + + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride); + if (status) { + KMP_DEBUG_ASSERT(stride == 1); + KMP_DEBUG_ASSERT(lb > 0); + KMP_ASSERT(lb == ub); + } else { + lb = 0; + } + + KA_TRACE(20, ("GOMP_sections_start exit: T#%d returning %u\n", gtid, + (unsigned)lb)); + return (unsigned)lb; +} + +unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_NEXT)(void) { + int status; + kmp_int lb, ub, stride; + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_sections_next"); + KA_TRACE(20, ("GOMP_sections_next: T#%d\n", gtid)); + +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride); + if (status) { + KMP_DEBUG_ASSERT(stride == 1); + KMP_DEBUG_ASSERT(lb > 0); + KMP_ASSERT(lb == ub); + } else { + lb = 0; + } + + KA_TRACE( + 20, ("GOMP_sections_next exit: T#%d returning %u\n", gtid, (unsigned)lb)); + return (unsigned)lb; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START)( + void (*task)(void *), void *data, unsigned num_threads, unsigned count) { + int gtid = __kmp_entry_gtid(); + +#if OMPT_SUPPORT + ompt_frame_t *parent_frame; + + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); + parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + MKLOC(loc, "GOMP_parallel_sections_start"); + KA_TRACE(20, ("GOMP_parallel_sections_start: T#%d\n", gtid)); + + __kmp_GOMP_fork_call(&loc, gtid, num_threads, 0u, task, + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, + task, data, num_threads, &loc, kmp_nm_dynamic_chunked, + (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + parent_frame->enter_frame = ompt_data_none; + } +#endif + + KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); + + KA_TRACE(20, ("GOMP_parallel_sections_start exit: T#%d\n", gtid)); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END)(void) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("GOMP_sections_end: T#%d\n", gtid)) + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_frame->enter_frame = ompt_data_none; + } +#endif + + KA_TRACE(20, ("GOMP_sections_end exit: T#%d\n", gtid)) +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT)(void) { + KA_TRACE(20, ("GOMP_sections_end_nowait: T#%d\n", __kmp_get_gtid())) +} + +// libgomp has an empty function for GOMP_taskyield as of 2013-10-10 +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKYIELD)(void) { + KA_TRACE(20, ("GOMP_taskyield: T#%d\n", __kmp_get_gtid())) + return; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), + void *data, + unsigned num_threads, + unsigned int flags) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_parallel"); + KA_TRACE(20, ("GOMP_parallel: T#%d\n", gtid)); + +#if OMPT_SUPPORT + 
ompt_task_info_t *parent_task_info, *task_info; + if (ompt_enabled.enabled) { + parent_task_info = __ompt_get_task_info_object(0); + parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task, + (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, + data); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + task_info = __ompt_get_task_info_object(0); + task_info->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + task(data); + { +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); + } +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + task_info->frame.exit_frame = ompt_data_none; + parent_task_info->frame.enter_frame = ompt_data_none; + } +#endif +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), + void *data, + unsigned num_threads, + unsigned count, + unsigned flags) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_parallel_sections"); + KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid)); + +#if OMPT_SUPPORT + ompt_frame_t *task_frame; + kmp_info_t *thr; + if (ompt_enabled.enabled) { + thr = __kmp_threads[gtid]; + task_frame = &(thr->th.th_current_task->ompt_task_info.frame); + task_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task, + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, + task, data, num_threads, &loc, kmp_nm_dynamic_chunked, + (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); + + { +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); + } + +#if OMPT_SUPPORT + ompt_frame_t *child_frame; + if (ompt_enabled.enabled) { + child_frame = &(thr->th.th_current_task->ompt_task_info.frame); + child_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + + task(data); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + child_frame->exit_frame = ompt_data_none; + } +#endif + + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); + KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid)); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + task_frame->enter_frame = ompt_data_none; + } +#endif +} + +#define PARALLEL_LOOP(func, schedule, ompt_pre, ompt_post) \ + void func(void (*task)(void *), void *data, unsigned num_threads, long lb, \ + long ub, long str, long chunk_sz, unsigned flags) { \ + int gtid = __kmp_entry_gtid(); \ + MKLOC(loc, KMP_STR(func)); \ + KA_TRACE( \ + 20, \ + (KMP_STR( \ + func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \ + gtid, lb, ub, str, chunk_sz)); \ + \ + ompt_pre(); \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task, \ + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ + 9, task, data, num_threads, &loc, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ + \ + { \ + IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + } \ + task(data); \ + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); \ + ompt_post(); \ + \ + KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid)); \ + } + +PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC), + kmp_sch_static, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC), + kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED), + kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC), + kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED), + kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME), + kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_START)(void) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_taskgroup_start"); + KA_TRACE(20, ("GOMP_taskgroup_start: T#%d\n", gtid)); + +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + __kmpc_taskgroup(&loc, gtid); + + return; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_END)(void) { + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_taskgroup_end"); + KA_TRACE(20, ("GOMP_taskgroup_end: T#%d\n", gtid)); + +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + + __kmpc_end_taskgroup(&loc, gtid); + + return; +} + +static kmp_int32 __kmp_gomp_to_omp_cancellation_kind(int gomp_kind) { + kmp_int32 cncl_kind = 0; + switch (gomp_kind) { + case 1: + cncl_kind = cancel_parallel; + break; + case 2: + cncl_kind = cancel_loop; + break; + case 4: + cncl_kind = cancel_sections; + break; + case 8: + cncl_kind = cancel_taskgroup; + break; + } + return cncl_kind; +} + +// Return true if cancellation should take place, false otherwise +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CANCELLATION_POINT)(int which) { + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_cancellation_point"); + KA_TRACE(20, ("GOMP_cancellation_point: T#%d which:%d\n", gtid, which)); + kmp_int32 cncl_kind = __kmp_gomp_to_omp_cancellation_kind(which); + return __kmpc_cancellationpoint(&loc, gtid, cncl_kind); +} + +// Return true if cancellation should take place, false otherwise +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CANCEL)(int which, bool do_cancel) { + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_cancel"); + KA_TRACE(20, ("GOMP_cancel: T#%d which:%d do_cancel:%d\n", gtid, which, + (int)do_cancel)); + kmp_int32 cncl_kind = __kmp_gomp_to_omp_cancellation_kind(which); + + if (do_cancel == FALSE) { + return __kmpc_cancellationpoint(&loc, gtid, cncl_kind); + } else { + return __kmpc_cancel(&loc, gtid, cncl_kind); + } +} + +// Return true if cancellation should take place, false otherwise +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER_CANCEL)(void) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("GOMP_barrier_cancel: T#%d\n", gtid)); + return __kmp_barrier_gomp_cancel(gtid); +} + +// Return true if cancellation should take place, false otherwise +bool 
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL)(void) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("GOMP_sections_end_cancel: T#%d\n", gtid)); + return __kmp_barrier_gomp_cancel(gtid); +} + +// Return true if cancellation should take place, false otherwise +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END_CANCEL)(void) { + int gtid = __kmp_get_gtid(); + KA_TRACE(20, ("GOMP_loop_end_cancel: T#%d\n", gtid)); + return __kmp_barrier_gomp_cancel(gtid); +} + +// All target functions are empty as of 2014-05-29 +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET)(int device, void (*fn)(void *), + const void *openmp_target, + size_t mapnum, void **hostaddrs, + size_t *sizes, + unsigned char *kinds) { + return; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_DATA)( + int device, const void *openmp_target, size_t mapnum, void **hostaddrs, + size_t *sizes, unsigned char *kinds) { + return; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_END_DATA)(void) { return; } + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_UPDATE)( + int device, const void *openmp_target, size_t mapnum, void **hostaddrs, + size_t *sizes, unsigned char *kinds) { + return; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TEAMS)(unsigned int num_teams, + unsigned int thread_limit) { + return; +} + +// Task duplication function which copies src to dest (both are +// preallocated task structures) +static void __kmp_gomp_task_dup(kmp_task_t *dest, kmp_task_t *src, + kmp_int32 last_private) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(src); + if (taskdata->td_copy_func) { + (taskdata->td_copy_func)(dest->shareds, src->shareds); + } +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER)( + uintptr_t *); + +#ifdef __cplusplus +} // extern "C" +#endif + +template <typename T> +void __GOMP_taskloop(void (*func)(void *), void *data, + void (*copy_func)(void *, void *), long arg_size, + long arg_align, unsigned gomp_flags, + unsigned long num_tasks, int priority, T start, T end, + T step) { + typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); + MKLOC(loc, "GOMP_taskloop"); + int sched; + T *loop_bounds; + int gtid = __kmp_entry_gtid(); + kmp_int32 flags = 0; + int if_val = gomp_flags & (1u << 10); + int nogroup = gomp_flags & (1u << 11); + int up = gomp_flags & (1u << 8); + int reductions = gomp_flags & (1u << 12); + p_task_dup_t task_dup = NULL; + kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; +#ifdef KMP_DEBUG + { + char *buff; + buff = __kmp_str_format( + "GOMP_taskloop: T#%%d: func:%%p data:%%p copy_func:%%p " + "arg_size:%%ld arg_align:%%ld gomp_flags:0x%%x num_tasks:%%lu " + "priority:%%d start:%%%s end:%%%s step:%%%s\n", + traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); + KA_TRACE(20, (buff, gtid, func, data, copy_func, arg_size, arg_align, + gomp_flags, num_tasks, priority, start, end, step)); + __kmp_str_free(&buff); + } +#endif + KMP_ASSERT((size_t)arg_size >= 2 * sizeof(T)); + KMP_ASSERT(arg_align > 0); + // The low-order bit is the "untied" flag + if (!(gomp_flags & 1)) { + input_flags->tiedness = TASK_TIED; + } + // The second low-order bit is the "final" flag + if (gomp_flags & 2) { + input_flags->final = 1; + } + // Negative step flag + if (!up) { + // If step is flagged as negative, but isn't properly sign extended, + // then manually sign extend it. Could be a short, int, char embedded + // in a long. So cannot assume any cast.
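+ // Worked example: with T == long (64 bits) and a 32-bit step of -2 passed
+ // up as 0x00000000FFFFFFFE, the loop below walks down from bit 63, ORing
+ // in each clear bit until it reaches the first set bit (bit 31), leaving
+ // 0xFFFFFFFFFFFFFFFE, i.e. -2 properly sign extended to the full width.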
+ if (step > 0) { + for (int i = sizeof(T) * CHAR_BIT - 1; i >= 0L; --i) { + // break at the first 1 bit + if (step & ((T)1 << i)) + break; + step |= ((T)1 << i); + } + } + } + input_flags->native = 1; + // Figure out if none/grainsize/num_tasks clause specified + if (num_tasks > 0) { + if (gomp_flags & (1u << 9)) + sched = 1; // grainsize specified + else + sched = 2; // num_tasks specified + // neither grainsize nor num_tasks specified + } else { + sched = 0; + } + + // __kmp_task_alloc() sets up all other flags + kmp_task_t *task = + __kmp_task_alloc(&loc, gtid, input_flags, sizeof(kmp_task_t), + arg_size + arg_align - 1, (kmp_routine_entry_t)func); + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + taskdata->td_copy_func = copy_func; + taskdata->td_size_loop_bounds = sizeof(T); + + // re-align shareds if needed and setup firstprivate copy constructors + // through the task_dup mechanism + task->shareds = (void *)((((size_t)task->shareds) + arg_align - 1) / + arg_align * arg_align); + if (copy_func) { + task_dup = __kmp_gomp_task_dup; + } + KMP_MEMCPY(task->shareds, data, arg_size); + + loop_bounds = (T *)task->shareds; + loop_bounds[0] = start; + loop_bounds[1] = end + (up ? -1 : 1); + + if (!nogroup) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_taskgroup(&loc, gtid); + if (reductions) { + // The data pointer points to lb, ub, then reduction data + struct data_t { + T a, b; + uintptr_t *d; + }; + uintptr_t *d = ((data_t *)data)->d; + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER)(d); + } + } + __kmpc_taskloop(&loc, gtid, task, if_val, (kmp_uint64 *)&(loop_bounds[0]), + (kmp_uint64 *)&(loop_bounds[1]), (kmp_int64)step, 1, sched, + (kmp_uint64)num_tasks, (void *)task_dup); + if (!nogroup) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_end_taskgroup(&loc, gtid); + } +} + +// 4 byte version of GOMP_doacross_post +// This version needs to create a temporary array which converts 4 byte +// integers into 8 byte integers +template <typename T, bool need_conversion = (sizeof(long) == 4)> +void __kmp_GOMP_doacross_post(T *count); + +template <> void __kmp_GOMP_doacross_post<long, true>(long *count) { + int gtid = __kmp_entry_gtid(); + kmp_info_t *th = __kmp_threads[gtid]; + MKLOC(loc, "GOMP_doacross_post"); + kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0]; + kmp_int64 *vec = (kmp_int64 *)__kmp_thread_malloc( + th, (size_t)(sizeof(kmp_int64) * num_dims)); + for (kmp_int64 i = 0; i < num_dims; ++i) { + vec[i] = (kmp_int64)count[i]; + } + __kmpc_doacross_post(&loc, gtid, vec); + __kmp_thread_free(th, vec); +} + +// 8 byte versions of GOMP_doacross_post +// This version can just pass in the count array directly instead of creating +// a temporary array +template <> void __kmp_GOMP_doacross_post<long, false>(long *count) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_doacross_post"); + __kmpc_doacross_post(&loc, gtid, RCAST(kmp_int64 *, count)); +} + +template <typename T> void __kmp_GOMP_doacross_wait(T first, va_list args) { + int gtid = __kmp_entry_gtid(); + kmp_info_t *th = __kmp_threads[gtid]; + MKLOC(loc, "GOMP_doacross_wait"); + kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0]; + kmp_int64 *vec = (kmp_int64 *)__kmp_thread_malloc( + th, (size_t)(sizeof(kmp_int64) * num_dims)); + vec[0] = (kmp_int64)first; + for (kmp_int64 i = 1; i < num_dims; ++i) { + T item = va_arg(args, T); + vec[i] = (kmp_int64)item; + } + __kmpc_doacross_wait(&loc, gtid, vec); + __kmp_thread_free(th, vec); + return; +} + +#ifdef __cplusplus +extern "C" { +#endif // 
__cplusplus + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP)( + void (*func)(void *), void *data, void (*copy_func)(void *, void *), + long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks, + int priority, long start, long end, long step) { + __GOMP_taskloop<long>(func, data, copy_func, arg_size, arg_align, gomp_flags, + num_tasks, priority, start, end, step); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP_ULL)( + void (*func)(void *), void *data, void (*copy_func)(void *, void *), + long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks, + int priority, unsigned long long start, unsigned long long end, + unsigned long long step) { + __GOMP_taskloop<unsigned long long>(func, data, copy_func, arg_size, + arg_align, gomp_flags, num_tasks, + priority, start, end, step); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_POST)(long *count) { + __kmp_GOMP_doacross_post(count); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_WAIT)(long first, ...) { + va_list args; + va_start(args, first); + __kmp_GOMP_doacross_wait(first, args); + va_end(args); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_POST)( + unsigned long long *count) { + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_doacross_ull_post"); + __kmpc_doacross_post(&loc, gtid, RCAST(kmp_int64 *, count)); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT)( + unsigned long long first, ...) { + va_list args; + va_start(args, first); + __kmp_GOMP_doacross_wait(first, args); + va_end(args); +} + +// fn: the function each primary thread of new team will call +// data: argument to fn +// num_teams, thread_limit: max bounds on respective ICV +// flags: unused +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TEAMS_REG)(void (*fn)(void *), + void *data, + unsigned num_teams, + unsigned thread_limit, + unsigned flags) { + MKLOC(loc, "GOMP_teams_reg"); + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_teams_reg: T#%d num_teams=%u thread_limit=%u flag=%u\n", + gtid, num_teams, thread_limit, flags)); + __kmpc_push_num_teams(&loc, gtid, num_teams, thread_limit); + __kmpc_fork_teams(&loc, 2, (microtask_t)__kmp_GOMP_microtask_wrapper, fn, + data); + KA_TRACE(20, ("GOMP_teams_reg exit: T#%d\n", gtid)); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT_DEPEND)(void **depend) { + MKLOC(loc, "GOMP_taskwait_depend"); + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_taskwait_depend: T#%d\n", gtid)); + kmp_gomp_depends_info_t gomp_depends(depend); + kmp_int32 ndeps = gomp_depends.get_num_deps(); + SimpleVLA<kmp_depend_info_t> dep_list(ndeps); + for (kmp_int32 i = 0; i < ndeps; i++) + dep_list[i] = gomp_depends.get_kmp_depend(i); +#if OMPT_SUPPORT + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_omp_wait_deps(&loc, gtid, ndeps, dep_list, 0, NULL); + KA_TRACE(20, ("GOMP_taskwait_depend exit: T#%d\n", gtid)); +} + +static inline void +__kmp_GOMP_taskgroup_reduction_register(uintptr_t *data, kmp_taskgroup_t *tg, + int nthreads, + uintptr_t *allocated = nullptr) { + KMP_ASSERT(data); + KMP_ASSERT(nthreads > 0); + // Have private copy pointers point to previously allocated + // reduction data or allocate new data here + if (allocated) { + data[2] = allocated[2]; + data[6] = allocated[6]; + } else { + data[2] = (uintptr_t)__kmp_allocate(nthreads * data[1]); + data[6] = data[2] + (nthreads * data[1]); + } + if (tg) + tg->gomp_data = data; +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER)( + uintptr_t *data) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_taskgroup_reduction_register: 
T#%d\n", gtid)); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; + int nthreads = thread->th.th_team_nproc; + __kmp_GOMP_taskgroup_reduction_register(data, tg, nthreads); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER)( + uintptr_t *data) { + KA_TRACE(20, + ("GOMP_taskgroup_reduction_unregister: T#%d\n", __kmp_get_gtid())); + KMP_ASSERT(data && data[2]); + __kmp_free((void *)data[2]); +} + +// Search through reduction data and set ptrs[] elements +// to proper privatized copy address +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK_REDUCTION_REMAP)(size_t cnt, + size_t cntorig, + void **ptrs) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_task_reduction_remap: T#%d\n", gtid)); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_int32 tid = __kmp_get_tid(); + for (size_t i = 0; i < cnt; ++i) { + uintptr_t address = (uintptr_t)ptrs[i]; + void *propagated_address = NULL; + void *mapped_address = NULL; + // Check taskgroups reduce data + kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; + while (tg) { + uintptr_t *gomp_data = tg->gomp_data; + if (!gomp_data) { + tg = tg->parent; + continue; + } + // Check the shared addresses list + size_t num_vars = (size_t)gomp_data[0]; + uintptr_t per_thread_size = gomp_data[1]; + uintptr_t reduce_data = gomp_data[2]; + uintptr_t end_reduce_data = gomp_data[6]; + for (size_t j = 0; j < num_vars; ++j) { + uintptr_t *entry = gomp_data + 7 + 3 * j; + if (entry[0] == address) { + uintptr_t offset = entry[1]; + mapped_address = + (void *)(reduce_data + tid * per_thread_size + offset); + if (i < cntorig) + propagated_address = (void *)entry[0]; + break; + } + } + if (mapped_address) + break; + // Check if address is within privatized copies range + if (!mapped_address && address >= reduce_data && + address < end_reduce_data) { + uintptr_t offset = (address - reduce_data) % per_thread_size; + mapped_address = (void *)(reduce_data + tid * per_thread_size + offset); + if (i < cntorig) { + for (size_t j = 0; j < num_vars; ++j) { + uintptr_t *entry = gomp_data + 7 + 3 * j; + if (entry[1] == offset) { + propagated_address = (void *)entry[0]; + break; + } + } + } + } + if (mapped_address) + break; + tg = tg->parent; + } + KMP_ASSERT(mapped_address); + ptrs[i] = mapped_address; + if (i < cntorig) { + KMP_ASSERT(propagated_address); + ptrs[cnt + i] = propagated_address; + } + } +} + +static void __kmp_GOMP_init_reductions(int gtid, uintptr_t *data, int is_ws) { + kmp_info_t *thr = __kmp_threads[gtid]; + kmp_team_t *team = thr->th.th_team; + // First start a taskgroup + __kmpc_taskgroup(NULL, gtid); + // Then setup reduction data + void *reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); + if (reduce_data == NULL && + __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, + (void *)1)) { + // Single thread enters this block to initialize common reduction data + KMP_DEBUG_ASSERT(reduce_data == NULL); + __kmp_GOMP_taskgroup_reduction_register(data, NULL, thr->th.th_team_nproc); + KMP_ATOMIC_ST_REL(&team->t.t_tg_fini_counter[is_ws], 0); + KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], (void *)data); + } else { + // Wait for task reduction initialization + while ((reduce_data = KMP_ATOMIC_LD_ACQ( + &team->t.t_tg_reduce_data[is_ws])) == (void *)1) { + KMP_CPU_PAUSE(); + } + KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here + } + // For worksharing constructs, each thread has its own reduction structure. 
+ // Have each reduction structure point to same privatized copies of vars. + // For parallel, each thread points to same reduction structure and privatized + // copies of vars + if (is_ws) { + __kmp_GOMP_taskgroup_reduction_register( + data, NULL, thr->th.th_team_nproc, + (uintptr_t *)KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])); + } + kmp_taskgroup_t *tg = thr->th.th_current_task->td_taskgroup; + tg->gomp_data = data; +} + +static unsigned +__kmp_GOMP_par_reductions_microtask_wrapper(int *gtid, int *npr, + void (*task)(void *), void *data) { + kmp_info_t *thr = __kmp_threads[*gtid]; + kmp_team_t *team = thr->th.th_team; + uintptr_t *reduce_data = *(uintptr_t **)data; + __kmp_GOMP_init_reductions(*gtid, reduce_data, 0); + +#if OMPT_SUPPORT + ompt_frame_t *ompt_frame; + ompt_state_t enclosing_state; + + if (ompt_enabled.enabled) { + // save enclosing task state; set current state for task + enclosing_state = thr->th.ompt_thread_info.state; + thr->th.ompt_thread_info.state = ompt_state_work_parallel; + + // set task frame + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + + task(data); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + // clear task frame + ompt_frame->exit_frame = ompt_data_none; + + // restore enclosing state + thr->th.ompt_thread_info.state = enclosing_state; + } +#endif + __kmpc_end_taskgroup(NULL, *gtid); + // if last thread out, then reset the team's reduce data + // the GOMP_taskgroup_reduction_unregister() function will deallocate + // private copies after reduction calculations take place. + int count = KMP_ATOMIC_INC(&team->t.t_tg_fini_counter[0]); + if (count == thr->th.th_team_nproc - 1) { + KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[0], NULL); + KMP_ATOMIC_ST_REL(&team->t.t_tg_fini_counter[0], 0); + } + return (unsigned)thr->th.th_team_nproc; +} + +unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_REDUCTIONS)( + void (*task)(void *), void *data, unsigned num_threads, + unsigned int flags) { + MKLOC(loc, "GOMP_parallel_reductions"); + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_parallel_reductions: T#%d\n", gtid)); + __kmp_GOMP_fork_call(&loc, gtid, num_threads, flags, task, + (microtask_t)__kmp_GOMP_par_reductions_microtask_wrapper, + 2, task, data); + unsigned retval = + __kmp_GOMP_par_reductions_microtask_wrapper(&gtid, NULL, task, data); + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); + KA_TRACE(20, ("GOMP_parallel_reductions exit: T#%d\n", gtid)); + return retval; +} + +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_START)( + long start, long end, long incr, long sched, long chunk_size, long *istart, + long *iend, uintptr_t *reductions, void **mem) { + int status = 0; + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_loop_start: T#%d, reductions: %p\n", gtid, reductions)); + if (reductions) + __kmp_GOMP_init_reductions(gtid, reductions, 1); + if (mem) + KMP_FATAL(GompFeatureNotSupported, "scan"); + if (istart == NULL) + return true; + const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic); + long monotonic = sched & MONOTONIC_FLAG; + sched &= ~MONOTONIC_FLAG; + if (sched == 0) { + if (monotonic) + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START)( + start, end, incr, istart, iend); + else + status = KMP_EXPAND_NAME( + KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START)( + start, end, incr, istart, iend); + } else if (sched == 1) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_START)( + start, end, incr, 
chunk_size, istart, iend); + } else if (sched == 2) { + if (monotonic) + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START)( + start, end, incr, chunk_size, istart, iend); + else + status = + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START)( + start, end, incr, chunk_size, istart, iend); + } else if (sched == 3) { + if (monotonic) + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_START)( + start, end, incr, chunk_size, istart, iend); + else + status = + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START)( + start, end, incr, chunk_size, istart, iend); + } else if (sched == 4) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START)( + start, end, incr, istart, iend); + } else { + KMP_ASSERT(0); + } + return status; +} + +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_START)( + bool up, unsigned long long start, unsigned long long end, + unsigned long long incr, long sched, unsigned long long chunk_size, + unsigned long long *istart, unsigned long long *iend, uintptr_t *reductions, + void **mem) { + int status = 0; + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, + ("GOMP_loop_ull_start: T#%d, reductions: %p\n", gtid, reductions)); + if (reductions) + __kmp_GOMP_init_reductions(gtid, reductions, 1); + if (mem) + KMP_FATAL(GompFeatureNotSupported, "scan"); + if (istart == NULL) + return true; + const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic); + long monotonic = sched & MONOTONIC_FLAG; + sched &= ~MONOTONIC_FLAG; + if (sched == 0) { + if (monotonic) + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START)( + up, start, end, incr, istart, iend); + else + status = KMP_EXPAND_NAME( + KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START)( + up, start, end, incr, istart, iend); + } else if (sched == 1) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START)( + up, start, end, incr, chunk_size, istart, iend); + } else if (sched == 2) { + if (monotonic) + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START)( + up, start, end, incr, chunk_size, istart, iend); + else + status = KMP_EXPAND_NAME( + KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START)( + up, start, end, incr, chunk_size, istart, iend); + } else if (sched == 3) { + if (monotonic) + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START)( + up, start, end, incr, chunk_size, istart, iend); + else + status = + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START)( + up, start, end, incr, chunk_size, istart, iend); + } else if (sched == 4) { + status = + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START)( + up, start, end, incr, istart, iend); + } else { + KMP_ASSERT(0); + } + return status; +} + +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_START)( + unsigned ncounts, long *counts, long sched, long chunk_size, long *istart, + long *iend, uintptr_t *reductions, void **mem) { + int status = 0; + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_loop_doacross_start: T#%d, reductions: %p\n", gtid, + reductions)); + if (reductions) + __kmp_GOMP_init_reductions(gtid, reductions, 1); + if (mem) + KMP_FATAL(GompFeatureNotSupported, "scan"); + if (istart == NULL) + return true; + // Ignore any monotonic flag + const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic); + sched &= ~MONOTONIC_FLAG; + if (sched == 0) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START)( + ncounts, counts, istart, iend); + } else if (sched == 1) { + status = 
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START)( + ncounts, counts, chunk_size, istart, iend); + } else if (sched == 2) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START)( + ncounts, counts, chunk_size, istart, iend); + } else if (sched == 3) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START)( + ncounts, counts, chunk_size, istart, iend); + } else { + KMP_ASSERT(0); + } + return status; +} + +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_START)( + unsigned ncounts, unsigned long long *counts, long sched, + unsigned long long chunk_size, unsigned long long *istart, + unsigned long long *iend, uintptr_t *reductions, void **mem) { + int status = 0; + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_loop_ull_doacross_start: T#%d, reductions: %p\n", gtid, + reductions)); + if (reductions) + __kmp_GOMP_init_reductions(gtid, reductions, 1); + if (mem) + KMP_FATAL(GompFeatureNotSupported, "scan"); + if (istart == NULL) + return true; + // Ignore any monotonic flag + const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic); + sched &= ~MONOTONIC_FLAG; + if (sched == 0) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START)( + ncounts, counts, istart, iend); + } else if (sched == 1) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START)( + ncounts, counts, chunk_size, istart, iend); + } else if (sched == 2) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START)( + ncounts, counts, chunk_size, istart, iend); + } else if (sched == 3) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START)( + ncounts, counts, chunk_size, istart, iend); + } else { + KMP_ASSERT(0); + } + return status; +} + +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_START)( + long start, long end, long incr, long sched, long chunk_size, long *istart, + long *iend, uintptr_t *reductions, void **mem) { + int status = 0; + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_loop_ordered_start: T#%d, reductions: %p\n", gtid, + reductions)); + if (reductions) + __kmp_GOMP_init_reductions(gtid, reductions, 1); + if (mem) + KMP_FATAL(GompFeatureNotSupported, "scan"); + if (istart == NULL) + return true; + // Ignore any monotonic flag + const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic); + sched &= ~MONOTONIC_FLAG; + if (sched == 0) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START)( + start, end, incr, istart, iend); + } else if (sched == 1) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START)( + start, end, incr, chunk_size, istart, iend); + } else if (sched == 2) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START)( + start, end, incr, chunk_size, istart, iend); + } else if (sched == 3) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START)( + start, end, incr, chunk_size, istart, iend); + } else { + KMP_ASSERT(0); + } + return status; +} + +bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START)( + bool up, unsigned long long start, unsigned long long end, + unsigned long long incr, long sched, unsigned long long chunk_size, + unsigned long long *istart, unsigned long long *iend, uintptr_t *reductions, + void **mem) { + int status = 0; + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_loop_ull_ordered_start: T#%d, reductions: %p\n", gtid, + reductions)); + if (reductions) + __kmp_GOMP_init_reductions(gtid, reductions, 1); + if (mem) + 
KMP_FATAL(GompFeatureNotSupported, "scan"); + if (istart == NULL) + return true; + // Ignore any monotonic flag + const long MONOTONIC_FLAG = (long)(kmp_sched_monotonic); + sched &= ~MONOTONIC_FLAG; + if (sched == 0) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START)( + up, start, end, incr, istart, iend); + } else if (sched == 1) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START)( + up, start, end, incr, chunk_size, istart, iend); + } else if (sched == 2) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START)( + up, start, end, incr, chunk_size, istart, iend); + } else if (sched == 3) { + status = KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START)( + up, start, end, incr, chunk_size, istart, iend); + } else { + KMP_ASSERT(0); + } + return status; +} + +unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS2_START)( + unsigned count, uintptr_t *reductions, void **mem) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, + ("GOMP_sections2_start: T#%d, reductions: %p\n", gtid, reductions)); + if (reductions) + __kmp_GOMP_init_reductions(gtid, reductions, 1); + if (mem) + KMP_FATAL(GompFeatureNotSupported, "scan"); + return KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_START)(count); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER)( + bool cancelled) { + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_workshare_task_reduction_unregister"); + KA_TRACE(20, ("GOMP_workshare_task_reduction_unregister: T#%d\n", gtid)); + kmp_info_t *thr = __kmp_threads[gtid]; + kmp_team_t *team = thr->th.th_team; + __kmpc_end_taskgroup(NULL, gtid); + // If last thread out of workshare, then reset the team's reduce data + // the GOMP_taskgroup_reduction_unregister() function will deallocate + // private copies after reduction calculations take place. + int count = KMP_ATOMIC_INC(&team->t.t_tg_fini_counter[1]); + if (count == thr->th.th_team_nproc - 1) { + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER) + ((uintptr_t *)KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[1])); + KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[1], NULL); + KMP_ATOMIC_ST_REL(&team->t.t_tg_fini_counter[1], 0); + } + if (!cancelled) { + __kmpc_barrier(&loc, gtid); + } +} + +// allocator construct +void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ALLOC)(size_t alignment, size_t size, + uintptr_t allocator) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_alloc: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return __kmp_alloc(gtid, alignment, size, (omp_allocator_handle_t)allocator); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_FREE)(void *ptr, uintptr_t allocator) { + int gtid = __kmp_entry_gtid(); + KA_TRACE(20, ("GOMP_free: T#%d\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + return ___kmpc_free(gtid, ptr, (omp_allocator_handle_t)allocator); +} + +/* The following sections of code create aliases for the GOMP_* functions, then + create versioned symbols using the assembler directive .symver. This is only + pertinent for ELF .so library. 
The KMP_VERSION_SYMBOL macro is defined in + kmp_os.h */ + +#ifdef KMP_USE_VERSION_SYMBOLS +// GOMP_1.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ATOMIC_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ATOMIC_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_BARRIER, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_NAME_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_NAME_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END_NOWAIT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_GUIDED_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START, 10, + "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START, 10, + "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_RUNTIME_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_STATIC_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ORDERED_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ORDERED_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START, 10, + "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START, 10, + "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START, 10, + "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START, 10, + "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_NEXT, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_COPY_END, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_COPY_START, 10, "GOMP_1.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_START, 10, "GOMP_1.0"); + +// GOMP_2.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASK, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKWAIT, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START, 20, "GOMP_2.0"); 
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START, 20, + "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT, 20, "GOMP_2.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START, 20, "GOMP_2.0"); + +// GOMP_3.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKYIELD, 30, "GOMP_3.0"); + +// GOMP_4.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_SECTIONS, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_START, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_END, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_BARRIER_CANCEL, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CANCEL, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CANCELLATION_POINT, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END_CANCEL, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_DATA, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_END_DATA, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_UPDATE, 40, "GOMP_4.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS, 40, "GOMP_4.0"); + +// GOMP_4.5 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP, 45, "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP_ULL, 45, "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_POST, 45, "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_WAIT, 45, "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_POST, 45, "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT, 45, "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, 45, + 
"GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED, 45, + "GOMP_4.5"); + +// GOMP_5.0 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_MAYBE_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_NEXT, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_MAYBE_NONMONOTONIC_RUNTIME_START, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_NEXT, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_RUNTIME_START, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_RUNTIME, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_MAYBE_NONMONOTONIC_RUNTIME, + 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS_REG, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKWAIT_DEPEND, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_REGISTER, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_REDUCTION_UNREGISTER, 50, + "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASK_REDUCTION_REMAP, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_REDUCTIONS, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_START, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_START, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_START, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_START, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_START, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_START, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS2_START, 50, "GOMP_5.0"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER, 50, + "GOMP_5.0"); + +// GOMP_5.0.1 versioned symbols +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ALLOC, 501, "GOMP_5.0.1"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_FREE, 501, "GOMP_5.0.1"); +#endif // KMP_USE_VERSION_SYMBOLS + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/third_party/openmp/kmp_i18n.cpp b/third_party/openmp/kmp_i18n.cpp new file mode 100644 index 000000000..a164aa180 
--- /dev/null +++ b/third_party/openmp/kmp_i18n.cpp @@ -0,0 +1,876 @@ +/* + * kmp_i18n.cpp + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_i18n.h" + +#include "kmp.h" +#include "kmp_debug.h" +#include "kmp_io.h" // __kmp_printf. +#include "kmp_lock.h" +#include "kmp_os.h" + +#include <errno.h> +#include <locale.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +#include "kmp_environment.h" +#include "kmp_i18n_default.inc" +#include "kmp_str.h" + +#undef KMP_I18N_OK + +#define get_section(id) ((id) >> 16) +#define get_number(id) ((id)&0xFFFF) + +kmp_msg_t __kmp_msg_null = {kmp_mt_dummy, 0, NULL, 0}; +static char const *no_message_available = "(No message available)"; + +static void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, + va_list ap); + +enum kmp_i18n_cat_status { + KMP_I18N_CLOSED, // Not yet opened or closed. + KMP_I18N_OPENED, // Opened successfully, ready to use. + KMP_I18N_ABSENT // Opening failed, message catalog should not be used. +}; // enum kmp_i18n_cat_status +typedef enum kmp_i18n_cat_status kmp_i18n_cat_status_t; +static volatile kmp_i18n_cat_status_t status = KMP_I18N_CLOSED; + +/* Message catalog is opened at first usage, so we have to synchronize opening + to avoid race and multiple openings. + + Closing does not require synchronization, because catalog is closed very late + at library shutting down, when no other threads are alive. */ + +static void __kmp_i18n_do_catopen(); +static kmp_bootstrap_lock_t lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(lock); +// `lock' variable may be placed into __kmp_i18n_catopen function because it is +// used only by that function. But we are afraid a (buggy) compiler may treat it +// wrongly. So we put it outside of function just in case. + +void __kmp_i18n_catopen() { + if (status == KMP_I18N_CLOSED) { + __kmp_acquire_bootstrap_lock(&lock); + if (status == KMP_I18N_CLOSED) { + __kmp_i18n_do_catopen(); + } + __kmp_release_bootstrap_lock(&lock); + } +} // func __kmp_i18n_catopen + +/* Linux* OS and OS X* part */ +#if KMP_OS_UNIX +#define KMP_I18N_OK + +#include <nl_types.h> + +#define KMP_I18N_NULLCAT ((nl_catd)(-1)) +static nl_catd cat = KMP_I18N_NULLCAT; // !!! Shall it be volatile? +static char const *name = + (KMP_VERSION_MAJOR == 4 ? "libguide.cat" : "libomp.cat"); + +/* Useful links: +http://www.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html#tag_08_02 +http://www.opengroup.org/onlinepubs/000095399/functions/catopen.html +http://www.opengroup.org/onlinepubs/000095399/functions/setlocale.html +*/ + +void __kmp_i18n_do_catopen() { + int english = 0; + char *lang = __kmp_env_get("LANG"); + // TODO: What about LC_ALL or LC_MESSAGES? + + KMP_DEBUG_ASSERT(status == KMP_I18N_CLOSED); + KMP_DEBUG_ASSERT(cat == KMP_I18N_NULLCAT); + + english = lang == NULL || // In all these cases English language is used. + strcmp(lang, "") == 0 || strcmp(lang, " ") == 0 || + // Workaround for Fortran RTL bug DPD200137873 "Fortran runtime + // resets LANG env var to space if it is not set". + strcmp(lang, "C") == 0 || strcmp(lang, "POSIX") == 0; + + if (!english) { // English language is not yet detected, let us continue. + // Format of LANG is: [language[_territory][.codeset][@modifier]] + // Strip all parts except language. 
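+ // Example: LANG="en_US.UTF-8" is reduced to "en" and
+ // LANG="ru_RU.KOI8-R@cyrillic" to "ru", so only the bare language code is
+ // compared against "en" below.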
+ char *tail = NULL; + __kmp_str_split(lang, '@', &lang, &tail); + __kmp_str_split(lang, '.', &lang, &tail); + __kmp_str_split(lang, '_', &lang, &tail); + english = (strcmp(lang, "en") == 0); + } + + KMP_INTERNAL_FREE(lang); + + // Do not try to open English catalog because internal messages are + // exact copy of messages in English catalog. + if (english) { + status = KMP_I18N_ABSENT; // mark catalog as absent so it will not + // be re-opened. + return; + } + + cat = catopen(name, 0); + // TODO: Why do we pass 0 in flags? + status = (cat == KMP_I18N_NULLCAT ? KMP_I18N_ABSENT : KMP_I18N_OPENED); + + if (status == KMP_I18N_ABSENT) { + if (__kmp_generate_warnings > kmp_warnings_low) { + // AC: only issue warning in case explicitly asked to + int error = errno; // Save errno immediately. + char *nlspath = __kmp_env_get("NLSPATH"); + char *lang = __kmp_env_get("LANG"); + + // Infinite recursion will not occur -- status is KMP_I18N_ABSENT now, so + // __kmp_i18n_catgets() will not try to open catalog, but will return + // default message. + kmp_msg_t err_code = KMP_ERR(error); + __kmp_msg(kmp_ms_warning, KMP_MSG(CantOpenMessageCatalog, name), err_code, + KMP_HNT(CheckEnvVar, "NLSPATH", nlspath), + KMP_HNT(CheckEnvVar, "LANG", lang), __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + + KMP_INFORM(WillUseDefaultMessages); + KMP_INTERNAL_FREE(nlspath); + KMP_INTERNAL_FREE(lang); + } + } else { // status == KMP_I18N_OPENED + int section = get_section(kmp_i18n_prp_Version); + int number = get_number(kmp_i18n_prp_Version); + char const *expected = __kmp_i18n_default_table.sect[section].str[number]; + // Expected version of the catalog. + kmp_str_buf_t version; // Actual version of the catalog. + __kmp_str_buf_init(&version); + __kmp_str_buf_print(&version, "%s", catgets(cat, section, number, NULL)); + + // String returned by catgets is invalid after closing catalog, so copy it. + if (strcmp(version.str, expected) != 0) { + __kmp_i18n_catclose(); // Close bad catalog. + status = KMP_I18N_ABSENT; // And mark it as absent. + if (__kmp_generate_warnings > kmp_warnings_low) { + // AC: only issue warning in case explicitly asked to + // And now print a warning using default messages. 
+ char const *name = "NLSPATH"; + char const *nlspath = __kmp_env_get(name); + __kmp_msg(kmp_ms_warning, + KMP_MSG(WrongMessageCatalog, name, version.str, expected), + KMP_HNT(CheckEnvVar, name, nlspath), __kmp_msg_null); + KMP_INFORM(WillUseDefaultMessages); + KMP_INTERNAL_FREE(CCAST(char *, nlspath)); + } // __kmp_generate_warnings + } + __kmp_str_buf_free(&version); + } +} // func __kmp_i18n_do_catopen + +void __kmp_i18n_catclose() { + if (status == KMP_I18N_OPENED) { + KMP_DEBUG_ASSERT(cat != KMP_I18N_NULLCAT); + catclose(cat); + cat = KMP_I18N_NULLCAT; + } + status = KMP_I18N_CLOSED; +} // func __kmp_i18n_catclose + +char const *__kmp_i18n_catgets(kmp_i18n_id_t id) { + + int section = get_section(id); + int number = get_number(id); + char const *message = NULL; + + if (1 <= section && section <= __kmp_i18n_default_table.size) { + if (1 <= number && number <= __kmp_i18n_default_table.sect[section].size) { + if (status == KMP_I18N_CLOSED) { + __kmp_i18n_catopen(); + } + if (status == KMP_I18N_OPENED) { + message = catgets(cat, section, number, + __kmp_i18n_default_table.sect[section].str[number]); + } + if (message == NULL) { + message = __kmp_i18n_default_table.sect[section].str[number]; + } + } + } + if (message == NULL) { + message = no_message_available; + } + return message; + +} // func __kmp_i18n_catgets + +#endif // KMP_OS_UNIX + +/* Windows* OS part. */ + +#if KMP_OS_WINDOWS +#define KMP_I18N_OK + +#include "kmp_environment.h" +#include + +#define KMP_I18N_NULLCAT NULL +static HMODULE cat = KMP_I18N_NULLCAT; // !!! Shall it be volatile? +static char const *name = + (KMP_VERSION_MAJOR == 4 ? "libguide40ui.dll" : "libompui.dll"); + +static kmp_i18n_table_t table = {0, NULL}; +// Messages formatted by FormatMessage() should be freed, but catgets() +// interface assumes user will not free messages. So we cache all the retrieved +// messages in the table, which are freed at catclose(). +static UINT const default_code_page = CP_OEMCP; +static UINT code_page = default_code_page; + +static char const *___catgets(kmp_i18n_id_t id); +static UINT get_code_page(); +static void kmp_i18n_table_free(kmp_i18n_table_t *table); + +static UINT get_code_page() { + + UINT cp = default_code_page; + char const *value = __kmp_env_get("KMP_CODEPAGE"); + if (value != NULL) { + if (_stricmp(value, "ANSI") == 0) { + cp = CP_ACP; + } else if (_stricmp(value, "OEM") == 0) { + cp = CP_OEMCP; + } else if (_stricmp(value, "UTF-8") == 0 || _stricmp(value, "UTF8") == 0) { + cp = CP_UTF8; + } else if (_stricmp(value, "UTF-7") == 0 || _stricmp(value, "UTF7") == 0) { + cp = CP_UTF7; + } else { + // !!! TODO: Issue a warning? + } + } + KMP_INTERNAL_FREE((void *)value); + return cp; + +} // func get_code_page + +static void kmp_i18n_table_free(kmp_i18n_table_t *table) { + int s; + int m; + for (s = 0; s < table->size; ++s) { + for (m = 0; m < table->sect[s].size; ++m) { + // Free message. + KMP_INTERNAL_FREE((void *)table->sect[s].str[m]); + table->sect[s].str[m] = NULL; + } + table->sect[s].size = 0; + // Free section itself. 
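+    // (i.e. the array of cached message pointers owned by this section.)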
+ KMP_INTERNAL_FREE((void *)table->sect[s].str); + table->sect[s].str = NULL; + } + table->size = 0; + KMP_INTERNAL_FREE((void *)table->sect); + table->sect = NULL; +} // kmp_i18n_table_free + +void __kmp_i18n_do_catopen() { + + LCID locale_id = GetThreadLocale(); + WORD lang_id = LANGIDFROMLCID(locale_id); + WORD primary_lang_id = PRIMARYLANGID(lang_id); + kmp_str_buf_t path; + + KMP_DEBUG_ASSERT(status == KMP_I18N_CLOSED); + KMP_DEBUG_ASSERT(cat == KMP_I18N_NULLCAT); + + __kmp_str_buf_init(&path); + + // Do not try to open English catalog because internal messages are exact copy + // of messages in English catalog. + if (primary_lang_id == LANG_ENGLISH) { + status = KMP_I18N_ABSENT; // mark catalog as absent so it will not + // be re-opened. + goto end; + } + + // Construct resource DLL name. + /* Simple LoadLibrary( name ) is not suitable due to security issue (see + http://www.microsoft.com/technet/security/advisory/2269637.mspx). We have + to specify full path to the message catalog. */ + { + // Get handle of our DLL first. + HMODULE handle; + BOOL brc = GetModuleHandleEx( + GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | + GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast(&__kmp_i18n_do_catopen), &handle); + if (!brc) { // Error occurred. + status = KMP_I18N_ABSENT; // mark catalog as absent so it will not be + // re-opened. + goto end; + // TODO: Enable multiple messages (KMP_MSG) to be passed to __kmp_msg; and + // print a proper warning. + } + + // Now get path to the our DLL. + for (;;) { + DWORD drc = GetModuleFileName(handle, path.str, path.size); + if (drc == 0) { // Error occurred. + status = KMP_I18N_ABSENT; + goto end; + } + if (drc < path.size) { + path.used = drc; + break; + } + __kmp_str_buf_reserve(&path, path.size * 2); + } + + // Now construct the name of message catalog. + kmp_str_fname fname; + __kmp_str_fname_init(&fname, path.str); + __kmp_str_buf_clear(&path); + __kmp_str_buf_print(&path, "%s%lu/%s", fname.dir, + (unsigned long)(locale_id), name); + __kmp_str_fname_free(&fname); + } + + // For security reasons, use LoadLibraryEx() and load message catalog as a + // data file. + cat = LoadLibraryEx(path.str, NULL, LOAD_LIBRARY_AS_DATAFILE); + status = (cat == KMP_I18N_NULLCAT ? KMP_I18N_ABSENT : KMP_I18N_OPENED); + + if (status == KMP_I18N_ABSENT) { + if (__kmp_generate_warnings > kmp_warnings_low) { + // AC: only issue warning in case explicitly asked to + DWORD error = GetLastError(); + // Infinite recursion will not occur -- status is KMP_I18N_ABSENT now, so + // __kmp_i18n_catgets() will not try to open catalog but will return + // default message. + /* If message catalog for another architecture found (e.g. OpenMP RTL for + IA-32 architecture opens libompui.dll for Intel(R) 64) Windows* OS + returns error 193 (ERROR_BAD_EXE_FORMAT). However, FormatMessage fails + to return a message for this error, so user will see: + + OMP: Warning #2: Cannot open message catalog "1041\libompui.dll": + OMP: System error #193: (No system error message available) + OMP: Info #3: Default messages will be used. + + Issue hint in this case so cause of trouble is more understandable. */ + kmp_msg_t err_code = KMP_SYSERRCODE(error); + __kmp_msg(kmp_ms_warning, KMP_MSG(CantOpenMessageCatalog, path.str), + err_code, + (error == ERROR_BAD_EXE_FORMAT + ? 
KMP_HNT(BadExeFormat, path.str, KMP_ARCH_STR) + : __kmp_msg_null), + __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + KMP_INFORM(WillUseDefaultMessages); + } + } else { // status == KMP_I18N_OPENED + + int section = get_section(kmp_i18n_prp_Version); + int number = get_number(kmp_i18n_prp_Version); + char const *expected = __kmp_i18n_default_table.sect[section].str[number]; + kmp_str_buf_t version; // Actual version of the catalog. + __kmp_str_buf_init(&version); + __kmp_str_buf_print(&version, "%s", ___catgets(kmp_i18n_prp_Version)); + // String returned by catgets is invalid after closing catalog, so copy it. + if (strcmp(version.str, expected) != 0) { + // Close bad catalog. + __kmp_i18n_catclose(); + status = KMP_I18N_ABSENT; // And mark it as absent. + if (__kmp_generate_warnings > kmp_warnings_low) { + // And now print a warning using default messages. + __kmp_msg(kmp_ms_warning, + KMP_MSG(WrongMessageCatalog, path.str, version.str, expected), + __kmp_msg_null); + KMP_INFORM(WillUseDefaultMessages); + } // __kmp_generate_warnings + } + __kmp_str_buf_free(&version); + } + code_page = get_code_page(); + +end: + __kmp_str_buf_free(&path); + return; +} // func __kmp_i18n_do_catopen + +void __kmp_i18n_catclose() { + if (status == KMP_I18N_OPENED) { + KMP_DEBUG_ASSERT(cat != KMP_I18N_NULLCAT); + kmp_i18n_table_free(&table); + FreeLibrary(cat); + cat = KMP_I18N_NULLCAT; + } + code_page = default_code_page; + status = KMP_I18N_CLOSED; +} // func __kmp_i18n_catclose + +/* We use FormatMessage() to get strings from catalog, get system error + messages, etc. FormatMessage() tends to return Windows* OS-style + end-of-lines, "\r\n". When string is printed, printf() also replaces all the + occurrences of "\n" with "\r\n" (again!), so sequences like "\r\r\r\n" + appear in output. It is not too good. + + Additional mess comes from message catalog: Our catalog source en_US.mc file + (generated by message-converter.pl) contains only "\n" characters, but + en_US_msg_1033.bin file (produced by mc.exe) may contain "\r\n" or just "\n". + This mess goes from en_US_msg_1033.bin file to message catalog, + libompui.dll. For example, message + + Error + + (there is "\n" at the end) is compiled by mc.exe to "Error\r\n", while + + OMP: Error %1!d!: %2!s!\n + + (there is "\n" at the end as well) is compiled to "OMP: Error %1!d!: + %2!s!\r\n\n". + + Thus, stripping all "\r" normalizes string and returns it to canonical form, + so printf() will produce correct end-of-line sequences. + + ___strip_crs() serves for this purpose: it removes all the occurrences of + "\r" in-place and returns new length of string. */ +static int ___strip_crs(char *str) { + int in = 0; // Input character index. + int out = 0; // Output character index. + for (;;) { + if (str[in] != '\r') { + str[out] = str[in]; + ++out; + } + if (str[in] == 0) { + break; + } + ++in; + } + return out - 1; +} // func __strip_crs + +static char const *___catgets(kmp_i18n_id_t id) { + + char *result = NULL; + PVOID addr = NULL; + wchar_t *wmsg = NULL; + DWORD wlen = 0; + char *msg = NULL; + int len = 0; + int rc; + + KMP_DEBUG_ASSERT(cat != KMP_I18N_NULLCAT); + wlen = // wlen does *not* include terminating null. + FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_HMODULE | + FORMAT_MESSAGE_IGNORE_INSERTS, + cat, id, + 0, // LangId + (LPWSTR)&addr, + 0, // Size in elements, not in bytes. 
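+                     // Arguments: none are needed, because
+                     // FORMAT_MESSAGE_IGNORE_INSERTS is set above.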
+ NULL); + if (wlen <= 0) { + goto end; + } + wmsg = (wchar_t *)addr; // Warning: wmsg may be not nul-terminated! + + // Calculate length of multibyte message. + // Since wlen does not include terminating null, len does not include it also. + len = WideCharToMultiByte(code_page, + 0, // Flags. + wmsg, wlen, // Wide buffer and size. + NULL, 0, // Buffer and size. + NULL, NULL // Default char and used default char. + ); + if (len <= 0) { + goto end; + } + + // Allocate memory. + msg = (char *)KMP_INTERNAL_MALLOC(len + 1); + + // Convert wide message to multibyte one. + rc = WideCharToMultiByte(code_page, + 0, // Flags. + wmsg, wlen, // Wide buffer and size. + msg, len, // Buffer and size. + NULL, NULL // Default char and used default char. + ); + if (rc <= 0 || rc > len) { + goto end; + } + KMP_DEBUG_ASSERT(rc == len); + len = rc; + msg[len] = 0; // Put terminating null to the end. + + // Stripping all "\r" before stripping last end-of-line simplifies the task. + len = ___strip_crs(msg); + + // Every message in catalog is terminated with "\n". Strip it. + if (len >= 1 && msg[len - 1] == '\n') { + --len; + msg[len] = 0; + } + + // Everything looks ok. + result = msg; + msg = NULL; + +end: + + if (msg != NULL) { + KMP_INTERNAL_FREE(msg); + } + if (wmsg != NULL) { + LocalFree(wmsg); + } + + return result; + +} // ___catgets + +char const *__kmp_i18n_catgets(kmp_i18n_id_t id) { + + int section = get_section(id); + int number = get_number(id); + char const *message = NULL; + + if (1 <= section && section <= __kmp_i18n_default_table.size) { + if (1 <= number && number <= __kmp_i18n_default_table.sect[section].size) { + if (status == KMP_I18N_CLOSED) { + __kmp_i18n_catopen(); + } + if (cat != KMP_I18N_NULLCAT) { + if (table.size == 0) { + table.sect = (kmp_i18n_section_t *)KMP_INTERNAL_CALLOC( + (__kmp_i18n_default_table.size + 2), sizeof(kmp_i18n_section_t)); + table.size = __kmp_i18n_default_table.size; + } + if (table.sect[section].size == 0) { + table.sect[section].str = (const char **)KMP_INTERNAL_CALLOC( + __kmp_i18n_default_table.sect[section].size + 2, + sizeof(char const *)); + table.sect[section].size = + __kmp_i18n_default_table.sect[section].size; + } + if (table.sect[section].str[number] == NULL) { + table.sect[section].str[number] = ___catgets(id); + } + message = table.sect[section].str[number]; + } + if (message == NULL) { + // Catalog is not opened or message is not found, return default + // message. + message = __kmp_i18n_default_table.sect[section].str[number]; + } + } + } + if (message == NULL) { + message = no_message_available; + } + return message; + +} // func __kmp_i18n_catgets + +#endif // KMP_OS_WINDOWS + +// ----------------------------------------------------------------------------- + +#ifndef KMP_I18N_OK +#error I18n support is not implemented for this OS. 
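+// (This branch is taken only when neither the KMP_OS_UNIX nor the
+// KMP_OS_WINDOWS section above defined KMP_I18N_OK.)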
+#endif // KMP_I18N_OK + +// ----------------------------------------------------------------------------- + +void __kmp_i18n_dump_catalog(kmp_str_buf_t *buffer) { + + struct kmp_i18n_id_range_t { + kmp_i18n_id_t first; + kmp_i18n_id_t last; + }; // struct kmp_i18n_id_range_t + + static struct kmp_i18n_id_range_t ranges[] = { + {kmp_i18n_prp_first, kmp_i18n_prp_last}, + {kmp_i18n_str_first, kmp_i18n_str_last}, + {kmp_i18n_fmt_first, kmp_i18n_fmt_last}, + {kmp_i18n_msg_first, kmp_i18n_msg_last}, + {kmp_i18n_hnt_first, kmp_i18n_hnt_last}}; // ranges + + int num_of_ranges = sizeof(ranges) / sizeof(struct kmp_i18n_id_range_t); + int range; + kmp_i18n_id_t id; + + for (range = 0; range < num_of_ranges; ++range) { + __kmp_str_buf_print(buffer, "*** Set #%d ***\n", range + 1); + for (id = (kmp_i18n_id_t)(ranges[range].first + 1); id < ranges[range].last; + id = (kmp_i18n_id_t)(id + 1)) { + __kmp_str_buf_print(buffer, "%d: <<%s>>\n", id, __kmp_i18n_catgets(id)); + } + } + + __kmp_printf("%s", buffer->str); + +} // __kmp_i18n_dump_catalog + +// ----------------------------------------------------------------------------- +kmp_msg_t __kmp_msg_format(unsigned id_arg, ...) { + + kmp_msg_t msg; + va_list args; + kmp_str_buf_t buffer; + __kmp_str_buf_init(&buffer); + + va_start(args, id_arg); + + // We use unsigned for the ID argument and explicitly cast it here to the + // right enumerator because variadic functions are not compatible with + // default promotions. + kmp_i18n_id_t id = (kmp_i18n_id_t)id_arg; + +#if KMP_OS_UNIX + // On Linux* OS and OS X*, printf() family functions process parameter + // numbers, for example: "%2$s %1$s". + __kmp_str_buf_vprint(&buffer, __kmp_i18n_catgets(id), args); +#elif KMP_OS_WINDOWS + // On Windows, printf() family functions does not recognize GNU style + // parameter numbers, so we have to use FormatMessage() instead. It recognizes + // parameter numbers, e. g.: "%2!s! "%1!s!". + { + LPTSTR str = NULL; + int len; + FormatMessage(FORMAT_MESSAGE_FROM_STRING | FORMAT_MESSAGE_ALLOCATE_BUFFER, + __kmp_i18n_catgets(id), 0, 0, (LPTSTR)(&str), 0, &args); + len = ___strip_crs(str); + __kmp_str_buf_cat(&buffer, str, len); + LocalFree(str); + } +#else +#error +#endif + va_end(args); + __kmp_str_buf_detach(&buffer); + + msg.type = (kmp_msg_type_t)(id >> 16); + msg.num = id & 0xFFFF; + msg.str = buffer.str; + msg.len = buffer.used; + + return msg; + +} // __kmp_msg_format + +// ----------------------------------------------------------------------------- +static char *sys_error(int err) { + + char *message = NULL; + +#if KMP_OS_WINDOWS + + LPVOID buffer = NULL; + int len; + DWORD rc; + rc = FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language. + (LPTSTR)&buffer, 0, NULL); + if (rc > 0) { + // Message formatted. Copy it (so we can free it later with normal free(). + message = __kmp_str_format("%s", (char *)buffer); + len = ___strip_crs(message); // Delete carriage returns if any. + // Strip trailing newlines. + while (len > 0 && message[len - 1] == '\n') { + --len; + } + message[len] = 0; + } else { + // FormatMessage() failed to format system error message. GetLastError() + // would give us error code, which we would convert to message... this it + // dangerous recursion, which cannot clarify original error, so we will not + // even start it. 
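+    // message stays NULL here; the generic fallback string is returned below.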
+ } + if (buffer != NULL) { + LocalFree(buffer); + } + +#else // Non-Windows* OS: Linux* OS or OS X* + + /* There are 2 incompatible versions of strerror_r: + + char * strerror_r( int, char *, size_t ); // GNU version + int strerror_r( int, char *, size_t ); // XSI version + */ + +#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || \ + (defined(__BIONIC__) && defined(_GNU_SOURCE) && \ + __ANDROID_API__ >= __ANDROID_API_M__) + // GNU version of strerror_r. + + char buffer[2048]; + char *const err_msg = strerror_r(err, buffer, sizeof(buffer)); + // Do not eliminate this assignment to temporary variable, otherwise compiler + // would not issue warning if strerror_r() returns `int' instead of expected + // `char *'. + message = __kmp_str_format("%s", err_msg); + +#else // OS X*, FreeBSD* etc. + // XSI version of strerror_r. + int size = 2048; + char *buffer = (char *)KMP_INTERNAL_MALLOC(size); + int rc; + if (buffer == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + rc = strerror_r(err, buffer, size); + if (rc == -1) { + rc = errno; // XSI version sets errno. + } + while (rc == ERANGE) { // ERANGE means the buffer is too small. + KMP_INTERNAL_FREE(buffer); + size *= 2; + buffer = (char *)KMP_INTERNAL_MALLOC(size); + if (buffer == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + rc = strerror_r(err, buffer, size); + if (rc == -1) { + rc = errno; // XSI version sets errno. + } + } + if (rc == 0) { + message = buffer; + } else { // Buffer is unused. Free it. + KMP_INTERNAL_FREE(buffer); + } + +#endif + +#endif /* KMP_OS_WINDOWS */ + + if (message == NULL) { + // TODO: I18n this message. + message = __kmp_str_format("%s", "(No system error message available)"); + } + return message; +} // sys_error + +// ----------------------------------------------------------------------------- +kmp_msg_t __kmp_msg_error_code(int code) { + + kmp_msg_t msg; + msg.type = kmp_mt_syserr; + msg.num = code; + msg.str = sys_error(code); + msg.len = KMP_STRLEN(msg.str); + return msg; + +} // __kmp_msg_error_code + +// ----------------------------------------------------------------------------- +kmp_msg_t __kmp_msg_error_mesg(char const *mesg) { + + kmp_msg_t msg; + msg.type = kmp_mt_syserr; + msg.num = 0; + msg.str = __kmp_str_format("%s", mesg); + msg.len = KMP_STRLEN(msg.str); + return msg; + +} // __kmp_msg_error_mesg + +// ----------------------------------------------------------------------------- +void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, va_list args) { + kmp_i18n_id_t format; // format identifier + kmp_msg_t fmsg; // formatted message + kmp_str_buf_t buffer; + + if (severity != kmp_ms_fatal && __kmp_generate_warnings == kmp_warnings_off) + return; // no reason to form a string in order to not print it + + __kmp_str_buf_init(&buffer); + + // Format the primary message. + switch (severity) { + case kmp_ms_inform: { + format = kmp_i18n_fmt_Info; + } break; + case kmp_ms_warning: { + format = kmp_i18n_fmt_Warning; + } break; + case kmp_ms_fatal: { + format = kmp_i18n_fmt_Fatal; + } break; + default: { + KMP_DEBUG_ASSERT(0); + } + } + fmsg = __kmp_msg_format(format, message.num, message.str); + __kmp_str_free(&message.str); + __kmp_str_buf_cat(&buffer, fmsg.str, fmsg.len); + __kmp_str_free(&fmsg.str); + + // Format other messages. 
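+  // Every remaining vararg is a kmp_msg_t (a hint or a system error); the
+  // list must end with __kmp_msg_null, a kmp_mt_dummy entry with a NULL str,
+  // which is exactly what the loop below checks for. A typical caller looks
+  // like the catalog-open path above:
+  //   __kmp_msg(kmp_ms_warning, KMP_MSG(CantOpenMessageCatalog, name),
+  //             KMP_ERR(error), KMP_HNT(CheckEnvVar, "NLSPATH", nlspath),
+  //             __kmp_msg_null);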
+ for (;;) { + message = va_arg(args, kmp_msg_t); + if (message.type == kmp_mt_dummy && message.str == NULL) { + break; + } + switch (message.type) { + case kmp_mt_hint: { + format = kmp_i18n_fmt_Hint; + // we cannot skip %1$ and only use %2$ to print the message without the + // number + fmsg = __kmp_msg_format(format, message.str); + } break; + case kmp_mt_syserr: { + format = kmp_i18n_fmt_SysErr; + fmsg = __kmp_msg_format(format, message.num, message.str); + } break; + default: { + KMP_DEBUG_ASSERT(0); + } + } + __kmp_str_free(&message.str); + __kmp_str_buf_cat(&buffer, fmsg.str, fmsg.len); + __kmp_str_free(&fmsg.str); + } + + // Print formatted messages. + // This lock prevents multiple fatal errors on the same problem. + // __kmp_acquire_bootstrap_lock( & lock ); // GEH - This lock causing tests + // to hang on OS X*. + __kmp_printf("%s", buffer.str); + __kmp_str_buf_free(&buffer); + + // __kmp_release_bootstrap_lock( & lock ); // GEH - this lock causing tests + // to hang on OS X*. + +} // __kmp_msg + +void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, ...) { + va_list args; + va_start(args, message); + __kmp_msg(severity, message, args); + va_end(args); +} + +void __kmp_fatal(kmp_msg_t message, ...) { + va_list args; + va_start(args, message); + __kmp_msg(kmp_ms_fatal, message, args); + va_end(args); +#if KMP_OS_WINDOWS + // Delay to give message a chance to appear before reaping + __kmp_thread_sleep(500); +#endif + __kmp_abort_process(); +} // __kmp_fatal + +// end of file // diff --git a/third_party/openmp/kmp_i18n.h b/third_party/openmp/kmp_i18n.h new file mode 100644 index 000000000..23f6f20bd --- /dev/null +++ b/third_party/openmp/kmp_i18n.h @@ -0,0 +1,178 @@ +/* + * kmp_i18n.h + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_I18N_H +#define KMP_I18N_H + +#include "kmp_str.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* kmp_i18n_id.inc defines kmp_i18n_id_t type. It is an enumeration with + identifiers of all the messages in the catalog. There is one special + identifier: kmp_i18n_null, which denotes absence of message. */ +#include "kmp_i18n_id.inc" // Generated file. Do not edit it manually. + +/* Low-level functions handling message catalog. __kmp_i18n_open() opens message + catalog, __kmp_i18n_closes() it. Explicit opening is not required: if message + catalog is not yet open, __kmp_i18n_catgets() will open it implicitly. + However, catalog should be explicitly closed, otherwise resources (mamory, + handles) may leak. + + __kmp_i18n_catgets() returns read-only string. It should not be freed. + + KMP_I18N_STR macro simplifies access to strings in message catalog a bit. + Following two lines are equivalent: + + __kmp_i18n_catgets( kmp_i18n_str_Warning ) + KMP_I18N_STR( Warning ) +*/ + +void __kmp_i18n_catopen(); +void __kmp_i18n_catclose(); +char const *__kmp_i18n_catgets(kmp_i18n_id_t id); + +#define KMP_I18N_STR(id) __kmp_i18n_catgets(kmp_i18n_str_##id) + +/* High-level interface for printing strings targeted to the user. + + All the strings are divided into 3 types: + * messages, + * hints, + * system errors. 
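+   (Hints are attached with KMP_HNT(), system errors with KMP_ERR() /
+   KMP_SYSERRCODE(); both macros are defined further down in this header.)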
+ + There are 3 kind of message severities: + * informational messages, + * warnings (non-fatal errors), + * fatal errors. + + For example: + OMP: Warning #2: Cannot open message catalog "libguide.cat": (1) + OMP: System error #2: No such file or directory (2) + OMP: Hint: Please check NLSPATH environment variable. (3) + OMP: Info #3: Default messages will be used. (4) + + where + (1) is a message of warning severity, + (2) is a system error caused the previous warning, + (3) is a hint for the user how to fix the problem, + (4) is a message of informational severity. + + Usage in complex cases (message is accompanied with hints and system errors): + + int error = errno; // We need save errno immediately, because it may + // be changed. + __kmp_msg( + kmp_ms_warning, // Severity + KMP_MSG( CantOpenMessageCatalog, name ), // Primary message + KMP_ERR( error ), // System error + KMP_HNT( CheckNLSPATH ), // Hint + __kmp_msg_null // Variadic argument list finisher + ); + + Usage in simple cases (just a message, no system errors or hints): + KMP_INFORM( WillUseDefaultMessages ); + KMP_WARNING( CantOpenMessageCatalog, name ); + KMP_FATAL( StackOverlap ); + KMP_SYSFAIL( "pthread_create", status ); + KMP_CHECK_SYSFAIL( "pthread_create", status ); + KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status ); +*/ + +enum kmp_msg_type { + kmp_mt_dummy = 0, // Special type for internal purposes. + kmp_mt_mesg = + 4, // Primary OpenMP message, could be information, warning, or fatal. + kmp_mt_hint = 5, // Hint to the user. + kmp_mt_syserr = -1 // System error message. +}; // enum kmp_msg_type +typedef enum kmp_msg_type kmp_msg_type_t; + +struct kmp_msg { + kmp_msg_type_t type; + int num; + char *str; + size_t len; +}; // struct kmp_message +typedef struct kmp_msg kmp_msg_t; + +// Special message to denote the end of variadic list of arguments. +extern kmp_msg_t __kmp_msg_null; + +// Helper functions. Creates messages either from message catalog or from +// system. Note: these functions allocate memory. You should pass created +// messages to __kmp_msg() function, it will print messages and destroy them. +kmp_msg_t __kmp_msg_format(unsigned id_arg, ...); +kmp_msg_t __kmp_msg_error_code(int code); +kmp_msg_t __kmp_msg_error_mesg(char const *mesg); + +// Helper macros to make calls shorter. +#define KMP_MSG(...) __kmp_msg_format(kmp_i18n_msg_##__VA_ARGS__) +#define KMP_HNT(...) __kmp_msg_format(kmp_i18n_hnt_##__VA_ARGS__) +#define KMP_SYSERRCODE(code) __kmp_msg_error_code(code) +#define KMP_SYSERRMESG(mesg) __kmp_msg_error_mesg(mesg) +#define KMP_ERR KMP_SYSERRCODE + +// Message severity. +enum kmp_msg_severity { + kmp_ms_inform, // Just information for the user. + kmp_ms_warning, // Non-fatal error, execution continues. + kmp_ms_fatal // Fatal error, program aborts. +}; // enum kmp_msg_severity +typedef enum kmp_msg_severity kmp_msg_severity_t; + +// Primary function for printing messages for the user. The first message is +// mandatory. Any number of system errors and hints may be specified. Argument +// list must be finished with __kmp_msg_null. +void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, ...); +KMP_NORETURN void __kmp_fatal(kmp_msg_t message, ...); + +// Helper macros to make calls shorter in simple cases. +#define KMP_INFORM(...) \ + __kmp_msg(kmp_ms_inform, KMP_MSG(__VA_ARGS__), __kmp_msg_null) +#define KMP_WARNING(...) \ + __kmp_msg(kmp_ms_warning, KMP_MSG(__VA_ARGS__), __kmp_msg_null) +#define KMP_FATAL(...) 
__kmp_fatal(KMP_MSG(__VA_ARGS__), __kmp_msg_null) +#define KMP_SYSFAIL(func, error) \ + __kmp_fatal(KMP_MSG(FunctionError, func), KMP_SYSERRCODE(error), \ + __kmp_msg_null) + +// Check error, if not zero, generate fatal error message. +#define KMP_CHECK_SYSFAIL(func, error) \ + { \ + if (error) { \ + KMP_SYSFAIL(func, error); \ + } \ + } + +// Check status, if not zero, generate fatal error message using errno. +#define KMP_CHECK_SYSFAIL_ERRNO(func, status) \ + { \ + if (status != 0) { \ + int error = errno; \ + KMP_SYSFAIL(func, error); \ + } \ + } + +#ifdef KMP_DEBUG +void __kmp_i18n_dump_catalog(kmp_str_buf_t *buffer); +#endif // KMP_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // KMP_I18N_H + +// end of file // diff --git a/third_party/openmp/kmp_i18n_default.inc b/third_party/openmp/kmp_i18n_default.inc new file mode 100644 index 000000000..3319432fe --- /dev/null +++ b/third_party/openmp/kmp_i18n_default.inc @@ -0,0 +1,489 @@ +// Do not edit this file! // +// The file was generated from en_US.txt by message-converter.pl on Sun Jan 28 19:00:54 2024. // + +static char const * +__kmp_i18n_default_meta[] = + { + NULL, + "English", + "USA", + "1033", + "2", + "20170523", + NULL + }; + +static char const * +__kmp_i18n_default_strings[] = + { + NULL, + "Error", + "(unknown file)", + "not a number", + "bad unit", + "illegal characters", + "value too large", + "value too small", + "value is not a multiple of 4k", + "Unknown processor topology", + "Cannot open /proc/cpuinfo", + "/proc/cpuinfo", + "cpuinfo file invalid (No processor records)", + "cpuinfo file invalid (Too many processor records)", + "Cannot rewind cpuinfo file", + "cpuinfo file invalid (long line)", + "cpuinfo file contains too many entries", + "cpuinfo file missing processor field", + "cpuinfo file missing physical id field", + "cpuinfo file invalid (missing val)", + "cpuinfo file invalid (duplicate field)", + "Physical node/pkg/core/thread ids not unique", + "APIC not present", + "Invalid cpuid info", + "APIC ids not unique", + "Inconsistent cpuid info", + "Out of heap memory", + "Memory allocation failed", + "core", + "thread", + "package", + "node", + "", + "decoding legacy APIC ids", + "parsing /proc/cpuinfo", + "value is not defined", + "Effective settings:", + "User settings:", + "warning: pointers or size don't make sense", + "CPU", + "TPU", + "TPUs per package", + "HT enabled", + "HT disabled", + "decoding x2APIC ids", + "cpuid leaf 11 not supported", + "cpuid leaf 4 not supported", + "thread ids not unique", + "using pthread info", + "legacy APIC ids not unique", + "x2APIC ids not unique", + "OPENMP DISPLAY ENVIRONMENT BEGIN", + "OPENMP DISPLAY ENVIRONMENT END", + "[device]", + "[host]", + "tile", + "tiles", + "threads", + "cores", + "socket", + "sockets", + "die", + "dice", + "module", + "modules", + "L1 cache", + "L1 caches", + "L2 cache", + "L2 caches", + "L3 cache", + "L3 caches", + "NUMA domain", + "NUMA domains", + "processor group", + "processor groups", + "unknown", + "cpuid leaf 31 not supported", + "Hwloc api failure", + "LL cache", + "LL caches", + NULL + }; + +static char const * +__kmp_i18n_default_formats[] = + { + NULL, + "OMP: Info #%1$d: %2$s\n", + "OMP: Warning #%1$d: %2$s\n", + "OMP: Error #%1$d: %2$s\n", + "OMP: System error #%1$d: %2$s\n", + "OMP: Hint %1$s\n", + "%1$s pragma (at %2$s:%3$s():%4$s)", + NULL + }; + +static char const * +__kmp_i18n_default_messages[] = + { + NULL, + "Library is \"serial\".", + "Cannot open message catalog \"%1$s\":", + "Default 
messages will be used.", + "%1$s: Lock is uninitialized", + "%1$s: Lock was initialized as simple, but used as nestable", + "%1$s: Lock was initialized as nestable, but used as simple", + "%1$s: Lock is already owned by requesting thread", + "%1$s: Lock is still owned by a thread", + "%1$s: Attempt to release a lock not owned by any thread", + "%1$s: Attempt to release a lock owned by another thread", + "Stack overflow detected for OpenMP thread #%1$d", + "Stack overlap detected. ", + "Assertion failure at %1$s(%2$d).", + "Unable to register a new user thread.", + "Initializing %1$s, but found %2$s already initialized.", + "Cannot open file \"%1$s\" for reading:", + "Getting environment variable \"%1$s\" failed:", + "Setting environment variable \"%1$s\" failed:", + "Getting environment failed:", + "%1$s=\"%2$s\": Wrong value, boolean expected.", + "No Helper Thread support built in this OMP library.", + "Helper thread failed to soft terminate.", + "Buffer overflow detected.", + "Real-time scheduling policy is not supported.", + "OMP application is running at maximum priority with real-time scheduling policy. ", + "Changing priority of the monitor thread failed:", + "Deadlocks are highly possible due to monitor thread starvation.", + "Unable to set monitor thread stack size to %1$lu bytes:", + "Unable to set OMP thread stack size to %1$lu bytes:", + "Thread attribute initialization failed:", + "Thread attribute destroying failed:", + "OMP thread joinable state setting failed:", + "Monitor thread joinable state setting failed:", + "System unable to allocate necessary resources for OMP thread:", + "System unable to allocate necessary resources for the monitor thread:", + "Unable to terminate OMP thread:", + "Wrong schedule type %1$d, see or file for the list of values supported.", + "Unknown scheduling type \"%1$d\".", + "%1$s value \"%2$s\" is invalid.", + "%1$s value \"%2$s\" is too small.", + "%1$s value \"%2$s\" is too large.", + "%1$s: \"%2$s\" is an invalid value; ignored.", + "%1$s release value \"%2$s\" is invalid.", + "%1$s gather value \"%2$s\" is invalid.", + "%1$s supported only on debug builds; ignored.", + "Syntax error: Usage: %1$s=[ routine= | filename= | range=: | excl_range=: ],...", + "Unbalanced quotes in %1$s.", + "Empty string specified for %1$s; ignored.", + "%1$s value is too long; ignored.", + "%1$s: Invalid clause in \"%2$s\".", + "Empty clause in %1$s.", + "%1$s value \"%2$s\" is invalid chunk size.", + "%1$s value \"%2$s\" is to large chunk size.", + "%1$s value \"%2$s\" is ignored.", + "Cannot get processor frequency, using zero KMP_ITT_PREPARE_DELAY.", + "%1$s must be set prior to first parallel region; ignored.", + "%1$s: parameter has been specified already, ignoring \"%2$s\".", + "%1$s: parameter invalid, ignoring \"%2$s\".", + "%1$s: too many integer parameters specified, ignoring \"%2$s\".", + "%1$s: too many integer parameters specified for logical or physical type, ignoring \"%2$d\".", + "%1$s: '%2$s' type does not take any integer parameters, ignoring them.", + "%1$s: proclist not specified with explicit affinity type, using \"none\".", + "%1$s: proclist specified, setting affinity type to \"explicit\".", + "%1$s: proclist specified without \"explicit\" affinity type, proclist ignored.", + "%1$s: syntax error, not using affinity.", + "%1$s: range error (zero stride), not using affinity.", + "%1$s: range error (%2$d > %3$d), not using affinity.", + "%1$s: range error (%2$d < %3$d & stride < 0), not using affinity.", + "%1$s: range error ((%2$d-%3$d)/%4$d 
too big), not using affinity.", + "%1$s: %2$s is defined. %3$s will be ignored.", + "%1$s: affinity not supported, using \"disabled\".", + "%1$s: affinity only supported for Intel(R) Architecture Processors.", + "%1$s: getaffinity system call not supported.", + "%1$s: setaffinity system call not supported.", + "%1$s: pthread_aff_set_np call not found.", + "%1$s: pthread_get_num_resources_np call not found.", + "%1$s: the OS kernel does not support affinity.", + "%1$s: pthread_get_num_resources_np returned %2$d.", + "%1$s: cannot determine proper affinity mask size.", + "%1$s=\"%2$s\": %3$s.", + "%1$s: extra trailing characters ignored: \"%2$s\".", + "%1$s: unknown method \"%2$s\".", + "KMP_STATS_TIMER: clock_gettime is undefined, using gettimeofday.", + "KMP_STATS_TIMER: \"%1$s\" needs additional parameter, e.g. 'clock_gettime,2'. Using gettimeofday.", + "KMP_STATS_TIMER: clock_gettime parameter \"%1$s\" is invalid, using gettimeofday.", + "KMP_STATS_TIMER: clock_gettime failed, using gettimeofday.", + "KMP_STATS_TIMER: clock function unknown (ignoring value \"%1$s\").", + "Unknown scheduling type detected.", + "Too many threads to use analytical guided scheduling - switching to iterative guided scheduling.", + "ittnotify: Lookup of \"%1$s\" function in \"%2$s\" library failed.", + "ittnotify: Loading \"%1$s\" library failed.", + "ittnotify: All itt notifications disabled.", + "ittnotify: Object state itt notifications disabled.", + "ittnotify: Mark itt notifications disabled.", + "ittnotify: Unloading \"%1$s\" library failed.", + "Cannot form a team with %1$d threads, using %2$d instead.", + "Requested number of active parallel levels \"%1$d\" is negative; ignored.", + "Requested number of active parallel levels \"%1$d\" exceeds supported limit; the following limit value will be used: \"%1$d\".", + "kmp_set_library must only be called from the top level serial thread; ignored.", + "Fatal system error detected.", + "Out of heap memory.", + "Clearing __KMP_REGISTERED_LIB env var failed.", + "Registering library with env var failed.", + "%1$s value \"%2$d\" will be used.", + "%1$s value \"%2$u\" will be used.", + "%1$s value \"%2$s\" will be used.", + "%1$s value \"%2$s\" will be used.", + "Mixing other barrier patterns with dist is prohibited. Using dist for all barrier patterns.", + "%1$s maximum value \"%2$d\" will be used.", + "%1$s minimum value \"%2$d\" will be used.", + "Memory allocation failed.", + "File name too long.", + "Lock table overflow.", + "Too many threads to use threadprivate directive.", + "%1$s: invalid mask.", + "Wrong definition.", + "Windows* OS: TLS Set Value failed.", + "Windows* OS: TLS out of indexes.", + "PDONE directive must be nested within a DO directive.", + "Cannot get number of available CPUs.", + "Assumed number of CPUs is 2.", + "Error initializing affinity - not using affinity.", + "Threads may migrate across all available OS procs (granularity setting too coarse).", + "Ignoring invalid OS proc ID %1$d.", + "No valid OS proc IDs specified - not using affinity.", + "%1$s - using \"flat\" OS <-> physical proc mapping.", + "%1$s: %2$s - using \"flat\" OS <-> physical proc mapping.", + "%1$s, line %2$d: %3$s - using \"flat\" OS <-> physical proc mapping.", + "%1$s: %2$s - exiting.", + "%1$s, line %2$d: %3$s - exiting.", + "Construct identifier invalid.", + "Thread identifier invalid.", + "runtime library not initialized.", + "Inconsistent THREADPRIVATE common block declarations are non-conforming and are unsupported. 
Either all threadprivate common blocks must be declared identically, or the largest instance of each threadprivate common block must be referenced first during the run.", + "Cannot set thread affinity mask.", + "Cannot set thread priority.", + "Cannot create thread.", + "Cannot create event.", + "Cannot set event.", + "Cannot close handle.", + "Unknown library type: %1$d.", + "Monitor did not reap properly.", + "Worker thread failed to join.", + "Cannot change thread affinity mask.", + "%1$s: Threads may migrate across %2$d innermost levels of machine", + "%1$s: decrease to %2$d threads", + "%1$s: increase to %2$d threads", + "%1$s: Internal thread %2$d bound to OS proc set %3$s", + "%1$s: Affinity capable, using cpuinfo file", + "%1$s: Affinity capable, using global cpuid info", + "%1$s: Affinity capable, using default \"flat\" topology", + "%1$s: Affinity not capable, using local cpuid info", + "%1$s: Affinity not capable, using cpuinfo file", + "%1$s: Affinity not capable, assuming \"flat\" topology", + "%1$s: Initial OS proc set respected: %2$s", + "%1$s: Initial OS proc set not respected: %2$s", + "%1$s: %2$d available OS procs", + "%1$s: Uniform topology", + "%1$s: Nonuniform topology", + "%1$s: %2$d packages x %3$d cores/pkg x %4$d threads/core (%5$d total cores)", + "%1$s: OS proc to physical thread map ([] => level not in map):", + "%1$s: OS proc maps to th package core 0", + "%1$s: OS proc %2$d maps to package %3$d [core %4$d] [thread %5$d]", + "%1$s: OS proc %2$d maps to [package %3$d] [core %4$d] [thread %5$d]", + "%1$s: OS proc %2$d maps to [package %3$d] [core %4$d] thread %5$d", + "%1$s: OS proc %2$d maps to [package %3$d] core %4$d [thread %5$d]", + "%1$s: OS proc %2$d maps to package %3$d [core %4$d] [thread %5$d]", + "%1$s: OS proc %2$d maps to [package %3$d] core %4$d thread %5$d", + "%1$s: OS proc %2$d maps to package %3$d core %4$d [thread %5$d]", + "%1$s: OS proc %2$d maps to package %3$d [core %4$d] thread %5$d", + "%1$s: OS proc %2$d maps to package %3$d core %4$d thread %5$d", + "%1$s: OS proc %2$d maps to %3$s", + "%1$s: Internal thread %2$d changed affinity mask from %3$s to %4$s", + "%1$s: OS proc %2$d maps to package %3$d, CPU %4$d, TPU %5$d", + "%1$s: OS proc %2$d maps to package %3$d, CPU %4$d", + "%1$s: HT enabled; %2$d packages; %3$d TPU; %4$d TPUs per package", + "%1$s: HT disabled; %2$d packages", + "Threads encountered barriers in different order. ", + "Function %1$s failed:", + "%1$s: %2$s packages x %3$d cores/pkg x %4$d threads/core (%5$d total cores)", + "Incompatible message catalog \"%1$s\": Version \"%2$s\" found, version \"%3$s\" expected.", + "%1$s: ignored because %2$s has been defined", + "%1$s: overrides %3$s specified before", + "%1$s: Tiles are only supported if KMP_TOPOLOGY_METHOD=hwloc, using granularity=package instead", + "%1$s: Tiles requested but were not detected on this HW, using granularity=package instead", + "%1$s: %2$d packages x %3$d tiles/pkg x %4$d cores/tile x %5$d threads/core (%6$d total cores)", + "%1$s: %2$d packages x %3$d nodes/pkg x %4$d cores/node x %5$d threads/core (%6$d total cores)", + "%1$s: %2$d packages x %3$d nodes/pkg x %4$d tiles/node x %5$d cores/tile x %6$d threads/core (%7$d total cores)", + "OMPT: Cannot determine workshare type; using the default (loop) instead. This issue is fixed in an up-to-date compiler.", + "Allocator %1$s is not available, will use default allocator.", + "%1$s: %2$s (%3$d total cores)", + "%1$s: granularity setting: %2$s does not exist in topology. 
Using granularity=%3$s instead.", + "%1$s: hybrid core type detected: %2$d %3$s cores.", + "%1$s: %2$d with core efficiency %3$d.", + "%1$s must be bound to a work-sharing or work-queuing construct with an \"ordered\" clause", + "Detected end of %1$s without first executing a corresponding beginning.", + "Iteration range too large in %1$s.", + "%1$s must not have a loop increment that evaluates to zero.", + "Expected end of %1$s; %2$s, however, has most recently begun execution.", + "%1$s is incorrectly nested within %2$s", + "%1$s cannot be executed multiple times during execution of one parallel iteration/section of %2$s", + "%1$s is incorrectly nested within %2$s of the same name", + "%1$s is incorrectly nested within %2$s that does not have an \"ordered\" clause", + "%1$s is incorrectly nested within %2$s but not within any of its \"task\" constructs", + "One thread at %1$s while another thread is at %2$s.", + "Cannot connect to %1$s", + "Cannot connect to %1$s - Using %2$s", + "%1$s does not support %2$s. Continuing without using %2$s.", + "%1$s does not support %2$s for %3$s. Continuing without using %2$s.", + "Static %1$s does not support %2$s. Continuing without using %2$s.", + "KMP_DYNAMIC_MODE=irml cannot be used with KMP_USE_IRML=0", + "ittnotify: Unknown group \"%2$s\" specified in environment variable \"%1$s\".", + "ittnotify: Environment variable \"%1$s\" too long: Actual lengths is %2$lu, max allowed length is %3$lu.", + "%1$s: Affinity capable, using global cpuid leaf 11 info", + "%1$s: Affinity not capable, using local cpuid leaf 11 info", + "%1$s: %2$s.", + "%1$s: %2$s - %3$s.", + "%1$s: OS proc to physical thread map:", + "%1$s: using \"flat\" OS <-> physical proc mapping.", + "%1$s: parsing %2$s.", + "%1$s - exiting.", + "Incompatible %1$s library with version %2$s found.", + "ittnotify: Function %1$s failed:", + "ittnotify: Error #%1$d.", + "%1$s must be set prior to first parallel region or certain API calls; ignored.", + "Lock initialized at %1$s(%2$d) was not destroyed", + "Cannot determine machine load balance - Using %1$s", + "%1$s: Affinity not capable, using pthread info", + "%1$s: Affinity capable, using pthread info", + "Loading \"%1$s\" library failed:", + "Lookup of \"%1$s\" function failed:", + "Buffer too small.", + "Error #%1$d.", + "%1$s: Invalid symbols found. 
Check the value \"%2$s\".", + "%1$s: Spaces between digits are not allowed \"%2$s\".", + "%1$s: %2$s - parsing %3$s.", + "%1$s cannot be specified via kmp_set_defaults() on this machine because it has more than one processor group.", + "Cannot use affinity type \"%1$s\" with multiple Windows* OS processor groups, using \"%2$s\".", + "Cannot use affinity granularity \"%1$s\" with multiple Windows* OS processor groups, using \"%2$s\".", + "%1$s: Mapping Windows* OS processor group proc to OS proc 64*+.", + "%1$s: OS proc %2$d maps to Windows* OS processor group %3$d proc %4$d", + "%1$s: Affinity balanced is not available.", + "%1$s: granularity=core will be used.", + "%1$s must be set prior to first OMP lock call or critical section; ignored.", + "futex system call not supported; %1$s=%2$s ignored.", + "%1$s: granularity=%2$s will be used.", + "%1$s: invalid value \"%2$s\", valid format is \"N[@N][,...][,Nt] ( can be S, N, L2, C, T for Socket, NUMA Node, L2 Cache, Core, Thread)\".", + "KMP_HW_SUBSET ignored: unsupported architecture.", + "KMP_HW_SUBSET ignored: too many cores requested.", + "%1$s: syntax error, using %2$s.", + "%1$s: Adaptive locks are not supported; using queuing.", + "%1$s: Invalid symbols found. Check the value \"%2$s\".", + "%1$s: Spaces between digits are not allowed \"%2$s\".", + "%1$s: pid %2$d tid %3$d thread %4$d bound to OS proc set %5$s", + "%1$s error: parallel loop increment and condition are inconsistent.", + "libgomp cancellation is not currently supported.", + "KMP_HW_SUBSET ignored: non-uniform topology.", + "KMP_HW_SUBSET ignored: only three-level topology is supported.", + "%1$s: granularity=%2$s is not supported with KMP_TOPOLOGY_METHOD=group. Using \"granularity=fine\".", + "%1$s: granularity=group is not supported with KMP_AFFINITY=%2$s. Using \"granularity=core\".", + "KMP_HW_SUBSET ignored: too many sockets requested.", + "KMP_HW_SUBSET \"o\" offset designator deprecated, please use @ prefix for offset value.", + "%1$s: Affinity capable, using hwloc.", + "%1$s: Ignoring hwloc mechanism.", + "%1$s: Hwloc failed in %2$s. Relying on internal affinity mechanisms.", + "%1$s must be set prior to OpenMP runtime library initialization; ignored.", + "You have enabled the use of umonitor/umwait. 
If the CPU doesn't have that enabled you'll get an illegal instruction exception.", + "%1$s variable deprecated, please use %2$s instead.", + "KMP_FORCE_REDUCTION: %1$s method is not supported; using critical.", + "KMP_HW_SUBSET ignored: unsupported item requested for non-HWLOC topology method (KMP_TOPOLOGY_METHOD)", + "KMP_HW_SUBSET ignored: too many NUMA Nodes requested.", + "KMP_HW_SUBSET ignored: too many L2 Caches requested.", + "KMP_HW_SUBSET ignored: too many Procs requested.", + "Hierarchy ignored: unsupported level: %1$s.", + "OMP: pid %1$s tid %2$s thread %3$s bound to OS proc set {%4$s}", + "%1$s routine deprecated, please use %2$s instead.", + "libgomp compatibility layer does not support OpenMP feature: %1$s", + "KMP_HW_SUBSET ignored: too many Dies requested.", + "%1$s: Affinity capable, using global cpuid leaf %2$d info", + "%1$s: Affinity not capable, using local cpuid leaf %2$d info", + "%1$s: Affinity not capable, using hwloc.", + "%1$s: Encountered user-directed error: %2$s.", + "%1$s: Encountered user-directed warning: %2$s.", + "Failed to create teams between lower bound (%1$d) and upper bound (%2$d).", + "KMP_HW_SUBSET ignored: %1$s: too many requested.", + "KMP_HW_SUBSET ignored: %1$s: level not detected in machine topology.", + "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one.", + "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s.", + "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\".", + "%1$s: granularity=%2$s is too coarse, setting granularity=group.", + "%1$s: \"%2$s\" value is deprecated. Please use \"%3$s\" instead.", + "num_teams value must be positive, it is %1$d, using %2$d instead.", + "KMP_HW_SUBSET ignored: %1$s, %2$s: attributes are ambiguous, please only specify one.", + "KMP_HW_SUBSET ignored: %1$s: attribute specified more than once.", + "KMP_HW_SUBSET ignored: %1$s: attribute value %2$s is invalid.", + "KMP_HW_SUBSET ignored: all hardware resources would be filtered, please reduce the filter.", + "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre.", + "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre.", + "Target memory not available, will use default allocator.", + "%1$s ignored: This machine is not a hybrid architecutre. Using \"%2$s\" instead.", + "%1$s ignored: %2$s is not available. Using \"%3$s\" instead.", + NULL + }; + +static char const * +__kmp_i18n_default_hints[] = + { + NULL, + "Please submit a bug report with this message, compile and run commands used, and machine configuration info including native compiler and operating system versions. Faster response will be obtained by including all program sources. For information on submitting this issue, please see https://github.com/llvm/llvm-project/issues/.", + "Check NLSPATH environment variable, its value is \"%1$s\".", + "Please try changing the shell stack limit or adjusting the OMP_STACKSIZE environment variable.", + "Consider unsetting KMP_DEVICE_THREAD_LIMIT (KMP_ALL_THREADS), KMP_TEAMS_THREAD_LIMIT, and OMP_THREAD_LIMIT (if any are set).", + "Consider setting KMP_ALL_THREADPRIVATE to a value larger than %1$d.", + "This could also be due to a system-related limit on the number of threads.", + "This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. 
by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more information, please see http://openmp.llvm.org/", + "This name is specified in environment variable KMP_CPUINFO_FILE.", + "Seems application required too much memory.", + "Use \"0\", \"FALSE\". \".F.\", \"off\", \"no\" as false values, \"1\", \"TRUE\", \".T.\", \"on\", \"yes\" as true values.", + "Perhaps too many threads.", + "Decrease priority of application. This will allow the monitor thread run at higher priority than other threads.", + "Try changing KMP_MONITOR_STACKSIZE or the shell stack limit.", + "Try changing OMP_STACKSIZE and/or the shell stack limit.", + "Try increasing OMP_STACKSIZE or the shell stack limit.", + "Try decreasing OMP_STACKSIZE.", + "Try decreasing the value of OMP_NUM_THREADS.", + "Try increasing KMP_MONITOR_STACKSIZE.", + "Try decreasing KMP_MONITOR_STACKSIZE.", + "Try decreasing the number of threads in use simultaneously.", + "Will use default schedule type (%1$s).", + "It could be a result of using an older OMP library with a newer compiler or memory corruption. You may check the proper OMP library is linked to the application.", + "Check %1$s environment variable, its value is \"%2$s\".", + "You may want to use an %1$s library that supports %2$s interface with version %3$s.", + "You may want to use an %1$s library with version %2$s.", + "System error #193 is \"Bad format of EXE or DLL file\". Usually it means the file is found, but it is corrupted or a file for another architecture. Check whether \"%1$s\" is a file for %2$s architecture.", + "System-related limit on the number of threads.", + "Try setting new bounds (preferably less than or equal to %1$d) for num_teams clause.", + "Valid values are from %1$d to %2$d.", + NULL + }; + +struct kmp_i18n_section { + int size; + char const ** str; +}; // struct kmp_i18n_section +typedef struct kmp_i18n_section kmp_i18n_section_t; + +static kmp_i18n_section_t +__kmp_i18n_sections[] = + { + { 0, NULL }, + { 5, __kmp_i18n_default_meta }, + { 79, __kmp_i18n_default_strings }, + { 6, __kmp_i18n_default_formats }, + { 301, __kmp_i18n_default_messages }, + { 29, __kmp_i18n_default_hints }, + { 0, NULL } + }; + +struct kmp_i18n_table { + int size; + kmp_i18n_section_t * sect; +}; // struct kmp_i18n_table +typedef struct kmp_i18n_table kmp_i18n_table_t; + +static kmp_i18n_table_t __kmp_i18n_default_table = + { + 5, + __kmp_i18n_sections + }; + +// end of file // diff --git a/third_party/openmp/kmp_i18n_id.inc b/third_party/openmp/kmp_i18n_id.inc new file mode 100644 index 000000000..4d25bc08c --- /dev/null +++ b/third_party/openmp/kmp_i18n_id.inc @@ -0,0 +1,456 @@ +// Do not edit this file! // +// The file was generated from en_US.txt by message-converter.pl on Sun Jan 28 19:00:54 2024. // + +enum kmp_i18n_id { + + // A special id for absence of message. + kmp_i18n_null = 0, + + // Set #1, meta. + kmp_i18n_prp_first = 65536, + kmp_i18n_prp_Language, + kmp_i18n_prp_Country, + kmp_i18n_prp_LangId, + kmp_i18n_prp_Version, + kmp_i18n_prp_Revision, + kmp_i18n_prp_last, + + // Set #2, strings. 
+ kmp_i18n_str_first = 131072, + kmp_i18n_str_Error, + kmp_i18n_str_UnknownFile, + kmp_i18n_str_NotANumber, + kmp_i18n_str_BadUnit, + kmp_i18n_str_IllegalCharacters, + kmp_i18n_str_ValueTooLarge, + kmp_i18n_str_ValueTooSmall, + kmp_i18n_str_NotMultiple4K, + kmp_i18n_str_UnknownTopology, + kmp_i18n_str_CantOpenCpuinfo, + kmp_i18n_str_ProcCpuinfo, + kmp_i18n_str_NoProcRecords, + kmp_i18n_str_TooManyProcRecords, + kmp_i18n_str_CantRewindCpuinfo, + kmp_i18n_str_LongLineCpuinfo, + kmp_i18n_str_TooManyEntries, + kmp_i18n_str_MissingProcField, + kmp_i18n_str_MissingPhysicalIDField, + kmp_i18n_str_MissingValCpuinfo, + kmp_i18n_str_DuplicateFieldCpuinfo, + kmp_i18n_str_PhysicalIDsNotUnique, + kmp_i18n_str_ApicNotPresent, + kmp_i18n_str_InvalidCpuidInfo, + kmp_i18n_str_OBSOLETE1, + kmp_i18n_str_InconsistentCpuidInfo, + kmp_i18n_str_OutOfHeapMemory, + kmp_i18n_str_MemoryAllocFailed, + kmp_i18n_str_Core, + kmp_i18n_str_Thread, + kmp_i18n_str_Package, + kmp_i18n_str_Node, + kmp_i18n_str_OBSOLETE2, + kmp_i18n_str_DecodingLegacyAPIC, + kmp_i18n_str_OBSOLETE3, + kmp_i18n_str_NotDefined, + kmp_i18n_str_EffectiveSettings, + kmp_i18n_str_UserSettings, + kmp_i18n_str_StorageMapWarning, + kmp_i18n_str_OBSOLETE4, + kmp_i18n_str_OBSOLETE5, + kmp_i18n_str_OBSOLETE6, + kmp_i18n_str_OBSOLETE7, + kmp_i18n_str_OBSOLETE8, + kmp_i18n_str_Decodingx2APIC, + kmp_i18n_str_NoLeaf11Support, + kmp_i18n_str_NoLeaf4Support, + kmp_i18n_str_ThreadIDsNotUnique, + kmp_i18n_str_UsingPthread, + kmp_i18n_str_LegacyApicIDsNotUnique, + kmp_i18n_str_x2ApicIDsNotUnique, + kmp_i18n_str_DisplayEnvBegin, + kmp_i18n_str_DisplayEnvEnd, + kmp_i18n_str_Device, + kmp_i18n_str_Host, + kmp_i18n_str_Tile, + kmp_i18n_str_Tiles, + kmp_i18n_str_Threads, + kmp_i18n_str_Cores, + kmp_i18n_str_Socket, + kmp_i18n_str_Sockets, + kmp_i18n_str_Die, + kmp_i18n_str_Dice, + kmp_i18n_str_Module, + kmp_i18n_str_Modules, + kmp_i18n_str_L1Cache, + kmp_i18n_str_L1Caches, + kmp_i18n_str_L2Cache, + kmp_i18n_str_L2Caches, + kmp_i18n_str_L3Cache, + kmp_i18n_str_L3Caches, + kmp_i18n_str_NumaDomain, + kmp_i18n_str_NumaDomains, + kmp_i18n_str_ProcGroup, + kmp_i18n_str_ProcGroups, + kmp_i18n_str_Unknown, + kmp_i18n_str_NoLeaf31Support, + kmp_i18n_str_HwlocFailed, + kmp_i18n_str_LLCache, + kmp_i18n_str_LLCaches, + kmp_i18n_str_last, + + // Set #3, formats. + kmp_i18n_fmt_first = 196608, + kmp_i18n_fmt_Info, + kmp_i18n_fmt_Warning, + kmp_i18n_fmt_Fatal, + kmp_i18n_fmt_SysErr, + kmp_i18n_fmt_Hint, + kmp_i18n_fmt_Pragma, + kmp_i18n_fmt_last, + + // Set #4, messages. 
+ kmp_i18n_msg_first = 262144, + kmp_i18n_msg_LibraryIsSerial, + kmp_i18n_msg_CantOpenMessageCatalog, + kmp_i18n_msg_WillUseDefaultMessages, + kmp_i18n_msg_LockIsUninitialized, + kmp_i18n_msg_LockSimpleUsedAsNestable, + kmp_i18n_msg_LockNestableUsedAsSimple, + kmp_i18n_msg_LockIsAlreadyOwned, + kmp_i18n_msg_LockStillOwned, + kmp_i18n_msg_LockUnsettingFree, + kmp_i18n_msg_LockUnsettingSetByAnother, + kmp_i18n_msg_StackOverflow, + kmp_i18n_msg_StackOverlap, + kmp_i18n_msg_AssertionFailure, + kmp_i18n_msg_CantRegisterNewThread, + kmp_i18n_msg_DuplicateLibrary, + kmp_i18n_msg_CantOpenFileForReading, + kmp_i18n_msg_CantGetEnvVar, + kmp_i18n_msg_CantSetEnvVar, + kmp_i18n_msg_CantGetEnvironment, + kmp_i18n_msg_BadBoolValue, + kmp_i18n_msg_SSPNotBuiltIn, + kmp_i18n_msg_SPPSotfTerminateFailed, + kmp_i18n_msg_BufferOverflow, + kmp_i18n_msg_RealTimeSchedNotSupported, + kmp_i18n_msg_RunningAtMaxPriority, + kmp_i18n_msg_CantChangeMonitorPriority, + kmp_i18n_msg_MonitorWillStarve, + kmp_i18n_msg_CantSetMonitorStackSize, + kmp_i18n_msg_CantSetWorkerStackSize, + kmp_i18n_msg_CantInitThreadAttrs, + kmp_i18n_msg_CantDestroyThreadAttrs, + kmp_i18n_msg_CantSetWorkerState, + kmp_i18n_msg_CantSetMonitorState, + kmp_i18n_msg_NoResourcesForWorkerThread, + kmp_i18n_msg_NoResourcesForMonitorThread, + kmp_i18n_msg_CantTerminateWorkerThread, + kmp_i18n_msg_ScheduleKindOutOfRange, + kmp_i18n_msg_UnknownSchedulingType, + kmp_i18n_msg_InvalidValue, + kmp_i18n_msg_SmallValue, + kmp_i18n_msg_LargeValue, + kmp_i18n_msg_StgInvalidValue, + kmp_i18n_msg_BarrReleaseValueInvalid, + kmp_i18n_msg_BarrGatherValueInvalid, + kmp_i18n_msg_OBSOLETE9, + kmp_i18n_msg_ParRangeSyntax, + kmp_i18n_msg_UnbalancedQuotes, + kmp_i18n_msg_EmptyString, + kmp_i18n_msg_LongValue, + kmp_i18n_msg_InvalidClause, + kmp_i18n_msg_EmptyClause, + kmp_i18n_msg_InvalidChunk, + kmp_i18n_msg_LargeChunk, + kmp_i18n_msg_IgnoreChunk, + kmp_i18n_msg_CantGetProcFreq, + kmp_i18n_msg_EnvParallelWarn, + kmp_i18n_msg_AffParamDefined, + kmp_i18n_msg_AffInvalidParam, + kmp_i18n_msg_AffManyParams, + kmp_i18n_msg_AffManyParamsForLogic, + kmp_i18n_msg_AffNoParam, + kmp_i18n_msg_AffNoProcList, + kmp_i18n_msg_AffProcListNoType, + kmp_i18n_msg_AffProcListNotExplicit, + kmp_i18n_msg_AffSyntaxError, + kmp_i18n_msg_AffZeroStride, + kmp_i18n_msg_AffStartGreaterEnd, + kmp_i18n_msg_AffStrideLessZero, + kmp_i18n_msg_AffRangeTooBig, + kmp_i18n_msg_OBSOLETE10, + kmp_i18n_msg_AffNotSupported, + kmp_i18n_msg_OBSOLETE11, + kmp_i18n_msg_GetAffSysCallNotSupported, + kmp_i18n_msg_SetAffSysCallNotSupported, + kmp_i18n_msg_OBSOLETE12, + kmp_i18n_msg_OBSOLETE13, + kmp_i18n_msg_OBSOLETE14, + kmp_i18n_msg_OBSOLETE15, + kmp_i18n_msg_AffCantGetMaskSize, + kmp_i18n_msg_ParseSizeIntWarn, + kmp_i18n_msg_ParseExtraCharsWarn, + kmp_i18n_msg_UnknownForceReduction, + kmp_i18n_msg_TimerUseGettimeofday, + kmp_i18n_msg_TimerNeedMoreParam, + kmp_i18n_msg_TimerInvalidParam, + kmp_i18n_msg_TimerGettimeFailed, + kmp_i18n_msg_TimerUnknownFunction, + kmp_i18n_msg_UnknownSchedTypeDetected, + kmp_i18n_msg_DispatchManyThreads, + kmp_i18n_msg_IttLookupFailed, + kmp_i18n_msg_IttLoadLibFailed, + kmp_i18n_msg_IttAllNotifDisabled, + kmp_i18n_msg_IttObjNotifDisabled, + kmp_i18n_msg_IttMarkNotifDisabled, + kmp_i18n_msg_IttUnloadLibFailed, + kmp_i18n_msg_CantFormThrTeam, + kmp_i18n_msg_ActiveLevelsNegative, + kmp_i18n_msg_ActiveLevelsExceedLimit, + kmp_i18n_msg_SetLibraryIncorrectCall, + kmp_i18n_msg_FatalSysError, + kmp_i18n_msg_OutOfHeapMemory, + kmp_i18n_msg_OBSOLETE16, + kmp_i18n_msg_OBSOLETE17, + 
kmp_i18n_msg_Using_int_Value, + kmp_i18n_msg_Using_uint_Value, + kmp_i18n_msg_Using_uint64_Value, + kmp_i18n_msg_Using_str_Value, + kmp_i18n_msg_BarrierPatternOverride, + kmp_i18n_msg_MaxValueUsing, + kmp_i18n_msg_MinValueUsing, + kmp_i18n_msg_MemoryAllocFailed, + kmp_i18n_msg_FileNameTooLong, + kmp_i18n_msg_OBSOLETE18, + kmp_i18n_msg_ManyThreadsForTPDirective, + kmp_i18n_msg_AffinityInvalidMask, + kmp_i18n_msg_WrongDefinition, + kmp_i18n_msg_TLSSetValueFailed, + kmp_i18n_msg_TLSOutOfIndexes, + kmp_i18n_msg_OBSOLETE19, + kmp_i18n_msg_CantGetNumAvailCPU, + kmp_i18n_msg_AssumedNumCPU, + kmp_i18n_msg_ErrorInitializeAffinity, + kmp_i18n_msg_AffThreadsMayMigrate, + kmp_i18n_msg_AffIgnoreInvalidProcID, + kmp_i18n_msg_AffNoValidProcID, + kmp_i18n_msg_UsingFlatOS, + kmp_i18n_msg_UsingFlatOSFile, + kmp_i18n_msg_UsingFlatOSFileLine, + kmp_i18n_msg_FileMsgExiting, + kmp_i18n_msg_FileLineMsgExiting, + kmp_i18n_msg_ConstructIdentInvalid, + kmp_i18n_msg_ThreadIdentInvalid, + kmp_i18n_msg_RTLNotInitialized, + kmp_i18n_msg_TPCommonBlocksInconsist, + kmp_i18n_msg_CantSetThreadAffMask, + kmp_i18n_msg_CantSetThreadPriority, + kmp_i18n_msg_CantCreateThread, + kmp_i18n_msg_CantCreateEvent, + kmp_i18n_msg_CantSetEvent, + kmp_i18n_msg_CantCloseHandle, + kmp_i18n_msg_UnknownLibraryType, + kmp_i18n_msg_ReapMonitorError, + kmp_i18n_msg_ReapWorkerError, + kmp_i18n_msg_ChangeThreadAffMaskError, + kmp_i18n_msg_ThreadsMigrate, + kmp_i18n_msg_DecreaseToThreads, + kmp_i18n_msg_IncreaseToThreads, + kmp_i18n_msg_OBSOLETE20, + kmp_i18n_msg_AffCapableUseCpuinfo, + kmp_i18n_msg_AffUseGlobCpuid, + kmp_i18n_msg_AffCapableUseFlat, + kmp_i18n_msg_AffNotCapableUseLocCpuid, + kmp_i18n_msg_AffNotCapableUseCpuinfo, + kmp_i18n_msg_AffFlatTopology, + kmp_i18n_msg_InitOSProcSetRespect, + kmp_i18n_msg_InitOSProcSetNotRespect, + kmp_i18n_msg_AvailableOSProc, + kmp_i18n_msg_Uniform, + kmp_i18n_msg_NonUniform, + kmp_i18n_msg_Topology, + kmp_i18n_msg_OBSOLETE21, + kmp_i18n_msg_OSProcToPackage, + kmp_i18n_msg_OBSOLETE22, + kmp_i18n_msg_OBSOLETE23, + kmp_i18n_msg_OBSOLETE24, + kmp_i18n_msg_OBSOLETE25, + kmp_i18n_msg_OBSOLETE26, + kmp_i18n_msg_OBSOLETE27, + kmp_i18n_msg_OBSOLETE28, + kmp_i18n_msg_OBSOLETE29, + kmp_i18n_msg_OBSOLETE30, + kmp_i18n_msg_OSProcMapToPack, + kmp_i18n_msg_OBSOLETE31, + kmp_i18n_msg_OBSOLETE32, + kmp_i18n_msg_OBSOLETE33, + kmp_i18n_msg_OBSOLETE34, + kmp_i18n_msg_OBSOLETE35, + kmp_i18n_msg_BarriersInDifferentOrder, + kmp_i18n_msg_FunctionError, + kmp_i18n_msg_TopologyExtra, + kmp_i18n_msg_WrongMessageCatalog, + kmp_i18n_msg_StgIgnored, + kmp_i18n_msg_OBSOLETE36, + kmp_i18n_msg_AffTilesNoHWLOC, + kmp_i18n_msg_AffTilesNoTiles, + kmp_i18n_msg_TopologyExtraTile, + kmp_i18n_msg_TopologyExtraNode, + kmp_i18n_msg_TopologyExtraNoTi, + kmp_i18n_msg_OmptOutdatedWorkshare, + kmp_i18n_msg_OmpNoAllocator, + kmp_i18n_msg_TopologyGeneric, + kmp_i18n_msg_AffGranularityBad, + kmp_i18n_msg_TopologyHybrid, + kmp_i18n_msg_TopologyHybridCoreEff, + kmp_i18n_msg_CnsBoundToWorksharing, + kmp_i18n_msg_CnsDetectedEnd, + kmp_i18n_msg_CnsIterationRangeTooLarge, + kmp_i18n_msg_CnsLoopIncrZeroProhibited, + kmp_i18n_msg_CnsExpectedEnd, + kmp_i18n_msg_CnsInvalidNesting, + kmp_i18n_msg_CnsMultipleNesting, + kmp_i18n_msg_CnsNestingSameName, + kmp_i18n_msg_CnsNoOrderedClause, + kmp_i18n_msg_CnsNotInTaskConstruct, + kmp_i18n_msg_CnsThreadsAtBarrier, + kmp_i18n_msg_CantConnect, + kmp_i18n_msg_CantConnectUsing, + kmp_i18n_msg_LibNotSupport, + kmp_i18n_msg_LibNotSupportFor, + kmp_i18n_msg_StaticLibNotSupport, + kmp_i18n_msg_OBSOLETE37, + 
kmp_i18n_msg_IttUnknownGroup, + kmp_i18n_msg_IttEnvVarTooLong, + kmp_i18n_msg_OBSOLETE38, + kmp_i18n_msg_OBSOLETE39, + kmp_i18n_msg_AffInfoStr, + kmp_i18n_msg_AffInfoStrStr, + kmp_i18n_msg_OSProcToPhysicalThreadMap, + kmp_i18n_msg_AffUsingFlatOS, + kmp_i18n_msg_AffParseFilename, + kmp_i18n_msg_MsgExiting, + kmp_i18n_msg_IncompatibleLibrary, + kmp_i18n_msg_IttFunctionError, + kmp_i18n_msg_IttUnknownError, + kmp_i18n_msg_EnvMiddleWarn, + kmp_i18n_msg_CnsLockNotDestroyed, + kmp_i18n_msg_CantLoadBalUsing, + kmp_i18n_msg_AffNotCapableUsePthread, + kmp_i18n_msg_AffUsePthread, + kmp_i18n_msg_OBSOLETE40, + kmp_i18n_msg_OBSOLETE41, + kmp_i18n_msg_OBSOLETE42, + kmp_i18n_msg_OBSOLETE43, + kmp_i18n_msg_NthSyntaxError, + kmp_i18n_msg_NthSpacesNotAllowed, + kmp_i18n_msg_AffStrParseFilename, + kmp_i18n_msg_OBSOLETE44, + kmp_i18n_msg_AffTypeCantUseMultGroups, + kmp_i18n_msg_AffGranCantUseMultGroups, + kmp_i18n_msg_AffWindowsProcGroupMap, + kmp_i18n_msg_AffOSProcToGroup, + kmp_i18n_msg_AffBalancedNotAvail, + kmp_i18n_msg_OBSOLETE45, + kmp_i18n_msg_EnvLockWarn, + kmp_i18n_msg_FutexNotSupported, + kmp_i18n_msg_AffGranUsing, + kmp_i18n_msg_AffHWSubsetInvalid, + kmp_i18n_msg_AffHWSubsetUnsupported, + kmp_i18n_msg_AffHWSubsetManyCores, + kmp_i18n_msg_SyntaxErrorUsing, + kmp_i18n_msg_AdaptiveNotSupported, + kmp_i18n_msg_EnvSyntaxError, + kmp_i18n_msg_EnvSpacesNotAllowed, + kmp_i18n_msg_BoundToOSProcSet, + kmp_i18n_msg_CnsLoopIncrIllegal, + kmp_i18n_msg_NoGompCancellation, + kmp_i18n_msg_AffHWSubsetNonUniform, + kmp_i18n_msg_AffHWSubsetNonThreeLevel, + kmp_i18n_msg_AffGranTopGroup, + kmp_i18n_msg_AffGranGroupType, + kmp_i18n_msg_AffHWSubsetManySockets, + kmp_i18n_msg_AffHWSubsetDeprecated, + kmp_i18n_msg_AffUsingHwloc, + kmp_i18n_msg_AffIgnoringHwloc, + kmp_i18n_msg_AffHwlocErrorOccurred, + kmp_i18n_msg_EnvSerialWarn, + kmp_i18n_msg_EnvMwaitWarn, + kmp_i18n_msg_EnvVarDeprecated, + kmp_i18n_msg_RedMethodNotSupported, + kmp_i18n_msg_AffHWSubsetNoHWLOC, + kmp_i18n_msg_AffHWSubsetManyNodes, + kmp_i18n_msg_AffHWSubsetManyTiles, + kmp_i18n_msg_AffHWSubsetManyProcs, + kmp_i18n_msg_HierSchedInvalid, + kmp_i18n_msg_AffFormatDefault, + kmp_i18n_msg_APIDeprecated, + kmp_i18n_msg_GompFeatureNotSupported, + kmp_i18n_msg_AffHWSubsetManyDies, + kmp_i18n_msg_AffUseGlobCpuidL, + kmp_i18n_msg_AffNotCapableUseLocCpuidL, + kmp_i18n_msg_AffNotUsingHwloc, + kmp_i18n_msg_UserDirectedError, + kmp_i18n_msg_UserDirectedWarning, + kmp_i18n_msg_FailedToCreateTeam, + kmp_i18n_msg_AffHWSubsetManyGeneric, + kmp_i18n_msg_AffHWSubsetNotExistGeneric, + kmp_i18n_msg_AffHWSubsetEqvLayers, + kmp_i18n_msg_AffHWSubsetOutOfOrder, + kmp_i18n_msg_AffEqualTopologyTypes, + kmp_i18n_msg_AffGranTooCoarseProcGroup, + kmp_i18n_msg_StgDeprecatedValue, + kmp_i18n_msg_NumTeamsNotPositive, + kmp_i18n_msg_AffHWSubsetIncompat, + kmp_i18n_msg_AffHWSubsetAttrRepeat, + kmp_i18n_msg_AffHWSubsetAttrInvalid, + kmp_i18n_msg_AffHWSubsetAllFiltered, + kmp_i18n_msg_AffHWSubsetAttrsNonHybrid, + kmp_i18n_msg_AffHWSubsetIgnoringAttr, + kmp_i18n_msg_TargetMemNotAvailable, + kmp_i18n_msg_AffIgnoringNonHybrid, + kmp_i18n_msg_AffIgnoringNotAvailable, + kmp_i18n_msg_last, + + // Set #5, hints. 
+ kmp_i18n_hnt_first = 327680, + kmp_i18n_hnt_SubmitBugReport, + kmp_i18n_hnt_OBSOLETE46, + kmp_i18n_hnt_ChangeStackLimit, + kmp_i18n_hnt_Unset_ALL_THREADS, + kmp_i18n_hnt_Set_ALL_THREADPRIVATE, + kmp_i18n_hnt_PossibleSystemLimitOnThreads, + kmp_i18n_hnt_DuplicateLibrary, + kmp_i18n_hnt_NameComesFrom_CPUINFO_FILE, + kmp_i18n_hnt_NotEnoughMemory, + kmp_i18n_hnt_ValidBoolValues, + kmp_i18n_hnt_BufferOverflow, + kmp_i18n_hnt_RunningAtMaxPriority, + kmp_i18n_hnt_ChangeMonitorStackSize, + kmp_i18n_hnt_ChangeWorkerStackSize, + kmp_i18n_hnt_IncreaseWorkerStackSize, + kmp_i18n_hnt_DecreaseWorkerStackSize, + kmp_i18n_hnt_Decrease_NUM_THREADS, + kmp_i18n_hnt_IncreaseMonitorStackSize, + kmp_i18n_hnt_DecreaseMonitorStackSize, + kmp_i18n_hnt_DecreaseNumberOfThreadsInUse, + kmp_i18n_hnt_DefaultScheduleKindUsed, + kmp_i18n_hnt_GetNewerLibrary, + kmp_i18n_hnt_CheckEnvVar, + kmp_i18n_hnt_OBSOLETE47, + kmp_i18n_hnt_OBSOLETE48, + kmp_i18n_hnt_BadExeFormat, + kmp_i18n_hnt_SystemLimitOnThreads, + kmp_i18n_hnt_SetNewBound, + kmp_i18n_hnt_ValidValuesRange, + kmp_i18n_hnt_last, + + kmp_i18n_xxx_lastest + +}; // enum kmp_i18n_id + +typedef enum kmp_i18n_id kmp_i18n_id_t; + + +// end of file // diff --git a/third_party/openmp/kmp_io.cpp b/third_party/openmp/kmp_io.cpp new file mode 100644 index 000000000..0c52662bc --- /dev/null +++ b/third_party/openmp/kmp_io.cpp @@ -0,0 +1,215 @@ +/* + * kmp_io.cpp -- RTL IO + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#ifndef __ABSOFT_WIN +#include +#endif + +#include "kmp.h" // KMP_GTID_DNE, __kmp_debug_buf, etc +#include "kmp_io.h" +#include "kmp_lock.h" +#include "kmp_os.h" +#include "kmp_str.h" + +#if KMP_OS_WINDOWS +#if KMP_MSVC_COMPAT +#pragma warning(push) +#pragma warning(disable : 271 310) +#endif +#include +#if KMP_MSVC_COMPAT +#pragma warning(pop) +#endif +#endif + +/* ------------------------------------------------------------------------ */ + +kmp_bootstrap_lock_t __kmp_stdio_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( + __kmp_stdio_lock); /* Control stdio functions */ +kmp_bootstrap_lock_t __kmp_console_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( + __kmp_console_lock); /* Control console initialization */ + +#if KMP_OS_WINDOWS + +static HANDLE __kmp_stdout = NULL; +static HANDLE __kmp_stderr = NULL; +static int __kmp_console_exists = FALSE; +static kmp_str_buf_t __kmp_console_buf; + +void __kmp_close_console(void) { + /* wait until user presses return before closing window */ + /* TODO only close if a window was opened */ + if (__kmp_console_exists) { + __kmp_stdout = NULL; + __kmp_stderr = NULL; + __kmp_str_buf_free(&__kmp_console_buf); + __kmp_console_exists = FALSE; + } +} + +/* For windows, call this before stdout, stderr, or stdin are used. + It opens a console window and starts processing */ +static void __kmp_redirect_output(void) { + __kmp_acquire_bootstrap_lock(&__kmp_console_lock); + + if (!__kmp_console_exists) { + HANDLE ho; + HANDLE he; + + __kmp_str_buf_init(&__kmp_console_buf); + + AllocConsole(); + // We do not check the result of AllocConsole because + // 1. the call is harmless + // 2. it is not clear how to communicate failue + // 3. 
we will detect failure later when we get handle(s) + + ho = GetStdHandle(STD_OUTPUT_HANDLE); + if (ho == INVALID_HANDLE_VALUE || ho == NULL) { + + DWORD err = GetLastError(); + // TODO: output error somehow (maybe message box) + (void)err; + __kmp_stdout = NULL; + + } else { + + __kmp_stdout = ho; // temporary code, need new global for ho + } + he = GetStdHandle(STD_ERROR_HANDLE); + if (he == INVALID_HANDLE_VALUE || he == NULL) { + + DWORD err = GetLastError(); + // TODO: output error somehow (maybe message box) + (void)err; + __kmp_stderr = NULL; + + } else { + + __kmp_stderr = he; // temporary code, need new global + } + __kmp_console_exists = TRUE; + } + __kmp_release_bootstrap_lock(&__kmp_console_lock); +} + +#else +#define __kmp_stderr (stderr) +#define __kmp_stdout (stdout) +#endif /* KMP_OS_WINDOWS */ + +void __kmp_vprintf(enum kmp_io out_stream, char const *format, va_list ap) { +#if KMP_OS_WINDOWS + if (!__kmp_console_exists) { + __kmp_redirect_output(); + } + if (!__kmp_stderr && out_stream == kmp_err) { + return; + } + if (!__kmp_stdout && out_stream == kmp_out) { + return; + } +#endif /* KMP_OS_WINDOWS */ + auto stream = ((out_stream == kmp_out) ? __kmp_stdout : __kmp_stderr); + + if (__kmp_debug_buf && __kmp_debug_buffer != NULL) { + + int dc = __kmp_debug_count++ % __kmp_debug_buf_lines; + char *db = &__kmp_debug_buffer[dc * __kmp_debug_buf_chars]; + int chars = 0; + +#ifdef KMP_DEBUG_PIDS + chars = KMP_SNPRINTF(db, __kmp_debug_buf_chars, + "pid=%d: ", (kmp_int32)getpid()); +#endif + chars += KMP_VSNPRINTF(db, __kmp_debug_buf_chars, format, ap); + + if (chars + 1 > __kmp_debug_buf_chars) { + if (chars + 1 > __kmp_debug_buf_warn_chars) { +#if KMP_OS_WINDOWS + DWORD count; + __kmp_str_buf_print(&__kmp_console_buf, + "OMP warning: Debugging buffer " + "overflow; increase " + "KMP_DEBUG_BUF_CHARS to %d\n", + chars + 1); + WriteFile(stream, __kmp_console_buf.str, __kmp_console_buf.used, &count, + NULL); + __kmp_str_buf_clear(&__kmp_console_buf); +#else + fprintf(stream, + "OMP warning: Debugging buffer overflow; " + "increase KMP_DEBUG_BUF_CHARS to %d\n", + chars + 1); + fflush(stream); +#endif + __kmp_debug_buf_warn_chars = chars + 1; + } + /* terminate string if overflow occurred */ + db[__kmp_debug_buf_chars - 2] = '\n'; + db[__kmp_debug_buf_chars - 1] = '\0'; + } + } else { +#if KMP_OS_WINDOWS + DWORD count; +#ifdef KMP_DEBUG_PIDS + __kmp_str_buf_print(&__kmp_console_buf, "pid=%d: ", (kmp_int32)getpid()); +#endif + __kmp_str_buf_vprint(&__kmp_console_buf, format, ap); + WriteFile(stream, __kmp_console_buf.str, __kmp_console_buf.used, &count, + NULL); + __kmp_str_buf_clear(&__kmp_console_buf); +#else +#ifdef KMP_DEBUG_PIDS + fprintf(stream, "pid=%d: ", (kmp_int32)getpid()); +#endif + vfprintf(stream, format, ap); + fflush(stream); +#endif + } +} + +void __kmp_printf(char const *format, ...) { + va_list ap; + va_start(ap, format); + + __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); + __kmp_vprintf(kmp_err, format, ap); + __kmp_release_bootstrap_lock(&__kmp_stdio_lock); + + va_end(ap); +} + +void __kmp_printf_no_lock(char const *format, ...) { + va_list ap; + va_start(ap, format); + + __kmp_vprintf(kmp_err, format, ap); + + va_end(ap); +} + +void __kmp_fprintf(enum kmp_io stream, char const *format, ...) 
{ + va_list ap; + va_start(ap, format); + + __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); + __kmp_vprintf(stream, format, ap); + __kmp_release_bootstrap_lock(&__kmp_stdio_lock); + + va_end(ap); +} diff --git a/third_party/openmp/kmp_io.h b/third_party/openmp/kmp_io.h new file mode 100644 index 000000000..49afda59e --- /dev/null +++ b/third_party/openmp/kmp_io.h @@ -0,0 +1,38 @@ +/* + * kmp_io.h -- RTL IO header file. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_IO_H +#define KMP_IO_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* ------------------------------------------------------------------------ */ + +enum kmp_io { kmp_out = 0, kmp_err }; + +extern kmp_bootstrap_lock_t __kmp_stdio_lock; /* Control stdio functions */ +extern kmp_bootstrap_lock_t + __kmp_console_lock; /* Control console initialization */ + +extern void __kmp_vprintf(enum kmp_io stream, char const *format, va_list ap); +extern void __kmp_printf(char const *format, ...); +extern void __kmp_printf_no_lock(char const *format, ...); +extern void __kmp_fprintf(enum kmp_io stream, char const *format, ...); +extern void __kmp_close_console(void); + +#ifdef __cplusplus +} +#endif + +#endif /* KMP_IO_H */ diff --git a/third_party/openmp/kmp_itt.cpp b/third_party/openmp/kmp_itt.cpp new file mode 100644 index 000000000..b5c3063b9 --- /dev/null +++ b/third_party/openmp/kmp_itt.cpp @@ -0,0 +1,159 @@ +#include "kmp_config.h" + +#if USE_ITT_BUILD +/* + * kmp_itt.cpp -- ITT Notify interface. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_itt.h" + +#if KMP_DEBUG +#include "kmp_itt.inc" +#endif + +#if USE_ITT_NOTIFY + +// #include "ittnotify_config.h" +__itt_global __kmp_ittapi_clean_global; +extern __itt_global __kmp_itt__ittapi_global; + +kmp_itthash_t __kmp_itt_barrier_domains = {{0}, 0}; +kmp_itthash_t __kmp_itt_region_domains = {{0}, 0}; +__itt_domain *metadata_domain = NULL; +__itt_string_handle *string_handle_imbl = NULL; +__itt_string_handle *string_handle_loop = NULL; +__itt_string_handle *string_handle_sngl = NULL; + +#include "kmp_i18n.h" +#include "kmp_str.h" +#include "kmp_version.h" + +KMP_BUILD_ASSERT(sizeof(kmp_itt_mark_t) == sizeof(__itt_mark_type)); + +/* Previously used warnings: + + KMP_WARNING( IttAllNotifDisabled ); + KMP_WARNING( IttObjNotifDisabled ); + KMP_WARNING( IttMarkNotifDisabled ); + KMP_WARNING( IttUnloadLibFailed, libittnotify ); +*/ + +kmp_int32 __kmp_itt_prepare_delay = 0; +kmp_bootstrap_lock_t __kmp_itt_debug_lock = + KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_itt_debug_lock); + +#endif // USE_ITT_NOTIFY + +void __kmp_itt_reset() { +#if USE_ITT_NOTIFY + __kmp_itt__ittapi_global = __kmp_ittapi_clean_global; +#endif +} + +void __kmp_itt_initialize() { + + // ITTNotify library is loaded and initialized at first call to any ittnotify + // function, so we do not need to explicitly load it any more. 
Just report OMP + // RTL version to ITTNotify. + +#if USE_ITT_NOTIFY + // Backup a clean global state + __kmp_ittapi_clean_global = __kmp_itt__ittapi_global; + + // Report OpenMP RTL version. + kmp_str_buf_t buf; + __itt_mark_type version; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "OMP RTL Version %d.%d.%d", __kmp_version_major, + __kmp_version_minor, __kmp_version_build); + if (__itt_api_version_ptr != NULL) { + __kmp_str_buf_print(&buf, ":%s", __itt_api_version()); + } + version = __itt_mark_create(buf.str); + __itt_mark(version, NULL); + __kmp_str_buf_free(&buf); +#endif + +} // __kmp_itt_initialize + +void __kmp_itt_destroy() { +#if USE_ITT_NOTIFY + __kmp_itt_fini_ittlib(); +#endif +} // __kmp_itt_destroy + +extern "C" void __itt_error_handler(__itt_error_code err, va_list args) { + + switch (err) { + case __itt_error_no_module: { + char const *library = va_arg(args, char const *); +#if KMP_OS_WINDOWS + int sys_err = va_arg(args, int); + kmp_msg_t err_code = KMP_SYSERRCODE(sys_err); + __kmp_msg(kmp_ms_warning, KMP_MSG(IttLoadLibFailed, library), err_code, + __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } +#else + char const *sys_err = va_arg(args, char const *); + kmp_msg_t err_code = KMP_SYSERRMESG(sys_err); + __kmp_msg(kmp_ms_warning, KMP_MSG(IttLoadLibFailed, library), err_code, + __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } +#endif + } break; + case __itt_error_no_symbol: { + char const *library = va_arg(args, char const *); + char const *symbol = va_arg(args, char const *); + KMP_WARNING(IttLookupFailed, symbol, library); + } break; + case __itt_error_unknown_group: { + char const *var = va_arg(args, char const *); + char const *group = va_arg(args, char const *); + KMP_WARNING(IttUnknownGroup, var, group); + } break; + case __itt_error_env_too_long: { + char const *var = va_arg(args, char const *); + size_t act_len = va_arg(args, size_t); + size_t max_len = va_arg(args, size_t); + KMP_WARNING(IttEnvVarTooLong, var, (unsigned long)act_len, + (unsigned long)max_len); + } break; + case __itt_error_cant_read_env: { + char const *var = va_arg(args, char const *); + int sys_err = va_arg(args, int); + kmp_msg_t err_code = KMP_ERR(sys_err); + __kmp_msg(kmp_ms_warning, KMP_MSG(CantGetEnvVar, var), err_code, + __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } break; + case __itt_error_system: { + char const *func = va_arg(args, char const *); + int sys_err = va_arg(args, int); + kmp_msg_t err_code = KMP_SYSERRCODE(sys_err); + __kmp_msg(kmp_ms_warning, KMP_MSG(IttFunctionError, func), err_code, + __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } break; + default: { + KMP_WARNING(IttUnknownError, err); + } + } +} // __itt_error_handler + +#endif /* USE_ITT_BUILD */ diff --git a/third_party/openmp/kmp_itt.h b/third_party/openmp/kmp_itt.h new file mode 100644 index 000000000..5ae445268 --- /dev/null +++ b/third_party/openmp/kmp_itt.h @@ -0,0 +1,348 @@ +#if USE_ITT_BUILD +/* + * kmp_itt.h -- ITT Notify interface. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_ITT_H +#define KMP_ITT_H + +#include "kmp_lock.h" + +#define INTEL_ITTNOTIFY_API_PRIVATE +// #include "ittnotify.h" +// #include "legacy/ittnotify.h" + +#if KMP_DEBUG +#define __kmp_inline // Turn off inlining in debug mode. +#else +#define __kmp_inline static inline +#endif + +#if USE_ITT_NOTIFY +extern kmp_int32 __kmp_itt_prepare_delay; +#ifdef __cplusplus +extern "C" void __kmp_itt_fini_ittlib(void); +#else +extern void __kmp_itt_fini_ittlib(void); +#endif +#endif + +// Simplify the handling of an argument that is only required when USE_ITT_BUILD +// is enabled. +#define USE_ITT_BUILD_ARG(x) , x + +void __kmp_itt_initialize(); +void __kmp_itt_destroy(); +void __kmp_itt_reset(); + +// ----------------------------------------------------------------------------- +// New stuff for reporting high-level constructs. + +// Note the naming convention: +// __kmp_itt_xxxing() function should be called before action, while +// __kmp_itt_xxxed() function should be called after action. + +// --- Parallel region reporting --- +__kmp_inline void +__kmp_itt_region_forking(int gtid, int team_size, + int barriers); // Primary only, before forking threads. +__kmp_inline void +__kmp_itt_region_joined(int gtid); // Primary only, after joining threads. +// (*) Note: A thread may execute tasks after this point, though. + +// --- Frame reporting --- +// region=0: no regions, region=1: parallel, region=2: serialized parallel +__kmp_inline void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin, + __itt_timestamp end, int imbalance, + ident_t *loc, int team_size, + int region = 0); + +// --- Metadata reporting --- +// begin/end - begin/end timestamps of a barrier frame, imbalance - aggregated +// wait time value, reduction -if this is a reduction barrier +__kmp_inline void __kmp_itt_metadata_imbalance(int gtid, kmp_uint64 begin, + kmp_uint64 end, + kmp_uint64 imbalance, + kmp_uint64 reduction); +// sched_type: 0 - static, 1 - dynamic, 2 - guided, 3 - custom (all others); +// iterations - loop trip count, chunk - chunk size +__kmp_inline void __kmp_itt_metadata_loop(ident_t *loc, kmp_uint64 sched_type, + kmp_uint64 iterations, + kmp_uint64 chunk); +__kmp_inline void __kmp_itt_metadata_single(ident_t *loc); + +// --- Barrier reporting --- +__kmp_inline void *__kmp_itt_barrier_object(int gtid, int bt, int set_name = 0, + int delta = 0); +__kmp_inline void __kmp_itt_barrier_starting(int gtid, void *object); +__kmp_inline void __kmp_itt_barrier_middle(int gtid, void *object); +__kmp_inline void __kmp_itt_barrier_finished(int gtid, void *object); + +// --- Taskwait reporting --- +__kmp_inline void *__kmp_itt_taskwait_object(int gtid); +__kmp_inline void __kmp_itt_taskwait_starting(int gtid, void *object); +__kmp_inline void __kmp_itt_taskwait_finished(int gtid, void *object); +#define KMP_ITT_TASKWAIT_STARTING(obj) \ + if (UNLIKELY(__itt_sync_create_ptr)) { \ + obj = __kmp_itt_taskwait_object(gtid); \ + if (obj != NULL) { \ + __kmp_itt_taskwait_starting(gtid, obj); \ + } \ + } +#define KMP_ITT_TASKWAIT_FINISHED(obj) \ + if (UNLIKELY(obj != NULL)) \ + __kmp_itt_taskwait_finished(gtid, obj); + +// --- Task reporting --- +__kmp_inline void __kmp_itt_task_starting(void *object); +__kmp_inline void __kmp_itt_task_finished(void *object); + +// --- Lock reporting --- +#if KMP_USE_DYNAMIC_LOCK +__kmp_inline void __kmp_itt_lock_creating(kmp_user_lock_p lock, + 
const ident_t *); +#else +__kmp_inline void __kmp_itt_lock_creating(kmp_user_lock_p lock); +#endif +__kmp_inline void __kmp_itt_lock_acquiring(kmp_user_lock_p lock); +__kmp_inline void __kmp_itt_lock_acquired(kmp_user_lock_p lock); +__kmp_inline void __kmp_itt_lock_releasing(kmp_user_lock_p lock); +__kmp_inline void __kmp_itt_lock_cancelled(kmp_user_lock_p lock); +__kmp_inline void __kmp_itt_lock_destroyed(kmp_user_lock_p lock); + +// --- Critical reporting --- +#if KMP_USE_DYNAMIC_LOCK +__kmp_inline void __kmp_itt_critical_creating(kmp_user_lock_p lock, + const ident_t *); +#else +__kmp_inline void __kmp_itt_critical_creating(kmp_user_lock_p lock); +#endif +__kmp_inline void __kmp_itt_critical_acquiring(kmp_user_lock_p lock); +__kmp_inline void __kmp_itt_critical_acquired(kmp_user_lock_p lock); +__kmp_inline void __kmp_itt_critical_releasing(kmp_user_lock_p lock); +__kmp_inline void __kmp_itt_critical_destroyed(kmp_user_lock_p lock); + +// --- Single reporting --- +__kmp_inline void __kmp_itt_single_start(int gtid); +__kmp_inline void __kmp_itt_single_end(int gtid); + +// --- Ordered reporting --- +__kmp_inline void __kmp_itt_ordered_init(int gtid); +__kmp_inline void __kmp_itt_ordered_prep(int gtid); +__kmp_inline void __kmp_itt_ordered_start(int gtid); +__kmp_inline void __kmp_itt_ordered_end(int gtid); + +// --- Threads reporting --- +__kmp_inline void __kmp_itt_thread_ignore(); +__kmp_inline void __kmp_itt_thread_name(int gtid); + +// --- System objects --- +__kmp_inline void __kmp_itt_system_object_created(void *object, + char const *name); + +// --- Stack stitching --- +__kmp_inline __itt_caller __kmp_itt_stack_caller_create(void); +__kmp_inline void __kmp_itt_stack_caller_destroy(__itt_caller); +__kmp_inline void __kmp_itt_stack_callee_enter(__itt_caller); +__kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller); + +// ----------------------------------------------------------------------------- +// Old stuff for reporting low-level internal synchronization. + +#if USE_ITT_NOTIFY + +/* Support for SSC marks, which are used by SDE + http://software.intel.com/en-us/articles/intel-software-development-emulator + to mark points in instruction traces that represent spin-loops and are + therefore uninteresting when collecting traces for architecture simulation. + */ +#ifndef INCLUDE_SSC_MARKS +#define INCLUDE_SSC_MARKS (KMP_OS_LINUX && KMP_ARCH_X86_64) +#endif + +/* Linux 64 only for now */ +#if (INCLUDE_SSC_MARKS && KMP_OS_LINUX && KMP_ARCH_X86_64) +// Portable (at least for gcc and icc) code to insert the necessary instructions +// to set %ebx and execute the unlikely no-op. +#if defined(__INTEL_COMPILER) +#define INSERT_SSC_MARK(tag) __SSC_MARK(tag) +#else +#define INSERT_SSC_MARK(tag) \ + __asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(tag) \ + : "%ebx") +#endif +#else +#define INSERT_SSC_MARK(tag) ((void)0) +#endif + +/* Markers for the start and end of regions that represent polling and are + therefore uninteresting to architectural simulations 0x4376 and 0x4377 are + arbitrary numbers that should be unique in the space of SSC tags, but there + is no central issuing authority rather randomness is expected to work. */ +#define SSC_MARK_SPIN_START() INSERT_SSC_MARK(0x4376) +#define SSC_MARK_SPIN_END() INSERT_SSC_MARK(0x4377) + +// Markers for architecture simulation. +// FORKING : Before the primary thread forks. +// JOINING : At the start of the join. +// INVOKING : Before the threads invoke microtasks. 
+// DISPATCH_INIT: At the start of dynamically scheduled loop. +// DISPATCH_NEXT: After claming next iteration of dynamically scheduled loop. +#define SSC_MARK_FORKING() INSERT_SSC_MARK(0xd693) +#define SSC_MARK_JOINING() INSERT_SSC_MARK(0xd694) +#define SSC_MARK_INVOKING() INSERT_SSC_MARK(0xd695) +#define SSC_MARK_DISPATCH_INIT() INSERT_SSC_MARK(0xd696) +#define SSC_MARK_DISPATCH_NEXT() INSERT_SSC_MARK(0xd697) + +// The object is an address that associates a specific set of the prepare, +// acquire, release, and cancel operations. + +/* Sync prepare indicates a thread is going to start waiting for another thread + to send a release event. This operation should be done just before the + thread begins checking for the existence of the release event */ + +/* Sync cancel indicates a thread is cancelling a wait on another thread and + continuing execution without waiting for the other thread to release it */ + +/* Sync acquired indicates a thread has received a release event from another + thread and has stopped waiting. This operation must occur only after the + release event is received. */ + +/* Sync release indicates a thread is going to send a release event to another + thread so it will stop waiting and continue execution. This operation must + just happen before the release event. */ + +#define KMP_FSYNC_PREPARE(obj) __itt_fsync_prepare((void *)(obj)) +#define KMP_FSYNC_CANCEL(obj) __itt_fsync_cancel((void *)(obj)) +#define KMP_FSYNC_ACQUIRED(obj) __itt_fsync_acquired((void *)(obj)) +#define KMP_FSYNC_RELEASING(obj) __itt_fsync_releasing((void *)(obj)) + +/* In case of waiting in a spin loop, ITT wants KMP_FSYNC_PREPARE() to be called + with a delay (and not called at all if waiting time is small). So, in spin + loops, do not use KMP_FSYNC_PREPARE(), but use KMP_FSYNC_SPIN_INIT() (before + spin loop), KMP_FSYNC_SPIN_PREPARE() (whithin the spin loop), and + KMP_FSYNC_SPIN_ACQUIRED(). See KMP_WAIT() for example. */ + +#undef KMP_FSYNC_SPIN_INIT +#define KMP_FSYNC_SPIN_INIT(obj, spin) \ + int sync_iters = 0; \ + if (__itt_fsync_prepare_ptr) { \ + if (obj == NULL) { \ + obj = spin; \ + } /* if */ \ + } /* if */ \ + SSC_MARK_SPIN_START() + +#undef KMP_FSYNC_SPIN_PREPARE +#define KMP_FSYNC_SPIN_PREPARE(obj) \ + do { \ + if (__itt_fsync_prepare_ptr && sync_iters < __kmp_itt_prepare_delay) { \ + ++sync_iters; \ + if (sync_iters >= __kmp_itt_prepare_delay) { \ + KMP_FSYNC_PREPARE((void *)obj); \ + } /* if */ \ + } /* if */ \ + } while (0) +#undef KMP_FSYNC_SPIN_ACQUIRED +#define KMP_FSYNC_SPIN_ACQUIRED(obj) \ + do { \ + SSC_MARK_SPIN_END(); \ + if (sync_iters >= __kmp_itt_prepare_delay) { \ + KMP_FSYNC_ACQUIRED((void *)obj); \ + } /* if */ \ + } while (0) + +/* ITT will not report objects created within KMP_ITT_IGNORE(), e. g.: + KMP_ITT_IGNORE( + ptr = malloc( size ); + ); +*/ +#define KMP_ITT_IGNORE(statement) \ + do { \ + __itt_state_t __itt_state_; \ + if (__itt_state_get_ptr) { \ + __itt_state_ = __itt_state_get(); \ + __itt_obj_mode_set(__itt_obj_prop_ignore, __itt_obj_state_set); \ + } /* if */ \ + { statement } \ + if (__itt_state_get_ptr) { \ + __itt_state_set(__itt_state_); \ + } /* if */ \ + } while (0) + +// Maximum number of frame domains to use (maps to +// different OpenMP regions in the user source code). 
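Editor's note: the spin-loop contract spelled out above (KMP_FSYNC_SPIN_INIT before the loop, KMP_FSYNC_SPIN_PREPARE inside it, KMP_FSYNC_SPIN_ACQUIRED once the wait completes) is easiest to see in a minimal sketch. The helper below is illustrative only and not part of this diff; example_spin_wait and its flag parameter are hypothetical, while the KMP_FSYNC_SPIN_* macros are the ones defined just above and KMP_CPU_PAUSE comes from the runtime's own headers.

// Illustrative sketch -- not part of the vendored sources.
static void example_spin_wait(volatile int *flag) {  // hypothetical helper
  void *obj = NULL;
  // Declares sync_iters, emits SSC_MARK_SPIN_START(), and binds obj to the
  // spin location when an ITT collector is attached.
  KMP_FSYNC_SPIN_INIT(obj, (void *)flag);
  while (*flag == 0) {
    // Only reports __itt_fsync_prepare after __kmp_itt_prepare_delay
    // iterations, so short waits stay invisible to the tool.
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_CPU_PAUSE();
  }
  // Emits SSC_MARK_SPIN_END() and, if prepare was reported, the matching
  // __itt_fsync_acquired.
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}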
+const int KMP_MAX_FRAME_DOMAINS = 997; +typedef struct kmp_itthash_entry { + ident_t *loc; + int team_size; + __itt_domain *d; + struct kmp_itthash_entry *next_in_bucket; +} kmp_itthash_entry_t; +typedef struct kmp_itthash { + kmp_itthash_entry_t *buckets[KMP_MAX_FRAME_DOMAINS]; + int count; // just a heuristic to limit number of entries +} kmp_itthash_t; +extern kmp_itthash_t __kmp_itt_region_domains; +extern kmp_itthash_t __kmp_itt_barrier_domains; +extern __itt_domain *metadata_domain; +extern __itt_string_handle *string_handle_imbl; +extern __itt_string_handle *string_handle_loop; +extern __itt_string_handle *string_handle_sngl; + +#else + +// Null definitions of the synchronization tracing functions. +#define KMP_FSYNC_PREPARE(obj) ((void)0) +#define KMP_FSYNC_CANCEL(obj) ((void)0) +#define KMP_FSYNC_ACQUIRED(obj) ((void)0) +#define KMP_FSYNC_RELEASING(obj) ((void)0) + +#define KMP_FSYNC_SPIN_INIT(obj, spin) ((void)0) +#define KMP_FSYNC_SPIN_PREPARE(obj) ((void)0) +#define KMP_FSYNC_SPIN_ACQUIRED(obj) ((void)0) + +#define KMP_ITT_IGNORE(stmt) \ + do { \ + stmt \ + } while (0) + +#endif // USE_ITT_NOTIFY + +#if !KMP_DEBUG +// In release mode include definitions of inline functions. +#include "kmp_itt.inc" +#endif + +#endif // KMP_ITT_H + +#else /* USE_ITT_BUILD */ + +// Null definitions of the synchronization tracing functions. +// If USE_ITT_BULID is not enabled, USE_ITT_NOTIFY cannot be either. +// By defining these we avoid unpleasant ifdef tests in many places. +#define KMP_FSYNC_PREPARE(obj) ((void)0) +#define KMP_FSYNC_CANCEL(obj) ((void)0) +#define KMP_FSYNC_ACQUIRED(obj) ((void)0) +#define KMP_FSYNC_RELEASING(obj) ((void)0) + +#define KMP_FSYNC_SPIN_INIT(obj, spin) ((void)0) +#define KMP_FSYNC_SPIN_PREPARE(obj) ((void)0) +#define KMP_FSYNC_SPIN_ACQUIRED(obj) ((void)0) + +#define KMP_ITT_IGNORE(stmt) \ + do { \ + stmt \ + } while (0) + +#define USE_ITT_BUILD_ARG(x) + +#endif /* USE_ITT_BUILD */ diff --git a/third_party/openmp/kmp_itt.inc b/third_party/openmp/kmp_itt.inc new file mode 100644 index 000000000..5236165c3 --- /dev/null +++ b/third_party/openmp/kmp_itt.inc @@ -0,0 +1,978 @@ +#if USE_ITT_BUILD +/* + * kmp_itt.inl -- Inline functions of ITT Notify. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Inline function definitions. This file should be included into kmp_itt.h file +// for production build (to let compiler inline functions) or into kmp_itt.c +// file for debug build (to reduce the number of files to recompile and save +// build time). + +#include "kmp.h" +#include "kmp_str.h" + +#if KMP_ITT_DEBUG +extern kmp_bootstrap_lock_t __kmp_itt_debug_lock; +#define KMP_ITT_DEBUG_LOCK() \ + { __kmp_acquire_bootstrap_lock(&__kmp_itt_debug_lock); } +#define KMP_ITT_DEBUG_PRINT(...) \ + { \ + fprintf(stderr, "#%02d: ", __kmp_get_gtid()); \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + __kmp_release_bootstrap_lock(&__kmp_itt_debug_lock); \ + } +#else +#define KMP_ITT_DEBUG_LOCK() +#define KMP_ITT_DEBUG_PRINT(...) +#endif // KMP_ITT_DEBUG + +// Ensure that the functions are static if they're supposed to be being inlined. +// Otherwise they cannot be used in more than one file, since there will be +// multiple definitions. 
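Editor's note: __kmp_itthash_find below publishes a freshly allocated entry by prepending it to its bucket with a compare-and-swap retry loop instead of taking a lock. Here is a standalone sketch of that pattern, restated with std::atomic rather than the KMP_COMPARE_AND_STORE_PTR macro; node and prepend are hypothetical names used only for illustration.

// Illustrative sketch -- not part of the vendored sources.
#include <atomic>

struct node {   // hypothetical stand-in for kmp_itthash_entry_t
  int key;
  node *next;
};

static void prepend(std::atomic<node *> &bucket_head, node *n) {
  n->next = bucket_head.load(std::memory_order_relaxed);
  // On failure compare_exchange_weak reloads the current head into n->next,
  // so each retry re-links against whatever another thread just published --
  // the same shape as the KMP_COMPARE_AND_STORE_PTR loop in
  // __kmp_itthash_find.
  while (!bucket_head.compare_exchange_weak(n->next, n,
                                            std::memory_order_release,
                                            std::memory_order_relaxed)) {
    // spin; n->next already holds the refreshed head
  }
}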
+#if KMP_DEBUG +#define LINKAGE +#else +#define LINKAGE static inline +#endif + +// ZCA interface used by Intel(R) Inspector. Intel(R) Parallel Amplifier uses +// this API to support user-defined synchronization primitives, but does not use +// ZCA; it would be safe to turn this off until wider support becomes available. +#if USE_ITT_ZCA +#ifdef __INTEL_COMPILER +#if __INTEL_COMPILER >= 1200 +#undef __itt_sync_acquired +#undef __itt_sync_releasing +#define __itt_sync_acquired(addr) \ + __notify_zc_intrinsic((char *)"sync_acquired", addr) +#define __itt_sync_releasing(addr) \ + __notify_intrinsic((char *)"sync_releasing", addr) +#endif +#endif +#endif + +static kmp_bootstrap_lock_t metadata_lock = + KMP_BOOTSTRAP_LOCK_INITIALIZER(metadata_lock); + +#if USE_ITT_NOTIFY +LINKAGE size_t __kmp_itthash_hash(kmp_intptr_t addr, size_t hsize) { + return ((addr >> 6) ^ (addr >> 2)) % hsize; +} +LINKAGE kmp_itthash_entry *__kmp_itthash_find(kmp_info_t *thread, + kmp_itthash_t *h, ident_t *loc, + int team_size) { + kmp_itthash_entry_t *entry; + size_t bucket = __kmp_itthash_hash((kmp_intptr_t)loc, KMP_MAX_FRAME_DOMAINS); + for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket) + if (entry->loc == loc && entry->team_size == team_size) + break; + + if (entry == NULL) { + // two foreign threads could report frames concurrently + int cnt = KMP_TEST_THEN_INC32(&h->count); + if (cnt >= KMP_MAX_FRAME_DOMAINS) { + KMP_TEST_THEN_DEC32(&h->count); // revert the count + return entry; // too many entries + } + // create new entry + entry = (kmp_itthash_entry_t *)__kmp_thread_malloc( + thread, sizeof(kmp_itthash_entry_t)); + entry->loc = loc; + entry->team_size = team_size; + entry->d = NULL; + entry->next_in_bucket = h->buckets[bucket]; + while (!KMP_COMPARE_AND_STORE_PTR(&h->buckets[bucket], + entry->next_in_bucket, entry)) { + KMP_CPU_PAUSE(); + entry->next_in_bucket = h->buckets[bucket]; + } + } +#if KMP_DEBUG + else { + // check the contents of the location info is unique + KMP_DEBUG_ASSERT(loc->psource == entry->loc->psource); + } +#endif + return entry; +} +#endif + +/* Parallel region reporting. + * __kmp_itt_region_forking should be called by primary thread of a team. + Exact moment of call does not matter, but it should be completed before any + thread of this team calls __kmp_itt_region_starting. + * __kmp_itt_region_starting should be called by each thread of a team just + before entering parallel region body. + * __kmp_itt_region_finished should be called by each thread of a team right + after returning from parallel region body. + * __kmp_itt_region_joined should be called by primary thread of a team, after + all threads called __kmp_itt_region_finished. + + Note: Thread waiting at join barrier (after __kmp_itt_region_finished) can + execute some more user code -- such a thread can execute tasks. + + Note: The overhead of logging region_starting and region_finished in each + thread is too large, so these calls are not used. */ + +LINKAGE void __kmp_itt_region_forking(int gtid, int team_size, int barriers) { +#if USE_ITT_NOTIFY + kmp_team_t *team = __kmp_team_from_gtid(gtid); + if (team->t.t_active_level > 1) { + // The frame notifications are only supported for the outermost teams. 
+ return; + } + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + ident_t *loc = th->th.th_ident; + if (!loc) { + // no sense to report a region without location info + return; + } + kmp_itthash_entry *e; + e = __kmp_itthash_find(th, &__kmp_itt_region_domains, loc, team_size); + if (e == NULL) + return; // too many entries in the hash + if (e->d == NULL) { + // Transform compiler-generated region location into the format + // that the tools more or less standardized on: + // "$omp$parallel@[file:][:]" + char *buff = NULL; + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, + team_size, str_loc.file, str_loc.line, str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + e->d = __itt_domain_create(buff); + KMP_ASSERT(e->d != NULL); + __itt_suppress_pop(); + + __kmp_str_free(&buff); + if (barriers) { + kmp_itthash_entry *e; + e = __kmp_itthash_find(th, &__kmp_itt_barrier_domains, loc, 0); + if (e != NULL) { + KMP_DEBUG_ASSERT(e->d == NULL); + char *buff = NULL; + buff = __kmp_str_format("%s$omp$barrier@%s:%d", str_loc.func, + str_loc.file, str_loc.line); + __itt_suppress_push(__itt_suppress_memory_errors); + e->d = __itt_domain_create(buff); + KMP_ASSERT(e->d != NULL); + __itt_suppress_pop(); + __kmp_str_free(&buff); + } + } + __kmp_str_loc_free(&str_loc); + } + __itt_frame_begin_v3(e->d, NULL); + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT("[frm beg] gtid=%d, domain=%p, loc:%p\n", gtid, e->d, + loc); +#endif +} // __kmp_itt_region_forking + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin, + __itt_timestamp end, int imbalance, + ident_t *loc, int team_size, int region) { +#if USE_ITT_NOTIFY + if (!loc) { + // no sense to report a region without location info + return; + } + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + if (region) { + kmp_team_t *team = __kmp_team_from_gtid(gtid); + int serialized = (region == 2 ? 1 : 0); + if (team->t.t_active_level + serialized > 1) { + // The frame notifications are only supported for the outermost teams. + return; + } + // Check region domain has not been created before. 
+ kmp_itthash_entry *e; + e = __kmp_itthash_find(th, &__kmp_itt_region_domains, loc, team_size); + if (e == NULL) + return; // too many entries in the hash + if (e->d == NULL) { // new entry, need to calculate domain + // Transform compiler-generated region location into the format + // that the tools more or less standardized on: + // "$omp$parallel:team_size@[file:][:]" + char *buff = NULL; + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); + buff = + __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", str_loc.func, + team_size, str_loc.file, str_loc.line, str_loc.col); + __itt_suppress_push(__itt_suppress_memory_errors); + e->d = __itt_domain_create(buff); + KMP_ASSERT(e->d != NULL); + __itt_suppress_pop(); + + __kmp_str_free(&buff); + __kmp_str_loc_free(&str_loc); + } + __itt_frame_submit_v3(e->d, NULL, begin, end); + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT( + "[reg sub] gtid=%d, domain=%p, region:%d, loc:%p, beg:%llu, end:%llu\n", + gtid, e->d, region, loc, begin, end); + return; + } else { // called for barrier reporting + kmp_itthash_entry *e; + e = __kmp_itthash_find(th, &__kmp_itt_barrier_domains, loc, 0); + if (e == NULL) + return; // too many entries in the hash + if (e->d == NULL) { // new entry, need to calculate domain + // Transform compiler-generated region location into the format + // that the tools more or less standardized on: + // "$omp$frame@[file:][:]" + kmp_str_loc_t str_loc = + __kmp_str_loc_init(loc->psource, /* init_fname */ false); + char *buff = NULL; + if (imbalance) { + buff = + __kmp_str_format("%s$omp$barrier-imbalance:%d@%s:%d", str_loc.func, + team_size, str_loc.file, str_loc.line); + } else { + buff = __kmp_str_format("%s$omp$barrier@%s:%d", str_loc.func, + str_loc.file, str_loc.line); + } + __itt_suppress_push(__itt_suppress_memory_errors); + e->d = __itt_domain_create(buff); + KMP_ASSERT(e->d != NULL); + __itt_suppress_pop(); + __kmp_str_free(&buff); + __kmp_str_loc_free(&str_loc); + } + __itt_frame_submit_v3(e->d, NULL, begin, end); + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT( + "[frm sub] gtid=%d, domain=%p, loc:%p, beg:%llu, end:%llu\n", gtid, + e->d, loc, begin, end); + } +#endif +} // __kmp_itt_frame_submit + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_metadata_imbalance(int gtid, kmp_uint64 begin, + kmp_uint64 end, kmp_uint64 imbalance, + kmp_uint64 reduction) { +#if USE_ITT_NOTIFY + if (metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock(&metadata_lock); + if (metadata_domain == NULL) { + __itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create("OMP Metadata"); + string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance"); + string_handle_loop = __itt_string_handle_create("omp_metadata_loop"); + string_handle_sngl = __itt_string_handle_create("omp_metadata_single"); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock(&metadata_lock); + } + + kmp_uint64 imbalance_data[4]; + imbalance_data[0] = begin; + imbalance_data[1] = end; + imbalance_data[2] = imbalance; + imbalance_data[3] = reduction; + + __itt_metadata_add(metadata_domain, __itt_null, string_handle_imbl, + __itt_metadata_u64, 4, imbalance_data); +#endif +} // __kmp_itt_metadata_imbalance + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_metadata_loop(ident_t *loc, kmp_uint64 sched_type, + kmp_uint64 iterations, kmp_uint64 chunk) { +#if USE_ITT_NOTIFY + if (metadata_domain == 
NULL) { + __kmp_acquire_bootstrap_lock(&metadata_lock); + if (metadata_domain == NULL) { + __itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create("OMP Metadata"); + string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance"); + string_handle_loop = __itt_string_handle_create("omp_metadata_loop"); + string_handle_sngl = __itt_string_handle_create("omp_metadata_single"); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock(&metadata_lock); + } + + // Parse line and column from psource string: ";file;func;line;col;;" + KMP_DEBUG_ASSERT(loc->psource); + kmp_uint64 loop_data[5]; + int line, col; + __kmp_str_loc_numbers(loc->psource, &line, &col); + loop_data[0] = line; + loop_data[1] = col; + loop_data[2] = sched_type; + loop_data[3] = iterations; + loop_data[4] = chunk; + + __itt_metadata_add(metadata_domain, __itt_null, string_handle_loop, + __itt_metadata_u64, 5, loop_data); +#endif +} // __kmp_itt_metadata_loop + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_metadata_single(ident_t *loc) { +#if USE_ITT_NOTIFY + if (metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock(&metadata_lock); + if (metadata_domain == NULL) { + __itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create("OMP Metadata"); + string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance"); + string_handle_loop = __itt_string_handle_create("omp_metadata_loop"); + string_handle_sngl = __itt_string_handle_create("omp_metadata_single"); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock(&metadata_lock); + } + + int line, col; + __kmp_str_loc_numbers(loc->psource, &line, &col); + kmp_uint64 single_data[2]; + single_data[0] = line; + single_data[1] = col; + + __itt_metadata_add(metadata_domain, __itt_null, string_handle_sngl, + __itt_metadata_u64, 2, single_data); +#endif +} // __kmp_itt_metadata_single + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_region_starting(int gtid) { +#if USE_ITT_NOTIFY +#endif +} // __kmp_itt_region_starting + +// ----------------------------------------------------------------------------- +LINKAGE void __kmp_itt_region_finished(int gtid) { +#if USE_ITT_NOTIFY +#endif +} // __kmp_itt_region_finished + +// ---------------------------------------------------------------------------- +LINKAGE void __kmp_itt_region_joined(int gtid) { +#if USE_ITT_NOTIFY + kmp_team_t *team = __kmp_team_from_gtid(gtid); + if (team->t.t_active_level > 1) { + // The frame notifications are only supported for the outermost teams. + return; + } + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + ident_t *loc = th->th.th_ident; + if (loc) { + kmp_itthash_entry *e = __kmp_itthash_find(th, &__kmp_itt_region_domains, + loc, th->th.th_team_nproc); + if (e == NULL) + return; // too many entries in the hash + KMP_DEBUG_ASSERT(e->d); + KMP_ITT_DEBUG_LOCK(); + __itt_frame_end_v3(e->d, NULL); + KMP_ITT_DEBUG_PRINT("[frm end] gtid=%d, domain=%p, loc:%p\n", gtid, e->d, + loc); + } +#endif +} // __kmp_itt_region_joined + +/* Barriers reporting. + + A barrier consists of two phases: + 1. Gather -- primary thread waits for all worker threads to arrive; each + worker thread registers arrival and goes further. + 2. Release -- each worker thread waits until primary thread lets it go; + primary thread lets worker threads go. 
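Editor's note: the three metadata helpers above (__kmp_itt_metadata_imbalance, __kmp_itt_metadata_loop, __kmp_itt_metadata_single) all create the shared "OMP Metadata" domain lazily with the same check / lock / re-check sequence guarded by metadata_lock. Below is a standalone sketch of that double-checked initialization using std::mutex in place of the runtime's bootstrap lock; get_domain, create_domain, and the globals are hypothetical names.

// Illustrative sketch -- not part of the vendored sources.
#include <mutex>

static void *g_domain = nullptr;   // hypothetical stand-in for metadata_domain
static std::mutex g_domain_lock;   // stand-in for metadata_lock

static void *get_domain(void *(*create_domain)()) {
  if (g_domain == nullptr) {                        // fast path, no lock taken
    std::lock_guard<std::mutex> guard(g_domain_lock);
    if (g_domain == nullptr)                        // re-check under the lock
      g_domain = create_domain();
  }
  return g_domain;
}
// Like the runtime code above, the unlocked first read is a plain (non-atomic)
// check; a fully portable version would publish the pointer through
// std::atomic or use std::call_once.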
+ + Function should be called by each thread: + * __kmp_itt_barrier_starting() -- before arriving to the gather phase. + * __kmp_itt_barrier_middle() -- between gather and release phases. + * __kmp_itt_barrier_finished() -- after release phase. + + Note: Call __kmp_itt_barrier_object() before call to + __kmp_itt_barrier_starting() and save result in local variable. + __kmp_itt_barrier_object(), being called too late (e. g. after gather phase) + would return itt sync object for the next barrier! + + ITT need an address (void *) to be specified as a sync object. OpenMP RTL + does not have barrier object or barrier data structure. Barrier is just a + counter in team and thread structures. We could use an address of team + structure as a barrier sync object, but ITT wants different objects for + different barriers (even whithin the same team). So let us use team address + as barrier sync object for the first barrier, then increase it by one for the + next barrier, and so on (but wrap it not to use addresses outside of team + structure). */ + +void *__kmp_itt_barrier_object(int gtid, int bt, int set_name, + int delta // 0 (current barrier) is default + // value; specify -1 to get previous + // barrier. + ) { + void *object = NULL; +#if USE_ITT_NOTIFY + kmp_info_t *thr = __kmp_thread_from_gtid(gtid); + kmp_team_t *team = thr->th.th_team; + + // NOTE: If the function is called from __kmp_fork_barrier, team pointer can + // be NULL. This "if" helps to avoid crash. However, this is not complete + // solution, and reporting fork/join barriers to ITT should be revisited. + + if (team != NULL) { + // Primary thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time. + // Divide b_arrived by KMP_BARRIER_STATE_BUMP to get plain barrier counter. + kmp_uint64 counter = + team->t.t_bar[bt].b_arrived / KMP_BARRIER_STATE_BUMP + delta; + // Now form the barrier id. Encode barrier type (bt) in barrier id too, so + // barriers of different types do not have the same ids. + KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= bs_last_barrier); + // This condition is a must (we would have zero divide otherwise). + KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= 2 * bs_last_barrier); + // More strong condition: make sure we have room at least for two + // different ids (for each barrier type). + object = reinterpret_cast( + (kmp_uintptr_t)(team) + + (kmp_uintptr_t)counter % (sizeof(kmp_team_t) / bs_last_barrier) * + bs_last_barrier + + bt); + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT("[bar obj] type=%d, counter=%lld, object=%p\n", bt, + counter, object); + + if (set_name) { + ident_t const *loc = NULL; + char const *src = NULL; + char const *type = "OMP Barrier"; + switch (bt) { + case bs_plain_barrier: { + // For plain barrier compiler calls __kmpc_barrier() function, which + // saves location in thr->th.th_ident. + loc = thr->th.th_ident; + // Get the barrier type from flags provided by compiler. 
+ kmp_int32 expl = 0; + kmp_uint32 impl = 0; + if (loc != NULL) { + src = loc->psource; + expl = (loc->flags & KMP_IDENT_BARRIER_EXPL) != 0; + impl = (loc->flags & KMP_IDENT_BARRIER_IMPL) != 0; + } + if (impl) { + switch (loc->flags & KMP_IDENT_BARRIER_IMPL_MASK) { + case KMP_IDENT_BARRIER_IMPL_FOR: { + type = "OMP For Barrier"; + } break; + case KMP_IDENT_BARRIER_IMPL_SECTIONS: { + type = "OMP Sections Barrier"; + } break; + case KMP_IDENT_BARRIER_IMPL_SINGLE: { + type = "OMP Single Barrier"; + } break; + case KMP_IDENT_BARRIER_IMPL_WORKSHARE: { + type = "OMP Workshare Barrier"; + } break; + default: { + type = "OMP Implicit Barrier"; + KMP_DEBUG_ASSERT(0); + } + } + } else if (expl) { + type = "OMP Explicit Barrier"; + } + } break; + case bs_forkjoin_barrier: { + // In case of fork/join barrier we can read thr->th.th_ident, because it + // contains location of last passed construct (while join barrier is not + // such one). Use th_ident of primary thread instead -- + // __kmp_join_call() called by the primary thread saves location. + // + // AC: cannot read from primary thread because __kmp_join_call may not + // be called yet, so we read the location from team. This is the + // same location. Team is valid on entry to join barrier where this + // happens. + loc = team->t.t_ident; + if (loc != NULL) { + src = loc->psource; + } + type = "OMP Join Barrier"; + } break; + } + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(object, type, src, __itt_attr_barrier); + KMP_ITT_DEBUG_PRINT( + "[bar sta] scre( %p, \"%s\", \"%s\", __itt_attr_barrier )\n", object, + type, src); + } + } +#endif + return object; +} // __kmp_itt_barrier_object + +// ----------------------------------------------------------------------------- +void __kmp_itt_barrier_starting(int gtid, void *object) { +#if USE_ITT_NOTIFY + if (!KMP_MASTER_GTID(gtid)) { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_releasing(object); + KMP_ITT_DEBUG_PRINT("[bar sta] srel( %p )\n", object); + } + KMP_ITT_DEBUG_LOCK(); + __itt_sync_prepare(object); + KMP_ITT_DEBUG_PRINT("[bar sta] spre( %p )\n", object); +#endif +} // __kmp_itt_barrier_starting + +// ----------------------------------------------------------------------------- +void __kmp_itt_barrier_middle(int gtid, void *object) { +#if USE_ITT_NOTIFY + if (KMP_MASTER_GTID(gtid)) { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_acquired(object); + KMP_ITT_DEBUG_PRINT("[bar mid] sacq( %p )\n", object); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_releasing(object); + KMP_ITT_DEBUG_PRINT("[bar mid] srel( %p )\n", object); + } else { + } +#endif +} // __kmp_itt_barrier_middle + +// ----------------------------------------------------------------------------- +void __kmp_itt_barrier_finished(int gtid, void *object) { +#if USE_ITT_NOTIFY + if (KMP_MASTER_GTID(gtid)) { + } else { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_acquired(object); + KMP_ITT_DEBUG_PRINT("[bar end] sacq( %p )\n", object); + } +#endif +} // __kmp_itt_barrier_finished + +/* Taskwait reporting. + ITT need an address (void *) to be specified as a sync object. OpenMP RTL + does not have taskwait structure, so we need to construct something. 
*/ + +void *__kmp_itt_taskwait_object(int gtid) { + void *object = NULL; +#if USE_ITT_NOTIFY + if (UNLIKELY(__itt_sync_create_ptr)) { + kmp_info_t *thread = __kmp_thread_from_gtid(gtid); + kmp_taskdata_t *taskdata = thread->th.th_current_task; + object = reinterpret_cast(kmp_uintptr_t(taskdata) + + taskdata->td_taskwait_counter % + sizeof(kmp_taskdata_t)); + } +#endif + return object; +} // __kmp_itt_taskwait_object + +void __kmp_itt_taskwait_starting(int gtid, void *object) { +#if USE_ITT_NOTIFY + kmp_info_t *thread = __kmp_thread_from_gtid(gtid); + kmp_taskdata_t *taskdata = thread->th.th_current_task; + ident_t const *loc = taskdata->td_taskwait_ident; + char const *src = (loc == NULL ? NULL : loc->psource); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(object, "OMP Taskwait", src, 0); + KMP_ITT_DEBUG_PRINT("[twa sta] scre( %p, \"OMP Taskwait\", \"%s\", 0 )\n", + object, src); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_prepare(object); + KMP_ITT_DEBUG_PRINT("[twa sta] spre( %p )\n", object); +#endif +} // __kmp_itt_taskwait_starting + +void __kmp_itt_taskwait_finished(int gtid, void *object) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_acquired(object); + KMP_ITT_DEBUG_PRINT("[twa end] sacq( %p )\n", object); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_destroy(object); + KMP_ITT_DEBUG_PRINT("[twa end] sdes( %p )\n", object); +#endif +} // __kmp_itt_taskwait_finished + +/* Task reporting. + Only those tasks are reported which are executed by a thread spinning at + barrier (or taskwait). Synch object passed to the function must be barrier of + taskwait the threads waiting at. */ + +void __kmp_itt_task_starting( + void *object // ITT sync object: barrier or taskwait. + ) { +#if USE_ITT_NOTIFY + if (UNLIKELY(object != NULL)) { + KMP_ITT_DEBUG_LOCK(); + __itt_sync_cancel(object); + KMP_ITT_DEBUG_PRINT("[tsk sta] scan( %p )\n", object); + } +#endif +} // __kmp_itt_task_starting + +// ----------------------------------------------------------------------------- +void __kmp_itt_task_finished( + void *object // ITT sync object: barrier or taskwait. + ) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_prepare(object); + KMP_ITT_DEBUG_PRINT("[tsk end] spre( %p )\n", object); +#endif +} // __kmp_itt_task_finished + +/* Lock reporting. + * __kmp_itt_lock_creating( lock ) should be called *before* the first lock + operation (set/unset). It is not a real event shown to the user but just + setting a name for synchronization object. `lock' is an address of sync + object, the same address should be used in all subsequent calls. + * __kmp_itt_lock_acquiring() should be called before setting the lock. + * __kmp_itt_lock_acquired() should be called after setting the lock. + * __kmp_itt_lock_realeasing() should be called before unsetting the lock. + * __kmp_itt_lock_cancelled() should be called after thread cancelled waiting + for the lock. + * __kmp_itt_lock_destroyed( lock ) should be called after the last lock + operation. After __kmp_itt_lock_destroyed() all the references to the same + address will be considered as another sync object, not related with the + original one. */ + +#if KMP_USE_DYNAMIC_LOCK +// Takes location information directly +__kmp_inline void ___kmp_itt_lock_init(kmp_user_lock_p lock, char const *type, + const ident_t *loc) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + char const *src = (loc == NULL ? 
NULL : loc->psource); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(lock, type, src, 0); + KMP_ITT_DEBUG_PRINT("[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type, + src); + } +#endif +} +#else // KMP_USE_DYNAMIC_LOCK +// Internal guts -- common code for locks and critical sections, do not call +// directly. +__kmp_inline void ___kmp_itt_lock_init(kmp_user_lock_p lock, char const *type) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + ident_t const *loc = NULL; + if (__kmp_get_user_lock_location_ != NULL) + loc = __kmp_get_user_lock_location_((lock)); + char const *src = (loc == NULL ? NULL : loc->psource); + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(lock, type, src, 0); + KMP_ITT_DEBUG_PRINT("[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type, + src); + } +#endif +} // ___kmp_itt_lock_init +#endif // KMP_USE_DYNAMIC_LOCK + +// Internal guts -- common code for locks and critical sections, do not call +// directly. +__kmp_inline void ___kmp_itt_lock_fini(kmp_user_lock_p lock, char const *type) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_destroy(lock); + KMP_ITT_DEBUG_PRINT("[lck dst] sdes( %p )\n", lock); +#endif +} // ___kmp_itt_lock_fini + +// ----------------------------------------------------------------------------- +#if KMP_USE_DYNAMIC_LOCK +void __kmp_itt_lock_creating(kmp_user_lock_p lock, const ident_t *loc) { + ___kmp_itt_lock_init(lock, "OMP Lock", loc); +} +#else +void __kmp_itt_lock_creating(kmp_user_lock_p lock) { + ___kmp_itt_lock_init(lock, "OMP Lock"); +} // __kmp_itt_lock_creating +#endif + +void __kmp_itt_lock_acquiring(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + // postpone lock object access + if (__itt_sync_prepare_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_prepare(ilk->lock); + } else { + __itt_sync_prepare(lock); + } + } +#else + __itt_sync_prepare(lock); +#endif +} // __kmp_itt_lock_acquiring + +void __kmp_itt_lock_acquired(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + // postpone lock object access + if (__itt_sync_acquired_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_acquired(ilk->lock); + } else { + __itt_sync_acquired(lock); + } + } +#else + __itt_sync_acquired(lock); +#endif +} // __kmp_itt_lock_acquired + +void __kmp_itt_lock_releasing(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + if (__itt_sync_releasing_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_releasing(ilk->lock); + } else { + __itt_sync_releasing(lock); + } + } +#else + __itt_sync_releasing(lock); +#endif +} // __kmp_itt_lock_releasing + +void __kmp_itt_lock_cancelled(kmp_user_lock_p lock) { +#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY + if (__itt_sync_cancel_ptr) { + if (KMP_EXTRACT_D_TAG(lock) == 0) { + kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); + __itt_sync_cancel(ilk->lock); + } else { + __itt_sync_cancel(lock); + } + } +#else + __itt_sync_cancel(lock); +#endif +} // __kmp_itt_lock_cancelled + +void __kmp_itt_lock_destroyed(kmp_user_lock_p lock) { + ___kmp_itt_lock_fini(lock, "OMP Lock"); +} // __kmp_itt_lock_destroyed + +/* Critical reporting. + Critical sections are treated exactly as locks (but have different object + type). 
*/ +#if KMP_USE_DYNAMIC_LOCK +void __kmp_itt_critical_creating(kmp_user_lock_p lock, const ident_t *loc) { + ___kmp_itt_lock_init(lock, "OMP Critical", loc); +} +#else +void __kmp_itt_critical_creating(kmp_user_lock_p lock) { + ___kmp_itt_lock_init(lock, "OMP Critical"); +} // __kmp_itt_critical_creating +#endif + +void __kmp_itt_critical_acquiring(kmp_user_lock_p lock) { + __itt_sync_prepare(lock); +} // __kmp_itt_critical_acquiring + +void __kmp_itt_critical_acquired(kmp_user_lock_p lock) { + __itt_sync_acquired(lock); +} // __kmp_itt_critical_acquired + +void __kmp_itt_critical_releasing(kmp_user_lock_p lock) { + __itt_sync_releasing(lock); +} // __kmp_itt_critical_releasing + +void __kmp_itt_critical_destroyed(kmp_user_lock_p lock) { + ___kmp_itt_lock_fini(lock, "OMP Critical"); +} // __kmp_itt_critical_destroyed + +/* Single reporting. */ + +void __kmp_itt_single_start(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_mark_create_ptr || KMP_ITT_DEBUG) { + kmp_info_t *thr = __kmp_thread_from_gtid((gtid)); + ident_t *loc = thr->th.th_ident; + char const *src = (loc == NULL ? NULL : loc->psource); + kmp_str_buf_t name; + __kmp_str_buf_init(&name); + __kmp_str_buf_print(&name, "OMP Single-%s", src); + KMP_ITT_DEBUG_LOCK(); + thr->th.th_itt_mark_single = __itt_mark_create(name.str); + KMP_ITT_DEBUG_PRINT("[sin sta] mcre( \"%s\") -> %d\n", name.str, + thr->th.th_itt_mark_single); + __kmp_str_buf_free(&name); + KMP_ITT_DEBUG_LOCK(); + __itt_mark(thr->th.th_itt_mark_single, NULL); + KMP_ITT_DEBUG_PRINT("[sin sta] mark( %d, NULL )\n", + thr->th.th_itt_mark_single); + } +#endif +} // __kmp_itt_single_start + +void __kmp_itt_single_end(int gtid) { +#if USE_ITT_NOTIFY + __itt_mark_type mark = __kmp_thread_from_gtid(gtid)->th.th_itt_mark_single; + KMP_ITT_DEBUG_LOCK(); + __itt_mark_off(mark); + KMP_ITT_DEBUG_PRINT("[sin end] moff( %d )\n", mark); +#endif +} // __kmp_itt_single_end + +/* Ordered reporting. + * __kmp_itt_ordered_init is called by each thread *before* first using sync + object. ITT team would like it to be called once, but it requires extra + synchronization. + * __kmp_itt_ordered_prep is called when thread is going to enter ordered + section (before synchronization). + * __kmp_itt_ordered_start is called just before entering user code (after + synchronization). + * __kmp_itt_ordered_end is called after returning from user code. + + Sync object is th->th.th_dispatch->th_dispatch_sh_current. + Events are not generated in case of serialized team. */ + +void __kmp_itt_ordered_init(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_info_t *thr = __kmp_thread_from_gtid(gtid); + ident_t const *loc = thr->th.th_ident; + char const *src = (loc == NULL ? 
NULL : loc->psource); + __itt_sync_create(thr->th.th_dispatch->th_dispatch_sh_current, + "OMP Ordered", src, 0); + } +#endif +} // __kmp_itt_ordered_init + +void __kmp_itt_ordered_prep(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_team_t *t = __kmp_team_from_gtid(gtid); + if (!t->t.t_serialized) { + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + __itt_sync_prepare(th->th.th_dispatch->th_dispatch_sh_current); + } + } +#endif +} // __kmp_itt_ordered_prep + +void __kmp_itt_ordered_start(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_team_t *t = __kmp_team_from_gtid(gtid); + if (!t->t.t_serialized) { + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + __itt_sync_acquired(th->th.th_dispatch->th_dispatch_sh_current); + } + } +#endif +} // __kmp_itt_ordered_start + +void __kmp_itt_ordered_end(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr) { + kmp_team_t *t = __kmp_team_from_gtid(gtid); + if (!t->t.t_serialized) { + kmp_info_t *th = __kmp_thread_from_gtid(gtid); + __itt_sync_releasing(th->th.th_dispatch->th_dispatch_sh_current); + } + } +#endif +} // __kmp_itt_ordered_end + +/* Threads reporting. */ + +void __kmp_itt_thread_ignore() { + __itt_thr_ignore(); +} // __kmp_itt_thread_ignore + +void __kmp_itt_thread_name(int gtid) { +#if USE_ITT_NOTIFY + if (__itt_thr_name_set_ptr) { + kmp_str_buf_t name; + __kmp_str_buf_init(&name); + if (KMP_MASTER_GTID(gtid)) { + __kmp_str_buf_print(&name, "OMP Primary Thread #%d", gtid); + } else { + __kmp_str_buf_print(&name, "OMP Worker Thread #%d", gtid); + } + KMP_ITT_DEBUG_LOCK(); + __itt_thr_name_set(name.str, name.used); + KMP_ITT_DEBUG_PRINT("[thr nam] name( \"%s\")\n", name.str); + __kmp_str_buf_free(&name); + } +#endif +} // __kmp_itt_thread_name + +/* System object reporting. + ITT catches operations with system sync objects (like Windows* OS on IA-32 + architecture API critical sections and events). We only need to specify + name ("OMP Scheduler") for the object to let ITT know it is an object used + by OpenMP RTL for internal purposes. */ + +void __kmp_itt_system_object_created(void *object, char const *name) { +#if USE_ITT_NOTIFY + KMP_ITT_DEBUG_LOCK(); + __itt_sync_create(object, "OMP Scheduler", name, 0); + KMP_ITT_DEBUG_PRINT("[sys obj] scre( %p, \"OMP Scheduler\", \"%s\", 0 )\n", + object, name); +#endif +} // __kmp_itt_system_object_created + +/* Stack stitching api. + Primary thread calls "create" and put the stitching id into team structure. + Workers read the stitching id and call "enter" / "leave" api. + Primary thread calls "destroy" at the end of the parallel region. 
*/ + +__itt_caller __kmp_itt_stack_caller_create() { +#if USE_ITT_NOTIFY + if (!__itt_stack_caller_create_ptr) + return NULL; + KMP_ITT_DEBUG_LOCK(); + __itt_caller id = __itt_stack_caller_create(); + KMP_ITT_DEBUG_PRINT("[stk cre] %p\n", id); + return id; +#endif + return NULL; +} + +void __kmp_itt_stack_caller_destroy(__itt_caller id) { +#if USE_ITT_NOTIFY + if (__itt_stack_caller_destroy_ptr) { + KMP_ITT_DEBUG_LOCK(); + __itt_stack_caller_destroy(id); + KMP_ITT_DEBUG_PRINT("[stk des] %p\n", id); + } +#endif +} + +void __kmp_itt_stack_callee_enter(__itt_caller id) { +#if USE_ITT_NOTIFY + if (__itt_stack_callee_enter_ptr) { + KMP_ITT_DEBUG_LOCK(); + __itt_stack_callee_enter(id); + KMP_ITT_DEBUG_PRINT("[stk ent] %p\n", id); + } +#endif +} + +void __kmp_itt_stack_callee_leave(__itt_caller id) { +#if USE_ITT_NOTIFY + if (__itt_stack_callee_leave_ptr) { + KMP_ITT_DEBUG_LOCK(); + __itt_stack_callee_leave(id); + KMP_ITT_DEBUG_PRINT("[stk lea] %p\n", id); + } +#endif +} + +#endif /* USE_ITT_BUILD */ diff --git a/third_party/openmp/kmp_lock.cpp b/third_party/openmp/kmp_lock.cpp new file mode 100644 index 000000000..85c54f4cd --- /dev/null +++ b/third_party/openmp/kmp_lock.cpp @@ -0,0 +1,4052 @@ +/* + * kmp_lock.cpp -- lock-related functions + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <stddef.h> +#include <atomic> + +#include "kmp.h" +#include "kmp_i18n.h" +#include "kmp_io.h" +#include "kmp_itt.h" +#include "kmp_lock.h" +#include "kmp_wait_release.h" +#include "kmp_wrapper_getpid.h" + +#if KMP_USE_FUTEX +#include <sys/syscall.h> +#include <unistd.h> +// We should really include <futex.h>, but that causes compatibility problems on +// different Linux* OS distributions that either require that you include (or +// break when you try to include) <linux/futex.h>. Since all we need is the two +// macros below (which are part of the kernel ABI, so can't change) we just +// define the constants here and don't include <futex.h> +#ifndef FUTEX_WAIT +#define FUTEX_WAIT 0 +#endif +#ifndef FUTEX_WAKE +#define FUTEX_WAKE 1 +#endif +#endif + +/* Implement spin locks for internal library use. */ +/* The algorithm implemented is Lamport's bakery lock [1974]. */ + +void __kmp_validate_locks(void) { + int i; + kmp_uint32 x, y; + + /* Check to make sure unsigned arithmetic wraps properly */ + x = ~((kmp_uint32)0) - 2; + y = x - 2; + + for (i = 0; i < 8; ++i, ++x, ++y) { + kmp_uint32 z = (x - y); + KMP_ASSERT(z == 2); + } + + KMP_ASSERT(offsetof(kmp_base_queuing_lock, tail_id) % 8 == 0); +} + +/* ------------------------------------------------------------------------ */ +/* test and set locks */ + +// For the non-nested locks, we can only assume that the first 4 bytes were +// allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel +// compiler only allocates a 4 byte pointer on IA-32 architecture. On +// Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated. +// +// gcc reserves >= 8 bytes for nested locks, so we can assume that the +// entire 8 bytes were allocated for nested locks on all 64-bit platforms.
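The test-and-set code that follows is easier to read against a stripped-down sketch. The snippet below is an illustrative sketch only and is not part of the vendored kmp_lock.cpp: the tas_lock type and its members are invented for the example, and the real implementation adds lock profiling, a tuned exponential backoff, owner/consistency checks, and nestable variants.

#include <atomic>
#include <thread>

// Minimal test-and-set spinlock in plain C++11 atomics, mirroring the shape
// of __kmp_acquire_tas_lock_timed_template / __kmp_release_tas_lock below.
struct tas_lock {
  std::atomic<int> poll{0}; // 0 = free, gtid + 1 = owning thread

  void acquire(int gtid) {
    int expected = 0;
    // Fast path: a single acquire CAS when the lock looks free.
    if (poll.load(std::memory_order_relaxed) == 0 &&
        poll.compare_exchange_strong(expected, gtid + 1,
                                     std::memory_order_acquire))
      return;
    // Slow path: keep re-testing, yielding so oversubscribed threads still
    // make progress (the real code uses KMP_YIELD_OVERSUB_ELSE_SPIN plus a
    // tuned backoff instead of a bare yield).
    for (;;) {
      expected = 0;
      if (poll.load(std::memory_order_relaxed) == 0 &&
          poll.compare_exchange_strong(expected, gtid + 1,
                                       std::memory_order_acquire))
        return;
      std::this_thread::yield();
    }
  }

  void release() {
    // The release store pairs with the acquire CAS in acquire().
    poll.store(0, std::memory_order_release);
  }
};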
+ +static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) { + return KMP_LOCK_STRIP(KMP_ATOMIC_LD_RLX(&lck->lk.poll)) - 1; +} + +static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) { + return lck->lk.depth_locked != -1; +} + +__forceinline static int +__kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) { + KMP_MB(); + +#ifdef USE_LOCK_PROFILE + kmp_uint32 curr = KMP_LOCK_STRIP(lck->lk.poll); + if ((curr != 0) && (curr != gtid + 1)) + __kmp_printf("LOCK CONTENTION: %p\n", lck); +/* else __kmp_printf( "." );*/ +#endif /* USE_LOCK_PROFILE */ + + kmp_int32 tas_free = KMP_LOCK_FREE(tas); + kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); + + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free && + __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) { + KMP_FSYNC_ACQUIRED(lck); + return KMP_LOCK_ACQUIRED_FIRST; + } + + kmp_uint32 spins; + kmp_uint64 time; + KMP_FSYNC_PREPARE(lck); + KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); + kmp_backoff_t backoff = __kmp_spin_backoff_params; + do { +#if !KMP_HAVE_UMWAIT + __kmp_spin_backoff(&backoff); +#else + if (!__kmp_tpause_enabled) + __kmp_spin_backoff(&backoff); +#endif + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); + } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free || + !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)); + KMP_FSYNC_ACQUIRED(lck); + return KMP_LOCK_ACQUIRED_FIRST; +} + +int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { + int retval = __kmp_acquire_tas_lock_timed_template(lck, gtid); + return retval; +} + +static int __kmp_acquire_tas_lock_with_checks(kmp_tas_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_lock"; + if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) == gtid)) { + KMP_FATAL(LockIsAlreadyOwned, func); + } + return __kmp_acquire_tas_lock(lck, gtid); +} + +int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { + kmp_int32 tas_free = KMP_LOCK_FREE(tas); + kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free && + __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) { + KMP_FSYNC_ACQUIRED(lck); + return TRUE; + } + return FALSE; +} + +static int __kmp_test_tas_lock_with_checks(kmp_tas_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_lock"; + if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + return __kmp_test_tas_lock(lck, gtid); +} + +int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KMP_FSYNC_RELEASING(lck); + KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas)); + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + + KMP_YIELD_OVERSUB(); + return KMP_LOCK_RELEASED; +} + +static int __kmp_release_tas_lock_with_checks(kmp_tas_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_tas_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) >= 0) && + (__kmp_get_tas_lock_owner(lck) != gtid)) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + return __kmp_release_tas_lock(lck, gtid); +} + +void __kmp_init_tas_lock(kmp_tas_lock_t *lck) { + lck->lk.poll = KMP_LOCK_FREE(tas); +} + +void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = 0; } + +static void __kmp_destroy_tas_lock_with_checks(kmp_tas_lock_t *lck) { + char const *const func = "omp_destroy_lock"; + if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_tas_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_tas_lock(lck); +} + +// nested test and set locks + +int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_tas_lock_owner(lck) == gtid) { + lck->lk.depth_locked += 1; + return KMP_LOCK_ACQUIRED_NEXT; + } else { + __kmp_acquire_tas_lock_timed_template(lck, gtid); + lck->lk.depth_locked = 1; + return KMP_LOCK_ACQUIRED_FIRST; + } +} + +static int __kmp_acquire_nested_tas_lock_with_checks(kmp_tas_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_nest_lock"; + if (!__kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_acquire_nested_tas_lock(lck, gtid); +} + +int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { + int retval; + + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_tas_lock_owner(lck) == gtid) { + retval = ++lck->lk.depth_locked; + } else if (!__kmp_test_tas_lock(lck, gtid)) { + retval = 0; + } else { + KMP_MB(); + retval = lck->lk.depth_locked = 1; + } + return retval; +} + +static int __kmp_test_nested_tas_lock_with_checks(kmp_tas_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_nest_lock"; + if (!__kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_test_nested_tas_lock(lck, gtid); +} + +int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + KMP_MB(); + if (--(lck->lk.depth_locked) == 0) { + __kmp_release_tas_lock(lck, gtid); + return KMP_LOCK_RELEASED; + } + return KMP_LOCK_STILL_HELD; +} + +static int __kmp_release_nested_tas_lock_with_checks(kmp_tas_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if (!__kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_tas_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if (__kmp_get_tas_lock_owner(lck) != gtid) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + return __kmp_release_nested_tas_lock(lck, gtid); +} + +void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck) { + __kmp_init_tas_lock(lck); + lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks +} + +void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t 
*lck) { + __kmp_destroy_tas_lock(lck); + lck->lk.depth_locked = 0; +} + +static void __kmp_destroy_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) { + char const *const func = "omp_destroy_nest_lock"; + if (!__kmp_is_tas_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_tas_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_nested_tas_lock(lck); +} + +#if KMP_USE_FUTEX + +/* ------------------------------------------------------------------------ */ +/* futex locks */ + +// futex locks are really just test and set locks, with a different method +// of handling contention. They take the same amount of space as test and +// set locks, and are allocated the same way (i.e. use the area allocated by +// the compiler for non-nested locks / allocate nested locks on the heap). + +static kmp_int32 __kmp_get_futex_lock_owner(kmp_futex_lock_t *lck) { + return KMP_LOCK_STRIP((TCR_4(lck->lk.poll) >> 1)) - 1; +} + +static inline bool __kmp_is_futex_lock_nestable(kmp_futex_lock_t *lck) { + return lck->lk.depth_locked != -1; +} + +__forceinline static int +__kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) { + kmp_int32 gtid_code = (gtid + 1) << 1; + + KMP_MB(); + +#ifdef USE_LOCK_PROFILE + kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll)); + if ((curr != 0) && (curr != gtid_code)) + __kmp_printf("LOCK CONTENTION: %p\n", lck); +/* else __kmp_printf( "." );*/ +#endif /* USE_LOCK_PROFILE */ + + KMP_FSYNC_PREPARE(lck); + KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n", + lck, lck->lk.poll, gtid)); + + kmp_int32 poll_val; + + while ((poll_val = KMP_COMPARE_AND_STORE_RET32( + &(lck->lk.poll), KMP_LOCK_FREE(futex), + KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { + + kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; + KA_TRACE( + 1000, + ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n", + lck, gtid, poll_val, cond)); + + // NOTE: if you try to use the following condition for this branch + // + // if ( poll_val & 1 == 0 ) + // + // Then the 12.0 compiler has a bug where the following block will + // always be skipped, regardless of the value of the LSB of poll_val. + if (!cond) { + // Try to set the lsb in the poll to indicate to the owner + // thread that they need to wake this thread up. + if (!KMP_COMPARE_AND_STORE_REL32(&(lck->lk.poll), poll_val, + poll_val | KMP_LOCK_BUSY(1, futex))) { + KA_TRACE( + 1000, + ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n", + lck, lck->lk.poll, gtid)); + continue; + } + poll_val |= KMP_LOCK_BUSY(1, futex); + + KA_TRACE(1000, + ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n", lck, + lck->lk.poll, gtid)); + } + + KA_TRACE( + 1000, + ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n", + lck, gtid, poll_val)); + + long rc; + if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL, + NULL, 0)) != 0) { + KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) " + "failed (rc=%ld errno=%d)\n", + lck, gtid, poll_val, rc, errno)); + continue; + } + + KA_TRACE(1000, + ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n", + lck, gtid, poll_val)); + // This thread has now done a successful futex wait call and was entered on + // the OS futex queue. We must now perform a futex wake call when releasing + // the lock, as we have no idea how many other threads are in the queue. 
+ gtid_code |= 1; + } + + KMP_FSYNC_ACQUIRED(lck); + KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck, + lck->lk.poll, gtid)); + return KMP_LOCK_ACQUIRED_FIRST; +} + +int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { + int retval = __kmp_acquire_futex_lock_timed_template(lck, gtid); + return retval; +} + +static int __kmp_acquire_futex_lock_with_checks(kmp_futex_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_lock"; + if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) == gtid)) { + KMP_FATAL(LockIsAlreadyOwned, func); + } + return __kmp_acquire_futex_lock(lck, gtid); +} + +int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { + if (KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(futex), + KMP_LOCK_BUSY((gtid + 1) << 1, futex))) { + KMP_FSYNC_ACQUIRED(lck); + return TRUE; + } + return FALSE; +} + +static int __kmp_test_futex_lock_with_checks(kmp_futex_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_lock"; + if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + return __kmp_test_futex_lock(lck, gtid); +} + +int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n", + lck, lck->lk.poll, gtid)); + + KMP_FSYNC_RELEASING(lck); + + kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex)); + + KA_TRACE(1000, + ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n", + lck, gtid, poll_val)); + + if (KMP_LOCK_STRIP(poll_val) & 1) { + KA_TRACE(1000, + ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n", + lck, gtid)); + syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), + NULL, NULL, 0); + } + + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + + KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck, + lck->lk.poll, gtid)); + + KMP_YIELD_OVERSUB(); + return KMP_LOCK_RELEASED; +} + +static int __kmp_release_futex_lock_with_checks(kmp_futex_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_futex_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) >= 0) && + (__kmp_get_futex_lock_owner(lck) != gtid)) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + return __kmp_release_futex_lock(lck, gtid); +} + +void __kmp_init_futex_lock(kmp_futex_lock_t *lck) { + TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex)); +} + +void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) { lck->lk.poll = 0; } + +static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) { + char const *const func = "omp_destroy_lock"; + if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) && + __kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_futex_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_futex_lock(lck); +} + +// nested futex locks + +int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_futex_lock_owner(lck) == gtid) { + lck->lk.depth_locked += 1; + return KMP_LOCK_ACQUIRED_NEXT; + } else { + __kmp_acquire_futex_lock_timed_template(lck, gtid); + lck->lk.depth_locked = 1; + return KMP_LOCK_ACQUIRED_FIRST; + } +} + +static int __kmp_acquire_nested_futex_lock_with_checks(kmp_futex_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_nest_lock"; + if (!__kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_acquire_nested_futex_lock(lck, gtid); +} + +int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { + int retval; + + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_futex_lock_owner(lck) == gtid) { + retval = ++lck->lk.depth_locked; + } else if (!__kmp_test_futex_lock(lck, gtid)) { + retval = 0; + } else { + KMP_MB(); + retval = lck->lk.depth_locked = 1; + } + return retval; +} + +static int __kmp_test_nested_futex_lock_with_checks(kmp_futex_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_nest_lock"; + if (!__kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_test_nested_futex_lock(lck, gtid); +} + +int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + KMP_MB(); + if (--(lck->lk.depth_locked) == 0) { + __kmp_release_futex_lock(lck, gtid); + return KMP_LOCK_RELEASED; + } + return KMP_LOCK_STILL_HELD; +} + +static int __kmp_release_nested_futex_lock_with_checks(kmp_futex_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if (!__kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_futex_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if (__kmp_get_futex_lock_owner(lck) != gtid) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + return __kmp_release_nested_futex_lock(lck, gtid); +} + +void 
__kmp_init_nested_futex_lock(kmp_futex_lock_t *lck) { + __kmp_init_futex_lock(lck); + lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks +} + +void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) { + __kmp_destroy_futex_lock(lck); + lck->lk.depth_locked = 0; +} + +static void __kmp_destroy_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) { + char const *const func = "omp_destroy_nest_lock"; + if (!__kmp_is_futex_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_futex_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_nested_futex_lock(lck); +} + +#endif // KMP_USE_FUTEX + +/* ------------------------------------------------------------------------ */ +/* ticket (bakery) locks */ + +static kmp_int32 __kmp_get_ticket_lock_owner(kmp_ticket_lock_t *lck) { + return std::atomic_load_explicit(&lck->lk.owner_id, + std::memory_order_relaxed) - + 1; +} + +static inline bool __kmp_is_ticket_lock_nestable(kmp_ticket_lock_t *lck) { + return std::atomic_load_explicit(&lck->lk.depth_locked, + std::memory_order_relaxed) != -1; +} + +static kmp_uint32 __kmp_bakery_check(void *now_serving, kmp_uint32 my_ticket) { + return std::atomic_load_explicit((std::atomic<unsigned> *)now_serving, + std::memory_order_acquire) == my_ticket; +} + +__forceinline static int +__kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck, + kmp_int32 gtid) { + kmp_uint32 my_ticket = std::atomic_fetch_add_explicit( + &lck->lk.next_ticket, 1U, std::memory_order_relaxed); + +#ifdef USE_LOCK_PROFILE + if (std::atomic_load_explicit(&lck->lk.now_serving, + std::memory_order_relaxed) != my_ticket) + __kmp_printf("LOCK CONTENTION: %p\n", lck); +/* else __kmp_printf( "." );*/ +#endif /* USE_LOCK_PROFILE */ + + if (std::atomic_load_explicit(&lck->lk.now_serving, + std::memory_order_acquire) == my_ticket) { + return KMP_LOCK_ACQUIRED_FIRST; + } + KMP_WAIT_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck); + return KMP_LOCK_ACQUIRED_FIRST; +} + +int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) { + int retval = __kmp_acquire_ticket_lock_timed_template(lck, gtid); + return retval; +} + +static int __kmp_acquire_ticket_lock_with_checks(kmp_ticket_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) == gtid)) { + KMP_FATAL(LockIsAlreadyOwned, func); + } + + __kmp_acquire_ticket_lock(lck, gtid); + + std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1, + std::memory_order_relaxed); + return KMP_LOCK_ACQUIRED_FIRST; +} + +int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) { + kmp_uint32 my_ticket = std::atomic_load_explicit(&lck->lk.next_ticket, + std::memory_order_relaxed); + + if (std::atomic_load_explicit(&lck->lk.now_serving, + std::memory_order_relaxed) == my_ticket) { + kmp_uint32 next_ticket = my_ticket + 1; + if (std::atomic_compare_exchange_strong_explicit( + &lck->lk.next_ticket, &my_ticket, next_ticket, + std::memory_order_acquire, std::memory_order_acquire)) { + return TRUE; + } + } + return FALSE; +} + +static int __kmp_test_ticket_lock_with_checks(kmp_ticket_lock_t *lck, + kmp_int32 gtid) { + char const *const func =
"omp_test_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + + int retval = __kmp_test_ticket_lock(lck, gtid); + + if (retval) { + std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1, + std::memory_order_relaxed); + } + return retval; +} + +int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) { + kmp_uint32 distance = std::atomic_load_explicit(&lck->lk.next_ticket, + std::memory_order_relaxed) - + std::atomic_load_explicit(&lck->lk.now_serving, + std::memory_order_relaxed); + + std::atomic_fetch_add_explicit(&lck->lk.now_serving, 1U, + std::memory_order_release); + + KMP_YIELD(distance > + (kmp_uint32)(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); + return KMP_LOCK_RELEASED; +} + +static int __kmp_release_ticket_lock_with_checks(kmp_ticket_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_ticket_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) >= 0) && + (__kmp_get_ticket_lock_owner(lck) != gtid)) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed); + return __kmp_release_ticket_lock(lck, gtid); +} + +void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck) { + lck->lk.location = NULL; + lck->lk.self = lck; + std::atomic_store_explicit(&lck->lk.next_ticket, 0U, + std::memory_order_relaxed); + std::atomic_store_explicit(&lck->lk.now_serving, 0U, + std::memory_order_relaxed); + std::atomic_store_explicit( + &lck->lk.owner_id, 0, + std::memory_order_relaxed); // no thread owns the lock. + std::atomic_store_explicit( + &lck->lk.depth_locked, -1, + std::memory_order_relaxed); // -1 => not a nested lock. 
+ std::atomic_store_explicit(&lck->lk.initialized, true, + std::memory_order_release); +} + +void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) { + std::atomic_store_explicit(&lck->lk.initialized, false, + std::memory_order_release); + lck->lk.self = NULL; + lck->lk.location = NULL; + std::atomic_store_explicit(&lck->lk.next_ticket, 0U, + std::memory_order_relaxed); + std::atomic_store_explicit(&lck->lk.now_serving, 0U, + std::memory_order_relaxed); + std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed); + std::atomic_store_explicit(&lck->lk.depth_locked, -1, + std::memory_order_relaxed); +} + +static void __kmp_destroy_ticket_lock_with_checks(kmp_ticket_lock_t *lck) { + char const *const func = "omp_destroy_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_ticket_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_ticket_lock(lck); +} + +// nested ticket locks + +int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_ticket_lock_owner(lck) == gtid) { + std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1, + std::memory_order_relaxed); + return KMP_LOCK_ACQUIRED_NEXT; + } else { + __kmp_acquire_ticket_lock_timed_template(lck, gtid); + std::atomic_store_explicit(&lck->lk.depth_locked, 1, + std::memory_order_relaxed); + std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1, + std::memory_order_relaxed); + return KMP_LOCK_ACQUIRED_FIRST; + } +} + +static int __kmp_acquire_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_nest_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_acquire_nested_ticket_lock(lck, gtid); +} + +int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) { + int retval; + + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_ticket_lock_owner(lck) == gtid) { + retval = std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1, + std::memory_order_relaxed) + + 1; + } else if (!__kmp_test_ticket_lock(lck, gtid)) { + retval = 0; + } else { + std::atomic_store_explicit(&lck->lk.depth_locked, 1, + std::memory_order_relaxed); + std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1, + std::memory_order_relaxed); + retval = 1; + } + return retval; +} + +static int __kmp_test_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_nest_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_test_nested_ticket_lock(lck, gtid); +} + +int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + if ((std::atomic_fetch_add_explicit(&lck->lk.depth_locked, -1, + 
std::memory_order_relaxed) - + 1) == 0) { + std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed); + __kmp_release_ticket_lock(lck, gtid); + return KMP_LOCK_RELEASED; + } + return KMP_LOCK_STILL_HELD; +} + +static int __kmp_release_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_nest_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_ticket_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if (__kmp_get_ticket_lock_owner(lck) != gtid) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + return __kmp_release_nested_ticket_lock(lck, gtid); +} + +void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) { + __kmp_init_ticket_lock(lck); + std::atomic_store_explicit(&lck->lk.depth_locked, 0, + std::memory_order_relaxed); + // >= 0 for nestable locks, -1 for simple locks +} + +void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) { + __kmp_destroy_ticket_lock(lck); + std::atomic_store_explicit(&lck->lk.depth_locked, 0, + std::memory_order_relaxed); +} + +static void +__kmp_destroy_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) { + char const *const func = "omp_destroy_nest_lock"; + + if (!std::atomic_load_explicit(&lck->lk.initialized, + std::memory_order_relaxed)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (lck->lk.self != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_ticket_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_ticket_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_nested_ticket_lock(lck); +} + +// access functions to fields which don't exist for all lock kinds. 
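For orientation, the ticket discipline implemented by the functions above can be reduced to bare std::atomic operations. The sketch below is not part of the patch; the invented ticket_lock type drops the owner_id/depth_locked bookkeeping, the self/initialized consistency checks, and the KMP_WAIT_PTR spin with proportional yielding that the real lock uses.

#include <atomic>
#include <thread>

// Lamport-style ticket lock: take a ticket, wait until it is being served.
struct ticket_lock {
  std::atomic<unsigned> next_ticket{0};
  std::atomic<unsigned> now_serving{0};

  void acquire() {
    // The relaxed fetch_add is the only contended read-modify-write.
    unsigned my_ticket = next_ticket.fetch_add(1, std::memory_order_relaxed);
    // Spin until now_serving reaches our ticket; acquire pairs with release().
    while (now_serving.load(std::memory_order_acquire) != my_ticket)
      std::this_thread::yield();
  }

  void release() {
    // Hand the lock to the next waiter in FIFO order.
    now_serving.fetch_add(1, std::memory_order_release);
  }
};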
+ +static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) { + return lck->lk.location; +} + +static void __kmp_set_ticket_lock_location(kmp_ticket_lock_t *lck, + const ident_t *loc) { + lck->lk.location = loc; +} + +static kmp_lock_flags_t __kmp_get_ticket_lock_flags(kmp_ticket_lock_t *lck) { + return lck->lk.flags; +} + +static void __kmp_set_ticket_lock_flags(kmp_ticket_lock_t *lck, + kmp_lock_flags_t flags) { + lck->lk.flags = flags; +} + +/* ------------------------------------------------------------------------ */ +/* queuing locks */ + +/* First the states + (head,tail) = 0, 0 means lock is unheld, nobody on queue + UINT_MAX or -1, 0 means lock is held, nobody on queue + h, h means lock held or about to transition, + 1 element on queue + h, t h <> t, means lock is held or about to + transition, >1 elements on queue + + Now the transitions + Acquire(0,0) = -1 ,0 + Release(0,0) = Error + Acquire(-1,0) = h ,h h > 0 + Release(-1,0) = 0 ,0 + Acquire(h,h) = h ,t h > 0, t > 0, h <> t + Release(h,h) = -1 ,0 h > 0 + Acquire(h,t) = h ,t' h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t' + Release(h,t) = h',t h > 0, t > 0, h <> t, h <> h', h' maybe = t + + And pictorially + + +-----+ + | 0, 0|------- release -------> Error + +-----+ + | ^ + acquire| |release + | | + | | + v | + +-----+ + |-1, 0| + +-----+ + | ^ + acquire| |release + | | + | | + v | + +-----+ + | h, h| + +-----+ + | ^ + acquire| |release + | | + | | + v | + +-----+ + | h, t|----- acquire, release loopback ---+ + +-----+ | + ^ | + | | + +------------------------------------+ + */ + +#ifdef DEBUG_QUEUING_LOCKS + +/* Stuff for circular trace buffer */ +#define TRACE_BUF_ELE 1024 +static char traces[TRACE_BUF_ELE][128] = {0}; +static int tc = 0; +#define TRACE_LOCK(X, Y) \ + KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s\n", X, Y); +#define TRACE_LOCK_T(X, Y, Z) \ + KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X, Y, Z); +#define TRACE_LOCK_HT(X, Y, Z, Q) \ + KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y, \ + Z, Q); + +static void __kmp_dump_queuing_lock(kmp_info_t *this_thr, kmp_int32 gtid, + kmp_queuing_lock_t *lck, kmp_int32 head_id, + kmp_int32 tail_id) { + kmp_int32 t, i; + + __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! 
\n"); + + i = tc % TRACE_BUF_ELE; + __kmp_printf_no_lock("%s\n", traces[i]); + i = (i + 1) % TRACE_BUF_ELE; + while (i != (tc % TRACE_BUF_ELE)) { + __kmp_printf_no_lock("%s", traces[i]); + i = (i + 1) % TRACE_BUF_ELE; + } + __kmp_printf_no_lock("\n"); + + __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, " + "next_wait:%d, head_id:%d, tail_id:%d\n", + gtid + 1, this_thr->th.th_spin_here, + this_thr->th.th_next_waiting, head_id, tail_id); + + __kmp_printf_no_lock("\t\thead: %d ", lck->lk.head_id); + + if (lck->lk.head_id >= 1) { + t = __kmp_threads[lck->lk.head_id - 1]->th.th_next_waiting; + while (t > 0) { + __kmp_printf_no_lock("-> %d ", t); + t = __kmp_threads[t - 1]->th.th_next_waiting; + } + } + __kmp_printf_no_lock("; tail: %d ", lck->lk.tail_id); + __kmp_printf_no_lock("\n\n"); +} + +#endif /* DEBUG_QUEUING_LOCKS */ + +static kmp_int32 __kmp_get_queuing_lock_owner(kmp_queuing_lock_t *lck) { + return TCR_4(lck->lk.owner_id) - 1; +} + +static inline bool __kmp_is_queuing_lock_nestable(kmp_queuing_lock_t *lck) { + return lck->lk.depth_locked != -1; +} + +/* Acquire a lock using the queuing lock implementation */ +template <bool takeTime> +/* [TLW] The unused template above is left behind because of what BEB believes + is a potential compiler problem with __forceinline. */ +__forceinline static int +__kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid); + volatile kmp_int32 *head_id_p = &lck->lk.head_id; + volatile kmp_int32 *tail_id_p = &lck->lk.tail_id; + volatile kmp_uint32 *spin_here_p; + +#if OMPT_SUPPORT + ompt_state_t prev_state = ompt_state_undefined; +#endif + + KA_TRACE(1000, + ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid)); + + KMP_FSYNC_PREPARE(lck); + KMP_DEBUG_ASSERT(this_thr != NULL); + spin_here_p = &this_thr->th.th_spin_here; + +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "acq ent"); + if (*spin_here_p) + __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p); + if (this_thr->th.th_next_waiting != 0) + __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p); +#endif + KMP_DEBUG_ASSERT(!*spin_here_p); + KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); + + /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to + head_id_p that may follow, not just in execution order, but also in + visibility order. This way, when a releasing thread observes the changes to + the queue by this thread, it can rightly assume that spin_here_p has + already been set to TRUE, so that when it sets spin_here_p to FALSE, it is + not premature. If the releasing thread sets spin_here_p to FALSE before + this thread sets it to TRUE, this thread will hang.
*/ + *spin_here_p = TRUE; /* before enqueuing to prevent race */ + + while (1) { + kmp_int32 enqueued; + kmp_int32 head; + kmp_int32 tail; + + head = *head_id_p; + + switch (head) { + + case -1: { +#ifdef DEBUG_QUEUING_LOCKS + tail = *tail_id_p; + TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail); +#endif + tail = 0; /* to make sure next link asynchronously read is not set + accidentally; this assignment prevents us from entering the + if ( t > 0 ) condition in the enqueued case below, which is not + necessary for this state transition */ + + /* try (-1,0)->(tid,tid) */ + enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p, + KMP_PACK_64(-1, 0), + KMP_PACK_64(gtid + 1, gtid + 1)); +#ifdef DEBUG_QUEUING_LOCKS + if (enqueued) + TRACE_LOCK(gtid + 1, "acq enq: (-1,0)->(tid,tid)"); +#endif + } break; + + default: { + tail = *tail_id_p; + KMP_DEBUG_ASSERT(tail != gtid + 1); + +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail); +#endif + + if (tail == 0) { + enqueued = FALSE; + } else { + /* try (h,t) or (h,h)->(h,tid) */ + enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1); + +#ifdef DEBUG_QUEUING_LOCKS + if (enqueued) + TRACE_LOCK(gtid + 1, "acq enq: (h,t)->(h,tid)"); +#endif + } + } break; + + case 0: /* empty queue */ + { + kmp_int32 grabbed_lock; + +#ifdef DEBUG_QUEUING_LOCKS + tail = *tail_id_p; + TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail); +#endif + /* try (0,0)->(-1,0) */ + + /* only legal transition out of head = 0 is head = -1 with no change to + * tail */ + grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1); + + if (grabbed_lock) { + + *spin_here_p = FALSE; + + KA_TRACE( + 1000, + ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n", + lck, gtid)); +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK_HT(gtid + 1, "acq exit: ", head, 0); +#endif + +#if OMPT_SUPPORT + if (ompt_enabled.enabled && prev_state != ompt_state_undefined) { + /* change the state before clearing wait_id */ + this_thr->th.ompt_thread_info.state = prev_state; + this_thr->th.ompt_thread_info.wait_id = 0; + } +#endif + + KMP_FSYNC_ACQUIRED(lck); + return KMP_LOCK_ACQUIRED_FIRST; /* lock holder cannot be on queue */ + } + enqueued = FALSE; + } break; + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled && prev_state == ompt_state_undefined) { + /* this thread will spin; set wait_id before entering wait state */ + prev_state = this_thr->th.ompt_thread_info.state; + this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck; + this_thr->th.ompt_thread_info.state = ompt_state_wait_lock; + } +#endif + + if (enqueued) { + if (tail > 0) { + kmp_info_t *tail_thr = __kmp_thread_from_gtid(tail - 1); + KMP_ASSERT(tail_thr != NULL); + tail_thr->th.th_next_waiting = gtid + 1; + /* corresponding wait for this write in release code */ + } + KA_TRACE(1000, + ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n", + lck, gtid)); + + KMP_MB(); + // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf + KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck); + // Synchronize writes to both runtime thread structures + // and writes in user code. 
+ KMP_MB(); + +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "acq spin"); + + if (this_thr->th.th_next_waiting != 0) + __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p); +#endif + KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); + KA_TRACE(1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after " + "waiting on queue\n", + lck, gtid)); + +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "acq exit 2"); +#endif + +#if OMPT_SUPPORT + /* change the state before clearing wait_id */ + this_thr->th.ompt_thread_info.state = prev_state; + this_thr->th.ompt_thread_info.wait_id = 0; +#endif + + /* got lock, we were dequeued by the thread that released lock */ + return KMP_LOCK_ACQUIRED_FIRST; + } + + /* Yield if number of threads > number of logical processors */ + /* ToDo: Not sure why this should only be in oversubscription case, + maybe should be traditional YIELD_INIT/YIELD_WHEN loop */ + KMP_YIELD_OVERSUB(); + +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "acq retry"); +#endif + } + KMP_ASSERT2(0, "should not get here"); + return KMP_LOCK_ACQUIRED_FIRST; +} + +int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + int retval = __kmp_acquire_queuing_lock_timed_template(lck, gtid); + return retval; +} + +static int __kmp_acquire_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_queuing_lock_owner(lck) == gtid) { + KMP_FATAL(LockIsAlreadyOwned, func); + } + + __kmp_acquire_queuing_lock(lck, gtid); + + lck->lk.owner_id = gtid + 1; + return KMP_LOCK_ACQUIRED_FIRST; +} + +int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { + volatile kmp_int32 *head_id_p = &lck->lk.head_id; + kmp_int32 head; +#ifdef KMP_DEBUG + kmp_info_t *this_thr; +#endif + + KA_TRACE(1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid)); + KMP_DEBUG_ASSERT(gtid >= 0); +#ifdef KMP_DEBUG + this_thr = __kmp_thread_from_gtid(gtid); + KMP_DEBUG_ASSERT(this_thr != NULL); + KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); +#endif + + head = *head_id_p; + + if (head == 0) { /* nobody on queue, nobody holding */ + /* try (0,0)->(-1,0) */ + if (KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1)) { + KA_TRACE(1000, + ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid)); + KMP_FSYNC_ACQUIRED(lck); + return TRUE; + } + } + + KA_TRACE(1000, + ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid)); + return FALSE; +} + +static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + + int retval = __kmp_test_queuing_lock(lck, gtid); + + if (retval) { + lck->lk.owner_id = gtid + 1; + } + return retval; +} + +int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { + volatile kmp_int32 *head_id_p = &lck->lk.head_id; + volatile kmp_int32 *tail_id_p = &lck->lk.tail_id; + + KA_TRACE(1000, + ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid)); + KMP_DEBUG_ASSERT(gtid >= 0); +#if KMP_DEBUG || DEBUG_QUEUING_LOCKS + kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid); +#endif + KMP_DEBUG_ASSERT(this_thr 
!= NULL); +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "rel ent"); + + if (this_thr->th.th_spin_here) + __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p); + if (this_thr->th.th_next_waiting != 0) + __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p); +#endif + KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); + KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); + + KMP_FSYNC_RELEASING(lck); + + while (1) { + kmp_int32 dequeued; + kmp_int32 head; + kmp_int32 tail; + + head = *head_id_p; + +#ifdef DEBUG_QUEUING_LOCKS + tail = *tail_id_p; + TRACE_LOCK_HT(gtid + 1, "rel read: ", head, tail); + if (head == 0) + __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail); +#endif + KMP_DEBUG_ASSERT(head != + 0); /* holding the lock, head must be -1 or queue head */ + + if (head == -1) { /* nobody on queue */ + /* try (-1,0)->(0,0) */ + if (KMP_COMPARE_AND_STORE_REL32(head_id_p, -1, 0)) { + KA_TRACE( + 1000, + ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n", + lck, gtid)); +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK_HT(gtid + 1, "rel exit: ", 0, 0); +#endif + +#if OMPT_SUPPORT +/* nothing to do - no other thread is trying to shift blame */ +#endif + return KMP_LOCK_RELEASED; + } + dequeued = FALSE; + } else { + KMP_MB(); + tail = *tail_id_p; + if (head == tail) { /* only one thread on the queue */ +#ifdef DEBUG_QUEUING_LOCKS + if (head <= 0) + __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail); +#endif + KMP_DEBUG_ASSERT(head > 0); + + /* try (h,h)->(-1,0) */ + dequeued = KMP_COMPARE_AND_STORE_REL64( + RCAST(volatile kmp_int64 *, tail_id_p), KMP_PACK_64(head, head), + KMP_PACK_64(-1, 0)); +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)"); +#endif + + } else { + volatile kmp_int32 *waiting_id_p; + kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1); + KMP_DEBUG_ASSERT(head_thr != NULL); + waiting_id_p = &head_thr->th.th_next_waiting; + +/* Does this require synchronous reads? */ +#ifdef DEBUG_QUEUING_LOCKS + if (head <= 0 || tail <= 0) + __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail); +#endif + KMP_DEBUG_ASSERT(head > 0 && tail > 0); + + /* try (h,t)->(h',t) or (t,t) */ + KMP_MB(); + /* make sure enqueuing thread has time to update next waiting thread + * field */ + *head_id_p = + KMP_WAIT((volatile kmp_uint32 *)waiting_id_p, 0, KMP_NEQ, NULL); +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)"); +#endif + dequeued = TRUE; + } + } + + if (dequeued) { + kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1); + KMP_DEBUG_ASSERT(head_thr != NULL); + +/* Does this require synchronous reads? */ +#ifdef DEBUG_QUEUING_LOCKS + if (head <= 0 || tail <= 0) + __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail); +#endif + KMP_DEBUG_ASSERT(head > 0 && tail > 0); + + /* For clean code only. Thread not released until next statement prevents + race with acquire code. 
*/ + head_thr->th.th_next_waiting = 0; +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK_T(gtid + 1, "rel nw=0 for t=", head); +#endif + + KMP_MB(); + /* reset spin value */ + head_thr->th.th_spin_here = FALSE; + + KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after " + "dequeuing\n", + lck, gtid)); +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "rel exit 2"); +#endif + return KMP_LOCK_RELEASED; + } + /* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring + threads */ + +#ifdef DEBUG_QUEUING_LOCKS + TRACE_LOCK(gtid + 1, "rel retry"); +#endif + + } /* while */ + KMP_ASSERT2(0, "should not get here"); + return KMP_LOCK_RELEASED; +} + +static int __kmp_release_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_queuing_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if (__kmp_get_queuing_lock_owner(lck) != gtid) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + lck->lk.owner_id = 0; + return __kmp_release_queuing_lock(lck, gtid); +} + +void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck) { + lck->lk.location = NULL; + lck->lk.head_id = 0; + lck->lk.tail_id = 0; + lck->lk.next_ticket = 0; + lck->lk.now_serving = 0; + lck->lk.owner_id = 0; // no thread owns the lock. + lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks. + lck->lk.initialized = lck; + + KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck)); +} + +void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) { + lck->lk.initialized = NULL; + lck->lk.location = NULL; + lck->lk.head_id = 0; + lck->lk.tail_id = 0; + lck->lk.next_ticket = 0; + lck->lk.now_serving = 0; + lck->lk.owner_id = 0; + lck->lk.depth_locked = -1; +} + +static void __kmp_destroy_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { + char const *const func = "omp_destroy_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + if (__kmp_get_queuing_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_queuing_lock(lck); +} + +// nested queuing locks + +int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_queuing_lock_owner(lck) == gtid) { + lck->lk.depth_locked += 1; + return KMP_LOCK_ACQUIRED_NEXT; + } else { + __kmp_acquire_queuing_lock_timed_template(lck, gtid); + KMP_MB(); + lck->lk.depth_locked = 1; + KMP_MB(); + lck->lk.owner_id = gtid + 1; + return KMP_LOCK_ACQUIRED_FIRST; + } +} + +static int +__kmp_acquire_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_nest_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_acquire_nested_queuing_lock(lck, gtid); +} + +int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { + int retval; + + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_get_queuing_lock_owner(lck) == gtid) { + retval = ++lck->lk.depth_locked; + } else if (!__kmp_test_queuing_lock(lck, gtid)) { + 
retval = 0; + } else { + KMP_MB(); + retval = lck->lk.depth_locked = 1; + KMP_MB(); + lck->lk.owner_id = gtid + 1; + } + return retval; +} + +static int __kmp_test_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_nest_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_test_nested_queuing_lock(lck, gtid); +} + +int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + KMP_MB(); + if (--(lck->lk.depth_locked) == 0) { + KMP_MB(); + lck->lk.owner_id = 0; + __kmp_release_queuing_lock(lck, gtid); + return KMP_LOCK_RELEASED; + } + return KMP_LOCK_STILL_HELD; +} + +static int +__kmp_release_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_queuing_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if (__kmp_get_queuing_lock_owner(lck) != gtid) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + return __kmp_release_nested_queuing_lock(lck, gtid); +} + +void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck) { + __kmp_init_queuing_lock(lck); + lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks +} + +void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) { + __kmp_destroy_queuing_lock(lck); + lck->lk.depth_locked = 0; +} + +static void +__kmp_destroy_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { + char const *const func = "omp_destroy_nest_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_queuing_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_queuing_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_nested_queuing_lock(lck); +} + +// access functions to fields which don't exist for all lock kinds. + +static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) { + return lck->lk.location; +} + +static void __kmp_set_queuing_lock_location(kmp_queuing_lock_t *lck, + const ident_t *loc) { + lck->lk.location = loc; +} + +static kmp_lock_flags_t __kmp_get_queuing_lock_flags(kmp_queuing_lock_t *lck) { + return lck->lk.flags; +} + +static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck, + kmp_lock_flags_t flags) { + lck->lk.flags = flags; +} + +#if KMP_USE_ADAPTIVE_LOCKS + +/* RTM Adaptive locks */ + +#if KMP_HAVE_RTM_INTRINSICS +#include <immintrin.h> +#define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT) + +#else + +// Values from the status register after failed speculation.
+#define _XBEGIN_STARTED (~0u) +#define _XABORT_EXPLICIT (1 << 0) +#define _XABORT_RETRY (1 << 1) +#define _XABORT_CONFLICT (1 << 2) +#define _XABORT_CAPACITY (1 << 3) +#define _XABORT_DEBUG (1 << 4) +#define _XABORT_NESTED (1 << 5) +#define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF)) + +// Aborts for which it's worth trying again immediately +#define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT) + +#define STRINGIZE_INTERNAL(arg) #arg +#define STRINGIZE(arg) STRINGIZE_INTERNAL(arg) + +// Access to RTM instructions +/*A version of XBegin which returns -1 on speculation, and the value of EAX on + an abort. This is the same definition as the compiler intrinsic that will be + supported at some point. */ +static __inline int _xbegin() { + int res = -1; + +#if KMP_OS_WINDOWS +#if KMP_ARCH_X86_64 + _asm { + _emit 0xC7 + _emit 0xF8 + _emit 2 + _emit 0 + _emit 0 + _emit 0 + jmp L2 + mov res, eax + L2: + } +#else /* IA32 */ + _asm { + _emit 0xC7 + _emit 0xF8 + _emit 2 + _emit 0 + _emit 0 + _emit 0 + jmp L2 + mov res, eax + L2: + } +#endif // KMP_ARCH_X86_64 +#else + /* Note that %eax must be noted as killed (clobbered), because the XSR is + returned in %eax(%rax) on abort. Other register values are restored, so + don't need to be killed. + + We must also mark 'res' as an input and an output, since otherwise + 'res=-1' may be dropped as being dead, whereas we do need the assignment on + the successful (i.e., non-abort) path. */ + __asm__ volatile("1: .byte 0xC7; .byte 0xF8;\n" + " .long 1f-1b-6\n" + " jmp 2f\n" + "1: movl %%eax,%0\n" + "2:" + : "+r"(res)::"memory", "%eax"); +#endif // KMP_OS_WINDOWS + return res; +} + +/* Transaction end */ +static __inline void _xend() { +#if KMP_OS_WINDOWS + __asm { + _emit 0x0f + _emit 0x01 + _emit 0xd5 + } +#else + __asm__ volatile(".byte 0x0f; .byte 0x01; .byte 0xd5" ::: "memory"); +#endif +} + +/* This is a macro, the argument must be a single byte constant which can be + evaluated by the inline assembler, since it is emitted as a byte into the + assembly code. */ +// clang-format off +#if KMP_OS_WINDOWS +#define _xabort(ARG) _asm _emit 0xc6 _asm _emit 0xf8 _asm _emit ARG +#else +#define _xabort(ARG) \ + __asm__ volatile(".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG):::"memory"); +#endif +// clang-format on +#endif // KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300 + +// Statistics is collected for testing purpose +#if KMP_DEBUG_ADAPTIVE_LOCKS + +// We accumulate speculative lock statistics when the lock is destroyed. We +// keep locks that haven't been destroyed in the liveLocks list so that we can +// grab their statistics too. +static kmp_adaptive_lock_statistics_t destroyedStats; + +// To hold the list of live locks. +static kmp_adaptive_lock_info_t liveLocks; + +// A lock so we can safely update the list of locks. +static kmp_bootstrap_lock_t chain_lock = + KMP_BOOTSTRAP_LOCK_INITIALIZER(chain_lock); + +// Initialize the list of stats. 
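The statistics bookkeeping that follows keeps every live adaptive lock on a circular doubly-linked list anchored at liveLocks, so the stats printer can walk all of them without keeping a count. A stripped-down sketch of that list discipline (illustration only, plain C, not the runtime's types) is:

struct ring {
  struct ring *next, *prev;
};

static struct ring anchor = {&anchor, &anchor}; /* empty ring points at itself */

static void ring_insert(struct ring *n) { /* cf. __kmp_remember_lock below */
  n->next = anchor.next;
  n->prev = &anchor;
  anchor.next->prev = n; /* old first element now points back at n */
  anchor.next = n;
}

static void ring_remove(struct ring *n) { /* cf. __kmp_forget_lock below */
  n->next->prev = n->prev;
  n->prev->next = n->next;
}

/* Walking the ring, as the stats printer does:
   for (struct ring *p = anchor.next; p != &anchor; p = p->next) ... */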
+void __kmp_init_speculative_stats() { + kmp_adaptive_lock_info_t *lck = &liveLocks; + + memset(CCAST(kmp_adaptive_lock_statistics_t *, &(lck->stats)), 0, + sizeof(lck->stats)); + lck->stats.next = lck; + lck->stats.prev = lck; + + KMP_ASSERT(lck->stats.next->stats.prev == lck); + KMP_ASSERT(lck->stats.prev->stats.next == lck); + + __kmp_init_bootstrap_lock(&chain_lock); +} + +// Insert the lock into the circular list +static void __kmp_remember_lock(kmp_adaptive_lock_info_t *lck) { + __kmp_acquire_bootstrap_lock(&chain_lock); + + lck->stats.next = liveLocks.stats.next; + lck->stats.prev = &liveLocks; + + liveLocks.stats.next = lck; + lck->stats.next->stats.prev = lck; + + KMP_ASSERT(lck->stats.next->stats.prev == lck); + KMP_ASSERT(lck->stats.prev->stats.next == lck); + + __kmp_release_bootstrap_lock(&chain_lock); +} + +static void __kmp_forget_lock(kmp_adaptive_lock_info_t *lck) { + KMP_ASSERT(lck->stats.next->stats.prev == lck); + KMP_ASSERT(lck->stats.prev->stats.next == lck); + + kmp_adaptive_lock_info_t *n = lck->stats.next; + kmp_adaptive_lock_info_t *p = lck->stats.prev; + + n->stats.prev = p; + p->stats.next = n; +} + +static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) { + memset(CCAST(kmp_adaptive_lock_statistics_t *, &lck->stats), 0, + sizeof(lck->stats)); + __kmp_remember_lock(lck); +} + +static void __kmp_add_stats(kmp_adaptive_lock_statistics_t *t, + kmp_adaptive_lock_info_t *lck) { + kmp_adaptive_lock_statistics_t volatile *s = &lck->stats; + + t->nonSpeculativeAcquireAttempts += lck->acquire_attempts; + t->successfulSpeculations += s->successfulSpeculations; + t->hardFailedSpeculations += s->hardFailedSpeculations; + t->softFailedSpeculations += s->softFailedSpeculations; + t->nonSpeculativeAcquires += s->nonSpeculativeAcquires; + t->lemmingYields += s->lemmingYields; +} + +static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) { + __kmp_acquire_bootstrap_lock(&chain_lock); + + __kmp_add_stats(&destroyedStats, lck); + __kmp_forget_lock(lck); + + __kmp_release_bootstrap_lock(&chain_lock); +} + +static float percent(kmp_uint32 count, kmp_uint32 total) { + return (total == 0) ? 
0.0 : (100.0 * count) / total; +} + +void __kmp_print_speculative_stats() { + kmp_adaptive_lock_statistics_t total = destroyedStats; + kmp_adaptive_lock_info_t *lck; + + for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) { + __kmp_add_stats(&total, lck); + } + kmp_adaptive_lock_statistics_t *t = &total; + kmp_uint32 totalSections = + t->nonSpeculativeAcquires + t->successfulSpeculations; + kmp_uint32 totalSpeculations = t->successfulSpeculations + + t->hardFailedSpeculations + + t->softFailedSpeculations; + if (totalSections <= 0) + return; + + kmp_safe_raii_file_t statsFile; + if (strcmp(__kmp_speculative_statsfile, "-") == 0) { + statsFile.set_stdout(); + } else { + size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20; + char buffer[buffLen]; + KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile, + (kmp_int32)getpid()); + statsFile.open(buffer, "w"); + } + + fprintf(statsFile, "Speculative lock statistics (all approximate!)\n"); + fprintf(statsFile, + " Lock parameters: \n" + " max_soft_retries : %10d\n" + " max_badness : %10d\n", + __kmp_adaptive_backoff_params.max_soft_retries, + __kmp_adaptive_backoff_params.max_badness); + fprintf(statsFile, " Non-speculative acquire attempts : %10d\n", + t->nonSpeculativeAcquireAttempts); + fprintf(statsFile, " Total critical sections : %10d\n", + totalSections); + fprintf(statsFile, " Successful speculations : %10d (%5.1f%%)\n", + t->successfulSpeculations, + percent(t->successfulSpeculations, totalSections)); + fprintf(statsFile, " Non-speculative acquires : %10d (%5.1f%%)\n", + t->nonSpeculativeAcquires, + percent(t->nonSpeculativeAcquires, totalSections)); + fprintf(statsFile, " Lemming yields : %10d\n\n", + t->lemmingYields); + + fprintf(statsFile, " Speculative acquire attempts : %10d\n", + totalSpeculations); + fprintf(statsFile, " Successes : %10d (%5.1f%%)\n", + t->successfulSpeculations, + percent(t->successfulSpeculations, totalSpeculations)); + fprintf(statsFile, " Soft failures : %10d (%5.1f%%)\n", + t->softFailedSpeculations, + percent(t->softFailedSpeculations, totalSpeculations)); + fprintf(statsFile, " Hard failures : %10d (%5.1f%%)\n", + t->hardFailedSpeculations, + percent(t->hardFailedSpeculations, totalSpeculations)); +} + +#define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++) +#else +#define KMP_INC_STAT(lck, stat) + +#endif // KMP_DEBUG_ADAPTIVE_LOCKS + +static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) { + // It is enough to check that the head_id is zero. + // We don't also need to check the tail. + bool res = lck->lk.head_id == 0; + +// We need a fence here, since we must ensure that no memory operations +// from later in this thread float above that read. +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX + _mm_mfence(); +#else + __sync_synchronize(); +#endif + + return res; +} + +// Functions for manipulating the badness +static __inline void +__kmp_update_badness_after_success(kmp_adaptive_lock_t *lck) { + // Reset the badness to zero so we eagerly try to speculate again + lck->lk.adaptive.badness = 0; + KMP_INC_STAT(lck, successfulSpeculations); +} + +// Create a bit mask with one more set bit. +static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) { + kmp_uint32 newBadness = (lck->lk.adaptive.badness << 1) | 1; + if (newBadness > lck->lk.adaptive.max_badness) { + return; + } else { + lck->lk.adaptive.badness = newBadness; + } +} + +// Check whether speculation should be attempted. 
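__kmp_step_badness above grows the badness field as an all-ones mask (0, 1, 3, 7, ...), capped at max_badness, and __kmp_should_speculate below admits a speculative attempt only when (acquire_attempts & badness) == 0, i.e. roughly every 2^k-th acquisition after k failures. A tiny self-contained demonstration of that arithmetic follows (illustration only; the pattern of pretend hard failures is made up, and the cap is omitted):

#include <stdio.h>

int main(void) {
  unsigned badness = 0; /* fresh lock: every attempt may speculate */
  for (unsigned attempts = 1; attempts <= 16; ++attempts) {
    int speculate = (attempts & badness) == 0; /* cf. __kmp_should_speculate */
    printf("attempt %2u  badness %2u  -> %s\n", attempts, badness,
           speculate ? "speculate" : "skip");
    if (speculate && attempts % 4 == 0) /* pretend this speculation hard-fails */
      badness = (badness << 1) | 1;     /* cf. __kmp_step_badness */
  }
  return 0;
}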
+KMP_ATTRIBUTE_TARGET_RTM +static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck, + kmp_int32 gtid) { + kmp_uint32 badness = lck->lk.adaptive.badness; + kmp_uint32 attempts = lck->lk.adaptive.acquire_attempts; + int res = (attempts & badness) == 0; + return res; +} + +// Attempt to acquire only the speculative lock. +// Does not back off to the non-speculative lock. +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck, + kmp_int32 gtid) { + int retries = lck->lk.adaptive.max_soft_retries; + + // We don't explicitly count the start of speculation, rather we record the + // results (success, hard fail, soft fail). The sum of all of those is the + // total number of times we started speculation since all speculations must + // end one of those ways. + do { + kmp_uint32 status = _xbegin(); + // Switch this in to disable actual speculation but exercise at least some + // of the rest of the code. Useful for debugging... + // kmp_uint32 status = _XABORT_NESTED; + + if (status == _XBEGIN_STARTED) { + /* We have successfully started speculation. Check that no-one acquired + the lock for real between when we last looked and now. This also gets + the lock cache line into our read-set, which we need so that we'll + abort if anyone later claims it for real. */ + if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) { + // Lock is now visibly acquired, so someone beat us to it. Abort the + // transaction so we'll restart from _xbegin with the failure status. + _xabort(0x01); + KMP_ASSERT2(0, "should not get here"); + } + return 1; // Lock has been acquired (speculatively) + } else { + // We have aborted, update the statistics + if (status & SOFT_ABORT_MASK) { + KMP_INC_STAT(lck, softFailedSpeculations); + // and loop round to retry. + } else { + KMP_INC_STAT(lck, hardFailedSpeculations); + // Give up if we had a hard failure. + break; + } + } + } while (retries--); // Loop while we have retries, and didn't fail hard. + + // Either we had a hard failure or we didn't succeed softly after + // the full set of attempts, so back off the badness. + __kmp_step_badness(lck); + return 0; +} + +// Attempt to acquire the speculative lock, or back off to the non-speculative +// one if the speculative lock cannot be acquired. +// We can succeed speculatively, non-speculatively, or fail. +static int __kmp_test_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) { + // First try to acquire the lock speculatively + if (__kmp_should_speculate(lck, gtid) && + __kmp_test_adaptive_lock_only(lck, gtid)) + return 1; + + // Speculative acquisition failed, so try to acquire it non-speculatively. + // Count the non-speculative acquire attempt + lck->lk.adaptive.acquire_attempts++; + + // Use base, non-speculative lock. + if (__kmp_test_queuing_lock(GET_QLK_PTR(lck), gtid)) { + KMP_INC_STAT(lck, nonSpeculativeAcquires); + return 1; // Lock is acquired (non-speculatively) + } else { + return 0; // Failed to acquire the lock, it's already visibly locked. + } +} + +static int __kmp_test_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_lock"; + if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) { + KMP_FATAL(LockIsUninitialized, func); + } + + int retval = __kmp_test_adaptive_lock(lck, gtid); + + if (retval) { + lck->lk.qlk.owner_id = gtid + 1; + } + return retval; +} + +// Block until we can acquire a speculative, adaptive lock. We check whether we +// should be trying to speculate. 
If we should be, we check the real lock to see +// if it is free, and, if not, pause without attempting to acquire it until it +// is. Then we try the speculative acquire. This means that although we suffer +// from lemmings a little (because all we can't acquire the lock speculatively +// until the queue of threads waiting has cleared), we don't get into a state +// where we can never acquire the lock speculatively (because we force the queue +// to clear by preventing new arrivals from entering the queue). This does mean +// that when we're trying to break lemmings, the lock is no longer fair. However +// OpenMP makes no guarantee that its locks are fair, so this isn't a real +// problem. +static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck, + kmp_int32 gtid) { + if (__kmp_should_speculate(lck, gtid)) { + if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) { + if (__kmp_test_adaptive_lock_only(lck, gtid)) + return; + // We tried speculation and failed, so give up. + } else { + // We can't try speculation until the lock is free, so we pause here + // (without suspending on the queueing lock, to allow it to drain, then + // try again. All other threads will also see the same result for + // shouldSpeculate, so will be doing the same if they try to claim the + // lock from now on. + while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) { + KMP_INC_STAT(lck, lemmingYields); + KMP_YIELD(TRUE); + } + + if (__kmp_test_adaptive_lock_only(lck, gtid)) + return; + } + } + + // Speculative acquisition failed, so acquire it non-speculatively. + // Count the non-speculative acquire attempt + lck->lk.adaptive.acquire_attempts++; + + __kmp_acquire_queuing_lock_timed_template(GET_QLK_PTR(lck), gtid); + // We have acquired the base lock, so count that. + KMP_INC_STAT(lck, nonSpeculativeAcquires); +} + +static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_set_lock"; + if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) { + KMP_FATAL(LockIsUninitialized, func); + } + if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == gtid) { + KMP_FATAL(LockIsAlreadyOwned, func); + } + + __kmp_acquire_adaptive_lock(lck, gtid); + + lck->lk.qlk.owner_id = gtid + 1; +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck, + kmp_int32 gtid) { + if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR( + lck))) { // If the lock doesn't look claimed we must be speculating. + // (Or the user's code is buggy and they're releasing without locking; + // if we had XTEST we'd be able to check that case...) + _xend(); // Exit speculation + __kmp_update_badness_after_success(lck); + } else { // Since the lock *is* visibly locked we're not speculating, + // so should use the underlying lock's release scheme. 
+    __kmp_release_queuing_lock(GET_QLK_PTR(lck), gtid);
+  }
+  return KMP_LOCK_RELEASED;
+}
+
+static int __kmp_release_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
+                                                   kmp_int32 gtid) {
+  char const *const func = "omp_unset_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == -1) {
+    KMP_FATAL(LockUnsettingFree, func);
+  }
+  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != gtid) {
+    KMP_FATAL(LockUnsettingSetByAnother, func);
+  }
+  lck->lk.qlk.owner_id = 0;
+  __kmp_release_adaptive_lock(lck, gtid);
+  return KMP_LOCK_RELEASED;
+}
+
+static void __kmp_init_adaptive_lock(kmp_adaptive_lock_t *lck) {
+  __kmp_init_queuing_lock(GET_QLK_PTR(lck));
+  lck->lk.adaptive.badness = 0;
+  lck->lk.adaptive.acquire_attempts = 0; // nonSpeculativeAcquireAttempts = 0;
+  lck->lk.adaptive.max_soft_retries =
+      __kmp_adaptive_backoff_params.max_soft_retries;
+  lck->lk.adaptive.max_badness = __kmp_adaptive_backoff_params.max_badness;
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+  __kmp_zero_speculative_stats(&lck->lk.adaptive);
+#endif
+  KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
+}
+
+static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+  __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
+#endif
+  __kmp_destroy_queuing_lock(GET_QLK_PTR(lck));
+  // Nothing needed for the speculative part.
+}
+
+static void __kmp_destroy_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
+  char const *const func = "omp_destroy_lock";
+  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != -1) {
+    KMP_FATAL(LockStillOwned, func);
+  }
+  __kmp_destroy_adaptive_lock(lck);
+}
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+/* ------------------------------------------------------------------------ */
+/* DRDPA ticket locks */
+/* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
+
+static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
+  return lck->lk.owner_id - 1;
+}
+
+static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
+  return lck->lk.depth_locked != -1;
+}
+
+__forceinline static int
+__kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  kmp_uint64 ticket = KMP_ATOMIC_INC(&lck->lk.next_ticket);
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls;
+
+#ifdef USE_LOCK_PROFILE
+  if (polls[ticket & mask] != ticket)
+    __kmp_printf("LOCK CONTENTION: %p\n", lck);
+/* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+  // Now spin-wait, but reload the polls pointer and mask, in case the
+  // polling area has been reconfigured. Unless it is reconfigured, the
+  // reloads stay in L1 cache and are cheap.
+  //
+  // Keep this code in sync with KMP_WAIT, in kmp_dispatch.cpp !!!
+  // The current implementation of KMP_WAIT doesn't allow for mask
+  // and poll to be re-read every spin iteration.
+  kmp_uint32 spins;
+  kmp_uint64 time;
+  KMP_FSYNC_PREPARE(lck);
+  KMP_INIT_YIELD(spins);
+  KMP_INIT_BACKOFF(time);
+  while (polls[ticket & mask] < ticket) { // atomic load
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
+    // Re-read the mask and the poll pointer from the lock structure.
+    //
+    // Make certain that "mask" is read before "polls" !!!
+    //
+    // If another thread reconfigures the polling area and updates their
+    // values, and we get the new value of mask and the old polls pointer, we
+    // could access memory beyond the end of the old polling area.
+    mask = lck->lk.mask; // atomic load
+    polls = lck->lk.polls; // atomic load
+  }
+
+  // Critical section starts here
+  KMP_FSYNC_ACQUIRED(lck);
+  KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
+                  ticket, lck));
+  lck->lk.now_serving = ticket; // non-volatile store
+
+  // Deallocate a garbage polling area if we know that we are the last
+  // thread that could possibly access it.
+  //
+  // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
+  // ticket.
+  if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
+    __kmp_free(lck->lk.old_polls);
+    lck->lk.old_polls = NULL;
+    lck->lk.cleanup_ticket = 0;
+  }
+
+  // Check to see if we should reconfigure the polling area.
+  // If there is still a garbage polling area to be deallocated from a
+  // previous reconfiguration, let a later thread reconfigure it.
+  if (lck->lk.old_polls == NULL) {
+    bool reconfigure = false;
+    std::atomic<kmp_uint64> *old_polls = polls;
+    kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
+
+    if (TCR_4(__kmp_nth) >
+        (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
+      // We are in oversubscription mode. Contract the polling area
+      // down to a single location, if that hasn't been done already.
+      if (num_polls > 1) {
+        reconfigure = true;
+        num_polls = TCR_4(lck->lk.num_polls);
+        mask = 0;
+        num_polls = 1;
+        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
+                                                          sizeof(*polls));
+        polls[0] = ticket;
+      }
+    } else {
+      // We are in under/fully subscribed mode. Check the number of
+      // threads waiting on the lock. The size of the polling area
+      // should be at least the number of threads waiting.
+      kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
+      if (num_waiting > num_polls) {
+        kmp_uint32 old_num_polls = num_polls;
+        reconfigure = true;
+        do {
+          mask = (mask << 1) | 1;
+          num_polls *= 2;
+        } while (num_polls <= num_waiting);
+
+        // Allocate the new polling area, and copy the relevant portion
+        // of the old polling area to the new area. __kmp_allocate()
+        // zeroes the memory it allocates, and most of the old area is
+        // just zero padding, so we only copy the release counters.
+        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
+                                                          sizeof(*polls));
+        kmp_uint32 i;
+        for (i = 0; i < old_num_polls; i++) {
+          polls[i].store(old_polls[i]);
+        }
+      }
+    }
+
+    if (reconfigure) {
+      // Now write the updated fields back to the lock structure.
+      //
+      // Make certain that "polls" is written before "mask" !!!
+      //
+      // If another thread picks up the new value of mask and the old polls
+      // pointer, it could access memory beyond the end of the old polling
+      // area.
+      //
+      // On x86, we need memory fences.
+      KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring "
+                      "lock %p to %d polls\n",
+                      ticket, lck, num_polls));
+
+      lck->lk.old_polls = old_polls;
+      lck->lk.polls = polls; // atomic store
+
+      KMP_MB();
+
+      lck->lk.num_polls = num_polls;
+      lck->lk.mask = mask; // atomic store
+
+      KMP_MB();
+
+      // Only after the new polling area and mask have been flushed
+      // to main memory can we update the cleanup ticket field.
+      //
+      // volatile load / non-volatile store
+      lck->lk.cleanup_ticket = lck->lk.next_ticket;
+    }
+  }
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  int retval = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
+  return retval;
+}
+
+static int __kmp_acquire_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                kmp_int32 gtid) {
+  char const *const func = "omp_set_lock";
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, func);
+  }
+  if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) == gtid)) {
+    KMP_FATAL(LockIsAlreadyOwned, func);
+  }
+
+  __kmp_acquire_drdpa_lock(lck, gtid);
+
+  lck->lk.owner_id = gtid + 1;
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  // First get a ticket, then read the polls pointer and the mask.
+  // The polls pointer must be read before the mask!!! (See above)
+  kmp_uint64 ticket = lck->lk.next_ticket; // atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls;
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  if (polls[ticket & mask] == ticket) {
+    kmp_uint64 next_ticket = ticket + 1;
+    if (__kmp_atomic_compare_store_acq(&lck->lk.next_ticket, ticket,
+                                       next_ticket)) {
+      KMP_FSYNC_ACQUIRED(lck);
+      KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
+                      ticket, lck));
+      lck->lk.now_serving = ticket; // non-volatile store
+
+      // Since no threads are waiting, there is no possibility that we would
+      // want to reconfigure the polling area. We might have the cleanup ticket
+      // value (which says that it is now safe to deallocate old_polls), but
+      // we'll let a later thread which calls __kmp_acquire_lock do that - this
+      // routine isn't supposed to block, and we would risk blocks if we called
+      // __kmp_free() to do the deallocation.
+      return TRUE;
+    }
+  }
+  return FALSE;
+}
+
+static int __kmp_test_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                             kmp_int32 gtid) {
+  char const *const func = "omp_test_lock";
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, func);
+  }
+
+  int retval = __kmp_test_drdpa_lock(lck, gtid);
+
+  if (retval) {
+    lck->lk.owner_id = gtid + 1;
+  }
+  return retval;
+}
+
+int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  // Read the ticket value from the lock data struct, then the polls pointer and
+  // the mask. The polls pointer must be read before the mask!!! (See above)
+  kmp_uint64 ticket = lck->lk.now_serving + 1; // non-atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls; // atomic load
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
+                  ticket - 1, lck));
+  KMP_FSYNC_RELEASING(lck);
+  polls[ticket & mask] = ticket; // atomic store
+  return KMP_LOCK_RELEASED;
+}
+
+static int __kmp_release_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                kmp_int32 gtid) {
+  char const *const func = "omp_unset_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, func);
+  }
+  if (__kmp_get_drdpa_lock_owner(lck) == -1) {
+    KMP_FATAL(LockUnsettingFree, func);
+  }
+  if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) >= 0) &&
+      (__kmp_get_drdpa_lock_owner(lck) != gtid)) {
+    KMP_FATAL(LockUnsettingSetByAnother, func);
+  }
+  lck->lk.owner_id = 0;
+  return __kmp_release_drdpa_lock(lck, gtid);
+}
+
+void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck) {
+  lck->lk.location = NULL;
+  lck->lk.mask = 0;
+  lck->lk.num_polls = 1;
+  lck->lk.polls = (std::atomic<kmp_uint64> *)__kmp_allocate(
+      lck->lk.num_polls * sizeof(*(lck->lk.polls)));
+  lck->lk.cleanup_ticket = 0;
+  lck->lk.old_polls = NULL;
+  lck->lk.next_ticket = 0;
+  lck->lk.now_serving = 0;
+  lck->lk.owner_id = 0; // no thread owns the lock.
+  lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
+  lck->lk.initialized = lck;
+
+  KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
+}
+
+void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
+  lck->lk.initialized = NULL;
+  lck->lk.location = NULL;
+  if (lck->lk.polls.load() != NULL) {
+    __kmp_free(lck->lk.polls.load());
+    lck->lk.polls = NULL;
+  }
+  if (lck->lk.old_polls != NULL) {
+    __kmp_free(lck->lk.old_polls);
+    lck->lk.old_polls = NULL;
+  }
+  lck->lk.mask = 0;
+  lck->lk.num_polls = 0;
+  lck->lk.cleanup_ticket = 0;
+  lck->lk.next_ticket = 0;
+  lck->lk.now_serving = 0;
+  lck->lk.owner_id = 0;
+  lck->lk.depth_locked = -1;
+}
+
+static void __kmp_destroy_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
+  char const *const func = "omp_destroy_lock";
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, func);
+  }
+  if (__kmp_get_drdpa_lock_owner(lck) != -1) {
+    KMP_FATAL(LockStillOwned, func);
+  }
+  __kmp_destroy_drdpa_lock(lck);
+}
+
+// nested drdpa ticket locks
+
+int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
+    lck->lk.depth_locked += 1;
+    return KMP_LOCK_ACQUIRED_NEXT;
+  } else {
+    __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
+    KMP_MB();
+    lck->lk.depth_locked = 1;
+    KMP_MB();
+    lck->lk.owner_id = gtid + 1;
+    return KMP_LOCK_ACQUIRED_FIRST;
+  }
+}
+
+static void __kmp_acquire_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                        kmp_int32 gtid) {
+  char const *const func = "omp_set_nest_lock";
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (!__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, func);
+  }
+  __kmp_acquire_nested_drdpa_lock(lck, gtid);
+}
+
+int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  int retval;
+
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if
(__kmp_get_drdpa_lock_owner(lck) == gtid) { + retval = ++lck->lk.depth_locked; + } else if (!__kmp_test_drdpa_lock(lck, gtid)) { + retval = 0; + } else { + KMP_MB(); + retval = lck->lk.depth_locked = 1; + KMP_MB(); + lck->lk.owner_id = gtid + 1; + } + return retval; +} + +static int __kmp_test_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_test_nest_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_drdpa_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + return __kmp_test_nested_drdpa_lock(lck, gtid); +} + +int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) { + KMP_DEBUG_ASSERT(gtid >= 0); + + KMP_MB(); + if (--(lck->lk.depth_locked) == 0) { + KMP_MB(); + lck->lk.owner_id = 0; + __kmp_release_drdpa_lock(lck, gtid); + return KMP_LOCK_RELEASED; + } + return KMP_LOCK_STILL_HELD; +} + +static int __kmp_release_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck, + kmp_int32 gtid) { + char const *const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_drdpa_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_drdpa_lock_owner(lck) == -1) { + KMP_FATAL(LockUnsettingFree, func); + } + if (__kmp_get_drdpa_lock_owner(lck) != gtid) { + KMP_FATAL(LockUnsettingSetByAnother, func); + } + return __kmp_release_nested_drdpa_lock(lck, gtid); +} + +void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck) { + __kmp_init_drdpa_lock(lck); + lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks +} + +void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) { + __kmp_destroy_drdpa_lock(lck); + lck->lk.depth_locked = 0; +} + +static void __kmp_destroy_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) { + char const *const func = "omp_destroy_nest_lock"; + if (lck->lk.initialized != lck) { + KMP_FATAL(LockIsUninitialized, func); + } + if (!__kmp_is_drdpa_lock_nestable(lck)) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + if (__kmp_get_drdpa_lock_owner(lck) != -1) { + KMP_FATAL(LockStillOwned, func); + } + __kmp_destroy_nested_drdpa_lock(lck); +} + +// access functions to fields which don't exist for all lock kinds. + +static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) { + return lck->lk.location; +} + +static void __kmp_set_drdpa_lock_location(kmp_drdpa_lock_t *lck, + const ident_t *loc) { + lck->lk.location = loc; +} + +static kmp_lock_flags_t __kmp_get_drdpa_lock_flags(kmp_drdpa_lock_t *lck) { + return lck->lk.flags; +} + +static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck, + kmp_lock_flags_t flags) { + lck->lk.flags = flags; +} + +// Time stamp counter +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#define __kmp_tsc() __kmp_hardware_timestamp() +// Runtime's default backoff parameters +kmp_backoff_t __kmp_spin_backoff_params = {1, 4096, 100}; +#else +// Use nanoseconds for other platforms +extern kmp_uint64 __kmp_now_nsec(); +kmp_backoff_t __kmp_spin_backoff_params = {1, 256, 100}; +#define __kmp_tsc() __kmp_now_nsec() +#endif + +// A useful predicate for dealing with timestamps that may wrap. +// Is a before b? Since the timestamps may wrap, this is asking whether it's +// shorter to go clockwise from a to b around the clock-face, or anti-clockwise. 
+// Times where going clockwise is less distance than going anti-clockwise +// are in the future, others are in the past. e.g. a = MAX-1, b = MAX+1 (=0), +// then a > b (true) does not mean a reached b; whereas signed(a) = -2, +// signed(b) = 0 captures the actual difference +static inline bool before(kmp_uint64 a, kmp_uint64 b) { + return ((kmp_int64)b - (kmp_int64)a) > 0; +} + +// Truncated binary exponential backoff function +void __kmp_spin_backoff(kmp_backoff_t *boff) { + // We could flatten this loop, but making it a nested loop gives better result + kmp_uint32 i; + for (i = boff->step; i > 0; i--) { + kmp_uint64 goal = __kmp_tsc() + boff->min_tick; +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_tpause(0, boff->min_tick); + } else { +#endif + do { + KMP_CPU_PAUSE(); + } while (before(__kmp_tsc(), goal)); +#if KMP_HAVE_UMWAIT + } +#endif + } + boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1); +} + +#if KMP_USE_DYNAMIC_LOCK + +// Direct lock initializers. It simply writes a tag to the low 8 bits of the +// lock word. +static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck, + kmp_dyna_lockseq_t seq) { + TCW_4(*lck, KMP_GET_D_TAG(seq)); + KA_TRACE( + 20, + ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq)); +} + +#if KMP_USE_TSX + +// HLE lock functions - imported from the testbed runtime. +#define HLE_ACQUIRE ".byte 0xf2;" +#define HLE_RELEASE ".byte 0xf3;" + +static inline kmp_uint32 swap4(kmp_uint32 volatile *p, kmp_uint32 v) { + __asm__ volatile(HLE_ACQUIRE "xchg %1,%0" : "+r"(v), "+m"(*p) : : "memory"); + return v; +} + +static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); } + +static void __kmp_destroy_hle_lock_with_checks(kmp_dyna_lock_t *lck) { + TCW_4(*lck, 0); +} + +static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) { + // Use gtid for KMP_LOCK_BUSY if necessary + if (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) { + int delay = 1; + do { + while (*(kmp_uint32 volatile *)lck != KMP_LOCK_FREE(hle)) { + for (int i = delay; i != 0; --i) + KMP_CPU_PAUSE(); + delay = ((delay << 1) | 1) & 7; + } + } while (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)); + } +} + +static void __kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck, + kmp_int32 gtid) { + __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks +} + +static int __kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) { + __asm__ volatile(HLE_RELEASE "movl %1,%0" + : "=m"(*lck) + : "r"(KMP_LOCK_FREE(hle)) + : "memory"); + return KMP_LOCK_RELEASED; +} + +static int __kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck, + kmp_int32 gtid) { + return __kmp_release_hle_lock(lck, gtid); // TODO: add checks +} + +static int __kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) { + return swap4(lck, KMP_LOCK_BUSY(1, hle)) == KMP_LOCK_FREE(hle); +} + +static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck, + kmp_int32 gtid) { + return __kmp_test_hle_lock(lck, gtid); // TODO: add checks +} + +static void __kmp_init_rtm_queuing_lock(kmp_queuing_lock_t *lck) { + __kmp_init_queuing_lock(lck); +} + +static void __kmp_destroy_rtm_queuing_lock(kmp_queuing_lock_t *lck) { + __kmp_destroy_queuing_lock(lck); +} + +static void +__kmp_destroy_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { + __kmp_destroy_queuing_lock_with_checks(lck); +} + +KMP_ATTRIBUTE_TARGET_RTM +static void __kmp_acquire_rtm_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + unsigned retries = 3, status; + do { + status = 
_xbegin(); + if (status == _XBEGIN_STARTED) { + if (__kmp_is_unlocked_queuing_lock(lck)) + return; + _xabort(0xff); + } + if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) { + // Wait until lock becomes free + while (!__kmp_is_unlocked_queuing_lock(lck)) { + KMP_YIELD(TRUE); + } + } else if (!(status & _XABORT_RETRY)) + break; + } while (retries--); + + // Fall-back non-speculative lock (xchg) + __kmp_acquire_queuing_lock(lck, gtid); +} + +static void __kmp_acquire_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + __kmp_acquire_rtm_queuing_lock(lck, gtid); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_release_rtm_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + if (__kmp_is_unlocked_queuing_lock(lck)) { + // Releasing from speculation + _xend(); + } else { + // Releasing from a real lock + __kmp_release_queuing_lock(lck, gtid); + } + return KMP_LOCK_RELEASED; +} + +static int __kmp_release_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + return __kmp_release_rtm_queuing_lock(lck, gtid); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_test_rtm_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + unsigned retries = 3, status; + do { + status = _xbegin(); + if (status == _XBEGIN_STARTED && __kmp_is_unlocked_queuing_lock(lck)) { + return 1; + } + if (!(status & _XABORT_RETRY)) + break; + } while (retries--); + + return __kmp_test_queuing_lock(lck, gtid); +} + +static int __kmp_test_rtm_queuing_lock_with_checks(kmp_queuing_lock_t *lck, + kmp_int32 gtid) { + return __kmp_test_rtm_queuing_lock(lck, gtid); +} + +// Reuse kmp_tas_lock_t for TSX lock which use RTM with fall-back spin lock. +typedef kmp_tas_lock_t kmp_rtm_spin_lock_t; + +static void __kmp_destroy_rtm_spin_lock(kmp_rtm_spin_lock_t *lck) { + KMP_ATOMIC_ST_REL(&lck->lk.poll, 0); +} + +static void __kmp_destroy_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck) { + __kmp_destroy_rtm_spin_lock(lck); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_acquire_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + unsigned retries = 3, status; + kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin); + kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin); + do { + status = _xbegin(); + if (status == _XBEGIN_STARTED) { + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free) + return KMP_LOCK_ACQUIRED_FIRST; + _xabort(0xff); + } + if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) { + // Wait until lock becomes free + while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free) { + KMP_YIELD(TRUE); + } + } else if (!(status & _XABORT_RETRY)) + break; + } while (retries--); + + // Fall-back spin lock + KMP_FSYNC_PREPARE(lck); + kmp_backoff_t backoff = __kmp_spin_backoff_params; + while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != lock_free || + !__kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) { + __kmp_spin_backoff(&backoff); + } + KMP_FSYNC_ACQUIRED(lck); + return KMP_LOCK_ACQUIRED_FIRST; +} + +static int __kmp_acquire_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + return __kmp_acquire_rtm_spin_lock(lck, gtid); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_release_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == KMP_LOCK_FREE(rtm_spin)) { + // Releasing from speculation + _xend(); + } else { + // Releasing from a real lock + KMP_FSYNC_RELEASING(lck); + KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(rtm_spin)); + } + return KMP_LOCK_RELEASED; +} + +static int 
__kmp_release_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + return __kmp_release_rtm_spin_lock(lck, gtid); +} + +KMP_ATTRIBUTE_TARGET_RTM +static int __kmp_test_rtm_spin_lock(kmp_rtm_spin_lock_t *lck, kmp_int32 gtid) { + unsigned retries = 3, status; + kmp_int32 lock_free = KMP_LOCK_FREE(rtm_spin); + kmp_int32 lock_busy = KMP_LOCK_BUSY(1, rtm_spin); + do { + status = _xbegin(); + if (status == _XBEGIN_STARTED && + KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free) { + return TRUE; + } + if (!(status & _XABORT_RETRY)) + break; + } while (retries--); + + if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == lock_free && + __kmp_atomic_compare_store_acq(&lck->lk.poll, lock_free, lock_busy)) { + KMP_FSYNC_ACQUIRED(lck); + return TRUE; + } + return FALSE; +} + +static int __kmp_test_rtm_spin_lock_with_checks(kmp_rtm_spin_lock_t *lck, + kmp_int32 gtid) { + return __kmp_test_rtm_spin_lock(lck, gtid); +} + +#endif // KMP_USE_TSX + +// Entry functions for indirect locks (first element of direct lock jump tables) +static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l, + kmp_dyna_lockseq_t tag); +static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock); +static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32); +static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32); +static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32); +static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock, + kmp_int32); +static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock, + kmp_int32); +static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock, + kmp_int32); + +// Lock function definitions for the union parameter type +#define KMP_FOREACH_LOCK_KIND(m, a) m(ticket, a) m(queuing, a) m(drdpa, a) + +#define expand1(lk, op) \ + static void __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock) { \ + __kmp_##op##_##lk##_##lock(&lock->lk); \ + } +#define expand2(lk, op) \ + static int __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock, \ + kmp_int32 gtid) { \ + return __kmp_##op##_##lk##_##lock(&lock->lk, gtid); \ + } +#define expand3(lk, op) \ + static void __kmp_set_##lk##_##lock_flags(kmp_user_lock_p lock, \ + kmp_lock_flags_t flags) { \ + __kmp_set_##lk##_lock_flags(&lock->lk, flags); \ + } +#define expand4(lk, op) \ + static void __kmp_set_##lk##_##lock_location(kmp_user_lock_p lock, \ + const ident_t *loc) { \ + __kmp_set_##lk##_lock_location(&lock->lk, loc); \ + } + +KMP_FOREACH_LOCK_KIND(expand1, init) +KMP_FOREACH_LOCK_KIND(expand1, init_nested) +KMP_FOREACH_LOCK_KIND(expand1, destroy) +KMP_FOREACH_LOCK_KIND(expand1, destroy_nested) +KMP_FOREACH_LOCK_KIND(expand2, acquire) +KMP_FOREACH_LOCK_KIND(expand2, acquire_nested) +KMP_FOREACH_LOCK_KIND(expand2, release) +KMP_FOREACH_LOCK_KIND(expand2, release_nested) +KMP_FOREACH_LOCK_KIND(expand2, test) +KMP_FOREACH_LOCK_KIND(expand2, test_nested) +KMP_FOREACH_LOCK_KIND(expand3, ) +KMP_FOREACH_LOCK_KIND(expand4, ) + +#undef expand1 +#undef expand2 +#undef expand3 +#undef expand4 + +// Jump tables for the indirect lock functions +// Only fill in the odd entries, that avoids the need to shift out the low bit + +// init functions +#define expand(l, op) 0, __kmp_init_direct_lock, +void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t) = { + __kmp_init_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, init)}; +#undef expand + +// destroy functions +#define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock, +static void (*direct_destroy[])(kmp_dyna_lock_t *) = { + 
__kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)}; +#undef expand +#define expand(l, op) \ + 0, (void (*)(kmp_dyna_lock_t *))__kmp_destroy_##l##_lock_with_checks, +static void (*direct_destroy_check[])(kmp_dyna_lock_t *) = { + __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)}; +#undef expand + +// set/acquire functions +#define expand(l, op) \ + 0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock, +static int (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = { + __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)}; +#undef expand +#define expand(l, op) \ + 0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks, +static int (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = { + __kmp_set_indirect_lock_with_checks, 0, + KMP_FOREACH_D_LOCK(expand, acquire)}; +#undef expand + +// unset/release and test functions +#define expand(l, op) \ + 0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock, +static int (*direct_unset[])(kmp_dyna_lock_t *, kmp_int32) = { + __kmp_unset_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, release)}; +static int (*direct_test[])(kmp_dyna_lock_t *, kmp_int32) = { + __kmp_test_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, test)}; +#undef expand +#define expand(l, op) \ + 0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks, +static int (*direct_unset_check[])(kmp_dyna_lock_t *, kmp_int32) = { + __kmp_unset_indirect_lock_with_checks, 0, + KMP_FOREACH_D_LOCK(expand, release)}; +static int (*direct_test_check[])(kmp_dyna_lock_t *, kmp_int32) = { + __kmp_test_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, test)}; +#undef expand + +// Exposes only one set of jump tables (*lock or *lock_with_checks). +void (**__kmp_direct_destroy)(kmp_dyna_lock_t *) = 0; +int (**__kmp_direct_set)(kmp_dyna_lock_t *, kmp_int32) = 0; +int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32) = 0; +int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32) = 0; + +// Jump tables for the indirect lock functions +#define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock, +void (*__kmp_indirect_init[])(kmp_user_lock_p) = { + KMP_FOREACH_I_LOCK(expand, init)}; +#undef expand + +#define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock, +static void (*indirect_destroy[])(kmp_user_lock_p) = { + KMP_FOREACH_I_LOCK(expand, destroy)}; +#undef expand +#define expand(l, op) \ + (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock_with_checks, +static void (*indirect_destroy_check[])(kmp_user_lock_p) = { + KMP_FOREACH_I_LOCK(expand, destroy)}; +#undef expand + +// set/acquire functions +#define expand(l, op) \ + (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock, +static int (*indirect_set[])(kmp_user_lock_p, + kmp_int32) = {KMP_FOREACH_I_LOCK(expand, acquire)}; +#undef expand +#define expand(l, op) \ + (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks, +static int (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = { + KMP_FOREACH_I_LOCK(expand, acquire)}; +#undef expand + +// unset/release and test functions +#define expand(l, op) \ + (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock, +static int (*indirect_unset[])(kmp_user_lock_p, kmp_int32) = { + KMP_FOREACH_I_LOCK(expand, release)}; +static int (*indirect_test[])(kmp_user_lock_p, + kmp_int32) = {KMP_FOREACH_I_LOCK(expand, test)}; +#undef expand +#define expand(l, op) \ + (int (*)(kmp_user_lock_p, kmp_int32)) 
__kmp_##op##_##l##_##lock_with_checks, +static int (*indirect_unset_check[])(kmp_user_lock_p, kmp_int32) = { + KMP_FOREACH_I_LOCK(expand, release)}; +static int (*indirect_test_check[])(kmp_user_lock_p, kmp_int32) = { + KMP_FOREACH_I_LOCK(expand, test)}; +#undef expand + +// Exposes only one jump tables (*lock or *lock_with_checks). +void (**__kmp_indirect_destroy)(kmp_user_lock_p) = 0; +int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32) = 0; +int (**__kmp_indirect_unset)(kmp_user_lock_p, kmp_int32) = 0; +int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32) = 0; + +// Lock index table. +kmp_indirect_lock_table_t __kmp_i_lock_table; + +// Size of indirect locks. +static kmp_uint32 __kmp_indirect_lock_size[KMP_NUM_I_LOCKS] = {0}; + +// Jump tables for lock accessor/modifier. +void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p, + const ident_t *) = {0}; +void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p, + kmp_lock_flags_t) = {0}; +const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])( + kmp_user_lock_p) = {0}; +kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( + kmp_user_lock_p) = {0}; + +// Use different lock pools for different lock types. +static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0}; + +// User lock allocator for dynamically dispatched indirect locks. Every entry of +// the indirect lock table holds the address and type of the allocated indirect +// lock (kmp_indirect_lock_t), and the size of the table doubles when it is +// full. A destroyed indirect lock object is returned to the reusable pool of +// locks, unique to each lock type. +kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock, + kmp_int32 gtid, + kmp_indirect_locktag_t tag) { + kmp_indirect_lock_t *lck; + kmp_lock_index_t idx, table_idx; + + __kmp_acquire_lock(&__kmp_global_lock, gtid); + + if (__kmp_indirect_lock_pool[tag] != NULL) { + // Reuse the allocated and destroyed lock object + lck = __kmp_indirect_lock_pool[tag]; + if (OMP_LOCK_T_SIZE < sizeof(void *)) + idx = lck->lock->pool.index; + __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next; + KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n", + lck)); + } else { + kmp_uint32 row, col; + kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table; + idx = 0; + // Find location in list of lock tables to put new lock + while (1) { + table_idx = lock_table->next; // index within this table + idx += lock_table->next; // global index within list of tables + if (table_idx < lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK) { + row = table_idx / KMP_I_LOCK_CHUNK; + col = table_idx % KMP_I_LOCK_CHUNK; + // Allocate a new row of locks if necessary + if (!lock_table->table[row]) { + lock_table->table[row] = (kmp_indirect_lock_t *)__kmp_allocate( + sizeof(kmp_indirect_lock_t) * KMP_I_LOCK_CHUNK); + } + break; + } + // Allocate a new lock table if necessary with double the capacity + if (!lock_table->next_table) { + kmp_indirect_lock_table_t *next_table = + (kmp_indirect_lock_table_t *)__kmp_allocate( + sizeof(kmp_indirect_lock_table_t)); + next_table->table = (kmp_indirect_lock_t **)__kmp_allocate( + sizeof(kmp_indirect_lock_t *) * 2 * lock_table->nrow_ptrs); + next_table->nrow_ptrs = 2 * lock_table->nrow_ptrs; + next_table->next = 0; + next_table->next_table = nullptr; + lock_table->next_table = next_table; + } + lock_table = lock_table->next_table; + KMP_ASSERT(lock_table); + } + lock_table->next++; + + lck = 
&lock_table->table[row][col]; + // Allocate a new base lock object + lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]); + KA_TRACE(20, + ("__kmp_allocate_indirect_lock: allocated a new lock %p\n", lck)); + } + + __kmp_release_lock(&__kmp_global_lock, gtid); + + lck->type = tag; + + if (OMP_LOCK_T_SIZE < sizeof(void *)) { + *((kmp_lock_index_t *)user_lock) = idx + << 1; // indirect lock word must be even + } else { + *((kmp_indirect_lock_t **)user_lock) = lck; + } + + return lck; +} + +// User lock lookup for dynamically dispatched locks. +static __forceinline kmp_indirect_lock_t * +__kmp_lookup_indirect_lock(void **user_lock, const char *func) { + if (__kmp_env_consistency_check) { + kmp_indirect_lock_t *lck = NULL; + if (user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, func); + } + if (OMP_LOCK_T_SIZE < sizeof(void *)) { + kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock); + lck = __kmp_get_i_lock(idx); + } else { + lck = *((kmp_indirect_lock_t **)user_lock); + } + if (lck == NULL) { + KMP_FATAL(LockIsUninitialized, func); + } + return lck; + } else { + if (OMP_LOCK_T_SIZE < sizeof(void *)) { + return __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(user_lock)); + } else { + return *((kmp_indirect_lock_t **)user_lock); + } + } +} + +static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock, + kmp_dyna_lockseq_t seq) { +#if KMP_USE_ADAPTIVE_LOCKS + if (seq == lockseq_adaptive && !__kmp_cpuinfo.flags.rtm) { + KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive"); + seq = lockseq_queuing; + } +#endif +#if KMP_USE_TSX + if (seq == lockseq_rtm_queuing && !__kmp_cpuinfo.flags.rtm) { + seq = lockseq_queuing; + } +#endif + kmp_indirect_locktag_t tag = KMP_GET_I_TAG(seq); + kmp_indirect_lock_t *l = + __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag); + KMP_I_LOCK_FUNC(l, init)(l->lock); + KA_TRACE( + 20, ("__kmp_init_indirect_lock: initialized indirect lock with type#%d\n", + seq)); +} + +static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) { + kmp_uint32 gtid = __kmp_entry_gtid(); + kmp_indirect_lock_t *l = + __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock"); + KMP_I_LOCK_FUNC(l, destroy)(l->lock); + kmp_indirect_locktag_t tag = l->type; + + __kmp_acquire_lock(&__kmp_global_lock, gtid); + + // Use the base lock's space to keep the pool chain. 
+ l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag]; + if (OMP_LOCK_T_SIZE < sizeof(void *)) { + l->lock->pool.index = KMP_EXTRACT_I_INDEX(lock); + } + __kmp_indirect_lock_pool[tag] = l; + + __kmp_release_lock(&__kmp_global_lock, gtid); +} + +static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) { + kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock); + return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid); +} + +static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) { + kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock); + return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid); +} + +static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) { + kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock); + return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid); +} + +static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock, + kmp_int32 gtid) { + kmp_indirect_lock_t *l = + __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock"); + return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid); +} + +static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock, + kmp_int32 gtid) { + kmp_indirect_lock_t *l = + __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock"); + return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid); +} + +static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock, + kmp_int32 gtid) { + kmp_indirect_lock_t *l = + __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock"); + return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid); +} + +kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing; + +// This is used only in kmp_error.cpp when consistency checking is on. +kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq) { + switch (seq) { + case lockseq_tas: + case lockseq_nested_tas: + return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck); +#if KMP_USE_FUTEX + case lockseq_futex: + case lockseq_nested_futex: + return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck); +#endif + case lockseq_ticket: + case lockseq_nested_ticket: + return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck); + case lockseq_queuing: + case lockseq_nested_queuing: +#if KMP_USE_ADAPTIVE_LOCKS + case lockseq_adaptive: +#endif + return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck); + case lockseq_drdpa: + case lockseq_nested_drdpa: + return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck); + default: + return 0; + } +} + +// Initializes data for dynamic user locks. +void __kmp_init_dynamic_user_locks() { + // Initialize jump table for the lock functions + if (__kmp_env_consistency_check) { + __kmp_direct_set = direct_set_check; + __kmp_direct_unset = direct_unset_check; + __kmp_direct_test = direct_test_check; + __kmp_direct_destroy = direct_destroy_check; + __kmp_indirect_set = indirect_set_check; + __kmp_indirect_unset = indirect_unset_check; + __kmp_indirect_test = indirect_test_check; + __kmp_indirect_destroy = indirect_destroy_check; + } else { + __kmp_direct_set = direct_set; + __kmp_direct_unset = direct_unset; + __kmp_direct_test = direct_test; + __kmp_direct_destroy = direct_destroy; + __kmp_indirect_set = indirect_set; + __kmp_indirect_unset = indirect_unset; + __kmp_indirect_test = indirect_test; + __kmp_indirect_destroy = indirect_destroy; + } + // If the user locks have already been initialized, then return. Allow the + // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate + // new lock tables if they have already been allocated. 
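A note on the encoding used by the dynamic-lock layer above: when omp_lock_t is too small to hold a pointer, __kmp_allocate_indirect_lock stores the table index shifted left by one in the user's lock word (the "indirect lock word must be even" comment), while direct locks keep an odd tag in the low bits (hence the jump tables "only fill in the odd entries"). The low bit alone therefore distinguishes the two cases. A small standalone decode of hypothetical lock words, for illustration only:

#include <stdint.h>
#include <stdio.h>

static void classify(uint32_t word) {
  if (word & 1)
    printf("0x%x: direct lock (odd tag in the low bits)\n", (unsigned)word);
  else
    printf("0x%x: indirect lock, table index %u\n", (unsigned)word,
           (unsigned)(word >> 1));
}

int main(void) {
  classify((3u << 1) | 1); /* hypothetical direct-lock tag: odd */
  classify(42u << 1);      /* hypothetical indirect lock at index 42: even */
  return 0;
}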
+ if (__kmp_init_user_locks) + return; + + // Initialize lock index table + __kmp_i_lock_table.nrow_ptrs = KMP_I_LOCK_TABLE_INIT_NROW_PTRS; + __kmp_i_lock_table.table = (kmp_indirect_lock_t **)__kmp_allocate( + sizeof(kmp_indirect_lock_t *) * KMP_I_LOCK_TABLE_INIT_NROW_PTRS); + *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate( + KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t)); + __kmp_i_lock_table.next = 0; + __kmp_i_lock_table.next_table = nullptr; + + // Indirect lock size + __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t); + __kmp_indirect_lock_size[locktag_queuing] = sizeof(kmp_queuing_lock_t); +#if KMP_USE_ADAPTIVE_LOCKS + __kmp_indirect_lock_size[locktag_adaptive] = sizeof(kmp_adaptive_lock_t); +#endif + __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t); +#if KMP_USE_TSX + __kmp_indirect_lock_size[locktag_rtm_queuing] = sizeof(kmp_queuing_lock_t); +#endif + __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t); +#if KMP_USE_FUTEX + __kmp_indirect_lock_size[locktag_nested_futex] = sizeof(kmp_futex_lock_t); +#endif + __kmp_indirect_lock_size[locktag_nested_ticket] = sizeof(kmp_ticket_lock_t); + __kmp_indirect_lock_size[locktag_nested_queuing] = sizeof(kmp_queuing_lock_t); + __kmp_indirect_lock_size[locktag_nested_drdpa] = sizeof(kmp_drdpa_lock_t); + +// Initialize lock accessor/modifier +#define fill_jumps(table, expand, sep) \ + { \ + table[locktag##sep##ticket] = expand(ticket); \ + table[locktag##sep##queuing] = expand(queuing); \ + table[locktag##sep##drdpa] = expand(drdpa); \ + } + +#if KMP_USE_ADAPTIVE_LOCKS +#define fill_table(table, expand) \ + { \ + fill_jumps(table, expand, _); \ + table[locktag_adaptive] = expand(queuing); \ + fill_jumps(table, expand, _nested_); \ + } +#else +#define fill_table(table, expand) \ + { \ + fill_jumps(table, expand, _); \ + fill_jumps(table, expand, _nested_); \ + } +#endif // KMP_USE_ADAPTIVE_LOCKS + +#define expand(l) \ + (void (*)(kmp_user_lock_p, const ident_t *)) __kmp_set_##l##_lock_location + fill_table(__kmp_indirect_set_location, expand); +#undef expand +#define expand(l) \ + (void (*)(kmp_user_lock_p, kmp_lock_flags_t)) __kmp_set_##l##_lock_flags + fill_table(__kmp_indirect_set_flags, expand); +#undef expand +#define expand(l) \ + (const ident_t *(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_location + fill_table(__kmp_indirect_get_location, expand); +#undef expand +#define expand(l) \ + (kmp_lock_flags_t(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_flags + fill_table(__kmp_indirect_get_flags, expand); +#undef expand + + __kmp_init_user_locks = TRUE; +} + +// Clean up the lock table. +void __kmp_cleanup_indirect_user_locks() { + int k; + + // Clean up locks in the pools first (they were already destroyed before going + // into the pools). + for (k = 0; k < KMP_NUM_I_LOCKS; ++k) { + kmp_indirect_lock_t *l = __kmp_indirect_lock_pool[k]; + while (l != NULL) { + kmp_indirect_lock_t *ll = l; + l = (kmp_indirect_lock_t *)l->lock->pool.next; + KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: freeing %p from pool\n", + ll)); + __kmp_free(ll->lock); + ll->lock = NULL; + } + __kmp_indirect_lock_pool[k] = NULL; + } + // Clean up the remaining undestroyed locks. 
+ kmp_indirect_lock_table_t *ptr = &__kmp_i_lock_table; + while (ptr) { + for (kmp_uint32 row = 0; row < ptr->nrow_ptrs; ++row) { + if (!ptr->table[row]) + continue; + for (kmp_uint32 col = 0; col < KMP_I_LOCK_CHUNK; ++col) { + kmp_indirect_lock_t *l = &ptr->table[row][col]; + if (l->lock) { + // Locks not destroyed explicitly need to be destroyed here. + KMP_I_LOCK_FUNC(l, destroy)(l->lock); + KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p " + "from table\n", + l)); + __kmp_free(l->lock); + } + } + __kmp_free(ptr->table[row]); + } + kmp_indirect_lock_table_t *next_table = ptr->next_table; + if (ptr != &__kmp_i_lock_table) + __kmp_free(ptr); + ptr = next_table; + } + + __kmp_init_user_locks = FALSE; +} + +enum kmp_lock_kind __kmp_user_lock_kind = lk_default; +int __kmp_num_locks_in_block = 1; // FIXME - tune this value + +#else // KMP_USE_DYNAMIC_LOCK + +static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) { + __kmp_init_tas_lock(lck); +} + +static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) { + __kmp_init_nested_tas_lock(lck); +} + +#if KMP_USE_FUTEX +static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) { + __kmp_init_futex_lock(lck); +} + +static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) { + __kmp_init_nested_futex_lock(lck); +} +#endif + +static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) { + return lck == lck->lk.self; +} + +static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) { + __kmp_init_ticket_lock(lck); +} + +static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) { + __kmp_init_nested_ticket_lock(lck); +} + +static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) { + return lck == lck->lk.initialized; +} + +static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { + __kmp_init_queuing_lock(lck); +} + +static void +__kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { + __kmp_init_nested_queuing_lock(lck); +} + +#if KMP_USE_ADAPTIVE_LOCKS +static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) { + __kmp_init_adaptive_lock(lck); +} +#endif + +static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) { + return lck == lck->lk.initialized; +} + +static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) { + __kmp_init_drdpa_lock(lck); +} + +static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) { + __kmp_init_nested_drdpa_lock(lck); +} + +/* user locks + * They are implemented as a table of function pointers which are set to the + * lock functions of the appropriate kind, once that has been determined. 
*/ + +enum kmp_lock_kind __kmp_user_lock_kind = lk_default; + +size_t __kmp_base_user_lock_size = 0; +size_t __kmp_user_lock_size = 0; + +kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck) = NULL; +int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid) = NULL; + +int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid) = NULL; +int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid) = NULL; +void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; +void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck) = NULL; +void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; +int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid) = NULL; + +int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid) = NULL; +int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid) = NULL; +void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; +void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; + +int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck) = NULL; +const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck) = NULL; +void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck, + const ident_t *loc) = NULL; +kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck) = NULL; +void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck, + kmp_lock_flags_t flags) = NULL; + +void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind) { + switch (user_lock_kind) { + case lk_default: + default: + KMP_ASSERT(0); + + case lk_tas: { + __kmp_base_user_lock_size = sizeof(kmp_base_tas_lock_t); + __kmp_user_lock_size = sizeof(kmp_tas_lock_t); + + __kmp_get_user_lock_owner_ = + (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_tas_lock_owner); + + if (__kmp_env_consistency_check) { + KMP_BIND_USER_LOCK_WITH_CHECKS(tas); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas); + } else { + KMP_BIND_USER_LOCK(tas); + KMP_BIND_NESTED_USER_LOCK(tas); + } + + __kmp_destroy_user_lock_ = + (void (*)(kmp_user_lock_p))(&__kmp_destroy_tas_lock); + + __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL; + + __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL; + + __kmp_set_user_lock_location_ = + (void (*)(kmp_user_lock_p, const ident_t *))NULL; + + __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL; + + __kmp_set_user_lock_flags_ = + (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL; + } break; + +#if KMP_USE_FUTEX + + case lk_futex: { + __kmp_base_user_lock_size = sizeof(kmp_base_futex_lock_t); + __kmp_user_lock_size = sizeof(kmp_futex_lock_t); + + __kmp_get_user_lock_owner_ = + (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_futex_lock_owner); + + if (__kmp_env_consistency_check) { + KMP_BIND_USER_LOCK_WITH_CHECKS(futex); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex); + } else { + KMP_BIND_USER_LOCK(futex); + KMP_BIND_NESTED_USER_LOCK(futex); + } + + __kmp_destroy_user_lock_ = + (void (*)(kmp_user_lock_p))(&__kmp_destroy_futex_lock); + + __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL; + + __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL; + + __kmp_set_user_lock_location_ = + (void (*)(kmp_user_lock_p, const ident_t *))NULL; + + __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL; + + __kmp_set_user_lock_flags_ = + (void (*)(kmp_user_lock_p, 
kmp_lock_flags_t))NULL; + } break; + +#endif // KMP_USE_FUTEX + + case lk_ticket: { + __kmp_base_user_lock_size = sizeof(kmp_base_ticket_lock_t); + __kmp_user_lock_size = sizeof(kmp_ticket_lock_t); + + __kmp_get_user_lock_owner_ = + (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_owner); + + if (__kmp_env_consistency_check) { + KMP_BIND_USER_LOCK_WITH_CHECKS(ticket); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket); + } else { + KMP_BIND_USER_LOCK(ticket); + KMP_BIND_NESTED_USER_LOCK(ticket); + } + + __kmp_destroy_user_lock_ = + (void (*)(kmp_user_lock_p))(&__kmp_destroy_ticket_lock); + + __kmp_is_user_lock_initialized_ = + (int (*)(kmp_user_lock_p))(&__kmp_is_ticket_lock_initialized); + + __kmp_get_user_lock_location_ = + (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_location); + + __kmp_set_user_lock_location_ = (void (*)( + kmp_user_lock_p, const ident_t *))(&__kmp_set_ticket_lock_location); + + __kmp_get_user_lock_flags_ = + (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_flags); + + __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( + &__kmp_set_ticket_lock_flags); + } break; + + case lk_queuing: { + __kmp_base_user_lock_size = sizeof(kmp_base_queuing_lock_t); + __kmp_user_lock_size = sizeof(kmp_queuing_lock_t); + + __kmp_get_user_lock_owner_ = + (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner); + + if (__kmp_env_consistency_check) { + KMP_BIND_USER_LOCK_WITH_CHECKS(queuing); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing); + } else { + KMP_BIND_USER_LOCK(queuing); + KMP_BIND_NESTED_USER_LOCK(queuing); + } + + __kmp_destroy_user_lock_ = + (void (*)(kmp_user_lock_p))(&__kmp_destroy_queuing_lock); + + __kmp_is_user_lock_initialized_ = + (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized); + + __kmp_get_user_lock_location_ = + (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location); + + __kmp_set_user_lock_location_ = (void (*)( + kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location); + + __kmp_get_user_lock_flags_ = + (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags); + + __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( + &__kmp_set_queuing_lock_flags); + } break; + +#if KMP_USE_ADAPTIVE_LOCKS + case lk_adaptive: { + __kmp_base_user_lock_size = sizeof(kmp_base_adaptive_lock_t); + __kmp_user_lock_size = sizeof(kmp_adaptive_lock_t); + + __kmp_get_user_lock_owner_ = + (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner); + + if (__kmp_env_consistency_check) { + KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive); + } else { + KMP_BIND_USER_LOCK(adaptive); + } + + __kmp_destroy_user_lock_ = + (void (*)(kmp_user_lock_p))(&__kmp_destroy_adaptive_lock); + + __kmp_is_user_lock_initialized_ = + (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized); + + __kmp_get_user_lock_location_ = + (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location); + + __kmp_set_user_lock_location_ = (void (*)( + kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location); + + __kmp_get_user_lock_flags_ = + (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags); + + __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( + &__kmp_set_queuing_lock_flags); + + } break; +#endif // KMP_USE_ADAPTIVE_LOCKS + + case lk_drdpa: { + __kmp_base_user_lock_size = sizeof(kmp_base_drdpa_lock_t); + __kmp_user_lock_size = sizeof(kmp_drdpa_lock_t); + + __kmp_get_user_lock_owner_ = + 
(kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_owner); + + if (__kmp_env_consistency_check) { + KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa); + } else { + KMP_BIND_USER_LOCK(drdpa); + KMP_BIND_NESTED_USER_LOCK(drdpa); + } + + __kmp_destroy_user_lock_ = + (void (*)(kmp_user_lock_p))(&__kmp_destroy_drdpa_lock); + + __kmp_is_user_lock_initialized_ = + (int (*)(kmp_user_lock_p))(&__kmp_is_drdpa_lock_initialized); + + __kmp_get_user_lock_location_ = + (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_location); + + __kmp_set_user_lock_location_ = (void (*)( + kmp_user_lock_p, const ident_t *))(&__kmp_set_drdpa_lock_location); + + __kmp_get_user_lock_flags_ = + (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_flags); + + __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( + &__kmp_set_drdpa_lock_flags); + } break; + } +} + +// ---------------------------------------------------------------------------- +// User lock table & lock allocation + +kmp_lock_table_t __kmp_user_lock_table = {1, 0, NULL}; +kmp_user_lock_p __kmp_lock_pool = NULL; + +// Lock block-allocation support. +kmp_block_of_locks *__kmp_lock_blocks = NULL; +int __kmp_num_locks_in_block = 1; // FIXME - tune this value + +static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) { + // Assume that kmp_global_lock is held upon entry/exit. + kmp_lock_index_t index; + if (__kmp_user_lock_table.used >= __kmp_user_lock_table.allocated) { + kmp_lock_index_t size; + kmp_user_lock_p *table; + // Reallocate lock table. + if (__kmp_user_lock_table.allocated == 0) { + size = 1024; + } else { + size = __kmp_user_lock_table.allocated * 2; + } + table = (kmp_user_lock_p *)__kmp_allocate(sizeof(kmp_user_lock_p) * size); + KMP_MEMCPY(table + 1, __kmp_user_lock_table.table + 1, + sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1)); + table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table; + // We cannot free the previous table now, since it may be in use by other + // threads. So save the pointer to the previous table in the first + // element of the new table. All the tables will be organized into a list, + // and could be freed when library shutting down. + __kmp_user_lock_table.table = table; + __kmp_user_lock_table.allocated = size; + } + KMP_DEBUG_ASSERT(__kmp_user_lock_table.used < + __kmp_user_lock_table.allocated); + index = __kmp_user_lock_table.used; + __kmp_user_lock_table.table[index] = lck; + ++__kmp_user_lock_table.used; + return index; +} + +static kmp_user_lock_p __kmp_lock_block_allocate() { + // Assume that kmp_global_lock is held upon entry/exit. + static int last_index = 0; + if ((last_index >= __kmp_num_locks_in_block) || (__kmp_lock_blocks == NULL)) { + // Restart the index. + last_index = 0; + // Need to allocate a new block. + KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0); + size_t space_for_locks = __kmp_user_lock_size * __kmp_num_locks_in_block; + char *buffer = + (char *)__kmp_allocate(space_for_locks + sizeof(kmp_block_of_locks)); + // Set up the new block. + kmp_block_of_locks *new_block = + (kmp_block_of_locks *)(&buffer[space_for_locks]); + new_block->next_block = __kmp_lock_blocks; + new_block->locks = (void *)buffer; + // Publish the new block. + KMP_MB(); + __kmp_lock_blocks = new_block; + } + kmp_user_lock_p ret = (kmp_user_lock_p)(&( + ((char *)(__kmp_lock_blocks->locks))[last_index * __kmp_user_lock_size])); + last_index++; + return ret; +} + +// Get memory for a lock. 
It may be freshly allocated memory or reused memory +// from lock pool. +kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid, + kmp_lock_flags_t flags) { + kmp_user_lock_p lck; + kmp_lock_index_t index; + KMP_DEBUG_ASSERT(user_lock); + + __kmp_acquire_lock(&__kmp_global_lock, gtid); + + if (__kmp_lock_pool == NULL) { + // Lock pool is empty. Allocate new memory. + + if (__kmp_num_locks_in_block <= 1) { // Tune this cutoff point. + lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size); + } else { + lck = __kmp_lock_block_allocate(); + } + + // Insert lock in the table so that it can be freed in __kmp_cleanup, + // and debugger has info on all allocated locks. + index = __kmp_lock_table_insert(lck); + } else { + // Pick up lock from pool. + lck = __kmp_lock_pool; + index = __kmp_lock_pool->pool.index; + __kmp_lock_pool = __kmp_lock_pool->pool.next; + } + + // We could potentially differentiate between nested and regular locks + // here, and do the lock table lookup for regular locks only. + if (OMP_LOCK_T_SIZE < sizeof(void *)) { + *((kmp_lock_index_t *)user_lock) = index; + } else { + *((kmp_user_lock_p *)user_lock) = lck; + } + + // mark the lock if it is critical section lock. + __kmp_set_user_lock_flags(lck, flags); + + __kmp_release_lock(&__kmp_global_lock, gtid); // AC: TODO move this line upper + + return lck; +} + +// Put lock's memory to pool for reusing. +void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid, + kmp_user_lock_p lck) { + KMP_DEBUG_ASSERT(user_lock != NULL); + KMP_DEBUG_ASSERT(lck != NULL); + + __kmp_acquire_lock(&__kmp_global_lock, gtid); + + lck->pool.next = __kmp_lock_pool; + __kmp_lock_pool = lck; + if (OMP_LOCK_T_SIZE < sizeof(void *)) { + kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock); + KMP_DEBUG_ASSERT(0 < index && index <= __kmp_user_lock_table.used); + lck->pool.index = index; + } + + __kmp_release_lock(&__kmp_global_lock, gtid); +} + +kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func) { + kmp_user_lock_p lck = NULL; + + if (__kmp_env_consistency_check) { + if (user_lock == NULL) { + KMP_FATAL(LockIsUninitialized, func); + } + } + + if (OMP_LOCK_T_SIZE < sizeof(void *)) { + kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock); + if (__kmp_env_consistency_check) { + if (!(0 < index && index < __kmp_user_lock_table.used)) { + KMP_FATAL(LockIsUninitialized, func); + } + } + KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used); + KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0); + lck = __kmp_user_lock_table.table[index]; + } else { + lck = *((kmp_user_lock_p *)user_lock); + } + + if (__kmp_env_consistency_check) { + if (lck == NULL) { + KMP_FATAL(LockIsUninitialized, func); + } + } + + return lck; +} + +void __kmp_cleanup_user_locks(void) { + // Reset lock pool. Don't worry about lock in the pool--we will free them when + // iterating through lock table (it includes all the locks, dead or alive). + __kmp_lock_pool = NULL; + +#define IS_CRITICAL(lck) \ + ((__kmp_get_user_lock_flags_ != NULL) && \ + ((*__kmp_get_user_lock_flags_)(lck)&kmp_lf_critical_section)) + + // Loop through lock table, free all locks. + // Do not free item [0], it is reserved for lock tables list. + // + // FIXME - we are iterating through a list of (pointers to) objects of type + // union kmp_user_lock, but we have no way of knowing whether the base type is + // currently "pool" or whatever the global user lock type is. 
+ // + // We are relying on the fact that for all of the user lock types + // (except "tas"), the first field in the lock struct is the "initialized" + // field, which is set to the address of the lock object itself when + // the lock is initialized. When the union is of type "pool", the + // first field is a pointer to the next object in the free list, which + // will not be the same address as the object itself. + // + // This means that the check (*__kmp_is_user_lock_initialized_)(lck) will fail + // for "pool" objects on the free list. This must happen as the "location" + // field of real user locks overlaps the "index" field of "pool" objects. + // + // It would be better to run through the free list, and remove all "pool" + // objects from the lock table before executing this loop. However, + // "pool" objects do not always have their index field set (only on + // lin_32e), and I don't want to search the lock table for the address + // of every "pool" object on the free list. + while (__kmp_user_lock_table.used > 1) { + const ident *loc; + + // reduce __kmp_user_lock_table.used before freeing the lock, + // so that state of locks is consistent + kmp_user_lock_p lck = + __kmp_user_lock_table.table[--__kmp_user_lock_table.used]; + + if ((__kmp_is_user_lock_initialized_ != NULL) && + (*__kmp_is_user_lock_initialized_)(lck)) { + // Issue a warning if: KMP_CONSISTENCY_CHECK AND lock is initialized AND + // it is NOT a critical section (user is not responsible for destroying + // criticals) AND we know source location to report. + if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) && + ((loc = __kmp_get_user_lock_location(lck)) != NULL) && + (loc->psource != NULL)) { + kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false); + KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line); + __kmp_str_loc_free(&str_loc); + } + +#ifdef KMP_DEBUG + if (IS_CRITICAL(lck)) { + KA_TRACE( + 20, + ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n", + lck, *(void **)lck)); + } else { + KA_TRACE(20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck, + *(void **)lck)); + } +#endif // KMP_DEBUG + + // Cleanup internal lock dynamic resources (for drdpa locks particularly). + __kmp_destroy_user_lock(lck); + } + + // Free the lock if block allocation of locks is not used. + if (__kmp_lock_blocks == NULL) { + __kmp_free(lck); + } + } + +#undef IS_CRITICAL + + // delete lock table(s). + kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table; + __kmp_user_lock_table.table = NULL; + __kmp_user_lock_table.allocated = 0; + + while (table_ptr != NULL) { + // In the first element we saved the pointer to the previous + // (smaller) lock table. + kmp_user_lock_p *next = (kmp_user_lock_p *)(table_ptr[0]); + __kmp_free(table_ptr); + table_ptr = next; + } + + // Free buffers allocated for blocks of locks. + kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks; + __kmp_lock_blocks = NULL; + + while (block_ptr != NULL) { + kmp_block_of_locks_t *next = block_ptr->next_block; + __kmp_free(block_ptr->locks); + // *block_ptr itself was allocated at the end of the locks vector. 
+    block_ptr = next;
+  }
+
+  TCW_4(__kmp_init_user_locks, FALSE);
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
diff --git a/third_party/openmp/kmp_lock.h b/third_party/openmp/kmp_lock.h
new file mode 100644
index 000000000..f21179b4e
--- /dev/null
+++ b/third_party/openmp/kmp_lock.h
@@ -0,0 +1,1296 @@
+/*
+ * kmp_lock.h -- lock header file
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_LOCK_H
+#define KMP_LOCK_H
+
+#include <limits.h> // CHAR_BIT
+#include <stddef.h> // offsetof
+
+#include "kmp_debug.h"
+#include "kmp_os.h"
+
+#ifdef __cplusplus
+#include <atomic>
+
+extern "C" {
+#endif // __cplusplus
+
+// ----------------------------------------------------------------------------
+// Have to copy these definitions from kmp.h because kmp.h cannot be included
+// due to circular dependencies. Will undef these at end of file.
+
+#define KMP_PAD(type, sz)                                                      \
+  (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1))
+#define KMP_GTID_DNE (-2)
+
+// Forward declaration of ident and ident_t
+
+struct ident;
+typedef struct ident ident_t;
+
+// End of copied code.
+// ----------------------------------------------------------------------------
+
+// We need to know the size of the area we can assume that the compiler(s)
+// allocated for objects of type omp_lock_t and omp_nest_lock_t. The Intel
+// compiler always allocates a pointer-sized area, as does visual studio.
+//
+// gcc however, only allocates 4 bytes for regular locks, even on 64-bit
+// intel archs. It allocates at least 8 bytes for nested lock (more on
+// recent versions), but we are bounded by the pointer-sized chunks that
+// the Intel compiler allocates.
+
+#if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT)
+#define OMP_LOCK_T_SIZE sizeof(int)
+#define OMP_NEST_LOCK_T_SIZE sizeof(void *)
+#else
+#define OMP_LOCK_T_SIZE sizeof(void *)
+#define OMP_NEST_LOCK_T_SIZE sizeof(void *)
+#endif
+
+// The Intel compiler allocates a 32-byte chunk for a critical section.
+// Both gcc and visual studio only allocate enough space for a pointer.
+// Sometimes we know that the space was allocated by the Intel compiler.
+#define OMP_CRITICAL_SIZE sizeof(void *)
+#define INTEL_CRITICAL_SIZE 32
+
+// lock flags
+typedef kmp_uint32 kmp_lock_flags_t;
+
+#define kmp_lf_critical_section 1
+
+// When a lock table is used, the indices are of kmp_lock_index_t
+typedef kmp_uint32 kmp_lock_index_t;
+
+// When memory allocated for locks are on the lock pool (free list),
+// it is treated as structs of this type.
+struct kmp_lock_pool {
+  union kmp_user_lock *next;
+  kmp_lock_index_t index;
+};
+
+typedef struct kmp_lock_pool kmp_lock_pool_t;
+
+extern void __kmp_validate_locks(void);
+
+// ----------------------------------------------------------------------------
+// There are 5 lock implementations:
+// 1. Test and set locks.
+// 2. futex locks (Linux* OS on x86 and
+//    Intel(R) Many Integrated Core Architecture)
+// 3. Ticket (Lamport bakery) locks.
+// 4. Queuing locks (with separate spin fields).
+// 5. DRPA (Dynamically Reconfigurable Distributed Polling Area) locks
+//
+// and 3 lock purposes:
+// 1. Bootstrap locks -- Used for a few locks available at library
+// startup-shutdown time.
+//    These do not require non-negative global thread ID's.
+// 2. Internal RTL locks -- Used everywhere else in the RTL
+// 3. User locks (includes critical sections)
+// ----------------------------------------------------------------------------
+
+// ============================================================================
+// Lock implementations.
+//
+// Test and set locks.
+//
+// Non-nested test and set locks differ from the other lock kinds (except
+// futex) in that we use the memory allocated by the compiler for the lock,
+// rather than a pointer to it.
+//
+// On lin32, lin_32e, and win_32, the space allocated may be as small as 4
+// bytes, so we have to use a lock table for nested locks, and avoid accessing
+// the depth_locked field for non-nested locks.
+//
+// Information normally available to the tools, such as lock location, lock
+// usage (normal lock vs. critical section), etc. is not available with test and
+// set locks.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_tas_lock {
+  // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
+  std::atomic<kmp_int32> poll;
+  kmp_int32 depth_locked; // depth locked, for nested locks only
+};
+
+typedef struct kmp_base_tas_lock kmp_base_tas_lock_t;
+
+union kmp_tas_lock {
+  kmp_base_tas_lock_t lk;
+  kmp_lock_pool_t pool; // make certain struct is large enough
+  double lk_align; // use worst case alignment; no cache line padding
+};
+
+typedef union kmp_tas_lock kmp_tas_lock_t;
+
+// Static initializer for test and set lock variables. Usage:
+// kmp_tas_lock_t xlock = KMP_TAS_LOCK_INITIALIZER( xlock );
+#define KMP_TAS_LOCK_INITIALIZER(lock)                                         \
+  {                                                                            \
+    { KMP_LOCK_FREE(tas), 0 }                                                  \
+  }
+
+extern int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_tas_lock(kmp_tas_lock_t *lck);
+extern void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck);
+
+extern int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck);
+extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck);
+
+#define KMP_LOCK_RELEASED 1
+#define KMP_LOCK_STILL_HELD 0
+#define KMP_LOCK_ACQUIRED_FIRST 1
+#define KMP_LOCK_ACQUIRED_NEXT 0
+#ifndef KMP_USE_FUTEX
+#define KMP_USE_FUTEX                                                          \
+  (KMP_OS_LINUX &&                                                             \
+   (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64))
+#endif
+#if KMP_USE_FUTEX
+
+// ----------------------------------------------------------------------------
+// futex locks. futex locks are only available on Linux* OS.
+//
+// Like non-nested test and set lock, non-nested futex locks use the memory
+// allocated by the compiler for the lock, rather than a pointer to it.
+//
+// Information normally available to the tools, such as lock location, lock
+// usage (normal lock vs. critical section), etc. is not available with test and
+// set locks. With non-nested futex locks, the lock owner is not even available.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_futex_lock {
+  volatile kmp_int32 poll; // KMP_LOCK_FREE(futex) => unlocked
+  // 2*(gtid+1) of owning thread, 0 if unlocked
+  // locked: (gtid+1) of owning thread
+  kmp_int32 depth_locked; // depth locked, for nested locks only
+};
+
+typedef struct kmp_base_futex_lock kmp_base_futex_lock_t;
+
+union kmp_futex_lock {
+  kmp_base_futex_lock_t lk;
+  kmp_lock_pool_t pool; // make certain struct is large enough
+  double lk_align; // use worst case alignment
+  // no cache line padding
+};
+
+typedef union kmp_futex_lock kmp_futex_lock_t;
+
+// Static initializer for futex lock variables. Usage:
+// kmp_futex_lock_t xlock = KMP_FUTEX_LOCK_INITIALIZER( xlock );
+#define KMP_FUTEX_LOCK_INITIALIZER(lock)                                       \
+  {                                                                            \
+    { KMP_LOCK_FREE(futex), 0 }                                                \
+  }
+
+extern int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_futex_lock(kmp_futex_lock_t *lck);
+extern void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck);
+
+extern int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck,
+                                           kmp_int32 gtid);
+extern int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck,
+                                           kmp_int32 gtid);
+extern void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck);
+extern void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck);
+
+#endif // KMP_USE_FUTEX
+
+// ----------------------------------------------------------------------------
+// Ticket locks.
+
+#ifdef __cplusplus
+
+#ifdef _MSC_VER
+// MSVC won't allow use of std::atomic<> in a union since it has non-trivial
+// copy constructor.
+
+struct kmp_base_ticket_lock {
+  // `initialized' must be the first entry in the lock data structure!
+  std::atomic_bool initialized;
+  volatile union kmp_ticket_lock *self; // points to the lock union
+  ident_t const *location; // Source code location of omp_init_lock().
+  std::atomic_uint
+      next_ticket; // ticket number to give to next thread which acquires
+  std::atomic_uint now_serving; // ticket number for thread which holds the lock
+  std::atomic_int owner_id; // (gtid+1) of owning thread, 0 if unlocked
+  std::atomic_int depth_locked; // depth locked, for nested locks only
+  kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock
+};
+#else
+struct kmp_base_ticket_lock {
+  // `initialized' must be the first entry in the lock data structure!
+  std::atomic<bool> initialized;
+  volatile union kmp_ticket_lock *self; // points to the lock union
+  ident_t const *location; // Source code location of omp_init_lock().
+  std::atomic<unsigned>
+      next_ticket; // ticket number to give to next thread which acquires
+  std::atomic<unsigned>
+      now_serving; // ticket number for thread which holds the lock
+  std::atomic<int> owner_id; // (gtid+1) of owning thread, 0 if unlocked
+  std::atomic<int> depth_locked; // depth locked, for nested locks only
+  kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock
+};
+#endif
+
+#else // __cplusplus
+
+struct kmp_base_ticket_lock;
+
+#endif // !__cplusplus
+
+typedef struct kmp_base_ticket_lock kmp_base_ticket_lock_t;
+
+union KMP_ALIGN_CACHE kmp_ticket_lock {
+  kmp_base_ticket_lock_t
+      lk; // This field must be first to allow static initializing.
+ kmp_lock_pool_t pool; + double lk_align; // use worst case alignment + char lk_pad[KMP_PAD(kmp_base_ticket_lock_t, CACHE_LINE)]; +}; + +typedef union kmp_ticket_lock kmp_ticket_lock_t; + +// Static initializer for simple ticket lock variables. Usage: +// kmp_ticket_lock_t xlock = KMP_TICKET_LOCK_INITIALIZER( xlock ); +// Note the macro argument. It is important to make var properly initialized. +#define KMP_TICKET_LOCK_INITIALIZER(lock) \ + { \ + { true, &(lock), NULL, 0U, 0U, 0, -1 } \ + } + +extern int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); +extern int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); +extern int __kmp_test_ticket_lock_with_cheks(kmp_ticket_lock_t *lck, + kmp_int32 gtid); +extern int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); +extern void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck); +extern void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck); + +extern int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, + kmp_int32 gtid); +extern int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, + kmp_int32 gtid); +extern int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, + kmp_int32 gtid); +extern void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck); +extern void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck); + +// ---------------------------------------------------------------------------- +// Queuing locks. + +#if KMP_USE_ADAPTIVE_LOCKS + +struct kmp_adaptive_lock_info; + +typedef struct kmp_adaptive_lock_info kmp_adaptive_lock_info_t; + +#if KMP_DEBUG_ADAPTIVE_LOCKS + +struct kmp_adaptive_lock_statistics { + /* So we can get stats from locks that haven't been destroyed. */ + kmp_adaptive_lock_info_t *next; + kmp_adaptive_lock_info_t *prev; + + /* Other statistics */ + kmp_uint32 successfulSpeculations; + kmp_uint32 hardFailedSpeculations; + kmp_uint32 softFailedSpeculations; + kmp_uint32 nonSpeculativeAcquires; + kmp_uint32 nonSpeculativeAcquireAttempts; + kmp_uint32 lemmingYields; +}; + +typedef struct kmp_adaptive_lock_statistics kmp_adaptive_lock_statistics_t; + +extern void __kmp_print_speculative_stats(); +extern void __kmp_init_speculative_stats(); + +#endif // KMP_DEBUG_ADAPTIVE_LOCKS + +struct kmp_adaptive_lock_info { + /* Values used for adaptivity. + Although these are accessed from multiple threads we don't access them + atomically, because if we miss updates it probably doesn't matter much. (It + just affects our decision about whether to try speculation on the lock). */ + kmp_uint32 volatile badness; + kmp_uint32 volatile acquire_attempts; + /* Parameters of the lock. */ + kmp_uint32 max_badness; + kmp_uint32 max_soft_retries; + +#if KMP_DEBUG_ADAPTIVE_LOCKS + kmp_adaptive_lock_statistics_t volatile stats; +#endif +}; + +#endif // KMP_USE_ADAPTIVE_LOCKS + +struct kmp_base_queuing_lock { + + // `initialized' must be the first entry in the lock data structure! + volatile union kmp_queuing_lock + *initialized; // Points to the lock union if in initialized state. + + ident_t const *location; // Source code location of omp_init_lock(). + + KMP_ALIGN(8) // tail_id must be 8-byte aligned! 
+ + volatile kmp_int32 + tail_id; // (gtid+1) of thread at tail of wait queue, 0 if empty + // Must be no padding here since head/tail used in 8-byte CAS + volatile kmp_int32 + head_id; // (gtid+1) of thread at head of wait queue, 0 if empty + // Decl order assumes little endian + // bakery-style lock + volatile kmp_uint32 + next_ticket; // ticket number to give to next thread which acquires + volatile kmp_uint32 + now_serving; // ticket number for thread which holds the lock + volatile kmp_int32 owner_id; // (gtid+1) of owning thread, 0 if unlocked + kmp_int32 depth_locked; // depth locked, for nested locks only + + kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock +}; + +typedef struct kmp_base_queuing_lock kmp_base_queuing_lock_t; + +KMP_BUILD_ASSERT(offsetof(kmp_base_queuing_lock_t, tail_id) % 8 == 0); + +union KMP_ALIGN_CACHE kmp_queuing_lock { + kmp_base_queuing_lock_t + lk; // This field must be first to allow static initializing. + kmp_lock_pool_t pool; + double lk_align; // use worst case alignment + char lk_pad[KMP_PAD(kmp_base_queuing_lock_t, CACHE_LINE)]; +}; + +typedef union kmp_queuing_lock kmp_queuing_lock_t; + +extern int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); +extern int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); +extern int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); +extern void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck); +extern void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck); + +extern int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid); +extern int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid); +extern int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, + kmp_int32 gtid); +extern void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck); +extern void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck); + +#if KMP_USE_ADAPTIVE_LOCKS + +// ---------------------------------------------------------------------------- +// Adaptive locks. +struct kmp_base_adaptive_lock { + kmp_base_queuing_lock qlk; + KMP_ALIGN(CACHE_LINE) + kmp_adaptive_lock_info_t + adaptive; // Information for the speculative adaptive lock +}; + +typedef struct kmp_base_adaptive_lock kmp_base_adaptive_lock_t; + +union KMP_ALIGN_CACHE kmp_adaptive_lock { + kmp_base_adaptive_lock_t lk; + kmp_lock_pool_t pool; + double lk_align; + char lk_pad[KMP_PAD(kmp_base_adaptive_lock_t, CACHE_LINE)]; +}; +typedef union kmp_adaptive_lock kmp_adaptive_lock_t; + +#define GET_QLK_PTR(l) ((kmp_queuing_lock_t *)&(l)->lk.qlk) + +#endif // KMP_USE_ADAPTIVE_LOCKS + +// ---------------------------------------------------------------------------- +// DRDPA ticket locks. +struct kmp_base_drdpa_lock { + // All of the fields on the first cache line are only written when + // initializing or reconfiguring the lock. These are relatively rare + // operations, so data from the first cache line will usually stay resident in + // the cache of each thread trying to acquire the lock. + // + // initialized must be the first entry in the lock data structure! + KMP_ALIGN_CACHE + + volatile union kmp_drdpa_lock + *initialized; // points to the lock union if in initialized state + ident_t const *location; // Source code location of omp_init_lock(). 
+  std::atomic<std::atomic<kmp_uint64> *> polls;
+  std::atomic<kmp_uint64> mask; // is 2**num_polls-1 for mod op
+  kmp_uint64 cleanup_ticket; // thread with cleanup ticket
+  std::atomic<kmp_uint64> *old_polls; // will deallocate old_polls
+  kmp_uint32 num_polls; // must be power of 2
+
+  // next_ticket it needs to exist in a separate cache line, as it is
+  // invalidated every time a thread takes a new ticket.
+  KMP_ALIGN_CACHE
+
+  std::atomic<kmp_uint64> next_ticket;
+
+  // now_serving is used to store our ticket value while we hold the lock. It
+  // has a slightly different meaning in the DRDPA ticket locks (where it is
+  // written by the acquiring thread) than it does in the simple ticket locks
+  // (where it is written by the releasing thread).
+  //
+  // Since now_serving is only read and written in the critical section,
+  // it is non-volatile, but it needs to exist on a separate cache line,
+  // as it is invalidated at every lock acquire.
+  //
+  // Likewise, the vars used for nested locks (owner_id and depth_locked) are
+  // only written by the thread owning the lock, so they are put in this cache
+  // line. owner_id is read by other threads, so it must be declared volatile.
+  KMP_ALIGN_CACHE
+  kmp_uint64 now_serving; // doesn't have to be volatile
+  volatile kmp_uint32 owner_id; // (gtid+1) of owning thread, 0 if unlocked
+  kmp_int32 depth_locked; // depth locked
+  kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock
+};
+
+typedef struct kmp_base_drdpa_lock kmp_base_drdpa_lock_t;
+
+union KMP_ALIGN_CACHE kmp_drdpa_lock {
+  kmp_base_drdpa_lock_t
+      lk; // This field must be first to allow static initializing. */
+  kmp_lock_pool_t pool;
+  double lk_align; // use worst case alignment
+  char lk_pad[KMP_PAD(kmp_base_drdpa_lock_t, CACHE_LINE)];
+};
+
+typedef union kmp_drdpa_lock kmp_drdpa_lock_t;
+
+extern int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck);
+extern void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck);
+
+extern int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck,
+                                           kmp_int32 gtid);
+extern int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck,
+                                           kmp_int32 gtid);
+extern void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck);
+extern void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck);
+
+// ============================================================================
+// Lock purposes.
+// ============================================================================
+
+// Bootstrap locks.
+//
+// Bootstrap locks -- very few locks used at library initialization time.
+// Bootstrap locks are currently implemented as ticket locks.
+// They could also be implemented as test and set lock, but cannot be
+// implemented with other lock kinds as they require gtids which are not
+// available at initialization time.
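+//
+// Illustrative sketch (not upstream text): with the typedef and wrappers
+// declared just below, a bootstrap lock guarding one-time startup work would
+// look roughly like this; the lock and helper names here are hypothetical.
+//
+//   static kmp_bootstrap_lock_t my_boot_lock =
+//       KMP_BOOTSTRAP_LOCK_INITIALIZER(my_boot_lock);
+//
+//   static void my_one_time_setup(void) {
+//     __kmp_acquire_bootstrap_lock(&my_boot_lock); // no gtid required yet
+//     // ... perform startup work here ...
+//     __kmp_release_bootstrap_lock(&my_boot_lock);
+//   }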
+ +typedef kmp_ticket_lock_t kmp_bootstrap_lock_t; + +#define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock) KMP_TICKET_LOCK_INITIALIZER((lock)) +#define KMP_BOOTSTRAP_LOCK_INIT(lock) \ + kmp_bootstrap_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock) + +static inline int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + return __kmp_acquire_ticket_lock(lck, KMP_GTID_DNE); +} + +static inline int __kmp_test_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + return __kmp_test_ticket_lock(lck, KMP_GTID_DNE); +} + +static inline void __kmp_release_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + __kmp_release_ticket_lock(lck, KMP_GTID_DNE); +} + +static inline void __kmp_init_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + __kmp_init_ticket_lock(lck); +} + +static inline void __kmp_destroy_bootstrap_lock(kmp_bootstrap_lock_t *lck) { + __kmp_destroy_ticket_lock(lck); +} + +// Internal RTL locks. +// +// Internal RTL locks are also implemented as ticket locks, for now. +// +// FIXME - We should go through and figure out which lock kind works best for +// each internal lock, and use the type declaration and function calls for +// that explicit lock kind (and get rid of this section). + +typedef kmp_ticket_lock_t kmp_lock_t; + +#define KMP_LOCK_INIT(lock) kmp_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock) + +static inline int __kmp_acquire_lock(kmp_lock_t *lck, kmp_int32 gtid) { + return __kmp_acquire_ticket_lock(lck, gtid); +} + +static inline int __kmp_test_lock(kmp_lock_t *lck, kmp_int32 gtid) { + return __kmp_test_ticket_lock(lck, gtid); +} + +static inline void __kmp_release_lock(kmp_lock_t *lck, kmp_int32 gtid) { + __kmp_release_ticket_lock(lck, gtid); +} + +static inline void __kmp_init_lock(kmp_lock_t *lck) { + __kmp_init_ticket_lock(lck); +} + +static inline void __kmp_destroy_lock(kmp_lock_t *lck) { + __kmp_destroy_ticket_lock(lck); +} + +// User locks. +// +// Do not allocate objects of type union kmp_user_lock!!! This will waste space +// unless __kmp_user_lock_kind == lk_drdpa. Instead, check the value of +// __kmp_user_lock_kind and allocate objects of the type of the appropriate +// union member, and cast their addresses to kmp_user_lock_p. 
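+//
+// Illustrative sketch (not upstream text): once __kmp_set_user_lock_vptrs()
+// has chosen a lock kind, __kmp_user_lock_size holds the size of the selected
+// union member, so a user lock is obtained and used roughly as follows (the
+// allocation line mirrors what __kmp_user_lock_allocate() does when the lock
+// pool is empty):
+//
+//   kmp_user_lock_p lck =
+//       (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size);
+//   __kmp_init_user_lock_with_checks(lck);
+//   // ... acquire/release through the vtbl wrappers ...
+//   __kmp_destroy_user_lock_with_checks(lck);
+//   __kmp_free(lck);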
+ +enum kmp_lock_kind { + lk_default = 0, + lk_tas, +#if KMP_USE_FUTEX + lk_futex, +#endif +#if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX + lk_hle, + lk_rtm_queuing, + lk_rtm_spin, +#endif + lk_ticket, + lk_queuing, + lk_drdpa, +#if KMP_USE_ADAPTIVE_LOCKS + lk_adaptive +#endif // KMP_USE_ADAPTIVE_LOCKS +}; + +typedef enum kmp_lock_kind kmp_lock_kind_t; + +extern kmp_lock_kind_t __kmp_user_lock_kind; + +union kmp_user_lock { + kmp_tas_lock_t tas; +#if KMP_USE_FUTEX + kmp_futex_lock_t futex; +#endif + kmp_ticket_lock_t ticket; + kmp_queuing_lock_t queuing; + kmp_drdpa_lock_t drdpa; +#if KMP_USE_ADAPTIVE_LOCKS + kmp_adaptive_lock_t adaptive; +#endif // KMP_USE_ADAPTIVE_LOCKS + kmp_lock_pool_t pool; +}; + +typedef union kmp_user_lock *kmp_user_lock_p; + +#if !KMP_USE_DYNAMIC_LOCK + +extern size_t __kmp_base_user_lock_size; +extern size_t __kmp_user_lock_size; + +extern kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck); + +static inline kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck) { + KMP_DEBUG_ASSERT(__kmp_get_user_lock_owner_ != NULL); + return (*__kmp_get_user_lock_owner_)(lck); +} + +extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid); + +#if KMP_OS_LINUX && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) + +#define __kmp_acquire_user_lock_with_checks(lck, gtid) \ + if (__kmp_user_lock_kind == lk_tas) { \ + if (__kmp_env_consistency_check) { \ + char const *const func = "omp_set_lock"; \ + if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && \ + lck->tas.lk.depth_locked != -1) { \ + KMP_FATAL(LockNestableUsedAsSimple, func); \ + } \ + if ((gtid >= 0) && (lck->tas.lk.poll - 1 == gtid)) { \ + KMP_FATAL(LockIsAlreadyOwned, func); \ + } \ + } \ + if (lck->tas.lk.poll != 0 || \ + !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ + kmp_uint32 spins; \ + kmp_uint64 time; \ + KMP_FSYNC_PREPARE(lck); \ + KMP_INIT_YIELD(spins); \ + KMP_INIT_BACKOFF(time); \ + do { \ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \ + } while ( \ + lck->tas.lk.poll != 0 || \ + !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ + } \ + KMP_FSYNC_ACQUIRED(lck); \ + } else { \ + KMP_DEBUG_ASSERT(__kmp_acquire_user_lock_with_checks_ != NULL); \ + (*__kmp_acquire_user_lock_with_checks_)(lck, gtid); \ + } + +#else +static inline int __kmp_acquire_user_lock_with_checks(kmp_user_lock_p lck, + kmp_int32 gtid) { + KMP_DEBUG_ASSERT(__kmp_acquire_user_lock_with_checks_ != NULL); + return (*__kmp_acquire_user_lock_with_checks_)(lck, gtid); +} +#endif + +extern int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid); + +#if KMP_OS_LINUX && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) + +#include "kmp_i18n.h" /* AC: KMP_FATAL definition */ +extern int __kmp_env_consistency_check; /* AC: copy from kmp.h here */ +static inline int __kmp_test_user_lock_with_checks(kmp_user_lock_p lck, + kmp_int32 gtid) { + if (__kmp_user_lock_kind == lk_tas) { + if (__kmp_env_consistency_check) { + char const *const func = "omp_test_lock"; + if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && + lck->tas.lk.depth_locked != -1) { + KMP_FATAL(LockNestableUsedAsSimple, func); + } + } + return ((lck->tas.lk.poll == 0) && + __kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); + } else { + KMP_DEBUG_ASSERT(__kmp_test_user_lock_with_checks_ != NULL); + return (*__kmp_test_user_lock_with_checks_)(lck, gtid); + } +} +#else +static inline int __kmp_test_user_lock_with_checks(kmp_user_lock_p 
lck, + kmp_int32 gtid) { + KMP_DEBUG_ASSERT(__kmp_test_user_lock_with_checks_ != NULL); + return (*__kmp_test_user_lock_with_checks_)(lck, gtid); +} +#endif + +extern int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid); + +static inline void __kmp_release_user_lock_with_checks(kmp_user_lock_p lck, + kmp_int32 gtid) { + KMP_DEBUG_ASSERT(__kmp_release_user_lock_with_checks_ != NULL); + (*__kmp_release_user_lock_with_checks_)(lck, gtid); +} + +extern void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck); + +static inline void __kmp_init_user_lock_with_checks(kmp_user_lock_p lck) { + KMP_DEBUG_ASSERT(__kmp_init_user_lock_with_checks_ != NULL); + (*__kmp_init_user_lock_with_checks_)(lck); +} + +// We need a non-checking version of destroy lock for when the RTL is +// doing the cleanup as it can't always tell if the lock is nested or not. +extern void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck); + +static inline void __kmp_destroy_user_lock(kmp_user_lock_p lck) { + KMP_DEBUG_ASSERT(__kmp_destroy_user_lock_ != NULL); + (*__kmp_destroy_user_lock_)(lck); +} + +extern void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck); + +static inline void __kmp_destroy_user_lock_with_checks(kmp_user_lock_p lck) { + KMP_DEBUG_ASSERT(__kmp_destroy_user_lock_with_checks_ != NULL); + (*__kmp_destroy_user_lock_with_checks_)(lck); +} + +extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid); + +#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) + +#define __kmp_acquire_nested_user_lock_with_checks(lck, gtid, depth) \ + if (__kmp_user_lock_kind == lk_tas) { \ + if (__kmp_env_consistency_check) { \ + char const *const func = "omp_set_nest_lock"; \ + if ((sizeof(kmp_tas_lock_t) <= OMP_NEST_LOCK_T_SIZE) && \ + lck->tas.lk.depth_locked == -1) { \ + KMP_FATAL(LockSimpleUsedAsNestable, func); \ + } \ + } \ + if (lck->tas.lk.poll - 1 == gtid) { \ + lck->tas.lk.depth_locked += 1; \ + *depth = KMP_LOCK_ACQUIRED_NEXT; \ + } else { \ + if ((lck->tas.lk.poll != 0) || \ + !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ + kmp_uint32 spins; \ + kmp_uint64 time; \ + KMP_FSYNC_PREPARE(lck); \ + KMP_INIT_YIELD(spins); \ + KMP_INIT_BACKOFF(time); \ + do { \ + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \ + } while ( \ + (lck->tas.lk.poll != 0) || \ + !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ + } \ + lck->tas.lk.depth_locked = 1; \ + *depth = KMP_LOCK_ACQUIRED_FIRST; \ + } \ + KMP_FSYNC_ACQUIRED(lck); \ + } else { \ + KMP_DEBUG_ASSERT(__kmp_acquire_nested_user_lock_with_checks_ != NULL); \ + *depth = (*__kmp_acquire_nested_user_lock_with_checks_)(lck, gtid); \ + } + +#else +static inline void +__kmp_acquire_nested_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid, + int *depth) { + KMP_DEBUG_ASSERT(__kmp_acquire_nested_user_lock_with_checks_ != NULL); + *depth = (*__kmp_acquire_nested_user_lock_with_checks_)(lck, gtid); +} +#endif + +extern int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid); + +#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) +static inline int __kmp_test_nested_user_lock_with_checks(kmp_user_lock_p lck, + kmp_int32 gtid) { + if (__kmp_user_lock_kind == lk_tas) { + int retval; + if (__kmp_env_consistency_check) { + char const *const func = "omp_test_nest_lock"; + if ((sizeof(kmp_tas_lock_t) <= OMP_NEST_LOCK_T_SIZE) && + lck->tas.lk.depth_locked == -1) { + KMP_FATAL(LockSimpleUsedAsNestable, func); + } + } + KMP_DEBUG_ASSERT(gtid 
>= 0); + if (lck->tas.lk.poll - 1 == + gtid) { /* __kmp_get_tas_lock_owner( lck ) == gtid */ + return ++lck->tas.lk.depth_locked; /* same owner, depth increased */ + } + retval = ((lck->tas.lk.poll == 0) && + __kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); + if (retval) { + KMP_MB(); + lck->tas.lk.depth_locked = 1; + } + return retval; + } else { + KMP_DEBUG_ASSERT(__kmp_test_nested_user_lock_with_checks_ != NULL); + return (*__kmp_test_nested_user_lock_with_checks_)(lck, gtid); + } +} +#else +static inline int __kmp_test_nested_user_lock_with_checks(kmp_user_lock_p lck, + kmp_int32 gtid) { + KMP_DEBUG_ASSERT(__kmp_test_nested_user_lock_with_checks_ != NULL); + return (*__kmp_test_nested_user_lock_with_checks_)(lck, gtid); +} +#endif + +extern int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck, + kmp_int32 gtid); + +static inline int +__kmp_release_nested_user_lock_with_checks(kmp_user_lock_p lck, + kmp_int32 gtid) { + KMP_DEBUG_ASSERT(__kmp_release_nested_user_lock_with_checks_ != NULL); + return (*__kmp_release_nested_user_lock_with_checks_)(lck, gtid); +} + +extern void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck); + +static inline void +__kmp_init_nested_user_lock_with_checks(kmp_user_lock_p lck) { + KMP_DEBUG_ASSERT(__kmp_init_nested_user_lock_with_checks_ != NULL); + (*__kmp_init_nested_user_lock_with_checks_)(lck); +} + +extern void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck); + +static inline void +__kmp_destroy_nested_user_lock_with_checks(kmp_user_lock_p lck) { + KMP_DEBUG_ASSERT(__kmp_destroy_nested_user_lock_with_checks_ != NULL); + (*__kmp_destroy_nested_user_lock_with_checks_)(lck); +} + +// user lock functions which do not necessarily exist for all lock kinds. +// +// The "set" functions usually have wrapper routines that check for a NULL set +// function pointer and call it if non-NULL. +// +// In some cases, it makes sense to have a "get" wrapper function check for a +// NULL get function pointer and return NULL / invalid value / error code if +// the function pointer is NULL. +// +// In other cases, the calling code really should differentiate between an +// unimplemented function and one that is implemented but returning NULL / +// invalid value. If this is the case, no get function wrapper exists. + +extern int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck); + +// no set function; fields set during local allocation + +extern const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck); + +static inline const ident_t *__kmp_get_user_lock_location(kmp_user_lock_p lck) { + if (__kmp_get_user_lock_location_ != NULL) { + return (*__kmp_get_user_lock_location_)(lck); + } else { + return NULL; + } +} + +extern void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck, + const ident_t *loc); + +static inline void __kmp_set_user_lock_location(kmp_user_lock_p lck, + const ident_t *loc) { + if (__kmp_set_user_lock_location_ != NULL) { + (*__kmp_set_user_lock_location_)(lck, loc); + } +} + +extern kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck); + +extern void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck, + kmp_lock_flags_t flags); + +static inline void __kmp_set_user_lock_flags(kmp_user_lock_p lck, + kmp_lock_flags_t flags) { + if (__kmp_set_user_lock_flags_ != NULL) { + (*__kmp_set_user_lock_flags_)(lck, flags); + } +} + +// The function which sets up all of the vtbl pointers for kmp_user_lock_t. 
+extern void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind);
+
+// Macros for binding user lock functions.
+#define KMP_BIND_USER_LOCK_TEMPLATE(nest, kind, suffix)                        \
+  {                                                                            \
+    __kmp_acquire##nest##user_lock_with_checks_ = (int (*)(                    \
+        kmp_user_lock_p, kmp_int32))__kmp_acquire##nest##kind##_##suffix;      \
+    __kmp_release##nest##user_lock_with_checks_ = (int (*)(                    \
+        kmp_user_lock_p, kmp_int32))__kmp_release##nest##kind##_##suffix;      \
+    __kmp_test##nest##user_lock_with_checks_ = (int (*)(                       \
+        kmp_user_lock_p, kmp_int32))__kmp_test##nest##kind##_##suffix;         \
+    __kmp_init##nest##user_lock_with_checks_ =                                 \
+        (void (*)(kmp_user_lock_p))__kmp_init##nest##kind##_##suffix;          \
+    __kmp_destroy##nest##user_lock_with_checks_ =                              \
+        (void (*)(kmp_user_lock_p))__kmp_destroy##nest##kind##_##suffix;       \
+  }
+
+#define KMP_BIND_USER_LOCK(kind) KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock)
+#define KMP_BIND_USER_LOCK_WITH_CHECKS(kind)                                   \
+  KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock_with_checks)
+#define KMP_BIND_NESTED_USER_LOCK(kind)                                        \
+  KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock)
+#define KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(kind)                            \
+  KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock_with_checks)
+
+// User lock table & lock allocation
+/* On 64-bit Linux* OS (and OS X*) GNU compiler allocates only 4 bytes of
+   memory for lock variable, which is not enough to store a pointer, so we have
+   to use lock indexes instead of pointers and maintain lock table to map
+   indexes to pointers.
+
+
+   Note: The first element of the table is not a pointer to lock! It is a
+   pointer to previously allocated table (or NULL if it is the first table).
+
+   Usage:
+
+   if ( OMP_LOCK_T_SIZE < sizeof( <lock> ) ) { // or OMP_NEST_LOCK_T_SIZE
+       Lock table is fully utilized. User locks are indexes, so table is used on
+       user lock operation.
+       Note: it may be the case (lin_32) that we don't need to use a lock
+       table for regular locks, but do need the table for nested locks.
+   }
+   else {
+       Lock table initialized but not actually used.
+   }
+*/
+
+struct kmp_lock_table {
+  kmp_lock_index_t used; // Number of used elements
+  kmp_lock_index_t allocated; // Number of allocated elements
+  kmp_user_lock_p *table; // Lock table.
+};
+
+typedef struct kmp_lock_table kmp_lock_table_t;
+
+extern kmp_lock_table_t __kmp_user_lock_table;
+extern kmp_user_lock_p __kmp_lock_pool;
+
+struct kmp_block_of_locks {
+  struct kmp_block_of_locks *next_block;
+  void *locks;
+};
+
+typedef struct kmp_block_of_locks kmp_block_of_locks_t;
+
+extern kmp_block_of_locks_t *__kmp_lock_blocks;
+extern int __kmp_num_locks_in_block;
+
+extern kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock,
+                                                kmp_int32 gtid,
+                                                kmp_lock_flags_t flags);
+extern void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
+                                 kmp_user_lock_p lck);
+extern kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock,
+                                              char const *func);
+extern void __kmp_cleanup_user_locks();
+
+#define KMP_CHECK_USER_LOCK_INIT()                                             \
+  {                                                                            \
+    if (!TCR_4(__kmp_init_user_locks)) {                                       \
+      __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);                         \
+      if (!TCR_4(__kmp_init_user_locks)) {                                     \
+        TCW_4(__kmp_init_user_locks, TRUE);                                    \
+      }                                                                        \
+      __kmp_release_bootstrap_lock(&__kmp_initz_lock);                         \
+    }                                                                          \
+  }
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+#undef KMP_PAD
+#undef KMP_GTID_DNE
+
+#if KMP_USE_DYNAMIC_LOCK
+// KMP_USE_DYNAMIC_LOCK enables dynamic dispatch of lock functions without
+// breaking the current compatibility. Essential functionality of this new code
+// is dynamic dispatch, but it also implements (or enables implementation of)
+// hinted user lock and critical section which will be part of OMP 4.5 soon.
+//
+// Lock type can be decided at creation time (i.e., lock initialization), and
+// subsequent lock function call on the created lock object requires type
+// extraction and call through jump table using the extracted type. This type
+// information is stored in two different ways depending on the size of the lock
+// object, and we differentiate lock types by this size requirement - direct and
+// indirect locks.
+//
+// Direct locks:
+// A direct lock object fits into the space created by the compiler for an
+// omp_lock_t object, and TAS/Futex lock falls into this category. We use low
+// one byte of the lock object as the storage for the lock type, and appropriate
+// bit operation is required to access the data meaningful to the lock
+// algorithms. Also, to differentiate direct lock from indirect lock, 1 is
+// written to LSB of the lock object. The newly introduced "hle" lock is also a
+// direct lock.
+//
+// Indirect locks:
+// An indirect lock object requires more space than the compiler-generated
+// space, and it should be allocated from heap. Depending on the size of the
+// compiler-generated space for the lock (i.e., size of omp_lock_t), this
+// omp_lock_t object stores either the address of the heap-allocated indirect
+// lock (void * fits in the object) or an index to the indirect lock table entry
+// that holds the address. Ticket/Queuing/DRDPA/Adaptive lock falls into this
+// category, and the newly introduced "rtm" lock is also an indirect lock which
+// was implemented on top of the Queuing lock. When the omp_lock_t object holds
+// an index (not lock address), 0 is written to LSB to differentiate the lock
+// from a direct lock, and the remaining part is the actual index to the
+// indirect lock table.
+
+#include <stdint.h> // for uintptr_t
+
+// Shortcuts
+#define KMP_USE_INLINED_TAS                                                    \
+  (KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)) && 1
+#define KMP_USE_INLINED_FUTEX KMP_USE_FUTEX && 0
+
+// List of lock definitions; all nested locks are indirect locks.
+// hle lock is xchg lock prefixed with XACQUIRE/XRELEASE.
+// All nested locks are indirect lock types.
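+//
+// Illustrative worked example (not upstream text), using the tag macros defined
+// further below: while the thread with gtid 5 holds a direct TAS lock, the
+// omp_lock_t word holds KMP_LOCK_BUSY(5 + 1, tas), i.e.
+// ((5 + 1) << KMP_LOCK_SHIFT) | locktag_tas, so its LSB is 1 and
+// KMP_EXTRACT_D_TAG() recovers locktag_tas. An indirect lock referenced by
+// table index stores (index << 1), so its LSB is 0 and KMP_EXTRACT_I_INDEX()
+// shifts the word right by one to recover the index into the indirect lock
+// table.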
+#if KMP_USE_TSX +#if KMP_USE_FUTEX +#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a) m(rtm_spin, a) +#define KMP_FOREACH_I_LOCK(m, a) \ + m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm_queuing, a) \ + m(nested_tas, a) m(nested_futex, a) m(nested_ticket, a) \ + m(nested_queuing, a) m(nested_drdpa, a) +#else +#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(hle, a) m(rtm_spin, a) +#define KMP_FOREACH_I_LOCK(m, a) \ + m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm_queuing, a) \ + m(nested_tas, a) m(nested_ticket, a) m(nested_queuing, a) \ + m(nested_drdpa, a) +#endif // KMP_USE_FUTEX +#define KMP_LAST_D_LOCK lockseq_rtm_spin +#else +#if KMP_USE_FUTEX +#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) +#define KMP_FOREACH_I_LOCK(m, a) \ + m(ticket, a) m(queuing, a) m(drdpa, a) m(nested_tas, a) m(nested_futex, a) \ + m(nested_ticket, a) m(nested_queuing, a) m(nested_drdpa, a) +#define KMP_LAST_D_LOCK lockseq_futex +#else +#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) +#define KMP_FOREACH_I_LOCK(m, a) \ + m(ticket, a) m(queuing, a) m(drdpa, a) m(nested_tas, a) m(nested_ticket, a) \ + m(nested_queuing, a) m(nested_drdpa, a) +#define KMP_LAST_D_LOCK lockseq_tas +#endif // KMP_USE_FUTEX +#endif // KMP_USE_TSX + +// Information used in dynamic dispatch +#define KMP_LOCK_SHIFT \ + 8 // number of low bits to be used as tag for direct locks +#define KMP_FIRST_D_LOCK lockseq_tas +#define KMP_FIRST_I_LOCK lockseq_ticket +#define KMP_LAST_I_LOCK lockseq_nested_drdpa +#define KMP_NUM_I_LOCKS \ + (locktag_nested_drdpa + 1) // number of indirect lock types + +// Base type for dynamic locks. +typedef kmp_uint32 kmp_dyna_lock_t; + +// Lock sequence that enumerates all lock kinds. Always make this enumeration +// consistent with kmp_lockseq_t in the include directory. +typedef enum { + lockseq_indirect = 0, +#define expand_seq(l, a) lockseq_##l, + KMP_FOREACH_D_LOCK(expand_seq, 0) KMP_FOREACH_I_LOCK(expand_seq, 0) +#undef expand_seq +} kmp_dyna_lockseq_t; + +// Enumerates indirect lock tags. +typedef enum { +#define expand_tag(l, a) locktag_##l, + KMP_FOREACH_I_LOCK(expand_tag, 0) +#undef expand_tag +} kmp_indirect_locktag_t; + +// Utility macros that extract information from lock sequences. +#define KMP_IS_D_LOCK(seq) \ + ((seq) >= KMP_FIRST_D_LOCK && (seq) <= KMP_LAST_D_LOCK) +#define KMP_IS_I_LOCK(seq) \ + ((seq) >= KMP_FIRST_I_LOCK && (seq) <= KMP_LAST_I_LOCK) +#define KMP_GET_I_TAG(seq) (kmp_indirect_locktag_t)((seq)-KMP_FIRST_I_LOCK) +#define KMP_GET_D_TAG(seq) ((seq) << 1 | 1) + +// Enumerates direct lock tags starting from indirect tag. +typedef enum { +#define expand_tag(l, a) locktag_##l = KMP_GET_D_TAG(lockseq_##l), + KMP_FOREACH_D_LOCK(expand_tag, 0) +#undef expand_tag +} kmp_direct_locktag_t; + +// Indirect lock type +typedef struct { + kmp_user_lock_p lock; + kmp_indirect_locktag_t type; +} kmp_indirect_lock_t; + +// Function tables for direct locks. Set/unset/test differentiate functions +// with/without consistency checking. +extern void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t); +extern void (**__kmp_direct_destroy)(kmp_dyna_lock_t *); +extern int (**__kmp_direct_set)(kmp_dyna_lock_t *, kmp_int32); +extern int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32); +extern int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32); + +// Function tables for indirect locks. Set/unset/test differentiate functions +// with/without consistency checking. 
+extern void (*__kmp_indirect_init[])(kmp_user_lock_p); +extern void (**__kmp_indirect_destroy)(kmp_user_lock_p); +extern int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32); +extern int (**__kmp_indirect_unset)(kmp_user_lock_p, kmp_int32); +extern int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32); + +// Extracts direct lock tag from a user lock pointer +#define KMP_EXTRACT_D_TAG(l) \ + (*((kmp_dyna_lock_t *)(l)) & ((1 << KMP_LOCK_SHIFT) - 1) & \ + -(*((kmp_dyna_lock_t *)(l)) & 1)) + +// Extracts indirect lock index from a user lock pointer +#define KMP_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1) + +// Returns function pointer to the direct lock function with l (kmp_dyna_lock_t +// *) and op (operation type). +#define KMP_D_LOCK_FUNC(l, op) __kmp_direct_##op[KMP_EXTRACT_D_TAG(l)] + +// Returns function pointer to the indirect lock function with l +// (kmp_indirect_lock_t *) and op (operation type). +#define KMP_I_LOCK_FUNC(l, op) \ + __kmp_indirect_##op[((kmp_indirect_lock_t *)(l))->type] + +// Initializes a direct lock with the given lock pointer and lock sequence. +#define KMP_INIT_D_LOCK(l, seq) \ + __kmp_direct_init[KMP_GET_D_TAG(seq)]((kmp_dyna_lock_t *)l, seq) + +// Initializes an indirect lock with the given lock pointer and lock sequence. +#define KMP_INIT_I_LOCK(l, seq) \ + __kmp_direct_init[0]((kmp_dyna_lock_t *)(l), seq) + +// Returns "free" lock value for the given lock type. +#define KMP_LOCK_FREE(type) (locktag_##type) + +// Returns "busy" lock value for the given lock teyp. +#define KMP_LOCK_BUSY(v, type) ((v) << KMP_LOCK_SHIFT | locktag_##type) + +// Returns lock value after removing (shifting) lock tag. +#define KMP_LOCK_STRIP(v) ((v) >> KMP_LOCK_SHIFT) + +// Initializes global states and data structures for managing dynamic user +// locks. +extern void __kmp_init_dynamic_user_locks(); + +// Allocates and returns an indirect lock with the given indirect lock tag. +extern kmp_indirect_lock_t * +__kmp_allocate_indirect_lock(void **, kmp_int32, kmp_indirect_locktag_t); + +// Cleans up global states and data structures for managing dynamic user locks. +extern void __kmp_cleanup_indirect_user_locks(); + +// Default user lock sequence when not using hinted locks. +extern kmp_dyna_lockseq_t __kmp_user_lock_seq; + +// Jump table for "set lock location", available only for indirect locks. +extern void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p, + const ident_t *); +#define KMP_SET_I_LOCK_LOCATION(lck, loc) \ + { \ + if (__kmp_indirect_set_location[(lck)->type] != NULL) \ + __kmp_indirect_set_location[(lck)->type]((lck)->lock, loc); \ + } + +// Jump table for "set lock flags", available only for indirect locks. +extern void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p, + kmp_lock_flags_t); +#define KMP_SET_I_LOCK_FLAGS(lck, flag) \ + { \ + if (__kmp_indirect_set_flags[(lck)->type] != NULL) \ + __kmp_indirect_set_flags[(lck)->type]((lck)->lock, flag); \ + } + +// Jump table for "get lock location", available only for indirect locks. +extern const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])( + kmp_user_lock_p); +#define KMP_GET_I_LOCK_LOCATION(lck) \ + (__kmp_indirect_get_location[(lck)->type] != NULL \ + ? __kmp_indirect_get_location[(lck)->type]((lck)->lock) \ + : NULL) + +// Jump table for "get lock flags", available only for indirect locks. 
+extern kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( + kmp_user_lock_p); +#define KMP_GET_I_LOCK_FLAGS(lck) \ + (__kmp_indirect_get_flags[(lck)->type] != NULL \ + ? __kmp_indirect_get_flags[(lck)->type]((lck)->lock) \ + : NULL) + +// number of kmp_indirect_lock_t objects to be allocated together +#define KMP_I_LOCK_CHUNK 1024 +// Keep at a power of 2 since it is used in multiplication & division +KMP_BUILD_ASSERT(KMP_I_LOCK_CHUNK % 2 == 0); +// number of row entries in the initial lock table +#define KMP_I_LOCK_TABLE_INIT_NROW_PTRS 8 + +// Lock table for indirect locks. +typedef struct kmp_indirect_lock_table { + kmp_indirect_lock_t **table; // blocks of indirect locks allocated + kmp_uint32 nrow_ptrs; // number *table pointer entries in table + kmp_lock_index_t next; // index to the next lock to be allocated + struct kmp_indirect_lock_table *next_table; +} kmp_indirect_lock_table_t; + +extern kmp_indirect_lock_table_t __kmp_i_lock_table; + +// Returns the indirect lock associated with the given index. +// Returns nullptr if no lock at given index +static inline kmp_indirect_lock_t *__kmp_get_i_lock(kmp_lock_index_t idx) { + kmp_indirect_lock_table_t *lock_table = &__kmp_i_lock_table; + while (lock_table) { + kmp_lock_index_t max_locks = lock_table->nrow_ptrs * KMP_I_LOCK_CHUNK; + if (idx < max_locks) { + kmp_lock_index_t row = idx / KMP_I_LOCK_CHUNK; + kmp_lock_index_t col = idx % KMP_I_LOCK_CHUNK; + if (!lock_table->table[row] || idx >= lock_table->next) + break; + return &lock_table->table[row][col]; + } + idx -= max_locks; + lock_table = lock_table->next_table; + } + return nullptr; +} + +// Number of locks in a lock block, which is fixed to "1" now. +// TODO: No lock block implementation now. If we do support, we need to manage +// lock block data structure for each indirect lock type. +extern int __kmp_num_locks_in_block; + +// Fast lock table lookup without consistency checking +#define KMP_LOOKUP_I_LOCK(l) \ + ((OMP_LOCK_T_SIZE < sizeof(void *)) \ + ? __kmp_get_i_lock(KMP_EXTRACT_I_INDEX(l)) \ + : *((kmp_indirect_lock_t **)(l))) + +// Used once in kmp_error.cpp +extern kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p, kmp_uint32); + +#else // KMP_USE_DYNAMIC_LOCK + +#define KMP_LOCK_BUSY(v, type) (v) +#define KMP_LOCK_FREE(type) 0 +#define KMP_LOCK_STRIP(v) (v) + +#endif // KMP_USE_DYNAMIC_LOCK + +// data structure for using backoff within spin locks. +typedef struct { + kmp_uint32 step; // current step + kmp_uint32 max_backoff; // upper bound of outer delay loop + kmp_uint32 min_tick; // size of inner delay loop in ticks (machine-dependent) +} kmp_backoff_t; + +// Runtime's default backoff parameters +extern kmp_backoff_t __kmp_spin_backoff_params; + +// Backoff function +extern void __kmp_spin_backoff(kmp_backoff_t *); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif /* KMP_LOCK_H */ diff --git a/third_party/openmp/kmp_omp.h b/third_party/openmp/kmp_omp.h new file mode 100644 index 000000000..995241ff6 --- /dev/null +++ b/third_party/openmp/kmp_omp.h @@ -0,0 +1,235 @@ +#if USE_DEBUGGER +/* + * kmp_omp.h -- OpenMP definition for kmp_omp_struct_info_t. + * This is for information about runtime library structures. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/* THIS FILE SHOULD NOT BE MODIFIED IN IDB INTERFACE LIBRARY CODE + It should instead be modified in the OpenMP runtime and copied to the + interface library code. This way we can minimize the problems that this is + sure to cause having two copies of the same file. + + Files live in libomp and libomp_db/src/include */ + +/* CHANGE THIS WHEN STRUCTURES BELOW CHANGE + Before we release this to a customer, please don't change this value. After + it is released and stable, then any new updates to the structures or data + structure traversal algorithms need to change this value. */ +#define KMP_OMP_VERSION 9 + +typedef struct { + kmp_int32 offset; + kmp_int32 size; +} offset_and_size_t; + +typedef struct { + kmp_uint64 addr; + kmp_int32 size; + kmp_int32 padding; +} addr_and_size_t; + +typedef struct { + kmp_uint64 flags; // Flags for future extensions. + kmp_uint64 + file; // Pointer to name of source file where the parallel region is. + kmp_uint64 func; // Pointer to name of routine where the parallel region is. + kmp_int32 begin; // Beginning of source line range. + kmp_int32 end; // End of source line range. + kmp_int32 num_threads; // Specified number of threads. +} kmp_omp_nthr_item_t; + +typedef struct { + kmp_int32 num; // Number of items in the array. + kmp_uint64 array; // Address of array of kmp_omp_num_threads_item_t. +} kmp_omp_nthr_info_t; + +/* This structure is known to the idb interface library */ +typedef struct { + + /* Change this only if you make a fundamental data structure change here */ + kmp_int32 lib_version; + + /* sanity check. Only should be checked if versions are identical + * This is also used for backward compatibility to get the runtime + * structure size if it the runtime is older than the interface */ + kmp_int32 sizeof_this_structure; + + /* OpenMP RTL version info. */ + addr_and_size_t major; + addr_and_size_t minor; + addr_and_size_t build; + addr_and_size_t openmp_version; + addr_and_size_t banner; + + /* Various globals. */ + addr_and_size_t threads; // Pointer to __kmp_threads. + addr_and_size_t roots; // Pointer to __kmp_root. + addr_and_size_t capacity; // Pointer to __kmp_threads_capacity. +#if KMP_USE_MONITOR + addr_and_size_t monitor; // Pointer to __kmp_monitor. +#endif +#if !KMP_USE_DYNAMIC_LOCK + addr_and_size_t lock_table; // Pointer to __kmp_lock_table. +#endif + addr_and_size_t func_microtask; + addr_and_size_t func_fork; + addr_and_size_t func_fork_teams; + addr_and_size_t team_counter; + addr_and_size_t task_counter; + addr_and_size_t nthr_info; + kmp_int32 address_width; + kmp_int32 indexed_locks; + kmp_int32 last_barrier; // The end in enum barrier_type + kmp_int32 deque_size; // TASK_DEQUE_SIZE + + /* thread structure information. 
*/ + kmp_int32 th_sizeof_struct; + offset_and_size_t th_info; // descriptor for thread + offset_and_size_t th_team; // team for this thread + offset_and_size_t th_root; // root for this thread + offset_and_size_t th_serial_team; // serial team under this thread + offset_and_size_t th_ident; // location for this thread (if available) + offset_and_size_t th_spin_here; // is thread waiting for lock (if available) + offset_and_size_t + th_next_waiting; // next thread waiting for lock (if available) + offset_and_size_t th_task_team; // task team struct + offset_and_size_t th_current_task; // innermost task being executed + offset_and_size_t + th_task_state; // alternating 0/1 for task team identification + offset_and_size_t th_bar; + offset_and_size_t th_b_worker_arrived; // the worker increases it by 1 when it + // arrives to the barrier + + /* teams information */ + offset_and_size_t th_teams_microtask; // entry address for teams construct + offset_and_size_t th_teams_level; // initial level of teams construct + offset_and_size_t th_teams_nteams; // number of teams in a league + offset_and_size_t + th_teams_nth; // number of threads in each team of the league + + /* kmp_desc structure (for info field above) */ + kmp_int32 ds_sizeof_struct; + offset_and_size_t ds_tid; // team thread id + offset_and_size_t ds_gtid; // global thread id + offset_and_size_t ds_thread; // native thread id + + /* team structure information */ + kmp_int32 t_sizeof_struct; + offset_and_size_t t_master_tid; // tid of primary thread in parent team + offset_and_size_t t_ident; // location of parallel region + offset_and_size_t t_parent; // parent team + offset_and_size_t t_nproc; // # team threads + offset_and_size_t t_threads; // array of threads + offset_and_size_t t_serialized; // # levels of serialized teams + offset_and_size_t t_id; // unique team id + offset_and_size_t t_pkfn; + offset_and_size_t t_task_team; // task team structure + offset_and_size_t t_implicit_task; // taskdata for the thread's implicit task + offset_and_size_t t_cancel_request; + offset_and_size_t t_bar; + offset_and_size_t + t_b_master_arrived; // incremented when primary thread reaches barrier + offset_and_size_t + t_b_team_arrived; // increased by one when all the threads arrived + + /* root structure information */ + kmp_int32 r_sizeof_struct; + offset_and_size_t r_root_team; // team at root + offset_and_size_t r_hot_team; // hot team for this root + offset_and_size_t r_uber_thread; // root thread + offset_and_size_t r_root_id; // unique root id (if available) + + /* ident structure information */ + kmp_int32 id_sizeof_struct; + offset_and_size_t + id_psource; /* address of string ";file;func;line1;line2;;". */ + offset_and_size_t id_flags; + + /* lock structure information */ + kmp_int32 lk_sizeof_struct; + offset_and_size_t lk_initialized; + offset_and_size_t lk_location; + offset_and_size_t lk_tail_id; + offset_and_size_t lk_head_id; + offset_and_size_t lk_next_ticket; + offset_and_size_t lk_now_serving; + offset_and_size_t lk_owner_id; + offset_and_size_t lk_depth_locked; + offset_and_size_t lk_lock_flags; + +#if !KMP_USE_DYNAMIC_LOCK + /* lock_table_t */ + kmp_int32 lt_size_of_struct; /* Size and layout of kmp_lock_table_t. 
*/ + offset_and_size_t lt_used; + offset_and_size_t lt_allocated; + offset_and_size_t lt_table; +#endif + + /* task_team_t */ + kmp_int32 tt_sizeof_struct; + offset_and_size_t tt_threads_data; + offset_and_size_t tt_found_tasks; + offset_and_size_t tt_nproc; + offset_and_size_t tt_unfinished_threads; + offset_and_size_t tt_active; + + /* kmp_taskdata_t */ + kmp_int32 td_sizeof_struct; + offset_and_size_t td_task_id; // task id + offset_and_size_t td_flags; // task flags + offset_and_size_t td_team; // team for this task + offset_and_size_t td_parent; // parent task + offset_and_size_t td_level; // task testing level + offset_and_size_t td_ident; // task identifier + offset_and_size_t td_allocated_child_tasks; // child tasks (+ current task) + // not yet deallocated + offset_and_size_t td_incomplete_child_tasks; // child tasks not yet complete + + /* Taskwait */ + offset_and_size_t td_taskwait_ident; + offset_and_size_t td_taskwait_counter; + offset_and_size_t + td_taskwait_thread; // gtid + 1 of thread encountered taskwait + + /* Taskgroup */ + offset_and_size_t td_taskgroup; // pointer to the current taskgroup + offset_and_size_t + td_task_count; // number of allocated and not yet complete tasks + offset_and_size_t td_cancel; // request for cancellation of this taskgroup + + /* Task dependency */ + offset_and_size_t + td_depnode; // pointer to graph node if the task has dependencies + offset_and_size_t dn_node; + offset_and_size_t dn_next; + offset_and_size_t dn_successors; + offset_and_size_t dn_task; + offset_and_size_t dn_npredecessors; + offset_and_size_t dn_nrefs; + offset_and_size_t dn_routine; + + /* kmp_thread_data_t */ + kmp_int32 hd_sizeof_struct; + offset_and_size_t hd_deque; + offset_and_size_t hd_deque_size; + offset_and_size_t hd_deque_head; + offset_and_size_t hd_deque_tail; + offset_and_size_t hd_deque_ntasks; + offset_and_size_t hd_deque_last_stolen; + + // The last field of stable version. + kmp_uint64 last_field; + +} kmp_omp_struct_info_t; + +#endif /* USE_DEBUGGER */ + +/* end of file */ diff --git a/third_party/openmp/kmp_os.h b/third_party/openmp/kmp_os.h new file mode 100644 index 000000000..8ef3746ce --- /dev/null +++ b/third_party/openmp/kmp_os.h @@ -0,0 +1,1319 @@ +/* + * kmp_os.h -- KPTS runtime header file. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_OS_H +#define KMP_OS_H + +#include "kmp_config.h" +#include +#include +#include +#include + +#define KMP_FTN_PLAIN 1 +#define KMP_FTN_APPEND 2 +#define KMP_FTN_UPPER 3 +/* +#define KMP_FTN_PREPEND 4 +#define KMP_FTN_UAPPEND 5 +*/ + +#define KMP_PTR_SKIP (sizeof(void *)) + +/* -------------------------- Compiler variations ------------------------ */ + +#define KMP_OFF 0 +#define KMP_ON 1 + +#define KMP_MEM_CONS_VOLATILE 0 +#define KMP_MEM_CONS_FENCE 1 + +#ifndef KMP_MEM_CONS_MODEL +#define KMP_MEM_CONS_MODEL KMP_MEM_CONS_VOLATILE +#endif + +#ifndef __has_cpp_attribute +#define __has_cpp_attribute(x) 0 +#endif + +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +/* ------------------------- Compiler recognition ---------------------- */ +#define KMP_COMPILER_ICC 0 +#define KMP_COMPILER_GCC 0 +#define KMP_COMPILER_CLANG 0 +#define KMP_COMPILER_MSVC 0 +#define KMP_COMPILER_ICX 0 + +#if __INTEL_CLANG_COMPILER +#undef KMP_COMPILER_ICX +#define KMP_COMPILER_ICX 1 +#elif defined(__INTEL_COMPILER) +#undef KMP_COMPILER_ICC +#define KMP_COMPILER_ICC 1 +#elif defined(__clang__) +#undef KMP_COMPILER_CLANG +#define KMP_COMPILER_CLANG 1 +#elif defined(__GNUC__) +#undef KMP_COMPILER_GCC +#define KMP_COMPILER_GCC 1 +#elif defined(_MSC_VER) +#undef KMP_COMPILER_MSVC +#define KMP_COMPILER_MSVC 1 +#else +#error Unknown compiler +#endif + +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_WASI +#define KMP_AFFINITY_SUPPORTED 1 +#if KMP_OS_WINDOWS && KMP_ARCH_X86_64 +#define KMP_GROUP_AFFINITY 1 +#else +#define KMP_GROUP_AFFINITY 0 +#endif +#else +#define KMP_AFFINITY_SUPPORTED 0 +#define KMP_GROUP_AFFINITY 0 +#endif + +#if (KMP_OS_LINUX || (KMP_OS_FREEBSD && __FreeBSD_version >= 1301000)) +#define KMP_HAVE_SCHED_GETCPU 1 +#else +#define KMP_HAVE_SCHED_GETCPU 0 +#endif + +/* Check for quad-precision extension. 
*/ +#define KMP_HAVE_QUAD 0 +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX +/* _Quad is already defined for icc */ +#undef KMP_HAVE_QUAD +#define KMP_HAVE_QUAD 1 +#elif KMP_COMPILER_CLANG +/* Clang doesn't support a software-implemented + 128-bit extended precision type yet */ +typedef long double _Quad; +#elif KMP_COMPILER_GCC +/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad until + NetBSD 10.0 which ships with GCC 10.5 */ +#if (!KMP_OS_NETBSD || __GNUC__ >= 10) && !defined(__COSMOPOLITAN__) +typedef __float128 _Quad; +#undef KMP_HAVE_QUAD +#define KMP_HAVE_QUAD 1 +#endif +#elif KMP_COMPILER_MSVC +typedef long double _Quad; +#endif +#else +#if __LDBL_MAX_EXP__ >= 16384 && KMP_COMPILER_GCC +typedef long double _Quad; +#undef KMP_HAVE_QUAD +#define KMP_HAVE_QUAD 1 +#endif +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#define KMP_USE_X87CONTROL 0 +#if KMP_OS_WINDOWS +#define KMP_END_OF_LINE "\r\n" +typedef char kmp_int8; +typedef unsigned char kmp_uint8; +typedef short kmp_int16; +typedef unsigned short kmp_uint16; +typedef int kmp_int32; +typedef unsigned int kmp_uint32; +#define KMP_INT32_SPEC "d" +#define KMP_UINT32_SPEC "u" +#ifndef KMP_STRUCT64 +typedef __int64 kmp_int64; +typedef unsigned __int64 kmp_uint64; +#define KMP_INT64_SPEC "I64d" +#define KMP_UINT64_SPEC "I64u" +#else +struct kmp_struct64 { + kmp_int32 a, b; +}; +typedef struct kmp_struct64 kmp_int64; +typedef struct kmp_struct64 kmp_uint64; +/* Not sure what to use for KMP_[U]INT64_SPEC here */ +#endif +#if KMP_ARCH_X86 && KMP_MSVC_COMPAT +#undef KMP_USE_X87CONTROL +#define KMP_USE_X87CONTROL 1 +#endif +#if KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 +#define KMP_INTPTR 1 +typedef __int64 kmp_intptr_t; +typedef unsigned __int64 kmp_uintptr_t; +#define KMP_INTPTR_SPEC "I64d" +#define KMP_UINTPTR_SPEC "I64u" +#endif +#endif /* KMP_OS_WINDOWS */ + +#if KMP_OS_UNIX +#define KMP_END_OF_LINE "\n" +typedef char kmp_int8; +typedef unsigned char kmp_uint8; +typedef short kmp_int16; +typedef unsigned short kmp_uint16; +typedef int kmp_int32; +typedef unsigned int kmp_uint32; +typedef long long kmp_int64; +typedef unsigned long long kmp_uint64; +#define KMP_INT32_SPEC "d" +#define KMP_UINT32_SPEC "u" +#define KMP_INT64_SPEC "lld" +#define KMP_UINT64_SPEC "llu" +#endif /* KMP_OS_UNIX */ + +#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \ + KMP_ARCH_PPC +#define KMP_SIZE_T_SPEC KMP_UINT32_SPEC +#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ + KMP_ARCH_VE || KMP_ARCH_S390X +#define KMP_SIZE_T_SPEC KMP_UINT64_SPEC +#else +#error "Can't determine size_t printf format specifier." 
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_WASM || KMP_ARCH_PPC
+#define KMP_SIZE_T_MAX (0xFFFFFFFF)
+#else
+#define KMP_SIZE_T_MAX (0xFFFFFFFFFFFFFFFF)
+#endif
+
+typedef size_t kmp_size_t;
+typedef float kmp_real32;
+typedef double kmp_real64;
+
+#ifndef KMP_INTPTR
+#define KMP_INTPTR 1
+typedef long kmp_intptr_t;
+typedef unsigned long kmp_uintptr_t;
+#define KMP_INTPTR_SPEC "ld"
+#define KMP_UINTPTR_SPEC "lu"
+#endif
+
+#ifdef BUILD_I8
+typedef kmp_int64 kmp_int;
+typedef kmp_uint64 kmp_uint;
+#else
+typedef kmp_int32 kmp_int;
+typedef kmp_uint32 kmp_uint;
+#endif /* BUILD_I8 */
+#define KMP_INT_MAX ((kmp_int32)0x7FFFFFFF)
+#define KMP_INT_MIN ((kmp_int32)0x80000000)
+
+// stdarg handling
+#if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_WASM) && \
+    (KMP_OS_FREEBSD || KMP_OS_LINUX || KMP_OS_WASI)
+typedef va_list *kmp_va_list;
+#define kmp_va_deref(ap) (*(ap))
+#define kmp_va_addr_of(ap) (&(ap))
+#else
+typedef va_list kmp_va_list;
+#define kmp_va_deref(ap) (ap)
+#define kmp_va_addr_of(ap) (ap)
+#endif
+
+#ifdef __cplusplus
+// macros to cast out qualifiers and to re-interpret types
+#define CCAST(type, var) const_cast<type>(var)
+#define RCAST(type, var) reinterpret_cast<type>(var)
+//-------------------------------------------------------------------------
+// template for debug prints specification ( d, u, lld, llu ), and to obtain
+// signed/unsigned flavors of a type
+template <typename T> struct traits_t {};
+// int
+template <> struct traits_t<signed int> {
+  typedef signed int signed_t;
+  typedef unsigned int unsigned_t;
+  typedef double floating_t;
+  static char const *spec;
+  static const signed_t max_value = 0x7fffffff;
+  static const signed_t min_value = 0x80000000;
+  static const int type_size = sizeof(signed_t);
+};
+// unsigned int
+template <> struct traits_t<unsigned int> {
+  typedef signed int signed_t;
+  typedef unsigned int unsigned_t;
+  typedef double floating_t;
+  static char const *spec;
+  static const unsigned_t max_value = 0xffffffff;
+  static const unsigned_t min_value = 0x00000000;
+  static const int type_size = sizeof(unsigned_t);
+};
+// long
+template <> struct traits_t<signed long> {
+  typedef signed long signed_t;
+  typedef unsigned long unsigned_t;
+  typedef long double floating_t;
+  static char const *spec;
+  static const int type_size = sizeof(signed_t);
+};
+// long long
+template <> struct traits_t<signed long long> {
+  typedef signed long long signed_t;
+  typedef unsigned long long unsigned_t;
+  typedef long double floating_t;
+  static char const *spec;
+  static const signed_t max_value = 0x7fffffffffffffffLL;
+  static const signed_t min_value = 0x8000000000000000LL;
+  static const int type_size = sizeof(signed_t);
+};
+// unsigned long long
+template <> struct traits_t<unsigned long long> {
+  typedef signed long long signed_t;
+  typedef unsigned long long unsigned_t;
+  typedef long double floating_t;
+  static char const *spec;
+  static const unsigned_t max_value = 0xffffffffffffffffLL;
+  static const unsigned_t min_value = 0x0000000000000000LL;
+  static const int type_size = sizeof(unsigned_t);
+};
+//-------------------------------------------------------------------------
+#else
+#define CCAST(type, var) (type)(var)
+#define RCAST(type, var) (type)(var)
+#endif // __cplusplus
+
+#define KMP_EXPORT extern /* export declaration in guide libraries */
+
+#if __GNUC__ >= 4 && !defined(__MINGW32__)
+#define __forceinline __inline
+#endif
+
+/* Check if the OS/arch can support user-level mwait */
+// All mwait code tests for UMWAIT first, so it should only fall back to ring3
+// MWAIT for KNL.
+#define KMP_HAVE_MWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC2) +#define KMP_HAVE_UMWAIT \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS) && \ + !KMP_MIC) + +#if KMP_OS_WINDOWS +// Don't include everything related to NT status code, we'll do that explicitly +#define WIN32_NO_STATUS +#include + +static inline int KMP_GET_PAGE_SIZE(void) { + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +} +#else +#define KMP_GET_PAGE_SIZE() getpagesize() +#endif + +#define PAGE_ALIGNED(_addr) \ + (!((size_t)_addr & (size_t)(KMP_GET_PAGE_SIZE() - 1))) +#define ALIGN_TO_PAGE(x) \ + (void *)(((size_t)(x)) & ~((size_t)(KMP_GET_PAGE_SIZE() - 1))) + +/* ---------- Support for cache alignment, padding, etc. ----------------*/ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define INTERNODE_CACHE_LINE 4096 /* for multi-node systems */ + +/* Define the default size of the cache line */ +#ifndef CACHE_LINE +#define CACHE_LINE 128 /* cache line size in bytes */ +#else +#if (CACHE_LINE < 64) && !defined(KMP_OS_DARWIN) +// 2006-02-13: This produces too many warnings on OS X*. Disable for now +#warning CACHE_LINE is too small. +#endif +#endif /* CACHE_LINE */ + +#define KMP_CACHE_PREFETCH(ADDR) /* nothing */ + +// Define attribute that indicates that the fall through from the previous +// case label is intentional and should not be diagnosed by a compiler +// Code from libcxx/include/__config +// Use a function like macro to imply that it must be followed by a semicolon +#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough) +#define KMP_FALLTHROUGH() [[fallthrough]] +// icc cannot properly tell this attribute is absent so force off +#elif KMP_COMPILER_ICC +#define KMP_FALLTHROUGH() ((void)0) +#elif __has_cpp_attribute(clang::fallthrough) +#define KMP_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_attribute(fallthrough) || __GNUC__ >= 7 +#define KMP_FALLTHROUGH() __attribute__((__fallthrough__)) +#else +#define KMP_FALLTHROUGH() ((void)0) +#endif + +#if KMP_HAVE_ATTRIBUTE_WAITPKG +#define KMP_ATTRIBUTE_TARGET_WAITPKG __attribute__((target("waitpkg"))) +#else +#define KMP_ATTRIBUTE_TARGET_WAITPKG /* Nothing */ +#endif + +#if KMP_HAVE_ATTRIBUTE_RTM +#define KMP_ATTRIBUTE_TARGET_RTM __attribute__((target("rtm"))) +#else +#define KMP_ATTRIBUTE_TARGET_RTM /* Nothing */ +#endif + +// Define attribute that indicates a function does not return +#if __cplusplus >= 201103L +#define KMP_NORETURN [[noreturn]] +#elif KMP_OS_WINDOWS +#define KMP_NORETURN __declspec(noreturn) +#else +#define KMP_NORETURN __attribute__((noreturn)) +#endif + +#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT +#define KMP_ALIGN(bytes) __declspec(align(bytes)) +#define KMP_THREAD_LOCAL __declspec(thread) +#define KMP_ALIAS /* Nothing */ +#else +#define KMP_ALIGN(bytes) __attribute__((aligned(bytes))) +#define KMP_THREAD_LOCAL __thread +#define KMP_ALIAS(alias_of) __attribute__((alias(alias_of))) +#endif + +#if KMP_HAVE_WEAK_ATTRIBUTE && !KMP_DYNAMIC_LIB +#define KMP_WEAK_ATTRIBUTE_EXTERNAL __attribute__((weak)) +#else +#define KMP_WEAK_ATTRIBUTE_EXTERNAL /* Nothing */ +#endif + +#if KMP_HAVE_WEAK_ATTRIBUTE +#define KMP_WEAK_ATTRIBUTE_INTERNAL __attribute__((weak)) +#else +#define KMP_WEAK_ATTRIBUTE_INTERNAL /* Nothing */ +#endif + +// Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME +#ifndef KMP_STR +#define KMP_STR(x) _KMP_STR(x) +#define _KMP_STR(x) #x +#endif + +#ifdef KMP_USE_VERSION_SYMBOLS +// If using versioned symbols, KMP_EXPAND_NAME prepends 
+// __kmp_api_ to the real API name +#define KMP_EXPAND_NAME(api_name) _KMP_EXPAND_NAME(api_name) +#define _KMP_EXPAND_NAME(api_name) __kmp_api_##api_name +#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) \ + _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, "VERSION") +#define _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, default_ver) \ + __typeof__(__kmp_api_##api_name) __kmp_api_##api_name##_##ver_num##_alias \ + __attribute__((alias(KMP_STR(__kmp_api_##api_name)))); \ + __asm__( \ + ".symver " KMP_STR(__kmp_api_##api_name##_##ver_num##_alias) "," KMP_STR( \ + api_name) "@" ver_str "\n\t"); \ + __asm__(".symver " KMP_STR(__kmp_api_##api_name) "," KMP_STR( \ + api_name) "@@" default_ver "\n\t") + +#define KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, ver_str) \ + _KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, ver_str, "VERSION") +#define _KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, ver_str, \ + default_ver) \ + __typeof__(__kmp_api_##apic_name) __kmp_api_##apic_name##_##ver_num##_alias \ + __attribute__((alias(KMP_STR(__kmp_api_##apic_name)))); \ + __asm__(".symver " KMP_STR(__kmp_api_##apic_name) "," KMP_STR( \ + apic_name) "@@" default_ver "\n\t"); \ + __asm__( \ + ".symver " KMP_STR(__kmp_api_##apic_name##_##ver_num##_alias) "," KMP_STR( \ + api_name) "@" ver_str "\n\t") + +#else // KMP_USE_VERSION_SYMBOLS +#define KMP_EXPAND_NAME(api_name) api_name +#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) /* Nothing */ +#define KMP_VERSION_OMPC_SYMBOL(apic_name, api_name, ver_num, \ + ver_str) /* Nothing */ +#endif // KMP_USE_VERSION_SYMBOLS + +/* Temporary note: if performance testing of this passes, we can remove + all references to KMP_DO_ALIGN and replace with KMP_ALIGN. */ +#define KMP_DO_ALIGN(bytes) KMP_ALIGN(bytes) +#define KMP_ALIGN_CACHE KMP_ALIGN(CACHE_LINE) +#define KMP_ALIGN_CACHE_INTERNODE KMP_ALIGN(INTERNODE_CACHE_LINE) + +/* General purpose fence types for memory operations */ +enum kmp_mem_fence_type { + kmp_no_fence, /* No memory fence */ + kmp_acquire_fence, /* Acquire (read) memory fence */ + kmp_release_fence, /* Release (write) memory fence */ + kmp_full_fence /* Full (read+write) memory fence */ +}; + +// Synchronization primitives + +#if KMP_ASM_INTRINS && KMP_OS_WINDOWS && !((KMP_ARCH_AARCH64 || KMP_ARCH_ARM) && (KMP_COMPILER_CLANG || KMP_COMPILER_GCC)) + +#if KMP_MSVC_COMPAT && !KMP_COMPILER_CLANG +#pragma intrinsic(InterlockedExchangeAdd) +#pragma intrinsic(InterlockedCompareExchange) +#pragma intrinsic(InterlockedExchange) +#if !KMP_32_BIT_ARCH +#pragma intrinsic(InterlockedExchange64) +#endif +#endif + +// Using InterlockedIncrement / InterlockedDecrement causes a library loading +// ordering problem, so we use InterlockedExchangeAdd instead. 
+#define KMP_TEST_THEN_INC32(p) InterlockedExchangeAdd((volatile long *)(p), 1) +#define KMP_TEST_THEN_INC_ACQ32(p) \ + InterlockedExchangeAdd((volatile long *)(p), 1) +#define KMP_TEST_THEN_ADD4_32(p) InterlockedExchangeAdd((volatile long *)(p), 4) +#define KMP_TEST_THEN_ADD4_ACQ32(p) \ + InterlockedExchangeAdd((volatile long *)(p), 4) +#define KMP_TEST_THEN_DEC32(p) InterlockedExchangeAdd((volatile long *)(p), -1) +#define KMP_TEST_THEN_DEC_ACQ32(p) \ + InterlockedExchangeAdd((volatile long *)(p), -1) +#define KMP_TEST_THEN_ADD32(p, v) \ + InterlockedExchangeAdd((volatile long *)(p), (v)) + +#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \ + InterlockedCompareExchange((volatile long *)(p), (long)(sv), (long)(cv)) + +#define KMP_XCHG_FIXED32(p, v) \ + InterlockedExchange((volatile long *)(p), (long)(v)) +#define KMP_XCHG_FIXED64(p, v) \ + InterlockedExchange64((volatile kmp_int64 *)(p), (kmp_int64)(v)) + +inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) { + kmp_int32 tmp = InterlockedExchange((volatile long *)p, *(long *)&v); + return *(kmp_real32 *)&tmp; +} + +#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v)) +#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v)) +#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v)) +#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v)) +#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v)) +#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v)) + +extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v); +extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v); +extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v); +extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v); +extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v); +extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v); +extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v); +extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v); + +#if KMP_ARCH_AARCH64 && KMP_COMPILER_MSVC && !KMP_COMPILER_CLANG +#define KMP_TEST_THEN_INC64(p) _InterlockedExchangeAdd64((p), 1LL) +#define KMP_TEST_THEN_INC_ACQ64(p) _InterlockedExchangeAdd64_acq((p), 1LL) +#define KMP_TEST_THEN_ADD4_64(p) _InterlockedExchangeAdd64((p), 4LL) +// #define KMP_TEST_THEN_ADD4_ACQ64(p) _InterlockedExchangeAdd64_acq((p), 4LL) +// #define KMP_TEST_THEN_DEC64(p) _InterlockedExchangeAdd64((p), -1LL) +// #define KMP_TEST_THEN_DEC_ACQ64(p) _InterlockedExchangeAdd64_acq((p), -1LL) +// #define KMP_TEST_THEN_ADD8(p, v) _InterlockedExchangeAdd8((p), (v)) +#define KMP_TEST_THEN_ADD64(p, v) _InterlockedExchangeAdd64((p), (v)) + +#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ + __kmp_compare_and_store_acq8((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \ + __kmp_compare_and_store_rel8((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \ + __kmp_compare_and_store_acq16((p), (cv), (sv)) +/* +#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \ + __kmp_compare_and_store_rel16((p), (cv), (sv)) +*/ +#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \ + __kmp_compare_and_store_acq32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \ + __kmp_compare_and_store_rel32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ + 
__kmp_compare_and_store_acq64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) +#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ + __kmp_compare_and_store_rel64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) +#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ + __kmp_compare_and_store_ptr((void *volatile *)(p), (void *)(cv), (void *)(sv)) + +// KMP_COMPARE_AND_STORE expects this order: pointer, compare, exchange +// _InterlockedCompareExchange expects this order: pointer, exchange, compare +// KMP_COMPARE_AND_STORE also returns a bool indicating a successful write. A +// write is successful if the return value of _InterlockedCompareExchange is the +// same as the compare value. +inline kmp_int8 __kmp_compare_and_store_acq8(volatile kmp_int8 *p, kmp_int8 cv, + kmp_int8 sv) { + return _InterlockedCompareExchange8_acq(p, sv, cv) == cv; +} + +inline kmp_int8 __kmp_compare_and_store_rel8(volatile kmp_int8 *p, kmp_int8 cv, + kmp_int8 sv) { + return _InterlockedCompareExchange8_rel(p, sv, cv) == cv; +} + +inline kmp_int16 __kmp_compare_and_store_acq16(volatile kmp_int16 *p, + kmp_int16 cv, kmp_int16 sv) { + return _InterlockedCompareExchange16_acq(p, sv, cv) == cv; +} + +inline kmp_int16 __kmp_compare_and_store_rel16(volatile kmp_int16 *p, + kmp_int16 cv, kmp_int16 sv) { + return _InterlockedCompareExchange16_rel(p, sv, cv) == cv; +} + +inline kmp_int32 __kmp_compare_and_store_acq32(volatile kmp_int32 *p, + kmp_int32 cv, kmp_int32 sv) { + return _InterlockedCompareExchange_acq((volatile long *)p, sv, cv) == cv; +} + +inline kmp_int32 __kmp_compare_and_store_rel32(volatile kmp_int32 *p, + kmp_int32 cv, kmp_int32 sv) { + return _InterlockedCompareExchange_rel((volatile long *)p, sv, cv) == cv; +} + +inline kmp_int32 __kmp_compare_and_store_acq64(volatile kmp_int64 *p, + kmp_int64 cv, kmp_int64 sv) { + return _InterlockedCompareExchange64_acq(p, sv, cv) == cv; +} + +inline kmp_int32 __kmp_compare_and_store_rel64(volatile kmp_int64 *p, + kmp_int64 cv, kmp_int64 sv) { + return _InterlockedCompareExchange64_rel(p, sv, cv) == cv; +} + +inline kmp_int32 __kmp_compare_and_store_ptr(void *volatile *p, void *cv, + void *sv) { + return _InterlockedCompareExchangePointer(p, sv, cv) == cv; +} + +// The _RET versions return the value instead of a bool + +#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ + _InterlockedCompareExchange8((p), (sv), (cv)) +#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \ + _InterlockedCompareExchange16((p), (sv), (cv)) + +#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ + _InterlockedCompareExchange64((volatile kmp_int64 *)(p), (kmp_int64)(sv), \ + (kmp_int64)(cv)) + + +#define KMP_XCHG_FIXED8(p, v) \ + _InterlockedExchange8((volatile kmp_int8 *)(p), (kmp_int8)(v)); +#define KMP_XCHG_FIXED16(p, v) _InterlockedExchange16((p), (v)); +#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)); + +inline kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v) { + kmp_int64 tmp = _InterlockedExchange64((volatile kmp_int64 *)p, *(kmp_int64 + *)&v); return *(kmp_real64 *)&tmp; +} + +#else // !KMP_ARCH_AARCH64 + +// Routines that we still need to implement in assembly. 
+extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v); + +extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv, + kmp_int8 sv); +extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, + kmp_int16 sv); +extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, + kmp_int32 sv); +extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, + kmp_int64 sv); +extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, + kmp_int8 sv); +extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p, + kmp_int16 cv, kmp_int16 sv); +extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p, + kmp_int32 cv, kmp_int32 sv); +extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p, + kmp_int64 cv, kmp_int64 sv); + +extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v); +extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v); +extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v); +extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v); +extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v); +extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); + +//#define KMP_TEST_THEN_INC32(p) __kmp_test_then_add32((p), 1) +//#define KMP_TEST_THEN_INC_ACQ32(p) __kmp_test_then_add32((p), 1) +#define KMP_TEST_THEN_INC64(p) __kmp_test_then_add64((p), 1LL) +#define KMP_TEST_THEN_INC_ACQ64(p) __kmp_test_then_add64((p), 1LL) +//#define KMP_TEST_THEN_ADD4_32(p) __kmp_test_then_add32((p), 4) +//#define KMP_TEST_THEN_ADD4_ACQ32(p) __kmp_test_then_add32((p), 4) +#define KMP_TEST_THEN_ADD4_64(p) __kmp_test_then_add64((p), 4LL) +#define KMP_TEST_THEN_ADD4_ACQ64(p) __kmp_test_then_add64((p), 4LL) +//#define KMP_TEST_THEN_DEC32(p) __kmp_test_then_add32((p), -1) +//#define KMP_TEST_THEN_DEC_ACQ32(p) __kmp_test_then_add32((p), -1) +#define KMP_TEST_THEN_DEC64(p) __kmp_test_then_add64((p), -1LL) +#define KMP_TEST_THEN_DEC_ACQ64(p) __kmp_test_then_add64((p), -1LL) +//#define KMP_TEST_THEN_ADD32(p, v) __kmp_test_then_add32((p), (v)) +#define KMP_TEST_THEN_ADD8(p, v) __kmp_test_then_add8((p), (v)) +#define KMP_TEST_THEN_ADD64(p, v) __kmp_test_then_add64((p), (v)) + + +#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ + __kmp_compare_and_store8((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \ + __kmp_compare_and_store8((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \ + __kmp_compare_and_store16((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \ + __kmp_compare_and_store16((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \ + __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \ + __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ + __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) +#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ + __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) + +#if KMP_ARCH_X86 +#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ + __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#else /* 64 bit pointers */ +#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ + __kmp_compare_and_store64((volatile kmp_int64 
*)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) +#endif /* KMP_ARCH_X86 */ + +#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ + __kmp_compare_and_store_ret8((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \ + __kmp_compare_and_store_ret16((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ + __kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) + +#define KMP_XCHG_FIXED8(p, v) \ + __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v)); +#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v)); +//#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v)); +//#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v)); +//#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v)); +#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)); +#endif + +#elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64) + +/* cast p to correct type so that proper intrinsic will be used */ +#define KMP_TEST_THEN_INC32(p) \ + __sync_fetch_and_add((volatile kmp_int32 *)(p), 1) +#define KMP_TEST_THEN_INC_ACQ32(p) \ + __sync_fetch_and_add((volatile kmp_int32 *)(p), 1) +#if KMP_ARCH_MIPS +#define KMP_TEST_THEN_INC64(p) \ + __atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) +#define KMP_TEST_THEN_INC_ACQ64(p) \ + __atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) +#else +#define KMP_TEST_THEN_INC64(p) \ + __sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL) +#define KMP_TEST_THEN_INC_ACQ64(p) \ + __sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL) +#endif +#define KMP_TEST_THEN_ADD4_32(p) \ + __sync_fetch_and_add((volatile kmp_int32 *)(p), 4) +#define KMP_TEST_THEN_ADD4_ACQ32(p) \ + __sync_fetch_and_add((volatile kmp_int32 *)(p), 4) +#if KMP_ARCH_MIPS +#define KMP_TEST_THEN_ADD4_64(p) \ + __atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST) +#define KMP_TEST_THEN_ADD4_ACQ64(p) \ + __atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST) +#define KMP_TEST_THEN_DEC64(p) \ + __atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) +#define KMP_TEST_THEN_DEC_ACQ64(p) \ + __atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) +#else +#define KMP_TEST_THEN_ADD4_64(p) \ + __sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL) +#define KMP_TEST_THEN_ADD4_ACQ64(p) \ + __sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL) +#define KMP_TEST_THEN_DEC64(p) \ + __sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL) +#define KMP_TEST_THEN_DEC_ACQ64(p) \ + __sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL) +#endif +#define KMP_TEST_THEN_DEC32(p) \ + __sync_fetch_and_sub((volatile kmp_int32 *)(p), 1) +#define KMP_TEST_THEN_DEC_ACQ32(p) \ + __sync_fetch_and_sub((volatile kmp_int32 *)(p), 1) +#define KMP_TEST_THEN_ADD8(p, v) \ + __sync_fetch_and_add((volatile kmp_int8 *)(p), (kmp_int8)(v)) +#define KMP_TEST_THEN_ADD32(p, v) \ + __sync_fetch_and_add((volatile kmp_int32 *)(p), (kmp_int32)(v)) +#if KMP_ARCH_MIPS +#define KMP_TEST_THEN_ADD64(p, v) \ + __atomic_fetch_add((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \ + __ATOMIC_SEQ_CST) +#else +#define KMP_TEST_THEN_ADD64(p, v) \ + __sync_fetch_and_add((volatile kmp_int64 *)(p), (kmp_int64)(v)) +#endif + +#define KMP_TEST_THEN_OR8(p, v) \ + __sync_fetch_and_or((volatile kmp_int8 *)(p), (kmp_int8)(v)) +#define KMP_TEST_THEN_AND8(p, v) \ + __sync_fetch_and_and((volatile kmp_int8 *)(p), (kmp_int8)(v)) +#define KMP_TEST_THEN_OR32(p, v) \ + __sync_fetch_and_or((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) 
+#define KMP_TEST_THEN_AND32(p, v) \ + __sync_fetch_and_and((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) +#if KMP_ARCH_MIPS +#define KMP_TEST_THEN_OR64(p, v) \ + __atomic_fetch_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \ + __ATOMIC_SEQ_CST) +#define KMP_TEST_THEN_AND64(p, v) \ + __atomic_fetch_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \ + __ATOMIC_SEQ_CST) +#else +#define KMP_TEST_THEN_OR64(p, v) \ + __sync_fetch_and_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) +#define KMP_TEST_THEN_AND64(p, v) \ + __sync_fetch_and_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) +#endif + +#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \ + (kmp_uint8)(sv)) +#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \ + (kmp_uint8)(sv)) +#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \ + (kmp_uint16)(sv)) +#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \ + (kmp_uint16)(sv)) +#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \ + (kmp_uint32)(sv)) +#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \ + (kmp_uint32)(sv)) +#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ + __sync_bool_compare_and_swap((void *volatile *)(p), (void *)(cv), \ + (void *)(sv)) + +#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ + __sync_val_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \ + (kmp_uint8)(sv)) +#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \ + __sync_val_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \ + (kmp_uint16)(sv)) +#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \ + __sync_val_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \ + (kmp_uint32)(sv)) +#if KMP_ARCH_MIPS +static inline bool mips_sync_bool_compare_and_swap(volatile kmp_uint64 *p, + kmp_uint64 cv, + kmp_uint64 sv) { + return __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); +} +static inline bool mips_sync_val_compare_and_swap(volatile kmp_uint64 *p, + kmp_uint64 cv, + kmp_uint64 sv) { + __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); + return cv; +} +#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ + mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), \ + (kmp_uint64)(cv), (kmp_uint64)(sv)) +#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ + mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), \ + (kmp_uint64)(cv), (kmp_uint64)(sv)) +#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ + mips_sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ + (kmp_uint64)(sv)) +#else +#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ + (kmp_uint64)(sv)) +#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ + __sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ + (kmp_uint64)(sv)) +#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ + __sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ + (kmp_uint64)(sv)) +#endif + +#if KMP_OS_DARWIN && defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1800 +#define KMP_XCHG_FIXED8(p, v) \ + 
__atomic_exchange_1((volatile kmp_uint8 *)(p), (kmp_uint8)(v), \ + __ATOMIC_SEQ_CST) +#else +#define KMP_XCHG_FIXED8(p, v) \ + __sync_lock_test_and_set((volatile kmp_uint8 *)(p), (kmp_uint8)(v)) +#endif +#define KMP_XCHG_FIXED16(p, v) \ + __sync_lock_test_and_set((volatile kmp_uint16 *)(p), (kmp_uint16)(v)) +#define KMP_XCHG_FIXED32(p, v) \ + __sync_lock_test_and_set((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) +#define KMP_XCHG_FIXED64(p, v) \ + __sync_lock_test_and_set((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) + +inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) { + volatile kmp_uint32 *up; + kmp_uint32 uv; + memcpy(&up, &p, sizeof(up)); + memcpy(&uv, &v, sizeof(uv)); + kmp_int32 tmp = __sync_lock_test_and_set(up, uv); + kmp_real32 ftmp; + memcpy(&ftmp, &tmp, sizeof(tmp)); + return ftmp; +} + +inline kmp_real64 KMP_XCHG_REAL64(volatile kmp_real64 *p, kmp_real64 v) { + volatile kmp_uint64 *up; + kmp_uint64 uv; + memcpy(&up, &p, sizeof(up)); + memcpy(&uv, &v, sizeof(uv)); + kmp_int64 tmp = __sync_lock_test_and_set(up, uv); + kmp_real64 dtmp; + memcpy(&dtmp, &tmp, sizeof(tmp)); + return dtmp; +} + +#else + +extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v); +extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v); +extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v); +extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v); +extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v); +extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v); +extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v); +extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v); +extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v); + +extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv, + kmp_int8 sv); +extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, + kmp_int16 sv); +extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, + kmp_int32 sv); +extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, + kmp_int64 sv); +extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, + kmp_int8 sv); +extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p, + kmp_int16 cv, kmp_int16 sv); +extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p, + kmp_int32 cv, kmp_int32 sv); +extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p, + kmp_int64 cv, kmp_int64 sv); + +extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v); +extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v); +extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v); +extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v); +extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v); +extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); + +#define KMP_TEST_THEN_INC32(p) \ + __kmp_test_then_add32((volatile kmp_int32 *)(p), 1) +#define KMP_TEST_THEN_INC_ACQ32(p) \ + __kmp_test_then_add32((volatile kmp_int32 *)(p), 1) +#define KMP_TEST_THEN_INC64(p) \ + __kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL) +#define KMP_TEST_THEN_INC_ACQ64(p) \ + __kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL) +#define KMP_TEST_THEN_ADD4_32(p) \ + __kmp_test_then_add32((volatile kmp_int32 *)(p), 4) +#define 
KMP_TEST_THEN_ADD4_ACQ32(p) \ + __kmp_test_then_add32((volatile kmp_int32 *)(p), 4) +#define KMP_TEST_THEN_ADD4_64(p) \ + __kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL) +#define KMP_TEST_THEN_ADD4_ACQ64(p) \ + __kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL) +#define KMP_TEST_THEN_DEC32(p) \ + __kmp_test_then_add32((volatile kmp_int32 *)(p), -1) +#define KMP_TEST_THEN_DEC_ACQ32(p) \ + __kmp_test_then_add32((volatile kmp_int32 *)(p), -1) +#define KMP_TEST_THEN_DEC64(p) \ + __kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL) +#define KMP_TEST_THEN_DEC_ACQ64(p) \ + __kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL) +#define KMP_TEST_THEN_ADD8(p, v) \ + __kmp_test_then_add8((volatile kmp_int8 *)(p), (kmp_int8)(v)) +#define KMP_TEST_THEN_ADD32(p, v) \ + __kmp_test_then_add32((volatile kmp_int32 *)(p), (kmp_int32)(v)) +#define KMP_TEST_THEN_ADD64(p, v) \ + __kmp_test_then_add64((volatile kmp_int64 *)(p), (kmp_int64)(v)) + +#define KMP_TEST_THEN_OR8(p, v) \ + __kmp_test_then_or8((volatile kmp_int8 *)(p), (kmp_int8)(v)) +#define KMP_TEST_THEN_AND8(p, v) \ + __kmp_test_then_and8((volatile kmp_int8 *)(p), (kmp_int8)(v)) +#define KMP_TEST_THEN_OR32(p, v) \ + __kmp_test_then_or32((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) +#define KMP_TEST_THEN_AND32(p, v) \ + __kmp_test_then_and32((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) +#define KMP_TEST_THEN_OR64(p, v) \ + __kmp_test_then_or64((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) +#define KMP_TEST_THEN_AND64(p, v) \ + __kmp_test_then_and64((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) + +#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ + __kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \ + (kmp_int8)(sv)) +#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \ + __kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \ + (kmp_int8)(sv)) +#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \ + __kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \ + (kmp_int16)(sv)) +#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \ + __kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \ + (kmp_int16)(sv)) +#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \ + __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \ + __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ + __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) +#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ + __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) + +#if KMP_ARCH_X86 +#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ + __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#else /* 64 bit pointers */ +#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ + __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) +#endif /* KMP_ARCH_X86 */ + +#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ + __kmp_compare_and_store_ret8((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \ + __kmp_compare_and_store_ret16((p), (cv), (sv)) +#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \ + __kmp_compare_and_store_ret32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ + (kmp_int32)(sv)) +#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ + __kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ + (kmp_int64)(sv)) + 
+#define KMP_XCHG_FIXED8(p, v) \ + __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v)); +#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v)); +#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v)); +#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v)); +#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v)); +#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)); + +#endif /* KMP_ASM_INTRINS */ + +/* ------------- relaxed consistency memory model stuff ------------------ */ + +#if KMP_OS_WINDOWS +#ifdef __ABSOFT_WIN +#define KMP_MB() asm("nop") +#define KMP_IMB() asm("nop") +#else +#define KMP_MB() /* _asm{ nop } */ +#define KMP_IMB() /* _asm{ nop } */ +#endif +#endif /* KMP_OS_WINDOWS */ + +#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \ + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ + KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC +#if KMP_OS_WINDOWS +#undef KMP_MB +#define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst) +#else /* !KMP_OS_WINDOWS */ +#define KMP_MB() __sync_synchronize() +#endif +#endif + +#ifndef KMP_MB +#define KMP_MB() /* nothing to do */ +#endif + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +#if KMP_MIC +// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. +// We shouldn't need it, though, since the ABI rules require that +// * If the compiler generates NGO stores it also generates the fence +// * If users hand-code NGO stores they should insert the fence +// therefore no incomplete unordered stores should be visible. +#define KMP_MFENCE() /* Nothing */ +#define KMP_SFENCE() /* Nothing */ +#else +#if KMP_COMPILER_ICC || KMP_COMPILER_ICX +#define KMP_MFENCE_() _mm_mfence() +#define KMP_SFENCE_() _mm_sfence() +#elif KMP_COMPILER_MSVC +#define KMP_MFENCE_() MemoryBarrier() +#define KMP_SFENCE_() MemoryBarrier() +#else +#define KMP_MFENCE_() __sync_synchronize() +#define KMP_SFENCE_() __sync_synchronize() +#endif +#define KMP_MFENCE() \ + if (UNLIKELY(!__kmp_cpuinfo.initialized)) { \ + __kmp_query_cpuid(&__kmp_cpuinfo); \ + } \ + if (__kmp_cpuinfo.flags.sse2) { \ + KMP_MFENCE_(); \ + } +#define KMP_SFENCE() KMP_SFENCE_() +#endif +#else +#define KMP_MFENCE() KMP_MB() +#define KMP_SFENCE() KMP_MB() +#endif + +#ifndef KMP_IMB +#define KMP_IMB() /* nothing to do */ +#endif + +#ifndef KMP_ST_REL32 +#define KMP_ST_REL32(A, D) (*(A) = (D)) +#endif + +#ifndef KMP_ST_REL64 +#define KMP_ST_REL64(A, D) (*(A) = (D)) +#endif + +#ifndef KMP_LD_ACQ32 +#define KMP_LD_ACQ32(A) (*(A)) +#endif + +#ifndef KMP_LD_ACQ64 +#define KMP_LD_ACQ64(A) (*(A)) +#endif + +/* ------------------------------------------------------------------------ */ +// FIXME - maybe this should this be +// +// #define TCR_4(a) (*(volatile kmp_int32 *)(&a)) +// #define TCW_4(a,b) (a) = (*(volatile kmp_int32 *)&(b)) +// +// #define TCR_8(a) (*(volatile kmp_int64 *)(a)) +// #define TCW_8(a,b) (a) = (*(volatile kmp_int64 *)(&b)) +// +// I'm fairly certain this is the correct thing to do, but I'm afraid +// of performance regressions. 
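/* Illustrative sketch, not part of the upstream patch: the FIXME above
   concerns forcing "thread consistent" reads/writes through a volatile
   lvalue.  With a plain load the compiler may hoist the read out of a
   spin loop; a volatile access forces a fresh load on every iteration,
   which is what the proposed TCR_4 definition would guarantee.  The
   names kmp_sketch_flag and kmp_sketch_spin_wait are hypothetical. */
#include <stdint.h>

static int32_t kmp_sketch_flag; /* set to nonzero by another thread */

static void kmp_sketch_spin_wait(void) {
  /* Volatile read, in the spirit of the proposed TCR_4: the compiler
     must reload kmp_sketch_flag on each iteration. */
  while (*(volatile int32_t *)&kmp_sketch_flag == 0) {
    /* the real runtime would KMP_YIELD() or pause here */
  }
}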
+ +#define TCR_1(a) (a) +#define TCW_1(a, b) (a) = (b) +#define TCR_4(a) (a) +#define TCW_4(a, b) (a) = (b) +#define TCI_4(a) (++(a)) +#define TCD_4(a) (--(a)) +#define TCR_8(a) (a) +#define TCW_8(a, b) (a) = (b) +#define TCI_8(a) (++(a)) +#define TCD_8(a) (--(a)) +#define TCR_SYNC_4(a) (a) +#define TCW_SYNC_4(a, b) (a) = (b) +#define TCX_SYNC_4(a, b, c) \ + KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)(volatile void *)&(a), \ + (kmp_int32)(b), (kmp_int32)(c)) +#define TCR_SYNC_8(a) (a) +#define TCW_SYNC_8(a, b) (a) = (b) +#define TCX_SYNC_8(a, b, c) \ + KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)(volatile void *)&(a), \ + (kmp_int64)(b), (kmp_int64)(c)) + +#if KMP_ARCH_X86 || KMP_ARCH_MIPS || KMP_ARCH_WASM || KMP_ARCH_PPC +// What about ARM? +#define TCR_PTR(a) ((void *)TCR_4(a)) +#define TCW_PTR(a, b) TCW_4((a), (b)) +#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_4(a)) +#define TCW_SYNC_PTR(a, b) TCW_SYNC_4((a), (b)) +#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_4((a), (b), (c))) + +#else /* 64 bit pointers */ + +#define TCR_PTR(a) ((void *)TCR_8(a)) +#define TCW_PTR(a, b) TCW_8((a), (b)) +#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_8(a)) +#define TCW_SYNC_PTR(a, b) TCW_SYNC_8((a), (b)) +#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_8((a), (b), (c))) + +#endif /* KMP_ARCH_X86 */ + +/* If these FTN_{TRUE,FALSE} values change, may need to change several places + where they are used to check that language is Fortran, not C. */ + +#ifndef FTN_TRUE +#define FTN_TRUE TRUE +#endif + +#ifndef FTN_FALSE +#define FTN_FALSE FALSE +#endif + +typedef void (*microtask_t)(int *gtid, int *npr, ...); + +#ifdef USE_VOLATILE_CAST +#define VOLATILE_CAST(x) (volatile x) +#else +#define VOLATILE_CAST(x) (x) +#endif + +#define KMP_WAIT __kmp_wait_4 +#define KMP_WAIT_PTR __kmp_wait_4_ptr +#define KMP_EQ __kmp_eq_4 +#define KMP_NEQ __kmp_neq_4 +#define KMP_LT __kmp_lt_4 +#define KMP_GE __kmp_ge_4 +#define KMP_LE __kmp_le_4 + +/* Workaround for Intel(R) 64 code gen bug when taking address of static array + * (Intel(R) 64 Tracker #138) */ +#if (KMP_ARCH_X86_64 || KMP_ARCH_PPC64) && KMP_OS_LINUX +#define STATIC_EFI2_WORKAROUND +#else +#define STATIC_EFI2_WORKAROUND static +#endif + +// Support of BGET usage +#ifndef KMP_USE_BGET +#define KMP_USE_BGET 1 +#endif + +// Switches for OSS builds +#ifndef USE_CMPXCHG_FIX +#define USE_CMPXCHG_FIX 1 +#endif + +// Enable dynamic user lock +#define KMP_USE_DYNAMIC_LOCK 1 + +// Enable Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) if +// dynamic user lock is turned on +#if KMP_USE_DYNAMIC_LOCK +// Visual studio can't handle the asm sections in this code +#define KMP_USE_TSX (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_COMPILER_MSVC +#ifdef KMP_USE_ADAPTIVE_LOCKS +#undef KMP_USE_ADAPTIVE_LOCKS +#endif +#define KMP_USE_ADAPTIVE_LOCKS KMP_USE_TSX +#endif + +// Enable tick time conversion of ticks to seconds +#if KMP_STATS_ENABLED +#define KMP_HAVE_TICK_TIME \ + (KMP_OS_LINUX && (KMP_MIC || KMP_ARCH_X86 || KMP_ARCH_X86_64)) +#endif + +// Warning levels +enum kmp_warnings_level { + kmp_warnings_off = 0, /* No warnings */ + kmp_warnings_low, /* Minimal warnings (default) */ + kmp_warnings_explicit = 6, /* Explicitly set to ON - more warnings */ + kmp_warnings_verbose /* reserved */ +}; + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +// Safe C API +#include "kmp_safe_c_api.h" + +// Macros for C++11 atomic functions +#define KMP_ATOMIC_LD(p, order) (p)->load(std::memory_order_##order) +#define KMP_ATOMIC_OP(op, p, v, order) (p)->op(v, 
std::memory_order_##order) + +// For non-default load/store +#define KMP_ATOMIC_LD_ACQ(p) KMP_ATOMIC_LD(p, acquire) +#define KMP_ATOMIC_LD_RLX(p) KMP_ATOMIC_LD(p, relaxed) +#define KMP_ATOMIC_ST_REL(p, v) KMP_ATOMIC_OP(store, p, v, release) +#define KMP_ATOMIC_ST_RLX(p, v) KMP_ATOMIC_OP(store, p, v, relaxed) + +// For non-default fetch_ +#define KMP_ATOMIC_ADD(p, v) KMP_ATOMIC_OP(fetch_add, p, v, acq_rel) +#define KMP_ATOMIC_SUB(p, v) KMP_ATOMIC_OP(fetch_sub, p, v, acq_rel) +#define KMP_ATOMIC_AND(p, v) KMP_ATOMIC_OP(fetch_and, p, v, acq_rel) +#define KMP_ATOMIC_OR(p, v) KMP_ATOMIC_OP(fetch_or, p, v, acq_rel) +#define KMP_ATOMIC_INC(p) KMP_ATOMIC_OP(fetch_add, p, 1, acq_rel) +#define KMP_ATOMIC_DEC(p) KMP_ATOMIC_OP(fetch_sub, p, 1, acq_rel) +#define KMP_ATOMIC_ADD_RLX(p, v) KMP_ATOMIC_OP(fetch_add, p, v, relaxed) +#define KMP_ATOMIC_INC_RLX(p) KMP_ATOMIC_OP(fetch_add, p, 1, relaxed) + +// Callers of the following functions cannot see the side effect on "expected". +template +bool __kmp_atomic_compare_store(std::atomic *p, T expected, T desired) { + return p->compare_exchange_strong( + expected, desired, std::memory_order_acq_rel, std::memory_order_relaxed); +} + +template +bool __kmp_atomic_compare_store_acq(std::atomic *p, T expected, T desired) { + return p->compare_exchange_strong( + expected, desired, std::memory_order_acquire, std::memory_order_relaxed); +} + +template +bool __kmp_atomic_compare_store_rel(std::atomic *p, T expected, T desired) { + return p->compare_exchange_strong( + expected, desired, std::memory_order_release, std::memory_order_relaxed); +} + +// Symbol lookup on Linux/Windows +#if KMP_OS_WINDOWS +extern void *__kmp_lookup_symbol(const char *name, bool next = false); +#define KMP_DLSYM(name) __kmp_lookup_symbol(name) +#define KMP_DLSYM_NEXT(name) __kmp_lookup_symbol(name, true) +#elif KMP_OS_WASI +#define KMP_DLSYM(name) nullptr +#define KMP_DLSYM_NEXT(name) nullptr +#elif defined(__COSMOPOLITAN__) +#define KMP_DLSYM(name) nullptr +#define KMP_DLSYM_NEXT(name) nullptr +#else +#define KMP_DLSYM(name) dlsym(RTLD_DEFAULT, name) +#define KMP_DLSYM_NEXT(name) dlsym(RTLD_NEXT, name) +#endif + +// MSVC doesn't have this, but clang/clang-cl does. +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +// Same as LLVM_BUILTIN_UNREACHABLE. States that it is UB to reach this point. +#if __has_builtin(__builtin_unreachable) || defined(__GNUC__) +#define KMP_BUILTIN_UNREACHABLE __builtin_unreachable() +#elif defined(_MSC_VER) +#define KMP_BUILTIN_UNREACHABLE __assume(false) +#else +#define KMP_BUILTIN_UNREACHABLE +#endif + +#endif /* KMP_OS_H */ diff --git a/third_party/openmp/kmp_platform.h b/third_party/openmp/kmp_platform.h new file mode 100644 index 000000000..ef008f0eb --- /dev/null +++ b/third_party/openmp/kmp_platform.h @@ -0,0 +1,261 @@ +/* + * kmp_platform.h -- header for determining operating system and architecture + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_PLATFORM_H +#define KMP_PLATFORM_H + +/* ---------------------- Operating system recognition ------------------- */ + +#define KMP_OS_LINUX 0 +#define KMP_OS_DRAGONFLY 0 +#define KMP_OS_FREEBSD 0 +#define KMP_OS_NETBSD 0 +#define KMP_OS_OPENBSD 0 +#define KMP_OS_DARWIN 0 +#define KMP_OS_WINDOWS 0 +#define KMP_OS_HURD 0 +#define KMP_OS_SOLARIS 0 +#define KMP_OS_WASI 0 +#define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. */ + +#ifdef _WIN32 +#undef KMP_OS_WINDOWS +#define KMP_OS_WINDOWS 1 +#endif + +#if (defined __APPLE__ && defined __MACH__) +#undef KMP_OS_DARWIN +#define KMP_OS_DARWIN 1 +#endif + +// in some ppc64 linux installations, only the second condition is met +#if (defined __linux) || defined(__COSMOPOLITAN__) +#undef KMP_OS_LINUX +#define KMP_OS_LINUX 1 +#elif (defined __linux__) +#undef KMP_OS_LINUX +#define KMP_OS_LINUX 1 +#else +#endif + +#if (defined __DragonFly__) +#undef KMP_OS_DRAGONFLY +#define KMP_OS_DRAGONFLY 1 +#endif + +#if (defined __FreeBSD__) +#undef KMP_OS_FREEBSD +#define KMP_OS_FREEBSD 1 +#endif + +#if (defined __NetBSD__) +#undef KMP_OS_NETBSD +#define KMP_OS_NETBSD 1 +#endif + +#if (defined __OpenBSD__) +#undef KMP_OS_OPENBSD +#define KMP_OS_OPENBSD 1 +#endif + +#if (defined __GNU__) +#undef KMP_OS_HURD +#define KMP_OS_HURD 1 +#endif + +#if (defined __sun__ && defined __svr4__) +#undef KMP_OS_SOLARIS +#define KMP_OS_SOLARIS 1 +#endif + +#if (defined __wasi__) || (defined __EMSCRIPTEN__) +#undef KMP_OS_WASI +#define KMP_OS_WASI 1 +#endif + +#if (defined _AIX) +#undef KMP_OS_AIX +#define KMP_OS_AIX 1 +#endif + +#if (1 != KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD + \ + KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD + \ + KMP_OS_SOLARIS + KMP_OS_WASI + KMP_OS_AIX) +#error Unknown OS +#endif + +#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_SOLARIS || \ + KMP_OS_WASI || KMP_OS_AIX +#undef KMP_OS_UNIX +#define KMP_OS_UNIX 1 +#endif + +/* ---------------------- Architecture recognition ------------------- */ + +#define KMP_ARCH_X86 0 +#define KMP_ARCH_X86_64 0 +#define KMP_ARCH_AARCH64 0 +#define KMP_ARCH_PPC64_ELFv1 0 +#define KMP_ARCH_PPC64_ELFv2 0 +#define KMP_ARCH_PPC64_XCOFF 0 +#define KMP_ARCH_PPC_XCOFF 0 +#define KMP_ARCH_MIPS 0 +#define KMP_ARCH_MIPS64 0 +#define KMP_ARCH_RISCV64 0 +#define KMP_ARCH_LOONGARCH64 0 +#define KMP_ARCH_VE 0 +#define KMP_ARCH_S390X 0 + +#if KMP_OS_WINDOWS +#if defined(_M_AMD64) || defined(__x86_64) +#undef KMP_ARCH_X86_64 +#define KMP_ARCH_X86_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#undef KMP_ARCH_AARCH64 +#define KMP_ARCH_AARCH64 1 +#elif defined(__arm__) || defined(_M_ARM) +#undef KMP_ARCH_ARMV7 +#define KMP_ARCH_ARMV7 1 +#else +#undef KMP_ARCH_X86 +#define KMP_ARCH_X86 1 +#endif +#endif + +#if KMP_OS_UNIX +#if defined __x86_64 +#undef KMP_ARCH_X86_64 +#define KMP_ARCH_X86_64 1 +#elif defined __i386 +#undef KMP_ARCH_X86 +#define KMP_ARCH_X86 1 +#elif defined __powerpc64__ +#if defined(_CALL_ELF) +#if _CALL_ELF == 2 +#undef KMP_ARCH_PPC64_ELFv2 +#define KMP_ARCH_PPC64_ELFv2 1 +#else +#undef KMP_ARCH_PPC64_ELFv1 +#define KMP_ARCH_PPC64_ELFv1 1 +#endif +#elif defined KMP_OS_AIX +#undef KMP_ARCH_PPC64_XCOFF +#define KMP_ARCH_PPC64_XCOFF 1 +#endif +#elif defined(__powerpc__) && defined(KMP_OS_AIX) +#undef 
KMP_ARCH_PPC_XCOFF +#define KMP_ARCH_PPC_XCOFF 1 +#undef KMP_ARCH_PPC +#define KMP_ARCH_PPC 1 +#elif defined __aarch64__ +#undef KMP_ARCH_AARCH64 +#define KMP_ARCH_AARCH64 1 +#elif defined __mips__ +#if defined __mips64 +#undef KMP_ARCH_MIPS64 +#define KMP_ARCH_MIPS64 1 +#else +#undef KMP_ARCH_MIPS +#define KMP_ARCH_MIPS 1 +#endif +#elif defined __riscv && __riscv_xlen == 64 +#undef KMP_ARCH_RISCV64 +#define KMP_ARCH_RISCV64 1 +#elif defined __loongarch__ && __loongarch_grlen == 64 +#undef KMP_ARCH_LOONGARCH64 +#define KMP_ARCH_LOONGARCH64 1 +#elif defined __ve__ +#undef KMP_ARCH_VE +#define KMP_ARCH_VE 1 +#elif defined __s390x__ +#undef KMP_ARCH_S390X +#define KMP_ARCH_S390X 1 +#endif +#endif + +#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7VE__) +#define KMP_ARCH_ARMV7 1 +#endif + +#if defined(KMP_ARCH_ARMV7) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6ZK__) +#define KMP_ARCH_ARMV6 1 +#endif + +#if defined(KMP_ARCH_ARMV6) || defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +#define KMP_ARCH_ARMV5 1 +#endif + +#if defined(KMP_ARCH_ARMV5) || defined(__ARM_ARCH_4__) || \ + defined(__ARM_ARCH_4T__) +#define KMP_ARCH_ARMV4 1 +#endif + +#if defined(KMP_ARCH_ARMV4) || defined(__ARM_ARCH_3__) || \ + defined(__ARM_ARCH_3M__) +#define KMP_ARCH_ARMV3 1 +#endif + +#if defined(KMP_ARCH_ARMV3) || defined(__ARM_ARCH_2__) +#define KMP_ARCH_ARMV2 1 +#endif + +#if defined(KMP_ARCH_ARMV2) +#define KMP_ARCH_ARM 1 +#endif + +#if defined(__wasm32__) +#define KMP_ARCH_WASM 1 +#endif + +#define KMP_ARCH_PPC64 \ + (KMP_ARCH_PPC64_ELFv2 || KMP_ARCH_PPC64_ELFv1 || KMP_ARCH_PPC64_XCOFF) + +#if defined(__MIC__) || defined(__MIC2__) +#define KMP_MIC 1 +#if __MIC2__ || __KNC__ +#define KMP_MIC1 0 +#define KMP_MIC2 1 +#else +#define KMP_MIC1 1 +#define KMP_MIC2 0 +#endif +#else +#define KMP_MIC 0 +#define KMP_MIC1 0 +#define KMP_MIC2 0 +#endif + +/* Specify 32 bit architectures here */ +#define KMP_32_BIT_ARCH \ + (KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \ + KMP_ARCH_PPC) + +// Platforms which support Intel(R) Many Integrated Core Architecture +#define KMP_MIC_SUPPORTED \ + ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS)) + +// TODO: Fixme - This is clever, but really fugly +#if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \ + KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \ + KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE + \ + KMP_ARCH_S390X + KMP_ARCH_WASM + KMP_ARCH_PPC) +#error Unknown or unsupported architecture +#endif + +#endif // KMP_PLATFORM_H diff --git a/third_party/openmp/kmp_runtime.cpp b/third_party/openmp/kmp_runtime.cpp new file mode 100644 index 000000000..8161a2896 --- /dev/null +++ b/third_party/openmp/kmp_runtime.cpp @@ -0,0 +1,9351 @@ +/* + * kmp_runtime.cpp -- KPTS runtime support library + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_affinity.h" +#include "kmp_atomic.h" +#include "kmp_environment.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_io.h" +#include "kmp_itt.h" +#include "kmp_settings.h" +#include "kmp_stats.h" +#include "kmp_str.h" +#include "kmp_wait_release.h" +#include "kmp_wrapper_getpid.h" +#include "kmp_dispatch.h" +#include "kmp_utils.h" +#if KMP_USE_HIER_SCHED +#include "kmp_dispatch_hier.h" +#endif + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif +#if OMPD_SUPPORT +#include "ompd-specific.h" +#endif + +#if OMP_PROFILING_SUPPORT +// #include "llvm/Support/TimeProfiler.h" +static char *ProfileTraceFile = nullptr; +#endif + +/* these are temporary issues to be dealt with */ +#define KMP_USE_PRCTL 0 + +#if KMP_OS_WINDOWS +#include +#endif + +#ifndef KMP_USE_SHM +// Windows and WASI do not need these include files as they don't use shared +// memory. +#else +#include +#include +#include +#define SHM_SIZE 1024 +#endif + +#if defined(KMP_GOMP_COMPAT) +char const __kmp_version_alt_comp[] = + KMP_VERSION_PREFIX "alternative compiler support: yes"; +#endif /* defined(KMP_GOMP_COMPAT) */ + +char const __kmp_version_omp_api[] = + KMP_VERSION_PREFIX "API version: 5.0 (201611)"; + +#ifdef KMP_DEBUG +char const __kmp_version_lock[] = + KMP_VERSION_PREFIX "lock type: run time selectable"; +#endif /* KMP_DEBUG */ + +#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) + +/* ------------------------------------------------------------------------ */ + +#if KMP_USE_MONITOR +kmp_info_t __kmp_monitor; +#endif + +/* Forward declarations */ + +void __kmp_cleanup(void); + +static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, + int gtid); +static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, + kmp_internal_control_t *new_icvs, + ident_t *loc); +#if KMP_AFFINITY_SUPPORTED +static void __kmp_partition_places(kmp_team_t *team, + int update_master_only = 0); +#endif +static void __kmp_do_serial_initialize(void); +void __kmp_fork_barrier(int gtid, int tid); +void __kmp_join_barrier(int gtid); +void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, + kmp_internal_control_t *new_icvs, ident_t *loc); + +#ifdef USE_LOAD_BALANCE +static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); +#endif + +static int __kmp_expand_threads(int nNeed); +#if KMP_OS_WINDOWS +static int __kmp_unregister_root_other_thread(int gtid); +#endif +static void __kmp_reap_thread(kmp_info_t *thread, int is_root); +kmp_info_t *__kmp_thread_pool_insert_pt = NULL; + +void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, + int new_nthreads); +void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); + +/* Calculate the identifier of the current thread */ +/* fast (and somewhat portable) way to get unique identifier of executing + thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ +int __kmp_get_global_thread_id() { + int i; + kmp_info_t **other_threads; + size_t stack_data; + char *stack_addr; + size_t stack_size; + char *stack_base; + + KA_TRACE( + 1000, + ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", + __kmp_nth, __kmp_all_nth)); + + /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to + a parallel region, made it return KMP_GTID_DNE to force serial_initialize + by caller. 
Had to handle KMP_GTID_DNE at all call-sites, or else guarantee + __kmp_init_gtid for this to work. */ + + if (!TCR_4(__kmp_init_gtid)) + return KMP_GTID_DNE; + +#ifdef KMP_TDATA_GTID + if (TCR_4(__kmp_gtid_mode) >= 3) { + KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); + return __kmp_gtid; + } +#endif + if (TCR_4(__kmp_gtid_mode) >= 2) { + KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); + return __kmp_gtid_get_specific(); + } + KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); + + stack_addr = (char *)&stack_data; + other_threads = __kmp_threads; + + /* ATT: The code below is a source of potential bugs due to unsynchronized + access to __kmp_threads array. For example: + 1. Current thread loads other_threads[i] to thr and checks it, it is + non-NULL. + 2. Current thread is suspended by OS. + 3. Another thread unregisters and finishes (debug versions of free() + may fill memory with something like 0xEF). + 4. Current thread is resumed. + 5. Current thread reads junk from *thr. + TODO: Fix it. --ln */ + + for (i = 0; i < __kmp_threads_capacity; i++) { + + kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); + if (!thr) + continue; + + stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); + stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); + + /* stack grows down -- search through all of the active threads */ + + if (stack_addr <= stack_base) { + size_t stack_diff = stack_base - stack_addr; + + if (stack_diff <= stack_size) { + /* The only way we can be closer than the allocated */ + /* stack size is if we are running on this thread. */ + // __kmp_gtid_get_specific can return negative value because this + // function can be called by thread destructor. However, before the + // thread destructor is called, the value of the corresponding + // thread-specific data will be reset to NULL. + KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 || + __kmp_gtid_get_specific() == i); + return i; + } + } + } + + /* get specific to try and determine our gtid */ + KA_TRACE(1000, + ("*** __kmp_get_global_thread_id: internal alg. failed to find " + "thread, using TLS\n")); + i = __kmp_gtid_get_specific(); + + /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ + + /* if we havn't been assigned a gtid, then return code */ + if (i < 0) + return i; + + // other_threads[i] can be nullptr at this point because the corresponding + // thread could have already been destructed. It can happen when this function + // is called in end library routine. 
+ if (!TCR_SYNC_PTR(other_threads[i])) + return i; + + /* dynamically updated stack window for uber threads to avoid get_specific + call */ + if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { + KMP_FATAL(StackOverflow, i); + } + + stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; + if (stack_addr > stack_base) { + TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); + TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, + other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - + stack_base); + } else { + TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, + stack_base - stack_addr); + } + + /* Reprint stack bounds for ubermaster since they have been refined */ + if (__kmp_storage_map) { + char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; + char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; + __kmp_print_storage_map_gtid(i, stack_beg, stack_end, + other_threads[i]->th.th_info.ds.ds_stacksize, + "th_%d stack (refinement)", i); + } + return i; +} + +int __kmp_get_global_thread_id_reg() { + int gtid; + + if (!__kmp_init_serial) { + gtid = KMP_GTID_DNE; + } else +#ifdef KMP_TDATA_GTID + if (TCR_4(__kmp_gtid_mode) >= 3) { + KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); + gtid = __kmp_gtid; + } else +#endif + if (TCR_4(__kmp_gtid_mode) >= 2) { + KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); + gtid = __kmp_gtid_get_specific(); + } else { + KA_TRACE(1000, + ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); + gtid = __kmp_get_global_thread_id(); + } + + /* we must be a new uber master sibling thread */ + if (gtid == KMP_GTID_DNE) { + KA_TRACE(10, + ("__kmp_get_global_thread_id_reg: Encountered new root thread. " + "Registering a new gtid.\n")); + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (!__kmp_init_serial) { + __kmp_do_serial_initialize(); + gtid = __kmp_gtid_get_specific(); + } else { + gtid = __kmp_register_root(FALSE); + } + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ + } + + KMP_DEBUG_ASSERT(gtid >= 0); + + return gtid; +} + +/* caller must hold forkjoin_lock */ +void __kmp_check_stack_overlap(kmp_info_t *th) { + int f; + char *stack_beg = NULL; + char *stack_end = NULL; + int gtid; + + KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); + if (__kmp_storage_map) { + stack_end = (char *)th->th.th_info.ds.ds_stackbase; + stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; + + gtid = __kmp_gtid_from_thread(th); + + if (gtid == KMP_GTID_MONITOR) { + __kmp_print_storage_map_gtid( + gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, + "th_%s stack (%s)", "mon", + (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); + } else { + __kmp_print_storage_map_gtid( + gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, + "th_%d stack (%s)", gtid, + (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); + } + } + + /* No point in checking ubermaster threads since they use refinement and + * cannot overlap */ + gtid = __kmp_gtid_from_thread(th); + if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { + KA_TRACE(10, + ("__kmp_check_stack_overlap: performing extensive checking\n")); + if (stack_beg == NULL) { + stack_end = (char *)th->th.th_info.ds.ds_stackbase; + stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; + } + + for (f = 0; f < __kmp_threads_capacity; f++) { + kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); + + if (f_th && f_th != th) { + char *other_stack_end = + (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); + char *other_stack_beg = + other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); + if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || + (stack_end > other_stack_beg && stack_end < other_stack_end)) { + + /* Print the other stack values before the abort */ + if (__kmp_storage_map) + __kmp_print_storage_map_gtid( + -1, other_stack_beg, other_stack_end, + (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), + "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); + + __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), + __kmp_msg_null); + } + } + } + } + KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); +} + +/* ------------------------------------------------------------------------ */ + +void __kmp_infinite_loop(void) { + static int done = FALSE; + + while (!done) { + KMP_YIELD(TRUE); + } +} + +#define MAX_MESSAGE 512 + +void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, + char const *format, ...) { + char buffer[MAX_MESSAGE]; + va_list ap; + + va_start(ap, format); + KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, + p2, (unsigned long)size, format); + __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); + __kmp_vprintf(kmp_err, buffer, ap); +#if KMP_PRINT_DATA_PLACEMENT + int node; + if (gtid >= 0) { + if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { + if (__kmp_storage_map_verbose) { + node = __kmp_get_host_node(p1); + if (node < 0) /* doesn't work, so don't try this next time */ + __kmp_storage_map_verbose = FALSE; + else { + char *last; + int lastNode; + int localProc = __kmp_get_cpu_from_gtid(gtid); + + const int page_size = KMP_GET_PAGE_SIZE(); + + p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); + p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); + if (localProc >= 0) + __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, + localProc >> 1); + else + __kmp_printf_no_lock(" GTID %d\n", gtid); +#if KMP_USE_PRCTL + /* The more elaborate format is disabled for now because of the prctl + * hanging bug. */ + do { + last = p1; + lastNode = node; + /* This loop collates adjacent pages with the same host node. */ + do { + (char *)p1 += page_size; + } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); + __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, + lastNode); + } while (p1 <= p2); +#else + __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, + (char *)p1 + (page_size - 1), + __kmp_get_host_node(p1)); + if (p1 < p2) { + __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, + (char *)p2 + (page_size - 1), + __kmp_get_host_node(p2)); + } +#endif + } + } + } else + __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); + } +#endif /* KMP_PRINT_DATA_PLACEMENT */ + __kmp_release_bootstrap_lock(&__kmp_stdio_lock); + + va_end(ap); +} + +void __kmp_warn(char const *format, ...) 
{ + char buffer[MAX_MESSAGE]; + va_list ap; + + if (__kmp_generate_warnings == kmp_warnings_off) { + return; + } + + va_start(ap, format); + + KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); + __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); + __kmp_vprintf(kmp_err, buffer, ap); + __kmp_release_bootstrap_lock(&__kmp_stdio_lock); + + va_end(ap); +} + +void __kmp_abort_process() { + // Later threads may stall here, but that's ok because abort() will kill them. + __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); + + if (__kmp_debug_buf) { + __kmp_dump_debug_buffer(); + } + +#if KMP_OS_WINDOWS + // Let other threads know of abnormal termination and prevent deadlock + // if abort happened during library initialization or shutdown + __kmp_global.g.g_abort = SIGABRT; + + /* On Windows* OS by default abort() causes pop-up error box, which stalls + nightly testing. Unfortunately, we cannot reliably suppress pop-up error + boxes. _set_abort_behavior() works well, but this function is not + available in VS7 (this is not problem for DLL, but it is a problem for + static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not + help, at least in some versions of MS C RTL. + + It seems following sequence is the only way to simulate abort() and + avoid pop-up error box. */ + raise(SIGABRT); + _exit(3); // Just in case, if signal ignored, exit anyway. +#else + __kmp_unregister_library(); + abort(); +#endif + + __kmp_infinite_loop(); + __kmp_release_bootstrap_lock(&__kmp_exit_lock); + +} // __kmp_abort_process + +void __kmp_abort_thread(void) { + // TODO: Eliminate g_abort global variable and this function. + // In case of abort just call abort(), it will kill all the threads. + __kmp_infinite_loop(); +} // __kmp_abort_thread + +/* Print out the storage map for the major kmp_info_t thread data structures + that are allocated together. */ + +static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { + __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", + gtid); + + __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, + sizeof(kmp_desc_t), "th_%d.th_info", gtid); + + __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, + sizeof(kmp_local_t), "th_%d.th_local", gtid); + + __kmp_print_storage_map_gtid( + gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], + sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); + + __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], + &thr->th.th_bar[bs_plain_barrier + 1], + sizeof(kmp_balign_t), "th_%d.th_bar[plain]", + gtid); + + __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], + &thr->th.th_bar[bs_forkjoin_barrier + 1], + sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", + gtid); + +#if KMP_FAST_REDUCTION_BARRIER + __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], + &thr->th.th_bar[bs_reduction_barrier + 1], + sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", + gtid); +#endif // KMP_FAST_REDUCTION_BARRIER +} + +/* Print out the storage map for the major kmp_team_t team data structures + that are allocated together. */ + +static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, + int team_id, int num_thr) { + int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; + __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", + header, team_id); + + __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], + &team->t.t_bar[bs_last_barrier], + sizeof(kmp_balign_team_t) * bs_last_barrier, + "%s_%d.t_bar", header, team_id); + + __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], + &team->t.t_bar[bs_plain_barrier + 1], + sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", + header, team_id); + + __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], + &team->t.t_bar[bs_forkjoin_barrier + 1], + sizeof(kmp_balign_team_t), + "%s_%d.t_bar[forkjoin]", header, team_id); + +#if KMP_FAST_REDUCTION_BARRIER + __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], + &team->t.t_bar[bs_reduction_barrier + 1], + sizeof(kmp_balign_team_t), + "%s_%d.t_bar[reduction]", header, team_id); +#endif // KMP_FAST_REDUCTION_BARRIER + + __kmp_print_storage_map_gtid( + -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], + sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); + + __kmp_print_storage_map_gtid( + -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], + sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); + + __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], + &team->t.t_disp_buffer[num_disp_buff], + sizeof(dispatch_shared_info_t) * num_disp_buff, + "%s_%d.t_disp_buffer", header, team_id); +} + +static void __kmp_init_allocator() { + __kmp_init_memkind(); + __kmp_init_target_mem(); +} +static void __kmp_fini_allocator() { __kmp_fini_memkind(); } + +/* ------------------------------------------------------------------------ */ + +#if ENABLE_LIBOMPTARGET +static void __kmp_init_omptarget() { + __kmp_init_target_task(); +} +#endif + +/* ------------------------------------------------------------------------ */ + +#if KMP_DYNAMIC_LIB +#if KMP_OS_WINDOWS + +BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { + //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); + + switch (fdwReason) { + + case DLL_PROCESS_ATTACH: + KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); + + return TRUE; + + case DLL_PROCESS_DETACH: + KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); + + // According to Windows* documentation for DllMain entry point: + // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: + // lpReserved == NULL when FreeLibrary() is called, + // lpReserved != NULL when the process is terminated. + // When FreeLibrary() is called, worker threads remain alive. So the + // runtime's state is consistent and executing proper shutdown is OK. + // When the process is terminated, worker threads have exited or been + // forcefully terminated by the OS and only the shutdown thread remains. + // This can leave the runtime in an inconsistent state. + // Hence, only attempt proper cleanup when FreeLibrary() is called. + // Otherwise, rely on OS to reclaim resources. 
+ if (lpReserved == NULL) + __kmp_internal_end_library(__kmp_gtid_get_specific()); + + return TRUE; + + case DLL_THREAD_ATTACH: + KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); + + /* if we want to register new siblings all the time here call + * __kmp_get_gtid(); */ + return TRUE; + + case DLL_THREAD_DETACH: + KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); + + __kmp_internal_end_thread(__kmp_gtid_get_specific()); + return TRUE; + } + + return TRUE; +} + +#endif /* KMP_OS_WINDOWS */ +#endif /* KMP_DYNAMIC_LIB */ + +/* __kmp_parallel_deo -- Wait until it's our turn. */ +void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { + int gtid = *gtid_ref; +#ifdef BUILD_PARALLEL_ORDERED + kmp_team_t *team = __kmp_team_from_gtid(gtid); +#endif /* BUILD_PARALLEL_ORDERED */ + + if (__kmp_env_consistency_check) { + if (__kmp_threads[gtid]->th.th_root->r.r_active) +#if KMP_USE_DYNAMIC_LOCK + __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); +#else + __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); +#endif + } +#ifdef BUILD_PARALLEL_ORDERED + if (!team->t.t_serialized) { + KMP_MB(); + KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, + NULL); + KMP_MB(); + } +#endif /* BUILD_PARALLEL_ORDERED */ +} + +/* __kmp_parallel_dxo -- Signal the next task. */ +void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { + int gtid = *gtid_ref; +#ifdef BUILD_PARALLEL_ORDERED + int tid = __kmp_tid_from_gtid(gtid); + kmp_team_t *team = __kmp_team_from_gtid(gtid); +#endif /* BUILD_PARALLEL_ORDERED */ + + if (__kmp_env_consistency_check) { + if (__kmp_threads[gtid]->th.th_root->r.r_active) + __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); + } +#ifdef BUILD_PARALLEL_ORDERED + if (!team->t.t_serialized) { + KMP_MB(); /* Flush all pending memory write invalidates. */ + + /* use the tid of the next thread in this team */ + /* TODO replace with general release procedure */ + team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + } +#endif /* BUILD_PARALLEL_ORDERED */ +} + +/* ------------------------------------------------------------------------ */ +/* The BARRIER for a SINGLE process section is always explicit */ + +int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { + int status; + kmp_info_t *th; + kmp_team_t *team; + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + th = __kmp_threads[gtid]; + team = th->th.th_team; + status = 0; + + th->th.th_ident = id_ref; + + if (team->t.t_serialized) { + status = 1; + } else { + kmp_int32 old_this = th->th.th_local.this_construct; + + ++th->th.th_local.this_construct; + /* try to set team count to thread count--success means thread got the + single block */ + /* TODO: Should this be acquire or release? 
*/ + if (team->t.t_construct == old_this) { + status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, + th->th.th_local.this_construct); + } +#if USE_ITT_BUILD + if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && + KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && + team->t.t_active_level == 1) { + // Only report metadata by primary thread of active team at level 1 + __kmp_itt_metadata_single(id_ref); + } +#endif /* USE_ITT_BUILD */ + } + + if (__kmp_env_consistency_check) { + if (status && push_ws) { + __kmp_push_workshare(gtid, ct_psingle, id_ref); + } else { + __kmp_check_workshare(gtid, ct_psingle, id_ref); + } + } +#if USE_ITT_BUILD + if (status) { + __kmp_itt_single_start(gtid); + } +#endif /* USE_ITT_BUILD */ + return status; +} + +void __kmp_exit_single(int gtid) { +#if USE_ITT_BUILD + __kmp_itt_single_end(gtid); +#endif /* USE_ITT_BUILD */ + if (__kmp_env_consistency_check) + __kmp_pop_workshare(gtid, ct_psingle, NULL); +} + +/* determine if we can go parallel or must use a serialized parallel region and + * how many threads we can use + * set_nproc is the number of threads requested for the team + * returns 0 if we should serialize or only use one thread, + * otherwise the number of threads to use + * The forkjoin lock is held by the caller. */ +static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, + int master_tid, int set_nthreads, + int enter_teams) { + int capacity; + int new_nthreads; + KMP_DEBUG_ASSERT(__kmp_init_serial); + KMP_DEBUG_ASSERT(root && parent_team); + kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; + + // If dyn-var is set, dynamically adjust the number of desired threads, + // according to the method specified by dynamic_mode. + new_nthreads = set_nthreads; + if (!get__dynamic_2(parent_team, master_tid)) { + ; + } +#ifdef USE_LOAD_BALANCE + else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { + new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); + if (new_nthreads == 1) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " + "reservation to 1 thread\n", + master_tid)); + return 1; + } + if (new_nthreads < set_nthreads) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " + "reservation to %d threads\n", + master_tid, new_nthreads)); + } + } +#endif /* USE_LOAD_BALANCE */ + else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { + new_nthreads = __kmp_avail_proc - __kmp_nth + + (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); + if (new_nthreads <= 1) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " + "reservation to 1 thread\n", + master_tid)); + return 1; + } + if (new_nthreads < set_nthreads) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " + "reservation to %d threads\n", + master_tid, new_nthreads)); + } else { + new_nthreads = set_nthreads; + } + } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { + if (set_nthreads > 2) { + new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); + new_nthreads = (new_nthreads % set_nthreads) + 1; + if (new_nthreads == 1) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " + "reservation to 1 thread\n", + master_tid)); + return 1; + } + if (new_nthreads < set_nthreads) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " + "reservation to %d threads\n", + master_tid, new_nthreads)); + } + } + } else { + KMP_ASSERT(0); + } + + // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. + if (__kmp_nth + new_nthreads - + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > + __kmp_max_nth) { + int tl_nthreads = __kmp_max_nth - __kmp_nth + + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); + if (tl_nthreads <= 0) { + tl_nthreads = 1; + } + + // If dyn-var is false, emit a 1-time warning. + if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { + __kmp_reserve_warn = 1; + __kmp_msg(kmp_ms_warning, + KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), + KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); + } + if (tl_nthreads == 1) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " + "reduced reservation to 1 thread\n", + master_tid)); + return 1; + } + KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " + "reservation to %d threads\n", + master_tid, tl_nthreads)); + new_nthreads = tl_nthreads; + } + + // Respect OMP_THREAD_LIMIT + int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; + int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; + if (cg_nthreads + new_nthreads - + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > + max_cg_threads) { + int tl_nthreads = max_cg_threads - cg_nthreads + + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); + if (tl_nthreads <= 0) { + tl_nthreads = 1; + } + + // If dyn-var is false, emit a 1-time warning. + if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { + __kmp_reserve_warn = 1; + __kmp_msg(kmp_ms_warning, + KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), + KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); + } + if (tl_nthreads == 1) { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " + "reduced reservation to 1 thread\n", + master_tid)); + return 1; + } + KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " + "reservation to %d threads\n", + master_tid, tl_nthreads)); + new_nthreads = tl_nthreads; + } + + // Check if the threads array is large enough, or needs expanding. + // See comment in __kmp_register_root() about the adjustment if + // __kmp_threads[0] == NULL. + capacity = __kmp_threads_capacity; + if (TCR_PTR(__kmp_threads[0]) == NULL) { + --capacity; + } + // If it is not for initializing the hidden helper team, we need to take + // __kmp_hidden_helper_threads_num out of the capacity because it is included + // in __kmp_threads_capacity. 
+ if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { + capacity -= __kmp_hidden_helper_threads_num; + } + if (__kmp_nth + new_nthreads - + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > + capacity) { + // Expand the threads array. + int slotsRequired = __kmp_nth + new_nthreads - + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - + capacity; + int slotsAdded = __kmp_expand_threads(slotsRequired); + if (slotsAdded < slotsRequired) { + // The threads array was not expanded enough. + new_nthreads -= (slotsRequired - slotsAdded); + KMP_ASSERT(new_nthreads >= 1); + + // If dyn-var is false, emit a 1-time warning. + if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { + __kmp_reserve_warn = 1; + if (__kmp_tp_cached) { + __kmp_msg(kmp_ms_warning, + KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), + KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), + KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); + } else { + __kmp_msg(kmp_ms_warning, + KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), + KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); + } + } + } + } + +#ifdef KMP_DEBUG + if (new_nthreads == 1) { + KC_TRACE(10, + ("__kmp_reserve_threads: T#%d serializing team after reclaiming " + "dead roots and rechecking; requested %d threads\n", + __kmp_get_gtid(), set_nthreads)); + } else { + KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" + " %d threads\n", + __kmp_get_gtid(), new_nthreads, set_nthreads)); + } +#endif // KMP_DEBUG + return new_nthreads; +} + +/* Allocate threads from the thread pool and assign them to the new team. We are + assured that there are enough threads available, because we checked on that + earlier within critical section forkjoin */ +static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, + kmp_info_t *master_th, int master_gtid, + int fork_teams_workers) { + int i; + int use_hot_team; + + KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); + KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); + KMP_MB(); + + /* first, let's setup the primary thread */ + master_th->th.th_info.ds.ds_tid = 0; + master_th->th.th_team = team; + master_th->th.th_team_nproc = team->t.t_nproc; + master_th->th.th_team_master = master_th; + master_th->th.th_team_serialized = FALSE; + master_th->th.th_dispatch = &team->t.t_dispatch[0]; + +/* make sure we are not the optimized hot team */ +#if KMP_NESTED_HOT_TEAMS + use_hot_team = 0; + kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; + if (hot_teams) { // hot teams array is not allocated if + // KMP_HOT_TEAMS_MAX_LEVEL=0 + int level = team->t.t_active_level - 1; // index in array of hot teams + if (master_th->th.th_teams_microtask) { // are we inside the teams? 
+ if (master_th->th.th_teams_size.nteams > 1) { + ++level; // level was not increased in teams construct for + // team_of_masters + } + if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && + master_th->th.th_teams_level == team->t.t_level) { + ++level; // level was not increased in teams construct for + // team_of_workers before the parallel + } // team->t.t_level will be increased inside parallel + } + if (level < __kmp_hot_teams_max_level) { + if (hot_teams[level].hot_team) { + // hot team has already been allocated for given level + KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); + use_hot_team = 1; // the team is ready to use + } else { + use_hot_team = 0; // AC: threads are not allocated yet + hot_teams[level].hot_team = team; // remember new hot team + hot_teams[level].hot_team_nth = team->t.t_nproc; + } + } else { + use_hot_team = 0; + } + } +#else + use_hot_team = team == root->r.r_hot_team; +#endif + if (!use_hot_team) { + + /* install the primary thread */ + team->t.t_threads[0] = master_th; + __kmp_initialize_info(master_th, team, 0, master_gtid); + + /* now, install the worker threads */ + for (i = 1; i < team->t.t_nproc; i++) { + + /* fork or reallocate a new thread and install it in team */ + kmp_info_t *thr = __kmp_allocate_thread(root, team, i); + team->t.t_threads[i] = thr; + KMP_DEBUG_ASSERT(thr); + KMP_DEBUG_ASSERT(thr->th.th_team == team); + /* align team and thread arrived states */ + KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " + "T#%d(%d:%d) join =%llu, plain=%llu\n", + __kmp_gtid_from_tid(0, team), team->t.t_id, 0, + __kmp_gtid_from_tid(i, team), team->t.t_id, i, + team->t.t_bar[bs_forkjoin_barrier].b_arrived, + team->t.t_bar[bs_plain_barrier].b_arrived)); + thr->th.th_teams_microtask = master_th->th.th_teams_microtask; + thr->th.th_teams_level = master_th->th.th_teams_level; + thr->th.th_teams_size = master_th->th.th_teams_size; + { // Initialize threads' barrier data. + int b; + kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; + for (b = 0; b < bs_last_barrier; ++b) { + balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); +#if USE_DEBUGGER + balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; +#endif + } + } + } + +#if KMP_AFFINITY_SUPPORTED + // Do not partition the places list for teams construct workers who + // haven't actually been forked to do real work yet. This partitioning + // will take place in the parallel region nested within the teams construct. + if (!fork_teams_workers) { + __kmp_partition_places(team); + } +#endif + + if (team->t.t_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + team->t.b->update_num_threads(team->t.t_nproc); + __kmp_add_threads_to_team(team, team->t.t_nproc); + } + } + + if (__kmp_display_affinity && team->t.t_display_affinity != 1) { + for (i = 0; i < team->t.t_nproc; i++) { + kmp_info_t *thr = team->t.t_threads[i]; + if (thr->th.th_prev_num_threads != team->t.t_nproc || + thr->th.th_prev_level != team->t.t_level) { + team->t.t_display_affinity = 1; + break; + } + } + } + + KMP_MB(); +} + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// Propagate any changes to the floating point control registers out to the team +// We try to avoid unnecessary writes to the relevant cache line in the team +// structure, so we don't make changes unless they are needed. 
+inline static void propagateFPControl(kmp_team_t *team) { + if (__kmp_inherit_fp_control) { + kmp_int16 x87_fpu_control_word; + kmp_uint32 mxcsr; + + // Get primary thread's values of FPU control flags (both X87 and vector) + __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); + __kmp_store_mxcsr(&mxcsr); + mxcsr &= KMP_X86_MXCSR_MASK; + + // There is no point looking at t_fp_control_saved here. + // If it is TRUE, we still have to update the values if they are different + // from those we now have. If it is FALSE we didn't save anything yet, but + // our objective is the same. We have to ensure that the values in the team + // are the same as those we have. + // So, this code achieves what we need whether or not t_fp_control_saved is + // true. By checking whether the value needs updating we avoid unnecessary + // writes that would put the cache-line into a written state, causing all + // threads in the team to have to read it again. + KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); + KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); + // Although we don't use this value, other code in the runtime wants to know + // whether it should restore them. So we must ensure it is correct. + KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); + } else { + // Similarly here. Don't write to this cache-line in the team structure + // unless we have to. + KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); + } +} + +// Do the opposite, setting the hardware registers to the updated values from +// the team. +inline static void updateHWFPControl(kmp_team_t *team) { + if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { + // Only reset the fp control regs if they have been changed in the team. + // the parallel region that we are exiting. + kmp_int16 x87_fpu_control_word; + kmp_uint32 mxcsr; + __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); + __kmp_store_mxcsr(&mxcsr); + mxcsr &= KMP_X86_MXCSR_MASK; + + if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { + __kmp_clear_x87_fpu_status_word(); + __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); + } + + if (team->t.t_mxcsr != mxcsr) { + __kmp_load_mxcsr(&team->t.t_mxcsr); + } + } +} +#else +#define propagateFPControl(x) ((void)0) +#define updateHWFPControl(x) ((void)0) +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, + int realloc); // forward declaration + +/* Run a parallel region that has been serialized, so runs only in a team of the + single primary thread. 
*/ +void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { + kmp_info_t *this_thr; + kmp_team_t *serial_team; + + KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); + + /* Skip all this code for autopar serialized loops since it results in + unacceptable overhead */ + if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) + return; + + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + this_thr = __kmp_threads[global_tid]; + serial_team = this_thr->th.th_serial_team; + + /* utilize the serialized team held by this thread */ + KMP_DEBUG_ASSERT(serial_team); + KMP_MB(); + + if (__kmp_tasking_mode != tskm_immediate_exec) { + KMP_DEBUG_ASSERT( + this_thr->th.th_task_team == + this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); + KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == + NULL); + KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " + "team %p, new task_team = NULL\n", + global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); + this_thr->th.th_task_team = NULL; + } + + kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; + if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { + proc_bind = proc_bind_false; + } else if (proc_bind == proc_bind_default) { + // No proc_bind clause was specified, so use the current value + // of proc-bind-var for this parallel region. + proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; + } + // Reset for next parallel region + this_thr->th.th_set_proc_bind = proc_bind_default; + + // Reset num_threads for next parallel region + this_thr->th.th_set_nproc = 0; + +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data = ompt_data_none; + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); + if (ompt_enabled.enabled && + this_thr->th.ompt_thread_info.state != ompt_state_overhead) { + + ompt_task_info_t *parent_task_info; + parent_task_info = OMPT_CUR_TASK_INFO(this_thr); + + parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + if (ompt_enabled.ompt_callback_parallel_begin) { + int team_size = 1; + + ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( + &(parent_task_info->task_data), &(parent_task_info->frame), + &ompt_parallel_data, team_size, + ompt_parallel_invoker_program | ompt_parallel_team, codeptr); + } + } +#endif // OMPT_SUPPORT + + if (this_thr->th.th_team != serial_team) { + // Nested level will be an index in the nested nthreads array + int level = this_thr->th.th_team->t.t_level; + + if (serial_team->t.t_serialized) { + /* this serial team was already used + TODO increase performance by making this locks more specific */ + kmp_team_t *new_team; + + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + + new_team = + __kmp_allocate_team(this_thr->th.th_root, 1, 1, +#if OMPT_SUPPORT + ompt_parallel_data, +#endif + proc_bind, &this_thr->th.th_current_task->td_icvs, + 0 USE_NESTED_HOT_ARG(NULL)); + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + KMP_ASSERT(new_team); + + /* setup new serialized team and install it */ + new_team->t.t_threads[0] = this_thr; + new_team->t.t_parent = this_thr->th.th_team; + serial_team = new_team; + this_thr->th.th_serial_team = serial_team; + + KF_TRACE( + 10, + ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", + global_tid, serial_team)); + + /* TODO the above breaks the requirement that if we run out of resources, + then we can still guarantee that serialized teams are ok, since we may 
+ need to allocate a new one */ + } else { + KF_TRACE( + 10, + ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", + global_tid, serial_team)); + } + + /* we have to initialize this serial team */ + KMP_DEBUG_ASSERT(serial_team->t.t_threads); + KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); + KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); + serial_team->t.t_ident = loc; + serial_team->t.t_serialized = 1; + serial_team->t.t_nproc = 1; + serial_team->t.t_parent = this_thr->th.th_team; + serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; + this_thr->th.th_team = serial_team; + serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; + + KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, + this_thr->th.th_current_task)); + KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); + this_thr->th.th_current_task->td_flags.executing = 0; + + __kmp_push_current_task_to_thread(this_thr, serial_team, 0); + + /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an + implicit task for each serialized task represented by + team->t.t_serialized? */ + copy_icvs(&this_thr->th.th_current_task->td_icvs, + &this_thr->th.th_current_task->td_parent->td_icvs); + + // Thread value exists in the nested nthreads array for the next nested + // level + if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { + this_thr->th.th_current_task->td_icvs.nproc = + __kmp_nested_nth.nth[level + 1]; + } + + if (__kmp_nested_proc_bind.used && + (level + 1 < __kmp_nested_proc_bind.used)) { + this_thr->th.th_current_task->td_icvs.proc_bind = + __kmp_nested_proc_bind.bind_types[level + 1]; + } + +#if USE_DEBUGGER + serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
+#endif + this_thr->th.th_info.ds.ds_tid = 0; + + /* set thread cache values */ + this_thr->th.th_team_nproc = 1; + this_thr->th.th_team_master = this_thr; + this_thr->th.th_team_serialized = 1; + + serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; + serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; + serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save + + propagateFPControl(serial_team); + + /* check if we need to allocate dispatch buffers stack */ + KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); + if (!serial_team->t.t_dispatch->th_disp_buffer) { + serial_team->t.t_dispatch->th_disp_buffer = + (dispatch_private_info_t *)__kmp_allocate( + sizeof(dispatch_private_info_t)); + } + this_thr->th.th_dispatch = serial_team->t.t_dispatch; + + KMP_MB(); + + } else { + /* this serialized team is already being used, + * that's fine, just add another nested level */ + KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); + KMP_DEBUG_ASSERT(serial_team->t.t_threads); + KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); + ++serial_team->t.t_serialized; + this_thr->th.th_team_serialized = serial_team->t.t_serialized; + + // Nested level will be an index in the nested nthreads array + int level = this_thr->th.th_team->t.t_level; + // Thread value exists in the nested nthreads array for the next nested + // level + if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { + this_thr->th.th_current_task->td_icvs.nproc = + __kmp_nested_nth.nth[level + 1]; + } + serial_team->t.t_level++; + KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " + "of serial team %p to %d\n", + global_tid, serial_team, serial_team->t.t_level)); + + /* allocate/push dispatch buffers stack */ + KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); + { + dispatch_private_info_t *disp_buffer = + (dispatch_private_info_t *)__kmp_allocate( + sizeof(dispatch_private_info_t)); + disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; + serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; + } + this_thr->th.th_dispatch = serial_team->t.t_dispatch; + + KMP_MB(); + } + KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); + + // Perform the display affinity functionality for + // serialized parallel regions + if (__kmp_display_affinity) { + if (this_thr->th.th_prev_level != serial_team->t.t_level || + this_thr->th.th_prev_num_threads != 1) { + // NULL means use the affinity-format-var ICV + __kmp_aux_display_affinity(global_tid, NULL); + this_thr->th.th_prev_level = serial_team->t.t_level; + this_thr->th.th_prev_num_threads = 1; + } + } + + if (__kmp_env_consistency_check) + __kmp_push_parallel(global_tid, NULL); +#if OMPT_SUPPORT + serial_team->t.ompt_team_info.master_return_address = codeptr; + if (ompt_enabled.enabled && + this_thr->th.ompt_thread_info.state != ompt_state_overhead) { + OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + + ompt_lw_taskteam_t lw_taskteam; + __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, + &ompt_parallel_data, codeptr); + + __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); + // don't use lw_taskteam after linking. 
content was swaped + + /* OMPT implicit task begin */ + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), + OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), + ompt_task_implicit); // TODO: Can this be ompt_task_initial? + OMPT_CUR_TASK_INFO(this_thr)->thread_num = + __kmp_tid_from_gtid(global_tid); + } + + /* OMPT state */ + this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; + OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + } +#endif +} + +// Test if this fork is for a team closely nested in a teams construct +static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th, + microtask_t microtask, int level, + int teams_level, kmp_va_list ap) { + return (master_th->th.th_teams_microtask && ap && + microtask != (microtask_t)__kmp_teams_master && level == teams_level); +} + +// Test if this fork is for the teams construct, i.e. to form the outer league +// of teams +static inline bool __kmp_is_entering_teams(int active_level, int level, + int teams_level, kmp_va_list ap) { + return ((ap == NULL && active_level == 0) || + (ap && teams_level > 0 && teams_level == level)); +} + +// AC: This is start of parallel that is nested inside teams construct. +// The team is actual (hot), all workers are ready at the fork barrier. +// No lock needed to initialize the team a bit, then free workers. +static inline int +__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team, + kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root, + enum fork_context_e call_context, microtask_t microtask, + launch_t invoker, int master_set_numthreads, int level, +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data, void *return_address, +#endif + kmp_va_list ap) { + void **argv; + int i; + + parent_team->t.t_ident = loc; + __kmp_alloc_argv_entries(argc, parent_team, TRUE); + parent_team->t.t_argc = argc; + argv = (void **)parent_team->t.t_argv; + for (i = argc - 1; i >= 0; --i) { + *argv++ = va_arg(kmp_va_deref(ap), void *); + } + // Increment our nested depth levels, but not increase the serialization + if (parent_team == master_th->th.th_serial_team) { + // AC: we are in serialized parallel + __kmpc_serialized_parallel(loc, gtid); + KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); + + if (call_context == fork_context_gnu) { + // AC: need to decrement t_serialized for enquiry functions to work + // correctly, will restore at join time + parent_team->t.t_serialized--; + return TRUE; + } + +#if OMPD_SUPPORT + parent_team->t.t_pkfn = microtask; +#endif + +#if OMPT_SUPPORT + void *dummy; + void **exit_frame_p; + ompt_data_t *implicit_task_data; + ompt_lw_taskteam_t lw_taskteam; + + if (ompt_enabled.enabled) { + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, + &ompt_parallel_data, return_address); + exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); + + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); + // Don't use lw_taskteam after linking. Content was swapped. 
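The path being set up here services a parallel region closely nested inside a teams construct, the shape recognized by __kmp_is_fork_in_teams() above: the hot team already created for the league is reused and only trimmed, never reallocated. A minimal host-side example of that shape (illustrative only; the clause values are arbitrary):

#include <omp.h>
#include <stdio.h>

int main(void) {
  // The inner parallel is closely nested in the teams construct, so the
  // runtime dispatches it to the fork-in-teams path rather than a full fork.
  #pragma omp teams num_teams(2) thread_limit(4)
  #pragma omp parallel
  printf("team %d: thread %d of %d\n", omp_get_team_num(),
         omp_get_thread_num(), omp_get_num_threads());
  return 0;
}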
+ + /* OMPT implicit task begin */ + implicit_task_data = OMPT_CUR_TASK_DATA(master_th); + if (ompt_enabled.ompt_callback_implicit_task) { + OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data, + 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + + /* OMPT state */ + master_th->th.ompt_thread_info.state = ompt_state_work_parallel; + } else { + exit_frame_p = &dummy; + } +#endif + + // AC: need to decrement t_serialized for enquiry functions to work + // correctly, will restore at join time + parent_team->t.t_serialized--; + + { + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); + __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv +#if OMPT_SUPPORT + , + exit_frame_p +#endif + ); + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + *exit_frame_p = NULL; + OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, implicit_task_data, 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), + OMPT_INVOKER(call_context) | ompt_parallel_team, return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + return TRUE; + } + + parent_team->t.t_pkfn = microtask; + parent_team->t.t_invoke = invoker; + KMP_ATOMIC_INC(&root->r.r_in_parallel); + parent_team->t.t_active_level++; + parent_team->t.t_level++; + parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save + + // If the threads allocated to the team are less than the thread limit, update + // the thread limit here. th_teams_size.nth is specific to this team nested + // in a teams construct, the team is fully created, and we're about to do + // the actual fork. Best to do this here so that the subsequent uses below + // and in the join have the correct value. + master_th->th.th_teams_size.nth = parent_team->t.t_nproc; + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_lw_taskteam_t lw_taskteam; + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, + return_address); + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); + } +#endif + + /* Change number of threads in the team if requested */ + if (master_set_numthreads) { // The parallel has num_threads clause + if (master_set_numthreads <= master_th->th.th_teams_size.nth) { + // AC: only can reduce number of threads dynamically, can't increase + kmp_info_t **other_threads = parent_team->t.t_threads; + // NOTE: if using distributed barrier, we need to run this code block + // even when the team size appears not to have changed from the max. 
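The size adjustment below follows a simple rule that can be restated as a stand-alone sketch (illustrative only; the runtime itself tracks the reservation in th_teams_size.nth and also resizes the distributed barrier when that pattern is in use):

// Illustrative restatement, not a library routine.
static int nested_team_size(int requested, int reserved) {
  // A num_threads clause on a parallel nested in teams may shrink the team,
  // but it can never grow it past the size reserved at the outer fork.
  return (requested > 0 && requested <= reserved) ? requested : reserved;
}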
+ int old_proc = master_th->th.th_teams_size.nth; + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads); + __kmp_add_threads_to_team(parent_team, master_set_numthreads); + } + parent_team->t.t_nproc = master_set_numthreads; + for (i = 0; i < master_set_numthreads; ++i) { + other_threads[i]->th.th_team_nproc = master_set_numthreads; + } + } + // Keep extra threads hot in the team for possible next parallels + master_th->th.th_set_nproc = 0; + } + +#if USE_DEBUGGER + if (__kmp_debugging) { // Let debugger override number of threads. + int nth = __kmp_omp_num_threads(loc); + if (nth > 0) { // 0 means debugger doesn't want to change num threads + master_set_numthreads = nth; + } + } +#endif + + // Figure out the proc_bind policy for the nested parallel within teams + kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; + // proc_bind_default means don't update + kmp_proc_bind_t proc_bind_icv = proc_bind_default; + if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { + proc_bind = proc_bind_false; + } else { + // No proc_bind clause specified; use current proc-bind-var + if (proc_bind == proc_bind_default) { + proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; + } + /* else: The proc_bind policy was specified explicitly on parallel clause. + This overrides proc-bind-var for this parallel region, but does not + change proc-bind-var. */ + // Figure the value of proc-bind-var for the child threads. + if ((level + 1 < __kmp_nested_proc_bind.used) && + (__kmp_nested_proc_bind.bind_types[level + 1] != + master_th->th.th_current_task->td_icvs.proc_bind)) { + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; + } + } + KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); + // Need to change the bind-var ICV to correct value for each implicit task + if (proc_bind_icv != proc_bind_default && + master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { + kmp_info_t **other_threads = parent_team->t.t_threads; + for (i = 0; i < master_th->th.th_team_nproc; ++i) { + other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv; + } + } + // Reset for next parallel region + master_th->th.th_set_proc_bind = proc_bind_default; + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || + KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode == 3 && + parent_team->t.t_active_level == 1 // only report frames at level 1 + && master_th->th.th_teams_size.nteams == 1) { + kmp_uint64 tmp_time = __itt_get_timestamp(); + master_th->th.th_frame_time = tmp_time; + parent_team->t.t_region_time = tmp_time; + } + if (__itt_stack_caller_create_ptr) { + KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); + // create new stack stitching id before entering fork barrier + parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ +#if KMP_AFFINITY_SUPPORTED + __kmp_partition_places(parent_team); +#endif + + KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, " + "master_th=%p, gtid=%d\n", + root, parent_team, master_th, gtid)); + __kmp_internal_fork(loc, gtid, parent_team); + KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, " + "master_th=%p, gtid=%d\n", + root, parent_team, master_th, gtid)); + + if (call_context == fork_context_gnu) + return TRUE; + + /* Invoke microtask for PRIMARY thread */ + KA_TRACE(20, 
("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid, + parent_team->t.t_id, parent_team->t.t_pkfn)); + + if (!parent_team->t.t_invoke(gtid)) { + KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); + } + KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid, + parent_team->t.t_id, parent_team->t.t_pkfn)); + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid)); + + return TRUE; +} + +// Create a serialized parallel region +static inline int +__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, + kmp_int32 argc, microtask_t microtask, launch_t invoker, + kmp_info_t *master_th, kmp_team_t *parent_team, +#if OMPT_SUPPORT + ompt_data_t *ompt_parallel_data, void **return_address, + ompt_data_t **parent_task_data, +#endif + kmp_va_list ap) { + kmp_team_t *team; + int i; + void **argv; + +/* josh todo: hypothetical question: what do we do for OS X*? */ +#if KMP_OS_LINUX && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) + SimpleVLA args(argc); +#else + void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); +#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ + KMP_ARCH_AARCH64) */ + + KA_TRACE( + 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid)); + + __kmpc_serialized_parallel(loc, gtid); + +#if OMPD_SUPPORT + master_th->th.th_serial_team->t.t_pkfn = microtask; +#endif + + if (call_context == fork_context_intel) { + /* TODO this sucks, use the compiler itself to pass args! :) */ + master_th->th.th_serial_team->t.t_ident = loc; + if (!ap) { + // revert change made in __kmpc_serialized_parallel() + master_th->th.th_serial_team->t.t_level--; +// Get args from parent team for teams construct + +#if OMPT_SUPPORT + void *dummy; + void **exit_frame_p; + ompt_task_info_t *task_info; + ompt_lw_taskteam_t lw_taskteam; + + if (ompt_enabled.enabled) { + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, + ompt_parallel_data, *return_address); + + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); + // don't use lw_taskteam after linking. 
content was swaped + task_info = OMPT_CUR_TASK_INFO(master_th); + exit_frame_p = &(task_info->frame.exit_frame.ptr); + if (ompt_enabled.ompt_callback_implicit_task) { + OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), + &(task_info->task_data), 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + + /* OMPT state */ + master_th->th.ompt_thread_info.state = ompt_state_work_parallel; + } else { + exit_frame_p = &dummy; + } +#endif + + { + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); + __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv +#if OMPT_SUPPORT + , + exit_frame_p +#endif + ); + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + *exit_frame_p = NULL; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + ompt_parallel_data, *parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + } else if (microtask == (microtask_t)__kmp_teams_master) { + KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team); + team = master_th->th.th_team; + // team->t.t_pkfn = microtask; + team->t.t_invoke = invoker; + __kmp_alloc_argv_entries(argc, team, TRUE); + team->t.t_argc = argc; + argv = (void **)team->t.t_argv; + if (ap) { + for (i = argc - 1; i >= 0; --i) + *argv++ = va_arg(kmp_va_deref(ap), void *); + } else { + for (i = 0; i < argc; ++i) + // Get args from parent team for teams construct + argv[i] = parent_team->t.t_argv[i]; + } + // AC: revert change made in __kmpc_serialized_parallel() + // because initial code in teams should have level=0 + team->t.t_level--; + // AC: call special invoker for outer "parallel" of teams construct + invoker(gtid); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), 0, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); + } + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + ompt_parallel_data, *parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_league, + *return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + } else { + argv = args; + for (i = argc - 1; i >= 0; --i) + *argv++ = va_arg(kmp_va_deref(ap), void *); + KMP_MB(); + +#if OMPT_SUPPORT + void *dummy; + void **exit_frame_p; + ompt_task_info_t *task_info; + ompt_lw_taskteam_t lw_taskteam; + ompt_data_t *implicit_task_data; + + if (ompt_enabled.enabled) { + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, + ompt_parallel_data, *return_address); + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); + // don't use lw_taskteam after linking. 
content was swaped + task_info = OMPT_CUR_TASK_INFO(master_th); + exit_frame_p = &(task_info->frame.exit_frame.ptr); + + /* OMPT implicit task begin */ + implicit_task_data = OMPT_CUR_TASK_DATA(master_th); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), + implicit_task_data, 1, __kmp_tid_from_gtid(gtid), + ompt_task_implicit); + OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); + } + + /* OMPT state */ + master_th->th.ompt_thread_info.state = ompt_state_work_parallel; + } else { + exit_frame_p = &dummy; + } +#endif + + { + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); + __kmp_invoke_microtask(microtask, gtid, 0, argc, args +#if OMPT_SUPPORT + , + exit_frame_p +#endif + ); + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + *exit_frame_p = NULL; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + + *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + ompt_parallel_data, *parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + } + } else if (call_context == fork_context_gnu) { +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_lw_taskteam_t lwt; + __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data, + *return_address); + + lwt.ompt_task_info.frame.exit_frame = ompt_data_none; + __ompt_lw_taskteam_link(&lwt, master_th, 1); + } +// don't use lw_taskteam after linking. content was swaped +#endif + + // we were called from GNU native code + KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); + return FALSE; + } else { + KMP_ASSERT2(call_context < fork_context_last, + "__kmp_serial_fork_call: unknown fork_context parameter"); + } + + KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); + KMP_MB(); + return FALSE; +} + +/* most of the work for a fork */ +/* return true if we really went parallel, false if serialized */ +int __kmp_fork_call(ident_t *loc, int gtid, + enum fork_context_e call_context, // Intel, GNU, ... + kmp_int32 argc, microtask_t microtask, launch_t invoker, + kmp_va_list ap) { + void **argv; + int i; + int master_tid; + int master_this_cons; + kmp_team_t *team; + kmp_team_t *parent_team; + kmp_info_t *master_th; + kmp_root_t *root; + int nthreads; + int master_active; + int master_set_numthreads; + int task_thread_limit = 0; + int level; + int active_level; + int teams_level; +#if KMP_NESTED_HOT_TEAMS + kmp_hot_team_ptr_t **p_hot_teams; +#endif + { // KMP_TIME_BLOCK + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); + KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); + + KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); + if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { + /* Some systems prefer the stack for the root thread(s) to start with */ + /* some gap from the parent stack to prevent false sharing. 
*/ + void *dummy = KMP_ALLOCA(__kmp_stkpadding); + /* These 2 lines below are so this does not get optimized out */ + if (__kmp_stkpadding > KMP_MAX_STKPADDING) + __kmp_stkpadding += (short)((kmp_int64)dummy); + } + + /* initialize if needed */ + KMP_DEBUG_ASSERT( + __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + __kmp_resume_if_soft_paused(); + + /* setup current data */ + // AC: potentially unsafe, not in sync with library shutdown, + // __kmp_threads can be freed + master_th = __kmp_threads[gtid]; + + parent_team = master_th->th.th_team; + master_tid = master_th->th.th_info.ds.ds_tid; + master_this_cons = master_th->th.th_local.this_construct; + root = master_th->th.th_root; + master_active = root->r.r_active; + master_set_numthreads = master_th->th.th_set_nproc; + task_thread_limit = + master_th->th.th_current_task->td_icvs.task_thread_limit; + +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data = ompt_data_none; + ompt_data_t *parent_task_data; + ompt_frame_t *ompt_frame; + void *return_address = NULL; + + if (ompt_enabled.enabled) { + __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, + NULL, NULL); + return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); + } +#endif + + // Assign affinity to root thread if it hasn't happened yet + __kmp_assign_root_init_mask(); + + // Nested level will be an index in the nested nthreads array + level = parent_team->t.t_level; + // used to launch non-serial teams even if nested is not allowed + active_level = parent_team->t.t_active_level; + // needed to check nesting inside the teams + teams_level = master_th->th.th_teams_level; +#if KMP_NESTED_HOT_TEAMS + p_hot_teams = &master_th->th.th_hot_teams; + if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { + *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( + sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); + (*p_hot_teams)[0].hot_team = root->r.r_hot_team; + // it is either actual or not needed (when active_level > 0) + (*p_hot_teams)[0].hot_team_nth = 1; + } +#endif + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + if (ompt_enabled.ompt_callback_parallel_begin) { + int team_size = master_set_numthreads + ? master_set_numthreads + : get__nproc_2(parent_team, master_tid); + int flags = OMPT_INVOKER(call_context) | + ((microtask == (microtask_t)__kmp_teams_master) + ? 
ompt_parallel_league + : ompt_parallel_team); + ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( + parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, + return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + + master_th->th.th_ident = loc; + + // Parallel closely nested in teams construct: + if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) { + return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root, + call_context, microtask, invoker, + master_set_numthreads, level, +#if OMPT_SUPPORT + ompt_parallel_data, return_address, +#endif + ap); + } // End parallel closely nested in teams construct + +#if KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec) { + KMP_DEBUG_ASSERT(master_th->th.th_task_team == + parent_team->t.t_task_team[master_th->th.th_task_state]); + } +#endif + + // Need this to happen before we determine the number of threads, not while + // we are allocating the team + //__kmp_push_current_task_to_thread(master_th, parent_team, 0); + + // Determine the number of threads + int enter_teams = + __kmp_is_entering_teams(active_level, level, teams_level, ap); + if ((!enter_teams && + (parent_team->t.t_active_level >= + master_th->th.th_current_task->td_icvs.max_active_levels)) || + (__kmp_library == library_serial)) { + KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid)); + nthreads = 1; + } else { + nthreads = master_set_numthreads + ? master_set_numthreads + // TODO: get nproc directly from current task + : get__nproc_2(parent_team, master_tid); + // Use the thread_limit set for the current target task if exists, else go + // with the deduced nthreads + nthreads = task_thread_limit > 0 && task_thread_limit < nthreads + ? task_thread_limit + : nthreads; + // Check if we need to take forkjoin lock? (no need for serialized + // parallel out of teams construct). + if (nthreads > 1) { + /* determine how many new threads we can use */ + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + /* AC: If we execute teams from parallel region (on host), then teams + should be created but each can only have 1 thread if nesting is + disabled. If teams called from serial region, then teams and their + threads should be created regardless of the nesting setting. 
*/ + nthreads = __kmp_reserve_threads(root, parent_team, master_tid, + nthreads, enter_teams); + if (nthreads == 1) { + // Free lock for single thread execution here; for multi-thread + // execution it will be freed later after team of threads created + // and initialized + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + } + } + } + KMP_DEBUG_ASSERT(nthreads > 0); + + // If we temporarily changed the set number of threads then restore it now + master_th->th.th_set_nproc = 0; + + if (nthreads == 1) { + return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask, + invoker, master_th, parent_team, +#if OMPT_SUPPORT + &ompt_parallel_data, &return_address, + &parent_task_data, +#endif + ap); + } // if (nthreads == 1) + + // GEH: only modify the executing flag in the case when not serialized + // serialized case is handled in kmpc_serialized_parallel + KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " + "curtask=%p, curtask_max_aclevel=%d\n", + parent_team->t.t_active_level, master_th, + master_th->th.th_current_task, + master_th->th.th_current_task->td_icvs.max_active_levels)); + // TODO: GEH - cannot do this assertion because root thread not set up as + // executing + // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); + master_th->th.th_current_task->td_flags.executing = 0; + + if (!master_th->th.th_teams_microtask || level > teams_level) { + /* Increment our nested depth level */ + KMP_ATOMIC_INC(&root->r.r_in_parallel); + } + + // See if we need to make a copy of the ICVs. + int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; + if ((level + 1 < __kmp_nested_nth.used) && + (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { + nthreads_icv = __kmp_nested_nth.nth[level + 1]; + } else { + nthreads_icv = 0; // don't update + } + + // Figure out the proc_bind_policy for the new team. + kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; + // proc_bind_default means don't update + kmp_proc_bind_t proc_bind_icv = proc_bind_default; + if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { + proc_bind = proc_bind_false; + } else { + // No proc_bind clause specified; use current proc-bind-var for this + // parallel region + if (proc_bind == proc_bind_default) { + proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; + } + // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND + if (master_th->th.th_teams_microtask && + microtask == (microtask_t)__kmp_teams_master) { + proc_bind = __kmp_teams_proc_bind; + } + /* else: The proc_bind policy was specified explicitly on parallel clause. + This overrides proc-bind-var for this parallel region, but does not + change proc-bind-var. */ + // Figure the value of proc-bind-var for the child threads. 
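The branch structure above can be condensed into a small sketch (illustrative only, reusing the enum names from this file; it omits the KMP_TEAMS_PROC_BIND override and the nested-list lookup handled just below):

// How the binding for the new region is chosen; not a library API.
static kmp_proc_bind_t resolve_region_bind(kmp_proc_bind_t clause,
                                           kmp_proc_bind_t bind_var) {
  if (bind_var == proc_bind_false)
    return proc_bind_false;   // binding disabled for this task
  if (clause == proc_bind_default)
    return bind_var;          // no clause: inherit proc-bind-var
  return clause;              // explicit clause overrides for this region only
}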
+ if ((level + 1 < __kmp_nested_proc_bind.used) && + (__kmp_nested_proc_bind.bind_types[level + 1] != + master_th->th.th_current_task->td_icvs.proc_bind)) { + // Do not modify the proc bind icv for the two teams construct forks + // They just let the proc bind icv pass through + if (!master_th->th.th_teams_microtask || + !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; + } + } + + // Reset for next parallel region + master_th->th.th_set_proc_bind = proc_bind_default; + + if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { + kmp_internal_control_t new_icvs; + copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); + new_icvs.next = NULL; + if (nthreads_icv > 0) { + new_icvs.nproc = nthreads_icv; + } + if (proc_bind_icv != proc_bind_default) { + new_icvs.proc_bind = proc_bind_icv; + } + + /* allocate a new parallel team */ + KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); + team = __kmp_allocate_team(root, nthreads, nthreads, +#if OMPT_SUPPORT + ompt_parallel_data, +#endif + proc_bind, &new_icvs, + argc USE_NESTED_HOT_ARG(master_th)); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) + copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); + } else { + /* allocate a new parallel team */ + KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); + team = __kmp_allocate_team(root, nthreads, nthreads, +#if OMPT_SUPPORT + ompt_parallel_data, +#endif + proc_bind, + &master_th->th.th_current_task->td_icvs, + argc USE_NESTED_HOT_ARG(master_th)); + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) + copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, + &master_th->th.th_current_task->td_icvs); + } + KF_TRACE( + 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); + + /* setup the new team */ + KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); + KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); + KMP_CHECK_UPDATE(team->t.t_ident, loc); + KMP_CHECK_UPDATE(team->t.t_parent, parent_team); + KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); +#if OMPT_SUPPORT + KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, + return_address); +#endif + KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe + // TODO: parent_team->t.t_level == INT_MAX ??? + if (!master_th->th.th_teams_microtask || level > teams_level) { + int new_level = parent_team->t.t_level + 1; + KMP_CHECK_UPDATE(team->t.t_level, new_level); + new_level = parent_team->t.t_active_level + 1; + KMP_CHECK_UPDATE(team->t.t_active_level, new_level); + } else { + // AC: Do not increase parallel level at start of the teams construct + int new_level = parent_team->t.t_level; + KMP_CHECK_UPDATE(team->t.t_level, new_level); + new_level = parent_team->t.t_active_level; + KMP_CHECK_UPDATE(team->t.t_active_level, new_level); + } + kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); + // set primary thread's schedule as new run-time schedule + KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); + + KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); + KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); + + // Update the floating point rounding in the team if required. 
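Most of the team-field stores above go through KMP_CHECK_UPDATE rather than plain assignments. A condensed sketch of the intent (an assumption about the macro, stated here for readability; the _SYNC variant is used where the store must also be visible to other threads in a defined order):

/* Illustrative expansion only, not copied from this patch. */
#define CHECK_UPDATE_SKETCH(dst, val)                                   \
  do {                                                                  \
    /* skip the store when the reused hot-team field already matches */ \
    if ((dst) != (val))                                                 \
      (dst) = (val);                                                    \
  } while (0)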
+ propagateFPControl(team); +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_parallel_begin(); +#endif + + if (__kmp_tasking_mode != tskm_immediate_exec) { + // Set primary thread's task team to team's task team. Unless this is hot + // team, it should be NULL. + KMP_DEBUG_ASSERT(master_th->th.th_task_team == + parent_team->t.t_task_team[master_th->th.th_task_state]); + KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " + "%p, new task_team %p / team %p\n", + __kmp_gtid_from_thread(master_th), + master_th->th.th_task_team, parent_team, + team->t.t_task_team[master_th->th.th_task_state], team)); + + if (active_level || master_th->th.th_task_team) { + // Take a memo of primary thread's task_state + KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); + if (master_th->th.th_task_state_top >= + master_th->th.th_task_state_stack_sz) { // increase size + kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; + kmp_uint8 *old_stack, *new_stack; + kmp_uint32 i; + new_stack = (kmp_uint8 *)__kmp_allocate(new_size); + for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { + new_stack[i] = master_th->th.th_task_state_memo_stack[i]; + } + for (i = master_th->th.th_task_state_stack_sz; i < new_size; + ++i) { // zero-init rest of stack + new_stack[i] = 0; + } + old_stack = master_th->th.th_task_state_memo_stack; + master_th->th.th_task_state_memo_stack = new_stack; + master_th->th.th_task_state_stack_sz = new_size; + __kmp_free(old_stack); + } + // Store primary thread's task_state on stack + master_th->th + .th_task_state_memo_stack[master_th->th.th_task_state_top] = + master_th->th.th_task_state; + master_th->th.th_task_state_top++; +#if KMP_NESTED_HOT_TEAMS + if (master_th->th.th_hot_teams && + active_level < __kmp_hot_teams_max_level && + team == master_th->th.th_hot_teams[active_level].hot_team) { + // Restore primary thread's nested state if nested hot team + master_th->th.th_task_state = + master_th->th + .th_task_state_memo_stack[master_th->th.th_task_state_top]; + } else { +#endif + master_th->th.th_task_state = 0; +#if KMP_NESTED_HOT_TEAMS + } +#endif + } +#if !KMP_NESTED_HOT_TEAMS + KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || + (team == root->r.r_hot_team)); +#endif + } + + KA_TRACE( + 20, + ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", + gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, + team->t.t_nproc)); + KMP_DEBUG_ASSERT(team != root->r.r_hot_team || + (team->t.t_master_tid == 0 && + (team->t.t_parent == root->r.r_root_team || + team->t.t_parent->t.t_serialized))); + KMP_MB(); + + /* now, setup the arguments */ + argv = (void **)team->t.t_argv; + if (ap) { + for (i = argc - 1; i >= 0; --i) { + void *new_argv = va_arg(kmp_va_deref(ap), void *); + KMP_CHECK_UPDATE(*argv, new_argv); + argv++; + } + } else { + for (i = 0; i < argc; ++i) { + // Get args from parent team for teams construct + KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); + } + } + + /* now actually fork the threads */ + KMP_CHECK_UPDATE(team->t.t_master_active, master_active); + if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong + root->r.r_active = TRUE; + + __kmp_fork_team_threads(root, team, master_th, gtid, !ap); + __kmp_setup_icv_copy(team, nthreads, + &master_th->th.th_current_task->td_icvs, loc); + +#if OMPT_SUPPORT + master_th->th.ompt_thread_info.state = ompt_state_work_parallel; +#endif + + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + +#if USE_ITT_BUILD + if 
(team->t.t_active_level == 1 // only report frames at level 1 + && !master_th->th.th_teams_microtask) { // not in teams construct +#if USE_ITT_NOTIFY + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && + (__kmp_forkjoin_frames_mode == 3 || + __kmp_forkjoin_frames_mode == 1)) { + kmp_uint64 tmp_time = 0; + if (__itt_get_timestamp_ptr) + tmp_time = __itt_get_timestamp(); + // Internal fork - report frame begin + master_th->th.th_frame_time = tmp_time; + if (__kmp_forkjoin_frames_mode == 3) + team->t.t_region_time = tmp_time; + } else +// only one notification scheme (either "submit" or "forking/joined", not both) +#endif /* USE_ITT_NOTIFY */ + if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && + __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { + // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. + __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); + } + } +#endif /* USE_ITT_BUILD */ + + /* now go on and do the work */ + KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); + KMP_MB(); + KF_TRACE(10, + ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", + root, team, master_th, gtid)); + +#if USE_ITT_BUILD + if (__itt_stack_caller_create_ptr) { + // create new stack stitching id before entering fork barrier + if (!enter_teams) { + KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); + team->t.t_stack_id = __kmp_itt_stack_caller_create(); + } else if (parent_team->t.t_serialized) { + // keep stack stitching id in the serialized parent_team; + // current team will be used for parallel inside the teams; + // if parent_team is active, then it already keeps stack stitching id + // for the league of teams + KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); + parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); + } + } +#endif /* USE_ITT_BUILD */ + + // AC: skip __kmp_internal_fork at teams construct, let only primary + // threads execute + if (ap) { + __kmp_internal_fork(loc, gtid, team); + KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " + "master_th=%p, gtid=%d\n", + root, team, master_th, gtid)); + } + + if (call_context == fork_context_gnu) { + KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); + return TRUE; + } + + /* Invoke microtask for PRIMARY thread */ + KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, + team->t.t_id, team->t.t_pkfn)); + } // END of timer KMP_fork_call block + +#if KMP_STATS_ENABLED + // If beginning a teams construct, then change thread state + stats_state_e previous_state = KMP_GET_THREAD_STATE(); + if (!ap) { + KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); + } +#endif + + if (!team->t.t_invoke(gtid)) { + KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); + } + +#if KMP_STATS_ENABLED + // If was beginning of a teams construct, then reset thread state + if (!ap) { + KMP_SET_THREAD_STATE(previous_state); + } +#endif + + KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, + team->t.t_id, team->t.t_pkfn)); + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + + return TRUE; +} + +#if OMPT_SUPPORT +static inline void __kmp_join_restore_state(kmp_info_t *thread, + kmp_team_t *team) { + // restore state outside the region + thread->th.ompt_thread_info.state = + ((team->t.t_serialized) ? 
ompt_state_work_serial + : ompt_state_work_parallel); +} + +static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, + kmp_team_t *team, ompt_data_t *parallel_data, + int flags, void *codeptr) { + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + parallel_data, &(task_info->task_data), flags, codeptr); + } + + task_info->frame.enter_frame = ompt_data_none; + __kmp_join_restore_state(thread, team); +} +#endif + +void __kmp_join_call(ident_t *loc, int gtid +#if OMPT_SUPPORT + , + enum fork_context_e fork_context +#endif + , + int exit_teams) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); + kmp_team_t *team; + kmp_team_t *parent_team; + kmp_info_t *master_th; + kmp_root_t *root; + int master_active; + + KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); + + /* setup current data */ + master_th = __kmp_threads[gtid]; + root = master_th->th.th_root; + team = master_th->th.th_team; + parent_team = team->t.t_parent; + + master_th->th.th_ident = loc; + +#if OMPT_SUPPORT + void *team_microtask = (void *)team->t.t_pkfn; + // For GOMP interface with serialized parallel, need the + // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task + // and end-parallel events. + if (ompt_enabled.enabled && + !(team->t.t_serialized && fork_context == fork_context_gnu)) { + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + +#if KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { + KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " + "th_task_team = %p\n", + __kmp_gtid_from_thread(master_th), team, + team->t.t_task_team[master_th->th.th_task_state], + master_th->th.th_task_team)); + KMP_DEBUG_ASSERT(master_th->th.th_task_team == + team->t.t_task_team[master_th->th.th_task_state]); + } +#endif + + if (team->t.t_serialized) { + if (master_th->th.th_teams_microtask) { + // We are in teams construct + int level = team->t.t_level; + int tlevel = master_th->th.th_teams_level; + if (level == tlevel) { + // AC: we haven't incremented it earlier at start of teams construct, + // so do it here - at the end of teams construct + team->t.t_level++; + } else if (level == tlevel + 1) { + // AC: we are exiting parallel inside teams, need to increment + // serialization in order to restore it in the next call to + // __kmpc_end_serialized_parallel + team->t.t_serialized++; + } + } + __kmpc_end_serialized_parallel(loc, gtid); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + if (fork_context == fork_context_gnu) { + __ompt_lw_taskteam_unlink(master_th); + } + __kmp_join_restore_state(master_th, parent_team); + } +#endif + + return; + } + + master_active = team->t.t_master_active; + + if (!exit_teams) { + // AC: No barrier for internal teams at exit from teams construct. + // But there is barrier for external team (league). 
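For reference, the serialized early-return a few lines above (the team->t.t_serialized branch) is what a nested region hits once active parallelism is exhausted; the code below is only reached by teams that really forked. A small example that exercises the serialized join path (illustrative only):

#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(1);           // the inner region will be serialized
  #pragma omp parallel num_threads(2)
  {
    #pragma omp parallel num_threads(2)   // joins through the serialized branch
    printf("outer thread %d, inner team of %d\n",
           omp_get_ancestor_thread_num(1), omp_get_num_threads());
  }
  return 0;
}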
+ __kmp_internal_join(loc, gtid, team); +#if USE_ITT_BUILD + if (__itt_stack_caller_create_ptr) { + KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); + // destroy the stack stitching id after join barrier + __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); + team->t.t_stack_id = NULL; + } +#endif + } else { + master_th->th.th_task_state = + 0; // AC: no tasking in teams (out of any parallel) +#if USE_ITT_BUILD + if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { + KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); + // destroy the stack stitching id on exit from the teams construct + // if parent_team is active, then the id will be destroyed later on + // by master of the league of teams + __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); + parent_team->t.t_stack_id = NULL; + } +#endif + } + + KMP_MB(); + +#if OMPT_SUPPORT + ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); + void *codeptr = team->t.ompt_team_info.master_return_address; +#endif + +#if USE_ITT_BUILD + // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. + if (team->t.t_active_level == 1 && + (!master_th->th.th_teams_microtask || /* not in teams construct */ + master_th->th.th_teams_size.nteams == 1)) { + master_th->th.th_ident = loc; + // only one notification scheme (either "submit" or "forking/joined", not + // both) + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode == 3) + __kmp_itt_frame_submit(gtid, team->t.t_region_time, + master_th->th.th_frame_time, 0, loc, + master_th->th.th_team_nproc, 1); + else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && + !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) + __kmp_itt_region_joined(gtid); + } // active_level == 1 +#endif /* USE_ITT_BUILD */ + +#if KMP_AFFINITY_SUPPORTED + if (!exit_teams) { + // Restore master thread's partition. + master_th->th.th_first_place = team->t.t_first_place; + master_th->th.th_last_place = team->t.t_last_place; + } +#endif // KMP_AFFINITY_SUPPORTED + + if (master_th->th.th_teams_microtask && !exit_teams && + team->t.t_pkfn != (microtask_t)__kmp_teams_master && + team->t.t_level == master_th->th.th_teams_level + 1) { +// AC: We need to leave the team structure intact at the end of parallel +// inside the teams construct, so that at the next parallel same (hot) team +// works, only adjust nesting levels +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data = ompt_data_none; + if (ompt_enabled.enabled) { + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + if (ompt_enabled.ompt_callback_implicit_task) { + int ompt_team_size = team->t.t_nproc; + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + task_info->frame.exit_frame = ompt_data_none; + task_info->task_data = ompt_data_none; + ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + } +#endif + /* Decrement our nested depth level */ + team->t.t_level--; + team->t.t_active_level--; + KMP_ATOMIC_DEC(&root->r.r_in_parallel); + + // Restore number of threads in the team if needed. This code relies on + // the proper adjustment of th_teams_size.nth after the fork in + // __kmp_teams_master on each teams primary thread in the case that + // __kmp_reserve_threads reduced it. 
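The restore performed by the block below mirrors the shrink done in __kmp_fork_in_teams(): after the nested parallel joins, the team is grown back to the reserved league size so the next nested parallel starts from the full reservation. A bare-bones restatement (illustrative only; the real code below also re-initializes barrier data and task state for the re-added threads):

// Sketch of the resize-back step, without the barrier bookkeeping.
static void restore_reserved_size(int *team_nproc, int reserved_nth,
                                  int *thread_nproc, int old_nth) {
  if (*team_nproc < reserved_nth) {
    *team_nproc = reserved_nth;
    for (int i = 0; i < old_nth; ++i)
      thread_nproc[i] = reserved_nth;   // refresh each thread's cached size
  }
}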
+ if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { + int old_num = master_th->th.th_team_nproc; + int new_num = master_th->th.th_teams_size.nth; + kmp_info_t **other_threads = team->t.t_threads; + team->t.t_nproc = new_num; + for (int i = 0; i < old_num; ++i) { + other_threads[i]->th.th_team_nproc = new_num; + } + // Adjust states of non-used threads of the team + for (int i = old_num; i < new_num; ++i) { + // Re-initialize thread's barrier data. + KMP_DEBUG_ASSERT(other_threads[i]); + kmp_balign_t *balign = other_threads[i]->th.th_bar; + for (int b = 0; b < bs_last_barrier; ++b) { + balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); +#if USE_DEBUGGER + balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; +#endif + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + // Synchronize thread's task state + other_threads[i]->th.th_task_state = master_th->th.th_task_state; + } + } + } + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, + OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); + } +#endif + + return; + } + + /* do cleanup and restore the parent team */ + master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; + master_th->th.th_local.this_construct = team->t.t_master_this_cons; + + master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; + + /* jc: The following lock has instructions with REL and ACQ semantics, + separating the parallel user code called in this parallel region + from the serial user code called after this function returns. */ + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + + if (!master_th->th.th_teams_microtask || + team->t.t_level > master_th->th.th_teams_level) { + /* Decrement our nested depth level */ + KMP_ATOMIC_DEC(&root->r.r_in_parallel); + } + KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + if (ompt_enabled.ompt_callback_implicit_task) { + int flags = (team_microtask == (void *)__kmp_teams_master) + ? ompt_task_initial + : ompt_task_implicit; + int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, + OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); + } + task_info->frame.exit_frame = ompt_data_none; + task_info->task_data = ompt_data_none; + } +#endif + + KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, + master_th, team)); + __kmp_pop_current_task_from_thread(master_th); + + master_th->th.th_def_allocator = team->t.t_def_allocator; + +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_parallel_end(); +#endif + updateHWFPControl(team); + + if (root->r.r_active != master_active) + root->r.r_active = master_active; + + __kmp_free_team(root, team USE_NESTED_HOT_ARG( + master_th)); // this will free worker threads + + /* this race was fun to find. make sure the following is in the critical + region otherwise assertions may fail occasionally since the old team may be + reallocated and the hierarchy appears inconsistent. it is actually safe to + run and won't cause any bugs, but will cause those assertion failures. 
it's + only one deref&assign so might as well put this in the critical region */ + master_th->th.th_team = parent_team; + master_th->th.th_team_nproc = parent_team->t.t_nproc; + master_th->th.th_team_master = parent_team->t.t_threads[0]; + master_th->th.th_team_serialized = parent_team->t.t_serialized; + + /* restore serialized team, if need be */ + if (parent_team->t.t_serialized && + parent_team != master_th->th.th_serial_team && + parent_team != root->r.r_root_team) { + __kmp_free_team(root, + master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); + master_th->th.th_serial_team = parent_team; + } + + if (__kmp_tasking_mode != tskm_immediate_exec) { + if (master_th->th.th_task_state_top > + 0) { // Restore task state from memo stack + KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); + // Remember primary thread's state if we re-use this nested hot team + master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = + master_th->th.th_task_state; + --master_th->th.th_task_state_top; // pop + // Now restore state at this level + master_th->th.th_task_state = + master_th->th + .th_task_state_memo_stack[master_th->th.th_task_state_top]; + } else if (team != root->r.r_hot_team) { + // Reset the task state of primary thread if we are not hot team because + // in this case all the worker threads will be free, and their task state + // will be reset. If not reset the primary's, the task state will be + // inconsistent. + master_th->th.th_task_state = 0; + } + // Copy the task team from the parent team to the primary thread + master_th->th.th_task_team = + parent_team->t.t_task_team[master_th->th.th_task_state]; + KA_TRACE(20, + ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", + __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, + parent_team)); + } + + // TODO: GEH - cannot do this assertion because root thread not set up as + // executing + // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); + master_th->th.th_current_task->td_flags.executing = 1; + + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + +#if KMP_AFFINITY_SUPPORTED + if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) { + __kmp_reset_root_init_mask(gtid); + } +#endif +#if OMPT_SUPPORT + int flags = + OMPT_INVOKER(fork_context) | + ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league + : ompt_parallel_team); + if (ompt_enabled.enabled) { + __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, + codeptr); + } +#endif + + KMP_MB(); + KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); +} + +/* Check whether we should push an internal control record onto the + serial team stack. If so, do it. 
*/ +void __kmp_save_internal_controls(kmp_info_t *thread) { + + if (thread->th.th_team != thread->th.th_serial_team) { + return; + } + if (thread->th.th_team->t.t_serialized > 1) { + int push = 0; + + if (thread->th.th_team->t.t_control_stack_top == NULL) { + push = 1; + } else { + if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != + thread->th.th_team->t.t_serialized) { + push = 1; + } + } + if (push) { /* push a record on the serial team's stack */ + kmp_internal_control_t *control = + (kmp_internal_control_t *)__kmp_allocate( + sizeof(kmp_internal_control_t)); + + copy_icvs(control, &thread->th.th_current_task->td_icvs); + + control->serial_nesting_level = thread->th.th_team->t.t_serialized; + + control->next = thread->th.th_team->t.t_control_stack_top; + thread->th.th_team->t.t_control_stack_top = control; + } + } +} + +/* Changes set_nproc */ +void __kmp_set_num_threads(int new_nth, int gtid) { + kmp_info_t *thread; + kmp_root_t *root; + + KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); + KMP_DEBUG_ASSERT(__kmp_init_serial); + + if (new_nth < 1) + new_nth = 1; + else if (new_nth > __kmp_max_nth) + new_nth = __kmp_max_nth; + + KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); + thread = __kmp_threads[gtid]; + if (thread->th.th_current_task->td_icvs.nproc == new_nth) + return; // nothing to do + + __kmp_save_internal_controls(thread); + + set__nproc(thread, new_nth); + + // If this omp_set_num_threads() call will cause the hot team size to be + // reduced (in the absence of a num_threads clause), then reduce it now, + // rather than waiting for the next parallel region. + root = thread->th.th_root; + if (__kmp_init_parallel && (!root->r.r_active) && + (root->r.r_hot_team->t.t_nproc > new_nth) +#if KMP_NESTED_HOT_TEAMS + && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode +#endif + ) { + kmp_team_t *hot_team = root->r.r_hot_team; + int f; + + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); + } + // Release the extra threads we don't need any more. + for (f = new_nth; f < hot_team->t.t_nproc; f++) { + KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); + if (__kmp_tasking_mode != tskm_immediate_exec) { + // When decreasing team size, threads no longer in the team should unref + // task team. + hot_team->t.t_threads[f]->th.th_task_team = NULL; + } + __kmp_free_thread(hot_team->t.t_threads[f]); + hot_team->t.t_threads[f] = NULL; + } + hot_team->t.t_nproc = new_nth; +#if KMP_NESTED_HOT_TEAMS + if (thread->th.th_hot_teams) { + KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); + thread->th.th_hot_teams[0].hot_team_nth = new_nth; + } +#endif + + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + hot_team->t.b->update_num_threads(new_nth); + __kmp_add_threads_to_team(hot_team, new_nth); + } + + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + + // Update the t_nproc field in the threads that are still active. 
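Because the trimming above happens eagerly rather than at the next fork, the effect of omp_set_num_threads() is visible immediately. A minimal example, assuming the usual defaults (no num_threads clauses, and the call made outside any parallel region so the root is inactive):

#include <omp.h>
#include <stdio.h>

int main(void) {
  #pragma omp parallel
  { }                        // first region materializes the hot team
  omp_set_num_threads(2);    // hot team is trimmed here, not at the next fork
  #pragma omp parallel
  #pragma omp single
  printf("now running with %d threads\n", omp_get_num_threads());
  return 0;
}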
+ for (f = 0; f < new_nth; f++) { + KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); + hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; + } + // Special flag in case omp_set_num_threads() call + hot_team->t.t_size_changed = -1; + } +} + +/* Changes max_active_levels */ +void __kmp_set_max_active_levels(int gtid, int max_active_levels) { + kmp_info_t *thread; + + KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " + "%d = (%d)\n", + gtid, max_active_levels)); + KMP_DEBUG_ASSERT(__kmp_init_serial); + + // validate max_active_levels + if (max_active_levels < 0) { + KMP_WARNING(ActiveLevelsNegative, max_active_levels); + // We ignore this call if the user has specified a negative value. + // The current setting won't be changed. The last valid setting will be + // used. A warning will be issued (if warnings are allowed as controlled by + // the KMP_WARNINGS env var). + KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " + "max_active_levels for thread %d = (%d)\n", + gtid, max_active_levels)); + return; + } + if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { + // it's OK, the max_active_levels is within the valid range: [ 0; + // KMP_MAX_ACTIVE_LEVELS_LIMIT ] + // We allow a zero value. (implementation defined behavior) + } else { + KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, + KMP_MAX_ACTIVE_LEVELS_LIMIT); + max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; + // Current upper limit is MAX_INT. (implementation defined behavior) + // If the input exceeds the upper limit, we correct the input to be the + // upper limit. (implementation defined behavior) + // Actually, the flow should never get here until we use MAX_INT limit. + } + KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " + "max_active_levels for thread %d = (%d)\n", + gtid, max_active_levels)); + + thread = __kmp_threads[gtid]; + + __kmp_save_internal_controls(thread); + + set__max_active_levels(thread, max_active_levels); +} + +/* Gets max_active_levels */ +int __kmp_get_max_active_levels(int gtid) { + kmp_info_t *thread; + + KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); + KMP_DEBUG_ASSERT(__kmp_init_serial); + + thread = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(thread->th.th_current_task); + KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " + "curtask_maxaclevel=%d\n", + gtid, thread->th.th_current_task, + thread->th.th_current_task->td_icvs.max_active_levels)); + return thread->th.th_current_task->td_icvs.max_active_levels; +} + +// nteams-var per-device ICV +void __kmp_set_num_teams(int num_teams) { + if (num_teams > 0) + __kmp_nteams = num_teams; +} +int __kmp_get_max_teams(void) { return __kmp_nteams; } +// teams-thread-limit-var per-device ICV +void __kmp_set_teams_thread_limit(int limit) { + if (limit > 0) + __kmp_teams_thread_limit = limit; +} +int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } + +KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); +KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); + +/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ +void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { + kmp_info_t *thread; + kmp_sched_t orig_kind; + // kmp_team_t *team; + + KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", + gtid, (int)kind, chunk)); + KMP_DEBUG_ASSERT(__kmp_init_serial); + + // Check if the kind parameter is valid, correct if needed. 
+ // Valid parameters should fit in one of two intervals - standard or extended: + // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> + // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 + orig_kind = kind; + kind = __kmp_sched_without_mods(kind); + + if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || + (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { + // TODO: Hint needs attention in case we change the default schedule. + __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), + KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), + __kmp_msg_null); + kind = kmp_sched_default; + chunk = 0; // ignore chunk value in case of bad kind + } + + thread = __kmp_threads[gtid]; + + __kmp_save_internal_controls(thread); + + if (kind < kmp_sched_upper_std) { + if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { + // differ static chunked vs. unchunked: chunk should be invalid to + // indicate unchunked schedule (which is the default) + thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; + } else { + thread->th.th_current_task->td_icvs.sched.r_sched_type = + __kmp_sch_map[kind - kmp_sched_lower - 1]; + } + } else { + // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - + // kmp_sched_lower - 2 ]; + thread->th.th_current_task->td_icvs.sched.r_sched_type = + __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - + kmp_sched_lower - 2]; + } + __kmp_sched_apply_mods_intkind( + orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); + if (kind == kmp_sched_auto || chunk < 1) { + // ignore parameter chunk for schedule auto + thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; + } else { + thread->th.th_current_task->td_icvs.sched.chunk = chunk; + } +} + +/* Gets def_sched_var ICV values */ +void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { + kmp_info_t *thread; + enum sched_type th_type; + + KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); + KMP_DEBUG_ASSERT(__kmp_init_serial); + + thread = __kmp_threads[gtid]; + + th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; + switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { + case kmp_sch_static: + case kmp_sch_static_greedy: + case kmp_sch_static_balanced: + *kind = kmp_sched_static; + __kmp_sched_apply_mods_stdkind(kind, th_type); + *chunk = 0; // chunk was not set, try to show this fact via zero value + return; + case kmp_sch_static_chunked: + *kind = kmp_sched_static; + break; + case kmp_sch_dynamic_chunked: + *kind = kmp_sched_dynamic; + break; + case kmp_sch_guided_chunked: + case kmp_sch_guided_iterative_chunked: + case kmp_sch_guided_analytical_chunked: + *kind = kmp_sched_guided; + break; + case kmp_sch_auto: + *kind = kmp_sched_auto; + break; + case kmp_sch_trapezoidal: + *kind = kmp_sched_trapezoidal; + break; +#if KMP_STATIC_STEAL_ENABLED + case kmp_sch_static_steal: + *kind = kmp_sched_static_steal; + break; +#endif + default: + KMP_FATAL(UnknownSchedulingType, th_type); + } + + __kmp_sched_apply_mods_stdkind(kind, th_type); + *chunk = thread->th.th_current_task->td_icvs.sched.chunk; +} + +int __kmp_get_ancestor_thread_num(int gtid, int level) { + + int ii, dd; + kmp_team_t *team; + kmp_info_t *thr; + + KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); + KMP_DEBUG_ASSERT(__kmp_init_serial); + + // validate level + if (level == 0) + return 0; + if (level < 0) + return -1; + thr = __kmp_threads[gtid]; + team = thr->th.th_team; + ii = team->t.t_level; + if (level > ii) + return -1; + + if
(thr->th.th_teams_microtask) { + // AC: we are in teams region where multiple nested teams have same level + int tlevel = thr->th.th_teams_level; // the level of the teams construct + if (level <= + tlevel) { // otherwise usual algorithm works (will not touch the teams) + KMP_DEBUG_ASSERT(ii >= tlevel); + // AC: As we need to pass by the teams league, we need to artificially + // increase ii + if (ii == tlevel) { + ii += 2; // three teams have same level + } else { + ii++; // two teams have same level + } + } + } + + if (ii == level) + return __kmp_tid_from_gtid(gtid); + + dd = team->t.t_serialized; + level++; + while (ii > level) { + for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { + } + if ((team->t.t_serialized) && (!dd)) { + team = team->t.t_parent; + continue; + } + if (ii > level) { + team = team->t.t_parent; + dd = team->t.t_serialized; + ii--; + } + } + + return (dd > 1) ? (0) : (team->t.t_master_tid); +} + +int __kmp_get_team_size(int gtid, int level) { + + int ii, dd; + kmp_team_t *team; + kmp_info_t *thr; + + KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); + KMP_DEBUG_ASSERT(__kmp_init_serial); + + // validate level + if (level == 0) + return 1; + if (level < 0) + return -1; + thr = __kmp_threads[gtid]; + team = thr->th.th_team; + ii = team->t.t_level; + if (level > ii) + return -1; + + if (thr->th.th_teams_microtask) { + // AC: we are in teams region where multiple nested teams have same level + int tlevel = thr->th.th_teams_level; // the level of the teams construct + if (level <= + tlevel) { // otherwise usual algorithm works (will not touch the teams) + KMP_DEBUG_ASSERT(ii >= tlevel); + // AC: As we need to pass by the teams league, we need to artificially + // increase ii + if (ii == tlevel) { + ii += 2; // three teams have same level + } else { + ii++; // two teams have same level + } + } + } + + while (ii > level) { + for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { + } + if (team->t.t_serialized && (!dd)) { + team = team->t.t_parent; + continue; + } + if (ii > level) { + team = team->t.t_parent; + ii--; + } + } + + return team->t.t_nproc; +} + +kmp_r_sched_t __kmp_get_schedule_global() { + // This routine created because pairs (__kmp_sched, __kmp_chunk) and + // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults + // independently. So one can get the updated schedule here. + + kmp_r_sched_t r_sched; + + // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, + // __kmp_guided. 
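__kmp_get_ancestor_thread_num and __kmp_get_team_size above walk the t_parent chain, counting serialized levels (and stepping over the synthetic teams level), to answer the level-based queries of the user API. A small sketch of the corresponding user calls, with nesting enabled so the inner level is real:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      omp_set_max_active_levels(2); // allow one level of nesting
      #pragma omp parallel num_threads(2)
      {
        #pragma omp parallel num_threads(3)
        {
          #pragma omp critical
          printf("level %d: ancestor(1)=%d size(1)=%d ancestor(2)=%d size(2)=%d\n",
                 omp_get_level(),
                 omp_get_ancestor_thread_num(1), omp_get_team_size(1),
                 omp_get_ancestor_thread_num(2), omp_get_team_size(2));
        }
      }
      return 0;
    }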
__kmp_sched should keep original value, so that user can set + // KMP_SCHEDULE multiple times, and thus have different run-time schedules in + // different roots (even in OMP 2.5) + enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); + enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); + if (s == kmp_sch_static) { + // replace STATIC with more detailed schedule (balanced or greedy) + r_sched.r_sched_type = __kmp_static; + } else if (s == kmp_sch_guided_chunked) { + // replace GUIDED with more detailed schedule (iterative or analytical) + r_sched.r_sched_type = __kmp_guided; + } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other + r_sched.r_sched_type = __kmp_sched; + } + SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); + + if (__kmp_chunk < KMP_DEFAULT_CHUNK) { + // __kmp_chunk may be wrong here (if it was not ever set) + r_sched.chunk = KMP_DEFAULT_CHUNK; + } else { + r_sched.chunk = __kmp_chunk; + } + + return r_sched; +} + +/* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) + at least argc number of *t_argv entries for the requested team. */ +static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { + + KMP_DEBUG_ASSERT(team); + if (!realloc || argc > team->t.t_max_argc) { + + KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " + "current entries=%d\n", + team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); + /* if previously allocated heap space for args, free them */ + if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) + __kmp_free((void *)team->t.t_argv); + + if (argc <= KMP_INLINE_ARGV_ENTRIES) { + /* use unused space in the cache line for arguments */ + team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; + KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " + "argv entries\n", + team->t.t_id, team->t.t_max_argc)); + team->t.t_argv = &team->t.t_inline_argv[0]; + if (__kmp_storage_map) { + __kmp_print_storage_map_gtid( + -1, &team->t.t_inline_argv[0], + &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], + (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", + team->t.t_id); + } + } else { + /* allocate space for arguments in the heap */ + team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) + ? KMP_MIN_MALLOC_ARGV_ENTRIES + : 2 * argc; + KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " + "argv entries\n", + team->t.t_id, team->t.t_max_argc)); + team->t.t_argv = + (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); + if (__kmp_storage_map) { + __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], + &team->t.t_argv[team->t.t_max_argc], + sizeof(void *) * team->t.t_max_argc, + "team_%d.t_argv", team->t.t_id); + } + } + } +} + +static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { + int i; + int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; + team->t.t_threads = + (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); + team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( + sizeof(dispatch_shared_info_t) * num_disp_buff); + team->t.t_dispatch = + (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); + team->t.t_implicit_task_taskdata = + (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); + team->t.t_max_nproc = max_nth; + + /* setup dispatch buffers */ + for (i = 0; i < num_disp_buff; ++i) { + team->t.t_disp_buffer[i].buffer_index = i; + team->t.t_disp_buffer[i].doacross_buf_idx = i; + } +} + +static void __kmp_free_team_arrays(kmp_team_t *team) { + /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ + int i; + for (i = 0; i < team->t.t_max_nproc; ++i) { + if (team->t.t_dispatch[i].th_disp_buffer != NULL) { + __kmp_free(team->t.t_dispatch[i].th_disp_buffer); + team->t.t_dispatch[i].th_disp_buffer = NULL; + } + } +#if KMP_USE_HIER_SCHED + __kmp_dispatch_free_hierarchies(team); +#endif + __kmp_free(team->t.t_threads); + __kmp_free(team->t.t_disp_buffer); + __kmp_free(team->t.t_dispatch); + __kmp_free(team->t.t_implicit_task_taskdata); + team->t.t_threads = NULL; + team->t.t_disp_buffer = NULL; + team->t.t_dispatch = NULL; + team->t.t_implicit_task_taskdata = 0; +} + +static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { + kmp_info_t **oldThreads = team->t.t_threads; + + __kmp_free(team->t.t_disp_buffer); + __kmp_free(team->t.t_dispatch); + __kmp_free(team->t.t_implicit_task_taskdata); + __kmp_allocate_team_arrays(team, max_nth); + + KMP_MEMCPY(team->t.t_threads, oldThreads, + team->t.t_nproc * sizeof(kmp_info_t *)); + + __kmp_free(oldThreads); +} + +static kmp_internal_control_t __kmp_get_global_icvs(void) { + + kmp_r_sched_t r_sched = + __kmp_get_schedule_global(); // get current state of scheduling globals + + KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); + + kmp_internal_control_t g_icvs = { + 0, // int serial_nesting_level; //corresponds to value of th_team_serialized + (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic + // adjustment of threads (per thread) + (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for + // whether blocktime is explicitly set + __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime +#if KMP_USE_MONITOR + __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime +// intervals +#endif + __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for + // next parallel region (per thread) + // (use a max ub on value if __kmp_parallel_initialize not called yet) + __kmp_cg_max_nth, // int thread_limit; + __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit + // on task. 
This is used in the case of target thread_limit + __kmp_dflt_max_active_levels, // int max_active_levels; //internal control + // for max_active_levels + r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule + // {sched,chunk} pair + __kmp_nested_proc_bind.bind_types[0], + __kmp_default_device, + NULL // struct kmp_internal_control *next; + }; + + return g_icvs; +} + +static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { + + kmp_internal_control_t gx_icvs; + gx_icvs.serial_nesting_level = + 0; // probably =team->t.t_serial like in save_inter_controls + copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); + gx_icvs.next = NULL; + + return gx_icvs; +} + +static void __kmp_initialize_root(kmp_root_t *root) { + int f; + kmp_team_t *root_team; + kmp_team_t *hot_team; + int hot_team_max_nth; + kmp_r_sched_t r_sched = + __kmp_get_schedule_global(); // get current state of scheduling globals + kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); + KMP_DEBUG_ASSERT(root); + KMP_ASSERT(!root->r.r_begin); + + /* setup the root state structure */ + __kmp_init_lock(&root->r.r_begin_lock); + root->r.r_begin = FALSE; + root->r.r_active = FALSE; + root->r.r_in_parallel = 0; + root->r.r_blocktime = __kmp_dflt_blocktime; +#if KMP_AFFINITY_SUPPORTED + root->r.r_affinity_assigned = FALSE; +#endif + + /* setup the root team for this task */ + /* allocate the root team structure */ + KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); + + root_team = + __kmp_allocate_team(root, + 1, // new_nproc + 1, // max_nproc +#if OMPT_SUPPORT + ompt_data_none, // root parallel id +#endif + __kmp_nested_proc_bind.bind_types[0], &r_icvs, + 0 // argc + USE_NESTED_HOT_ARG(NULL) // primary thread is unknown + ); +#if USE_DEBUGGER + // Non-NULL value should be assigned to make the debugger display the root + // team. 
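The initializer above collects the per-thread ICVs (dynamic adjustment, blocktime, nproc, thread limit, max active levels, schedule, proc-bind, default device) from the globals filled in by the environment parser. Most of them are visible through the standard query API; a small sketch, assuming whatever defaults the environment provides (the root-team setup resumes right after this aside):

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      printf("dyn-var:            %d\n", omp_get_dynamic());
      printf("nthreads-var:       %d\n", omp_get_max_threads());
      printf("thread-limit-var:   %d\n", omp_get_thread_limit());
      printf("bind-var:           %d\n", (int)omp_get_proc_bind());
      printf("default-device-var: %d\n", omp_get_default_device());
      return 0;
    }

Running it with OMP_DYNAMIC, OMP_THREAD_LIMIT, OMP_PROC_BIND, or OMP_DEFAULT_DEVICE set shows the corresponding fields change.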
+ TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); +#endif + + KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); + + root->r.r_root_team = root_team; + root_team->t.t_control_stack_top = NULL; + + /* initialize root team */ + root_team->t.t_threads[0] = NULL; + root_team->t.t_nproc = 1; + root_team->t.t_serialized = 1; + // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; + root_team->t.t_sched.sched = r_sched.sched; + KA_TRACE( + 20, + ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", + root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); + + /* setup the hot team for this task */ + /* allocate the hot team structure */ + KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); + + hot_team = + __kmp_allocate_team(root, + 1, // new_nproc + __kmp_dflt_team_nth_ub * 2, // max_nproc +#if OMPT_SUPPORT + ompt_data_none, // root parallel id +#endif + __kmp_nested_proc_bind.bind_types[0], &r_icvs, + 0 // argc + USE_NESTED_HOT_ARG(NULL) // primary thread is unknown + ); + KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); + + root->r.r_hot_team = hot_team; + root_team->t.t_control_stack_top = NULL; + + /* first-time initialization */ + hot_team->t.t_parent = root_team; + + /* initialize hot team */ + hot_team_max_nth = hot_team->t.t_max_nproc; + for (f = 0; f < hot_team_max_nth; ++f) { + hot_team->t.t_threads[f] = NULL; + } + hot_team->t.t_nproc = 1; + // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; + hot_team->t.t_sched.sched = r_sched.sched; + hot_team->t.t_size_changed = 0; +} + +#ifdef KMP_DEBUG + +typedef struct kmp_team_list_item { + kmp_team_p const *entry; + struct kmp_team_list_item *next; +} kmp_team_list_item_t; +typedef kmp_team_list_item_t *kmp_team_list_t; + +static void __kmp_print_structure_team_accum( // Add team to list of teams. + kmp_team_list_t list, // List of teams. + kmp_team_p const *team // Team to add. +) { + + // List must terminate with item where both entry and next are NULL. + // Team is added to the list only once. + // List is sorted in ascending order by team id. + // Team id is *not* a key. + + kmp_team_list_t l; + + KMP_DEBUG_ASSERT(list != NULL); + if (team == NULL) { + return; + } + + __kmp_print_structure_team_accum(list, team->t.t_parent); + __kmp_print_structure_team_accum(list, team->t.t_next_pool); + + // Search list for the team. + l = list; + while (l->next != NULL && l->entry != team) { + l = l->next; + } + if (l->next != NULL) { + return; // Team has been added before, exit. + } + + // Team is not found. Search list again for insertion point. + l = list; + while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { + l = l->next; + } + + // Insert team. 
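A note on the two teams just built: the root team stays serialized and only ever contains the root thread, while the hot team is the reusable team that parallel regions actually run on, kept alive between regions so back-to-back forks get the same workers; that reuse is what makes threadprivate persistence implementable (the debug-dump helpers continue below). A small sketch relying only on the persistence conditions in the spec, namely an unchanged thread count and dynamic adjustment disabled:

    #include <omp.h>
    #include <stdio.h>

    static int visits = 0;
    #pragma omp threadprivate(visits)

    int main(void) {
      omp_set_dynamic(0);          // one of the conditions for threadprivate persistence
      for (int round = 0; round < 2; round++) {
        #pragma omp parallel num_threads(4)
        {
          visits++;                // lives on the worker, which the hot team keeps alive
          #pragma omp critical
          printf("round %d: thread %d has served %d region(s)\n",
                 round, omp_get_thread_num(), visits);
        }
      }
      return 0;
    }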
+ { + kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( + sizeof(kmp_team_list_item_t)); + *item = *l; + l->entry = team; + l->next = item; + } +} + +static void __kmp_print_structure_team(char const *title, kmp_team_p const *team + +) { + __kmp_printf("%s", title); + if (team != NULL) { + __kmp_printf("%2x %p\n", team->t.t_id, team); + } else { + __kmp_printf(" - (nil)\n"); + } +} + +static void __kmp_print_structure_thread(char const *title, + kmp_info_p const *thread) { + __kmp_printf("%s", title); + if (thread != NULL) { + __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); + } else { + __kmp_printf(" - (nil)\n"); + } +} + +void __kmp_print_structure(void) { + + kmp_team_list_t list; + + // Initialize list of teams. + list = + (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); + list->entry = NULL; + list->next = NULL; + + __kmp_printf("\n------------------------------\nGlobal Thread " + "Table\n------------------------------\n"); + { + int gtid; + for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { + __kmp_printf("%2d", gtid); + if (__kmp_threads != NULL) { + __kmp_printf(" %p", __kmp_threads[gtid]); + } + if (__kmp_root != NULL) { + __kmp_printf(" %p", __kmp_root[gtid]); + } + __kmp_printf("\n"); + } + } + + // Print out __kmp_threads array. + __kmp_printf("\n------------------------------\nThreads\n--------------------" + "----------\n"); + if (__kmp_threads != NULL) { + int gtid; + for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { + kmp_info_t const *thread = __kmp_threads[gtid]; + if (thread != NULL) { + __kmp_printf("GTID %2d %p:\n", gtid, thread); + __kmp_printf(" Our Root: %p\n", thread->th.th_root); + __kmp_print_structure_team(" Our Team: ", thread->th.th_team); + __kmp_print_structure_team(" Serial Team: ", + thread->th.th_serial_team); + __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); + __kmp_print_structure_thread(" Primary: ", + thread->th.th_team_master); + __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); + __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); + __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); + __kmp_print_structure_thread(" Next in pool: ", + thread->th.th_next_pool); + __kmp_printf("\n"); + __kmp_print_structure_team_accum(list, thread->th.th_team); + __kmp_print_structure_team_accum(list, thread->th.th_serial_team); + } + } + } else { + __kmp_printf("Threads array is not allocated.\n"); + } + + // Print out __kmp_root array. 
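The debug-only accumulator above uses a compact list idiom: the team list ends in a sentinel node whose entry is NULL, and inserting before node l is done by copying *l into a fresh node and then overwriting l in place, so no back-pointer is needed. A stripped-down sketch of the same idiom with hypothetical names (the duplicate check of the original is omitted):

    #include <stdlib.h>

    struct item {
      int key;            // stands in for the team id
      struct item *next;  // last node is a sentinel with next == NULL
    };

    static struct item *make_list(void) {      // allocate the sentinel
      struct item *s = malloc(sizeof *s);
      s->key = 0;
      s->next = NULL;
      return s;
    }

    static void sorted_insert(struct item *list, int key) {
      struct item *l = list;
      while (l->next != NULL && l->key <= key) // find the insertion point
        l = l->next;
      struct item *n = malloc(sizeof *n);
      *n = *l;                                 // push the current node down one slot
      l->key = key;                            // and take its place
      l->next = n;
    }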
+ __kmp_printf("\n------------------------------\nUbers\n----------------------" + "--------\n"); + if (__kmp_root != NULL) { + int gtid; + for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { + kmp_root_t const *root = __kmp_root[gtid]; + if (root != NULL) { + __kmp_printf("GTID %2d %p:\n", gtid, root); + __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); + __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); + __kmp_print_structure_thread(" Uber Thread: ", + root->r.r_uber_thread); + __kmp_printf(" Active?: %2d\n", root->r.r_active); + __kmp_printf(" In Parallel: %2d\n", + KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); + __kmp_printf("\n"); + __kmp_print_structure_team_accum(list, root->r.r_root_team); + __kmp_print_structure_team_accum(list, root->r.r_hot_team); + } + } + } else { + __kmp_printf("Ubers array is not allocated.\n"); + } + + __kmp_printf("\n------------------------------\nTeams\n----------------------" + "--------\n"); + while (list->next != NULL) { + kmp_team_p const *team = list->entry; + int i; + __kmp_printf("Team %2x %p:\n", team->t.t_id, team); + __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); + __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); + __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); + __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); + __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); + for (i = 0; i < team->t.t_nproc; ++i) { + __kmp_printf(" Thread %2d: ", i); + __kmp_print_structure_thread("", team->t.t_threads[i]); + } + __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); + __kmp_printf("\n"); + list = list->next; + } + + // Print out __kmp_thread_pool and __kmp_team_pool. + __kmp_printf("\n------------------------------\nPools\n----------------------" + "--------\n"); + __kmp_print_structure_thread("Thread pool: ", + CCAST(kmp_info_t *, __kmp_thread_pool)); + __kmp_print_structure_team("Team pool: ", + CCAST(kmp_team_t *, __kmp_team_pool)); + __kmp_printf("\n"); + + // Free team list. + while (list != NULL) { + kmp_team_list_item_t *item = list; + list = list->next; + KMP_INTERNAL_FREE(item); + } +} + +#endif + +//--------------------------------------------------------------------------- +// Stuff for per-thread fast random number generator +// Table of primes +static const unsigned __kmp_primes[] = { + 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, + 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, + 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, + 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, + 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, + 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, + 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, + 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, + 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, + 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, + 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; + +//--------------------------------------------------------------------------- +// __kmp_get_random: Get a random number using a linear congruential method. 
+unsigned short __kmp_get_random(kmp_info_t *thread) { + unsigned x = thread->th.th_x; + unsigned short r = (unsigned short)(x >> 16); + + thread->th.th_x = x * thread->th.th_a + 1; + + KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", + thread->th.th_info.ds.ds_tid, r)); + + return r; +} +//-------------------------------------------------------- +// __kmp_init_random: Initialize a random number generator +void __kmp_init_random(kmp_info_t *thread) { + unsigned seed = thread->th.th_info.ds.ds_tid; + + thread->th.th_a = + __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; + thread->th.th_x = (seed + 1) * thread->th.th_a + 1; + KA_TRACE(30, + ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); +} + +#if KMP_OS_WINDOWS +/* reclaim array entries for root threads that are already dead, returns number + * reclaimed */ +static int __kmp_reclaim_dead_roots(void) { + int i, r = 0; + + for (i = 0; i < __kmp_threads_capacity; ++i) { + if (KMP_UBER_GTID(i) && + !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && + !__kmp_root[i] + ->r.r_active) { // AC: reclaim only roots died in non-active state + r += __kmp_unregister_root_other_thread(i); + } + } + return r; +} +#endif + +/* This function attempts to create free entries in __kmp_threads and + __kmp_root, and returns the number of free entries generated. + + For Windows* OS static library, the first mechanism used is to reclaim array + entries for root threads that are already dead. + + On all platforms, expansion is attempted on the arrays __kmp_threads_ and + __kmp_root, with appropriate update to __kmp_threads_capacity. Array + capacity is increased by doubling with clipping to __kmp_tp_capacity, if + threadprivate cache array has been created. Synchronization with + __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. + + After any dead root reclamation, if the clipping value allows array expansion + to result in the generation of a total of nNeed free slots, the function does + that expansion. If not, nothing is done beyond the possible initial root + thread reclamation. + + If any argument is negative, the behavior is undefined. */ +static int __kmp_expand_threads(int nNeed) { + int added = 0; + int minimumRequiredCapacity; + int newCapacity; + kmp_info_t **newThreads; + kmp_root_t **newRoot; + + // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so + // resizing __kmp_threads does not need additional protection if foreign + // threads are present + +#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB + /* only for Windows static library */ + /* reclaim array entries for root threads that are already dead */ + added = __kmp_reclaim_dead_roots(); + + if (nNeed) { + nNeed -= added; + if (nNeed < 0) + nNeed = 0; + } +#endif + if (nNeed <= 0) + return added; + + // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If + // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the + // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become + // > __kmp_max_nth in one of two ways: + // + // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] + // may not be reused by another thread, so we may need to increase + // __kmp_threads_capacity to __kmp_max_nth + 1. + // + // 2) New foreign root(s) are encountered. We always register new foreign + // roots. 
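__kmp_init_random and __kmp_get_random above implement a per-thread linear congruential generator: the state advances as x = x * a + 1 modulo 2^32, the multiplier a is picked from the primes table by thread id, and only the top 16 bits are handed to callers. A stand-alone version of the same scheme, with a shortened primes table and hypothetical names:

    #include <stdio.h>

    static const unsigned primes[] = {0x9e3779b1u, 0xffe6cc59u, 0x2109f6ddu,
                                      0x43977ab5u, 0xba5703f5u, 0xb495a877u};

    struct fast_rng {
      unsigned x; // current state
      unsigned a; // per-thread multiplier
    };

    static void rng_init(struct fast_rng *r, unsigned tid) {
      r->a = primes[tid % (sizeof(primes) / sizeof(primes[0]))];
      r->x = (tid + 1) * r->a + 1;
    }

    static unsigned short rng_next(struct fast_rng *r) {
      unsigned short out = (unsigned short)(r->x >> 16); // hand out the high bits
      r->x = r->x * r->a + 1;                            // advance the LCG
      return out;
    }

    int main(void) {
      struct fast_rng r;
      rng_init(&r, 3);
      for (int i = 0; i < 4; i++)
        printf("%u\n", (unsigned)rng_next(&r));
      return 0;
    }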
This may cause a smaller # of threads to be allocated at + // subsequent parallel regions, but the worker threads hang around (and + // eventually go to sleep) and need slots in the __kmp_threads[] array. + // + // Anyway, that is the reason for moving the check to see if + // __kmp_max_nth was exceeded into __kmp_reserve_threads() + // instead of having it performed here. -BB + + KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); + + /* compute expansion headroom to check if we can expand */ + if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { + /* possible expansion too small -- give up */ + return added; + } + minimumRequiredCapacity = __kmp_threads_capacity + nNeed; + + newCapacity = __kmp_threads_capacity; + do { + newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1) + : __kmp_sys_max_nth; + } while (newCapacity < minimumRequiredCapacity); + newThreads = (kmp_info_t **)__kmp_allocate( + (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); + newRoot = + (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); + KMP_MEMCPY(newThreads, __kmp_threads, + __kmp_threads_capacity * sizeof(kmp_info_t *)); + KMP_MEMCPY(newRoot, __kmp_root, + __kmp_threads_capacity * sizeof(kmp_root_t *)); + // Put old __kmp_threads array on a list. Any ongoing references to the old + // list will be valid. This list is cleaned up at library shutdown. + kmp_old_threads_list_t *node = + (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t)); + node->threads = __kmp_threads; + node->next = __kmp_old_threads_list; + __kmp_old_threads_list = node; + + *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; + *(kmp_root_t * *volatile *)&__kmp_root = newRoot; + added += newCapacity - __kmp_threads_capacity; + *(volatile int *)&__kmp_threads_capacity = newCapacity; + + if (newCapacity > __kmp_tp_capacity) { + __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); + if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { + __kmp_threadprivate_resize_cache(newCapacity); + } else { // increase __kmp_tp_capacity to correspond with kmp_threads size + *(volatile int *)&__kmp_tp_capacity = newCapacity; + } + __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); + } + + return added; +} + +/* Register the current thread as a root thread and obtain our gtid. We must + have the __kmp_initz_lock held at this point. Argument TRUE only if are the + thread that calls from __kmp_do_serial_initialize() */ +int __kmp_register_root(int initial_thread) { + kmp_info_t *root_thread; + kmp_root_t *root; + int gtid; + int capacity; + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + KA_TRACE(20, ("__kmp_register_root: entered\n")); + KMP_MB(); + + /* 2007-03-02: + If initial thread did not invoke OpenMP RTL yet, and this thread is not an + initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not + work as expected -- it may return false (that means there is at least one + empty slot in __kmp_threads array), but it is possible the only free slot + is #0, which is reserved for initial thread and so cannot be used for this + one. Following code workarounds this bug. + + However, right solution seems to be not reserving slot #0 for initial + thread because: + (1) there is no magic in slot #0, + (2) we cannot detect initial thread reliably (the first thread which does + serial initialization may be not a real initial thread). 
+ */ + capacity = __kmp_threads_capacity; + if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { + --capacity; + } + + // If it is not for initializing the hidden helper team, we need to take + // __kmp_hidden_helper_threads_num out of the capacity because it is included + // in __kmp_threads_capacity. + if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { + capacity -= __kmp_hidden_helper_threads_num; + } + + /* see if there are too many threads */ + if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { + if (__kmp_tp_cached) { + __kmp_fatal(KMP_MSG(CantRegisterNewThread), + KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), + KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); + } else { + __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), + __kmp_msg_null); + } + } + + // When hidden helper task is enabled, __kmp_threads is organized as follows: + // 0: initial thread, also a regular OpenMP thread. + // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. + // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for + // regular OpenMP threads. + if (TCR_4(__kmp_init_hidden_helper_threads)) { + // Find an available thread slot for hidden helper thread. Slots for hidden + // helper threads start from 1 to __kmp_hidden_helper_threads_num. + for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && + gtid <= __kmp_hidden_helper_threads_num; + gtid++) + ; + KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); + KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " + "hidden helper thread: T#%d\n", + gtid)); + } else { + /* find an available thread slot */ + // Don't reassign the zero slot since we need that to only be used by + // initial thread. Slots for hidden helper threads should also be skipped. + if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { + gtid = 0; + } else { + for (gtid = __kmp_hidden_helper_threads_num + 1; + TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) + ; + } + KA_TRACE( + 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); + KMP_ASSERT(gtid < __kmp_threads_capacity); + } + + /* update global accounting */ + __kmp_all_nth++; + TCW_4(__kmp_nth, __kmp_nth + 1); + + // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low + // numbers of procs, and method #2 (keyed API call) for higher numbers. + if (__kmp_adjust_gtid_mode) { + if (__kmp_all_nth >= __kmp_tls_gtid_min) { + if (TCR_4(__kmp_gtid_mode) != 2) { + TCW_4(__kmp_gtid_mode, 2); + } + } else { + if (TCR_4(__kmp_gtid_mode) != 1) { + TCW_4(__kmp_gtid_mode, 1); + } + } + } + +#ifdef KMP_ADJUST_BLOCKTIME + /* Adjust blocktime to zero if necessary */ + /* Middle initialization might not have occurred yet */ + if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { + if (__kmp_nth > __kmp_avail_proc) { + __kmp_zero_bt = TRUE; + } + } +#endif /* KMP_ADJUST_BLOCKTIME */ + + /* setup this new hierarchy */ + if (!(root = __kmp_root[gtid])) { + root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); + KMP_DEBUG_ASSERT(!root->r.r_root_team); + } + +#if KMP_STATS_ENABLED + // Initialize stats as soon as possible (right after gtid assignment). 
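The sizing rule in __kmp_expand_threads above is: give up if even the system cap cannot supply the requested extra slots, otherwise keep doubling, clipped at the cap, until the requirement is met, then park the old array on a list so stale readers stay valid. A simplified sketch of just the sizing rule (hypothetical name, assumes capacity >= 1 as in the runtime):

    static int grow_capacity(int capacity, int need, int sys_max) {
      if (sys_max - capacity < need)
        return capacity;              // expansion impossible; caller gives up
      int required = capacity + need;
      int new_cap = capacity;         // assumed >= 1, as in the runtime
      do {
        new_cap = (new_cap <= sys_max / 2) ? new_cap * 2 : sys_max;
      } while (new_cap < required);
      return new_cap;
    }

Doubling keeps reallocation cost amortized constant per added slot, while the clip guarantees the arrays never outgrow __kmp_sys_max_nth.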
+ __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); + __kmp_stats_thread_ptr->startLife(); + KMP_SET_THREAD_STATE(SERIAL_REGION); + KMP_INIT_PARTITIONED_TIMERS(OMP_serial); +#endif + __kmp_initialize_root(root); + + /* setup new root thread structure */ + if (root->r.r_uber_thread) { + root_thread = root->r.r_uber_thread; + } else { + root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); + if (__kmp_storage_map) { + __kmp_print_thread_storage_map(root_thread, gtid); + } + root_thread->th.th_info.ds.ds_gtid = gtid; +#if OMPT_SUPPORT + root_thread->th.ompt_thread_info.thread_data = ompt_data_none; +#endif + root_thread->th.th_root = root; + if (__kmp_env_consistency_check) { + root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); + } +#if USE_FAST_MEMORY + __kmp_initialize_fast_memory(root_thread); +#endif /* USE_FAST_MEMORY */ + +#if KMP_USE_BGET + KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); + __kmp_initialize_bget(root_thread); +#endif + __kmp_init_random(root_thread); // Initialize random number generator + } + + /* setup the serial team held in reserve by the root thread */ + if (!root_thread->th.th_serial_team) { + kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); + KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); + root_thread->th.th_serial_team = __kmp_allocate_team( + root, 1, 1, +#if OMPT_SUPPORT + ompt_data_none, // root parallel id +#endif + proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); + } + KMP_ASSERT(root_thread->th.th_serial_team); + KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", + root_thread->th.th_serial_team)); + + /* drop root_thread into place */ + TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); + + root->r.r_root_team->t.t_threads[0] = root_thread; + root->r.r_hot_team->t.t_threads[0] = root_thread; + root_thread->th.th_serial_team->t.t_threads[0] = root_thread; + // AC: the team created in reserve, not for execution (it is unused for now). + root_thread->th.th_serial_team->t.t_serialized = 0; + root->r.r_uber_thread = root_thread; + + /* initialize the thread, get it ready to go */ + __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); + TCW_4(__kmp_init_gtid, TRUE); + + /* prepare the primary thread for get_gtid() */ + __kmp_gtid_set_specific(gtid); + +#if USE_ITT_BUILD + __kmp_itt_thread_name(gtid); +#endif /* USE_ITT_BUILD */ + +#ifdef KMP_TDATA_GTID + __kmp_gtid = gtid; +#endif + __kmp_create_worker(gtid, root_thread, __kmp_stksize); + KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); + + KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " + "plain=%u\n", + gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), + root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, + KMP_INIT_BARRIER_STATE)); + { // Initialize barrier data. 
+ int b; + for (b = 0; b < bs_last_barrier; ++b) { + root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; +#if USE_DEBUGGER + root_thread->th.th_bar[b].bb.b_worker_arrived = 0; +#endif + } + } + KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == + KMP_INIT_BARRIER_STATE); + +#if KMP_AFFINITY_SUPPORTED + root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; + root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; + root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; + root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; +#endif /* KMP_AFFINITY_SUPPORTED */ + root_thread->th.th_def_allocator = __kmp_def_allocator; + root_thread->th.th_prev_level = 0; + root_thread->th.th_prev_num_threads = 1; + + kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); + tmp->cg_root = root_thread; + tmp->cg_thread_limit = __kmp_cg_max_nth; + tmp->cg_nthreads = 1; + KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" + " cg_nthreads init to 1\n", + root_thread, tmp)); + tmp->up = NULL; + root_thread->th.th_cg_roots = tmp; + + __kmp_root_counter++; + +#if OMPT_SUPPORT + if (!initial_thread && ompt_enabled.enabled) { + + kmp_info_t *root_thread = ompt_get_thread(); + + ompt_set_thread_state(root_thread, ompt_state_overhead); + + if (ompt_enabled.ompt_callback_thread_begin) { + ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( + ompt_thread_initial, __ompt_get_thread_data_internal()); + } + ompt_data_t *task_data; + ompt_data_t *parallel_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, + NULL); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); + } + + ompt_set_thread_state(root_thread, ompt_state_work_serial); + } +#endif +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_thread_begin(); +#endif + + KMP_MB(); + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + + return gtid; +} + +#if KMP_NESTED_HOT_TEAMS +static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, + const int max_level) { + int i, n, nth; + kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; + if (!hot_teams || !hot_teams[level].hot_team) { + return 0; + } + KMP_DEBUG_ASSERT(level < max_level); + kmp_team_t *team = hot_teams[level].hot_team; + nth = hot_teams[level].hot_team_nth; + n = nth - 1; // primary thread is not freed + if (level < max_level - 1) { + for (i = 0; i < nth; ++i) { + kmp_info_t *th = team->t.t_threads[i]; + n += __kmp_free_hot_teams(root, th, level + 1, max_level); + if (i > 0 && th->th.th_hot_teams) { + __kmp_free(th->th.th_hot_teams); + th->th.th_hot_teams = NULL; + } + } + } + __kmp_free_team(root, team, NULL); + return n; +} +#endif + +// Resets a root thread and clears its root and hot teams. +// Returns the number of __kmp_threads entries directly and indirectly freed. +static int __kmp_reset_root(int gtid, kmp_root_t *root) { + kmp_team_t *root_team = root->r.r_root_team; + kmp_team_t *hot_team = root->r.r_hot_team; + int n = hot_team->t.t_nproc; + int i; + + KMP_DEBUG_ASSERT(!root->r.r_active); + + root->r.r_root_team = NULL; + root->r.r_hot_team = NULL; + // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team + // before call to __kmp_free_team().
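__kmp_register_root above is what turns an arbitrary native thread into an OpenMP root: it claims a gtid slot, builds the root and hot teams, and gives the thread its own contention group. In user terms, every pthread that touches OpenMP becomes its own initial thread; a small sketch (compile with -fopenmp -pthread):

    #include <omp.h>
    #include <pthread.h>
    #include <stdio.h>

    static void *worker(void *arg) {
      long id = (long)arg;
      // The first OpenMP construct on this pthread registers it as a new root.
      #pragma omp parallel num_threads(2)
      printf("pthread %ld: thread %d of %d in its own hot team\n",
             id, omp_get_thread_num(), omp_get_num_threads());
      return NULL;
    }

    int main(void) {
      pthread_t t[2];
      for (long i = 0; i < 2; i++)
        pthread_create(&t[i], NULL, worker, (void *)i);
      for (int i = 0; i < 2; i++)
        pthread_join(t[i], NULL);
      return 0;
    }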
+ __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); +#if KMP_NESTED_HOT_TEAMS + if (__kmp_hot_teams_max_level > + 0) { // need to free nested hot teams and their threads if any + for (i = 0; i < hot_team->t.t_nproc; ++i) { + kmp_info_t *th = hot_team->t.t_threads[i]; + if (__kmp_hot_teams_max_level > 1) { + n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); + } + if (th->th.th_hot_teams) { + __kmp_free(th->th.th_hot_teams); + th->th.th_hot_teams = NULL; + } + } + } +#endif + __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); + + // Before we can reap the thread, we need to make certain that all other + // threads in the teams that had this root as ancestor have stopped trying to + // steal tasks. + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_wait_to_unref_task_teams(); + } + +#if KMP_OS_WINDOWS + /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ + KA_TRACE( + 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC + "\n", + (LPVOID) & (root->r.r_uber_thread->th), + root->r.r_uber_thread->th.th_info.ds.ds_thread)); + __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); +#endif /* KMP_OS_WINDOWS */ + +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_thread_end(); +#endif + +#if OMPT_SUPPORT + ompt_data_t *task_data; + ompt_data_t *parallel_data; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, + NULL); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); + } + if (ompt_enabled.ompt_callback_thread_end) { + ompt_callbacks.ompt_callback(ompt_callback_thread_end)( + &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); + } +#endif + + TCW_4(__kmp_nth, + __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. + i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; + KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" + " to %d\n", + root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, + root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); + if (i == 1) { + // need to free contention group structure + KMP_DEBUG_ASSERT(root->r.r_uber_thread == + root->r.r_uber_thread->th.th_cg_roots->cg_root); + KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); + __kmp_free(root->r.r_uber_thread->th.th_cg_roots); + root->r.r_uber_thread->th.th_cg_roots = NULL; + } + __kmp_reap_thread(root->r.r_uber_thread, 1); + + // We cannot put root thread to __kmp_thread_pool, so we have to reap it + // instead of freeing. + root->r.r_uber_thread = NULL; + /* mark root as no longer in use */ + root->r.r_begin = FALSE; + + return n; +} + +void __kmp_unregister_root_current_thread(int gtid) { + KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); + /* this lock should be ok, since unregister_root_current_thread is never + called during an abort, only during a normal close.
furthermore, if you + have the forkjoin lock, you should never try to get the initz lock */ + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { + KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " + "exiting T#%d\n", + gtid)); + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + return; + } + kmp_root_t *root = __kmp_root[gtid]; + + KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); + KMP_ASSERT(KMP_UBER_GTID(gtid)); + KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); + KMP_ASSERT(root->r.r_active == FALSE); + + KMP_MB(); + + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_team_t *team = thread->th.th_team; + kmp_task_team_t *task_team = thread->th.th_task_team; + + // we need to wait for the proxy tasks before finishing the thread + if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)) { +#if OMPT_SUPPORT + // the runtime is shutting down so we won't report any events + thread->th.ompt_thread_info.state = ompt_state_undefined; +#endif + __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); + } + + __kmp_reset_root(gtid, root); + + KMP_MB(); + KC_TRACE(10, + ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); + + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); +} + +#if KMP_OS_WINDOWS +/* __kmp_forkjoin_lock must be already held + Unregisters a root thread that is not the current thread. Returns the number + of __kmp_threads entries freed as a result. */ +static int __kmp_unregister_root_other_thread(int gtid) { + kmp_root_t *root = __kmp_root[gtid]; + int r; + + KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); + KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); + KMP_ASSERT(KMP_UBER_GTID(gtid)); + KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); + KMP_ASSERT(root->r.r_active == FALSE); + + r = __kmp_reset_root(gtid, root); + KC_TRACE(10, + ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); + return r; +} +#endif + +#if KMP_DEBUG +void __kmp_task_info() { + + kmp_int32 gtid = __kmp_entry_gtid(); + kmp_int32 tid = __kmp_tid_from_gtid(gtid); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *steam = this_thr->th.th_serial_team; + kmp_team_t *team = this_thr->th.th_team; + + __kmp_printf( + "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " + "ptask=%p\n", + gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, + team->t.t_implicit_task_taskdata[tid].td_parent); +} +#endif // KMP_DEBUG + +/* TODO optimize with one big memclr, take out what isn't needed, split + responsibility to workers as much as possible, and delay initialization of + features as much as possible */ +static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, + int tid, int gtid) { + /* this_thr->th.th_info.ds.ds_gtid is setup in + kmp_allocate_thread/create_worker. 
+ this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ + KMP_DEBUG_ASSERT(this_thr != NULL); + KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); + KMP_DEBUG_ASSERT(team); + KMP_DEBUG_ASSERT(team->t.t_threads); + KMP_DEBUG_ASSERT(team->t.t_dispatch); + kmp_info_t *master = team->t.t_threads[0]; + KMP_DEBUG_ASSERT(master); + KMP_DEBUG_ASSERT(master->th.th_root); + + KMP_MB(); + + TCW_SYNC_PTR(this_thr->th.th_team, team); + + this_thr->th.th_info.ds.ds_tid = tid; + this_thr->th.th_set_nproc = 0; + if (__kmp_tasking_mode != tskm_immediate_exec) + // When tasking is possible, threads are not safe to reap until they are + // done tasking; this will be set when tasking code is exited in wait + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + else // no tasking --> always safe to reap + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + this_thr->th.th_set_proc_bind = proc_bind_default; +#if KMP_AFFINITY_SUPPORTED + this_thr->th.th_new_place = this_thr->th.th_current_place; +#endif + this_thr->th.th_root = master->th.th_root; + + /* setup the thread's cache of the team structure */ + this_thr->th.th_team_nproc = team->t.t_nproc; + this_thr->th.th_team_master = master; + this_thr->th.th_team_serialized = team->t.t_serialized; + + KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); + + KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", + tid, gtid, this_thr, this_thr->th.th_current_task)); + + __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, + team, tid, TRUE); + + KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", + tid, gtid, this_thr, this_thr->th.th_current_task)); + // TODO: Initialize ICVs from parent; GEH - isn't that already done in + // __kmp_initialize_team()? 
+ + /* TODO no worksharing in speculative threads */ + this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; + + this_thr->th.th_local.this_construct = 0; + + if (!this_thr->th.th_pri_common) { + this_thr->th.th_pri_common = + (struct common_table *)__kmp_allocate(sizeof(struct common_table)); + if (__kmp_storage_map) { + __kmp_print_storage_map_gtid( + gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, + sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); + } + this_thr->th.th_pri_head = NULL; + } + + if (this_thr != master && // Primary thread's CG root is initialized elsewhere + this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set + // Make new thread's CG root same as primary thread's + KMP_DEBUG_ASSERT(master->th.th_cg_roots); + kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; + if (tmp) { + // worker changes CG, need to check if old CG should be freed + int i = tmp->cg_nthreads--; + KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" + " on node %p of thread %p to %d\n", + this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); + if (i == 1) { + __kmp_free(tmp); // last thread left CG --> free it + } + } + this_thr->th.th_cg_roots = master->th.th_cg_roots; + // Increment new thread's CG root's counter to add the new thread + this_thr->th.th_cg_roots->cg_nthreads++; + KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" + " node %p of thread %p to %d\n", + this_thr, this_thr->th.th_cg_roots, + this_thr->th.th_cg_roots->cg_root, + this_thr->th.th_cg_roots->cg_nthreads)); + this_thr->th.th_current_task->td_icvs.thread_limit = + this_thr->th.th_cg_roots->cg_thread_limit; + } + + /* Initialize dynamic dispatch */ + { + volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; + // Use team max_nproc since this will never change for the team. + size_t disp_size = + sizeof(dispatch_private_info_t) * + (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); + KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, + team->t.t_max_nproc)); + KMP_ASSERT(dispatch); + KMP_DEBUG_ASSERT(team->t.t_dispatch); + KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); + + dispatch->th_disp_index = 0; + dispatch->th_doacross_buf_idx = 0; + if (!dispatch->th_disp_buffer) { + dispatch->th_disp_buffer = + (dispatch_private_info_t *)__kmp_allocate(disp_size); + + if (__kmp_storage_map) { + __kmp_print_storage_map_gtid( + gtid, &dispatch->th_disp_buffer[0], + &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 + ? 1 + : __kmp_dispatch_num_buffers], + disp_size, + "th_%d.th_dispatch.th_disp_buffer " + "(team_%d.t_dispatch[%d].th_disp_buffer)", + gtid, team->t.t_id, gtid); + } + } else { + memset(&dispatch->th_disp_buffer[0], '\0', disp_size); + } + + dispatch->th_dispatch_pr_current = 0; + dispatch->th_dispatch_sh_current = 0; + + dispatch->th_deo_fcn = 0; /* ORDERED */ + dispatch->th_dxo_fcn = 0; /* END ORDERED */ + } + + this_thr->th.th_next_pool = NULL; + + if (!this_thr->th.th_task_state_memo_stack) { + size_t i; + this_thr->th.th_task_state_memo_stack = + (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); + this_thr->th.th_task_state_top = 0; + this_thr->th.th_task_state_stack_sz = 4; + for (i = 0; i < this_thr->th.th_task_state_stack_sz; + ++i) // zero init the stack + this_thr->th.th_task_state_memo_stack[i] = 0; + } + + KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); + KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); + + KMP_MB(); +} + +/* allocate a new thread for the requesting team. 
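The th_cg_roots handling above, and its mirror in __kmp_reset_root earlier, is plain reference counting: a worker that adopts the primary thread's contention group increments cg_nthreads, a leaving thread decrements it, and whoever held the last reference frees the node; the group's cg_thread_limit is what lands in the thread-limit-var ICV. A hypothetical reduction of that bookkeeping (the real code does this under the fork/join lock, so no atomics are shown):

    #include <stdlib.h>

    struct cg_node {
      int nthreads;      // reference count
      int thread_limit;  // ICV inherited by every member of the group
    };

    static void cg_join(struct cg_node *cg) {
      cg->nthreads++;
    }

    static void cg_leave(struct cg_node *cg) {
      int was = cg->nthreads--;   // value before the decrement
      if (was == 1)
        free(cg);                 // last member out frees the node
    }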
this is only called from + within a forkjoin critical section. we will first try to get an available + thread from the thread pool. if none is available, we will fork a new one + assuming we are able to create a new one. this should be assured, as the + caller should check on this first. */ +kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, + int new_tid) { + kmp_team_t *serial_team; + kmp_info_t *new_thr; + int new_gtid; + + KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); + KMP_DEBUG_ASSERT(root && team); +#if !KMP_NESTED_HOT_TEAMS + KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); +#endif + KMP_MB(); + + /* first, try to get one from the thread pool */ + if (__kmp_thread_pool) { + new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); + __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; + if (new_thr == __kmp_thread_pool_insert_pt) { + __kmp_thread_pool_insert_pt = NULL; + } + TCW_4(new_thr->th.th_in_pool, FALSE); + __kmp_suspend_initialize_thread(new_thr); + __kmp_lock_suspend_mx(new_thr); + if (new_thr->th.th_active_in_pool == TRUE) { + KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); + KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); + new_thr->th.th_active_in_pool = FALSE; + } + __kmp_unlock_suspend_mx(new_thr); + + KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", + __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); + KMP_ASSERT(!new_thr->th.th_team); + KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); + + /* setup the thread structure */ + __kmp_initialize_info(new_thr, team, new_tid, + new_thr->th.th_info.ds.ds_gtid); + KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); + + TCW_4(__kmp_nth, __kmp_nth + 1); + + new_thr->th.th_task_state = 0; + new_thr->th.th_task_state_top = 0; + new_thr->th.th_task_state_stack_sz = 4; + + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Make sure pool thread has transitioned to waiting on own thread struct + KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); + // Thread activated in __kmp_allocate_team when increasing team size + } + +#ifdef KMP_ADJUST_BLOCKTIME + /* Adjust blocktime back to zero if necessary */ + /* Middle initialization might not have occurred yet */ + if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { + if (__kmp_nth > __kmp_avail_proc) { + __kmp_zero_bt = TRUE; + } + } +#endif /* KMP_ADJUST_BLOCKTIME */ + +#if KMP_DEBUG + // If thread entered pool via __kmp_free_thread, wait_flag should != + // KMP_BARRIER_PARENT_FLAG. + int b; + kmp_balign_t *balign = new_thr->th.th_bar; + for (b = 0; b < bs_last_barrier; ++b) + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); +#endif + + KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", + __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); + + KMP_MB(); + return new_thr; + } + + /* no, well fork a new one */ + KMP_ASSERT(__kmp_nth == __kmp_all_nth); + KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); + +#if KMP_USE_MONITOR + // If this is the first worker thread the RTL is creating, then also + // launch the monitor thread. We try to do this as early as possible. + if (!TCR_4(__kmp_init_monitor)) { + __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); + if (!TCR_4(__kmp_init_monitor)) { + KF_TRACE(10, ("before __kmp_create_monitor\n")); + TCW_4(__kmp_init_monitor, 1); + __kmp_create_monitor(&__kmp_monitor); + KF_TRACE(10, ("after __kmp_create_monitor\n")); +#if KMP_OS_WINDOWS + // AC: wait until monitor has started. 
This is a fix for CQ232808. + // The reason is that if the library is loaded/unloaded in a loop with + // small (parallel) work in between, then there is high probability that + // monitor thread started after the library shutdown. At shutdown it is + // too late to cope with the problem, because when the primary thread is + // in DllMain (process detach) the monitor has no chances to start (it is + // blocked), and primary thread has no means to inform the monitor that + // the library has gone, because all the memory which the monitor can + // access is going to be released/reset. + while (TCR_4(__kmp_init_monitor) < 2) { + KMP_YIELD(TRUE); + } + KF_TRACE(10, ("after monitor thread has started\n")); +#endif + } + __kmp_release_bootstrap_lock(&__kmp_monitor_lock); + } +#endif + + KMP_MB(); + + { + int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) + ? 1 + : __kmp_hidden_helper_threads_num + 1; + + for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; + ++new_gtid) { + KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); + } + + if (TCR_4(__kmp_init_hidden_helper_threads)) { + KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); + } + } + + /* allocate space for it. */ + new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); + + TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG + // suppress race conditions detection on synchronization flags in debug mode + // this helps to analyze library internals eliminating false positives + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); +#if KMP_OS_WINDOWS + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); +#else + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + &new_thr->th.th_suspend_init_count, + sizeof(new_thr->th.th_suspend_init_count)); +#endif + // TODO: check if we need to also suppress b_arrived flags + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), + sizeof(new_thr->th.th_bar[0].bb.b_go)); + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), + sizeof(new_thr->th.th_bar[1].bb.b_go)); + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), + sizeof(new_thr->th.th_bar[2].bb.b_go)); +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ + if (__kmp_storage_map) { + __kmp_print_thread_storage_map(new_thr, new_gtid); + } + + // add the reserve serialized team, initialized from the team's primary thread + { + kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); + KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); + new_thr->th.th_serial_team = serial_team = + (kmp_team_t *)__kmp_allocate_team(root, 1, 1, +#if OMPT_SUPPORT + ompt_data_none, // root parallel id +#endif + proc_bind_default, &r_icvs, + 0 USE_NESTED_HOT_ARG(NULL)); + } + KMP_ASSERT(serial_team); + serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for + // execution (it is unused for now). 
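Earlier in __kmp_allocate_thread the fast path pops a parked worker off __kmp_thread_pool; only when the pool is empty does the code above hunt for a free gtid and fork a brand-new worker. A skeleton of that pop-or-create policy with hypothetical types (again, the real code runs under the fork/join lock rather than being thread-safe on its own):

    #include <stdlib.h>

    struct worker {
      struct worker *next_pool; // free-list link, NULL while in use
      int gtid;
    };

    static struct worker *thread_pool; // head of the free list
    static int next_gtid = 1;

    static struct worker *allocate_worker(void) {
      if (thread_pool) {                       // fast path: reuse a parked worker
        struct worker *w = thread_pool;
        thread_pool = w->next_pool;
        w->next_pool = NULL;
        return w;
      }
      struct worker *w = calloc(1, sizeof *w); // slow path: fork a new one
      w->gtid = next_gtid++;
      return w;
    }

    static void release_worker(struct worker *w) { // called at join time
      w->next_pool = thread_pool;
      thread_pool = w;
    }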
+ serial_team->t.t_threads[0] = new_thr; + KF_TRACE(10, + ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", + new_thr)); + + /* setup the thread structures */ + __kmp_initialize_info(new_thr, team, new_tid, new_gtid); + +#if USE_FAST_MEMORY + __kmp_initialize_fast_memory(new_thr); +#endif /* USE_FAST_MEMORY */ + +#if KMP_USE_BGET + KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); + __kmp_initialize_bget(new_thr); +#endif + + __kmp_init_random(new_thr); // Initialize random number generator + + /* Initialize these only once when thread is grabbed for a team allocation */ + KA_TRACE(20, + ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", + __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); + + int b; + kmp_balign_t *balign = new_thr->th.th_bar; + for (b = 0; b < bs_last_barrier; ++b) { + balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; + balign[b].bb.team = NULL; + balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; + balign[b].bb.use_oncore_barrier = 0; + } + + TCW_PTR(new_thr->th.th_sleep_loc, NULL); + new_thr->th.th_sleep_loc_type = flag_unset; + + new_thr->th.th_spin_here = FALSE; + new_thr->th.th_next_waiting = 0; +#if KMP_OS_UNIX + new_thr->th.th_blocking = false; +#endif + +#if KMP_AFFINITY_SUPPORTED + new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; + new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; + new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; + new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; +#endif + new_thr->th.th_def_allocator = __kmp_def_allocator; + new_thr->th.th_prev_level = 0; + new_thr->th.th_prev_num_threads = 1; + + TCW_4(new_thr->th.th_in_pool, FALSE); + new_thr->th.th_active_in_pool = FALSE; + TCW_4(new_thr->th.th_active, TRUE); + + /* adjust the global counters */ + __kmp_all_nth++; + __kmp_nth++; + + // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low + // numbers of procs, and method #2 (keyed API call) for higher numbers. + if (__kmp_adjust_gtid_mode) { + if (__kmp_all_nth >= __kmp_tls_gtid_min) { + if (TCR_4(__kmp_gtid_mode) != 2) { + TCW_4(__kmp_gtid_mode, 2); + } + } else { + if (TCR_4(__kmp_gtid_mode) != 1) { + TCW_4(__kmp_gtid_mode, 1); + } + } + } + +#ifdef KMP_ADJUST_BLOCKTIME + /* Adjust blocktime back to zero if necessary */ + /* Middle initialization might not have occurred yet */ + if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { + if (__kmp_nth > __kmp_avail_proc) { + __kmp_zero_bt = TRUE; + } + } +#endif /* KMP_ADJUST_BLOCKTIME */ + +#if KMP_AFFINITY_SUPPORTED + // Set the affinity and topology information for new thread + __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE); +#endif + + /* actually fork it and create the new worker thread */ + KF_TRACE( + 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); + __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); + KF_TRACE(10, + ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); + + KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), + new_gtid)); + KMP_MB(); + return new_thr; +} + +/* Reinitialize team for reuse. + The hot team code calls this case at every fork barrier, so EPCC barrier + test are extremely sensitive to changes in it, esp. writes to the team + struct, which cause a cache invalidation in all threads. + IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ +static void __kmp_reinitialize_team(kmp_team_t *team, + kmp_internal_control_t *new_icvs, + ident_t *loc) { + KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", + team->t.t_threads[0], team)); + KMP_DEBUG_ASSERT(team && new_icvs); + KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); + KMP_CHECK_UPDATE(team->t.t_ident, loc); + + KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); + // Copy ICVs to the primary thread's implicit taskdata + __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); + + KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", + team->t.t_threads[0], team)); +} + +/* Initialize the team data structure. + This assumes the t_threads and t_max_nproc are already set. + Also, we don't touch the arguments */ +static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, + kmp_internal_control_t *new_icvs, + ident_t *loc) { + KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); + + /* verify */ + KMP_DEBUG_ASSERT(team); + KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); + KMP_DEBUG_ASSERT(team->t.t_threads); + KMP_MB(); + + team->t.t_master_tid = 0; /* not needed */ + /* team->t.t_master_bar; not needed */ + team->t.t_serialized = new_nproc > 1 ? 0 : 1; + team->t.t_nproc = new_nproc; + + /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ + team->t.t_next_pool = NULL; + /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess + * up hot team */ + + TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ + team->t.t_invoke = NULL; /* not needed */ + + // TODO???: team->t.t_max_active_levels = new_max_active_levels; + team->t.t_sched.sched = new_icvs->sched.sched; + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + team->t.t_fp_control_saved = FALSE; /* not needed */ + team->t.t_x87_fpu_control_word = 0; /* not needed */ + team->t.t_mxcsr = 0; /* not needed */ +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + + team->t.t_construct = 0; + + team->t.t_ordered.dt.t_value = 0; + team->t.t_master_active = FALSE; + +#ifdef KMP_DEBUG + team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ +#endif +#if KMP_OS_WINDOWS + team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ +#endif + + team->t.t_control_stack_top = NULL; + + __kmp_reinitialize_team(team, new_icvs, loc); + + KMP_MB(); + KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); +} + +#if KMP_AFFINITY_SUPPORTED +static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th, + int first, int last, int newp) { + th->th.th_first_place = first; + th->th.th_last_place = last; + th->th.th_new_place = newp; + if (newp != th->th.th_current_place) { + if (__kmp_display_affinity && team->t.t_display_affinity != 1) + team->t.t_display_affinity = 1; + // Copy topology information associated with the new place + th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place]; + th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place]; + } +} + +// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. +// It calculates the worker + primary thread's partition based upon the parent +// thread's partition, and binds each worker to a thread in their partition. +// The primary thread's partition should already include its current binding. 
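+// Illustrative walk-through (example numbers chosen here for clarity): in the
+// "more threads than places" round-robin used below, with a partition of 4
+// places [0..3], the primary bound to place 1, and 6 threads, S = 6/4 = 1,
+// rem = 2, gap = 4/2 = 2, so the loop lands threads 0..5 on places
+// 1, 1, 2, 3, 3, 0 -- every "gap"-th place absorbs one of the two remainder
+// threads and the scan wraps around to end back on the primary's place.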
+static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { + // Do not partition places for the hidden helper team + if (KMP_HIDDEN_HELPER_TEAM(team)) + return; + // Copy the primary thread's place partition to the team struct + kmp_info_t *master_th = team->t.t_threads[0]; + KMP_DEBUG_ASSERT(master_th != NULL); + kmp_proc_bind_t proc_bind = team->t.t_proc_bind; + int first_place = master_th->th.th_first_place; + int last_place = master_th->th.th_last_place; + int masters_place = master_th->th.th_current_place; + int num_masks = __kmp_affinity.num_masks; + team->t.t_first_place = first_place; + team->t.t_last_place = last_place; + + KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " + "bound to place %d partition = [%d,%d]\n", + proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), + team->t.t_id, masters_place, first_place, last_place)); + + switch (proc_bind) { + + case proc_bind_default: + // Serial teams might have the proc_bind policy set to proc_bind_default. + // Not an issue -- we don't rebind primary thread for any proc_bind policy. + KMP_DEBUG_ASSERT(team->t.t_nproc == 1); + break; + + case proc_bind_primary: { + int f; + int n_th = team->t.t_nproc; + for (f = 1; f < n_th; f++) { + kmp_info_t *th = team->t.t_threads[f]; + KMP_DEBUG_ASSERT(th != NULL); + __kmp_set_thread_place(team, th, first_place, last_place, masters_place); + + KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " + "partition = [%d,%d]\n", + __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, + f, masters_place, first_place, last_place)); + } + } break; + + case proc_bind_close: { + int f; + int n_th = team->t.t_nproc; + int n_places; + if (first_place <= last_place) { + n_places = last_place - first_place + 1; + } else { + n_places = num_masks - first_place + last_place + 1; + } + if (n_th <= n_places) { + int place = masters_place; + for (f = 1; f < n_th; f++) { + kmp_info_t *th = team->t.t_threads[f]; + KMP_DEBUG_ASSERT(th != NULL); + + if (place == last_place) { + place = first_place; + } else if (place == (num_masks - 1)) { + place = 0; + } else { + place++; + } + __kmp_set_thread_place(team, th, first_place, last_place, place); + + KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " + "partition = [%d,%d]\n", + __kmp_gtid_from_thread(team->t.t_threads[f]), + team->t.t_id, f, place, first_place, last_place)); + } + } else { + int S, rem, gap, s_count; + S = n_th / n_places; + s_count = 0; + rem = n_th - (S * n_places); + gap = rem > 0 ? 
n_places / rem : n_places; + int place = masters_place; + int gap_ct = gap; + for (f = 0; f < n_th; f++) { + kmp_info_t *th = team->t.t_threads[f]; + KMP_DEBUG_ASSERT(th != NULL); + + __kmp_set_thread_place(team, th, first_place, last_place, place); + s_count++; + + if ((s_count == S) && rem && (gap_ct == gap)) { + // do nothing, add an extra thread to place on next iteration + } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { + // we added an extra thread to this place; move to next place + if (place == last_place) { + place = first_place; + } else if (place == (num_masks - 1)) { + place = 0; + } else { + place++; + } + s_count = 0; + gap_ct = 1; + rem--; + } else if (s_count == S) { // place full; don't add extra + if (place == last_place) { + place = first_place; + } else if (place == (num_masks - 1)) { + place = 0; + } else { + place++; + } + gap_ct++; + s_count = 0; + } + + KA_TRACE(100, + ("__kmp_partition_places: close: T#%d(%d:%d) place %d " + "partition = [%d,%d]\n", + __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, + th->th.th_new_place, first_place, last_place)); + } + KMP_DEBUG_ASSERT(place == masters_place); + } + } break; + + case proc_bind_spread: { + int f; + int n_th = team->t.t_nproc; + int n_places; + int thidx; + if (first_place <= last_place) { + n_places = last_place - first_place + 1; + } else { + n_places = num_masks - first_place + last_place + 1; + } + if (n_th <= n_places) { + int place = -1; + + if (n_places != num_masks) { + int S = n_places / n_th; + int s_count, rem, gap, gap_ct; + + place = masters_place; + rem = n_places - n_th * S; + gap = rem ? n_th / rem : 1; + gap_ct = gap; + thidx = n_th; + if (update_master_only == 1) + thidx = 1; + for (f = 0; f < thidx; f++) { + kmp_info_t *th = team->t.t_threads[f]; + KMP_DEBUG_ASSERT(th != NULL); + + int fplace = place, nplace = place; + s_count = 1; + while (s_count < S) { + if (place == last_place) { + place = first_place; + } else if (place == (num_masks - 1)) { + place = 0; + } else { + place++; + } + s_count++; + } + if (rem && (gap_ct == gap)) { + if (place == last_place) { + place = first_place; + } else if (place == (num_masks - 1)) { + place = 0; + } else { + place++; + } + rem--; + gap_ct = 0; + } + __kmp_set_thread_place(team, th, fplace, place, nplace); + gap_ct++; + + if (place == last_place) { + place = first_place; + } else if (place == (num_masks - 1)) { + place = 0; + } else { + place++; + } + + KA_TRACE(100, + ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " + "partition = [%d,%d], num_masks: %u\n", + __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, + f, th->th.th_new_place, th->th.th_first_place, + th->th.th_last_place, num_masks)); + } + } else { + /* Having uniform space of available computation places I can create + T partitions of round(P/T) size and put threads into the first + place of each partition. 
*/
+          double current = static_cast<double>(masters_place);
+          double spacing =
+              (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
+          int first, last;
+          kmp_info_t *th;
+
+          thidx = n_th + 1;
+          if (update_master_only == 1)
+            thidx = 1;
+          for (f = 0; f < thidx; f++) {
+            first = static_cast<int>(current);
+            last = static_cast<int>(current + spacing) - 1;
+            KMP_DEBUG_ASSERT(last >= first);
+            if (first >= n_places) {
+              if (masters_place) {
+                first -= n_places;
+                last -= n_places;
+                if (first == (masters_place + 1)) {
+                  KMP_DEBUG_ASSERT(f == n_th);
+                  first--;
+                }
+                if (last == masters_place) {
+                  KMP_DEBUG_ASSERT(f == (n_th - 1));
+                  last--;
+                }
+              } else {
+                KMP_DEBUG_ASSERT(f == n_th);
+                first = 0;
+                last = 0;
+              }
+            }
+            if (last >= n_places) {
+              last = (n_places - 1);
+            }
+            place = first;
+            current += spacing;
+            if (f < n_th) {
+              KMP_DEBUG_ASSERT(0 <= first);
+              KMP_DEBUG_ASSERT(n_places > first);
+              KMP_DEBUG_ASSERT(0 <= last);
+              KMP_DEBUG_ASSERT(n_places > last);
+              KMP_DEBUG_ASSERT(last_place >= first_place);
+              th = team->t.t_threads[f];
+              KMP_DEBUG_ASSERT(th);
+              __kmp_set_thread_place(team, th, first, last, place);
+              KA_TRACE(100,
+                       ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
+                        "partition = [%d,%d], spacing = %.4f\n",
+                        __kmp_gtid_from_thread(team->t.t_threads[f]),
+                        team->t.t_id, f, th->th.th_new_place,
+                        th->th.th_first_place, th->th.th_last_place, spacing));
+            }
+          }
+        }
+        KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
+      } else {
+        int S, rem, gap, s_count;
+        S = n_th / n_places;
+        s_count = 0;
+        rem = n_th - (S * n_places);
+        gap = rem > 0 ? n_places / rem : n_places;
+        int place = masters_place;
+        int gap_ct = gap;
+        thidx = n_th;
+        if (update_master_only == 1)
+          thidx = 1;
+        for (f = 0; f < thidx; f++) {
+          kmp_info_t *th = team->t.t_threads[f];
+          KMP_DEBUG_ASSERT(th != NULL);
+
+          __kmp_set_thread_place(team, th, place, place, place);
+          s_count++;
+
+          if ((s_count == S) && rem && (gap_ct == gap)) {
+            // do nothing, add an extra thread to place on next iteration
+          } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
+            // we added an extra thread to this place; move on to next place
+            if (place == last_place) {
+              place = first_place;
+            } else if (place == (num_masks - 1)) {
+              place = 0;
+            } else {
+              place++;
+            }
+            s_count = 0;
+            gap_ct = 1;
+            rem--;
+          } else if (s_count == S) { // place is full; don't add extra thread
+            if (place == last_place) {
+              place = first_place;
+            } else if (place == (num_masks - 1)) {
+              place = 0;
+            } else {
+              place++;
+            }
+            gap_ct++;
+            s_count = 0;
+          }
+
+          KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
+                         "partition = [%d,%d]\n",
+                         __kmp_gtid_from_thread(team->t.t_threads[f]),
+                         team->t.t_id, f, th->th.th_new_place,
+                         th->th.th_first_place, th->th.th_last_place));
+        }
+        KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
+      }
+    } break;
+
+  default:
+    break;
+  }
+
+  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
+}
+
+#endif // KMP_AFFINITY_SUPPORTED
+
+/* allocate a new team data structure to use.
take one off of the free pool if + available */ +kmp_team_t * +__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data, +#endif + kmp_proc_bind_t new_proc_bind, + kmp_internal_control_t *new_icvs, + int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); + int f; + kmp_team_t *team; + int use_hot_team = !root->r.r_active; + int level = 0; + int do_place_partition = 1; + + KA_TRACE(20, ("__kmp_allocate_team: called\n")); + KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); + KMP_DEBUG_ASSERT(max_nproc >= new_nproc); + KMP_MB(); + +#if KMP_NESTED_HOT_TEAMS + kmp_hot_team_ptr_t *hot_teams; + if (master) { + team = master->th.th_team; + level = team->t.t_active_level; + if (master->th.th_teams_microtask) { // in teams construct? + if (master->th.th_teams_size.nteams > 1 && + ( // #teams > 1 + team->t.t_pkfn == + (microtask_t)__kmp_teams_master || // inner fork of the teams + master->th.th_teams_level < + team->t.t_level)) { // or nested parallel inside the teams + ++level; // not increment if #teams==1, or for outer fork of the teams; + // increment otherwise + } + // Do not perform the place partition if inner fork of the teams + // Wait until nested parallel region encountered inside teams construct + if ((master->th.th_teams_size.nteams == 1 && + master->th.th_teams_level >= team->t.t_level) || + (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) + do_place_partition = 0; + } + hot_teams = master->th.th_hot_teams; + if (level < __kmp_hot_teams_max_level && hot_teams && + hot_teams[level].hot_team) { + // hot team has already been allocated for given level + use_hot_team = 1; + } else { + use_hot_team = 0; + } + } else { + // check we won't access uninitialized hot_teams, just in case + KMP_DEBUG_ASSERT(new_nproc == 1); + } +#endif + // Optimization to use a "hot" team + if (use_hot_team && new_nproc > 1) { + KMP_DEBUG_ASSERT(new_nproc <= max_nproc); +#if KMP_NESTED_HOT_TEAMS + team = hot_teams[level].hot_team; +#else + team = root->r.r_hot_team; +#endif +#if KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec) { + KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " + "task_team[1] = %p before reinit\n", + team->t.t_task_team[0], team->t.t_task_team[1])); + } +#endif + + if (team->t.t_nproc != new_nproc && + __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Distributed barrier may need a resize + int old_nthr = team->t.t_nproc; + __kmp_resize_dist_barrier(team, old_nthr, new_nproc); + } + + // If not doing the place partition, then reset the team's proc bind + // to indicate that partitioning of all threads still needs to take place + if (do_place_partition == 0) + team->t.t_proc_bind = proc_bind_default; + // Has the number of threads changed? + /* Let's assume the most common case is that the number of threads is + unchanged, and put that case first. 
*/ + if (team->t.t_nproc == new_nproc) { // Check changes in number of threads + KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); + // This case can mean that omp_set_num_threads() was called and the hot + // team size was already reduced, so we check the special flag + if (team->t.t_size_changed == -1) { + team->t.t_size_changed = 1; + } else { + KMP_CHECK_UPDATE(team->t.t_size_changed, 0); + } + + // TODO???: team->t.t_max_active_levels = new_max_active_levels; + kmp_r_sched_t new_sched = new_icvs->sched; + // set primary thread's schedule as new run-time schedule + KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); + + __kmp_reinitialize_team(team, new_icvs, + root->r.r_uber_thread->th.th_ident); + + KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, + team->t.t_threads[0], team)); + __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); + +#if KMP_AFFINITY_SUPPORTED + if ((team->t.t_size_changed == 0) && + (team->t.t_proc_bind == new_proc_bind)) { + if (new_proc_bind == proc_bind_spread) { + if (do_place_partition) { + // add flag to update only master for spread + __kmp_partition_places(team, 1); + } + } + KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " + "proc_bind = %d, partition = [%d,%d]\n", + team->t.t_id, new_proc_bind, team->t.t_first_place, + team->t.t_last_place)); + } else { + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); + __kmp_partition_places(team); + } + } +#else + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); +#endif /* KMP_AFFINITY_SUPPORTED */ + } else if (team->t.t_nproc > new_nproc) { + KA_TRACE(20, + ("__kmp_allocate_team: decreasing hot team thread count to %d\n", + new_nproc)); + + team->t.t_size_changed = 1; + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Barrier size already reduced earlier in this function + // Activate team threads via th_used_in_team + __kmp_add_threads_to_team(team, new_nproc); + } +#if KMP_NESTED_HOT_TEAMS + if (__kmp_hot_teams_mode == 0) { + // AC: saved number of threads should correspond to team's value in this + // mode, can be bigger in mode 1, when hot team has threads in reserve + KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); + hot_teams[level].hot_team_nth = new_nproc; +#endif // KMP_NESTED_HOT_TEAMS + /* release the extra threads we don't need any more */ + for (f = new_nproc; f < team->t.t_nproc; f++) { + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + if (__kmp_tasking_mode != tskm_immediate_exec) { + // When decreasing team size, threads no longer in the team should + // unref task team. 
+ team->t.t_threads[f]->th.th_task_team = NULL; + } + __kmp_free_thread(team->t.t_threads[f]); + team->t.t_threads[f] = NULL; + } +#if KMP_NESTED_HOT_TEAMS + } // (__kmp_hot_teams_mode == 0) + else { + // When keeping extra threads in team, switch threads to wait on own + // b_go flag + for (f = new_nproc; f < team->t.t_nproc; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; + for (int b = 0; b < bs_last_barrier; ++b) { + if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { + balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; + } + KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); + } + } + } +#endif // KMP_NESTED_HOT_TEAMS + team->t.t_nproc = new_nproc; + // TODO???: team->t.t_max_active_levels = new_max_active_levels; + KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); + __kmp_reinitialize_team(team, new_icvs, + root->r.r_uber_thread->th.th_ident); + + // Update remaining threads + for (f = 0; f < new_nproc; ++f) { + team->t.t_threads[f]->th.th_team_nproc = new_nproc; + } + + // restore the current task state of the primary thread: should be the + // implicit task + KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, + team->t.t_threads[0], team)); + + __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); + +#ifdef KMP_DEBUG + for (f = 0; f < team->t.t_nproc; f++) { + KMP_DEBUG_ASSERT(team->t.t_threads[f] && + team->t.t_threads[f]->th.th_team_nproc == + team->t.t_nproc); + } +#endif + + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); +#if KMP_AFFINITY_SUPPORTED + __kmp_partition_places(team); +#endif + } + } else { // team->t.t_nproc < new_nproc + + KA_TRACE(20, + ("__kmp_allocate_team: increasing hot team thread count to %d\n", + new_nproc)); + int old_nproc = team->t.t_nproc; // save old value and use to update only + team->t.t_size_changed = 1; + +#if KMP_NESTED_HOT_TEAMS + int avail_threads = hot_teams[level].hot_team_nth; + if (new_nproc < avail_threads) + avail_threads = new_nproc; + kmp_info_t **other_threads = team->t.t_threads; + for (f = team->t.t_nproc; f < avail_threads; ++f) { + // Adjust barrier data of reserved threads (if any) of the team + // Other data will be set in __kmp_initialize_info() below. + int b; + kmp_balign_t *balign = other_threads[f]->th.th_bar; + for (b = 0; b < bs_last_barrier; ++b) { + balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); +#if USE_DEBUGGER + balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; +#endif + } + } + if (hot_teams[level].hot_team_nth >= new_nproc) { + // we have all needed threads in reserve, no need to allocate any + // this only possible in mode 1, cannot have reserved threads in mode 0 + KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); + team->t.t_nproc = new_nproc; // just get reserved threads involved + } else { + // We may have some threads in reserve, but not enough; + // get reserved threads involved if any. + team->t.t_nproc = hot_teams[level].hot_team_nth; + hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size +#endif // KMP_NESTED_HOT_TEAMS + if (team->t.t_max_nproc < new_nproc) { + /* reallocate larger arrays */ + __kmp_reallocate_team_arrays(team, new_nproc); + __kmp_reinitialize_team(team, new_icvs, NULL); + } + +#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED + /* Temporarily set full mask for primary thread before creation of + workers. 
The reason is that workers inherit the affinity from the + primary thread, so if a lot of workers are created on the single + core quickly, they don't get a chance to set their own affinity for + a long time. */ + kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask}; +#endif + + /* allocate new threads for the hot team */ + for (f = team->t.t_nproc; f < new_nproc; f++) { + kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); + KMP_DEBUG_ASSERT(new_worker); + team->t.t_threads[f] = new_worker; + + KA_TRACE(20, + ("__kmp_allocate_team: team %d init T#%d arrived: " + "join=%llu, plain=%llu\n", + team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, + team->t.t_bar[bs_forkjoin_barrier].b_arrived, + team->t.t_bar[bs_plain_barrier].b_arrived)); + + { // Initialize barrier data for new threads. + int b; + kmp_balign_t *balign = new_worker->th.th_bar; + for (b = 0; b < bs_last_barrier; ++b) { + balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != + KMP_BARRIER_PARENT_FLAG); +#if USE_DEBUGGER + balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; +#endif + } + } + } + +#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED + /* Restore initial primary thread's affinity mask */ + new_temp_affinity.restore(); +#endif +#if KMP_NESTED_HOT_TEAMS + } // end of check of t_nproc vs. new_nproc vs. hot_team_nth +#endif // KMP_NESTED_HOT_TEAMS + if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Barrier size already increased earlier in this function + // Activate team threads via th_used_in_team + __kmp_add_threads_to_team(team, new_nproc); + } + /* make sure everyone is syncronized */ + // new threads below + __kmp_initialize_team(team, new_nproc, new_icvs, + root->r.r_uber_thread->th.th_ident); + + /* reinitialize the threads */ + KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); + for (f = 0; f < team->t.t_nproc; ++f) + __kmp_initialize_info(team->t.t_threads[f], team, f, + __kmp_gtid_from_tid(f, team)); + + // set th_task_state for new threads in hot team with older thread's state + kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state; + for (f = old_nproc; f < team->t.t_nproc; ++f) + team->t.t_threads[f]->th.th_task_state = old_state; + +#ifdef KMP_DEBUG + for (f = 0; f < team->t.t_nproc; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f] && + team->t.t_threads[f]->th.th_team_nproc == + team->t.t_nproc); + } +#endif + + if (do_place_partition) { + KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); +#if KMP_AFFINITY_SUPPORTED + __kmp_partition_places(team); +#endif + } + } // Check changes in number of threads + + kmp_info_t *master = team->t.t_threads[0]; + if (master->th.th_teams_microtask) { + for (f = 1; f < new_nproc; ++f) { + // propagate teams construct specific info to workers + kmp_info_t *thr = team->t.t_threads[f]; + thr->th.th_teams_microtask = master->th.th_teams_microtask; + thr->th.th_teams_level = master->th.th_teams_level; + thr->th.th_teams_size = master->th.th_teams_size; + } + } +#if KMP_NESTED_HOT_TEAMS + if (level) { + // Sync barrier state for nested hot teams, not needed for outermost hot + // team. 
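+      // (Presumably the point of this copy: workers reused from a nested hot
+      // team still hold b_arrived counts from their previous run, so taking
+      // the team's current b_arrived below keeps the next barrier gather from
+      // miscounting arrivals.)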
+ for (f = 1; f < new_nproc; ++f) { + kmp_info_t *thr = team->t.t_threads[f]; + int b; + kmp_balign_t *balign = thr->th.th_bar; + for (b = 0; b < bs_last_barrier; ++b) { + balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); +#if USE_DEBUGGER + balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; +#endif + } + } + } +#endif // KMP_NESTED_HOT_TEAMS + + /* reallocate space for arguments if necessary */ + __kmp_alloc_argv_entries(argc, team, TRUE); + KMP_CHECK_UPDATE(team->t.t_argc, argc); + // The hot team re-uses the previous task team, + // if untouched during the previous release->gather phase. + + KF_TRACE(10, (" hot_team = %p\n", team)); + +#if KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec) { + KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " + "task_team[1] = %p after reinit\n", + team->t.t_task_team[0], team->t.t_task_team[1])); + } +#endif + +#if OMPT_SUPPORT + __ompt_team_assign_id(team, ompt_parallel_data); +#endif + + KMP_MB(); + + return team; + } + + /* next, let's try to take one from the team pool */ + KMP_MB(); + for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { + /* TODO: consider resizing undersized teams instead of reaping them, now + that we have a resizing mechanism */ + if (team->t.t_max_nproc >= max_nproc) { + /* take this team from the team pool */ + __kmp_team_pool = team->t.t_next_pool; + + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (!team->t.b) { // Allocate barrier structure + team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); + } + } + + /* setup the team for fresh use */ + __kmp_initialize_team(team, new_nproc, new_icvs, NULL); + + KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " + "task_team[1] %p to NULL\n", + &team->t.t_task_team[0], &team->t.t_task_team[1])); + team->t.t_task_team[0] = NULL; + team->t.t_task_team[1] = NULL; + + /* reallocate space for arguments if necessary */ + __kmp_alloc_argv_entries(argc, team, TRUE); + KMP_CHECK_UPDATE(team->t.t_argc, argc); + + KA_TRACE( + 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", + team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); + { // Initialize barrier data. + int b; + for (b = 0; b < bs_last_barrier; ++b) { + team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; +#if USE_DEBUGGER + team->t.t_bar[b].b_master_arrived = 0; + team->t.t_bar[b].b_team_arrived = 0; +#endif + } + } + + team->t.t_proc_bind = new_proc_bind; + + KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", + team->t.t_id)); + +#if OMPT_SUPPORT + __ompt_team_assign_id(team, ompt_parallel_data); +#endif + + KMP_MB(); + + return team; + } + + /* reap team if it is too small, then loop back and check the next one */ + // not sure if this is wise, but, will be redone during the hot-teams + // rewrite. + /* TODO: Use technique to find the right size hot-team, don't reap them */ + team = __kmp_reap_team(team); + __kmp_team_pool = team; + } + + /* nothing available in the pool, no matter, make a new team! 
*/ + KMP_MB(); + team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); + + /* and set it up */ + team->t.t_max_nproc = max_nproc; + if (max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + // Allocate barrier structure + team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); + } + + /* NOTE well, for some reason allocating one big buffer and dividing it up + seems to really hurt performance a lot on the P4, so, let's not use this */ + __kmp_allocate_team_arrays(team, max_nproc); + + KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); + __kmp_initialize_team(team, new_nproc, new_icvs, NULL); + + KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " + "%p to NULL\n", + &team->t.t_task_team[0], &team->t.t_task_team[1])); + team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes + // memory, no need to duplicate + team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes + // memory, no need to duplicate + + if (__kmp_storage_map) { + __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); + } + + /* allocate space for arguments */ + __kmp_alloc_argv_entries(argc, team, FALSE); + team->t.t_argc = argc; + + KA_TRACE(20, + ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", + team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); + { // Initialize barrier data. + int b; + for (b = 0; b < bs_last_barrier; ++b) { + team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; +#if USE_DEBUGGER + team->t.t_bar[b].b_master_arrived = 0; + team->t.t_bar[b].b_team_arrived = 0; +#endif + } + } + + team->t.t_proc_bind = new_proc_bind; + +#if OMPT_SUPPORT + __ompt_team_assign_id(team, ompt_parallel_data); + team->t.ompt_serialized_team_info = NULL; +#endif + + KMP_MB(); + + KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", + team->t.t_id)); + + return team; +} + +/* TODO implement hot-teams at all levels */ +/* TODO implement lazy thread release on demand (disband request) */ + +/* free the team. return it to the team pool. release all the threads + * associated with it */ +void __kmp_free_team(kmp_root_t *root, + kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { + int f; + KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), + team->t.t_id)); + + /* verify state */ + KMP_DEBUG_ASSERT(root); + KMP_DEBUG_ASSERT(team); + KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); + KMP_DEBUG_ASSERT(team->t.t_threads); + + int use_hot_team = team == root->r.r_hot_team; +#if KMP_NESTED_HOT_TEAMS + int level; + if (master) { + level = team->t.t_active_level - 1; + if (master->th.th_teams_microtask) { // in teams construct? + if (master->th.th_teams_size.nteams > 1) { + ++level; // level was not increased in teams construct for + // team_of_masters + } + if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && + master->th.th_teams_level == team->t.t_level) { + ++level; // level was not increased in teams construct for + // team_of_workers before the parallel + } // team->t.t_level will be increased inside parallel + } +#if KMP_DEBUG + kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; +#endif + if (level < __kmp_hot_teams_max_level) { + KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); + use_hot_team = 1; + } + } +#endif // KMP_NESTED_HOT_TEAMS + + /* team is done working */ + TCW_SYNC_PTR(team->t.t_pkfn, + NULL); // Important for Debugging Support Library. 
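+  // Roughly, the two paths below: a hot team keeps its workers and only
+  // cleans up contention-group bookkeeping, while a non-hot team waits for
+  // its workers to become reapable, drops its task teams, returns each
+  // worker to the pool via __kmp_free_thread, and parks the team struct on
+  // __kmp_team_pool for reuse.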
+#if KMP_OS_WINDOWS + team->t.t_copyin_counter = 0; // init counter for possible reuse +#endif + // Do not reset pointer to parent team to NULL for hot teams. + + /* if we are non-hot team, release our threads */ + if (!use_hot_team) { + if (__kmp_tasking_mode != tskm_immediate_exec) { + // Wait for threads to reach reapable state + for (f = 1; f < team->t.t_nproc; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + kmp_info_t *th = team->t.t_threads[f]; + volatile kmp_uint32 *state = &th->th.th_reap_state; + while (*state != KMP_SAFE_TO_REAP) { +#if KMP_OS_WINDOWS + // On Windows a thread can be killed at any time, check this + DWORD ecode; + if (!__kmp_is_thread_alive(th, &ecode)) { + *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread + break; + } +#endif + // first check if thread is sleeping + kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); + if (fl.is_sleeping()) + fl.resume(__kmp_gtid_from_thread(th)); + KMP_CPU_PAUSE(); + } + } + + // Delete task teams + int tt_idx; + for (tt_idx = 0; tt_idx < 2; ++tt_idx) { + kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; + if (task_team != NULL) { + for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + team->t.t_threads[f]->th.th_task_team = NULL; + } + KA_TRACE( + 20, + ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", + __kmp_get_gtid(), task_team, team->t.t_id)); +#if KMP_NESTED_HOT_TEAMS + __kmp_free_task_team(master, task_team); +#endif + team->t.t_task_team[tt_idx] = NULL; + } + } + } + + // Reset pointer to parent team only for non-hot teams. + team->t.t_parent = NULL; + team->t.t_level = 0; + team->t.t_active_level = 0; + + /* free the worker threads */ + for (f = 1; f < team->t.t_nproc; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), + 1, 2); + } + __kmp_free_thread(team->t.t_threads[f]); + } + + if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + if (team->t.b) { + // wake up thread at old location + team->t.b->go_release(); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + for (f = 1; f < team->t.t_nproc; ++f) { + if (team->t.b->sleep[f].sleep) { + __kmp_atomic_resume_64( + team->t.t_threads[f]->th.th_info.ds.ds_gtid, + (kmp_atomic_flag_64<> *)NULL); + } + } + } + // Wait for threads to be removed from team + for (int f = 1; f < team->t.t_nproc; ++f) { + while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) + KMP_CPU_PAUSE(); + } + } + } + + for (f = 1; f < team->t.t_nproc; ++f) { + team->t.t_threads[f] = NULL; + } + + if (team->t.t_max_nproc > 1 && + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { + distributedBarrier::deallocate(team->t.b); + team->t.b = NULL; + } + /* put the team back in the team pool */ + /* TODO limit size of team pool, call reap_team if pool too large */ + team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); + __kmp_team_pool = (volatile kmp_team_t *)team; + } else { // Check if team was created for primary threads in teams construct + // See if first worker is a CG root + KMP_DEBUG_ASSERT(team->t.t_threads[1] && + team->t.t_threads[1]->th.th_cg_roots); + if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { + // Clean up the CG root nodes on workers so that this team can be re-used + for (f = 1; f < team->t.t_nproc; ++f) { + kmp_info_t *thr = team->t.t_threads[f]; + 
KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && + thr->th.th_cg_roots->cg_root == thr); + // Pop current CG root off list + kmp_cg_root_t *tmp = thr->th.th_cg_roots; + thr->th.th_cg_roots = tmp->up; + KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" + " up to node %p. cg_nthreads was %d\n", + thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); + int i = tmp->cg_nthreads--; + if (i == 1) { + __kmp_free(tmp); // free CG if we are the last thread in it + } + // Restore current task's thread_limit from CG root + if (thr->th.th_cg_roots) + thr->th.th_current_task->td_icvs.thread_limit = + thr->th.th_cg_roots->cg_thread_limit; + } + } + } + + KMP_MB(); +} + +/* reap the team. destroy it, reclaim all its resources and free its memory */ +kmp_team_t *__kmp_reap_team(kmp_team_t *team) { + kmp_team_t *next_pool = team->t.t_next_pool; + + KMP_DEBUG_ASSERT(team); + KMP_DEBUG_ASSERT(team->t.t_dispatch); + KMP_DEBUG_ASSERT(team->t.t_disp_buffer); + KMP_DEBUG_ASSERT(team->t.t_threads); + KMP_DEBUG_ASSERT(team->t.t_argv); + + /* TODO clean the threads that are a part of this? */ + + /* free stuff */ + __kmp_free_team_arrays(team); + if (team->t.t_argv != &team->t.t_inline_argv[0]) + __kmp_free((void *)team->t.t_argv); + __kmp_free(team); + + KMP_MB(); + return next_pool; +} + +// Free the thread. Don't reap it, just place it on the pool of available +// threads. +// +// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid +// binding for the affinity mechanism to be useful. +// +// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. +// However, we want to avoid a potential performance problem by always +// scanning through the list to find the correct point at which to insert +// the thread (potential N**2 behavior). To do this we keep track of the +// last place a thread struct was inserted (__kmp_thread_pool_insert_pt). +// With single-level parallelism, threads will always be added to the tail +// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested +// parallelism, all bets are off and we may need to scan through the entire +// free list. +// +// This change also has a potentially large performance benefit, for some +// applications. Previously, as threads were freed from the hot team, they +// would be placed back on the free list in inverse order. If the hot team +// grew back to it's original size, then the freed thread would be placed +// back on the hot team in reverse order. This could cause bad cache +// locality problems on programs where the size of the hot team regularly +// grew and shrunk. +// +// Now, for single-level parallelism, the OMP tid is always == gtid. +void __kmp_free_thread(kmp_info_t *this_th) { + int gtid; + kmp_info_t **scan; + + KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", + __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); + + KMP_DEBUG_ASSERT(this_th); + + // When moving thread to pool, switch thread to wait on own b_go flag, and + // uninitialized (NULL team). 
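+  // (Presumably KMP_BARRIER_PARENT_FLAG means the thread had been spinning on
+  // its parent's flag in the tree barrier; with the team dissolving there is
+  // no parent left, so the thread is switched to wait on its own b_go flag,
+  // which any future team can use to wake it.)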
+ int b; + kmp_balign_t *balign = this_th->th.th_bar; + for (b = 0; b < bs_last_barrier; ++b) { + if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) + balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; + balign[b].bb.team = NULL; + balign[b].bb.leaf_kids = 0; + } + this_th->th.th_task_state = 0; + this_th->th.th_reap_state = KMP_SAFE_TO_REAP; + + /* put thread back on the free pool */ + TCW_PTR(this_th->th.th_team, NULL); + TCW_PTR(this_th->th.th_root, NULL); + TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ + + while (this_th->th.th_cg_roots) { + this_th->th.th_cg_roots->cg_nthreads--; + KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" + " %p of thread %p to %d\n", + this_th, this_th->th.th_cg_roots, + this_th->th.th_cg_roots->cg_root, + this_th->th.th_cg_roots->cg_nthreads)); + kmp_cg_root_t *tmp = this_th->th.th_cg_roots; + if (tmp->cg_root == this_th) { // Thread is a cg_root + KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); + KA_TRACE( + 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); + this_th->th.th_cg_roots = tmp->up; + __kmp_free(tmp); + } else { // Worker thread + if (tmp->cg_nthreads == 0) { // last thread leaves contention group + __kmp_free(tmp); + } + this_th->th.th_cg_roots = NULL; + break; + } + } + + /* If the implicit task assigned to this thread can be used by other threads + * -> multiple threads can share the data and try to free the task at + * __kmp_reap_thread at exit. This duplicate use of the task data can happen + * with higher probability when hot team is disabled but can occurs even when + * the hot team is enabled */ + __kmp_free_implicit_task(this_th); + this_th->th.th_current_task = NULL; + + // If the __kmp_thread_pool_insert_pt is already past the new insert + // point, then we need to re-scan the entire list. + gtid = this_th->th.th_info.ds.ds_gtid; + if (__kmp_thread_pool_insert_pt != NULL) { + KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); + if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { + __kmp_thread_pool_insert_pt = NULL; + } + } + + // Scan down the list to find the place to insert the thread. + // scan is the address of a link in the list, possibly the address of + // __kmp_thread_pool itself. + // + // In the absence of nested parallelism, the for loop will have 0 iterations. + if (__kmp_thread_pool_insert_pt != NULL) { + scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); + } else { + scan = CCAST(kmp_info_t **, &__kmp_thread_pool); + } + for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); + scan = &((*scan)->th.th_next_pool)) + ; + + // Insert the new element on the list, and set __kmp_thread_pool_insert_pt + // to its address. 
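+  // For example, with illustrative gtids: freeing T#5 into a pool holding
+  // T#2 -> T#3 -> T#7 splices it in after T#3, and the insert point is left
+  // at T#5, so a later free of T#6 starts scanning there rather than at the
+  // head of the list.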
+ TCW_PTR(this_th->th.th_next_pool, *scan); + __kmp_thread_pool_insert_pt = *scan = this_th; + KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || + (this_th->th.th_info.ds.ds_gtid < + this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); + TCW_4(this_th->th.th_in_pool, TRUE); + __kmp_suspend_initialize_thread(this_th); + __kmp_lock_suspend_mx(this_th); + if (this_th->th.th_active == TRUE) { + KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); + this_th->th.th_active_in_pool = TRUE; + } +#if KMP_DEBUG + else { + KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); + } +#endif + __kmp_unlock_suspend_mx(this_th); + + TCW_4(__kmp_nth, __kmp_nth - 1); + +#ifdef KMP_ADJUST_BLOCKTIME + /* Adjust blocktime back to user setting or default if necessary */ + /* Middle initialization might never have occurred */ + if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { + KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); + if (__kmp_nth <= __kmp_avail_proc) { + __kmp_zero_bt = FALSE; + } + } +#endif /* KMP_ADJUST_BLOCKTIME */ + + KMP_MB(); +} + +/* ------------------------------------------------------------------------ */ + +void *__kmp_launch_thread(kmp_info_t *this_thr) { +#if OMP_PROFILING_SUPPORT + ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); + // TODO: add a configuration option for time granularity + if (ProfileTraceFile) + llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); +#endif + + int gtid = this_thr->th.th_info.ds.ds_gtid; + /* void *stack_data;*/ + kmp_team_t **volatile pteam; + + KMP_MB(); + KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); + + if (__kmp_env_consistency_check) { + this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? + } + +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_thread_begin(); +#endif + +#if OMPT_SUPPORT + ompt_data_t *thread_data = nullptr; + if (ompt_enabled.enabled) { + thread_data = &(this_thr->th.ompt_thread_info.thread_data); + *thread_data = ompt_data_none; + + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + this_thr->th.ompt_thread_info.wait_id = 0; + this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); + this_thr->th.ompt_thread_info.parallel_flags = 0; + if (ompt_enabled.ompt_callback_thread_begin) { + ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( + ompt_thread_worker, thread_data); + } + this_thr->th.ompt_thread_info.state = ompt_state_idle; + } +#endif + + /* This is the place where threads wait for work */ + while (!TCR_4(__kmp_global.g.g_done)) { + KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); + KMP_MB(); + + /* wait for work to do */ + KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); + + /* No tid yet since not part of a team */ + __kmp_fork_barrier(gtid, KMP_GTID_DNE); + +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif + + pteam = &this_thr->th.th_team; + + /* have we been allocated? 
*/
+    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
+      /* we were just woken up, so run our new task */
+      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
+        int rc;
+        KA_TRACE(20,
+                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
+                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
+                  (*pteam)->t.t_pkfn));
+
+        updateHWFPControl(*pteam);
+
+#if OMPT_SUPPORT
+        if (ompt_enabled.enabled) {
+          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+        }
+#endif
+
+        rc = (*pteam)->t.t_invoke(gtid);
+        KMP_ASSERT(rc);
+
+        KMP_MB();
+        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
+                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
+                      (*pteam)->t.t_pkfn));
+      }
+#if OMPT_SUPPORT
+      if (ompt_enabled.enabled) {
+        /* no frame set while outside task */
+        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
+
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+      }
+#endif
+      /* join barrier after parallel region */
+      __kmp_join_barrier(gtid);
+    }
+  }
+
+#if OMPD_SUPPORT
+  if (ompd_state & OMPD_ENABLE_BP)
+    ompd_bp_thread_end();
+#endif
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.ompt_callback_thread_end) {
+    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
+  }
+#endif
+
+  this_thr->th.th_task_team = NULL;
+  /* run the destructors for the threadprivate data for this thread */
+  __kmp_common_destroy_gtid(gtid);
+
+  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
+  KMP_MB();
+
+#if OMP_PROFILING_SUPPORT
+  llvm::timeTraceProfilerFinishThread();
+#endif
+  return this_thr;
+}
+
+/* ------------------------------------------------------------------------ */
+
+void __kmp_internal_end_dest(void *specific_gtid) {
+  // Make sure no significant bits are lost
+  int gtid;
+  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
+
+  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
+  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
+   * this is because 0 is reserved for the nothing-stored case */
+
+  __kmp_internal_end_thread(gtid);
+}
+
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
+
+__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
+  __kmp_internal_end_atexit();
+}
+
+#endif
+
+/* [Windows] josh: when the atexit handler is called, there may still be more
+   than one thread alive */
+void __kmp_internal_end_atexit(void) {
+  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
+  /* [Windows]
+     josh: ideally, we want to completely shutdown the library in this atexit
+     handler, but stat code that depends on thread specific data for gtid fails
+     because that data becomes unavailable at some point during the shutdown, so
+     we call __kmp_internal_end_thread instead. We should eventually remove the
+     dependency on __kmp_get_specific_gtid in the stat code and use
+     __kmp_internal_end_library to cleanly shutdown the library.
+
+     // TODO: Can some of this comment about GVS be removed?
+     I suspect that the offending stat code is executed when the calling thread
+     tries to clean up a dead root thread's data structures, resulting in GVS
+     code trying to close the GVS structures for that thread, but since the stat
+     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
+     the calling thread is cleaning up itself instead of another thread, it gets
+     confused. This happens because allowing a thread to unregister and cleanup
+     another thread is a recent modification for addressing an issue.
+     Based on the current design (20050722), a thread may end up
+     trying to unregister another thread only if thread death does not trigger
+     the calling of __kmp_internal_end_thread. For Linux* OS, there is the
+     thread specific data destructor function to detect thread death. For
+     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
+     is nothing. Thus, the workaround is applicable only for Windows static
+     stat library. */
+  __kmp_internal_end_library(-1);
+#if KMP_OS_WINDOWS
+  __kmp_close_console();
+#endif
+}
+
+static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
+  // It is assumed __kmp_forkjoin_lock is acquired.
+
+  int gtid;
+
+  KMP_DEBUG_ASSERT(thread != NULL);
+
+  gtid = thread->th.th_info.ds.ds_gtid;
+
+  if (!is_root) {
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+      /* Assume the threads are at the fork barrier here */
+      KA_TRACE(
+          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
+               gtid));
+      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
+        while (
+            !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
+          KMP_CPU_PAUSE();
+        __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
+      } else {
+        /* Need release fence here to prevent seg faults for tree forkjoin
+           barrier (GEH) */
+        kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
+                           thread);
+        __kmp_release_64(&flag);
+      }
+    }
+
+    // Terminate OS thread.
+    __kmp_reap_worker(thread);
+
+    // The thread was killed asynchronously. If it was actively
+    // spinning in the thread pool, decrement the global count.
+    //
+    // There is a small timing hole here - if the worker thread was just waking
+    // up after sleeping in the pool, had reset its th_active_in_pool flag but
+    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
+    // the global counter might not get updated.
+    //
+    // Currently, this can only happen as the library is unloaded,
+    // so there are no harmful side effects.
+    if (thread->th.th_active_in_pool) {
+      thread->th.th_active_in_pool = FALSE;
+      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
+      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
+    }
+  }
+
+  __kmp_free_implicit_task(thread);
+
+// Free the fast memory for tasking
+#if USE_FAST_MEMORY
+  __kmp_free_fast_memory(thread);
+#endif /* USE_FAST_MEMORY */
+
+  __kmp_suspend_uninitialize_thread(thread);
+
+  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
+  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
+
+  --__kmp_all_nth;
+  // __kmp_nth was decremented when thread is added to the pool.
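+  // e.g., with hypothetical counts: 4 threads ever created and 3 parked in
+  // the pool gives __kmp_all_nth == 4 and __kmp_nth == 1; reaping one pooled
+  // thread here drops __kmp_all_nth to 3 and leaves __kmp_nth alone, since
+  // __kmp_free_thread already lowered it.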
+ +#ifdef KMP_ADJUST_BLOCKTIME + /* Adjust blocktime back to user setting or default if necessary */ + /* Middle initialization might never have occurred */ + if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { + KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); + if (__kmp_nth <= __kmp_avail_proc) { + __kmp_zero_bt = FALSE; + } + } +#endif /* KMP_ADJUST_BLOCKTIME */ + + /* free the memory being used */ + if (__kmp_env_consistency_check) { + if (thread->th.th_cons) { + __kmp_free_cons_stack(thread->th.th_cons); + thread->th.th_cons = NULL; + } + } + + if (thread->th.th_pri_common != NULL) { + __kmp_free(thread->th.th_pri_common); + thread->th.th_pri_common = NULL; + } + + if (thread->th.th_task_state_memo_stack != NULL) { + __kmp_free(thread->th.th_task_state_memo_stack); + thread->th.th_task_state_memo_stack = NULL; + } + +#if KMP_USE_BGET + if (thread->th.th_local.bget_data != NULL) { + __kmp_finalize_bget(thread); + } +#endif + +#if KMP_AFFINITY_SUPPORTED + if (thread->th.th_affin_mask != NULL) { + KMP_CPU_FREE(thread->th.th_affin_mask); + thread->th.th_affin_mask = NULL; + } +#endif /* KMP_AFFINITY_SUPPORTED */ + +#if KMP_USE_HIER_SCHED + if (thread->th.th_hier_bar_data != NULL) { + __kmp_free(thread->th.th_hier_bar_data); + thread->th.th_hier_bar_data = NULL; + } +#endif + + __kmp_reap_team(thread->th.th_serial_team); + thread->th.th_serial_team = NULL; + __kmp_free(thread); + + KMP_MB(); + +} // __kmp_reap_thread + +static void __kmp_itthash_clean(kmp_info_t *th) { +#if USE_ITT_NOTIFY + if (__kmp_itt_region_domains.count > 0) { + for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { + kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; + while (bucket) { + kmp_itthash_entry_t *next = bucket->next_in_bucket; + __kmp_thread_free(th, bucket); + bucket = next; + } + } + } + if (__kmp_itt_barrier_domains.count > 0) { + for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { + kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; + while (bucket) { + kmp_itthash_entry_t *next = bucket->next_in_bucket; + __kmp_thread_free(th, bucket); + bucket = next; + } + } + } +#endif +} + +static void __kmp_internal_end(void) { + int i; + + /* First, unregister the library */ + __kmp_unregister_library(); + +#if KMP_OS_WINDOWS + /* In Win static library, we can't tell when a root actually dies, so we + reclaim the data structures for any root threads that have died but not + unregistered themselves, in order to shut down cleanly. + In Win dynamic library we also can't tell when a thread dies. */ + __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of +// dead roots +#endif + + for (i = 0; i < __kmp_threads_capacity; i++) + if (__kmp_root[i]) + if (__kmp_root[i]->r.r_active) + break; + KMP_MB(); /* Flush all pending memory write invalidates. */ + TCW_SYNC_4(__kmp_global.g.g_done, TRUE); + + if (i < __kmp_threads_capacity) { +#if KMP_USE_MONITOR + // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? + KMP_MB(); /* Flush all pending memory write invalidates. */ + + // Need to check that monitor was initialized before reaping it. If we are + // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then + // __kmp_monitor will appear to contain valid data, but it is only valid in + // the parent process, not the child. + // New behavior (201008): instead of keying off of the flag + // __kmp_init_parallel, the monitor thread creation is keyed off + // of the new flag __kmp_init_monitor. 
+ __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); + if (TCR_4(__kmp_init_monitor)) { + __kmp_reap_monitor(&__kmp_monitor); + TCW_4(__kmp_init_monitor, 0); + } + __kmp_release_bootstrap_lock(&__kmp_monitor_lock); + KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); +#endif // KMP_USE_MONITOR + } else { +/* TODO move this to cleanup code */ +#ifdef KMP_DEBUG + /* make sure that everything has properly ended */ + for (i = 0; i < __kmp_threads_capacity; i++) { + if (__kmp_root[i]) { + // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: + // there can be uber threads alive here + KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? + } + } +#endif + + KMP_MB(); + + // Reap the worker threads. + // This is valid for now, but be careful if threads are reaped sooner. + while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. + // Get the next thread from the pool. + kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); + __kmp_thread_pool = thread->th.th_next_pool; + // Reap it. + KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); + thread->th.th_next_pool = NULL; + thread->th.th_in_pool = FALSE; + __kmp_reap_thread(thread, 0); + } + __kmp_thread_pool_insert_pt = NULL; + + // Reap teams. + while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. + // Get the next team from the pool. + kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); + __kmp_team_pool = team->t.t_next_pool; + // Reap it. + team->t.t_next_pool = NULL; + __kmp_reap_team(team); + } + + __kmp_reap_task_teams(); + +#if KMP_OS_UNIX + // Threads that are not reaped should not access any resources since they + // are going to be deallocated soon, so the shutdown sequence should wait + // until all threads either exit the final spin-waiting loop or begin + // sleeping after the given blocktime. + for (i = 0; i < __kmp_threads_capacity; i++) { + kmp_info_t *thr = __kmp_threads[i]; + while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) + KMP_CPU_PAUSE(); + } +#endif + + for (i = 0; i < __kmp_threads_capacity; ++i) { + // TBD: Add some checking... + // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); + } + + /* Make sure all threadprivate destructors get run by joining with all + worker threads before resetting this flag */ + TCW_SYNC_4(__kmp_init_common, FALSE); + + KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); + KMP_MB(); + +#if KMP_USE_MONITOR + // See note above: One of the possible fixes for CQ138434 / CQ140126 + // + // FIXME: push both code fragments down and CSE them? + // push them into __kmp_cleanup() ? + __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); + if (TCR_4(__kmp_init_monitor)) { + __kmp_reap_monitor(&__kmp_monitor); + TCW_4(__kmp_init_monitor, 0); + } + __kmp_release_bootstrap_lock(&__kmp_monitor_lock); + KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); +#endif + } /* else !__kmp_global.t_active */ + TCW_4(__kmp_init_gtid, FALSE); + KMP_MB(); /* Flush all pending memory write invalidates. */ + + __kmp_cleanup(); +#if OMPT_SUPPORT + ompt_fini(); +#endif +} + +void __kmp_internal_end_library(int gtid_req) { + /* if we have already cleaned up, don't try again, it wouldn't be pretty */ + /* this shouldn't be a race condition because __kmp_internal_end() is the + only place to clear __kmp_serial_init */ + /* we'll check this later too, after we get the lock */ + // 2009-09-06: We do not set g_abort without setting g_done. This check looks + // redundant, because the next check will work in any case. 
+ if (__kmp_global.g.g_abort) { + KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); + /* TODO abort? */ + return; + } + if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { + KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); + return; + } + + // If hidden helper team has been initialized, we need to deinit it + if (TCR_4(__kmp_init_hidden_helper) && + !TCR_4(__kmp_hidden_helper_team_done)) { + TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); + // First release the main thread to let it continue its work + __kmp_hidden_helper_main_thread_release(); + // Wait until the hidden helper team has been destroyed + __kmp_hidden_helper_threads_deinitz_wait(); + } + + KMP_MB(); /* Flush all pending memory write invalidates. */ + /* find out who we are and what we should do */ + { + int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); + KA_TRACE( + 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); + if (gtid == KMP_GTID_SHUTDOWN) { + KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " + "already shutdown\n")); + return; + } else if (gtid == KMP_GTID_MONITOR) { + KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " + "registered, or system shutdown\n")); + return; + } else if (gtid == KMP_GTID_DNE) { + KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " + "shutdown\n")); + /* we don't know who we are, but we may still shutdown the library */ + } else if (KMP_UBER_GTID(gtid)) { + /* unregister ourselves as an uber thread. gtid is no longer valid */ + if (__kmp_root[gtid]->r.r_active) { + __kmp_global.g.g_abort = -1; + TCW_SYNC_4(__kmp_global.g.g_done, TRUE); + __kmp_unregister_library(); + KA_TRACE(10, + ("__kmp_internal_end_library: root still active, abort T#%d\n", + gtid)); + return; + } else { + __kmp_itthash_clean(__kmp_threads[gtid]); + KA_TRACE( + 10, + ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); + __kmp_unregister_root_current_thread(gtid); + } + } else { +/* worker threads may call this function through the atexit handler, if they + * call exit() */ +/* For now, skip the usual subsequent processing and just dump the debug buffer. + TODO: do a thorough shutdown instead */ +#ifdef DUMP_DEBUG_ON_EXIT + if (__kmp_debug_buf) + __kmp_dump_debug_buffer(); +#endif + // added unregister library call here when we switch to shm linux + // if we don't, it will leave lots of files in /dev/shm + // cleanup shared memory file before exiting. + __kmp_unregister_library(); + return; + } + } + /* synchronize the termination process */ + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + + /* have we already finished */ + if (__kmp_global.g.g_abort) { + KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); + /* TODO abort? */ + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + + /* We need this lock to enforce mutex between this reading of + __kmp_threads_capacity and the writing by __kmp_register_root. + Alternatively, we can use a counter of roots that is atomically updated by + __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and + __kmp_internal_end_*. 
*/ + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + + /* now we can safely conduct the actual termination */ + __kmp_internal_end(); + + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + + KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); + +#ifdef DUMP_DEBUG_ON_EXIT + if (__kmp_debug_buf) + __kmp_dump_debug_buffer(); +#endif + +#if KMP_OS_WINDOWS + __kmp_close_console(); +#endif + + __kmp_fini_allocator(); + +} // __kmp_internal_end_library + +void __kmp_internal_end_thread(int gtid_req) { + int i; + + /* if we have already cleaned up, don't try again, it wouldn't be pretty */ + /* this shouldn't be a race condition because __kmp_internal_end() is the + * only place to clear __kmp_serial_init */ + /* we'll check this later too, after we get the lock */ + // 2009-09-06: We do not set g_abort without setting g_done. This check looks + // redundant, because the next check will work in any case. + if (__kmp_global.g.g_abort) { + KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); + /* TODO abort? */ + return; + } + if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { + KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); + return; + } + + // If hidden helper team has been initialized, we need to deinit it + if (TCR_4(__kmp_init_hidden_helper) && + !TCR_4(__kmp_hidden_helper_team_done)) { + TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); + // First release the main thread to let it continue its work + __kmp_hidden_helper_main_thread_release(); + // Wait until the hidden helper team has been destroyed + __kmp_hidden_helper_threads_deinitz_wait(); + } + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + /* find out who we are and what we should do */ + { + int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); + KA_TRACE(10, + ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); + if (gtid == KMP_GTID_SHUTDOWN) { + KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " + "already shutdown\n")); + return; + } else if (gtid == KMP_GTID_MONITOR) { + KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " + "registered, or system shutdown\n")); + return; + } else if (gtid == KMP_GTID_DNE) { + KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " + "shutdown\n")); + return; + /* we don't know who we are */ + } else if (KMP_UBER_GTID(gtid)) { + /* unregister ourselves as an uber thread. gtid is no longer valid */ + if (__kmp_root[gtid]->r.r_active) { + __kmp_global.g.g_abort = -1; + TCW_SYNC_4(__kmp_global.g.g_done, TRUE); + KA_TRACE(10, + ("__kmp_internal_end_thread: root still active, abort T#%d\n", + gtid)); + return; + } else { + KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", + gtid)); + __kmp_unregister_root_current_thread(gtid); + } + } else { + /* just a worker thread, let's leave */ + KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); + + if (gtid >= 0) { + __kmp_threads[gtid]->th.th_task_team = NULL; + } + + KA_TRACE(10, + ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", + gtid)); + return; + } + } +#if KMP_DYNAMIC_LIB + if (__kmp_pause_status != kmp_hard_paused) + // AC: lets not shutdown the dynamic library at the exit of uber thread, + // because we will better shutdown later in the library destructor. 
+ { + KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); + return; + } +#endif + /* synchronize the termination process */ + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + + /* have we already finished */ + if (__kmp_global.g.g_abort) { + KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); + /* TODO abort? */ + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + + /* We need this lock to enforce mutex between this reading of + __kmp_threads_capacity and the writing by __kmp_register_root. + Alternatively, we can use a counter of roots that is atomically updated by + __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and + __kmp_internal_end_*. */ + + /* should we finish the run-time? are all siblings done? */ + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); + + for (i = 0; i < __kmp_threads_capacity; ++i) { + if (KMP_UBER_GTID(i)) { + KA_TRACE( + 10, + ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + } + + /* now we can safely conduct the actual termination */ + + __kmp_internal_end(); + + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + + KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); + +#ifdef DUMP_DEBUG_ON_EXIT + if (__kmp_debug_buf) + __kmp_dump_debug_buffer(); +#endif +} // __kmp_internal_end_thread + +// ----------------------------------------------------------------------------- +// Library registration stuff. + +static long __kmp_registration_flag = 0; +// Random value used to indicate library initialization. +static char *__kmp_registration_str = NULL; +// Value to be saved in env var __KMP_REGISTERED_LIB_. + +static inline char *__kmp_reg_status_name() { +/* On RHEL 3u5 if linked statically, getpid() returns different values in + each thread. If registration and unregistration go in different threads + (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env + env var can not be found, because the name will contain different pid. */ +// macOS* complains about name being too long with additional getuid() +#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB + return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), + (int)getuid()); +#else + return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); +#endif +} // __kmp_reg_status_get + +#if defined(KMP_USE_SHM) +bool __kmp_shm_available = false; +bool __kmp_tmp_available = false; +// If /dev/shm is not accessible, we will create a temporary file under /tmp. +char *temp_reg_status_file_name = nullptr; +#endif + +void __kmp_register_library_startup(void) { + + char *name = __kmp_reg_status_name(); // Name of the environment variable. + int done = 0; + union { + double dtime; + long ltime; + } time; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + __kmp_initialize_system_tick(); +#endif + __kmp_read_system_time(&time.dtime); + __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); + __kmp_registration_str = + __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, + __kmp_registration_flag, KMP_LIBRARY_FILE); + + KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, + __kmp_registration_str)); + + while (!done) { + + char *value = NULL; // Actual value of the environment variable. 
+ +#if defined(KMP_USE_SHM) + char *shm_name = nullptr; + char *data1 = nullptr; + __kmp_shm_available = __kmp_detect_shm(); + if (__kmp_shm_available) { + int fd1 = -1; + shm_name = __kmp_str_format("/%s", name); + int shm_preexist = 0; + fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); + if ((fd1 == -1) && (errno == EEXIST)) { + // file didn't open because it already exists. + // try opening existing file + fd1 = shm_open(shm_name, O_RDWR, 0666); + if (fd1 == -1) { // file didn't open + KMP_WARNING(FunctionError, "Can't open SHM"); + __kmp_shm_available = false; + } else { // able to open existing file + shm_preexist = 1; + } + } + if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size + if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size; + KMP_WARNING(FunctionError, "Can't set size of SHM"); + __kmp_shm_available = false; + } + } + if (__kmp_shm_available) { // SHM exists, now map it + data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + fd1, 0); + if (data1 == MAP_FAILED) { // failed to map shared memory + KMP_WARNING(FunctionError, "Can't map SHM"); + __kmp_shm_available = false; + } + } + if (__kmp_shm_available) { // SHM mapped + if (shm_preexist == 0) { // set data to SHM, set value + KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); + } + // Read value from either what we just wrote or existing file. + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); + } + if (fd1 != -1) + close(fd1); + } + if (!__kmp_shm_available) + __kmp_tmp_available = __kmp_detect_tmp(); + if (!__kmp_shm_available && __kmp_tmp_available) { + // SHM failed to work due to an error other than that the file already + // exists. Try to create a temp file under /tmp. + // If /tmp isn't accessible, fall back to using environment variable. + // TODO: /tmp might not always be the temporary directory. For now we will + // not consider TMPDIR. + int fd1 = -1; + temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name); + int tmp_preexist = 0; + fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666); + if ((fd1 == -1) && (errno == EEXIST)) { + // file didn't open because it already exists. + // try opening existing file + fd1 = open(temp_reg_status_file_name, O_RDWR, 0666); + if (fd1 == -1) { // file didn't open if (fd1 == -1) { + KMP_WARNING(FunctionError, "Can't open TEMP"); + __kmp_tmp_available = false; + } else { + tmp_preexist = 1; + } + } + if (__kmp_tmp_available && tmp_preexist == 0) { + // we created /tmp file now set size + if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size; + KMP_WARNING(FunctionError, "Can't set size of /tmp file"); + __kmp_tmp_available = false; + } + } + if (__kmp_tmp_available) { + data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + fd1, 0); + if (data1 == MAP_FAILED) { // failed to map /tmp + KMP_WARNING(FunctionError, "Can't map /tmp"); + __kmp_tmp_available = false; + } + } + if (__kmp_tmp_available) { + if (tmp_preexist == 0) { // set data to TMP, set value + KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); + } + // Read value from either what we just wrote or existing file. + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); + } + if (fd1 != -1) + close(fd1); + } + if (!__kmp_shm_available && !__kmp_tmp_available) { + // no /dev/shm and no /tmp -- fall back to environment variable + // Set environment variable, but do not overwrite if it exists. 
+ __kmp_env_set(name, __kmp_registration_str, 0); + // read value to see if it got set + value = __kmp_env_get(name); + } +#else // Windows and unix with static library + // Set environment variable, but do not overwrite if it exists. + __kmp_env_set(name, __kmp_registration_str, 0); + // read value to see if it got set + value = __kmp_env_get(name); +#endif + + if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { + done = 1; // Ok, environment variable set successfully, exit the loop. + } else { + // Oops. Write failed. Another copy of OpenMP RTL is in memory. + // Check whether it alive or dead. + int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. + char *tail = value; + char *flag_addr_str = NULL; + char *flag_val_str = NULL; + char const *file_name = NULL; + __kmp_str_split(tail, '-', &flag_addr_str, &tail); + __kmp_str_split(tail, '-', &flag_val_str, &tail); + file_name = tail; + if (tail != NULL) { + unsigned long *flag_addr = 0; + unsigned long flag_val = 0; + KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); + KMP_SSCANF(flag_val_str, "%lx", &flag_val); + if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { + // First, check whether environment-encoded address is mapped into + // addr space. + // If so, dereference it to see if it still has the right value. + if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { + neighbor = 1; + } else { + // If not, then we know the other copy of the library is no longer + // running. + neighbor = 2; + } + } + } + switch (neighbor) { + case 0: // Cannot parse environment variable -- neighbor status unknown. + // Assume it is the incompatible format of future version of the + // library. Assume the other library is alive. + // WARN( ... ); // TODO: Issue a warning. + file_name = "unknown library"; + KMP_FALLTHROUGH(); + // Attention! Falling to the next case. That's intentional. + case 1: { // Neighbor is alive. + // Check it is allowed. + char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); + if (!__kmp_str_match_true(duplicate_ok)) { + // That's not allowed. Issue fatal error. + __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), + KMP_HNT(DuplicateLibrary), __kmp_msg_null); + } + KMP_INTERNAL_FREE(duplicate_ok); + __kmp_duplicate_library_ok = 1; + done = 1; // Exit the loop. + } break; + case 2: { // Neighbor is dead. + +#if defined(KMP_USE_SHM) + if (__kmp_shm_available) { // close shared memory. + shm_unlink(shm_name); // this removes file in /dev/shm + } else if (__kmp_tmp_available) { + unlink(temp_reg_status_file_name); // this removes the temp file + } else { + // Clear the variable and try to register library again. + __kmp_env_unset(name); + } +#else + // Clear the variable and try to register library again. 
+ __kmp_env_unset(name); +#endif + } break; + default: { + KMP_DEBUG_ASSERT(0); + } break; + } + } + KMP_INTERNAL_FREE((void *)value); +#if defined(KMP_USE_SHM) + if (shm_name) + KMP_INTERNAL_FREE((void *)shm_name); +#endif + } // while + KMP_INTERNAL_FREE((void *)name); + +} // func __kmp_register_library_startup + +void __kmp_unregister_library(void) { + + char *name = __kmp_reg_status_name(); + char *value = NULL; + +#if defined(KMP_USE_SHM) + char *shm_name = nullptr; + int fd1; + if (__kmp_shm_available) { + shm_name = __kmp_str_format("/%s", name); + fd1 = shm_open(shm_name, O_RDONLY, 0666); + if (fd1 != -1) { // File opened successfully + char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); + if (data1 != MAP_FAILED) { + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); + } + close(fd1); + } + } else if (__kmp_tmp_available) { // try /tmp + fd1 = open(temp_reg_status_file_name, O_RDONLY); + if (fd1 != -1) { // File opened successfully + char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); + if (data1 != MAP_FAILED) { + value = __kmp_str_format("%s", data1); // read value from /tmp + munmap(data1, SHM_SIZE); + } + close(fd1); + } + } else { // fall back to envirable + value = __kmp_env_get(name); + } +#else + value = __kmp_env_get(name); +#endif + + KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); + KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); + if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { +// Ok, this is our variable. Delete it. +#if defined(KMP_USE_SHM) + if (__kmp_shm_available) { + shm_unlink(shm_name); // this removes file in /dev/shm + } else if (__kmp_tmp_available) { + unlink(temp_reg_status_file_name); // this removes the temp file + } else { + __kmp_env_unset(name); + } +#else + __kmp_env_unset(name); +#endif + } + +#if defined(KMP_USE_SHM) + if (shm_name) + KMP_INTERNAL_FREE(shm_name); + if (temp_reg_status_file_name) + KMP_INTERNAL_FREE(temp_reg_status_file_name); +#endif + + KMP_INTERNAL_FREE(__kmp_registration_str); + KMP_INTERNAL_FREE(value); + KMP_INTERNAL_FREE(name); + + __kmp_registration_flag = 0; + __kmp_registration_str = NULL; + +} // __kmp_unregister_library + +// End of Library registration stuff. +// ----------------------------------------------------------------------------- + +#if KMP_MIC_SUPPORTED + +static void __kmp_check_mic_type() { + kmp_cpuid_t cpuid_state = {0}; + kmp_cpuid_t *cs_p = &cpuid_state; + __kmp_x86_cpuid(1, 0, cs_p); + // We don't support mic1 at the moment + if ((cs_p->eax & 0xff0) == 0xB10) { + __kmp_mic_type = mic2; + } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { + __kmp_mic_type = mic3; + } else { + __kmp_mic_type = non_mic; + } +} + +#endif /* KMP_MIC_SUPPORTED */ + +#if KMP_HAVE_UMWAIT +static void __kmp_user_level_mwait_init() { + struct kmp_cpuid buf; + __kmp_x86_cpuid(7, 0, &buf); + __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); + __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; + __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", + __kmp_umwait_enabled)); +} +#elif KMP_HAVE_MWAIT +#ifndef AT_INTELPHIUSERMWAIT +// Spurious, non-existent value that should always fail to return anything. +// Will be replaced with the correct value when we know that. +#define AT_INTELPHIUSERMWAIT 10000 +#endif +// getauxval() function is available in RHEL7 and SLES12. 
If a system with an +// earlier OS is used to build the RTL, we'll use the following internal +// function when the entry is not found. +unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; +unsigned long getauxval(unsigned long) { return 0; } + +static void __kmp_user_level_mwait_init() { + // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available + // use them to find if the user-level mwait is enabled. Otherwise, forcibly + // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable + // KMP_USER_LEVEL_MWAIT was set to TRUE. + if (__kmp_mic_type == mic3) { + unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); + if ((res & 0x1) || __kmp_user_level_mwait) { + __kmp_mwait_enabled = TRUE; + if (__kmp_user_level_mwait) { + KMP_INFORM(EnvMwaitWarn); + } + } else { + __kmp_mwait_enabled = FALSE; + } + } + KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " + "__kmp_mwait_enabled = %d\n", + __kmp_mic_type, __kmp_mwait_enabled)); +} +#endif /* KMP_HAVE_UMWAIT */ + +static void __kmp_do_serial_initialize(void) { + int i, gtid; + size_t size; + + KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); + + KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); + KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); + KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); + KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); + KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); + +#if OMPT_SUPPORT + ompt_pre_init(); +#endif +#if OMPD_SUPPORT + __kmp_env_dump(); + ompd_init(); +#endif + + __kmp_validate_locks(); + +#if ENABLE_LIBOMPTARGET + /* Initialize functions from libomptarget */ + __kmp_init_omptarget(); +#endif + + /* Initialize internal memory allocator */ + __kmp_init_allocator(); + + /* Register the library startup via an environment variable or via mapped + shared memory file and check to see whether another copy of the library is + already registered. 
Since forked child process is often terminated, we + postpone the registration till middle initialization in the child */ + if (__kmp_need_register_serial) + __kmp_register_library_startup(); + + /* TODO reinitialization of library */ + if (TCR_4(__kmp_global.g.g_done)) { + KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); + } + + __kmp_global.g.g_abort = 0; + TCW_SYNC_4(__kmp_global.g.g_done, FALSE); + +/* initialize the locks */ +#if KMP_USE_ADAPTIVE_LOCKS +#if KMP_DEBUG_ADAPTIVE_LOCKS + __kmp_init_speculative_stats(); +#endif +#endif +#if KMP_STATS_ENABLED + __kmp_stats_init(); +#endif + __kmp_init_lock(&__kmp_global_lock); + __kmp_init_queuing_lock(&__kmp_dispatch_lock); + __kmp_init_lock(&__kmp_debug_lock); + __kmp_init_atomic_lock(&__kmp_atomic_lock); + __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); + __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); + __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); + __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); + __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); + __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); + __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); + __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); + __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); + __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); + __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); + __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); + __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_init_bootstrap_lock(&__kmp_exit_lock); +#if KMP_USE_MONITOR + __kmp_init_bootstrap_lock(&__kmp_monitor_lock); +#endif + __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); + + /* conduct initialization and initial setup of configuration */ + + __kmp_runtime_initialize(); + +#if KMP_MIC_SUPPORTED + __kmp_check_mic_type(); +#endif + +// Some global variable initialization moved here from kmp_env_initialize() +#ifdef KMP_DEBUG + kmp_diag = 0; +#endif + __kmp_abort_delay = 0; + + // From __kmp_init_dflt_team_nth() + /* assume the entire machine will be used */ + __kmp_dflt_team_nth_ub = __kmp_xproc; + if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { + __kmp_dflt_team_nth_ub = KMP_MIN_NTH; + } + if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { + __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; + } + __kmp_max_nth = __kmp_sys_max_nth; + __kmp_cg_max_nth = __kmp_sys_max_nth; + __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default + if (__kmp_teams_max_nth > __kmp_sys_max_nth) { + __kmp_teams_max_nth = __kmp_sys_max_nth; + } + + // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" + // part + __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; +#if KMP_USE_MONITOR + __kmp_monitor_wakeups = + KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); + __kmp_bt_intervals = + KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); +#endif + // From "KMP_LIBRARY" part of __kmp_env_initialize() + __kmp_library = library_throughput; + // From KMP_SCHEDULE initialization + __kmp_static = kmp_sch_static_balanced; +// AC: do not use analytical here, because it is non-monotonous +//__kmp_guided = kmp_sch_guided_iterative_chunked; +//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no +// need to repeat assignment +// Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch +// bit control and barrier method control parts +#if KMP_FAST_REDUCTION_BARRIER +#define kmp_reduction_barrier_gather_bb ((int)1) +#define kmp_reduction_barrier_release_bb ((int)1) +#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt +#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt +#endif // KMP_FAST_REDUCTION_BARRIER + for (i = bs_plain_barrier; i < bs_last_barrier; i++) { + __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; + __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; + __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; + __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; +#if KMP_FAST_REDUCTION_BARRIER + if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( + // lin_64 ): hyper,1 + __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; + __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; + __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; + __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; + } +#endif // KMP_FAST_REDUCTION_BARRIER + } +#if KMP_FAST_REDUCTION_BARRIER +#undef kmp_reduction_barrier_release_pat +#undef kmp_reduction_barrier_gather_pat +#undef kmp_reduction_barrier_release_bb +#undef kmp_reduction_barrier_gather_bb +#endif // KMP_FAST_REDUCTION_BARRIER +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type == mic2) { // KNC + // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC + __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather + __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = + 1; // forkjoin release + __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; + __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; + } +#if KMP_FAST_REDUCTION_BARRIER + if (__kmp_mic_type == mic2) { // KNC + __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; + __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; + } +#endif // KMP_FAST_REDUCTION_BARRIER +#endif // KMP_MIC_SUPPORTED + +// From KMP_CHECKS initialization +#ifdef KMP_DEBUG + __kmp_env_checks = TRUE; /* development versions have the extra checks */ +#else + __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ +#endif + + // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization + __kmp_foreign_tp = TRUE; + + __kmp_global.g.g_dynamic = FALSE; + __kmp_global.g.g_dynamic_mode = dynamic_default; + + __kmp_init_nesting_mode(); + + __kmp_env_initialize(NULL); + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + __kmp_user_level_mwait_init(); +#endif +// Print all messages in message catalog for testing purposes. +#ifdef KMP_DEBUG + char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); + if (__kmp_str_match_true(val)) { + kmp_str_buf_t buffer; + __kmp_str_buf_init(&buffer); + __kmp_i18n_dump_catalog(&buffer); + __kmp_printf("%s", buffer.str); + __kmp_str_buf_free(&buffer); + } + __kmp_env_free(&val); +#endif + + __kmp_threads_capacity = + __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); + // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part + __kmp_tp_capacity = __kmp_default_tp_capacity( + __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); + + // If the library is shut down properly, both pools must be NULL. 
Just in + // case, set them to NULL -- some memory may leak, but subsequent code will + // work even if pools are not freed. + KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); + KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); + KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); + __kmp_thread_pool = NULL; + __kmp_thread_pool_insert_pt = NULL; + __kmp_team_pool = NULL; + + /* Allocate all of the variable sized records */ + /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are + * expandable */ + /* Since allocation is cache-aligned, just add extra padding at the end */ + size = + (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + + CACHE_LINE; + __kmp_threads = (kmp_info_t **)__kmp_allocate(size); + __kmp_root = (kmp_root_t **)((char *)__kmp_threads + + sizeof(kmp_info_t *) * __kmp_threads_capacity); + + /* init thread counts */ + KMP_DEBUG_ASSERT(__kmp_all_nth == + 0); // Asserts fail if the library is reinitializing and + KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. + __kmp_all_nth = 0; + __kmp_nth = 0; + + /* setup the uber master thread and hierarchy */ + gtid = __kmp_register_root(TRUE); + KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); + KMP_ASSERT(KMP_UBER_GTID(gtid)); + KMP_ASSERT(KMP_INITIAL_GTID(gtid)); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + __kmp_common_initialize(); + +#if KMP_OS_UNIX + /* invoke the child fork handler */ + __kmp_register_atfork(); +#endif + +#if !KMP_DYNAMIC_LIB || \ + ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN) + { + /* Invoke the exit handler when the program finishes, only for static + library and macOS* dynamic. For other dynamic libraries, we already + have _fini and DllMain. */ + int rc = atexit(__kmp_internal_end_atexit); + if (rc != 0) { + __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), + __kmp_msg_null); + } + } +#endif + +#if KMP_HANDLE_SIGNALS +#if KMP_OS_UNIX + /* NOTE: make sure that this is called before the user installs their own + signal handlers so that the user handlers are called first. this way they + can return false, not call our handler, avoid terminating the library, and + continue execution where they left off. */ + __kmp_install_signals(FALSE); +#endif /* KMP_OS_UNIX */ +#if KMP_OS_WINDOWS + __kmp_install_signals(TRUE); +#endif /* KMP_OS_WINDOWS */ +#endif + + /* we have finished the serial initialization */ + __kmp_init_counter++; + + __kmp_init_serial = TRUE; + + if (__kmp_version) { + __kmp_print_version_1(); + } + + if (__kmp_settings) { + __kmp_env_print(); + } + + if (__kmp_display_env || __kmp_display_env_verbose) { + __kmp_env_print_2(); + } + +#if OMPT_SUPPORT + ompt_post_init(); +#endif + + KMP_MB(); + + KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); +} + +void __kmp_serial_initialize(void) { + if (__kmp_init_serial) { + return; + } + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (__kmp_init_serial) { + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + __kmp_do_serial_initialize(); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} + +static void __kmp_do_middle_initialize(void) { + int i, j; + int prev_dflt_team_nth; + + if (!__kmp_init_serial) { + __kmp_do_serial_initialize(); + } + + KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); + + if (UNLIKELY(!__kmp_need_register_serial)) { + // We are in a forked child process. The registration was skipped during + // serial initialization in __kmp_atfork_child handler. Do it here. 
+ __kmp_register_library_startup(); + } + + // Save the previous value for the __kmp_dflt_team_nth so that + // we can avoid some reinitialization if it hasn't changed. + prev_dflt_team_nth = __kmp_dflt_team_nth; + +#if KMP_AFFINITY_SUPPORTED + // __kmp_affinity_initialize() will try to set __kmp_ncores to the + // number of cores on the machine. + __kmp_affinity_initialize(__kmp_affinity); + +#endif /* KMP_AFFINITY_SUPPORTED */ + + KMP_ASSERT(__kmp_xproc > 0); + if (__kmp_avail_proc == 0) { + __kmp_avail_proc = __kmp_xproc; + } + + // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), + // correct them now + j = 0; + while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { + __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = + __kmp_avail_proc; + j++; + } + + if (__kmp_dflt_team_nth == 0) { +#ifdef KMP_DFLT_NTH_CORES + // Default #threads = #cores + __kmp_dflt_team_nth = __kmp_ncores; + KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " + "__kmp_ncores (%d)\n", + __kmp_dflt_team_nth)); +#else + // Default #threads = #available OS procs + __kmp_dflt_team_nth = __kmp_avail_proc; + KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " + "__kmp_avail_proc(%d)\n", + __kmp_dflt_team_nth)); +#endif /* KMP_DFLT_NTH_CORES */ + } + + if (__kmp_dflt_team_nth < KMP_MIN_NTH) { + __kmp_dflt_team_nth = KMP_MIN_NTH; + } + if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { + __kmp_dflt_team_nth = __kmp_sys_max_nth; + } + + if (__kmp_nesting_mode > 0) + __kmp_set_nesting_mode_threads(); + + // There's no harm in continuing if the following check fails, + // but it indicates an error in the previous logic. + KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); + + if (__kmp_dflt_team_nth != prev_dflt_team_nth) { + // Run through the __kmp_threads array and set the num threads icv for each + // root thread that is currently registered with the RTL (which has not + // already explicitly set its nthreads-var with a call to + // omp_set_num_threads()). 
+ for (i = 0; i < __kmp_threads_capacity; i++) { + kmp_info_t *thread = __kmp_threads[i]; + if (thread == NULL) + continue; + if (thread->th.th_current_task->td_icvs.nproc != 0) + continue; + + set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); + } + } + KA_TRACE( + 20, + ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", + __kmp_dflt_team_nth)); + +#ifdef KMP_ADJUST_BLOCKTIME + /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ + if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { + KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); + if (__kmp_nth > __kmp_avail_proc) { + __kmp_zero_bt = TRUE; + } + } +#endif /* KMP_ADJUST_BLOCKTIME */ + + /* we have finished middle initialization */ + TCW_SYNC_4(__kmp_init_middle, TRUE); + + KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); +} + +void __kmp_middle_initialize(void) { + if (__kmp_init_middle) { + return; + } + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (__kmp_init_middle) { + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + __kmp_do_middle_initialize(); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} + +void __kmp_parallel_initialize(void) { + int gtid = __kmp_entry_gtid(); // this might be a new root + + /* synchronize parallel initialization (for sibling) */ + if (TCR_4(__kmp_init_parallel)) + return; + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (TCR_4(__kmp_init_parallel)) { + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + + /* TODO reinitialization after we have already shut down */ + if (TCR_4(__kmp_global.g.g_done)) { + KA_TRACE( + 10, + ("__kmp_parallel_initialize: attempt to init while shutting down\n")); + __kmp_infinite_loop(); + } + + /* jc: The lock __kmp_initz_lock is already held, so calling + __kmp_serial_initialize would cause a deadlock. So we call + __kmp_do_serial_initialize directly. */ + if (!__kmp_init_middle) { + __kmp_do_middle_initialize(); + } + __kmp_assign_root_init_mask(); + __kmp_resume_if_hard_paused(); + + /* begin initialization */ + KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); + KMP_ASSERT(KMP_UBER_GTID(gtid)); + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + // Save the FP control regs. + // Worker threads will set theirs to these values at thread startup. + __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); + __kmp_store_mxcsr(&__kmp_init_mxcsr); + __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#if KMP_OS_UNIX +#if KMP_HANDLE_SIGNALS + /* must be after __kmp_serial_initialize */ + __kmp_install_signals(TRUE); +#endif +#endif + + __kmp_suspend_initialize(); + +#if defined(USE_LOAD_BALANCE) + if (__kmp_global.g.g_dynamic_mode == dynamic_default) { + __kmp_global.g.g_dynamic_mode = dynamic_load_balance; + } +#else + if (__kmp_global.g.g_dynamic_mode == dynamic_default) { + __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; + } +#endif + + if (__kmp_version) { + __kmp_print_version_2(); + } + + /* we have finished parallel initialization */ + TCW_SYNC_4(__kmp_init_parallel, TRUE); + + KMP_MB(); + KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); + + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} + +void __kmp_hidden_helper_initialize() { + if (TCR_4(__kmp_init_hidden_helper)) + return; + + // __kmp_parallel_initialize is required before we initialize hidden helper + if (!TCR_4(__kmp_init_parallel)) + __kmp_parallel_initialize(); + + // Double check. 
Note that this double check should not be placed before + // __kmp_parallel_initialize as it will cause dead lock. + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (TCR_4(__kmp_init_hidden_helper)) { + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + return; + } + +#if KMP_AFFINITY_SUPPORTED + // Initialize hidden helper affinity settings. + // The above __kmp_parallel_initialize() will initialize + // regular affinity (and topology) if not already done. + if (!__kmp_hh_affinity.flags.initialized) + __kmp_affinity_initialize(__kmp_hh_affinity); +#endif + + // Set the count of hidden helper tasks to be executed to zero + KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); + + // Set the global variable indicating that we're initializing hidden helper + // team/threads + TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); + + // Platform independent initialization + __kmp_do_initialize_hidden_helper_threads(); + + // Wait here for the finish of initialization of hidden helper teams + __kmp_hidden_helper_threads_initz_wait(); + + // We have finished hidden helper initialization + TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); + + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} + +/* ------------------------------------------------------------------------ */ + +void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, + kmp_team_t *team) { + kmp_disp_t *dispatch; + + KMP_MB(); + + /* none of the threads have encountered any constructs, yet. */ + this_thr->th.th_local.this_construct = 0; +#if KMP_CACHE_MANAGE + KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); + KMP_DEBUG_ASSERT(dispatch); + KMP_DEBUG_ASSERT(team->t.t_dispatch); + // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ + // this_thr->th.th_info.ds.ds_tid ] ); + + dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ + dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter + if (__kmp_env_consistency_check) + __kmp_push_parallel(gtid, team->t.t_ident); + + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ +} + +void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, + kmp_team_t *team) { + if (__kmp_env_consistency_check) + __kmp_pop_parallel(gtid, team->t.t_ident); + + __kmp_finish_implicit_task(this_thr); +} + +int __kmp_invoke_task_func(int gtid) { + int rc; + int tid = __kmp_tid_from_gtid(gtid); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + + __kmp_run_before_invoked_task(gtid, tid, this_thr, team); +#if USE_ITT_BUILD + if (__itt_stack_caller_create_ptr) { + // inform ittnotify about entering user's code + if (team->t.t_stack_id != NULL) { + __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); + } else { + KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); + __kmp_itt_stack_callee_enter( + (__itt_caller)team->t.t_parent->t.t_stack_id); + } + } +#endif /* USE_ITT_BUILD */ +#if INCLUDE_SSC_MARKS + SSC_MARK_INVOKING(); +#endif + +#if OMPT_SUPPORT + void *dummy; + void **exit_frame_p; + ompt_data_t *my_task_data; + ompt_data_t *my_parallel_data; + int ompt_team_size; + + if (ompt_enabled.enabled) { + exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] + .ompt_task_info.frame.exit_frame.ptr); + } else { + exit_frame_p = &dummy; + } + + my_task_data = + &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); + my_parallel_data = &(team->t.ompt_team_info.parallel_data); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_team_size = team->t.t_nproc; + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, + __kmp_tid_from_gtid(gtid), ompt_task_implicit); + OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); + } +#endif + +#if KMP_STATS_ENABLED + stats_state_e previous_state = KMP_GET_THREAD_STATE(); + if (previous_state == stats_state_e::TEAMS_REGION) { + KMP_PUSH_PARTITIONED_TIMER(OMP_teams); + } else { + KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); + } + KMP_SET_THREAD_STATE(IMPLICIT_TASK); +#endif + + rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, + tid, (int)team->t.t_argc, (void **)team->t.t_argv +#if OMPT_SUPPORT + , + exit_frame_p +#endif + ); +#if OMPT_SUPPORT + *exit_frame_p = NULL; + this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; +#endif + +#if KMP_STATS_ENABLED + if (previous_state == stats_state_e::TEAMS_REGION) { + KMP_SET_THREAD_STATE(previous_state); + } + KMP_POP_PARTITIONED_TIMER(); +#endif + +#if USE_ITT_BUILD + if (__itt_stack_caller_create_ptr) { + // inform ittnotify about leaving user's code + if (team->t.t_stack_id != NULL) { + __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); + } else { + KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); + __kmp_itt_stack_callee_leave( + (__itt_caller)team->t.t_parent->t.t_stack_id); + } + } +#endif /* USE_ITT_BUILD */ + __kmp_run_after_invoked_task(gtid, tid, this_thr, team); + + return rc; +} + +void __kmp_teams_master(int gtid) { + // This routine is called by all primary threads in teams construct + kmp_info_t *thr = __kmp_threads[gtid]; + kmp_team_t *team = thr->th.th_team; + ident_t *loc = team->t.t_ident; + thr->th.th_set_nproc = thr->th.th_teams_size.nth; + KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); + KMP_DEBUG_ASSERT(thr->th.th_set_nproc); + KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, + __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); + + // This thread is a new CG root. Set up the proper variables. 
+ kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); + tmp->cg_root = thr; // Make thr the CG root + // Init to thread limit stored when league primary threads were forked + tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; + tmp->cg_nthreads = 1; // Init counter to one active thread, this one + KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" + " cg_nthreads to 1\n", + thr, tmp)); + tmp->up = thr->th.th_cg_roots; + thr->th.th_cg_roots = tmp; + +// Launch league of teams now, but not let workers execute +// (they hang on fork barrier until next parallel) +#if INCLUDE_SSC_MARKS + SSC_MARK_FORKING(); +#endif + __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, + (microtask_t)thr->th.th_teams_microtask, // "wrapped" task + VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); +#if INCLUDE_SSC_MARKS + SSC_MARK_JOINING(); +#endif + // If the team size was reduced from the limit, set it to the new size + if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) + thr->th.th_teams_size.nth = thr->th.th_team_nproc; + // AC: last parameter "1" eliminates join barrier which won't work because + // worker threads are in a fork barrier waiting for more parallel regions + __kmp_join_call(loc, gtid +#if OMPT_SUPPORT + , + fork_context_intel +#endif + , + 1); +} + +int __kmp_invoke_teams_master(int gtid) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; +#if KMP_DEBUG + if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) + KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == + (void *)__kmp_teams_master); +#endif + __kmp_run_before_invoked_task(gtid, 0, this_thr, team); +#if OMPT_SUPPORT + int tid = __kmp_tid_from_gtid(gtid); + ompt_data_t *task_data = + &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; + ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, + ompt_task_initial); + OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; + } +#endif + __kmp_teams_master(gtid); +#if OMPT_SUPPORT + this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; +#endif + __kmp_run_after_invoked_task(gtid, 0, this_thr, team); + return 1; +} + +/* this sets the requested number of threads for the next parallel region + encountered by this team. 
since this should be enclosed in the forkjoin + critical section it should avoid race conditions with asymmetrical nested + parallelism */ + +void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { + kmp_info_t *thr = __kmp_threads[gtid]; + + if (num_threads > 0) + thr->th.th_set_nproc = num_threads; +} + +static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, + int num_threads) { + KMP_DEBUG_ASSERT(thr); + // Remember the number of threads for inner parallel regions + if (!TCR_4(__kmp_init_middle)) + __kmp_middle_initialize(); // get internal globals calculated + __kmp_assign_root_init_mask(); + KMP_DEBUG_ASSERT(__kmp_avail_proc); + KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); + + if (num_threads == 0) { + if (__kmp_teams_thread_limit > 0) { + num_threads = __kmp_teams_thread_limit; + } else { + num_threads = __kmp_avail_proc / num_teams; + } + // adjust num_threads w/o warning as it is not user setting + // num_threads = min(num_threads, nthreads-var, thread-limit-var) + // no thread_limit clause specified - do not change thread-limit-var ICV + if (num_threads > __kmp_dflt_team_nth) { + num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV + } + if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { + num_threads = thr->th.th_current_task->td_icvs.thread_limit; + } // prevent team size to exceed thread-limit-var + if (num_teams * num_threads > __kmp_teams_max_nth) { + num_threads = __kmp_teams_max_nth / num_teams; + } + if (num_threads == 0) { + num_threads = 1; + } + } else { + if (num_threads < 0) { + __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), + __kmp_msg_null); + num_threads = 1; + } + // This thread will be the primary thread of the league primary threads + // Store new thread limit; old limit is saved in th_cg_roots list + thr->th.th_current_task->td_icvs.thread_limit = num_threads; + // num_threads = min(num_threads, nthreads-var) + if (num_threads > __kmp_dflt_team_nth) { + num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV + } + if (num_teams * num_threads > __kmp_teams_max_nth) { + int new_threads = __kmp_teams_max_nth / num_teams; + if (new_threads == 0) { + new_threads = 1; + } + if (new_threads != num_threads) { + if (!__kmp_reserve_warn) { // user asked for too many threads + __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT + __kmp_msg(kmp_ms_warning, + KMP_MSG(CantFormThrTeam, num_threads, new_threads), + KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); + } + } + num_threads = new_threads; + } + } + thr->th.th_teams_size.nth = num_threads; +} + +/* this sets the requested number of teams for the teams region and/or + the number of threads for the next parallel region encountered */ +void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, + int num_threads) { + kmp_info_t *thr = __kmp_threads[gtid]; + if (num_teams < 0) { + // OpenMP specification requires requested values to be positive, + // but people can send us any value, so we'd better check + __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), + __kmp_msg_null); + num_teams = 1; + } + if (num_teams == 0) { + if (__kmp_nteams > 0) { + num_teams = __kmp_nteams; + } else { + num_teams = 1; // default number of teams is 1. + } + } + if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
+ if (!__kmp_reserve_warn) { + __kmp_reserve_warn = 1; + __kmp_msg(kmp_ms_warning, + KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), + KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); + } + num_teams = __kmp_teams_max_nth; + } + // Set number of teams (number of threads in the outer "parallel" of the + // teams) + thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; + + __kmp_push_thread_limit(thr, num_teams, num_threads); +} + +/* This sets the requested number of teams for the teams region and/or + the number of threads for the next parallel region encountered */ +void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, + int num_teams_ub, int num_threads) { + kmp_info_t *thr = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); + KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); + KMP_DEBUG_ASSERT(num_threads >= 0); + + if (num_teams_lb > num_teams_ub) { + __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), + KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); + } + + int num_teams = 1; // defalt number of teams is 1. + + if (num_teams_lb == 0 && num_teams_ub > 0) + num_teams_lb = num_teams_ub; + + if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause + num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; + if (num_teams > __kmp_teams_max_nth) { + if (!__kmp_reserve_warn) { + __kmp_reserve_warn = 1; + __kmp_msg(kmp_ms_warning, + KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), + KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); + } + num_teams = __kmp_teams_max_nth; + } + } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams + num_teams = num_teams_ub; + } else { // num_teams_lb <= num_teams <= num_teams_ub + if (num_threads <= 0) { + if (num_teams_ub > __kmp_teams_max_nth) { + num_teams = num_teams_lb; + } else { + num_teams = num_teams_ub; + } + } else { + num_teams = (num_threads > __kmp_teams_max_nth) + ? num_teams + : __kmp_teams_max_nth / num_threads; + if (num_teams < num_teams_lb) { + num_teams = num_teams_lb; + } else if (num_teams > num_teams_ub) { + num_teams = num_teams_ub; + } + } + } + // Set number of teams (number of threads in the outer "parallel" of the + // teams) + thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; + + __kmp_push_thread_limit(thr, num_teams, num_threads); +} + +// Set the proc_bind var to use in the following parallel region. +void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { + kmp_info_t *thr = __kmp_threads[gtid]; + thr->th.th_set_proc_bind = proc_bind; +} + +/* Launch the worker threads into the microtask. */ + +void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + +#ifdef KMP_DEBUG + int f; +#endif /* KMP_DEBUG */ + + KMP_DEBUG_ASSERT(team); + KMP_DEBUG_ASSERT(this_thr->th.th_team == team); + KMP_ASSERT(KMP_MASTER_GTID(gtid)); + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + + team->t.t_construct = 0; /* no single directives seen yet */ + team->t.t_ordered.dt.t_value = + 0; /* thread 0 enters the ordered section first */ + + /* Reset the identifiers on the dispatch buffer */ + KMP_DEBUG_ASSERT(team->t.t_disp_buffer); + if (team->t.t_max_nproc > 1) { + int i; + for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { + team->t.t_disp_buffer[i].buffer_index = i; + team->t.t_disp_buffer[i].doacross_buf_idx = i; + } + } else { + team->t.t_disp_buffer[0].buffer_index = 0; + team->t.t_disp_buffer[0].doacross_buf_idx = 0; + } + + KMP_MB(); /* Flush all pending memory write invalidates. */ + KMP_ASSERT(this_thr->th.th_team == team); + +#ifdef KMP_DEBUG + for (f = 0; f < team->t.t_nproc; f++) { + KMP_DEBUG_ASSERT(team->t.t_threads[f] && + team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); + } +#endif /* KMP_DEBUG */ + + /* release the worker threads so they may begin working */ + __kmp_fork_barrier(gtid, 0); +} + +void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + + KMP_DEBUG_ASSERT(team); + KMP_DEBUG_ASSERT(this_thr->th.th_team == team); + KMP_ASSERT(KMP_MASTER_GTID(gtid)); + KMP_MB(); /* Flush all pending memory write invalidates. */ + + /* Join barrier after fork */ + +#ifdef KMP_DEBUG + if (__kmp_threads[gtid] && + __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { + __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, + __kmp_threads[gtid]); + __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " + "team->t.t_nproc=%d\n", + gtid, __kmp_threads[gtid]->th.th_team_nproc, team, + team->t.t_nproc); + __kmp_print_structure(); + } + KMP_DEBUG_ASSERT(__kmp_threads[gtid] && + __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); +#endif /* KMP_DEBUG */ + + __kmp_join_barrier(gtid); /* wait for everyone */ +#if OMPT_SUPPORT + if (ompt_enabled.enabled && + this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { + int ds_tid = this_thr->th.th_info.ds.ds_tid; + ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); + this_thr->th.ompt_thread_info.state = ompt_state_overhead; +#if OMPT_OPTIONAL + void *codeptr = NULL; + if (KMP_MASTER_TID(ds_tid) && + (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || + ompt_callbacks.ompt_callback(ompt_callback_sync_region))) + codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; + + if (ompt_enabled.ompt_callback_sync_region_wait) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, + codeptr); + } + if (ompt_enabled.ompt_callback_sync_region) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, + codeptr); + } +#endif + if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, task_data, 0, ds_tid, + ompt_task_implicit); // TODO: Can this be ompt_task_initial? + } + } +#endif + + KMP_MB(); /* Flush all pending memory write invalidates. */ + KMP_ASSERT(this_thr->th.th_team == team); +} + +/* ------------------------------------------------------------------------ */ + +#ifdef USE_LOAD_BALANCE + +// Return the worker threads actively spinning in the hot team, if we +// are at the outermost level of parallelism. Otherwise, return 0. 
+static int __kmp_active_hot_team_nproc(kmp_root_t *root) { + int i; + int retval; + kmp_team_t *hot_team; + + if (root->r.r_active) { + return 0; + } + hot_team = root->r.r_hot_team; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { + return hot_team->t.t_nproc - 1; // Don't count primary thread + } + + // Skip the primary thread - it is accounted for elsewhere. + retval = 0; + for (i = 1; i < hot_team->t.t_nproc; i++) { + if (hot_team->t.t_threads[i]->th.th_active) { + retval++; + } + } + return retval; +} + +// Perform an automatic adjustment to the number of +// threads used by the next parallel region. +static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { + int retval; + int pool_active; + int hot_team_active; + int team_curr_active; + int system_active; + + KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, + set_nproc)); + KMP_DEBUG_ASSERT(root); + KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] + ->th.th_current_task->td_icvs.dynamic == TRUE); + KMP_DEBUG_ASSERT(set_nproc > 1); + + if (set_nproc == 1) { + KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); + return 1; + } + + // Threads that are active in the thread pool, active in the hot team for this + // particular root (if we are at the outer par level), and the currently + // executing thread (to become the primary thread) are available to add to the + // new team, but are currently contributing to the system load, and must be + // accounted for. + pool_active = __kmp_thread_pool_active_nth; + hot_team_active = __kmp_active_hot_team_nproc(root); + team_curr_active = pool_active + hot_team_active + 1; + + // Check the system load. + system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); + KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " + "hot team active = %d\n", + system_active, pool_active, hot_team_active)); + + if (system_active < 0) { + // There was an error reading the necessary info from /proc, so use the + // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode + // = dynamic_thread_limit, we shouldn't wind up getting back here. + __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; + KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); + + // Make this call behave like the thread limit algorithm. + retval = __kmp_avail_proc - __kmp_nth + + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); + if (retval > set_nproc) { + retval = set_nproc; + } + if (retval < KMP_MIN_NTH) { + retval = KMP_MIN_NTH; + } + + KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", + retval)); + return retval; + } + + // There is a slight delay in the load balance algorithm in detecting new + // running procs. The real system load at this instant should be at least as + // large as the #active omp thread that are available to add to the team. + if (system_active < team_curr_active) { + system_active = team_curr_active; + } + retval = __kmp_avail_proc - system_active + team_curr_active; + if (retval > set_nproc) { + retval = set_nproc; + } + if (retval < KMP_MIN_NTH) { + retval = KMP_MIN_NTH; + } + + KB_TRACE(20, ("__kmp_load_balance_nproc: exit. 
retval:%d\n", retval)); + return retval; +} // __kmp_load_balance_nproc() + +#endif /* USE_LOAD_BALANCE */ + +/* ------------------------------------------------------------------------ */ + +/* NOTE: this is called with the __kmp_init_lock held */ +void __kmp_cleanup(void) { + int f; + + KA_TRACE(10, ("__kmp_cleanup: enter\n")); + + if (TCR_4(__kmp_init_parallel)) { +#if KMP_HANDLE_SIGNALS + __kmp_remove_signals(); +#endif + TCW_4(__kmp_init_parallel, FALSE); + } + + if (TCR_4(__kmp_init_middle)) { +#if KMP_AFFINITY_SUPPORTED + __kmp_affinity_uninitialize(); +#endif /* KMP_AFFINITY_SUPPORTED */ + __kmp_cleanup_hierarchy(); + TCW_4(__kmp_init_middle, FALSE); + } + + KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); + + if (__kmp_init_serial) { + __kmp_runtime_destroy(); + __kmp_init_serial = FALSE; + } + + __kmp_cleanup_threadprivate_caches(); + + for (f = 0; f < __kmp_threads_capacity; f++) { + if (__kmp_root[f] != NULL) { + __kmp_free(__kmp_root[f]); + __kmp_root[f] = NULL; + } + } + __kmp_free(__kmp_threads); + // __kmp_threads and __kmp_root were allocated at once, as single block, so + // there is no need in freeing __kmp_root. + __kmp_threads = NULL; + __kmp_root = NULL; + __kmp_threads_capacity = 0; + + // Free old __kmp_threads arrays if they exist. + kmp_old_threads_list_t *ptr = __kmp_old_threads_list; + while (ptr) { + kmp_old_threads_list_t *next = ptr->next; + __kmp_free(ptr->threads); + __kmp_free(ptr); + ptr = next; + } + +#if KMP_USE_DYNAMIC_LOCK + __kmp_cleanup_indirect_user_locks(); +#else + __kmp_cleanup_user_locks(); +#endif +#if OMPD_SUPPORT + if (ompd_state) { + __kmp_free(ompd_env_block); + ompd_env_block = NULL; + ompd_env_block_size = 0; + } +#endif + +#if KMP_AFFINITY_SUPPORTED + KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); + __kmp_cpuinfo_file = NULL; +#endif /* KMP_AFFINITY_SUPPORTED */ + +#if KMP_USE_ADAPTIVE_LOCKS +#if KMP_DEBUG_ADAPTIVE_LOCKS + __kmp_print_speculative_stats(); +#endif +#endif + KMP_INTERNAL_FREE(__kmp_nested_nth.nth); + __kmp_nested_nth.nth = NULL; + __kmp_nested_nth.size = 0; + __kmp_nested_nth.used = 0; + KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); + __kmp_nested_proc_bind.bind_types = NULL; + __kmp_nested_proc_bind.size = 0; + __kmp_nested_proc_bind.used = 0; + if (__kmp_affinity_format) { + KMP_INTERNAL_FREE(__kmp_affinity_format); + __kmp_affinity_format = NULL; + } + + __kmp_i18n_catclose(); + +#if KMP_USE_HIER_SCHED + __kmp_hier_scheds.deallocate(); +#endif + +#if KMP_STATS_ENABLED + __kmp_stats_fini(); +#endif + + KA_TRACE(10, ("__kmp_cleanup: exit\n")); +} + +/* ------------------------------------------------------------------------ */ + +int __kmp_ignore_mppbeg(void) { + char *env; + + if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { + if (__kmp_str_match_false(env)) + return FALSE; + } + // By default __kmpc_begin() is no-op. + return TRUE; +} + +int __kmp_ignore_mppend(void) { + char *env; + + if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { + if (__kmp_str_match_false(env)) + return FALSE; + } + // By default __kmpc_end() is no-op. 
+ return TRUE; +} + +void __kmp_internal_begin(void) { + int gtid; + kmp_root_t *root; + + /* this is a very important step as it will register new sibling threads + and assign these new uber threads a new gtid */ + gtid = __kmp_entry_gtid(); + root = __kmp_threads[gtid]->th.th_root; + KMP_ASSERT(KMP_UBER_GTID(gtid)); + + if (root->r.r_begin) + return; + __kmp_acquire_lock(&root->r.r_begin_lock, gtid); + if (root->r.r_begin) { + __kmp_release_lock(&root->r.r_begin_lock, gtid); + return; + } + + root->r.r_begin = TRUE; + + __kmp_release_lock(&root->r.r_begin_lock, gtid); +} + +/* ------------------------------------------------------------------------ */ + +void __kmp_user_set_library(enum library_type arg) { + int gtid; + kmp_root_t *root; + kmp_info_t *thread; + + /* first, make sure we are initialized so we can get our gtid */ + + gtid = __kmp_entry_gtid(); + thread = __kmp_threads[gtid]; + + root = thread->th.th_root; + + KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, + library_serial)); + if (root->r.r_in_parallel) { /* Must be called in serial section of top-level + thread */ + KMP_WARNING(SetLibraryIncorrectCall); + return; + } + + switch (arg) { + case library_serial: + thread->th.th_set_nproc = 0; + set__nproc(thread, 1); + break; + case library_turnaround: + thread->th.th_set_nproc = 0; + set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth + : __kmp_dflt_team_nth_ub); + break; + case library_throughput: + thread->th.th_set_nproc = 0; + set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth + : __kmp_dflt_team_nth_ub); + break; + default: + KMP_FATAL(UnknownLibraryType, arg); + } + + __kmp_aux_set_library(arg); +} + +void __kmp_aux_set_stacksize(size_t arg) { + if (!__kmp_init_serial) + __kmp_serial_initialize(); + +#if KMP_OS_DARWIN + if (arg & (0x1000 - 1)) { + arg &= ~(0x1000 - 1); + if (arg + 0x1000) /* check for overflow if we round up */ + arg += 0x1000; + } +#endif + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + + /* only change the default stacksize before the first parallel region */ + if (!TCR_4(__kmp_init_parallel)) { + size_t value = arg; /* argument is in bytes */ + + if (value < __kmp_sys_min_stksize) + value = __kmp_sys_min_stksize; + else if (value > KMP_MAX_STKSIZE) + value = KMP_MAX_STKSIZE; + + __kmp_stksize = value; + + __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ + } + + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} + +/* set the behaviour of the runtime library */ +/* TODO this can cause some odd behaviour with sibling parallelism... 
*/ +void __kmp_aux_set_library(enum library_type arg) { + __kmp_library = arg; + + switch (__kmp_library) { + case library_serial: { + KMP_INFORM(LibraryIsSerial); + } break; + case library_turnaround: + if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) + __kmp_use_yield = 2; // only yield when oversubscribed + break; + case library_throughput: + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) + __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; + break; + default: + KMP_FATAL(UnknownLibraryType, arg); + } +} + +/* Getting team information common for all team API */ +// Returns NULL if not in teams construct +static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { + kmp_info_t *thr = __kmp_entry_thread(); + teams_serialized = 0; + if (thr->th.th_teams_microtask) { + kmp_team_t *team = thr->th.th_team; + int tlevel = thr->th.th_teams_level; // the level of the teams construct + int ii = team->t.t_level; + teams_serialized = team->t.t_serialized; + int level = tlevel + 1; + KMP_DEBUG_ASSERT(ii >= tlevel); + while (ii > level) { + for (teams_serialized = team->t.t_serialized; + (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { + } + if (team->t.t_serialized && (!teams_serialized)) { + team = team->t.t_parent; + continue; + } + if (ii > level) { + team = team->t.t_parent; + ii--; + } + } + return team; + } + return NULL; +} + +int __kmp_aux_get_team_num() { + int serialized; + kmp_team_t *team = __kmp_aux_get_team_info(serialized); + if (team) { + if (serialized > 1) { + return 0; // teams region is serialized ( 1 team of 1 thread ). + } else { + return team->t.t_master_tid; + } + } + return 0; +} + +int __kmp_aux_get_num_teams() { + int serialized; + kmp_team_t *team = __kmp_aux_get_team_info(serialized); + if (team) { + if (serialized > 1) { + return 1; + } else { + return team->t.t_parent->t.t_nproc; + } + } + return 1; +} + +/* ------------------------------------------------------------------------ */ + +/* + * Affinity Format Parser + * + * Field is in form of: %[[[0].]size]type + * % and type are required (%% means print a literal '%') + * type is either single char or long name surrounded by {}, + * e.g., N or {num_threads} + * 0 => leading zeros + * . => right justified when size is specified + * by default output is left justified + * size is the *minimum* field length + * All other characters are printed as is + * + * Available field types: + * L {thread_level} - omp_get_level() + * n {thread_num} - omp_get_thread_num() + * h {host} - name of host machine + * P {process_id} - process id (integer) + * T {thread_identifier} - native thread identifier (integer) + * N {num_threads} - omp_get_num_threads() + * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) + * a {thread_affinity} - comma separated list of integers or integer ranges + * (values of affinity mask) + * + * Implementation-specific field types can be added + * If a type is unknown, print "undefined" + */ + +// Structure holding the short name, long name, and corresponding data type +// for snprintf. A table of these will represent the entire valid keyword +// field types. 
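/* Example (illustrative, not part of the spec above): given the format
   "host=%H pid=%P tid=%n/%N aff=%{thread_affinity}", thread 2 of a team of 4
   bound to cores 2 and 3 on a host named "node7" with pid 1234 might print

       host=node7 pid=1234 tid=2/4 aff=2,3

   Width and flags behave like printf: "%0.4n" right-justifies the thread
   number in a 4-character zero-padded field ("0002"), while "%4n" is
   left-justified ("2   "). */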
+typedef struct kmp_affinity_format_field_t { + char short_name; // from spec e.g., L -> thread level + const char *long_name; // from spec thread_level -> thread level + char field_format; // data type for snprintf (typically 'd' or 's' + // for integer or string) +} kmp_affinity_format_field_t; + +static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { +#if KMP_AFFINITY_SUPPORTED + {'A', "thread_affinity", 's'}, +#endif + {'t', "team_num", 'd'}, + {'T', "num_teams", 'd'}, + {'L', "nesting_level", 'd'}, + {'n', "thread_num", 'd'}, + {'N', "num_threads", 'd'}, + {'a', "ancestor_tnum", 'd'}, + {'H', "host", 's'}, + {'P', "process_id", 'd'}, + {'i', "native_thread_id", 'd'}}; + +// Return the number of characters it takes to hold field +static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, + const char **ptr, + kmp_str_buf_t *field_buffer) { + int rc, format_index, field_value; + const char *width_left, *width_right; + bool pad_zeros, right_justify, parse_long_name, found_valid_name; + static const int FORMAT_SIZE = 20; + char format[FORMAT_SIZE] = {0}; + char absolute_short_name = 0; + + KMP_DEBUG_ASSERT(gtid >= 0); + KMP_DEBUG_ASSERT(th); + KMP_DEBUG_ASSERT(**ptr == '%'); + KMP_DEBUG_ASSERT(field_buffer); + + __kmp_str_buf_clear(field_buffer); + + // Skip the initial % + (*ptr)++; + + // Check for %% first + if (**ptr == '%') { + __kmp_str_buf_cat(field_buffer, "%", 1); + (*ptr)++; // skip over the second % + return 1; + } + + // Parse field modifiers if they are present + pad_zeros = false; + if (**ptr == '0') { + pad_zeros = true; + (*ptr)++; // skip over 0 + } + right_justify = false; + if (**ptr == '.') { + right_justify = true; + (*ptr)++; // skip over . + } + // Parse width of field: [width_left, width_right) + width_left = width_right = NULL; + if (**ptr >= '0' && **ptr <= '9') { + width_left = *ptr; + SKIP_DIGITS(*ptr); + width_right = *ptr; + } + + // Create the format for KMP_SNPRINTF based on flags parsed above + format_index = 0; + format[format_index++] = '%'; + if (!right_justify) + format[format_index++] = '-'; + if (pad_zeros) + format[format_index++] = '0'; + if (width_left && width_right) { + int i = 0; + // Only allow 8 digit number widths. 
+ // This also prevents overflowing format variable + while (i < 8 && width_left < width_right) { + format[format_index++] = *width_left; + width_left++; + i++; + } + } + + // Parse a name (long or short) + // Canonicalize the name into absolute_short_name + found_valid_name = false; + parse_long_name = (**ptr == '{'); + if (parse_long_name) + (*ptr)++; // skip initial left brace + for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / + sizeof(__kmp_affinity_format_table[0]); + ++i) { + char short_name = __kmp_affinity_format_table[i].short_name; + const char *long_name = __kmp_affinity_format_table[i].long_name; + char field_format = __kmp_affinity_format_table[i].field_format; + if (parse_long_name) { + size_t length = KMP_STRLEN(long_name); + if (strncmp(*ptr, long_name, length) == 0) { + found_valid_name = true; + (*ptr) += length; // skip the long name + } + } else if (**ptr == short_name) { + found_valid_name = true; + (*ptr)++; // skip the short name + } + if (found_valid_name) { + format[format_index++] = field_format; + format[format_index++] = '\0'; + absolute_short_name = short_name; + break; + } + } + if (parse_long_name) { + if (**ptr != '}') { + absolute_short_name = 0; + } else { + (*ptr)++; // skip over the right brace + } + } + + // Attempt to fill the buffer with the requested + // value using snprintf within __kmp_str_buf_print() + switch (absolute_short_name) { + case 't': + rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); + break; + case 'T': + rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); + break; + case 'L': + rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); + break; + case 'n': + rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); + break; + case 'H': { + static const int BUFFER_SIZE = 256; + char buf[BUFFER_SIZE]; + __kmp_expand_host_name(buf, BUFFER_SIZE); + rc = __kmp_str_buf_print(field_buffer, format, buf); + } break; + case 'P': + rc = __kmp_str_buf_print(field_buffer, format, getpid()); + break; + case 'i': + rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); + break; + case 'N': + rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); + break; + case 'a': + field_value = + __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); + rc = __kmp_str_buf_print(field_buffer, format, field_value); + break; +#if KMP_AFFINITY_SUPPORTED + case 'A': { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); + rc = __kmp_str_buf_print(field_buffer, format, buf.str); + __kmp_str_buf_free(&buf); + } break; +#endif + default: + // According to spec, If an implementation does not have info for field + // type, then "undefined" is printed + rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); + // Skip the field + if (parse_long_name) { + SKIP_TOKEN(*ptr); + if (**ptr == '}') + (*ptr)++; + } else { + (*ptr)++; + } + } + + KMP_ASSERT(format_index <= FORMAT_SIZE); + return rc; +} + +/* + * Return number of characters needed to hold the affinity string + * (not including null byte character) + * The resultant string is printed to buffer, which the caller can then + * handle afterwards + */ +size_t __kmp_aux_capture_affinity(int gtid, const char *format, + kmp_str_buf_t *buffer) { + const char *parse_ptr; + size_t retval; + const kmp_info_t *th; + kmp_str_buf_t field; + + KMP_DEBUG_ASSERT(buffer); + KMP_DEBUG_ASSERT(gtid >= 0); + + __kmp_str_buf_init(&field); + 
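  // Note (illustrative): each %-field is rewritten into an ordinary
  // printf-style specifier by __kmp_aux_capture_affinity_field before being
  // printed, e.g. "%0.5n" becomes "%05d" (zero-padded, right-justified thread
  // number) and "%8a" becomes "%-8d" (left-justified ancestor thread number).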
__kmp_str_buf_clear(buffer); + + th = __kmp_threads[gtid]; + retval = 0; + + // If format is NULL or zero-length string, then we use + // affinity-format-var ICV + parse_ptr = format; + if (parse_ptr == NULL || *parse_ptr == '\0') { + parse_ptr = __kmp_affinity_format; + } + KMP_DEBUG_ASSERT(parse_ptr); + + while (*parse_ptr != '\0') { + // Parse a field + if (*parse_ptr == '%') { + // Put field in the buffer + int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); + __kmp_str_buf_catbuf(buffer, &field); + retval += rc; + } else { + // Put literal character in buffer + __kmp_str_buf_cat(buffer, parse_ptr, 1); + retval++; + parse_ptr++; + } + } + __kmp_str_buf_free(&field); + return retval; +} + +// Displays the affinity string to stdout +void __kmp_aux_display_affinity(int gtid, const char *format) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_aux_capture_affinity(gtid, format, &buf); + __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); + __kmp_str_buf_free(&buf); +} + +/* ------------------------------------------------------------------------ */ +void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { + int blocktime = arg; /* argument is in microseconds */ +#if KMP_USE_MONITOR + int bt_intervals; +#endif + kmp_int8 bt_set; + + __kmp_save_internal_controls(thread); + + /* Normalize and set blocktime for the teams */ + if (blocktime < KMP_MIN_BLOCKTIME) + blocktime = KMP_MIN_BLOCKTIME; + else if (blocktime > KMP_MAX_BLOCKTIME) + blocktime = KMP_MAX_BLOCKTIME; + + set__blocktime_team(thread->th.th_team, tid, blocktime); + set__blocktime_team(thread->th.th_serial_team, 0, blocktime); + +#if KMP_USE_MONITOR + /* Calculate and set blocktime intervals for the teams */ + bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); + + set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); + set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); +#endif + + /* Set whether blocktime has been set to "TRUE" */ + bt_set = TRUE; + + set__bt_set_team(thread->th.th_team, tid, bt_set); + set__bt_set_team(thread->th.th_serial_team, 0, bt_set); +#if KMP_USE_MONITOR + KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " + "bt_intervals=%d, monitor_updates=%d\n", + __kmp_gtid_from_tid(tid, thread->th.th_team), + thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, + __kmp_monitor_wakeups)); +#else + KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", + __kmp_gtid_from_tid(tid, thread->th.th_team), + thread->th.th_team->t.t_id, tid, blocktime)); +#endif +} + +void __kmp_aux_set_defaults(char const *str, size_t len) { + if (!__kmp_init_serial) { + __kmp_serial_initialize(); + } + __kmp_env_initialize(str); + + if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { + __kmp_env_print(); + } +} // __kmp_aux_set_defaults + +/* ------------------------------------------------------------------------ */ +/* internal fast reduction routines */ + +PACKED_REDUCTION_METHOD_T +__kmp_determine_reduction_method( + ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, + void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck) { + + // Default reduction method: critical construct ( lck != NULL, like in current + // PAROPT ) + // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method + // can be selected by RTL + // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method + // can be selected by RTL + 
// Finally, it's up to OpenMP RTL to make a decision on which method to select + // among generated by PAROPT. + + PACKED_REDUCTION_METHOD_T retval; + + int team_size; + + KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) + +#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ + (loc && \ + ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))) +#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) + + retval = critical_reduce_block; + + // another choice of getting a team size (with 1 dynamic deference) is slower + team_size = __kmp_get_team_num_threads(global_tid); + if (team_size == 1) { + + retval = empty_reduce_block; + + } else { + + int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; + +#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ + KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM + +#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \ + KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX + + int teamsize_cutoff = 4; + +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type != non_mic) { + teamsize_cutoff = 8; + } +#endif + int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; + if (tree_available) { + if (team_size <= teamsize_cutoff) { + if (atomic_available) { + retval = atomic_reduce_block; + } + } else { + retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; + } + } else if (atomic_available) { + retval = atomic_reduce_block; + } +#else +#error "Unknown or unsupported OS" +#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || + // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || + // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX + +#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \ + KMP_ARCH_WASM || KMP_ARCH_PPC + +#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \ + KMP_OS_WASI || KMP_OS_AIX + + // basic tuning + + if (atomic_available) { + if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
+ retval = atomic_reduce_block; + } + } // otherwise: use critical section + +#elif KMP_OS_DARWIN + + int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; + if (atomic_available && (num_vars <= 3)) { + retval = atomic_reduce_block; + } else if (tree_available) { + if ((reduce_size > (9 * sizeof(kmp_real64))) && + (reduce_size < (2000 * sizeof(kmp_real64)))) { + retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; + } + } // otherwise: use critical section + +#else +#error "Unknown or unsupported OS" +#endif + +#else +#error "Unknown or unsupported architecture" +#endif + } + + // KMP_FORCE_REDUCTION + + // If the team is serialized (team_size == 1), ignore the forced reduction + // method and stay with the unsynchronized method (empty_reduce_block) + if (__kmp_force_reduction_method != reduction_method_not_defined && + team_size != 1) { + + PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; + + int atomic_available, tree_available; + + switch ((forced_retval = __kmp_force_reduction_method)) { + case critical_reduce_block: + KMP_ASSERT(lck); // lck should be != 0 + break; + + case atomic_reduce_block: + atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; + if (!atomic_available) { + KMP_WARNING(RedMethodNotSupported, "atomic"); + forced_retval = critical_reduce_block; + } + break; + + case tree_reduce_block: + tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; + if (!tree_available) { + KMP_WARNING(RedMethodNotSupported, "tree"); + forced_retval = critical_reduce_block; + } else { +#if KMP_FAST_REDUCTION_BARRIER + forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; +#endif + } + break; + + default: + KMP_ASSERT(0); // "unsupported method specified" + } + + retval = forced_retval; + } + + KA_TRACE(10, ("reduction method selected=%08x\n", retval)); + +#undef FAST_REDUCTION_TREE_METHOD_GENERATED +#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED + + return (retval); +} +// this function is for testing set/get/determine reduce method +kmp_int32 __kmp_get_reduce_method(void) { + return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); +} + +// Soft pause sets up threads to ignore blocktime and just go to sleep. +// Spin-wait code checks __kmp_pause_status and reacts accordingly. +void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } + +// Hard pause shuts down the runtime completely. Resume happens naturally when +// OpenMP is used subsequently. +void __kmp_hard_pause() { + __kmp_pause_status = kmp_hard_paused; + __kmp_internal_end_thread(-1); +} + +// Soft resume sets __kmp_pause_status, and wakes up all threads. +void __kmp_resume_if_soft_paused() { + if (__kmp_pause_status == kmp_soft_paused) { + __kmp_pause_status = kmp_not_paused; + + for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { + kmp_info_t *thread = __kmp_threads[gtid]; + if (thread) { // Wake it if sleeping + kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, + thread); + if (fl.is_sleeping()) + fl.resume(gtid); + else if (__kmp_try_suspend_mx(thread)) { // got suspend lock + __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep + } else { // thread holds the lock and may sleep soon + do { // until either the thread sleeps, or we can get the lock + if (fl.is_sleeping()) { + fl.resume(gtid); + break; + } else if (__kmp_try_suspend_mx(thread)) { + __kmp_unlock_suspend_mx(thread); + break; + } + } while (1); + } + } + } + } +} + +// This function is called via __kmpc_pause_resource. Returns 0 if successful. 
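// Illustrative user-level usage (sketch, not part of this file): the OpenMP
// 5.0 API routes through this function, e.g.
//
//   omp_pause_resource_all(omp_pause_soft);  // threads sleep, state is kept
//   ...                                      // serial phase of the program
//   #pragma omp parallel                     // soft-paused runtime resumes
//   { /* workers are woken and reused */ }
//
// A hard pause (omp_pause_hard) instead shuts the runtime down completely;
// it is re-initialized on the next OpenMP construct.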
+// TODO: add warning messages +int __kmp_pause_resource(kmp_pause_status_t level) { + if (level == kmp_not_paused) { // requesting resume + if (__kmp_pause_status == kmp_not_paused) { + // error message about runtime not being paused, so can't resume + return 1; + } else { + KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || + __kmp_pause_status == kmp_hard_paused); + __kmp_pause_status = kmp_not_paused; + return 0; + } + } else if (level == kmp_soft_paused) { // requesting soft pause + if (__kmp_pause_status != kmp_not_paused) { + // error message about already being paused + return 1; + } else { + __kmp_soft_pause(); + return 0; + } + } else if (level == kmp_hard_paused) { // requesting hard pause + if (__kmp_pause_status != kmp_not_paused) { + // error message about already being paused + return 1; + } else { + __kmp_hard_pause(); + return 0; + } + } else { + // error message about invalid level + return 1; + } +} + +void __kmp_omp_display_env(int verbose) { + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + if (__kmp_init_serial == 0) + __kmp_do_serial_initialize(); + __kmp_display_env_impl(!verbose, verbose); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} + +// The team size is changing, so distributed barrier must be modified +void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, + int new_nthreads) { + KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == + bp_dist_bar); + kmp_info_t **other_threads = team->t.t_threads; + + // We want all the workers to stop waiting on the barrier while we adjust the + // size of the team. + for (int f = 1; f < old_nthreads; ++f) { + KMP_DEBUG_ASSERT(other_threads[f] != NULL); + // Ignore threads that are already inactive or not present in the team + if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { + // teams construct causes thread_limit to get passed in, and some of + // those could be inactive; just ignore them + continue; + } + // If thread is transitioning still to in_use state, wait for it + if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { + while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) + KMP_CPU_PAUSE(); + } + // The thread should be in_use now + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); + // Transition to unused state + team->t.t_threads[f]->th.th_used_in_team.store(2); + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); + } + // Release all the workers + team->t.b->go_release(); + + KMP_MFENCE(); + + // Workers should see transition status 2 and move to 0; but may need to be + // woken up first + int count = old_nthreads - 1; + while (count > 0) { + count = old_nthreads - 1; + for (int f = 1; f < old_nthreads; ++f) { + if (other_threads[f]->th.th_used_in_team.load() != 0) { + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers + kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( + void *, other_threads[f]->th.th_sleep_loc); + __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); + } + } else { + KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); + count--; + } + } + } + // Now update the barrier size + team->t.b->update_num_threads(new_nthreads); + team->t.b->go_reset(); +} + +void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { + // Add the threads back to the team + KMP_DEBUG_ASSERT(team); + // Threads were paused and pointed at th_used_in_team temporarily during a + // resize of the team. 
We're going to set th_used_in_team to 3 to indicate to + // the thread that it should transition itself back into the team. Then, if + // blocktime isn't infinite, the thread could be sleeping, so we send a resume + // to wake it up. + for (int f = 1; f < new_nthreads; ++f) { + KMP_DEBUG_ASSERT(team->t.t_threads[f]); + KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, + 3); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads + __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, + (kmp_flag_32 *)NULL); + } + } + // The threads should be transitioning to the team; when they are done, they + // should have set th_used_in_team to 1. This loop forces master to wait until + // all threads have moved into the team and are waiting in the barrier. + int count = new_nthreads - 1; + while (count > 0) { + count = new_nthreads - 1; + for (int f = 1; f < new_nthreads; ++f) { + if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { + count--; + } + } + } +} + +// Globals and functions for hidden helper task +kmp_info_t **__kmp_hidden_helper_threads; +kmp_info_t *__kmp_hidden_helper_main_thread; +std::atomic __kmp_unexecuted_hidden_helper_tasks; +#if KMP_OS_LINUX +kmp_int32 __kmp_hidden_helper_threads_num = 8; +kmp_int32 __kmp_enable_hidden_helper = TRUE; +#else +kmp_int32 __kmp_hidden_helper_threads_num = 0; +kmp_int32 __kmp_enable_hidden_helper = FALSE; +#endif + +namespace { +std::atomic __kmp_hit_hidden_helper_threads_num; + +void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { + // This is an explicit synchronization on all hidden helper threads in case + // that when a regular thread pushes a hidden helper task to one hidden + // helper thread, the thread has not been awaken once since they're released + // by the main thread after creating the team. + KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); + while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != + __kmp_hidden_helper_threads_num) + ; + + // If main thread, then wait for signal + if (__kmpc_master(nullptr, *gtid)) { + // First, unset the initial state and release the initial thread + TCW_4(__kmp_init_hidden_helper_threads, FALSE); + __kmp_hidden_helper_initz_release(); + __kmp_hidden_helper_main_thread_wait(); + // Now wake up all worker threads + for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { + __kmp_hidden_helper_worker_thread_signal(); + } + } +} +} // namespace + +void __kmp_hidden_helper_threads_initz_routine() { + // Create a new root for hidden helper team/threads + const int gtid = __kmp_register_root(TRUE); + __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; + __kmp_hidden_helper_threads = &__kmp_threads[gtid]; + __kmp_hidden_helper_main_thread->th.th_set_nproc = + __kmp_hidden_helper_threads_num; + + KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); + + __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); + + // Set the initialization flag to FALSE + TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); + + __kmp_hidden_helper_threads_deinitz_release(); +} + +/* Nesting Mode: + Set via KMP_NESTING_MODE, which takes an integer. + Note: we skip duplicate topology levels, and skip levels with only + one entity. + KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 
+ KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels + in the topology, and initializes the number of threads at each of those + levels to the number of entities at each level, respectively, below the + entity at the parent level. + KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, + but starts with nesting OFF -- max-active-levels-var is 1 -- and requires + the user to turn nesting on explicitly. This is an even more experimental + option to this experimental feature, and may change or go away in the + future. +*/ + +// Allocate space to store nesting levels +void __kmp_init_nesting_mode() { + int levels = KMP_HW_LAST; + __kmp_nesting_mode_nlevels = levels; + __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); + for (int i = 0; i < levels; ++i) + __kmp_nesting_nth_level[i] = 0; + if (__kmp_nested_nth.size < levels) { + __kmp_nested_nth.nth = + (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); + __kmp_nested_nth.size = levels; + } +} + +// Set # threads for top levels of nesting; must be called after topology set +void __kmp_set_nesting_mode_threads() { + kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; + + if (__kmp_nesting_mode == 1) + __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; + else if (__kmp_nesting_mode > 1) + __kmp_nesting_mode_nlevels = __kmp_nesting_mode; + + if (__kmp_topology) { // use topology info + int loc, hw_level; + for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && + loc < __kmp_nesting_mode_nlevels; + loc++, hw_level++) { + __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); + if (__kmp_nesting_nth_level[loc] == 1) + loc--; + } + // Make sure all cores are used + if (__kmp_nesting_mode > 1 && loc > 1) { + int core_level = __kmp_topology->get_level(KMP_HW_CORE); + int num_cores = __kmp_topology->get_count(core_level); + int upper_levels = 1; + for (int level = 0; level < loc - 1; ++level) + upper_levels *= __kmp_nesting_nth_level[level]; + if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) + __kmp_nesting_nth_level[loc - 1] = + num_cores / __kmp_nesting_nth_level[loc - 2]; + } + __kmp_nesting_mode_nlevels = loc; + __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; + } else { // no topology info available; provide a reasonable guesstimation + if (__kmp_avail_proc >= 4) { + __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; + __kmp_nesting_nth_level[1] = 2; + __kmp_nesting_mode_nlevels = 2; + } else { + __kmp_nesting_nth_level[0] = __kmp_avail_proc; + __kmp_nesting_mode_nlevels = 1; + } + __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; + } + for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { + __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; + } + set__nproc(thread, __kmp_nesting_nth_level[0]); + if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) + __kmp_nesting_mode_nlevels = __kmp_nesting_mode; + if (get__max_active_levels(thread) > 1) { + // if max levels was set, set nesting mode levels to same + __kmp_nesting_mode_nlevels = get__max_active_levels(thread); + } + if (__kmp_nesting_mode == 1) // turn on nesting for this case only + set__max_active_levels(thread, __kmp_nesting_mode_nlevels); +} + +// Empty symbols to export (see exports_so.txt) when feature is disabled +extern "C" { +#if !KMP_STATS_ENABLED +void __kmp_reset_stats() {} +#endif +#if !USE_DEBUGGER +int __kmp_omp_debug_struct_info = FALSE; +int __kmp_debugging = FALSE; +#endif +#if !USE_ITT_BUILD || 
!USE_ITT_NOTIFY +void __kmp_itt_fini_ittlib() {} +void __kmp_itt_init_ittlib() {} +#endif +} + +// end of file diff --git a/third_party/openmp/kmp_safe_c_api.h b/third_party/openmp/kmp_safe_c_api.h new file mode 100644 index 000000000..79f4a7f57 --- /dev/null +++ b/third_party/openmp/kmp_safe_c_api.h @@ -0,0 +1,87 @@ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_SAFE_C_API_H +#define KMP_SAFE_C_API_H + +#include +#include "kmp_platform.h" +#include + +// Replacement for banned C API + +// Not every unsafe call listed here is handled now, but keeping everything +// in one place should be handy for future maintenance. +#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT + +#define RSIZE_MAX_STR (4UL << 10) // 4KB + +// _malloca was suggested, but it is not a drop-in replacement for _alloca +#define KMP_ALLOCA _alloca + +#define KMP_MEMCPY_S memcpy_s +#define KMP_SNPRINTF sprintf_s +#define KMP_SSCANF sscanf_s +#define KMP_STRCPY_S strcpy_s +#define KMP_STRNCPY_S strncpy_s +#define KMP_STRNCAT_S strncat_s + +// Use this only when buffer size is unknown +#define KMP_MEMCPY(dst, src, cnt) memcpy_s(dst, cnt, src, cnt) + +template ::value> +struct kmp_get_rmax_t {}; +template struct kmp_get_rmax_t { + static const size_t value = RSIZE_MAX_STR; +}; +template struct kmp_get_rmax_t { + static const size_t value = sizeof(T); +}; +#define KMP_STRLEN(str) strnlen_s(str, kmp_get_rmax_t::value) + +// Use this only when buffer size is unknown +#define KMP_STRNCPY(dst, src, cnt) strncpy_s(dst, cnt, src, cnt) + +// _TRUNCATE insures buffer size > max string to print. +#define KMP_VSNPRINTF(dst, cnt, fmt, arg) \ + vsnprintf_s(dst, cnt, _TRUNCATE, fmt, arg) + +#else // KMP_OS_WINDOWS + +// For now, these macros use the existing API. + +#if KMP_OS_NETBSD +#define KMP_ALLOCA __builtin_alloca +#else +#define KMP_ALLOCA alloca +#endif +#define KMP_MEMCPY_S(dst, bsz, src, cnt) memcpy(dst, src, cnt) +#define KMP_SNPRINTF snprintf +#define KMP_SSCANF sscanf +#define KMP_STRCPY_S(dst, bsz, src) strcpy(dst, src) +#define KMP_STRNCPY_S(dst, bsz, src, cnt) strncpy(dst, src, cnt) +#define KMP_STRNCAT_S(dst, bsz, src, cnt) strncat(dst, src, cnt) +#define KMP_VSNPRINTF vsnprintf +#define KMP_STRNCPY strncpy +#define KMP_STRLEN strlen +#define KMP_MEMCPY memcpy + +#endif // KMP_OS_WINDOWS + +// Offer truncated version of strncpy +static inline void __kmp_strncpy_truncate(char *buffer, size_t buf_size, + char const *src, size_t src_size) { + if (src_size >= buf_size) { + src_size = buf_size - 1; + } + KMP_STRNCPY_S(buffer, buf_size, src, src_size); + buffer[src_size] = '\0'; +} + +#endif // KMP_SAFE_C_API_H diff --git a/third_party/openmp/kmp_sched.cpp b/third_party/openmp/kmp_sched.cpp new file mode 100644 index 000000000..53182bef5 --- /dev/null +++ b/third_party/openmp/kmp_sched.cpp @@ -0,0 +1,1092 @@ +/* + * kmp_sched.cpp -- static scheduling -- iteration initialization + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
!USE_ITT_NOTIFY
+void __kmp_itt_fini_ittlib() {}
+void __kmp_itt_init_ittlib() {}
+#endif
+}
+
+// end of file
diff --git a/third_party/openmp/kmp_safe_c_api.h b/third_party/openmp/kmp_safe_c_api.h
new file mode 100644
index 000000000..79f4a7f57
--- /dev/null
+++ b/third_party/openmp/kmp_safe_c_api.h
@@ -0,0 +1,87 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_SAFE_C_API_H
+#define KMP_SAFE_C_API_H
+
+#include <type_traits>
+#include "kmp_platform.h"
+#include <string.h>
+
+// Replacement for banned C API
+
+// Not every unsafe call listed here is handled now, but keeping everything
+// in one place should be handy for future maintenance.
+#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT
+
+#define RSIZE_MAX_STR (4UL << 10) // 4KB
+
+// _malloca was suggested, but it is not a drop-in replacement for _alloca
+#define KMP_ALLOCA _alloca
+
+#define KMP_MEMCPY_S memcpy_s
+#define KMP_SNPRINTF sprintf_s
+#define KMP_SSCANF sscanf_s
+#define KMP_STRCPY_S strcpy_s
+#define KMP_STRNCPY_S strncpy_s
+#define KMP_STRNCAT_S strncat_s
+
+// Use this only when buffer size is unknown
+#define KMP_MEMCPY(dst, src, cnt) memcpy_s(dst, cnt, src, cnt)
+
+template <typename T, bool B = std::is_array<T>::value>
+struct kmp_get_rmax_t {};
+template <typename T> struct kmp_get_rmax_t<T, false> {
+  static const size_t value = RSIZE_MAX_STR;
+};
+template <typename T> struct kmp_get_rmax_t<T, true> {
+  static const size_t value = sizeof(T);
+};
+#define KMP_STRLEN(str) strnlen_s(str, kmp_get_rmax_t<decltype(str)>::value)
+
+// Use this only when buffer size is unknown
+#define KMP_STRNCPY(dst, src, cnt) strncpy_s(dst, cnt, src, cnt)
+
+// _TRUNCATE insures buffer size > max string to print.
+#define KMP_VSNPRINTF(dst, cnt, fmt, arg) \
+  vsnprintf_s(dst, cnt, _TRUNCATE, fmt, arg)
+
+#else // KMP_OS_WINDOWS
+
+// For now, these macros use the existing API.
+
+#if KMP_OS_NETBSD
+#define KMP_ALLOCA __builtin_alloca
+#else
+#define KMP_ALLOCA alloca
+#endif
+#define KMP_MEMCPY_S(dst, bsz, src, cnt) memcpy(dst, src, cnt)
+#define KMP_SNPRINTF snprintf
+#define KMP_SSCANF sscanf
+#define KMP_STRCPY_S(dst, bsz, src) strcpy(dst, src)
+#define KMP_STRNCPY_S(dst, bsz, src, cnt) strncpy(dst, src, cnt)
+#define KMP_STRNCAT_S(dst, bsz, src, cnt) strncat(dst, src, cnt)
+#define KMP_VSNPRINTF vsnprintf
+#define KMP_STRNCPY strncpy
+#define KMP_STRLEN strlen
+#define KMP_MEMCPY memcpy
+
+#endif // KMP_OS_WINDOWS
+
+// Offer truncated version of strncpy
+static inline void __kmp_strncpy_truncate(char *buffer, size_t buf_size,
+                                          char const *src, size_t src_size) {
+  if (src_size >= buf_size) {
+    src_size = buf_size - 1;
+  }
+  KMP_STRNCPY_S(buffer, buf_size, src, src_size);
+  buffer[src_size] = '\0';
+}
+
+#endif // KMP_SAFE_C_API_H
diff --git a/third_party/openmp/kmp_sched.cpp b/third_party/openmp/kmp_sched.cpp
new file mode 100644
index 000000000..53182bef5
--- /dev/null
+++ b/third_party/openmp/kmp_sched.cpp
@@ -0,0 +1,1092 @@
+/*
+ * kmp_sched.cpp -- static scheduling -- iteration initialization
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/* Static scheduling initialization. + + NOTE: team->t.t_nproc is a constant inside of any dispatch loop, however + it may change values between parallel regions. __kmp_max_nth + is the largest value __kmp_nth may take, 1 is the smallest. */ + +#include "kmp.h" +#include "kmp_error.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" +#include "kmp_stats.h" +#include "kmp_str.h" + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +#ifdef KMP_DEBUG +//------------------------------------------------------------------------- +// template for debug prints specification ( d, u, lld, llu ) +char const *traits_t::spec = "d"; +char const *traits_t::spec = "u"; +char const *traits_t::spec = "lld"; +char const *traits_t::spec = "llu"; +char const *traits_t::spec = "ld"; +//------------------------------------------------------------------------- +#endif + +#if KMP_STATS_ENABLED +#define KMP_STATS_LOOP_END(stat) \ + { \ + kmp_int64 t; \ + kmp_int64 u = (kmp_int64)(*pupper); \ + kmp_int64 l = (kmp_int64)(*plower); \ + kmp_int64 i = (kmp_int64)incr; \ + if (i == 1) { \ + t = u - l + 1; \ + } else if (i == -1) { \ + t = l - u + 1; \ + } else if (i > 0) { \ + t = (u - l) / i + 1; \ + } else { \ + t = (l - u) / (-i) + 1; \ + } \ + KMP_COUNT_VALUE(stat, t); \ + KMP_POP_PARTITIONED_TIMER(); \ + } +#else +#define KMP_STATS_LOOP_END(stat) /* Nothing */ +#endif + +#if USE_ITT_BUILD || defined KMP_DEBUG +static ident_t loc_stub = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"}; +static inline void check_loc(ident_t *&loc) { + if (loc == NULL) + loc = &loc_stub; // may need to report location info to ittnotify +} +#endif + +template +static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid, + kmp_int32 schedtype, kmp_int32 *plastiter, + T *plower, T *pupper, + typename traits_t::signed_t *pstride, + typename traits_t::signed_t incr, + typename traits_t::signed_t chunk +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + void *codeptr +#endif +) { + KMP_COUNT_BLOCK(OMP_LOOP_STATIC); + KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static); + KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling); + + // Clear monotonic/nonmonotonic bits (ignore it) + schedtype = SCHEDULE_WITHOUT_MODIFIERS(schedtype); + + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + /* this all has to be changed back to TID and such.. */ + kmp_int32 gtid = global_tid; + kmp_uint32 tid; + kmp_uint32 nth; + UT trip_count; + kmp_team_t *team; + __kmp_assert_valid_gtid(gtid); + kmp_info_t *th = __kmp_threads[gtid]; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_team_info_t *team_info = NULL; + ompt_task_info_t *task_info = NULL; + ompt_work_t ompt_work_type = ompt_work_loop; + + static kmp_int8 warn = 0; + + if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) { + // Only fully initialize variables needed by OMPT if OMPT is enabled. 
+ team_info = __ompt_get_teaminfo(0, NULL); + task_info = __ompt_get_task_info_object(0); + // Determine workshare type + if (loc != NULL) { + if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) { + ompt_work_type = ompt_work_loop; + } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) { + ompt_work_type = ompt_work_sections; + } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) { + ompt_work_type = ompt_work_distribute; + } else { + kmp_int8 bool_res = + KMP_COMPARE_AND_STORE_ACQ8(&warn, (kmp_int8)0, (kmp_int8)1); + if (bool_res) + KMP_WARNING(OmptOutdatedWorkshare); + } + KMP_DEBUG_ASSERT(ompt_work_type); + } + } +#endif + + KMP_DEBUG_ASSERT(plastiter && plower && pupper && pstride); + KE_TRACE(10, ("__kmpc_for_static_init called (%d)\n", global_tid)); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmpc_for_static_init: T#%%d sched=%%d liter=%%d iter=(%%%s," + " %%%s, %%%s) incr=%%%s chunk=%%%s signed?<%s>\n", + traits_t::spec, traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec, traits_t::spec); + KD_TRACE(100, (buff, global_tid, schedtype, *plastiter, *plower, *pupper, + *pstride, incr, chunk)); + __kmp_str_free(&buff); + } +#endif + + if (__kmp_env_consistency_check) { + __kmp_push_workshare(global_tid, ct_pdo, loc); + if (incr == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, + loc); + } + } + /* special handling for zero-trip loops */ + if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { + if (plastiter != NULL) + *plastiter = FALSE; + /* leave pupper and plower set to entire iteration space */ + *pstride = incr; /* value should never be used */ +// *plower = *pupper - incr; +// let compiler bypass the illegal loop (like for(i=1;i<10;i--)) +// THE LINE COMMENTED ABOVE CAUSED shape2F/h_tests_1.f TO HAVE A FAILURE +// ON A ZERO-TRIP LOOP (lower=1, upper=0,stride=1) - JPH June 23, 2009. +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmpc_for_static_init:(ZERO TRIP) liter=%%d " + "lower=%%%s upper=%%%s stride = %%%s " + "signed?<%s>, loc = %%s\n", + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + check_loc(loc); + KD_TRACE(100, + (buff, *plastiter, *plower, *pupper, *pstride, loc->psource)); + __kmp_str_free(&buff); + } +#endif + KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_type, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), 0, codeptr); + } +#endif + KMP_STATS_LOOP_END(OMP_loop_static_iterations); + return; + } + + // Although there are schedule enumerations above kmp_ord_upper which are not + // schedules for "distribute", the only ones which are useful are dynamic, so + // cannot be seen here, since this codepath is only executed for static + // schedules. 
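  // For example (illustrative): when the construct is "distribute", the
  // incoming value is one of the kmp_distribute_* enumerators, and the
  // subtraction below shifts it onto the corresponding worksharing value:
  //   kmp_distribute_static         -> kmp_sch_static
  //   kmp_distribute_static_chunked -> kmp_sch_static_chunked
  // so the rest of the routine can treat both constructs uniformly.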
+ if (schedtype > kmp_ord_upper) { + // we are in DISTRIBUTE construct + schedtype += kmp_sch_static - + kmp_distribute_static; // AC: convert to usual schedule type + if (th->th.th_team->t.t_serialized > 1) { + tid = 0; + team = th->th.th_team; + } else { + tid = th->th.th_team->t.t_master_tid; + team = th->th.th_team->t.t_parent; + } + } else { + tid = __kmp_tid_from_gtid(global_tid); + team = th->th.th_team; + } + + /* determine if "for" loop is an active worksharing construct */ + if (team->t.t_serialized) { + /* serialized parallel, each thread executes whole iteration space */ + if (plastiter != NULL) + *plastiter = TRUE; + /* leave pupper and plower set to entire iteration space */ + *pstride = + (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1)); + +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d " + "lower=%%%s upper=%%%s stride = %%%s\n", + traits_t::spec, traits_t::spec, + traits_t::spec); + KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride)); + __kmp_str_free(&buff); + } +#endif + KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_type, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), *pstride, codeptr); + } +#endif + KMP_STATS_LOOP_END(OMP_loop_static_iterations); + return; + } + nth = team->t.t_nproc; + if (nth == 1) { + if (plastiter != NULL) + *plastiter = TRUE; + *pstride = + (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1)); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d " + "lower=%%%s upper=%%%s stride = %%%s\n", + traits_t::spec, traits_t::spec, + traits_t::spec); + KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride)); + __kmp_str_free(&buff); + } +#endif + KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_type, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), *pstride, codeptr); + } +#endif + KMP_STATS_LOOP_END(OMP_loop_static_iterations); + return; + } + + /* compute trip count */ + if (incr == 1) { + trip_count = *pupper - *plower + 1; + } else if (incr == -1) { + trip_count = *plower - *pupper + 1; + } else if (incr > 0) { + // upper-lower can exceed the limit of signed type + trip_count = (UT)(*pupper - *plower) / incr + 1; + } else { + trip_count = (UT)(*plower - *pupper) / (-incr) + 1; + } + +#if KMP_STATS_ENABLED + if (KMP_MASTER_GTID(gtid)) { + KMP_COUNT_VALUE(OMP_loop_static_total_iterations, trip_count); + } +#endif + + if (__kmp_env_consistency_check) { + /* tripcount overflow? */ + if (trip_count == 0 && *pupper != *plower) { + __kmp_error_construct(kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo, + loc); + } + } + + /* compute remaining parameters */ + switch (schedtype) { + case kmp_sch_static: { + if (trip_count < nth) { + KMP_DEBUG_ASSERT( + __kmp_static == kmp_sch_static_greedy || + __kmp_static == + kmp_sch_static_balanced); // Unknown static scheduling type. 
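      // Worked example (illustrative): trip_count = 3, nth = 8, incr = 1,
      // *plower = 0.  Threads 0..2 each get exactly one iteration (0, 1, 2),
      // threads 3..7 get an empty range, and thread 2 reports last-iteration.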
+ if (tid < trip_count) { + *pupper = *plower = *plower + tid * incr; + } else { + // set bounds so non-active threads execute no iterations + *plower = *pupper + (incr > 0 ? 1 : -1); + } + if (plastiter != NULL) + *plastiter = (tid == trip_count - 1); + } else { + if (__kmp_static == kmp_sch_static_balanced) { + UT small_chunk = trip_count / nth; + UT extras = trip_count % nth; + *plower += incr * (tid * small_chunk + (tid < extras ? tid : extras)); + *pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr); + if (plastiter != NULL) + *plastiter = (tid == nth - 1); + } else { + T big_chunk_inc_count = + (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr; + T old_upper = *pupper; + + KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); + // Unknown static scheduling type. + + *plower += tid * big_chunk_inc_count; + *pupper = *plower + big_chunk_inc_count - incr; + if (incr > 0) { + if (*pupper < *plower) + *pupper = traits_t::max_value; + if (plastiter != NULL) + *plastiter = *plower <= old_upper && *pupper > old_upper - incr; + if (*pupper > old_upper) + *pupper = old_upper; // tracker C73258 + } else { + if (*pupper > *plower) + *pupper = traits_t::min_value; + if (plastiter != NULL) + *plastiter = *plower >= old_upper && *pupper < old_upper - incr; + if (*pupper < old_upper) + *pupper = old_upper; // tracker C73258 + } + } + } + *pstride = trip_count; + break; + } + case kmp_sch_static_chunked: { + ST span; + UT nchunks; + if (chunk < 1) + chunk = 1; + else if ((UT)chunk > trip_count) + chunk = trip_count; + nchunks = (trip_count) / (UT)chunk + (trip_count % (UT)chunk ? 1 : 0); + span = chunk * incr; + if (nchunks < nth) { + *pstride = span * nchunks; + if (tid < nchunks) { + *plower = *plower + (span * tid); + *pupper = *plower + span - incr; + } else { + *plower = *pupper + (incr > 0 ? 1 : -1); + } + } else { + *pstride = span * nth; + *plower = *plower + (span * tid); + *pupper = *plower + span - incr; + } + if (plastiter != NULL) + *plastiter = (tid == (nchunks - 1) % nth); + break; + } + case kmp_sch_static_balanced_chunked: { + T old_upper = *pupper; + // round up to make sure the chunk is enough to cover all iterations + UT span = (trip_count + nth - 1) / nth; + + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + span = chunk * incr; + *plower = *plower + (span * tid); + *pupper = *plower + span - incr; + if (incr > 0) { + if (*pupper > old_upper) + *pupper = old_upper; + } else if (*pupper < old_upper) + *pupper = old_upper; + + if (plastiter != NULL) + *plastiter = (tid == ((trip_count - 1) / (UT)chunk)); + break; + } + default: + KMP_ASSERT2(0, "__kmpc_for_static_init: unknown scheduling type"); + break; + } + +#if USE_ITT_BUILD + // Report loop metadata + if (KMP_MASTER_TID(tid) && __itt_metadata_add_ptr && + __kmp_forkjoin_frames_mode == 3 && th->th.th_teams_microtask == NULL && + team->t.t_active_level == 1) { + kmp_uint64 cur_chunk = chunk; + check_loc(loc); + // Calculate chunk in case it was not specified; it is specified for + // kmp_sch_static_chunked + if (schedtype == kmp_sch_static) { + cur_chunk = trip_count / nth + ((trip_count % nth) ? 
1 : 0); + } + // 0 - "static" schedule + __kmp_itt_metadata_loop(loc, 0, trip_count, cur_chunk); + } +#endif +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmpc_for_static_init: liter=%%d lower=%%%s " + "upper=%%%s stride = %%%s signed?<%s>\n", + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride)); + __kmp_str_free(&buff); + } +#endif + KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_type, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), trip_count, codeptr); + } + if (ompt_enabled.ompt_callback_dispatch) { + ompt_dispatch_t dispatch_type; + ompt_data_t instance = ompt_data_none; + ompt_dispatch_chunk_t dispatch_chunk; + if (ompt_work_type == ompt_work_sections) { + dispatch_type = ompt_dispatch_section; + instance.ptr = codeptr; + } else { + OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupper, incr); + dispatch_type = (ompt_work_type == ompt_work_distribute) + ? ompt_dispatch_distribute_chunk + : ompt_dispatch_ws_loop_chunk; + instance.ptr = &dispatch_chunk; + } + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(task_info->task_data), dispatch_type, + instance); + } +#endif + + KMP_STATS_LOOP_END(OMP_loop_static_iterations); + return; +} + +template +static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedule, kmp_int32 *plastiter, + T *plower, T *pupper, T *pupperDist, + typename traits_t::signed_t *pstride, + typename traits_t::signed_t incr, + typename traits_t::signed_t chunk +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + void *codeptr +#endif +) { + KMP_COUNT_BLOCK(OMP_DISTRIBUTE); + KMP_PUSH_PARTITIONED_TIMER(OMP_distribute); + KMP_PUSH_PARTITIONED_TIMER(OMP_distribute_scheduling); + typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + kmp_uint32 tid; + kmp_uint32 nth; + kmp_uint32 team_id; + kmp_uint32 nteams; + UT trip_count; + kmp_team_t *team; + kmp_info_t *th; + + KMP_DEBUG_ASSERT(plastiter && plower && pupper && pupperDist && pstride); + KE_TRACE(10, ("__kmpc_dist_for_static_init called (%d)\n", gtid)); + __kmp_assert_valid_gtid(gtid); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmpc_dist_for_static_init: T#%%d schedLoop=%%d liter=%%d " + "iter=(%%%s, %%%s, %%%s) chunk=%%%s signed?<%s>\n", + traits_t::spec, traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + KD_TRACE(100, + (buff, gtid, schedule, *plastiter, *plower, *pupper, incr, chunk)); + __kmp_str_free(&buff); + } +#endif + + if (__kmp_env_consistency_check) { + __kmp_push_workshare(gtid, ct_pdo, loc); + if (incr == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, + loc); + } + if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { + // The loop is illegal. 
+ // Some zero-trip loops maintained by compiler, e.g.: + // for(i=10;i<0;++i) // lower >= upper - run-time check + // for(i=0;i>10;--i) // lower <= upper - run-time check + // for(i=0;i>10;++i) // incr > 0 - compile-time check + // for(i=10;i<0;--i) // incr < 0 - compile-time check + // Compiler does not check the following illegal loops: + // for(i=0;i<10;i+=incr) // where incr<0 + // for(i=10;i>0;i-=incr) // where incr<0 + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); + } + } + tid = __kmp_tid_from_gtid(gtid); + th = __kmp_threads[gtid]; + nth = th->th.th_team_nproc; + team = th->th.th_team; + KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct + nteams = th->th.th_teams_size.nteams; + team_id = team->t.t_master_tid; + KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); + + // compute global trip count + if (incr == 1) { + trip_count = *pupper - *plower + 1; + } else if (incr == -1) { + trip_count = *plower - *pupper + 1; + } else if (incr > 0) { + // upper-lower can exceed the limit of signed type + trip_count = (UT)(*pupper - *plower) / incr + 1; + } else { + trip_count = (UT)(*plower - *pupper) / (-incr) + 1; + } + + *pstride = *pupper - *plower; // just in case (can be unused) + if (trip_count <= nteams) { + KMP_DEBUG_ASSERT( + __kmp_static == kmp_sch_static_greedy || + __kmp_static == + kmp_sch_static_balanced); // Unknown static scheduling type. + // only primary threads of some teams get single iteration, other threads + // get nothing + if (team_id < trip_count && tid == 0) { + *pupper = *pupperDist = *plower = *plower + team_id * incr; + } else { + *pupperDist = *pupper; + *plower = *pupper + incr; // compiler should skip loop body + } + if (plastiter != NULL) + *plastiter = (tid == 0 && team_id == trip_count - 1); + } else { + // Get the team's chunk first (each team gets at most one chunk) + if (__kmp_static == kmp_sch_static_balanced) { + UT chunkD = trip_count / nteams; + UT extras = trip_count % nteams; + *plower += + incr * (team_id * chunkD + (team_id < extras ? team_id : extras)); + *pupperDist = *plower + chunkD * incr - (team_id < extras ? 0 : incr); + if (plastiter != NULL) + *plastiter = (team_id == nteams - 1); + } else { + T chunk_inc_count = + (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; + T upper = *pupper; + KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); + // Unknown static scheduling type. 
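      // Worked example (illustrative): trip_count = 10, nteams = 4, incr = 1,
      // lower = 0.  chunk_inc_count = (10/4 + 1) * 1 = 3, so the teams get
      // [0..2], [3..5], [6..8] and [9..11], and the last range is clipped to
      // the original upper bound (9) just below.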
+ *plower += team_id * chunk_inc_count; + *pupperDist = *plower + chunk_inc_count - incr; + // Check/correct bounds if needed + if (incr > 0) { + if (*pupperDist < *plower) + *pupperDist = traits_t::max_value; + if (plastiter != NULL) + *plastiter = *plower <= upper && *pupperDist > upper - incr; + if (*pupperDist > upper) + *pupperDist = upper; // tracker C73258 + if (*plower > *pupperDist) { + *pupper = *pupperDist; // no iterations available for the team + goto end; + } + } else { + if (*pupperDist > *plower) + *pupperDist = traits_t::min_value; + if (plastiter != NULL) + *plastiter = *plower >= upper && *pupperDist < upper - incr; + if (*pupperDist < upper) + *pupperDist = upper; // tracker C73258 + if (*plower < *pupperDist) { + *pupper = *pupperDist; // no iterations available for the team + goto end; + } + } + } + // Get the parallel loop chunk now (for thread) + // compute trip count for team's chunk + if (incr == 1) { + trip_count = *pupperDist - *plower + 1; + } else if (incr == -1) { + trip_count = *plower - *pupperDist + 1; + } else if (incr > 1) { + // upper-lower can exceed the limit of signed type + trip_count = (UT)(*pupperDist - *plower) / incr + 1; + } else { + trip_count = (UT)(*plower - *pupperDist) / (-incr) + 1; + } + KMP_DEBUG_ASSERT(trip_count); + switch (schedule) { + case kmp_sch_static: { + if (trip_count <= nth) { + KMP_DEBUG_ASSERT( + __kmp_static == kmp_sch_static_greedy || + __kmp_static == + kmp_sch_static_balanced); // Unknown static scheduling type. + if (tid < trip_count) + *pupper = *plower = *plower + tid * incr; + else + *plower = *pupper + incr; // no iterations available + if (plastiter != NULL) + if (*plastiter != 0 && !(tid == trip_count - 1)) + *plastiter = 0; + } else { + if (__kmp_static == kmp_sch_static_balanced) { + UT chunkL = trip_count / nth; + UT extras = trip_count % nth; + *plower += incr * (tid * chunkL + (tid < extras ? tid : extras)); + *pupper = *plower + chunkL * incr - (tid < extras ? 0 : incr); + if (plastiter != NULL) + if (*plastiter != 0 && !(tid == nth - 1)) + *plastiter = 0; + } else { + T chunk_inc_count = + (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr; + T upper = *pupperDist; + KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy); + // Unknown static scheduling type. 
+ *plower += tid * chunk_inc_count; + *pupper = *plower + chunk_inc_count - incr; + if (incr > 0) { + if (*pupper < *plower) + *pupper = traits_t::max_value; + if (plastiter != NULL) + if (*plastiter != 0 && + !(*plower <= upper && *pupper > upper - incr)) + *plastiter = 0; + if (*pupper > upper) + *pupper = upper; // tracker C73258 + } else { + if (*pupper > *plower) + *pupper = traits_t::min_value; + if (plastiter != NULL) + if (*plastiter != 0 && + !(*plower >= upper && *pupper < upper - incr)) + *plastiter = 0; + if (*pupper < upper) + *pupper = upper; // tracker C73258 + } + } + } + break; + } + case kmp_sch_static_chunked: { + ST span; + if (chunk < 1) + chunk = 1; + span = chunk * incr; + *pstride = span * nth; + *plower = *plower + (span * tid); + *pupper = *plower + span - incr; + if (plastiter != NULL) + if (*plastiter != 0 && !(tid == ((trip_count - 1) / (UT)chunk) % nth)) + *plastiter = 0; + break; + } + default: + KMP_ASSERT2(0, + "__kmpc_dist_for_static_init: unknown loop scheduling type"); + break; + } + } +end:; +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmpc_dist_for_static_init: last=%%d lo=%%%s up=%%%s upDist=%%%s " + "stride=%%%s signed?<%s>\n", + traits_t::spec, traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pupperDist, *pstride)); + __kmp_str_free(&buff); + } +#endif + KE_TRACE(10, ("__kmpc_dist_for_static_init: T#%d return\n", gtid)); +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work || ompt_enabled.ompt_callback_dispatch) { + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_distribute, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), 0, codeptr); + } + if (ompt_enabled.ompt_callback_dispatch) { + ompt_data_t instance = ompt_data_none; + ompt_dispatch_chunk_t dispatch_chunk; + OMPT_GET_DISPATCH_CHUNK(dispatch_chunk, *plower, *pupperDist, incr); + instance.ptr = &dispatch_chunk; + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(task_info->task_data), + ompt_dispatch_distribute_chunk, instance); + } + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL + KMP_STATS_LOOP_END(OMP_distribute_iterations); + return; +} + +template +static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid, + kmp_int32 *p_last, T *p_lb, T *p_ub, + typename traits_t::signed_t *p_st, + typename traits_t::signed_t incr, + typename traits_t::signed_t chunk) { + // The routine returns the first chunk distributed to the team and + // stride for next chunks calculation. + // Last iteration flag set for the team that will execute + // the last iteration of the loop. + // The routine is called for dist_schedule(static,chunk) only. 
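+  // Editor's note (illustrative, not upstream): for dist_schedule(static,4)
+  // with lower=0, upper=25, incr=1 and nteams=3 the code below yields
+  //   span = 4, *p_st = 12,
+  //   team 0: lb=0, ub=3   team 1: lb=4, ub=7   team 2: lb=8, ub=11
+  // and the last-iteration flag goes to team ((26-1)/4) % 3 == 0, which owns
+  // the chunk containing iteration 25 once it has advanced by the stride.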
+ typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; + kmp_uint32 team_id; + kmp_uint32 nteams; + UT trip_count; + T lower; + T upper; + ST span; + kmp_team_t *team; + kmp_info_t *th; + + KMP_DEBUG_ASSERT(p_last && p_lb && p_ub && p_st); + KE_TRACE(10, ("__kmp_team_static_init called (%d)\n", gtid)); + __kmp_assert_valid_gtid(gtid); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_team_static_init enter: T#%%d liter=%%d " + "iter=(%%%s, %%%s, %%%s) chunk %%%s; signed?<%s>\n", + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec, + traits_t::spec); + KD_TRACE(100, (buff, gtid, *p_last, *p_lb, *p_ub, *p_st, chunk)); + __kmp_str_free(&buff); + } +#endif + + lower = *p_lb; + upper = *p_ub; + if (__kmp_env_consistency_check) { + if (incr == 0) { + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, + loc); + } + if (incr > 0 ? (upper < lower) : (lower < upper)) { + // The loop is illegal. + // Some zero-trip loops maintained by compiler, e.g.: + // for(i=10;i<0;++i) // lower >= upper - run-time check + // for(i=0;i>10;--i) // lower <= upper - run-time check + // for(i=0;i>10;++i) // incr > 0 - compile-time check + // for(i=10;i<0;--i) // incr < 0 - compile-time check + // Compiler does not check the following illegal loops: + // for(i=0;i<10;i+=incr) // where incr<0 + // for(i=10;i>0;i-=incr) // where incr<0 + __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); + } + } + th = __kmp_threads[gtid]; + team = th->th.th_team; + KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct + nteams = th->th.th_teams_size.nteams; + team_id = team->t.t_master_tid; + KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc); + + // compute trip count + if (incr == 1) { + trip_count = upper - lower + 1; + } else if (incr == -1) { + trip_count = lower - upper + 1; + } else if (incr > 0) { + // upper-lower can exceed the limit of signed type + trip_count = (UT)(upper - lower) / incr + 1; + } else { + trip_count = (UT)(lower - upper) / (-incr) + 1; + } + if (chunk < 1) + chunk = 1; + span = chunk * incr; + *p_st = span * nteams; + *p_lb = lower + (span * team_id); + *p_ub = *p_lb + span - incr; + if (p_last != NULL) + *p_last = (team_id == ((trip_count - 1) / (UT)chunk) % nteams); + // Correct upper bound if needed + if (incr > 0) { + if (*p_ub < *p_lb) // overflow? + *p_ub = traits_t::max_value; + if (*p_ub > upper) + *p_ub = upper; // tracker C73258 + } else { // incr < 0 + if (*p_ub > *p_lb) + *p_ub = traits_t::min_value; + if (*p_ub < upper) + *p_ub = upper; // tracker C73258 + } +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = + __kmp_str_format("__kmp_team_static_init exit: T#%%d team%%u liter=%%d " + "iter=(%%%s, %%%s, %%%s) chunk %%%s\n", + traits_t::spec, traits_t::spec, + traits_t::spec, traits_t::spec); + KD_TRACE(100, (buff, gtid, team_id, *p_last, *p_lb, *p_ub, *p_st, chunk)); + __kmp_str_free(&buff); + } +#endif +} + +//------------------------------------------------------------------------------ +extern "C" { +/*! 
+@ingroup WORK_SHARING +@param loc Source code location +@param gtid Global thread id of this thread +@param schedtype Scheduling type +@param plastiter Pointer to the "last iteration" flag +@param plower Pointer to the lower bound +@param pupper Pointer to the upper bound +@param pstride Pointer to the stride +@param incr Loop increment +@param chunk The chunk size + +Each of the four functions here are identical apart from the argument types. + +The functions compute the upper and lower bounds and stride to be used for the +set of iterations to be executed by the current thread from the statically +scheduled loop that is described by the initial values of the bounds, stride, +increment and chunk size. + +@{ +*/ +void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_int32 *plastiter, kmp_int32 *plower, + kmp_int32 *pupper, kmp_int32 *pstride, + kmp_int32 incr, kmp_int32 chunk) { + __kmp_for_static_init(loc, gtid, schedtype, plastiter, plower, + pupper, pstride, incr, chunk +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_GET_RETURN_ADDRESS(0) +#endif + ); +} + +/*! + See @ref __kmpc_for_static_init_4 + */ +void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedtype, kmp_int32 *plastiter, + kmp_uint32 *plower, kmp_uint32 *pupper, + kmp_int32 *pstride, kmp_int32 incr, + kmp_int32 chunk) { + __kmp_for_static_init(loc, gtid, schedtype, plastiter, plower, + pupper, pstride, incr, chunk +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_GET_RETURN_ADDRESS(0) +#endif + ); +} + +/*! + See @ref __kmpc_for_static_init_4 + */ +void __kmpc_for_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_int32 *plastiter, kmp_int64 *plower, + kmp_int64 *pupper, kmp_int64 *pstride, + kmp_int64 incr, kmp_int64 chunk) { + __kmp_for_static_init(loc, gtid, schedtype, plastiter, plower, + pupper, pstride, incr, chunk +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_GET_RETURN_ADDRESS(0) +#endif + ); +} + +/*! + See @ref __kmpc_for_static_init_4 + */ +void __kmpc_for_static_init_8u(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedtype, kmp_int32 *plastiter, + kmp_uint64 *plower, kmp_uint64 *pupper, + kmp_int64 *pstride, kmp_int64 incr, + kmp_int64 chunk) { + __kmp_for_static_init(loc, gtid, schedtype, plastiter, plower, + pupper, pstride, incr, chunk +#if OMPT_SUPPORT && OMPT_OPTIONAL + , + OMPT_GET_RETURN_ADDRESS(0) +#endif + ); +} +/*! +@} +*/ + +#if OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_CODEPTR_ARG , OMPT_GET_RETURN_ADDRESS(0) +#else +#define OMPT_CODEPTR_ARG +#endif + +/*! +@ingroup WORK_SHARING +@param loc Source code location +@param gtid Global thread id of this thread +@param schedule Scheduling type for the parallel loop +@param plastiter Pointer to the "last iteration" flag +@param plower Pointer to the lower bound +@param pupper Pointer to the upper bound of loop chunk +@param pupperD Pointer to the upper bound of dist_chunk +@param pstride Pointer to the stride for parallel loop +@param incr Loop increment +@param chunk The chunk size for the parallel loop + +Each of the four functions here are identical apart from the argument types. + +The functions compute the upper and lower bounds and strides to be used for the +set of iterations to be executed by the current thread from the statically +scheduled loop that is described by the initial values of the bounds, strides, +increment and chunks for parallel loop and distribute constructs. 
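+
+In a composite distribute parallel loop the iteration space is first divided
+into per-team chunks (the bounds of the calling team's chunk are returned
+through pupperD), and the team's chunk is then split among the threads of the
+team (returned through plower and pupper).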
+ +@{ +*/ +void __kmpc_dist_for_static_init_4(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedule, kmp_int32 *plastiter, + kmp_int32 *plower, kmp_int32 *pupper, + kmp_int32 *pupperD, kmp_int32 *pstride, + kmp_int32 incr, kmp_int32 chunk) { + __kmp_dist_for_static_init(loc, gtid, schedule, plastiter, plower, + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); +} + +/*! + See @ref __kmpc_dist_for_static_init_4 + */ +void __kmpc_dist_for_static_init_4u(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedule, kmp_int32 *plastiter, + kmp_uint32 *plower, kmp_uint32 *pupper, + kmp_uint32 *pupperD, kmp_int32 *pstride, + kmp_int32 incr, kmp_int32 chunk) { + __kmp_dist_for_static_init(loc, gtid, schedule, plastiter, plower, + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); +} + +/*! + See @ref __kmpc_dist_for_static_init_4 + */ +void __kmpc_dist_for_static_init_8(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedule, kmp_int32 *plastiter, + kmp_int64 *plower, kmp_int64 *pupper, + kmp_int64 *pupperD, kmp_int64 *pstride, + kmp_int64 incr, kmp_int64 chunk) { + __kmp_dist_for_static_init(loc, gtid, schedule, plastiter, plower, + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); +} + +/*! + See @ref __kmpc_dist_for_static_init_4 + */ +void __kmpc_dist_for_static_init_8u(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedule, kmp_int32 *plastiter, + kmp_uint64 *plower, kmp_uint64 *pupper, + kmp_uint64 *pupperD, kmp_int64 *pstride, + kmp_int64 incr, kmp_int64 chunk) { + __kmp_dist_for_static_init(loc, gtid, schedule, plastiter, plower, + pupper, pupperD, pstride, incr, + chunk OMPT_CODEPTR_ARG); +} +/*! +@} +*/ + +//------------------------------------------------------------------------------ +// Auxiliary routines for Distribute Parallel Loop construct implementation +// Transfer call to template< type T > +// __kmp_team_static_init( ident_t *loc, int gtid, +// int *p_last, T *lb, T *ub, ST *st, ST incr, ST chunk ) + +/*! +@ingroup WORK_SHARING +@{ +@param loc Source location +@param gtid Global thread id +@param p_last pointer to last iteration flag +@param p_lb pointer to Lower bound +@param p_ub pointer to Upper bound +@param p_st Step (or increment if you prefer) +@param incr Loop increment +@param chunk The chunk size to block with + +The functions compute the upper and lower bounds and stride to be used for the +set of iterations to be executed by the current team from the statically +scheduled loop that is described by the initial values of the bounds, stride, +increment and chunk for the distribute construct as part of composite distribute +parallel loop construct. These functions are all identical apart from the types +of the arguments. +*/ + +void __kmpc_team_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_int32 *p_lb, kmp_int32 *p_ub, + kmp_int32 *p_st, kmp_int32 incr, + kmp_int32 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + __kmp_team_static_init(loc, gtid, p_last, p_lb, p_ub, p_st, incr, + chunk); +} + +/*! + See @ref __kmpc_team_static_init_4 + */ +void __kmpc_team_static_init_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_uint32 *p_lb, kmp_uint32 *p_ub, + kmp_int32 *p_st, kmp_int32 incr, + kmp_int32 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + __kmp_team_static_init(loc, gtid, p_last, p_lb, p_ub, p_st, incr, + chunk); +} + +/*! 
+ See @ref __kmpc_team_static_init_4 + */ +void __kmpc_team_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_int64 *p_lb, kmp_int64 *p_ub, + kmp_int64 *p_st, kmp_int64 incr, + kmp_int64 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + __kmp_team_static_init(loc, gtid, p_last, p_lb, p_ub, p_st, incr, + chunk); +} + +/*! + See @ref __kmpc_team_static_init_4 + */ +void __kmpc_team_static_init_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_uint64 *p_lb, kmp_uint64 *p_ub, + kmp_int64 *p_st, kmp_int64 incr, + kmp_int64 chunk) { + KMP_DEBUG_ASSERT(__kmp_init_serial); + __kmp_team_static_init(loc, gtid, p_last, p_lb, p_ub, p_st, incr, + chunk); +} +/*! +@} +*/ + +} // extern "C" diff --git a/third_party/openmp/kmp_settings.cpp b/third_party/openmp/kmp_settings.cpp new file mode 100644 index 000000000..d2157b10b --- /dev/null +++ b/third_party/openmp/kmp_settings.cpp @@ -0,0 +1,6636 @@ +/* + * kmp_settings.cpp -- Initialize environment variables + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_affinity.h" +#include "kmp_atomic.h" +#if KMP_USE_HIER_SCHED +#include "kmp_dispatch_hier.h" +#endif +#include "kmp_environment.h" +#include "kmp_i18n.h" +#include "kmp_io.h" +#include "kmp_itt.h" +#include "kmp_lock.h" +#include "kmp_settings.h" +#include "kmp_str.h" +#include "kmp_wrapper_getpid.h" +#include // toupper() +#if OMPD_SUPPORT +#include "ompd-specific.h" +#endif + +static int __kmp_env_toPrint(char const *name, int flag); + +bool __kmp_env_format = 0; // 0 - old format; 1 - new format + +// ----------------------------------------------------------------------------- +// Helper string functions. Subject to move to kmp_str. + +#ifdef USE_LOAD_BALANCE +static double __kmp_convert_to_double(char const *s) { + double result; + + if (KMP_SSCANF(s, "%lf", &result) < 1) { + result = 0.0; + } + + return result; +} +#endif + +#ifdef KMP_DEBUG +static unsigned int __kmp_readstr_with_sentinel(char *dest, char const *src, + size_t len, char sentinel) { + unsigned int i; + for (i = 0; i < len; i++) { + if ((*src == '\0') || (*src == sentinel)) { + break; + } + *(dest++) = *(src++); + } + *dest = '\0'; + return i; +} +#endif + +static int __kmp_match_with_sentinel(char const *a, char const *b, size_t len, + char sentinel) { + size_t l = 0; + + if (a == NULL) + a = ""; + if (b == NULL) + b = ""; + while (*a && *b && *b != sentinel) { + char ca = *a, cb = *b; + + if (ca >= 'a' && ca <= 'z') + ca -= 'a' - 'A'; + if (cb >= 'a' && cb <= 'z') + cb -= 'a' - 'A'; + if (ca != cb) + return FALSE; + ++l; + ++a; + ++b; + } + return l >= len; +} + +// Expected usage: +// token is the token to check for. +// buf is the string being parsed. +// *end returns the char after the end of the token. +// it is not modified unless a match occurs. 
+// +// Example 1: +// +// if (__kmp_match_str("token", buf, *end) { +// +// buf = end; +// } +// +// Example 2: +// +// if (__kmp_match_str("token", buf, *end) { +// char *save = **end; +// **end = sentinel; +// +// **end = save; +// buf = end; +// } + +static int __kmp_match_str(char const *token, char const *buf, + const char **end) { + + KMP_ASSERT(token != NULL); + KMP_ASSERT(buf != NULL); + KMP_ASSERT(end != NULL); + + while (*token && *buf) { + char ct = *token, cb = *buf; + + if (ct >= 'a' && ct <= 'z') + ct -= 'a' - 'A'; + if (cb >= 'a' && cb <= 'z') + cb -= 'a' - 'A'; + if (ct != cb) + return FALSE; + ++token; + ++buf; + } + if (*token) { + return FALSE; + } + *end = buf; + return TRUE; +} + +#if KMP_OS_DARWIN +static size_t __kmp_round4k(size_t size) { + size_t _4k = 4 * 1024; + if (size & (_4k - 1)) { + size &= ~(_4k - 1); + if (size <= KMP_SIZE_T_MAX - _4k) { + size += _4k; // Round up if there is no overflow. + } + } + return size; +} // __kmp_round4k +#endif + +static int __kmp_strcasecmp_with_sentinel(char const *a, char const *b, + char sentinel) { + if (a == NULL) + a = ""; + if (b == NULL) + b = ""; + while (*a && *b && *b != sentinel) { + char ca = *a, cb = *b; + + if (ca >= 'a' && ca <= 'z') + ca -= 'a' - 'A'; + if (cb >= 'a' && cb <= 'z') + cb -= 'a' - 'A'; + if (ca != cb) + return (int)(unsigned char)*a - (int)(unsigned char)*b; + ++a; + ++b; + } + return *a ? (*b && *b != sentinel) + ? (int)(unsigned char)*a - (int)(unsigned char)*b + : 1 + : (*b && *b != sentinel) ? -1 + : 0; +} + +// ============================================================================= +// Table structures and helper functions. + +typedef struct __kmp_setting kmp_setting_t; +typedef struct __kmp_stg_ss_data kmp_stg_ss_data_t; +typedef struct __kmp_stg_wp_data kmp_stg_wp_data_t; +typedef struct __kmp_stg_fr_data kmp_stg_fr_data_t; + +typedef void (*kmp_stg_parse_func_t)(char const *name, char const *value, + void *data); +typedef void (*kmp_stg_print_func_t)(kmp_str_buf_t *buffer, char const *name, + void *data); + +struct __kmp_setting { + char const *name; // Name of setting (environment variable). + kmp_stg_parse_func_t parse; // Parser function. + kmp_stg_print_func_t print; // Print function. + void *data; // Data passed to parser and printer. + int set; // Variable set during this "session" + // (__kmp_env_initialize() or kmp_set_defaults() call). + int defined; // Variable set in any "session". +}; // struct __kmp_setting + +struct __kmp_stg_ss_data { + size_t factor; // Default factor: 1 for KMP_STACKSIZE, 1024 for others. + kmp_setting_t **rivals; // Array of pointers to rivals (including itself). +}; // struct __kmp_stg_ss_data + +struct __kmp_stg_wp_data { + int omp; // 0 -- KMP_LIBRARY, 1 -- OMP_WAIT_POLICY. + kmp_setting_t **rivals; // Array of pointers to rivals (including itself). +}; // struct __kmp_stg_wp_data + +struct __kmp_stg_fr_data { + int force; // 0 -- KMP_DETERMINISTIC_REDUCTION, 1 -- KMP_FORCE_REDUCTION. + kmp_setting_t **rivals; // Array of pointers to rivals (including itself). +}; // struct __kmp_stg_fr_data + +static int __kmp_stg_check_rivals( // 0 -- Ok, 1 -- errors found. + char const *name, // Name of variable. + char const *value, // Value of the variable. + kmp_setting_t **rivals // List of rival settings (must include current one). 
+); + +// Helper struct that trims heading/trailing white spaces +struct kmp_trimmed_str_t { + kmp_str_buf_t buf; + kmp_trimmed_str_t(const char *str) { + __kmp_str_buf_init(&buf); + size_t len = KMP_STRLEN(str); + if (len == 0) + return; + const char *begin = str; + const char *end = str + KMP_STRLEN(str) - 1; + SKIP_WS(begin); + while (begin < end && *end == ' ') + end--; + __kmp_str_buf_cat(&buf, begin, end - begin + 1); + } + ~kmp_trimmed_str_t() { __kmp_str_buf_free(&buf); } + const char *get() { return buf.str; } +}; + +// ----------------------------------------------------------------------------- +// Helper parse functions. + +static void __kmp_stg_parse_bool(char const *name, char const *value, + int *out) { + if (__kmp_str_match_true(value)) { + *out = TRUE; + } else if (__kmp_str_match_false(value)) { + *out = FALSE; + } else { + __kmp_msg(kmp_ms_warning, KMP_MSG(BadBoolValue, name, value), + KMP_HNT(ValidBoolValues), __kmp_msg_null); + } +} // __kmp_stg_parse_bool + +// placed here in order to use __kmp_round4k static function +void __kmp_check_stksize(size_t *val) { + // if system stack size is too big then limit the size for worker threads + if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics... + *val = KMP_DEFAULT_STKSIZE * 16; + if (*val < __kmp_sys_min_stksize) + *val = __kmp_sys_min_stksize; + if (*val > KMP_MAX_STKSIZE) + *val = KMP_MAX_STKSIZE; // dead code currently, but may work in future +#if KMP_OS_DARWIN + *val = __kmp_round4k(*val); +#endif // KMP_OS_DARWIN +} + +static void __kmp_stg_parse_size(char const *name, char const *value, + size_t size_min, size_t size_max, + int *is_specified, size_t *out, + size_t factor) { + char const *msg = NULL; +#if KMP_OS_DARWIN + size_min = __kmp_round4k(size_min); + size_max = __kmp_round4k(size_max); +#endif // KMP_OS_DARWIN + if (value) { + if (is_specified != NULL) { + *is_specified = 1; + } + __kmp_str_to_size(value, out, factor, &msg); + if (msg == NULL) { + if (*out > size_max) { + *out = size_max; + msg = KMP_I18N_STR(ValueTooLarge); + } else if (*out < size_min) { + *out = size_min; + msg = KMP_I18N_STR(ValueTooSmall); + } else { +#if KMP_OS_DARWIN + size_t round4k = __kmp_round4k(*out); + if (*out != round4k) { + *out = round4k; + msg = KMP_I18N_STR(NotMultiple4K); + } +#endif + } + } else { + // If integer overflow occurred, * out == KMP_SIZE_T_MAX. Cut it to + // size_max silently. + if (*out < size_min) { + *out = size_max; + } else if (*out > size_max) { + *out = size_max; + } + } + if (msg != NULL) { + // Message is not empty. Print warning. + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + __kmp_str_buf_print_size(&buf, *out); + KMP_WARNING(ParseSizeIntWarn, name, value, msg); + KMP_INFORM(Using_str_Value, name, buf.str); + __kmp_str_buf_free(&buf); + } + } +} // __kmp_stg_parse_size + +static void __kmp_stg_parse_str(char const *name, char const *value, + char **out) { + __kmp_str_free(out); + *out = __kmp_str_format("%s", value); +} // __kmp_stg_parse_str + +static void __kmp_stg_parse_int( + char const + *name, // I: Name of environment variable (used in warning messages). + char const *value, // I: Value of environment variable to parse. + int min, // I: Minimum allowed value. + int max, // I: Maximum allowed value. + int *out // O: Output (parsed) value. 
+) { + char const *msg = NULL; + kmp_uint64 uint = *out; + __kmp_str_to_uint(value, &uint, &msg); + if (msg == NULL) { + if (uint < (unsigned int)min) { + msg = KMP_I18N_STR(ValueTooSmall); + uint = min; + } else if (uint > (unsigned int)max) { + msg = KMP_I18N_STR(ValueTooLarge); + uint = max; + } + } else { + // If overflow occurred msg contains error message and uint is very big. Cut + // tmp it to INT_MAX. + if (uint < (unsigned int)min) { + uint = min; + } else if (uint > (unsigned int)max) { + uint = max; + } + } + if (msg != NULL) { + // Message is not empty. Print warning. + kmp_str_buf_t buf; + KMP_WARNING(ParseSizeIntWarn, name, value, msg); + __kmp_str_buf_init(&buf); + __kmp_str_buf_print(&buf, "%" KMP_UINT64_SPEC "", uint); + KMP_INFORM(Using_uint64_Value, name, buf.str); + __kmp_str_buf_free(&buf); + } + __kmp_type_convert(uint, out); +} // __kmp_stg_parse_int + +#if KMP_DEBUG_ADAPTIVE_LOCKS +static void __kmp_stg_parse_file(char const *name, char const *value, + const char *suffix, char **out) { + char buffer[256]; + char *t; + int hasSuffix; + __kmp_str_free(out); + t = (char *)strrchr(value, '.'); + hasSuffix = t && __kmp_str_eqf(t, suffix); + t = __kmp_str_format("%s%s", value, hasSuffix ? "" : suffix); + __kmp_expand_file_name(buffer, sizeof(buffer), t); + __kmp_str_free(&t); + *out = __kmp_str_format("%s", buffer); +} // __kmp_stg_parse_file +#endif + +#ifdef KMP_DEBUG +static char *par_range_to_print = NULL; + +static void __kmp_stg_parse_par_range(char const *name, char const *value, + int *out_range, char *out_routine, + char *out_file, int *out_lb, + int *out_ub) { + const char *par_range_value; + size_t len = KMP_STRLEN(value) + 1; + par_range_to_print = (char *)KMP_INTERNAL_MALLOC(len + 1); + KMP_STRNCPY_S(par_range_to_print, len + 1, value, len + 1); + __kmp_par_range = +1; + __kmp_par_range_lb = 0; + __kmp_par_range_ub = INT_MAX; + for (;;) { + unsigned int len; + if (!value || *value == '\0') { + break; + } + if (!__kmp_strcasecmp_with_sentinel("routine", value, '=')) { + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; + len = __kmp_readstr_with_sentinel(out_routine, value, + KMP_PAR_RANGE_ROUTINE_LEN - 1, ','); + if (len == 0) { + goto par_range_error; + } + value = strchr(value, ','); + if (value != NULL) { + value++; + } + continue; + } + if (!__kmp_strcasecmp_with_sentinel("filename", value, '=')) { + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; + len = __kmp_readstr_with_sentinel(out_file, value, + KMP_PAR_RANGE_FILENAME_LEN - 1, ','); + if (len == 0) { + goto par_range_error; + } + value = strchr(value, ','); + if (value != NULL) { + value++; + } + continue; + } + if ((!__kmp_strcasecmp_with_sentinel("range", value, '=')) || + (!__kmp_strcasecmp_with_sentinel("incl_range", value, '='))) { + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; + if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { + goto par_range_error; + } + *out_range = +1; + value = strchr(value, ','); + if (value != NULL) { + value++; + } + continue; + } + if (!__kmp_strcasecmp_with_sentinel("excl_range", value, '=')) { + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; + if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { + goto par_range_error; + } + *out_range = -1; + value = strchr(value, ','); + if (value 
!= NULL) { + value++; + } + continue; + } + par_range_error: + KMP_WARNING(ParRangeSyntax, name); + __kmp_par_range = 0; + break; + } +} // __kmp_stg_parse_par_range +#endif + +int __kmp_initial_threads_capacity(int req_nproc) { + int nth = 32; + + /* MIN( MAX( 32, 4 * $OMP_NUM_THREADS, 4 * omp_get_num_procs() ), + * __kmp_max_nth) */ + if (nth < (4 * req_nproc)) + nth = (4 * req_nproc); + if (nth < (4 * __kmp_xproc)) + nth = (4 * __kmp_xproc); + + // If hidden helper task is enabled, we initialize the thread capacity with + // extra __kmp_hidden_helper_threads_num. + if (__kmp_enable_hidden_helper) { + nth += __kmp_hidden_helper_threads_num; + } + + if (nth > __kmp_max_nth) + nth = __kmp_max_nth; + + return nth; +} + +int __kmp_default_tp_capacity(int req_nproc, int max_nth, + int all_threads_specified) { + int nth = 128; + + if (all_threads_specified) + return max_nth; + /* MIN( MAX (128, 4 * $OMP_NUM_THREADS, 4 * omp_get_num_procs() ), + * __kmp_max_nth ) */ + if (nth < (4 * req_nproc)) + nth = (4 * req_nproc); + if (nth < (4 * __kmp_xproc)) + nth = (4 * __kmp_xproc); + + if (nth > __kmp_max_nth) + nth = __kmp_max_nth; + + return nth; +} + +// ----------------------------------------------------------------------------- +// Helper print functions. + +static void __kmp_stg_print_bool(kmp_str_buf_t *buffer, char const *name, + int value) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_BOOL; + } else { + __kmp_str_buf_print(buffer, " %s=%s\n", name, value ? "true" : "false"); + } +} // __kmp_stg_print_bool + +static void __kmp_stg_print_int(kmp_str_buf_t *buffer, char const *name, + int value) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_INT; + } else { + __kmp_str_buf_print(buffer, " %s=%d\n", name, value); + } +} // __kmp_stg_print_int + +static void __kmp_stg_print_uint64(kmp_str_buf_t *buffer, char const *name, + kmp_uint64 value) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_UINT64; + } else { + __kmp_str_buf_print(buffer, " %s=%" KMP_UINT64_SPEC "\n", name, value); + } +} // __kmp_stg_print_uint64 + +static void __kmp_stg_print_str(kmp_str_buf_t *buffer, char const *name, + char const *value) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_STR; + } else { + __kmp_str_buf_print(buffer, " %s=%s\n", name, value); + } +} // __kmp_stg_print_str + +static void __kmp_stg_print_size(kmp_str_buf_t *buffer, char const *name, + size_t value) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + __kmp_str_buf_print_size(buffer, value); + __kmp_str_buf_print(buffer, "'\n"); + } else { + __kmp_str_buf_print(buffer, " %s=", name); + __kmp_str_buf_print_size(buffer, value); + __kmp_str_buf_print(buffer, "\n"); + return; + } +} // __kmp_stg_print_size + +// ============================================================================= +// Parse and print functions. 
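+// Editor's note (illustrative, not upstream): every environment variable
+// handled below follows the same pattern -- a parse callback that validates
+// the raw string and stores the result in a runtime global, and a print
+// callback used when KMP_SETTINGS output is generated. Conceptually each one
+// becomes a table entry of type kmp_setting_t, e.g.
+//   { "KMP_SETTINGS", __kmp_stg_parse_settings, __kmp_stg_print_settings,
+//     NULL, 0, 0 }
+// (fields per struct __kmp_setting: name, parse, print, data, set, defined).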
+ +// ----------------------------------------------------------------------------- +// KMP_DEVICE_THREAD_LIMIT, KMP_ALL_THREADS + +static void __kmp_stg_parse_device_thread_limit(char const *name, + char const *value, void *data) { + kmp_setting_t **rivals = (kmp_setting_t **)data; + int rc; + if (strcmp(name, "KMP_ALL_THREADS") == 0) { + KMP_INFORM(EnvVarDeprecated, name, "KMP_DEVICE_THREAD_LIMIT"); + } + rc = __kmp_stg_check_rivals(name, value, rivals); + if (rc) { + return; + } + if (!__kmp_strcasecmp_with_sentinel("all", value, 0)) { + __kmp_max_nth = __kmp_xproc; + __kmp_allThreadsSpecified = 1; + } else { + __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_max_nth); + __kmp_allThreadsSpecified = 0; + } + K_DIAG(1, ("__kmp_max_nth == %d\n", __kmp_max_nth)); + +} // __kmp_stg_parse_device_thread_limit + +static void __kmp_stg_print_device_thread_limit(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_max_nth); +} // __kmp_stg_print_device_thread_limit + +// ----------------------------------------------------------------------------- +// OMP_THREAD_LIMIT +static void __kmp_stg_parse_thread_limit(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_cg_max_nth); + K_DIAG(1, ("__kmp_cg_max_nth == %d\n", __kmp_cg_max_nth)); + +} // __kmp_stg_parse_thread_limit + +static void __kmp_stg_print_thread_limit(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_cg_max_nth); +} // __kmp_stg_print_thread_limit + +// ----------------------------------------------------------------------------- +// OMP_NUM_TEAMS +static void __kmp_stg_parse_nteams(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_nteams); + K_DIAG(1, ("__kmp_nteams == %d\n", __kmp_nteams)); +} // __kmp_stg_parse_nteams + +static void __kmp_stg_print_nteams(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_nteams); +} // __kmp_stg_print_nteams + +// ----------------------------------------------------------------------------- +// OMP_TEAMS_THREAD_LIMIT +static void __kmp_stg_parse_teams_th_limit(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, + &__kmp_teams_thread_limit); + K_DIAG(1, ("__kmp_teams_thread_limit == %d\n", __kmp_teams_thread_limit)); +} // __kmp_stg_parse_teams_th_limit + +static void __kmp_stg_print_teams_th_limit(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_teams_thread_limit); +} // __kmp_stg_print_teams_th_limit + +// ----------------------------------------------------------------------------- +// KMP_TEAMS_THREAD_LIMIT +static void __kmp_stg_parse_teams_thread_limit(char const *name, + char const *value, void *data) { + __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_teams_max_nth); +} // __kmp_stg_teams_thread_limit + +static void __kmp_stg_print_teams_thread_limit(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_teams_max_nth); +} // __kmp_stg_print_teams_thread_limit + +// ----------------------------------------------------------------------------- +// KMP_USE_YIELD +static void __kmp_stg_parse_use_yield(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, 2, &__kmp_use_yield); + __kmp_use_yield_exp_set = 1; +} // 
__kmp_stg_parse_use_yield + +static void __kmp_stg_print_use_yield(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_use_yield); +} // __kmp_stg_print_use_yield + +// ----------------------------------------------------------------------------- +// KMP_BLOCKTIME + +static void __kmp_stg_parse_blocktime(char const *name, char const *value, + void *data) { + const char *buf = value; + const char *next; + const int ms_mult = 1000; + int multiplier = 1; + int num; + + // Read integer blocktime value + SKIP_WS(buf); + if ((*buf >= '0') && (*buf <= '9')) { + next = buf; + SKIP_DIGITS(next); + num = __kmp_basic_str_to_int(buf); + KMP_ASSERT(num >= 0); + buf = next; + SKIP_WS(buf); + } else { + num = -1; + } + + // Read units: note that __kmp_dflt_blocktime units is now us + next = buf; + if (*buf == '\0' || __kmp_match_str("ms", buf, &next)) { + // units are in ms; convert + __kmp_dflt_blocktime = ms_mult * num; + __kmp_blocktime_units = 'm'; + multiplier = ms_mult; + } else if (__kmp_match_str("us", buf, &next)) { + // units are in us + __kmp_dflt_blocktime = num; + __kmp_blocktime_units = 'u'; + } else if (__kmp_match_str("infinite", buf, &next) || + __kmp_match_str("infinity", buf, &next)) { + // units are in ms + __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME; + __kmp_blocktime_units = 'm'; + multiplier = ms_mult; + } else { + KMP_WARNING(StgInvalidValue, name, value); + // default units are in ms + __kmp_dflt_blocktime = ms_mult * num; + __kmp_blocktime_units = 'm'; + multiplier = ms_mult; + } + + if (num < 0 && __kmp_dflt_blocktime < 0) { // num out of range + __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // now in us + __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidValue, name, value), + __kmp_msg_null); + // Inform in appropriate units + KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime / multiplier); + __kmp_env_blocktime = FALSE; // Revert to default as if var not set. + } else if (num > 0 && __kmp_dflt_blocktime < 0) { // overflow + __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME; + __kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value), __kmp_msg_null); + KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier); + __kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified. + } else { + if (__kmp_dflt_blocktime < KMP_MIN_BLOCKTIME) { + __kmp_dflt_blocktime = KMP_MIN_BLOCKTIME; + __kmp_msg(kmp_ms_warning, KMP_MSG(SmallValue, name, value), + __kmp_msg_null); + KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime / multiplier); + } else if (__kmp_dflt_blocktime > KMP_MAX_BLOCKTIME) { + __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME; + __kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value), + __kmp_msg_null); + KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier); + } + __kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified. + } +#if KMP_USE_MONITOR + // calculate number of monitor thread wakeup intervals corresponding to + // blocktime. 
+ __kmp_monitor_wakeups = + KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); + __kmp_bt_intervals = + KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); +#endif + K_DIAG(1, ("__kmp_env_blocktime == %d\n", __kmp_env_blocktime)); + if (__kmp_env_blocktime) { + K_DIAG(1, ("__kmp_dflt_blocktime == %d\n", __kmp_dflt_blocktime)); + } +} // __kmp_stg_parse_blocktime + +static void __kmp_stg_print_blocktime(kmp_str_buf_t *buffer, char const *name, + void *data) { + int num = __kmp_dflt_blocktime; + if (__kmp_blocktime_units == 'm') { + num = num / 1000; + } + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s=", name); + } + __kmp_str_buf_print(buffer, "%d", num); + __kmp_str_buf_print(buffer, "%cs\n", __kmp_blocktime_units); +} // __kmp_stg_print_blocktime + +// ----------------------------------------------------------------------------- +// KMP_DUPLICATE_LIB_OK + +static void __kmp_stg_parse_duplicate_lib_ok(char const *name, + char const *value, void *data) { + /* actually this variable is not supported, put here for compatibility with + earlier builds and for static/dynamic combination */ + __kmp_stg_parse_bool(name, value, &__kmp_duplicate_library_ok); +} // __kmp_stg_parse_duplicate_lib_ok + +static void __kmp_stg_print_duplicate_lib_ok(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_duplicate_library_ok); +} // __kmp_stg_print_duplicate_lib_ok + +// ----------------------------------------------------------------------------- +// KMP_INHERIT_FP_CONTROL + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +static void __kmp_stg_parse_inherit_fp_control(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_inherit_fp_control); +} // __kmp_stg_parse_inherit_fp_control + +static void __kmp_stg_print_inherit_fp_control(kmp_str_buf_t *buffer, + char const *name, void *data) { +#if KMP_DEBUG + __kmp_stg_print_bool(buffer, name, __kmp_inherit_fp_control); +#endif /* KMP_DEBUG */ +} // __kmp_stg_print_inherit_fp_control + +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +// Used for OMP_WAIT_POLICY +static char const *blocktime_str = NULL; + +// ----------------------------------------------------------------------------- +// KMP_LIBRARY, OMP_WAIT_POLICY + +static void __kmp_stg_parse_wait_policy(char const *name, char const *value, + void *data) { + + kmp_stg_wp_data_t *wait = (kmp_stg_wp_data_t *)data; + int rc; + + rc = __kmp_stg_check_rivals(name, value, wait->rivals); + if (rc) { + return; + } + + if (wait->omp) { + if (__kmp_str_match("ACTIVE", 1, value)) { + __kmp_library = library_turnaround; + if (blocktime_str == NULL) { + // KMP_BLOCKTIME not specified, so set default to "infinite". + __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME; + } + } else if (__kmp_str_match("PASSIVE", 1, value)) { + __kmp_library = library_throughput; + __kmp_wpolicy_passive = true; /* allow sleep while active tasking */ + if (blocktime_str == NULL) { + // KMP_BLOCKTIME not specified, so set default to 0. + __kmp_dflt_blocktime = 0; + } + } else { + KMP_WARNING(StgInvalidValue, name, value); + } + } else { + if (__kmp_str_match("serial", 1, value)) { /* S */ + __kmp_library = library_serial; + } else if (__kmp_str_match("throughput", 2, value)) { /* TH */ + __kmp_library = library_throughput; + if (blocktime_str == NULL) { + // KMP_BLOCKTIME not specified, so set default to 0. 
+ __kmp_dflt_blocktime = 0; + } + } else if (__kmp_str_match("turnaround", 2, value)) { /* TU */ + __kmp_library = library_turnaround; + } else if (__kmp_str_match("dedicated", 1, value)) { /* D */ + __kmp_library = library_turnaround; + } else if (__kmp_str_match("multiuser", 1, value)) { /* M */ + __kmp_library = library_throughput; + if (blocktime_str == NULL) { + // KMP_BLOCKTIME not specified, so set default to 0. + __kmp_dflt_blocktime = 0; + } + } else { + KMP_WARNING(StgInvalidValue, name, value); + } + } +} // __kmp_stg_parse_wait_policy + +static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, + void *data) { + + kmp_stg_wp_data_t *wait = (kmp_stg_wp_data_t *)data; + char const *value = NULL; + + if (wait->omp) { + switch (__kmp_library) { + case library_turnaround: { + value = "ACTIVE"; + } break; + case library_throughput: { + value = "PASSIVE"; + } break; + case library_none: + case library_serial: { + value = NULL; + } break; + } + } else { + switch (__kmp_library) { + case library_serial: { + value = "serial"; + } break; + case library_turnaround: { + value = "turnaround"; + } break; + case library_throughput: { + value = "throughput"; + } break; + case library_none: { + value = NULL; + } break; + } + } + if (value != NULL) { + __kmp_stg_print_str(buffer, name, value); + } + +} // __kmp_stg_print_wait_policy + +#if KMP_USE_MONITOR +// ----------------------------------------------------------------------------- +// KMP_MONITOR_STACKSIZE + +static void __kmp_stg_parse_monitor_stacksize(char const *name, + char const *value, void *data) { + __kmp_stg_parse_size(name, value, __kmp_sys_min_stksize, KMP_MAX_STKSIZE, + NULL, &__kmp_monitor_stksize, 1); +} // __kmp_stg_parse_monitor_stacksize + +static void __kmp_stg_print_monitor_stacksize(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_env_format) { + if (__kmp_monitor_stksize > 0) + KMP_STR_BUF_PRINT_NAME_EX(name); + else + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + if (__kmp_monitor_stksize > 0) { + __kmp_str_buf_print_size(buffer, __kmp_monitor_stksize); + } else { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } + if (__kmp_env_format && __kmp_monitor_stksize) { + __kmp_str_buf_print(buffer, "'\n"); + } +} // __kmp_stg_print_monitor_stacksize +#endif // KMP_USE_MONITOR + +// ----------------------------------------------------------------------------- +// KMP_SETTINGS + +static void __kmp_stg_parse_settings(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_settings); +} // __kmp_stg_parse_settings + +static void __kmp_stg_print_settings(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_settings); +} // __kmp_stg_print_settings + +// ----------------------------------------------------------------------------- +// KMP_STACKPAD + +static void __kmp_stg_parse_stackpad(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, // Env var name + value, // Env var value + KMP_MIN_STKPADDING, // Min value + KMP_MAX_STKPADDING, // Max value + &__kmp_stkpadding // Var to initialize + ); +} // __kmp_stg_parse_stackpad + +static void __kmp_stg_print_stackpad(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_stkpadding); +} // __kmp_stg_print_stackpad + +// ----------------------------------------------------------------------------- +// KMP_STACKOFFSET + +static 
void __kmp_stg_parse_stackoffset(char const *name, char const *value, + void *data) { + __kmp_stg_parse_size(name, // Env var name + value, // Env var value + KMP_MIN_STKOFFSET, // Min value + KMP_MAX_STKOFFSET, // Max value + NULL, // + &__kmp_stkoffset, // Var to initialize + 1); +} // __kmp_stg_parse_stackoffset + +static void __kmp_stg_print_stackoffset(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_size(buffer, name, __kmp_stkoffset); +} // __kmp_stg_print_stackoffset + +// ----------------------------------------------------------------------------- +// KMP_STACKSIZE, OMP_STACKSIZE, GOMP_STACKSIZE + +static void __kmp_stg_parse_stacksize(char const *name, char const *value, + void *data) { + + kmp_stg_ss_data_t *stacksize = (kmp_stg_ss_data_t *)data; + int rc; + + rc = __kmp_stg_check_rivals(name, value, stacksize->rivals); + if (rc) { + return; + } + __kmp_stg_parse_size(name, // Env var name + value, // Env var value + __kmp_sys_min_stksize, // Min value + KMP_MAX_STKSIZE, // Max value + &__kmp_env_stksize, // + &__kmp_stksize, // Var to initialize + stacksize->factor); + +} // __kmp_stg_parse_stacksize + +// This function is called for printing both KMP_STACKSIZE (factor is 1) and +// OMP_STACKSIZE (factor is 1024). Currently it is not possible to print +// OMP_STACKSIZE value in bytes. We can consider adding this possibility by a +// customer request in future. +static void __kmp_stg_print_stacksize(kmp_str_buf_t *buffer, char const *name, + void *data) { + kmp_stg_ss_data_t *stacksize = (kmp_stg_ss_data_t *)data; + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + __kmp_str_buf_print_size(buffer, (__kmp_stksize % 1024) + ? __kmp_stksize / stacksize->factor + : __kmp_stksize); + __kmp_str_buf_print(buffer, "'\n"); + } else { + __kmp_str_buf_print(buffer, " %s=", name); + __kmp_str_buf_print_size(buffer, (__kmp_stksize % 1024) + ? __kmp_stksize / stacksize->factor + : __kmp_stksize); + __kmp_str_buf_print(buffer, "\n"); + } +} // __kmp_stg_print_stacksize + +// ----------------------------------------------------------------------------- +// KMP_VERSION + +static void __kmp_stg_parse_version(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_version); +} // __kmp_stg_parse_version + +static void __kmp_stg_print_version(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_version); +} // __kmp_stg_print_version + +// ----------------------------------------------------------------------------- +// KMP_WARNINGS + +static void __kmp_stg_parse_warnings(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_generate_warnings); + if (__kmp_generate_warnings != kmp_warnings_off) { + // AC: only 0/1 values documented, so reset to explicit to distinguish from + // default setting + __kmp_generate_warnings = kmp_warnings_explicit; + } +} // __kmp_stg_parse_warnings + +static void __kmp_stg_print_warnings(kmp_str_buf_t *buffer, char const *name, + void *data) { + // AC: TODO: change to print_int? 
(needs documentation change) + __kmp_stg_print_bool(buffer, name, __kmp_generate_warnings); +} // __kmp_stg_print_warnings + +// ----------------------------------------------------------------------------- +// KMP_NESTING_MODE + +static void __kmp_stg_parse_nesting_mode(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_nesting_mode); +#if KMP_AFFINITY_SUPPORTED && KMP_USE_HWLOC + if (__kmp_nesting_mode > 0) + __kmp_affinity_top_method = affinity_top_method_hwloc; +#endif +} // __kmp_stg_parse_nesting_mode + +static void __kmp_stg_print_nesting_mode(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + __kmp_str_buf_print(buffer, "=%d\n", __kmp_nesting_mode); +} // __kmp_stg_print_nesting_mode + +// ----------------------------------------------------------------------------- +// OMP_NESTED, OMP_NUM_THREADS + +static void __kmp_stg_parse_nested(char const *name, char const *value, + void *data) { + int nested; + KMP_INFORM(EnvVarDeprecated, name, "OMP_MAX_ACTIVE_LEVELS"); + __kmp_stg_parse_bool(name, value, &nested); + if (nested) { + if (!__kmp_dflt_max_active_levels_set) + __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; + } else { // nesting explicitly turned off + __kmp_dflt_max_active_levels = 1; + __kmp_dflt_max_active_levels_set = true; + } +} // __kmp_stg_parse_nested + +static void __kmp_stg_print_nested(kmp_str_buf_t *buffer, char const *name, + void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + __kmp_str_buf_print(buffer, ": deprecated; max-active-levels-var=%d\n", + __kmp_dflt_max_active_levels); +} // __kmp_stg_print_nested + +static void __kmp_parse_nested_num_threads(const char *var, const char *env, + kmp_nested_nthreads_t *nth_array) { + const char *next = env; + const char *scan = next; + + int total = 0; // Count elements that were set. It'll be used as an array size + int prev_comma = FALSE; // For correct processing sequential commas + + // Count the number of values in the env. 
var string + for (;;) { + SKIP_WS(next); + + if (*next == '\0') { + break; + } + // Next character is not an integer or not a comma => end of list + if (((*next < '0') || (*next > '9')) && (*next != ',')) { + KMP_WARNING(NthSyntaxError, var, env); + return; + } + // The next character is ',' + if (*next == ',') { + // ',' is the first character + if (total == 0 || prev_comma) { + total++; + } + prev_comma = TRUE; + next++; // skip ',' + SKIP_WS(next); + } + // Next character is a digit + if (*next >= '0' && *next <= '9') { + prev_comma = FALSE; + SKIP_DIGITS(next); + total++; + const char *tmp = next; + SKIP_WS(tmp); + if ((*next == ' ' || *next == '\t') && (*tmp >= '0' && *tmp <= '9')) { + KMP_WARNING(NthSpacesNotAllowed, var, env); + return; + } + } + } + if (!__kmp_dflt_max_active_levels_set && total > 1) + __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; + KMP_DEBUG_ASSERT(total > 0); + if (total <= 0) { + KMP_WARNING(NthSyntaxError, var, env); + return; + } + + // Check if the nested nthreads array exists + if (!nth_array->nth) { + // Allocate an array of double size + nth_array->nth = (int *)KMP_INTERNAL_MALLOC(sizeof(int) * total * 2); + if (nth_array->nth == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + nth_array->size = total * 2; + } else { + if (nth_array->size < total) { + // Increase the array size + do { + nth_array->size *= 2; + } while (nth_array->size < total); + + nth_array->nth = (int *)KMP_INTERNAL_REALLOC( + nth_array->nth, sizeof(int) * nth_array->size); + if (nth_array->nth == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + } + } + nth_array->used = total; + int i = 0; + + prev_comma = FALSE; + total = 0; + // Save values in the array + for (;;) { + SKIP_WS(scan); + if (*scan == '\0') { + break; + } + // The next character is ',' + if (*scan == ',') { + // ',' in the beginning of the list + if (total == 0) { + // The value is supposed to be equal to __kmp_avail_proc but it is + // unknown at the moment. + // So let's put a placeholder (#threads = 0) to correct it later. + nth_array->nth[i++] = 0; + total++; + } else if (prev_comma) { + // Num threads is inherited from the previous level + nth_array->nth[i] = nth_array->nth[i - 1]; + i++; + total++; + } + prev_comma = TRUE; + scan++; // skip ',' + SKIP_WS(scan); + } + // Next character is a digit + if (*scan >= '0' && *scan <= '9') { + int num; + const char *buf = scan; + char const *msg = NULL; + prev_comma = FALSE; + SKIP_DIGITS(scan); + total++; + + num = __kmp_str_to_int(buf, *scan); + if (num < KMP_MIN_NTH) { + msg = KMP_I18N_STR(ValueTooSmall); + num = KMP_MIN_NTH; + } else if (num > __kmp_sys_max_nth) { + msg = KMP_I18N_STR(ValueTooLarge); + num = __kmp_sys_max_nth; + } + if (msg != NULL) { + // Message is not empty. Print warning. + KMP_WARNING(ParseSizeIntWarn, var, env, msg); + KMP_INFORM(Using_int_Value, var, num); + } + nth_array->nth[i++] = num; + } + } +} + +static void __kmp_stg_parse_num_threads(char const *name, char const *value, + void *data) { + // TODO: Remove this option. OMP_NUM_THREADS is a list of positive integers! 
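+  // Editor's note (illustrative, not upstream): __kmp_parse_nested_num_threads
+  // above accepts one entry per nesting level, e.g.
+  //   OMP_NUM_THREADS="8,4,2" -> nth = {8, 4, 2}
+  //   OMP_NUM_THREADS=",4"    -> nth = {0, 4}  (0 is a placeholder later
+  //                              replaced by the number of available procs)
+  //   OMP_NUM_THREADS="8,,2"  -> nth = {8, 8, 2} (an empty entry inherits the
+  //                              previous level)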
+ if (!__kmp_strcasecmp_with_sentinel("all", value, 0)) { + // The array of 1 element + __kmp_nested_nth.nth = (int *)KMP_INTERNAL_MALLOC(sizeof(int)); + __kmp_nested_nth.size = __kmp_nested_nth.used = 1; + __kmp_nested_nth.nth[0] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = + __kmp_xproc; + } else { + __kmp_parse_nested_num_threads(name, value, &__kmp_nested_nth); + if (__kmp_nested_nth.nth) { + __kmp_dflt_team_nth = __kmp_nested_nth.nth[0]; + if (__kmp_dflt_team_nth_ub < __kmp_dflt_team_nth) { + __kmp_dflt_team_nth_ub = __kmp_dflt_team_nth; + } + } + } + K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth)); +} // __kmp_stg_parse_num_threads + +#if OMPX_TASKGRAPH +static void __kmp_stg_parse_max_tdgs(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_max_tdgs); +} // __kmp_stg_parse_max_tdgs + +static void __kmp_std_print_max_tdgs(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_max_tdgs); +} // __kmp_std_print_max_tdgs + +static void __kmp_stg_parse_tdg_dot(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_tdg_dot); +} // __kmp_stg_parse_tdg_dot + +static void __kmp_stg_print_tdg_dot(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_tdg_dot); +} // __kmp_stg_print_tdg_dot +#endif + +static void __kmp_stg_parse_num_hidden_helper_threads(char const *name, + char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, 16, &__kmp_hidden_helper_threads_num); + // If the number of hidden helper threads is zero, we disable hidden helper + // task + if (__kmp_hidden_helper_threads_num == 0) { + __kmp_enable_hidden_helper = FALSE; + } else { + // Since the main thread of hidden helper team does not participate + // in tasks execution let's increment the number of threads by one + // so that requested number of threads do actual job. + __kmp_hidden_helper_threads_num++; + } +} // __kmp_stg_parse_num_hidden_helper_threads + +static void __kmp_stg_print_num_hidden_helper_threads(kmp_str_buf_t *buffer, + char const *name, + void *data) { + if (__kmp_hidden_helper_threads_num == 0) { + __kmp_stg_print_int(buffer, name, __kmp_hidden_helper_threads_num); + } else { + KMP_DEBUG_ASSERT(__kmp_hidden_helper_threads_num > 1); + // Let's exclude the main thread of hidden helper team and print + // number of worker threads those do actual job. 
+ __kmp_stg_print_int(buffer, name, __kmp_hidden_helper_threads_num - 1); + } +} // __kmp_stg_print_num_hidden_helper_threads + +static void __kmp_stg_parse_use_hidden_helper(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_enable_hidden_helper); +#if !KMP_OS_LINUX + __kmp_enable_hidden_helper = FALSE; + K_DIAG(1, + ("__kmp_stg_parse_use_hidden_helper: Disable hidden helper task on " + "non-Linux platform although it is enabled by user explicitly.\n")); +#endif +} // __kmp_stg_parse_use_hidden_helper + +static void __kmp_stg_print_use_hidden_helper(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_enable_hidden_helper); +} // __kmp_stg_print_use_hidden_helper + +static void __kmp_stg_print_num_threads(kmp_str_buf_t *buffer, char const *name, + void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + if (__kmp_nested_nth.used) { + kmp_str_buf_t buf; + __kmp_str_buf_init(&buf); + for (int i = 0; i < __kmp_nested_nth.used; i++) { + __kmp_str_buf_print(&buf, "%d", __kmp_nested_nth.nth[i]); + if (i < __kmp_nested_nth.used - 1) { + __kmp_str_buf_print(&buf, ","); + } + } + __kmp_str_buf_print(buffer, "='%s'\n", buf.str); + __kmp_str_buf_free(&buf); + } else { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } +} // __kmp_stg_print_num_threads + +// ----------------------------------------------------------------------------- +// OpenMP 3.0: KMP_TASKING, OMP_MAX_ACTIVE_LEVELS, + +static void __kmp_stg_parse_tasking(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, (int)tskm_max, + (int *)&__kmp_tasking_mode); +} // __kmp_stg_parse_tasking + +static void __kmp_stg_print_tasking(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_tasking_mode); +} // __kmp_stg_print_tasking + +static void __kmp_stg_parse_task_stealing(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, 1, + (int *)&__kmp_task_stealing_constraint); +} // __kmp_stg_parse_task_stealing + +static void __kmp_stg_print_task_stealing(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_task_stealing_constraint); +} // __kmp_stg_print_task_stealing + +static void __kmp_stg_parse_max_active_levels(char const *name, + char const *value, void *data) { + kmp_uint64 tmp_dflt = 0; + char const *msg = NULL; + if (!__kmp_dflt_max_active_levels_set) { + // Don't overwrite __kmp_dflt_max_active_levels if we get an invalid setting + __kmp_str_to_uint(value, &tmp_dflt, &msg); + if (msg != NULL) { // invalid setting; print warning and ignore + KMP_WARNING(ParseSizeIntWarn, name, value, msg); + } else if (tmp_dflt > KMP_MAX_ACTIVE_LEVELS_LIMIT) { + // invalid setting; print warning and ignore + msg = KMP_I18N_STR(ValueTooLarge); + KMP_WARNING(ParseSizeIntWarn, name, value, msg); + } else { // valid setting + __kmp_type_convert(tmp_dflt, &(__kmp_dflt_max_active_levels)); + __kmp_dflt_max_active_levels_set = true; + } + } +} // __kmp_stg_parse_max_active_levels + +static void __kmp_stg_print_max_active_levels(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_dflt_max_active_levels); +} // __kmp_stg_print_max_active_levels + +// ----------------------------------------------------------------------------- +// OpenMP 4.0: OMP_DEFAULT_DEVICE +static void 
__kmp_stg_parse_default_device(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, KMP_MAX_DEFAULT_DEVICE_LIMIT, + &__kmp_default_device); +} // __kmp_stg_parse_default_device + +static void __kmp_stg_print_default_device(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_default_device); +} // __kmp_stg_print_default_device + +// ----------------------------------------------------------------------------- +// OpenMP 5.0: OMP_TARGET_OFFLOAD +static void __kmp_stg_parse_target_offload(char const *name, char const *value, + void *data) { + kmp_trimmed_str_t value_str(value); + const char *scan = value_str.get(); + __kmp_target_offload = tgt_default; + + if (*scan == '\0') + return; + + if (!__kmp_strcasecmp_with_sentinel("mandatory", scan, 0)) { + __kmp_target_offload = tgt_mandatory; + } else if (!__kmp_strcasecmp_with_sentinel("disabled", scan, 0)) { + __kmp_target_offload = tgt_disabled; + } else if (!__kmp_strcasecmp_with_sentinel("default", scan, 0)) { + __kmp_target_offload = tgt_default; + } else { + KMP_WARNING(SyntaxErrorUsing, name, "DEFAULT"); + } +} // __kmp_stg_parse_target_offload + +static void __kmp_stg_print_target_offload(kmp_str_buf_t *buffer, + char const *name, void *data) { + const char *value = NULL; + if (__kmp_target_offload == tgt_default) + value = "DEFAULT"; + else if (__kmp_target_offload == tgt_mandatory) + value = "MANDATORY"; + else if (__kmp_target_offload == tgt_disabled) + value = "DISABLED"; + KMP_DEBUG_ASSERT(value); + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + __kmp_str_buf_print(buffer, "=%s\n", value); +} // __kmp_stg_print_target_offload + +// ----------------------------------------------------------------------------- +// OpenMP 4.5: OMP_MAX_TASK_PRIORITY +static void __kmp_stg_parse_max_task_priority(char const *name, + char const *value, void *data) { + __kmp_stg_parse_int(name, value, 0, KMP_MAX_TASK_PRIORITY_LIMIT, + &__kmp_max_task_priority); +} // __kmp_stg_parse_max_task_priority + +static void __kmp_stg_print_max_task_priority(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_max_task_priority); +} // __kmp_stg_print_max_task_priority + +// KMP_TASKLOOP_MIN_TASKS +// taskloop threshold to switch from recursive to linear tasks creation +static void __kmp_stg_parse_taskloop_min_tasks(char const *name, + char const *value, void *data) { + int tmp = 0; + __kmp_stg_parse_int(name, value, 0, INT_MAX, &tmp); + __kmp_taskloop_min_tasks = tmp; +} // __kmp_stg_parse_taskloop_min_tasks + +static void __kmp_stg_print_taskloop_min_tasks(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_uint64(buffer, name, __kmp_taskloop_min_tasks); +} // __kmp_stg_print_taskloop_min_tasks + +// ----------------------------------------------------------------------------- +// KMP_DISP_NUM_BUFFERS +static void __kmp_stg_parse_disp_buffers(char const *name, char const *value, + void *data) { + if (TCR_4(__kmp_init_serial)) { + KMP_WARNING(EnvSerialWarn, name); + return; + } // read value before serial initialization only + __kmp_stg_parse_int(name, value, KMP_MIN_DISP_NUM_BUFF, KMP_MAX_DISP_NUM_BUFF, + &__kmp_dispatch_num_buffers); +} // __kmp_stg_parse_disp_buffers + +static void __kmp_stg_print_disp_buffers(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_dispatch_num_buffers); +} // 
__kmp_stg_print_disp_buffers + +#if KMP_NESTED_HOT_TEAMS +// ----------------------------------------------------------------------------- +// KMP_HOT_TEAMS_MAX_LEVEL, KMP_HOT_TEAMS_MODE + +static void __kmp_stg_parse_hot_teams_level(char const *name, char const *value, + void *data) { + if (TCR_4(__kmp_init_parallel)) { + KMP_WARNING(EnvParallelWarn, name); + return; + } // read value before first parallel only + __kmp_stg_parse_int(name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT, + &__kmp_hot_teams_max_level); +} // __kmp_stg_parse_hot_teams_level + +static void __kmp_stg_print_hot_teams_level(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_hot_teams_max_level); +} // __kmp_stg_print_hot_teams_level + +static void __kmp_stg_parse_hot_teams_mode(char const *name, char const *value, + void *data) { + if (TCR_4(__kmp_init_parallel)) { + KMP_WARNING(EnvParallelWarn, name); + return; + } // read value before first parallel only + __kmp_stg_parse_int(name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT, + &__kmp_hot_teams_mode); +} // __kmp_stg_parse_hot_teams_mode + +static void __kmp_stg_print_hot_teams_mode(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_hot_teams_mode); +} // __kmp_stg_print_hot_teams_mode + +#endif // KMP_NESTED_HOT_TEAMS + +// ----------------------------------------------------------------------------- +// KMP_HANDLE_SIGNALS + +#if KMP_HANDLE_SIGNALS + +static void __kmp_stg_parse_handle_signals(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_handle_signals); +} // __kmp_stg_parse_handle_signals + +static void __kmp_stg_print_handle_signals(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_handle_signals); +} // __kmp_stg_print_handle_signals + +#endif // KMP_HANDLE_SIGNALS + +// ----------------------------------------------------------------------------- +// KMP_X_DEBUG, KMP_DEBUG, KMP_DEBUG_BUF_*, KMP_DIAG + +#ifdef KMP_DEBUG + +#define KMP_STG_X_DEBUG(x) \ + static void __kmp_stg_parse_##x##_debug(char const *name, char const *value, \ + void *data) { \ + __kmp_stg_parse_int(name, value, 0, INT_MAX, &kmp_##x##_debug); \ + } /* __kmp_stg_parse_x_debug */ \ + static void __kmp_stg_print_##x##_debug(kmp_str_buf_t *buffer, \ + char const *name, void *data) { \ + __kmp_stg_print_int(buffer, name, kmp_##x##_debug); \ + } /* __kmp_stg_print_x_debug */ + +KMP_STG_X_DEBUG(a) +KMP_STG_X_DEBUG(b) +KMP_STG_X_DEBUG(c) +KMP_STG_X_DEBUG(d) +KMP_STG_X_DEBUG(e) +KMP_STG_X_DEBUG(f) + +#undef KMP_STG_X_DEBUG + +static void __kmp_stg_parse_debug(char const *name, char const *value, + void *data) { + int debug = 0; + __kmp_stg_parse_int(name, value, 0, INT_MAX, &debug); + if (kmp_a_debug < debug) { + kmp_a_debug = debug; + } + if (kmp_b_debug < debug) { + kmp_b_debug = debug; + } + if (kmp_c_debug < debug) { + kmp_c_debug = debug; + } + if (kmp_d_debug < debug) { + kmp_d_debug = debug; + } + if (kmp_e_debug < debug) { + kmp_e_debug = debug; + } + if (kmp_f_debug < debug) { + kmp_f_debug = debug; + } +} // __kmp_stg_parse_debug + +static void __kmp_stg_parse_debug_buf(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_debug_buf); + // !!! TODO: Move buffer initialization of this file! It may works + // incorrectly if KMP_DEBUG_BUF is parsed before KMP_DEBUG_BUF_LINES or + // KMP_DEBUG_BUF_CHARS. 
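+  // The buffer holds __kmp_debug_buf_lines records of __kmp_debug_buf_chars
+  // bytes each; only the first byte of every record is zeroed here.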
+ if (__kmp_debug_buf) { + int i; + int elements = __kmp_debug_buf_lines * __kmp_debug_buf_chars; + + /* allocate and initialize all entries in debug buffer to empty */ + __kmp_debug_buffer = (char *)__kmp_page_allocate(elements * sizeof(char)); + for (i = 0; i < elements; i += __kmp_debug_buf_chars) + __kmp_debug_buffer[i] = '\0'; + + __kmp_debug_count = 0; + } + K_DIAG(1, ("__kmp_debug_buf = %d\n", __kmp_debug_buf)); +} // __kmp_stg_parse_debug_buf + +static void __kmp_stg_print_debug_buf(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_debug_buf); +} // __kmp_stg_print_debug_buf + +static void __kmp_stg_parse_debug_buf_atomic(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_debug_buf_atomic); +} // __kmp_stg_parse_debug_buf_atomic + +static void __kmp_stg_print_debug_buf_atomic(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_debug_buf_atomic); +} // __kmp_stg_print_debug_buf_atomic + +static void __kmp_stg_parse_debug_buf_chars(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, KMP_DEBUG_BUF_CHARS_MIN, INT_MAX, + &__kmp_debug_buf_chars); +} // __kmp_stg_debug_parse_buf_chars + +static void __kmp_stg_print_debug_buf_chars(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_debug_buf_chars); +} // __kmp_stg_print_debug_buf_chars + +static void __kmp_stg_parse_debug_buf_lines(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, KMP_DEBUG_BUF_LINES_MIN, INT_MAX, + &__kmp_debug_buf_lines); +} // __kmp_stg_parse_debug_buf_lines + +static void __kmp_stg_print_debug_buf_lines(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_debug_buf_lines); +} // __kmp_stg_print_debug_buf_lines + +static void __kmp_stg_parse_diag(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &kmp_diag); +} // __kmp_stg_parse_diag + +static void __kmp_stg_print_diag(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, kmp_diag); +} // __kmp_stg_print_diag + +#endif // KMP_DEBUG + +// ----------------------------------------------------------------------------- +// KMP_ALIGN_ALLOC + +static void __kmp_stg_parse_align_alloc(char const *name, char const *value, + void *data) { + __kmp_stg_parse_size(name, value, CACHE_LINE, INT_MAX, NULL, + &__kmp_align_alloc, 1); +} // __kmp_stg_parse_align_alloc + +static void __kmp_stg_print_align_alloc(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_size(buffer, name, __kmp_align_alloc); +} // __kmp_stg_print_align_alloc + +// ----------------------------------------------------------------------------- +// KMP_PLAIN_BARRIER, KMP_FORKJOIN_BARRIER, KMP_REDUCTION_BARRIER + +// TODO: Remove __kmp_barrier_branch_bit_env_name varibale, remove loops from +// parse and print functions, pass required info through data argument. 
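+// The accepted value format is '<gather>[,<release>]' branch bits, e.g.
+// KMP_FORKJOIN_BARRIER='2,2'. An omitted release part falls back to
+// __kmp_barrier_release_bb_dflt, and out-of-range values are reset to the
+// defaults with a warning.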
+ +static void __kmp_stg_parse_barrier_branch_bit(char const *name, + char const *value, void *data) { + const char *var; + + /* ---------- Barrier branch bit control ------------ */ + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + var = __kmp_barrier_branch_bit_env_name[i]; + if ((strcmp(var, name) == 0) && (value != 0)) { + char *comma; + + comma = CCAST(char *, strchr(value, ',')); + __kmp_barrier_gather_branch_bits[i] = + (kmp_uint32)__kmp_str_to_int(value, ','); + /* is there a specified release parameter? */ + if (comma == NULL) { + __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; + } else { + __kmp_barrier_release_branch_bits[i] = + (kmp_uint32)__kmp_str_to_int(comma + 1, 0); + + if (__kmp_barrier_release_branch_bits[i] > KMP_MAX_BRANCH_BITS) { + __kmp_msg(kmp_ms_warning, + KMP_MSG(BarrReleaseValueInvalid, name, comma + 1), + __kmp_msg_null); + __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; + } + } + if (__kmp_barrier_gather_branch_bits[i] > KMP_MAX_BRANCH_BITS) { + KMP_WARNING(BarrGatherValueInvalid, name, value); + KMP_INFORM(Using_uint_Value, name, __kmp_barrier_gather_bb_dflt); + __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; + } + } + K_DIAG(1, ("%s == %d,%d\n", __kmp_barrier_branch_bit_env_name[i], + __kmp_barrier_gather_branch_bits[i], + __kmp_barrier_release_branch_bits[i])) + } +} // __kmp_stg_parse_barrier_branch_bit + +static void __kmp_stg_print_barrier_branch_bit(kmp_str_buf_t *buffer, + char const *name, void *data) { + const char *var; + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + var = __kmp_barrier_branch_bit_env_name[i]; + if (strcmp(var, name) == 0) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(__kmp_barrier_branch_bit_env_name[i]); + } else { + __kmp_str_buf_print(buffer, " %s='", + __kmp_barrier_branch_bit_env_name[i]); + } + __kmp_str_buf_print(buffer, "%d,%d'\n", + __kmp_barrier_gather_branch_bits[i], + __kmp_barrier_release_branch_bits[i]); + } + } +} // __kmp_stg_print_barrier_branch_bit + +// ---------------------------------------------------------------------------- +// KMP_PLAIN_BARRIER_PATTERN, KMP_FORKJOIN_BARRIER_PATTERN, +// KMP_REDUCTION_BARRIER_PATTERN + +// TODO: Remove __kmp_barrier_pattern_name variable, remove loops from parse and +// print functions, pass required data to functions through data argument. 
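+// The accepted value format is '<gather>[,<release>]' pattern names, e.g.
+// KMP_PLAIN_BARRIER_PATTERN='dist,dist'. If any barrier selects the dist
+// pattern, the parser below switches every barrier over to dist.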
+ +static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value, + void *data) { + const char *var; + /* ---------- Barrier method control ------------ */ + + static int dist_req = 0, non_dist_req = 0; + static bool warn = 1; + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + var = __kmp_barrier_pattern_env_name[i]; + + if ((strcmp(var, name) == 0) && (value != 0)) { + int j; + char *comma = CCAST(char *, strchr(value, ',')); + + /* handle first parameter: gather pattern */ + for (j = bp_linear_bar; j < bp_last_bar; j++) { + if (__kmp_match_with_sentinel(__kmp_barrier_pattern_name[j], value, 1, + ',')) { + if (j == bp_dist_bar) { + dist_req++; + } else { + non_dist_req++; + } + __kmp_barrier_gather_pattern[i] = (kmp_bar_pat_e)j; + break; + } + } + if (j == bp_last_bar) { + KMP_WARNING(BarrGatherValueInvalid, name, value); + KMP_INFORM(Using_str_Value, name, + __kmp_barrier_pattern_name[bp_linear_bar]); + } + + /* handle second parameter: release pattern */ + if (comma != NULL) { + for (j = bp_linear_bar; j < bp_last_bar; j++) { + if (__kmp_str_match(__kmp_barrier_pattern_name[j], 1, comma + 1)) { + if (j == bp_dist_bar) { + dist_req++; + } else { + non_dist_req++; + } + __kmp_barrier_release_pattern[i] = (kmp_bar_pat_e)j; + break; + } + } + if (j == bp_last_bar) { + __kmp_msg(kmp_ms_warning, + KMP_MSG(BarrReleaseValueInvalid, name, comma + 1), + __kmp_msg_null); + KMP_INFORM(Using_str_Value, name, + __kmp_barrier_pattern_name[bp_linear_bar]); + } + } + } + } + if (dist_req != 0) { + // set all barriers to dist + if ((non_dist_req != 0) && warn) { + KMP_INFORM(BarrierPatternOverride, name, + __kmp_barrier_pattern_name[bp_dist_bar]); + warn = 0; + } + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + if (__kmp_barrier_release_pattern[i] != bp_dist_bar) + __kmp_barrier_release_pattern[i] = bp_dist_bar; + if (__kmp_barrier_gather_pattern[i] != bp_dist_bar) + __kmp_barrier_gather_pattern[i] = bp_dist_bar; + } + } +} // __kmp_stg_parse_barrier_pattern + +static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer, + char const *name, void *data) { + const char *var; + for (int i = bs_plain_barrier; i < bs_last_barrier; i++) { + var = __kmp_barrier_pattern_env_name[i]; + if (strcmp(var, name) == 0) { + int j = __kmp_barrier_gather_pattern[i]; + int k = __kmp_barrier_release_pattern[i]; + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(__kmp_barrier_pattern_env_name[i]); + } else { + __kmp_str_buf_print(buffer, " %s='", + __kmp_barrier_pattern_env_name[i]); + } + KMP_DEBUG_ASSERT(j < bp_last_bar && k < bp_last_bar); + __kmp_str_buf_print(buffer, "%s,%s'\n", __kmp_barrier_pattern_name[j], + __kmp_barrier_pattern_name[k]); + } + } +} // __kmp_stg_print_barrier_pattern + +// ----------------------------------------------------------------------------- +// KMP_ABORT_DELAY + +static void __kmp_stg_parse_abort_delay(char const *name, char const *value, + void *data) { + // Units of KMP_DELAY_ABORT are seconds, units of __kmp_abort_delay is + // milliseconds. 
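+  // e.g. KMP_ABORT_DELAY=5 stores 5000 in __kmp_abort_delay.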
+ int delay = __kmp_abort_delay / 1000; + __kmp_stg_parse_int(name, value, 0, INT_MAX / 1000, &delay); + __kmp_abort_delay = delay * 1000; +} // __kmp_stg_parse_abort_delay + +static void __kmp_stg_print_abort_delay(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_abort_delay); +} // __kmp_stg_print_abort_delay + +// ----------------------------------------------------------------------------- +// KMP_CPUINFO_FILE + +static void __kmp_stg_parse_cpuinfo_file(char const *name, char const *value, + void *data) { +#if KMP_AFFINITY_SUPPORTED + __kmp_stg_parse_str(name, value, &__kmp_cpuinfo_file); + K_DIAG(1, ("__kmp_cpuinfo_file == %s\n", __kmp_cpuinfo_file)); +#endif +} //__kmp_stg_parse_cpuinfo_file + +static void __kmp_stg_print_cpuinfo_file(kmp_str_buf_t *buffer, + char const *name, void *data) { +#if KMP_AFFINITY_SUPPORTED + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + if (__kmp_cpuinfo_file) { + __kmp_str_buf_print(buffer, "='%s'\n", __kmp_cpuinfo_file); + } else { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } +#endif +} //__kmp_stg_print_cpuinfo_file + +// ----------------------------------------------------------------------------- +// KMP_FORCE_REDUCTION, KMP_DETERMINISTIC_REDUCTION + +static void __kmp_stg_parse_force_reduction(char const *name, char const *value, + void *data) { + kmp_stg_fr_data_t *reduction = (kmp_stg_fr_data_t *)data; + int rc; + + rc = __kmp_stg_check_rivals(name, value, reduction->rivals); + if (rc) { + return; + } + if (reduction->force) { + if (value != 0) { + if (__kmp_str_match("critical", 0, value)) + __kmp_force_reduction_method = critical_reduce_block; + else if (__kmp_str_match("atomic", 0, value)) + __kmp_force_reduction_method = atomic_reduce_block; + else if (__kmp_str_match("tree", 0, value)) + __kmp_force_reduction_method = tree_reduce_block; + else { + KMP_FATAL(UnknownForceReduction, name, value); + } + } + } else { + __kmp_stg_parse_bool(name, value, &__kmp_determ_red); + if (__kmp_determ_red) { + __kmp_force_reduction_method = tree_reduce_block; + } else { + __kmp_force_reduction_method = reduction_method_not_defined; + } + } + K_DIAG(1, ("__kmp_force_reduction_method == %d\n", + __kmp_force_reduction_method)); +} // __kmp_stg_parse_force_reduction + +static void __kmp_stg_print_force_reduction(kmp_str_buf_t *buffer, + char const *name, void *data) { + + kmp_stg_fr_data_t *reduction = (kmp_stg_fr_data_t *)data; + if (reduction->force) { + if (__kmp_force_reduction_method == critical_reduce_block) { + __kmp_stg_print_str(buffer, name, "critical"); + } else if (__kmp_force_reduction_method == atomic_reduce_block) { + __kmp_stg_print_str(buffer, name, "atomic"); + } else if (__kmp_force_reduction_method == tree_reduce_block) { + __kmp_stg_print_str(buffer, name, "tree"); + } else { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } + } else { + __kmp_stg_print_bool(buffer, name, __kmp_determ_red); + } + +} // __kmp_stg_print_force_reduction + +// ----------------------------------------------------------------------------- +// KMP_STORAGE_MAP + +static void __kmp_stg_parse_storage_map(char const *name, char const *value, + void *data) { + if (__kmp_str_match("verbose", 1, value)) { + __kmp_storage_map = TRUE; + __kmp_storage_map_verbose = TRUE; + 
__kmp_storage_map_verbose_specified = TRUE; + + } else { + __kmp_storage_map_verbose = FALSE; + __kmp_stg_parse_bool(name, value, &__kmp_storage_map); // !!! + } +} // __kmp_stg_parse_storage_map + +static void __kmp_stg_print_storage_map(kmp_str_buf_t *buffer, char const *name, + void *data) { + if (__kmp_storage_map_verbose || __kmp_storage_map_verbose_specified) { + __kmp_stg_print_str(buffer, name, "verbose"); + } else { + __kmp_stg_print_bool(buffer, name, __kmp_storage_map); + } +} // __kmp_stg_print_storage_map + +// ----------------------------------------------------------------------------- +// KMP_ALL_THREADPRIVATE + +static void __kmp_stg_parse_all_threadprivate(char const *name, + char const *value, void *data) { + __kmp_stg_parse_int(name, value, + __kmp_allThreadsSpecified ? __kmp_max_nth : 1, + __kmp_max_nth, &__kmp_tp_capacity); +} // __kmp_stg_parse_all_threadprivate + +static void __kmp_stg_print_all_threadprivate(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_tp_capacity); +} + +// ----------------------------------------------------------------------------- +// KMP_FOREIGN_THREADS_THREADPRIVATE + +static void __kmp_stg_parse_foreign_threads_threadprivate(char const *name, + char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_foreign_tp); +} // __kmp_stg_parse_foreign_threads_threadprivate + +static void __kmp_stg_print_foreign_threads_threadprivate(kmp_str_buf_t *buffer, + char const *name, + void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_foreign_tp); +} // __kmp_stg_print_foreign_threads_threadprivate + +// ----------------------------------------------------------------------------- +// KMP_AFFINITY, GOMP_CPU_AFFINITY, KMP_TOPOLOGY_METHOD + +static inline const char * +__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "intel_atom"; + case KMP_HW_CORE_TYPE_CORE: + return "intel_core"; +#endif + } + KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); + KMP_BUILTIN_UNREACHABLE; +} + +#if KMP_AFFINITY_SUPPORTED +// Parse the proc id list. Return TRUE if successful, FALSE otherwise. +static int __kmp_parse_affinity_proc_id_list(const char *var, const char *env, + const char **nextEnv, + char **proclist) { + const char *scan = env; + const char *next = scan; + int empty = TRUE; + + *proclist = NULL; + + for (;;) { + int start, end, stride; + + SKIP_WS(scan); + next = scan; + if (*next == '\0') { + break; + } + + if (*next == '{') { + int num; + next++; // skip '{' + SKIP_WS(next); + scan = next; + + // Read the first integer in the set. + if ((*next < '0') || (*next > '9')) { + KMP_WARNING(AffSyntaxError, var); + return FALSE; + } + SKIP_DIGITS(next); + num = __kmp_str_to_int(scan, *next); + KMP_ASSERT(num >= 0); + + for (;;) { + // Check for end of set. + SKIP_WS(next); + if (*next == '}') { + next++; // skip '}' + break; + } + + // Skip optional comma. + if (*next == ',') { + next++; + } + SKIP_WS(next); + + // Read the next integer in the set. 
+ scan = next; + if ((*next < '0') || (*next > '9')) { + KMP_WARNING(AffSyntaxError, var); + return FALSE; + } + + SKIP_DIGITS(next); + num = __kmp_str_to_int(scan, *next); + KMP_ASSERT(num >= 0); + } + empty = FALSE; + + SKIP_WS(next); + if (*next == ',') { + next++; + } + scan = next; + continue; + } + + // Next character is not an integer => end of list + if ((*next < '0') || (*next > '9')) { + if (empty) { + KMP_WARNING(AffSyntaxError, var); + return FALSE; + } + break; + } + + // Read the first integer. + SKIP_DIGITS(next); + start = __kmp_str_to_int(scan, *next); + KMP_ASSERT(start >= 0); + SKIP_WS(next); + + // If this isn't a range, then go on. + if (*next != '-') { + empty = FALSE; + + // Skip optional comma. + if (*next == ',') { + next++; + } + scan = next; + continue; + } + + // This is a range. Skip over the '-' and read in the 2nd int. + next++; // skip '-' + SKIP_WS(next); + scan = next; + if ((*next < '0') || (*next > '9')) { + KMP_WARNING(AffSyntaxError, var); + return FALSE; + } + SKIP_DIGITS(next); + end = __kmp_str_to_int(scan, *next); + KMP_ASSERT(end >= 0); + + // Check for a stride parameter + stride = 1; + SKIP_WS(next); + if (*next == ':') { + // A stride is specified. Skip over the ':" and read the 3rd int. + int sign = +1; + next++; // skip ':' + SKIP_WS(next); + scan = next; + if (*next == '-') { + sign = -1; + next++; + SKIP_WS(next); + scan = next; + } + if ((*next < '0') || (*next > '9')) { + KMP_WARNING(AffSyntaxError, var); + return FALSE; + } + SKIP_DIGITS(next); + stride = __kmp_str_to_int(scan, *next); + KMP_ASSERT(stride >= 0); + stride *= sign; + } + + // Do some range checks. + if (stride == 0) { + KMP_WARNING(AffZeroStride, var); + return FALSE; + } + if (stride > 0) { + if (start > end) { + KMP_WARNING(AffStartGreaterEnd, var, start, end); + return FALSE; + } + } else { + if (start < end) { + KMP_WARNING(AffStrideLessZero, var, start, end); + return FALSE; + } + } + if ((end - start) / stride > 65536) { + KMP_WARNING(AffRangeTooBig, var, end, start, stride); + return FALSE; + } + + empty = FALSE; + + // Skip optional comma. + SKIP_WS(next); + if (*next == ',') { + next++; + } + scan = next; + } + + *nextEnv = next; + + { + ptrdiff_t len = next - env; + char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char)); + KMP_MEMCPY_S(retlist, (len + 1) * sizeof(char), env, len * sizeof(char)); + retlist[len] = '\0'; + *proclist = retlist; + } + return TRUE; +} + +// If KMP_AFFINITY is specified without a type, then +// __kmp_affinity_notype should point to its setting. +static kmp_setting_t *__kmp_affinity_notype = NULL; + +static void __kmp_parse_affinity_env(char const *name, char const *value, + kmp_affinity_t *out_affinity) { + char *buffer = NULL; // Copy of env var value. + char *buf = NULL; // Buffer for strtok_r() function. + char *next = NULL; // end of token / start of next. + const char *start; // start of current token (for err msgs) + int count = 0; // Counter of parsed integer numbers. + int number[2]; // Parsed numbers. + + // Guards. + int type = 0; + int proclist = 0; + int verbose = 0; + int warnings = 0; + int respect = 0; + int gran = 0; + int dups = 0; + int reset = 0; + bool set = false; + + KMP_ASSERT(value != NULL); + + if (TCR_4(__kmp_init_middle)) { + KMP_WARNING(EnvMiddleWarn, name); + __kmp_env_toPrint(name, 0); + return; + } + __kmp_env_toPrint(name, 1); + + buffer = + __kmp_str_format("%s", value); // Copy env var to keep original intact. + buf = buffer; + SKIP_WS(buf); + +// Helper macros. 
+ +// If we see a parse error, emit a warning and scan to the next ",". +// +// FIXME - there's got to be a better way to print an error +// message, hopefully without overwriting peices of buf. +#define EMIT_WARN(skip, errlist) \ + { \ + char ch; \ + if (skip) { \ + SKIP_TO(next, ','); \ + } \ + ch = *next; \ + *next = '\0'; \ + KMP_WARNING errlist; \ + *next = ch; \ + if (skip) { \ + if (ch == ',') \ + next++; \ + } \ + buf = next; \ + } + +#define _set_param(_guard, _var, _val) \ + { \ + if (_guard == 0) { \ + _var = _val; \ + } else { \ + EMIT_WARN(FALSE, (AffParamDefined, name, start)); \ + } \ + ++_guard; \ + } + +#define set_type(val) _set_param(type, out_affinity->type, val) +#define set_verbose(val) _set_param(verbose, out_affinity->flags.verbose, val) +#define set_warnings(val) \ + _set_param(warnings, out_affinity->flags.warnings, val) +#define set_respect(val) _set_param(respect, out_affinity->flags.respect, val) +#define set_dups(val) _set_param(dups, out_affinity->flags.dups, val) +#define set_proclist(val) _set_param(proclist, out_affinity->proclist, val) +#define set_reset(val) _set_param(reset, out_affinity->flags.reset, val) + +#define set_gran(val, levels) \ + { \ + if (gran == 0) { \ + out_affinity->gran = val; \ + out_affinity->gran_levels = levels; \ + } else { \ + EMIT_WARN(FALSE, (AffParamDefined, name, start)); \ + } \ + ++gran; \ + } + + KMP_DEBUG_ASSERT((__kmp_nested_proc_bind.bind_types != NULL) && + (__kmp_nested_proc_bind.used > 0)); + + while (*buf != '\0') { + start = next = buf; + + if (__kmp_match_str("none", buf, CCAST(const char **, &next))) { + set_type(affinity_none); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + buf = next; + } else if (__kmp_match_str("scatter", buf, CCAST(const char **, &next))) { + set_type(affinity_scatter); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + buf = next; + } else if (__kmp_match_str("compact", buf, CCAST(const char **, &next))) { + set_type(affinity_compact); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + buf = next; + } else if (__kmp_match_str("logical", buf, CCAST(const char **, &next))) { + set_type(affinity_logical); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + buf = next; + } else if (__kmp_match_str("physical", buf, CCAST(const char **, &next))) { + set_type(affinity_physical); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + buf = next; + } else if (__kmp_match_str("explicit", buf, CCAST(const char **, &next))) { + set_type(affinity_explicit); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + buf = next; + } else if (__kmp_match_str("balanced", buf, CCAST(const char **, &next))) { + set_type(affinity_balanced); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + buf = next; + } else if (__kmp_match_str("disabled", buf, CCAST(const char **, &next))) { + set_type(affinity_disabled); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + buf = next; + } else if (__kmp_match_str("verbose", buf, CCAST(const char **, &next))) { + set_verbose(TRUE); + buf = next; + } else if (__kmp_match_str("noverbose", buf, CCAST(const char **, &next))) { + set_verbose(FALSE); + buf = next; + } else if (__kmp_match_str("warnings", buf, CCAST(const char **, &next))) { + set_warnings(TRUE); + buf = next; + } else if (__kmp_match_str("nowarnings", buf, + CCAST(const char **, &next))) { + set_warnings(FALSE); + buf = next; + } else if (__kmp_match_str("respect", buf, CCAST(const char **, &next))) { + set_respect(TRUE); + buf = next; + } else if 
(__kmp_match_str("norespect", buf, CCAST(const char **, &next))) { + set_respect(FALSE); + buf = next; + } else if (__kmp_match_str("reset", buf, CCAST(const char **, &next))) { + set_reset(TRUE); + buf = next; + } else if (__kmp_match_str("noreset", buf, CCAST(const char **, &next))) { + set_reset(FALSE); + buf = next; + } else if (__kmp_match_str("duplicates", buf, + CCAST(const char **, &next)) || + __kmp_match_str("dups", buf, CCAST(const char **, &next))) { + set_dups(TRUE); + buf = next; + } else if (__kmp_match_str("noduplicates", buf, + CCAST(const char **, &next)) || + __kmp_match_str("nodups", buf, CCAST(const char **, &next))) { + set_dups(FALSE); + buf = next; + } else if (__kmp_match_str("granularity", buf, + CCAST(const char **, &next)) || + __kmp_match_str("gran", buf, CCAST(const char **, &next))) { + SKIP_WS(next); + if (*next != '=') { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } + next++; // skip '=' + SKIP_WS(next); + + buf = next; + + // Have to try core_type and core_efficiency matches first since "core" + // will register as core granularity with "extra chars" + if (__kmp_match_str("core_type", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_CORE, -1); + out_affinity->flags.core_types_gran = 1; + buf = next; + set = true; + } else if (__kmp_match_str("core_efficiency", buf, + CCAST(const char **, &next)) || + __kmp_match_str("core_eff", buf, + CCAST(const char **, &next))) { + set_gran(KMP_HW_CORE, -1); + out_affinity->flags.core_effs_gran = 1; + buf = next; + set = true; + } + if (!set) { + // Try any hardware topology type for granularity + KMP_FOREACH_HW_TYPE(type) { + const char *name = __kmp_hw_get_keyword(type); + if (__kmp_match_str(name, buf, CCAST(const char **, &next))) { + set_gran(type, -1); + buf = next; + set = true; + break; + } + } + } + if (!set) { + // Support older names for different granularity layers + if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_THREAD, -1); + buf = next; + set = true; + } else if (__kmp_match_str("package", buf, + CCAST(const char **, &next))) { + set_gran(KMP_HW_SOCKET, -1); + buf = next; + set = true; + } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_NUMA, -1); + buf = next; + set = true; +#if KMP_GROUP_AFFINITY + } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_PROC_GROUP, -1); + buf = next; + set = true; +#endif /* KMP_GROUP AFFINITY */ + } else if ((*buf >= '0') && (*buf <= '9')) { + int n; + next = buf; + SKIP_DIGITS(next); + n = __kmp_str_to_int(buf, *next); + KMP_ASSERT(n >= 0); + buf = next; + set_gran(KMP_HW_UNKNOWN, n); + set = true; + } else { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } + } + } else if (__kmp_match_str("proclist", buf, CCAST(const char **, &next))) { + char *temp_proclist; + + SKIP_WS(next); + if (*next != '=') { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } + next++; // skip '=' + SKIP_WS(next); + if (*next != '[') { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } + next++; // skip '[' + buf = next; + if (!__kmp_parse_affinity_proc_id_list( + name, buf, CCAST(const char **, &next), &temp_proclist)) { + // warning already emitted. 
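+        // Skip the rest of the malformed bracket expression and resume with
+        // the next comma-separated parameter.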
+ SKIP_TO(next, ']'); + if (*next == ']') + next++; + SKIP_TO(next, ','); + if (*next == ',') + next++; + buf = next; + continue; + } + if (*next != ']') { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } + next++; // skip ']' + set_proclist(temp_proclist); + } else if ((*buf >= '0') && (*buf <= '9')) { + // Parse integer numbers -- permute and offset. + int n; + next = buf; + SKIP_DIGITS(next); + n = __kmp_str_to_int(buf, *next); + KMP_ASSERT(n >= 0); + buf = next; + if (count < 2) { + number[count] = n; + } else { + KMP_WARNING(AffManyParams, name, start); + } + ++count; + } else { + EMIT_WARN(TRUE, (AffInvalidParam, name, start)); + continue; + } + + SKIP_WS(next); + if (*next == ',') { + next++; + SKIP_WS(next); + } else if (*next != '\0') { + const char *temp = next; + EMIT_WARN(TRUE, (ParseExtraCharsWarn, name, temp)); + continue; + } + buf = next; + } // while + +#undef EMIT_WARN +#undef _set_param +#undef set_type +#undef set_verbose +#undef set_warnings +#undef set_respect +#undef set_granularity +#undef set_reset + + __kmp_str_free(&buffer); + + if (proclist) { + if (!type) { + KMP_WARNING(AffProcListNoType, name); + out_affinity->type = affinity_explicit; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + } else if (out_affinity->type != affinity_explicit) { + KMP_WARNING(AffProcListNotExplicit, name); + KMP_ASSERT(out_affinity->proclist != NULL); + KMP_INTERNAL_FREE(out_affinity->proclist); + out_affinity->proclist = NULL; + } + } + switch (out_affinity->type) { + case affinity_logical: + case affinity_physical: { + if (count > 0) { + out_affinity->offset = number[0]; + } + if (count > 1) { + KMP_WARNING(AffManyParamsForLogic, name, number[1]); + } + } break; + case affinity_balanced: { + if (count > 0) { + out_affinity->compact = number[0]; + } + if (count > 1) { + out_affinity->offset = number[1]; + } + + if (__kmp_affinity.gran == KMP_HW_UNKNOWN) { + int verbose = out_affinity->flags.verbose; + int warnings = out_affinity->flags.warnings; +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type != non_mic) { + if (verbose || warnings) { + KMP_WARNING(AffGranUsing, out_affinity->env_var, "fine"); + } + out_affinity->gran = KMP_HW_THREAD; + } else +#endif + { + if (verbose || warnings) { + KMP_WARNING(AffGranUsing, out_affinity->env_var, "core"); + } + out_affinity->gran = KMP_HW_CORE; + } + } + } break; + case affinity_scatter: + case affinity_compact: { + if (count > 0) { + out_affinity->compact = number[0]; + } + if (count > 1) { + out_affinity->offset = number[1]; + } + } break; + case affinity_explicit: { + if (out_affinity->proclist == NULL) { + KMP_WARNING(AffNoProcList, name); + out_affinity->type = affinity_none; + } + if (count > 0) { + KMP_WARNING(AffNoParam, name, "explicit"); + } + } break; + case affinity_none: { + if (count > 0) { + KMP_WARNING(AffNoParam, name, "none"); + } + } break; + case affinity_disabled: { + if (count > 0) { + KMP_WARNING(AffNoParam, name, "disabled"); + } + } break; + case affinity_default: { + if (count > 0) { + KMP_WARNING(AffNoParam, name, "default"); + } + } break; + default: { + KMP_ASSERT(0); + } + } +} // __kmp_parse_affinity_env + +static void __kmp_stg_parse_affinity(char const *name, char const *value, + void *data) { + kmp_setting_t **rivals = (kmp_setting_t **)data; + int rc; + + rc = __kmp_stg_check_rivals(name, value, rivals); + if (rc) { + return; + } + + __kmp_parse_affinity_env(name, value, &__kmp_affinity); + +} // __kmp_stg_parse_affinity +static void __kmp_stg_parse_hh_affinity(char const *name, char 
const *value, + void *data) { + __kmp_parse_affinity_env(name, value, &__kmp_hh_affinity); + // Warn about unused parts of hidden helper affinity settings if specified. + if (__kmp_hh_affinity.flags.reset) { + KMP_WARNING(AffInvalidParam, name, "reset"); + } + if (__kmp_hh_affinity.flags.respect != affinity_respect_mask_default) { + KMP_WARNING(AffInvalidParam, name, "respect"); + } +} + +static void __kmp_print_affinity_env(kmp_str_buf_t *buffer, char const *name, + const kmp_affinity_t &affinity) { + bool is_hh_affinity = (&affinity == &__kmp_hh_affinity); + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s='", name); + } + if (affinity.flags.verbose) { + __kmp_str_buf_print(buffer, "%s,", "verbose"); + } else { + __kmp_str_buf_print(buffer, "%s,", "noverbose"); + } + if (affinity.flags.warnings) { + __kmp_str_buf_print(buffer, "%s,", "warnings"); + } else { + __kmp_str_buf_print(buffer, "%s,", "nowarnings"); + } + if (KMP_AFFINITY_CAPABLE()) { + // Hidden helper affinity does not affect global reset + // or respect flags. That is still solely controlled by KMP_AFFINITY. + if (!is_hh_affinity) { + if (affinity.flags.respect) { + __kmp_str_buf_print(buffer, "%s,", "respect"); + } else { + __kmp_str_buf_print(buffer, "%s,", "norespect"); + } + if (affinity.flags.reset) { + __kmp_str_buf_print(buffer, "%s,", "reset"); + } else { + __kmp_str_buf_print(buffer, "%s,", "noreset"); + } + } + __kmp_str_buf_print(buffer, "granularity="); + if (affinity.flags.core_types_gran) + __kmp_str_buf_print(buffer, "core_type,"); + else if (affinity.flags.core_effs_gran) { + __kmp_str_buf_print(buffer, "core_eff,"); + } else { + __kmp_str_buf_print( + buffer, "%s,", __kmp_hw_get_keyword(affinity.gran, /*plural=*/false)); + } + } + if (!KMP_AFFINITY_CAPABLE()) { + __kmp_str_buf_print(buffer, "%s", "disabled"); + } else { + int compact = affinity.compact; + int offset = affinity.offset; + switch (affinity.type) { + case affinity_none: + __kmp_str_buf_print(buffer, "%s", "none"); + break; + case affinity_physical: + __kmp_str_buf_print(buffer, "%s,%d", "physical", offset); + break; + case affinity_logical: + __kmp_str_buf_print(buffer, "%s,%d", "logical", offset); + break; + case affinity_compact: + __kmp_str_buf_print(buffer, "%s,%d,%d", "compact", compact, offset); + break; + case affinity_scatter: + __kmp_str_buf_print(buffer, "%s,%d,%d", "scatter", compact, offset); + break; + case affinity_explicit: + __kmp_str_buf_print(buffer, "%s=[%s],%s", "proclist", affinity.proclist, + "explicit"); + break; + case affinity_balanced: + __kmp_str_buf_print(buffer, "%s,%d,%d", "balanced", compact, offset); + break; + case affinity_disabled: + __kmp_str_buf_print(buffer, "%s", "disabled"); + break; + case affinity_default: + __kmp_str_buf_print(buffer, "%s", "default"); + break; + default: + __kmp_str_buf_print(buffer, "%s", ""); + break; + } + } + __kmp_str_buf_print(buffer, "'\n"); +} //__kmp_stg_print_affinity + +static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_print_affinity_env(buffer, name, __kmp_affinity); +} +static void __kmp_stg_print_hh_affinity(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_print_affinity_env(buffer, name, __kmp_hh_affinity); +} + +#ifdef KMP_GOMP_COMPAT + +static void __kmp_stg_parse_gomp_cpu_affinity(char const *name, + char const *value, void *data) { + const char *next = NULL; + char *temp_proclist; + kmp_setting_t **rivals = (kmp_setting_t **)data; + int rc; + + 
rc = __kmp_stg_check_rivals(name, value, rivals); + if (rc) { + return; + } + + if (TCR_4(__kmp_init_middle)) { + KMP_WARNING(EnvMiddleWarn, name); + __kmp_env_toPrint(name, 0); + return; + } + + __kmp_env_toPrint(name, 1); + + if (__kmp_parse_affinity_proc_id_list(name, value, &next, &temp_proclist)) { + SKIP_WS(next); + if (*next == '\0') { + // GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=... + __kmp_affinity.proclist = temp_proclist; + __kmp_affinity.type = affinity_explicit; + __kmp_affinity.gran = KMP_HW_THREAD; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + } else { + KMP_WARNING(AffSyntaxError, name); + if (temp_proclist != NULL) { + KMP_INTERNAL_FREE((void *)temp_proclist); + } + } + } else { + // Warning already emitted + __kmp_affinity.type = affinity_none; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + } +} // __kmp_stg_parse_gomp_cpu_affinity + +#endif /* KMP_GOMP_COMPAT */ + +/*----------------------------------------------------------------------------- +The OMP_PLACES proc id list parser. Here is the grammar: + +place_list := place +place_list := place , place_list +place := num +place := place : num +place := place : num : signed +place := { subplacelist } +place := ! place // (lowest priority) +subplace_list := subplace +subplace_list := subplace , subplace_list +subplace := num +subplace := num : num +subplace := num : num : signed +signed := num +signed := + signed +signed := - signed +-----------------------------------------------------------------------------*/ + +// Return TRUE if successful parse, FALSE otherwise +static int __kmp_parse_subplace_list(const char *var, const char **scan) { + const char *next; + + for (;;) { + int start, count, stride; + + // + // Read in the starting proc id + // + SKIP_WS(*scan); + if ((**scan < '0') || (**scan > '9')) { + return FALSE; + } + next = *scan; + SKIP_DIGITS(next); + start = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(start >= 0); + *scan = next; + + // valid follow sets are ',' ':' and '}' + SKIP_WS(*scan); + if (**scan == '}') { + break; + } + if (**scan == ',') { + (*scan)++; // skip ',' + continue; + } + if (**scan != ':') { + return FALSE; + } + (*scan)++; // skip ':' + + // Read count parameter + SKIP_WS(*scan); + if ((**scan < '0') || (**scan > '9')) { + return FALSE; + } + next = *scan; + SKIP_DIGITS(next); + count = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(count >= 0); + *scan = next; + + // valid follow sets are ',' ':' and '}' + SKIP_WS(*scan); + if (**scan == '}') { + break; + } + if (**scan == ',') { + (*scan)++; // skip ',' + continue; + } + if (**scan != ':') { + return FALSE; + } + (*scan)++; // skip ':' + + // Read stride parameter + int sign = +1; + for (;;) { + SKIP_WS(*scan); + if (**scan == '+') { + (*scan)++; // skip '+' + continue; + } + if (**scan == '-') { + sign *= -1; + (*scan)++; // skip '-' + continue; + } + break; + } + SKIP_WS(*scan); + if ((**scan < '0') || (**scan > '9')) { + return FALSE; + } + next = *scan; + SKIP_DIGITS(next); + stride = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(stride >= 0); + *scan = next; + stride *= sign; + + // valid follow sets are ',' and '}' + SKIP_WS(*scan); + if (**scan == '}') { + break; + } + if (**scan == ',') { + (*scan)++; // skip ',' + continue; + } + return FALSE; + } + return TRUE; +} + +// Return TRUE if successful parse, FALSE otherwise +static int __kmp_parse_place(const char *var, const char **scan) { + const char *next; + + // valid follow sets are '{' '!' 
and num + SKIP_WS(*scan); + if (**scan == '{') { + (*scan)++; // skip '{' + if (!__kmp_parse_subplace_list(var, scan)) { + return FALSE; + } + if (**scan != '}') { + return FALSE; + } + (*scan)++; // skip '}' + } else if (**scan == '!') { + (*scan)++; // skip '!' + return __kmp_parse_place(var, scan); //'!' has lower precedence than ':' + } else if ((**scan >= '0') && (**scan <= '9')) { + next = *scan; + SKIP_DIGITS(next); + int proc = __kmp_str_to_int(*scan, *next); + KMP_ASSERT(proc >= 0); + *scan = next; + } else { + return FALSE; + } + return TRUE; +} + +// Return TRUE if successful parse, FALSE otherwise +static int __kmp_parse_place_list(const char *var, const char *env, + char **place_list) { + const char *scan = env; + const char *next = scan; + + for (;;) { + int count, stride; + + if (!__kmp_parse_place(var, &scan)) { + return FALSE; + } + + // valid follow sets are ',' ':' and EOL + SKIP_WS(scan); + if (*scan == '\0') { + break; + } + if (*scan == ',') { + scan++; // skip ',' + continue; + } + if (*scan != ':') { + return FALSE; + } + scan++; // skip ':' + + // Read count parameter + SKIP_WS(scan); + if ((*scan < '0') || (*scan > '9')) { + return FALSE; + } + next = scan; + SKIP_DIGITS(next); + count = __kmp_str_to_int(scan, *next); + KMP_ASSERT(count >= 0); + scan = next; + + // valid follow sets are ',' ':' and EOL + SKIP_WS(scan); + if (*scan == '\0') { + break; + } + if (*scan == ',') { + scan++; // skip ',' + continue; + } + if (*scan != ':') { + return FALSE; + } + scan++; // skip ':' + + // Read stride parameter + int sign = +1; + for (;;) { + SKIP_WS(scan); + if (*scan == '+') { + scan++; // skip '+' + continue; + } + if (*scan == '-') { + sign *= -1; + scan++; // skip '-' + continue; + } + break; + } + SKIP_WS(scan); + if ((*scan < '0') || (*scan > '9')) { + return FALSE; + } + next = scan; + SKIP_DIGITS(next); + stride = __kmp_str_to_int(scan, *next); + KMP_ASSERT(stride >= 0); + scan = next; + stride *= sign; + + // valid follow sets are ',' and EOL + SKIP_WS(scan); + if (*scan == '\0') { + break; + } + if (*scan == ',') { + scan++; // skip ',' + continue; + } + + return FALSE; + } + + { + ptrdiff_t len = scan - env; + char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char)); + KMP_MEMCPY_S(retlist, (len + 1) * sizeof(char), env, len * sizeof(char)); + retlist[len] = '\0'; + *place_list = retlist; + } + return TRUE; +} + +static inline void __kmp_places_set(enum affinity_type type, kmp_hw_t kind) { + __kmp_affinity.type = type; + __kmp_affinity.gran = kind; + __kmp_affinity.flags.dups = FALSE; + __kmp_affinity.flags.omp_places = TRUE; +} + +static void __kmp_places_syntax_error_fallback(char const *name, + kmp_hw_t kind) { + const char *str = __kmp_hw_get_catalog_string(kind, /*plural=*/true); + KMP_WARNING(SyntaxErrorUsing, name, str); + __kmp_places_set(affinity_compact, kind); + if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; +} + +static void __kmp_stg_parse_places(char const *name, char const *value, + void *data) { + struct kmp_place_t { + const char *name; + kmp_hw_t type; + }; + int count; + bool set = false; + const char *scan = value; + const char *next = scan; + kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD}, + {"cores", KMP_HW_CORE}, + {"numa_domains", KMP_HW_NUMA}, + {"ll_caches", KMP_HW_LLC}, + {"sockets", KMP_HW_SOCKET}}; + kmp_setting_t **rivals = (kmp_setting_t **)data; + int rc; + + rc = __kmp_stg_check_rivals(name, value, rivals); + if (rc) { + return; + } + + // 
Standard choices + for (size_t i = 0; i < sizeof(std_places) / sizeof(std_places[0]); ++i) { + const kmp_place_t &place = std_places[i]; + if (__kmp_match_str(place.name, scan, &next)) { + scan = next; + __kmp_places_set(affinity_compact, place.type); + set = true; + // Parse core attribute if it exists + if (KMP_HW_MAX_NUM_CORE_TYPES > 1) { + SKIP_WS(scan); + if (*scan == ':') { + if (place.type != KMP_HW_CORE) { + __kmp_places_syntax_error_fallback(name, place.type); + return; + } + scan++; // skip ':' + SKIP_WS(scan); +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_match_str("intel_core", scan, &next)) { + __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_CORE; + __kmp_affinity.core_attr_gran.valid = 1; + scan = next; + } else if (__kmp_match_str("intel_atom", scan, &next)) { + __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_ATOM; + __kmp_affinity.core_attr_gran.valid = 1; + scan = next; + } else +#endif + if (__kmp_match_str("eff", scan, &next)) { + int eff; + if (!isdigit(*next)) { + __kmp_places_syntax_error_fallback(name, place.type); + return; + } + scan = next; + SKIP_DIGITS(next); + eff = __kmp_str_to_int(scan, *next); + if (eff < 0) { + __kmp_places_syntax_error_fallback(name, place.type); + return; + } + if (eff >= KMP_HW_MAX_NUM_CORE_EFFS) + eff = KMP_HW_MAX_NUM_CORE_EFFS - 1; + __kmp_affinity.core_attr_gran.core_eff = eff; + __kmp_affinity.core_attr_gran.valid = 1; + scan = next; + } + if (!__kmp_affinity.core_attr_gran.valid) { + __kmp_places_syntax_error_fallback(name, place.type); + return; + } + } + } + break; + } + } + // Implementation choices for OMP_PLACES based on internal types + if (!set) { + KMP_FOREACH_HW_TYPE(type) { + const char *name = __kmp_hw_get_keyword(type, true); + if (__kmp_match_str("unknowns", scan, &next)) + continue; + if (__kmp_match_str(name, scan, &next)) { + scan = next; + __kmp_places_set(affinity_compact, type); + set = true; + break; + } + } + } + // Implementation choices for OMP_PLACES based on core attributes + if (!set) { + if (__kmp_match_str("core_types", scan, &next)) { + scan = next; + if (*scan != '\0') { + KMP_WARNING(ParseExtraCharsWarn, name, scan); + } + __kmp_places_set(affinity_compact, KMP_HW_CORE); + __kmp_affinity.flags.core_types_gran = 1; + set = true; + } else if (__kmp_match_str("core_effs", scan, &next) || + __kmp_match_str("core_efficiencies", scan, &next)) { + scan = next; + if (*scan != '\0') { + KMP_WARNING(ParseExtraCharsWarn, name, scan); + } + __kmp_places_set(affinity_compact, KMP_HW_CORE); + __kmp_affinity.flags.core_effs_gran = 1; + set = true; + } + } + // Explicit place list + if (!set) { + if (__kmp_affinity.proclist != NULL) { + KMP_INTERNAL_FREE((void *)__kmp_affinity.proclist); + __kmp_affinity.proclist = NULL; + } + if (__kmp_parse_place_list(name, value, &__kmp_affinity.proclist)) { + __kmp_places_set(affinity_explicit, KMP_HW_THREAD); + } else { + // Syntax error fallback + __kmp_places_syntax_error_fallback(name, KMP_HW_CORE); + } + if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; + } + return; + } + + kmp_hw_t gran = __kmp_affinity.gran; + if (__kmp_affinity.gran != KMP_HW_UNKNOWN) { + gran = __kmp_affinity.gran; + } else { + gran = KMP_HW_CORE; + } + + if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; + } + + SKIP_WS(scan); + if (*scan == '\0') { + return; + } + + // Parse option count parameter in parentheses + if (*scan != '(') { + 
__kmp_places_syntax_error_fallback(name, gran); + return; + } + scan++; // skip '(' + + SKIP_WS(scan); + next = scan; + SKIP_DIGITS(next); + count = __kmp_str_to_int(scan, *next); + KMP_ASSERT(count >= 0); + scan = next; + + SKIP_WS(scan); + if (*scan != ')') { + __kmp_places_syntax_error_fallback(name, gran); + return; + } + scan++; // skip ')' + + SKIP_WS(scan); + if (*scan != '\0') { + KMP_WARNING(ParseExtraCharsWarn, name, scan); + } + __kmp_affinity_num_places = count; +} + +static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name, + void *data) { + enum affinity_type type = __kmp_affinity.type; + const char *proclist = __kmp_affinity.proclist; + kmp_hw_t gran = __kmp_affinity.gran; + + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + if ((__kmp_nested_proc_bind.used == 0) || + (__kmp_nested_proc_bind.bind_types == NULL) || + (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } else if (type == affinity_explicit) { + if (proclist != NULL) { + __kmp_str_buf_print(buffer, "='%s'\n", proclist); + } else { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } + } else if (type == affinity_compact) { + int num; + if (__kmp_affinity.num_masks > 0) { + num = __kmp_affinity.num_masks; + } else if (__kmp_affinity_num_places > 0) { + num = __kmp_affinity_num_places; + } else { + num = 0; + } + if (gran != KMP_HW_UNKNOWN) { + // If core_types or core_effs, just print and return + if (__kmp_affinity.flags.core_types_gran) { + __kmp_str_buf_print(buffer, "='%s'\n", "core_types"); + return; + } + if (__kmp_affinity.flags.core_effs_gran) { + __kmp_str_buf_print(buffer, "='%s'\n", "core_effs"); + return; + } + + // threads, cores, sockets, cores:, etc. 
+ const char *name = __kmp_hw_get_keyword(gran, true); + __kmp_str_buf_print(buffer, "='%s", name); + + // Add core attributes if it exists + if (__kmp_affinity.core_attr_gran.valid) { + kmp_hw_core_type_t ct = + (kmp_hw_core_type_t)__kmp_affinity.core_attr_gran.core_type; + int eff = __kmp_affinity.core_attr_gran.core_eff; + if (ct != KMP_HW_CORE_TYPE_UNKNOWN) { + const char *ct_name = __kmp_hw_get_core_type_keyword(ct); + __kmp_str_buf_print(buffer, ":%s", name, ct_name); + } else if (eff >= 0 && eff < KMP_HW_MAX_NUM_CORE_EFFS) { + __kmp_str_buf_print(buffer, ":eff%d", name, eff); + } + } + + // Add the '(#)' part if it exists + if (num > 0) + __kmp_str_buf_print(buffer, "(%d)", num); + __kmp_str_buf_print(buffer, "'\n"); + } else { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } + } else { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } +} + +static void __kmp_stg_parse_topology_method(char const *name, char const *value, + void *data) { + if (__kmp_str_match("all", 1, value)) { + __kmp_affinity_top_method = affinity_top_method_all; + } +#if KMP_USE_HWLOC + else if (__kmp_str_match("hwloc", 1, value)) { + __kmp_affinity_top_method = affinity_top_method_hwloc; + } +#endif +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + else if (__kmp_str_match("cpuid_leaf31", 12, value) || + __kmp_str_match("cpuid 1f", 8, value) || + __kmp_str_match("cpuid 31", 8, value) || + __kmp_str_match("cpuid1f", 7, value) || + __kmp_str_match("cpuid31", 7, value) || + __kmp_str_match("leaf 1f", 7, value) || + __kmp_str_match("leaf 31", 7, value) || + __kmp_str_match("leaf1f", 6, value) || + __kmp_str_match("leaf31", 6, value)) { + __kmp_affinity_top_method = affinity_top_method_x2apicid_1f; + } else if (__kmp_str_match("x2apic id", 9, value) || + __kmp_str_match("x2apic_id", 9, value) || + __kmp_str_match("x2apic-id", 9, value) || + __kmp_str_match("x2apicid", 8, value) || + __kmp_str_match("cpuid leaf 11", 13, value) || + __kmp_str_match("cpuid_leaf_11", 13, value) || + __kmp_str_match("cpuid-leaf-11", 13, value) || + __kmp_str_match("cpuid leaf11", 12, value) || + __kmp_str_match("cpuid_leaf11", 12, value) || + __kmp_str_match("cpuid-leaf11", 12, value) || + __kmp_str_match("cpuidleaf 11", 12, value) || + __kmp_str_match("cpuidleaf_11", 12, value) || + __kmp_str_match("cpuidleaf-11", 12, value) || + __kmp_str_match("cpuidleaf11", 11, value) || + __kmp_str_match("cpuid 11", 8, value) || + __kmp_str_match("cpuid_11", 8, value) || + __kmp_str_match("cpuid-11", 8, value) || + __kmp_str_match("cpuid11", 7, value) || + __kmp_str_match("leaf 11", 7, value) || + __kmp_str_match("leaf_11", 7, value) || + __kmp_str_match("leaf-11", 7, value) || + __kmp_str_match("leaf11", 6, value)) { + __kmp_affinity_top_method = affinity_top_method_x2apicid; + } else if (__kmp_str_match("apic id", 7, value) || + __kmp_str_match("apic_id", 7, value) || + __kmp_str_match("apic-id", 7, value) || + __kmp_str_match("apicid", 6, value) || + __kmp_str_match("cpuid leaf 4", 12, value) || + __kmp_str_match("cpuid_leaf_4", 12, value) || + __kmp_str_match("cpuid-leaf-4", 12, value) || + __kmp_str_match("cpuid leaf4", 11, value) || + __kmp_str_match("cpuid_leaf4", 11, value) || + __kmp_str_match("cpuid-leaf4", 11, value) || + __kmp_str_match("cpuidleaf 4", 11, value) || + __kmp_str_match("cpuidleaf_4", 11, value) || + __kmp_str_match("cpuidleaf-4", 11, value) || + __kmp_str_match("cpuidleaf4", 10, value) || + __kmp_str_match("cpuid 4", 7, value) || + __kmp_str_match("cpuid_4", 7, value) || + 
__kmp_str_match("cpuid-4", 7, value) || + __kmp_str_match("cpuid4", 6, value) || + __kmp_str_match("leaf 4", 6, value) || + __kmp_str_match("leaf_4", 6, value) || + __kmp_str_match("leaf-4", 6, value) || + __kmp_str_match("leaf4", 5, value)) { + __kmp_affinity_top_method = affinity_top_method_apicid; + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + else if (__kmp_str_match("/proc/cpuinfo", 2, value) || + __kmp_str_match("cpuinfo", 5, value)) { + __kmp_affinity_top_method = affinity_top_method_cpuinfo; + } +#if KMP_GROUP_AFFINITY + else if (__kmp_str_match("group", 1, value)) { + KMP_WARNING(StgDeprecatedValue, name, value, "all"); + __kmp_affinity_top_method = affinity_top_method_group; + } +#endif /* KMP_GROUP_AFFINITY */ + else if (__kmp_str_match("flat", 1, value)) { + __kmp_affinity_top_method = affinity_top_method_flat; + } else { + KMP_WARNING(StgInvalidValue, name, value); + } +} // __kmp_stg_parse_topology_method + +static void __kmp_stg_print_topology_method(kmp_str_buf_t *buffer, + char const *name, void *data) { + char const *value = NULL; + + switch (__kmp_affinity_top_method) { + case affinity_top_method_default: + value = "default"; + break; + + case affinity_top_method_all: + value = "all"; + break; + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case affinity_top_method_x2apicid_1f: + value = "x2APIC id leaf 0x1f"; + break; + + case affinity_top_method_x2apicid: + value = "x2APIC id leaf 0xb"; + break; + + case affinity_top_method_apicid: + value = "APIC id"; + break; +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#if KMP_USE_HWLOC + case affinity_top_method_hwloc: + value = "hwloc"; + break; +#endif + + case affinity_top_method_cpuinfo: + value = "cpuinfo"; + break; + +#if KMP_GROUP_AFFINITY + case affinity_top_method_group: + value = "group"; + break; +#endif /* KMP_GROUP_AFFINITY */ + + case affinity_top_method_flat: + value = "flat"; + break; + } + + if (value != NULL) { + __kmp_stg_print_str(buffer, name, value); + } +} // __kmp_stg_print_topology_method + +// KMP_TEAMS_PROC_BIND +struct kmp_proc_bind_info_t { + const char *name; + kmp_proc_bind_t proc_bind; +}; +static kmp_proc_bind_info_t proc_bind_table[] = { + {"spread", proc_bind_spread}, + {"true", proc_bind_spread}, + {"close", proc_bind_close}, + // teams-bind = false means "replicate the primary thread's affinity" + {"false", proc_bind_primary}, + {"primary", proc_bind_primary}}; +static void __kmp_stg_parse_teams_proc_bind(char const *name, char const *value, + void *data) { + int valid; + const char *end; + valid = 0; + for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]); + ++i) { + if (__kmp_match_str(proc_bind_table[i].name, value, &end)) { + __kmp_teams_proc_bind = proc_bind_table[i].proc_bind; + valid = 1; + break; + } + } + if (!valid) { + KMP_WARNING(StgInvalidValue, name, value); + } +} +static void __kmp_stg_print_teams_proc_bind(kmp_str_buf_t *buffer, + char const *name, void *data) { + const char *value = KMP_I18N_STR(NotDefined); + for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]); + ++i) { + if (__kmp_teams_proc_bind == proc_bind_table[i].proc_bind) { + value = proc_bind_table[i].name; + break; + } + } + __kmp_stg_print_str(buffer, name, value); +} +#endif /* KMP_AFFINITY_SUPPORTED */ + +// OMP_PROC_BIND / bind-var is functional on all 4.0 builds, including OS X* +// OMP_PLACES / place-partition-var is not. 
+static void __kmp_stg_parse_proc_bind(char const *name, char const *value, + void *data) { + kmp_setting_t **rivals = (kmp_setting_t **)data; + int rc; + + rc = __kmp_stg_check_rivals(name, value, rivals); + if (rc) { + return; + } + + // In OMP 4.0 OMP_PROC_BIND is a vector of proc_bind types. + KMP_DEBUG_ASSERT((__kmp_nested_proc_bind.bind_types != NULL) && + (__kmp_nested_proc_bind.used > 0)); + + const char *buf = value; + const char *next; + int num; + SKIP_WS(buf); + if ((*buf >= '0') && (*buf <= '9')) { + next = buf; + SKIP_DIGITS(next); + num = __kmp_str_to_int(buf, *next); + KMP_ASSERT(num >= 0); + buf = next; + SKIP_WS(buf); + } else { + num = -1; + } + + next = buf; + if (__kmp_match_str("disabled", buf, &next)) { + buf = next; + SKIP_WS(buf); +#if KMP_AFFINITY_SUPPORTED + __kmp_affinity.type = affinity_disabled; +#endif /* KMP_AFFINITY_SUPPORTED */ + __kmp_nested_proc_bind.used = 1; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + } else if ((num == (int)proc_bind_false) || + __kmp_match_str("false", buf, &next)) { + buf = next; + SKIP_WS(buf); +#if KMP_AFFINITY_SUPPORTED + __kmp_affinity.type = affinity_none; +#endif /* KMP_AFFINITY_SUPPORTED */ + __kmp_nested_proc_bind.used = 1; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + } else if ((num == (int)proc_bind_true) || + __kmp_match_str("true", buf, &next)) { + buf = next; + SKIP_WS(buf); + __kmp_nested_proc_bind.used = 1; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; + } else { + // Count the number of values in the env var string + const char *scan; + int nelem = 1; + for (scan = buf; *scan != '\0'; scan++) { + if (*scan == ',') { + nelem++; + } + } + + // Create / expand the nested proc_bind array as needed + if (__kmp_nested_proc_bind.size < nelem) { + __kmp_nested_proc_bind.bind_types = + (kmp_proc_bind_t *)KMP_INTERNAL_REALLOC( + __kmp_nested_proc_bind.bind_types, + sizeof(kmp_proc_bind_t) * nelem); + if (__kmp_nested_proc_bind.bind_types == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + __kmp_nested_proc_bind.size = nelem; + } + __kmp_nested_proc_bind.used = nelem; + + if (nelem > 1 && !__kmp_dflt_max_active_levels_set) + __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; + + // Save values in the nested proc_bind array + int i = 0; + for (;;) { + enum kmp_proc_bind_t bind; + + if ((num == (int)proc_bind_primary) || + __kmp_match_str("master", buf, &next) || + __kmp_match_str("primary", buf, &next)) { + buf = next; + SKIP_WS(buf); + bind = proc_bind_primary; + } else if ((num == (int)proc_bind_close) || + __kmp_match_str("close", buf, &next)) { + buf = next; + SKIP_WS(buf); + bind = proc_bind_close; + } else if ((num == (int)proc_bind_spread) || + __kmp_match_str("spread", buf, &next)) { + buf = next; + SKIP_WS(buf); + bind = proc_bind_spread; + } else { + KMP_WARNING(StgInvalidValue, name, value); + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + __kmp_nested_proc_bind.used = 1; + return; + } + + __kmp_nested_proc_bind.bind_types[i++] = bind; + if (i >= nelem) { + break; + } + KMP_DEBUG_ASSERT(*buf == ','); + buf++; + SKIP_WS(buf); + + // Read next value if it was specified as an integer + if ((*buf >= '0') && (*buf <= '9')) { + next = buf; + SKIP_DIGITS(next); + num = __kmp_str_to_int(buf, *next); + KMP_ASSERT(num >= 0); + buf = next; + SKIP_WS(buf); + } else { + num = -1; + } + } + SKIP_WS(buf); + } + if (*buf != '\0') { + KMP_WARNING(ParseExtraCharsWarn, name, buf); + } +} + +static void __kmp_stg_print_proc_bind(kmp_str_buf_t *buffer, char const *name, + void 
*data) { + int nelem = __kmp_nested_proc_bind.used; + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + if (nelem == 0) { + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } else { + int i; + __kmp_str_buf_print(buffer, "='", name); + for (i = 0; i < nelem; i++) { + switch (__kmp_nested_proc_bind.bind_types[i]) { + case proc_bind_false: + __kmp_str_buf_print(buffer, "false"); + break; + + case proc_bind_true: + __kmp_str_buf_print(buffer, "true"); + break; + + case proc_bind_primary: + __kmp_str_buf_print(buffer, "primary"); + break; + + case proc_bind_close: + __kmp_str_buf_print(buffer, "close"); + break; + + case proc_bind_spread: + __kmp_str_buf_print(buffer, "spread"); + break; + + case proc_bind_intel: + __kmp_str_buf_print(buffer, "intel"); + break; + + case proc_bind_default: + __kmp_str_buf_print(buffer, "default"); + break; + } + if (i < nelem - 1) { + __kmp_str_buf_print(buffer, ","); + } + } + __kmp_str_buf_print(buffer, "'\n"); + } +} + +static void __kmp_stg_parse_display_affinity(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_display_affinity); +} +static void __kmp_stg_print_display_affinity(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_display_affinity); +} +static void __kmp_stg_parse_affinity_format(char const *name, char const *value, + void *data) { + size_t length = KMP_STRLEN(value); + __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, value, + length); +} +static void __kmp_stg_print_affinity_format(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s='", name); + } + __kmp_str_buf_print(buffer, "%s'\n", __kmp_affinity_format); +} + +/*----------------------------------------------------------------------------- +OMP_ALLOCATOR sets default allocator. Here is the grammar: + +<allocator> |= <predef-allocator> | <predef-mem-space> | + <predef-mem-space>:<traits> +<traits> |= <trait>=<value> | <trait>=<value>,<traits> +<predef-allocator> |= omp_default_mem_alloc | omp_large_cap_mem_alloc | + omp_const_mem_alloc | omp_high_bw_mem_alloc | + omp_low_lat_mem_alloc | omp_cgroup_mem_alloc | + omp_pteam_mem_alloc | omp_thread_mem_alloc +<predef-mem-space> |= omp_default_mem_space | omp_large_cap_mem_space | + omp_const_mem_space | omp_high_bw_mem_space | + omp_low_lat_mem_space +<trait> |= sync_hint | alignment | access | pool_size | fallback | + fb_data | pinned | partition +<value> |= one of the allowed values of trait | + non-negative integer | <predef-allocator> +-----------------------------------------------------------------------------*/ + +static void __kmp_stg_parse_allocator(char const *name, char const *value, + void *data) { + const char *buf = value; + const char *next, *scan, *start; + char *key; + omp_allocator_handle_t al; + omp_memspace_handle_t ms = omp_default_mem_space; + bool is_memspace = false; + int ntraits = 0, count = 0; + + SKIP_WS(buf); + next = buf; + const char *delim = strchr(buf, ':'); + const char *predef_mem_space = strstr(buf, "mem_space"); + + bool is_memalloc = (!predef_mem_space && !delim) ?
true : false; + + // Count the number of traits in the env var string + if (delim) { + ntraits = 1; + for (scan = buf; *scan != '\0'; scan++) { + if (*scan == ',') + ntraits++; + } + } + omp_alloctrait_t *traits = + (omp_alloctrait_t *)KMP_ALLOCA(ntraits * sizeof(omp_alloctrait_t)); + +// Helper macros +#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0) + +#define GET_NEXT(sentinel) \ + { \ + SKIP_WS(next); \ + if (*next == sentinel) \ + next++; \ + SKIP_WS(next); \ + scan = next; \ + } + +#define SKIP_PAIR(key) \ + { \ + char const str_delimiter[] = {',', 0}; \ + char *value = __kmp_str_token(CCAST(char *, scan), str_delimiter, \ + CCAST(char **, &next)); \ + KMP_WARNING(StgInvalidValue, key, value); \ + ntraits--; \ + SKIP_WS(next); \ + scan = next; \ + } + +#define SET_KEY() \ + { \ + char const str_delimiter[] = {'=', 0}; \ + key = __kmp_str_token(CCAST(char *, start), str_delimiter, \ + CCAST(char **, &next)); \ + scan = next; \ + } + + scan = next; + while (*next != '\0') { + if (is_memalloc || + __kmp_match_str("fb_data", scan, &next)) { // allocator check + start = scan; + GET_NEXT('='); + // check HBW and LCAP first as the only non-default supported + if (__kmp_match_str("omp_high_bw_mem_alloc", scan, &next)) { + SKIP_WS(next); + if (is_memalloc) { + if (__kmp_memkind_available) { + __kmp_def_allocator = omp_high_bw_mem_alloc; + return; + } else { + KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc"); + } + } else { + traits[count].key = omp_atk_fb_data; + traits[count].value = RCAST(omp_uintptr_t, omp_high_bw_mem_alloc); + } + } else if (__kmp_match_str("omp_large_cap_mem_alloc", scan, &next)) { + SKIP_WS(next); + if (is_memalloc) { + if (__kmp_memkind_available) { + __kmp_def_allocator = omp_large_cap_mem_alloc; + return; + } else { + KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc"); + } + } else { + traits[count].key = omp_atk_fb_data; + traits[count].value = RCAST(omp_uintptr_t, omp_large_cap_mem_alloc); + } + } else if (__kmp_match_str("omp_default_mem_alloc", scan, &next)) { + // default requested + SKIP_WS(next); + if (!is_memalloc) { + traits[count].key = omp_atk_fb_data; + traits[count].value = RCAST(omp_uintptr_t, omp_default_mem_alloc); + } + } else if (__kmp_match_str("omp_const_mem_alloc", scan, &next)) { + SKIP_WS(next); + if (is_memalloc) { + KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc"); + } else { + traits[count].key = omp_atk_fb_data; + traits[count].value = RCAST(omp_uintptr_t, omp_const_mem_alloc); + } + } else if (__kmp_match_str("omp_low_lat_mem_alloc", scan, &next)) { + SKIP_WS(next); + if (is_memalloc) { + KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc"); + } else { + traits[count].key = omp_atk_fb_data; + traits[count].value = RCAST(omp_uintptr_t, omp_low_lat_mem_alloc); + } + } else if (__kmp_match_str("omp_cgroup_mem_alloc", scan, &next)) { + SKIP_WS(next); + if (is_memalloc) { + KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc"); + } else { + traits[count].key = omp_atk_fb_data; + traits[count].value = RCAST(omp_uintptr_t, omp_cgroup_mem_alloc); + } + } else if (__kmp_match_str("omp_pteam_mem_alloc", scan, &next)) { + SKIP_WS(next); + if (is_memalloc) { + KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc"); + } else { + traits[count].key = omp_atk_fb_data; + traits[count].value = RCAST(omp_uintptr_t, omp_pteam_mem_alloc); + } + } else if (__kmp_match_str("omp_thread_mem_alloc", scan, &next)) { + SKIP_WS(next); + if (is_memalloc) { + KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc"); + } else { + traits[count].key = omp_atk_fb_data; + 
traits[count].value = RCAST(omp_uintptr_t, omp_thread_mem_alloc); + } + } else { + if (!is_memalloc) { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + } + if (is_memalloc) { + __kmp_def_allocator = omp_default_mem_alloc; + if (next == buf || *next != '\0') { + // either no match or extra symbols present after the matched token + KMP_WARNING(StgInvalidValue, name, value); + } + return; + } else { + ++count; + if (count == ntraits) + break; + GET_NEXT(','); + } + } else { // memspace + if (!is_memspace) { + if (__kmp_match_str("omp_default_mem_space", scan, &next)) { + SKIP_WS(next); + ms = omp_default_mem_space; + } else if (__kmp_match_str("omp_large_cap_mem_space", scan, &next)) { + SKIP_WS(next); + ms = omp_large_cap_mem_space; + } else if (__kmp_match_str("omp_const_mem_space", scan, &next)) { + SKIP_WS(next); + ms = omp_const_mem_space; + } else if (__kmp_match_str("omp_high_bw_mem_space", scan, &next)) { + SKIP_WS(next); + ms = omp_high_bw_mem_space; + } else if (__kmp_match_str("omp_low_lat_mem_space", scan, &next)) { + SKIP_WS(next); + ms = omp_low_lat_mem_space; + } else { + __kmp_def_allocator = omp_default_mem_alloc; + if (next == buf || *next != '\0') { + // either no match or extra symbols present after the matched token + KMP_WARNING(StgInvalidValue, name, value); + } + return; + } + is_memspace = true; + } + if (delim) { // traits + GET_NEXT(':'); + start = scan; + if (__kmp_match_str("sync_hint", scan, &next)) { + GET_NEXT('='); + traits[count].key = omp_atk_sync_hint; + if (__kmp_match_str("contended", scan, &next)) { + traits[count].value = omp_atv_contended; + } else if (__kmp_match_str("uncontended", scan, &next)) { + traits[count].value = omp_atv_uncontended; + } else if (__kmp_match_str("serialized", scan, &next)) { + traits[count].value = omp_atv_serialized; + } else if (__kmp_match_str("private", scan, &next)) { + traits[count].value = omp_atv_private; + } else { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + } else if (__kmp_match_str("alignment", scan, &next)) { + GET_NEXT('='); + if (!isdigit(*next)) { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + SKIP_DIGITS(next); + int n = __kmp_str_to_int(scan, ','); + if (n < 0 || !IS_POWER_OF_TWO(n)) { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + traits[count].key = omp_atk_alignment; + traits[count].value = n; + } else if (__kmp_match_str("access", scan, &next)) { + GET_NEXT('='); + traits[count].key = omp_atk_access; + if (__kmp_match_str("all", scan, &next)) { + traits[count].value = omp_atv_all; + } else if (__kmp_match_str("cgroup", scan, &next)) { + traits[count].value = omp_atv_cgroup; + } else if (__kmp_match_str("pteam", scan, &next)) { + traits[count].value = omp_atv_pteam; + } else if (__kmp_match_str("thread", scan, &next)) { + traits[count].value = omp_atv_thread; + } else { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + } else if (__kmp_match_str("pool_size", scan, &next)) { + GET_NEXT('='); + if (!isdigit(*next)) { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + SKIP_DIGITS(next); + int n = __kmp_str_to_int(scan, ','); + if (n < 0) { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + traits[count].key = omp_atk_pool_size; + traits[count].value = n; + } else if (__kmp_match_str("fallback", scan, &next)) { + GET_NEXT('='); + traits[count].key = omp_atk_fallback; + if (__kmp_match_str("default_mem_fb", scan, &next)) { + traits[count].value = omp_atv_default_mem_fb; + } else if (__kmp_match_str("null_fb", scan, &next)) { + traits[count].value = omp_atv_null_fb; + } else if (__kmp_match_str("abort_fb", 
scan, &next)) { + traits[count].value = omp_atv_abort_fb; + } else if (__kmp_match_str("allocator_fb", scan, &next)) { + traits[count].value = omp_atv_allocator_fb; + } else { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + } else if (__kmp_match_str("pinned", scan, &next)) { + GET_NEXT('='); + traits[count].key = omp_atk_pinned; + if (__kmp_str_match_true(next)) { + traits[count].value = omp_atv_true; + } else if (__kmp_str_match_false(next)) { + traits[count].value = omp_atv_false; + } else { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + } else if (__kmp_match_str("partition", scan, &next)) { + GET_NEXT('='); + traits[count].key = omp_atk_partition; + if (__kmp_match_str("environment", scan, &next)) { + traits[count].value = omp_atv_environment; + } else if (__kmp_match_str("nearest", scan, &next)) { + traits[count].value = omp_atv_nearest; + } else if (__kmp_match_str("blocked", scan, &next)) { + traits[count].value = omp_atv_blocked; + } else if (__kmp_match_str("interleaved", scan, &next)) { + traits[count].value = omp_atv_interleaved; + } else { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + } else { + SET_KEY(); + SKIP_PAIR(key); + continue; + } + SKIP_WS(next); + ++count; + if (count == ntraits) + break; + GET_NEXT(','); + } // traits + } // memspace + } // while + al = __kmpc_init_allocator(__kmp_get_gtid(), ms, ntraits, traits); + __kmp_def_allocator = (al == omp_null_allocator) ? omp_default_mem_alloc : al; +} + +static void __kmp_stg_print_allocator(kmp_str_buf_t *buffer, char const *name, + void *data) { + if (__kmp_def_allocator == omp_default_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_default_mem_alloc"); + } else if (__kmp_def_allocator == omp_high_bw_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_high_bw_mem_alloc"); + } else if (__kmp_def_allocator == omp_large_cap_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_large_cap_mem_alloc"); + } else if (__kmp_def_allocator == omp_const_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_const_mem_alloc"); + } else if (__kmp_def_allocator == omp_low_lat_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_low_lat_mem_alloc"); + } else if (__kmp_def_allocator == omp_cgroup_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_cgroup_mem_alloc"); + } else if (__kmp_def_allocator == omp_pteam_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_pteam_mem_alloc"); + } else if (__kmp_def_allocator == omp_thread_mem_alloc) { + __kmp_stg_print_str(buffer, name, "omp_thread_mem_alloc"); + } +} + +// ----------------------------------------------------------------------------- +// OMP_DYNAMIC + +static void __kmp_stg_parse_omp_dynamic(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &(__kmp_global.g.g_dynamic)); +} // __kmp_stg_parse_omp_dynamic + +static void __kmp_stg_print_omp_dynamic(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_global.g.g_dynamic); +} // __kmp_stg_print_omp_dynamic + +static void __kmp_stg_parse_kmp_dynamic_mode(char const *name, + char const *value, void *data) { + if (TCR_4(__kmp_init_parallel)) { + KMP_WARNING(EnvParallelWarn, name); + __kmp_env_toPrint(name, 0); + return; + } +#ifdef USE_LOAD_BALANCE + else if (__kmp_str_match("load balance", 2, value) || + __kmp_str_match("load_balance", 2, value) || + __kmp_str_match("load-balance", 2, value) || + __kmp_str_match("loadbalance", 2, value) || + __kmp_str_match("balance", 1, value)) { + __kmp_global.g.g_dynamic_mode = 
dynamic_load_balance; + } +#endif /* USE_LOAD_BALANCE */ + else if (__kmp_str_match("thread limit", 1, value) || + __kmp_str_match("thread_limit", 1, value) || + __kmp_str_match("thread-limit", 1, value) || + __kmp_str_match("threadlimit", 1, value) || + __kmp_str_match("limit", 2, value)) { + __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; + } else if (__kmp_str_match("random", 1, value)) { + __kmp_global.g.g_dynamic_mode = dynamic_random; + } else { + KMP_WARNING(StgInvalidValue, name, value); + } +} //__kmp_stg_parse_kmp_dynamic_mode + +static void __kmp_stg_print_kmp_dynamic_mode(kmp_str_buf_t *buffer, + char const *name, void *data) { +#if KMP_DEBUG + if (__kmp_global.g.g_dynamic_mode == dynamic_default) { + __kmp_str_buf_print(buffer, " %s: %s \n", name, KMP_I18N_STR(NotDefined)); + } +#ifdef USE_LOAD_BALANCE + else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { + __kmp_stg_print_str(buffer, name, "load balance"); + } +#endif /* USE_LOAD_BALANCE */ + else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { + __kmp_stg_print_str(buffer, name, "thread limit"); + } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { + __kmp_stg_print_str(buffer, name, "random"); + } else { + KMP_ASSERT(0); + } +#endif /* KMP_DEBUG */ +} // __kmp_stg_print_kmp_dynamic_mode + +#ifdef USE_LOAD_BALANCE + +// ----------------------------------------------------------------------------- +// KMP_LOAD_BALANCE_INTERVAL + +static void __kmp_stg_parse_ld_balance_interval(char const *name, + char const *value, void *data) { + double interval = __kmp_convert_to_double(value); + if (interval >= 0) { + __kmp_load_balance_interval = interval; + } else { + KMP_WARNING(StgInvalidValue, name, value); + } +} // __kmp_stg_parse_load_balance_interval + +static void __kmp_stg_print_ld_balance_interval(kmp_str_buf_t *buffer, + char const *name, void *data) { +#if KMP_DEBUG + __kmp_str_buf_print(buffer, " %s=%8.6f\n", name, + __kmp_load_balance_interval); +#endif /* KMP_DEBUG */ +} // __kmp_stg_print_load_balance_interval + +#endif /* USE_LOAD_BALANCE */ + +// ----------------------------------------------------------------------------- +// KMP_INIT_AT_FORK + +static void __kmp_stg_parse_init_at_fork(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_need_register_atfork); + if (__kmp_need_register_atfork) { + __kmp_need_register_atfork_specified = TRUE; + } +} // __kmp_stg_parse_init_at_fork + +static void __kmp_stg_print_init_at_fork(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_need_register_atfork_specified); +} // __kmp_stg_print_init_at_fork + +// ----------------------------------------------------------------------------- +// KMP_SCHEDULE + +static void __kmp_stg_parse_schedule(char const *name, char const *value, + void *data) { + + if (value != NULL) { + size_t length = KMP_STRLEN(value); + if (length > INT_MAX) { + KMP_WARNING(LongValue, name); + } else { + const char *semicolon; + if (value[length - 1] == '"' || value[length - 1] == '\'') + KMP_WARNING(UnbalancedQuotes, name); + do { + char sentinel; + + semicolon = strchr(value, ';'); + if (*value && semicolon != value) { + const char *comma = strchr(value, ','); + + if (comma) { + ++comma; + sentinel = ','; + } else + sentinel = ';'; + if (!__kmp_strcasecmp_with_sentinel("static", value, sentinel)) { + if (!__kmp_strcasecmp_with_sentinel("greedy", comma, ';')) { + __kmp_static = kmp_sch_static_greedy; + continue; + } else if 
(!__kmp_strcasecmp_with_sentinel("balanced", comma, + ';')) { + __kmp_static = kmp_sch_static_balanced; + continue; + } + } else if (!__kmp_strcasecmp_with_sentinel("guided", value, + sentinel)) { + if (!__kmp_strcasecmp_with_sentinel("iterative", comma, ';')) { + __kmp_guided = kmp_sch_guided_iterative_chunked; + continue; + } else if (!__kmp_strcasecmp_with_sentinel("analytical", comma, + ';')) { + /* analytical not allowed for too many threads */ + __kmp_guided = kmp_sch_guided_analytical_chunked; + continue; + } + } + KMP_WARNING(InvalidClause, name, value); + } else + KMP_WARNING(EmptyClause, name); + } while ((value = semicolon ? semicolon + 1 : NULL)); + } + } + +} // __kmp_stg_parse__schedule + +static void __kmp_stg_print_schedule(kmp_str_buf_t *buffer, char const *name, + void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s='", name); + } + if (__kmp_static == kmp_sch_static_greedy) { + __kmp_str_buf_print(buffer, "%s", "static,greedy"); + } else if (__kmp_static == kmp_sch_static_balanced) { + __kmp_str_buf_print(buffer, "%s", "static,balanced"); + } + if (__kmp_guided == kmp_sch_guided_iterative_chunked) { + __kmp_str_buf_print(buffer, ";%s'\n", "guided,iterative"); + } else if (__kmp_guided == kmp_sch_guided_analytical_chunked) { + __kmp_str_buf_print(buffer, ";%s'\n", "guided,analytical"); + } +} // __kmp_stg_print_schedule + +// ----------------------------------------------------------------------------- +// OMP_SCHEDULE + +static inline void __kmp_omp_schedule_restore() { +#if KMP_USE_HIER_SCHED + __kmp_hier_scheds.deallocate(); +#endif + __kmp_chunk = 0; + __kmp_sched = kmp_sch_default; +} + +// if parse_hier = true: +// Parse [HW,][modifier:]kind[,chunk] +// else: +// Parse [modifier:]kind[,chunk] +static const char *__kmp_parse_single_omp_schedule(const char *name, + const char *value, + bool parse_hier = false) { + /* get the specified scheduling style */ + const char *ptr = value; + const char *delim; + int chunk = 0; + enum sched_type sched = kmp_sch_default; + if (*ptr == '\0') + return NULL; + delim = ptr; + while (*delim != ',' && *delim != ':' && *delim != '\0') + delim++; +#if KMP_USE_HIER_SCHED + kmp_hier_layer_e layer = kmp_hier_layer_e::LAYER_THREAD; + if (parse_hier) { + if (*delim == ',') { + if (!__kmp_strcasecmp_with_sentinel("L1", ptr, ',')) { + layer = kmp_hier_layer_e::LAYER_L1; + } else if (!__kmp_strcasecmp_with_sentinel("L2", ptr, ',')) { + layer = kmp_hier_layer_e::LAYER_L2; + } else if (!__kmp_strcasecmp_with_sentinel("L3", ptr, ',')) { + layer = kmp_hier_layer_e::LAYER_L3; + } else if (!__kmp_strcasecmp_with_sentinel("NUMA", ptr, ',')) { + layer = kmp_hier_layer_e::LAYER_NUMA; + } + } + if (layer != kmp_hier_layer_e::LAYER_THREAD && *delim != ',') { + // If there is no comma after the layer, then this schedule is invalid + KMP_WARNING(StgInvalidValue, name, value); + __kmp_omp_schedule_restore(); + return NULL; + } else if (layer != kmp_hier_layer_e::LAYER_THREAD) { + ptr = ++delim; + while (*delim != ',' && *delim != ':' && *delim != '\0') + delim++; + } + } +#endif // KMP_USE_HIER_SCHED + // Read in schedule modifier if specified + enum sched_type sched_modifier = (enum sched_type)0; + if (*delim == ':') { + if (!__kmp_strcasecmp_with_sentinel("monotonic", ptr, *delim)) { + sched_modifier = sched_type::kmp_sch_modifier_monotonic; + ptr = ++delim; + while (*delim != ',' && *delim != ':' && *delim != '\0') + delim++; + } else if (!__kmp_strcasecmp_with_sentinel("nonmonotonic", 
ptr, *delim)) { + sched_modifier = sched_type::kmp_sch_modifier_nonmonotonic; + ptr = ++delim; + while (*delim != ',' && *delim != ':' && *delim != '\0') + delim++; + } else if (!parse_hier) { + // If there is no proper schedule modifier, then this schedule is invalid + KMP_WARNING(StgInvalidValue, name, value); + __kmp_omp_schedule_restore(); + return NULL; + } + } + // Read in schedule kind (required) + if (!__kmp_strcasecmp_with_sentinel("dynamic", ptr, *delim)) + sched = kmp_sch_dynamic_chunked; + else if (!__kmp_strcasecmp_with_sentinel("guided", ptr, *delim)) + sched = kmp_sch_guided_chunked; + // AC: TODO: probably remove TRAPEZOIDAL (OMP 3.0 does not allow it) + else if (!__kmp_strcasecmp_with_sentinel("auto", ptr, *delim)) + sched = kmp_sch_auto; + else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", ptr, *delim)) + sched = kmp_sch_trapezoidal; + else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim)) + sched = kmp_sch_static; +#if KMP_STATIC_STEAL_ENABLED + else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim)) { + // replace static_steal with dynamic to better cope with ordered loops + sched = kmp_sch_dynamic_chunked; + sched_modifier = sched_type::kmp_sch_modifier_nonmonotonic; + } +#endif + else { + // If there is no proper schedule kind, then this schedule is invalid + KMP_WARNING(StgInvalidValue, name, value); + __kmp_omp_schedule_restore(); + return NULL; + } + + // Read in schedule chunk size if specified + if (*delim == ',') { + ptr = delim + 1; + SKIP_WS(ptr); + if (!isdigit(*ptr)) { + // If there is no chunk after comma, then this schedule is invalid + KMP_WARNING(StgInvalidValue, name, value); + __kmp_omp_schedule_restore(); + return NULL; + } + SKIP_DIGITS(ptr); + // auto schedule should not specify chunk size + if (sched == kmp_sch_auto) { + __kmp_msg(kmp_ms_warning, KMP_MSG(IgnoreChunk, name, delim), + __kmp_msg_null); + } else { + if (sched == kmp_sch_static) + sched = kmp_sch_static_chunked; + chunk = __kmp_str_to_int(delim + 1, *ptr); + if (chunk < 1) { + chunk = KMP_DEFAULT_CHUNK; + __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidChunk, name, delim), + __kmp_msg_null); + KMP_INFORM(Using_int_Value, name, __kmp_chunk); + // AC: next block commented out until KMP_DEFAULT_CHUNK != KMP_MIN_CHUNK + // (to improve code coverage :) + // The default chunk size is 1 according to standard, thus making + // KMP_MIN_CHUNK not 1 we would introduce mess: + // wrong chunk becomes 1, but it will be impossible to explicitly set + // to 1 because it becomes KMP_MIN_CHUNK... 
+ // } else if ( chunk < KMP_MIN_CHUNK ) { + // chunk = KMP_MIN_CHUNK; + } else if (chunk > KMP_MAX_CHUNK) { + chunk = KMP_MAX_CHUNK; + __kmp_msg(kmp_ms_warning, KMP_MSG(LargeChunk, name, delim), + __kmp_msg_null); + KMP_INFORM(Using_int_Value, name, chunk); + } + } + } else { + ptr = delim; + } + + SCHEDULE_SET_MODIFIERS(sched, sched_modifier); + +#if KMP_USE_HIER_SCHED + if (layer != kmp_hier_layer_e::LAYER_THREAD) { + __kmp_hier_scheds.append(sched, chunk, layer); + } else +#endif + { + __kmp_chunk = chunk; + __kmp_sched = sched; + } + return ptr; +} + +static void __kmp_stg_parse_omp_schedule(char const *name, char const *value, + void *data) { + size_t length; + const char *ptr = value; + SKIP_WS(ptr); + if (value) { + length = KMP_STRLEN(value); + if (length) { + if (value[length - 1] == '"' || value[length - 1] == '\'') + KMP_WARNING(UnbalancedQuotes, name); +/* get the specified scheduling style */ +#if KMP_USE_HIER_SCHED + if (!__kmp_strcasecmp_with_sentinel("EXPERIMENTAL", ptr, ' ')) { + SKIP_TOKEN(ptr); + SKIP_WS(ptr); + while ((ptr = __kmp_parse_single_omp_schedule(name, ptr, true))) { + while (*ptr == ' ' || *ptr == '\t' || *ptr == ':') + ptr++; + if (*ptr == '\0') + break; + } + } else +#endif + __kmp_parse_single_omp_schedule(name, ptr); + } else + KMP_WARNING(EmptyString, name); + } +#if KMP_USE_HIER_SCHED + __kmp_hier_scheds.sort(); +#endif + K_DIAG(1, ("__kmp_static == %d\n", __kmp_static)) + K_DIAG(1, ("__kmp_guided == %d\n", __kmp_guided)) + K_DIAG(1, ("__kmp_sched == %d\n", __kmp_sched)) + K_DIAG(1, ("__kmp_chunk == %d\n", __kmp_chunk)) +} // __kmp_stg_parse_omp_schedule + +static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s='", name); + } + enum sched_type sched = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); + if (SCHEDULE_HAS_MONOTONIC(__kmp_sched)) { + __kmp_str_buf_print(buffer, "monotonic:"); + } else if (SCHEDULE_HAS_NONMONOTONIC(__kmp_sched)) { + __kmp_str_buf_print(buffer, "nonmonotonic:"); + } + if (__kmp_chunk) { + switch (sched) { + case kmp_sch_dynamic_chunked: + __kmp_str_buf_print(buffer, "%s,%d'\n", "dynamic", __kmp_chunk); + break; + case kmp_sch_guided_iterative_chunked: + case kmp_sch_guided_analytical_chunked: + __kmp_str_buf_print(buffer, "%s,%d'\n", "guided", __kmp_chunk); + break; + case kmp_sch_trapezoidal: + __kmp_str_buf_print(buffer, "%s,%d'\n", "trapezoidal", __kmp_chunk); + break; + case kmp_sch_static: + case kmp_sch_static_chunked: + case kmp_sch_static_balanced: + case kmp_sch_static_greedy: + __kmp_str_buf_print(buffer, "%s,%d'\n", "static", __kmp_chunk); + break; + case kmp_sch_static_steal: + __kmp_str_buf_print(buffer, "%s,%d'\n", "static_steal", __kmp_chunk); + break; + case kmp_sch_auto: + __kmp_str_buf_print(buffer, "%s,%d'\n", "auto", __kmp_chunk); + break; + default: + KMP_ASSERT2(false, "Unhandled sched_type enumeration"); + KMP_BUILTIN_UNREACHABLE; + break; + } + } else { + switch (sched) { + case kmp_sch_dynamic_chunked: + __kmp_str_buf_print(buffer, "%s'\n", "dynamic"); + break; + case kmp_sch_guided_iterative_chunked: + case kmp_sch_guided_analytical_chunked: + __kmp_str_buf_print(buffer, "%s'\n", "guided"); + break; + case kmp_sch_trapezoidal: + __kmp_str_buf_print(buffer, "%s'\n", "trapezoidal"); + break; + case kmp_sch_static: + case kmp_sch_static_chunked: + case kmp_sch_static_balanced: + case kmp_sch_static_greedy: + __kmp_str_buf_print(buffer, "%s'\n", "static"); + break; 
+ case kmp_sch_static_steal: + __kmp_str_buf_print(buffer, "%s'\n", "static_steal"); + break; + case kmp_sch_auto: + __kmp_str_buf_print(buffer, "%s'\n", "auto"); + break; + default: + KMP_ASSERT2(false, "Unhandled sched_type enumeration"); + KMP_BUILTIN_UNREACHABLE; + break; + } + } +} // __kmp_stg_print_omp_schedule + +#if KMP_USE_HIER_SCHED +// ----------------------------------------------------------------------------- +// KMP_DISP_HAND_THREAD +static void __kmp_stg_parse_kmp_hand_thread(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &(__kmp_dispatch_hand_threading)); +} // __kmp_stg_parse_kmp_hand_thread + +static void __kmp_stg_print_kmp_hand_thread(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_dispatch_hand_threading); +} // __kmp_stg_print_kmp_hand_thread +#endif + +// ----------------------------------------------------------------------------- +// KMP_FORCE_MONOTONIC_DYNAMIC_SCHEDULE +static void __kmp_stg_parse_kmp_force_monotonic(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &(__kmp_force_monotonic)); +} // __kmp_stg_parse_kmp_force_monotonic + +static void __kmp_stg_print_kmp_force_monotonic(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_force_monotonic); +} // __kmp_stg_print_kmp_force_monotonic + +// ----------------------------------------------------------------------------- +// KMP_ATOMIC_MODE + +static void __kmp_stg_parse_atomic_mode(char const *name, char const *value, + void *data) { + // Modes: 0 -- do not change default; 1 -- Intel perf mode, 2 -- GOMP + // compatibility mode. + int mode = 0; + int max = 1; +#ifdef KMP_GOMP_COMPAT + max = 2; +#endif /* KMP_GOMP_COMPAT */ + __kmp_stg_parse_int(name, value, 0, max, &mode); + // TODO; parse_int is not very suitable for this case. In case of overflow it + // is better to use + // 0 rather that max value. + if (mode > 0) { + __kmp_atomic_mode = mode; + } +} // __kmp_stg_parse_atomic_mode + +static void __kmp_stg_print_atomic_mode(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_atomic_mode); +} // __kmp_stg_print_atomic_mode + +// ----------------------------------------------------------------------------- +// KMP_CONSISTENCY_CHECK + +static void __kmp_stg_parse_consistency_check(char const *name, + char const *value, void *data) { + if (!__kmp_strcasecmp_with_sentinel("all", value, 0)) { + // Note, this will not work from kmp_set_defaults because th_cons stack was + // not allocated + // for existed thread(s) thus the first __kmp_push_ will break + // with assertion. + // TODO: allocate th_cons if called from kmp_set_defaults. 
+ __kmp_env_consistency_check = TRUE; + } else if (!__kmp_strcasecmp_with_sentinel("none", value, 0)) { + __kmp_env_consistency_check = FALSE; + } else { + KMP_WARNING(StgInvalidValue, name, value); + } +} // __kmp_stg_parse_consistency_check + +static void __kmp_stg_print_consistency_check(kmp_str_buf_t *buffer, + char const *name, void *data) { +#if KMP_DEBUG + const char *value = NULL; + + if (__kmp_env_consistency_check) { + value = "all"; + } else { + value = "none"; + } + + if (value != NULL) { + __kmp_stg_print_str(buffer, name, value); + } +#endif /* KMP_DEBUG */ +} // __kmp_stg_print_consistency_check + +#if USE_ITT_BUILD +// ----------------------------------------------------------------------------- +// KMP_ITT_PREPARE_DELAY + +#if USE_ITT_NOTIFY + +static void __kmp_stg_parse_itt_prepare_delay(char const *name, + char const *value, void *data) { + // Experimental code: KMP_ITT_PREPARE_DELAY specifies numbert of loop + // iterations. + int delay = 0; + __kmp_stg_parse_int(name, value, 0, INT_MAX, &delay); + __kmp_itt_prepare_delay = delay; +} // __kmp_str_parse_itt_prepare_delay + +static void __kmp_stg_print_itt_prepare_delay(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_uint64(buffer, name, __kmp_itt_prepare_delay); + +} // __kmp_str_print_itt_prepare_delay + +#endif // USE_ITT_NOTIFY +#endif /* USE_ITT_BUILD */ + +// ----------------------------------------------------------------------------- +// KMP_MALLOC_POOL_INCR + +static void __kmp_stg_parse_malloc_pool_incr(char const *name, + char const *value, void *data) { + __kmp_stg_parse_size(name, value, KMP_MIN_MALLOC_POOL_INCR, + KMP_MAX_MALLOC_POOL_INCR, NULL, &__kmp_malloc_pool_incr, + 1); +} // __kmp_stg_parse_malloc_pool_incr + +static void __kmp_stg_print_malloc_pool_incr(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_size(buffer, name, __kmp_malloc_pool_incr); + +} // _kmp_stg_print_malloc_pool_incr + +#ifdef KMP_DEBUG + +// ----------------------------------------------------------------------------- +// KMP_PAR_RANGE + +static void __kmp_stg_parse_par_range_env(char const *name, char const *value, + void *data) { + __kmp_stg_parse_par_range(name, value, &__kmp_par_range, + __kmp_par_range_routine, __kmp_par_range_filename, + &__kmp_par_range_lb, &__kmp_par_range_ub); +} // __kmp_stg_parse_par_range_env + +static void __kmp_stg_print_par_range_env(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_par_range != 0) { + __kmp_stg_print_str(buffer, name, par_range_to_print); + } +} // __kmp_stg_print_par_range_env + +#endif + +// ----------------------------------------------------------------------------- +// KMP_GTID_MODE + +static void __kmp_stg_parse_gtid_mode(char const *name, char const *value, + void *data) { + // Modes: + // 0 -- do not change default + // 1 -- sp search + // 2 -- use "keyed" TLS var, i.e. + // pthread_getspecific(Linux* OS/OS X*) or TlsGetValue(Windows* OS) + // 3 -- __declspec(thread) TLS var in tdata section + int mode = 0; + int max = 2; +#ifdef KMP_TDATA_GTID + max = 3; +#endif /* KMP_TDATA_GTID */ + __kmp_stg_parse_int(name, value, 0, max, &mode); + // TODO; parse_int is not very suitable for this case. In case of overflow it + // is better to use 0 rather that max value. 
+ if (mode == 0) { + __kmp_adjust_gtid_mode = TRUE; + } else { + __kmp_gtid_mode = mode; + __kmp_adjust_gtid_mode = FALSE; + } +} // __kmp_str_parse_gtid_mode + +static void __kmp_stg_print_gtid_mode(kmp_str_buf_t *buffer, char const *name, + void *data) { + if (__kmp_adjust_gtid_mode) { + __kmp_stg_print_int(buffer, name, 0); + } else { + __kmp_stg_print_int(buffer, name, __kmp_gtid_mode); + } +} // __kmp_stg_print_gtid_mode + +// ----------------------------------------------------------------------------- +// KMP_NUM_LOCKS_IN_BLOCK + +static void __kmp_stg_parse_lock_block(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, KMP_INT_MAX, &__kmp_num_locks_in_block); +} // __kmp_str_parse_lock_block + +static void __kmp_stg_print_lock_block(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_num_locks_in_block); +} // __kmp_stg_print_lock_block + +// ----------------------------------------------------------------------------- +// KMP_LOCK_KIND + +#if KMP_USE_DYNAMIC_LOCK +#define KMP_STORE_LOCK_SEQ(a) (__kmp_user_lock_seq = lockseq_##a) +#else +#define KMP_STORE_LOCK_SEQ(a) +#endif + +static void __kmp_stg_parse_lock_kind(char const *name, char const *value, + void *data) { + if (__kmp_init_user_locks) { + KMP_WARNING(EnvLockWarn, name); + return; + } + + if (__kmp_str_match("tas", 2, value) || + __kmp_str_match("test and set", 2, value) || + __kmp_str_match("test_and_set", 2, value) || + __kmp_str_match("test-and-set", 2, value) || + __kmp_str_match("test andset", 2, value) || + __kmp_str_match("test_andset", 2, value) || + __kmp_str_match("test-andset", 2, value) || + __kmp_str_match("testand set", 2, value) || + __kmp_str_match("testand_set", 2, value) || + __kmp_str_match("testand-set", 2, value) || + __kmp_str_match("testandset", 2, value)) { + __kmp_user_lock_kind = lk_tas; + KMP_STORE_LOCK_SEQ(tas); + } +#if KMP_USE_FUTEX + else if (__kmp_str_match("futex", 1, value)) { + if (__kmp_futex_determine_capable()) { + __kmp_user_lock_kind = lk_futex; + KMP_STORE_LOCK_SEQ(futex); + } else { + KMP_WARNING(FutexNotSupported, name, value); + } + } +#endif + else if (__kmp_str_match("ticket", 2, value)) { + __kmp_user_lock_kind = lk_ticket; + KMP_STORE_LOCK_SEQ(ticket); + } else if (__kmp_str_match("queuing", 1, value) || + __kmp_str_match("queue", 1, value)) { + __kmp_user_lock_kind = lk_queuing; + KMP_STORE_LOCK_SEQ(queuing); + } else if (__kmp_str_match("drdpa ticket", 1, value) || + __kmp_str_match("drdpa_ticket", 1, value) || + __kmp_str_match("drdpa-ticket", 1, value) || + __kmp_str_match("drdpaticket", 1, value) || + __kmp_str_match("drdpa", 1, value)) { + __kmp_user_lock_kind = lk_drdpa; + KMP_STORE_LOCK_SEQ(drdpa); + } +#if KMP_USE_ADAPTIVE_LOCKS + else if (__kmp_str_match("adaptive", 1, value)) { + if (__kmp_cpuinfo.flags.rtm) { // ??? Is cpuinfo available here? 
+ __kmp_user_lock_kind = lk_adaptive; + KMP_STORE_LOCK_SEQ(adaptive); + } else { + KMP_WARNING(AdaptiveNotSupported, name, value); + __kmp_user_lock_kind = lk_queuing; + KMP_STORE_LOCK_SEQ(queuing); + } + } +#endif // KMP_USE_ADAPTIVE_LOCKS +#if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX + else if (__kmp_str_match("rtm_queuing", 1, value)) { + if (__kmp_cpuinfo.flags.rtm) { + __kmp_user_lock_kind = lk_rtm_queuing; + KMP_STORE_LOCK_SEQ(rtm_queuing); + } else { + KMP_WARNING(AdaptiveNotSupported, name, value); + __kmp_user_lock_kind = lk_queuing; + KMP_STORE_LOCK_SEQ(queuing); + } + } else if (__kmp_str_match("rtm_spin", 1, value)) { + if (__kmp_cpuinfo.flags.rtm) { + __kmp_user_lock_kind = lk_rtm_spin; + KMP_STORE_LOCK_SEQ(rtm_spin); + } else { + KMP_WARNING(AdaptiveNotSupported, name, value); + __kmp_user_lock_kind = lk_tas; + KMP_STORE_LOCK_SEQ(queuing); + } + } else if (__kmp_str_match("hle", 1, value)) { + __kmp_user_lock_kind = lk_hle; + KMP_STORE_LOCK_SEQ(hle); + } +#endif + else { + KMP_WARNING(StgInvalidValue, name, value); + } +} + +static void __kmp_stg_print_lock_kind(kmp_str_buf_t *buffer, char const *name, + void *data) { + const char *value = NULL; + + switch (__kmp_user_lock_kind) { + case lk_default: + value = "default"; + break; + + case lk_tas: + value = "tas"; + break; + +#if KMP_USE_FUTEX + case lk_futex: + value = "futex"; + break; +#endif + +#if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX + case lk_rtm_queuing: + value = "rtm_queuing"; + break; + + case lk_rtm_spin: + value = "rtm_spin"; + break; + + case lk_hle: + value = "hle"; + break; +#endif + + case lk_ticket: + value = "ticket"; + break; + + case lk_queuing: + value = "queuing"; + break; + + case lk_drdpa: + value = "drdpa"; + break; +#if KMP_USE_ADAPTIVE_LOCKS + case lk_adaptive: + value = "adaptive"; + break; +#endif + } + + if (value != NULL) { + __kmp_stg_print_str(buffer, name, value); + } +} + +// ----------------------------------------------------------------------------- +// KMP_SPIN_BACKOFF_PARAMS + +// KMP_SPIN_BACKOFF_PARAMS=max_backoff[,min_tick] (max backoff size, min tick +// for machine pause) +static void __kmp_stg_parse_spin_backoff_params(const char *name, + const char *value, void *data) { + const char *next = value; + + int total = 0; // Count elements that were set. 
It'll be used as an array size + int prev_comma = FALSE; // For correct processing sequential commas + int i; + + kmp_uint32 max_backoff = __kmp_spin_backoff_params.max_backoff; + kmp_uint32 min_tick = __kmp_spin_backoff_params.min_tick; + + // Run only 3 iterations because it is enough to read two values or find a + // syntax error + for (i = 0; i < 3; i++) { + SKIP_WS(next); + + if (*next == '\0') { + break; + } + // Next character is not an integer or not a comma OR number of values > 2 + // => end of list + if (((*next < '0' || *next > '9') && *next != ',') || total > 2) { + KMP_WARNING(EnvSyntaxError, name, value); + return; + } + // The next character is ',' + if (*next == ',') { + // ',' is the first character + if (total == 0 || prev_comma) { + total++; + } + prev_comma = TRUE; + next++; // skip ',' + SKIP_WS(next); + } + // Next character is a digit + if (*next >= '0' && *next <= '9') { + int num; + const char *buf = next; + char const *msg = NULL; + prev_comma = FALSE; + SKIP_DIGITS(next); + total++; + + const char *tmp = next; + SKIP_WS(tmp); + if ((*next == ' ' || *next == '\t') && (*tmp >= '0' && *tmp <= '9')) { + KMP_WARNING(EnvSpacesNotAllowed, name, value); + return; + } + + num = __kmp_str_to_int(buf, *next); + if (num <= 0) { // The number of retries should be > 0 + msg = KMP_I18N_STR(ValueTooSmall); + num = 1; + } else if (num > KMP_INT_MAX) { + msg = KMP_I18N_STR(ValueTooLarge); + num = KMP_INT_MAX; + } + if (msg != NULL) { + // Message is not empty. Print warning. + KMP_WARNING(ParseSizeIntWarn, name, value, msg); + KMP_INFORM(Using_int_Value, name, num); + } + if (total == 1) { + max_backoff = num; + } else if (total == 2) { + min_tick = num; + } + } + } + KMP_DEBUG_ASSERT(total > 0); + if (total <= 0) { + KMP_WARNING(EnvSyntaxError, name, value); + return; + } + __kmp_spin_backoff_params.max_backoff = max_backoff; + __kmp_spin_backoff_params.min_tick = min_tick; +} + +static void __kmp_stg_print_spin_backoff_params(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s='", name); + } + __kmp_str_buf_print(buffer, "%d,%d'\n", __kmp_spin_backoff_params.max_backoff, + __kmp_spin_backoff_params.min_tick); +} + +#if KMP_USE_ADAPTIVE_LOCKS + +// ----------------------------------------------------------------------------- +// KMP_ADAPTIVE_LOCK_PROPS, KMP_SPECULATIVE_STATSFILE + +// Parse out values for the tunable parameters from a string of the form +// KMP_ADAPTIVE_LOCK_PROPS=max_soft_retries[,max_badness] +static void __kmp_stg_parse_adaptive_lock_props(const char *name, + const char *value, void *data) { + int max_retries = 0; + int max_badness = 0; + + const char *next = value; + + int total = 0; // Count elements that were set. 
It'll be used as an array size + int prev_comma = FALSE; // For correct processing sequential commas + int i; + + // Save values in the structure __kmp_speculative_backoff_params + // Run only 3 iterations because it is enough to read two values or find a + // syntax error + for (i = 0; i < 3; i++) { + SKIP_WS(next); + + if (*next == '\0') { + break; + } + // Next character is not an integer or not a comma OR number of values > 2 + // => end of list + if (((*next < '0' || *next > '9') && *next != ',') || total > 2) { + KMP_WARNING(EnvSyntaxError, name, value); + return; + } + // The next character is ',' + if (*next == ',') { + // ',' is the first character + if (total == 0 || prev_comma) { + total++; + } + prev_comma = TRUE; + next++; // skip ',' + SKIP_WS(next); + } + // Next character is a digit + if (*next >= '0' && *next <= '9') { + int num; + const char *buf = next; + char const *msg = NULL; + prev_comma = FALSE; + SKIP_DIGITS(next); + total++; + + const char *tmp = next; + SKIP_WS(tmp); + if ((*next == ' ' || *next == '\t') && (*tmp >= '0' && *tmp <= '9')) { + KMP_WARNING(EnvSpacesNotAllowed, name, value); + return; + } + + num = __kmp_str_to_int(buf, *next); + if (num < 0) { // The number of retries should be >= 0 + msg = KMP_I18N_STR(ValueTooSmall); + num = 1; + } else if (num > KMP_INT_MAX) { + msg = KMP_I18N_STR(ValueTooLarge); + num = KMP_INT_MAX; + } + if (msg != NULL) { + // Message is not empty. Print warning. + KMP_WARNING(ParseSizeIntWarn, name, value, msg); + KMP_INFORM(Using_int_Value, name, num); + } + if (total == 1) { + max_retries = num; + } else if (total == 2) { + max_badness = num; + } + } + } + KMP_DEBUG_ASSERT(total > 0); + if (total <= 0) { + KMP_WARNING(EnvSyntaxError, name, value); + return; + } + __kmp_adaptive_backoff_params.max_soft_retries = max_retries; + __kmp_adaptive_backoff_params.max_badness = max_badness; +} + +static void __kmp_stg_print_adaptive_lock_props(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s='", name); + } + __kmp_str_buf_print(buffer, "%d,%d'\n", + __kmp_adaptive_backoff_params.max_soft_retries, + __kmp_adaptive_backoff_params.max_badness); +} // __kmp_stg_print_adaptive_lock_props + +#if KMP_DEBUG_ADAPTIVE_LOCKS + +static void __kmp_stg_parse_speculative_statsfile(char const *name, + char const *value, + void *data) { + __kmp_stg_parse_file(name, value, "", + CCAST(char **, &__kmp_speculative_statsfile)); +} // __kmp_stg_parse_speculative_statsfile + +static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer, + char const *name, + void *data) { + if (__kmp_str_match("-", 0, __kmp_speculative_statsfile)) { + __kmp_stg_print_str(buffer, name, "stdout"); + } else { + __kmp_stg_print_str(buffer, name, __kmp_speculative_statsfile); + } + +} // __kmp_stg_print_speculative_statsfile + +#endif // KMP_DEBUG_ADAPTIVE_LOCKS + +#endif // KMP_USE_ADAPTIVE_LOCKS + +// ----------------------------------------------------------------------------- +// KMP_HW_SUBSET (was KMP_PLACE_THREADS) +// 2s16c,2t => 2S16C,2T => 2S16C \0 2T + +// Return KMP_HW_SUBSET preferred hardware type in case a token is ambiguously +// short. The original KMP_HW_SUBSET environment variable had single letters: +// s, c, t for sockets, cores, threads repsectively. 
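Editorial aside (not part of the vendored sources): KMP_HW_SUBSET restricts the machine topology the runtime will use. Its value is a comma-separated list of items of the form [<count>]<type>[@<offset>], optionally with a core-type or efficiency attribute after ':' (e.g. 4c:intel_atom on hybrid x86); for example 2s,16c,2t means 2 sockets, 16 cores per socket, 2 threads per core, and 1s,4c@2,2t means 4 cores starting at core offset 2. A small, hypothetical sketch of driving it from a program, assuming the variable is set before the runtime first initializes:

// hw_subset_demo.cpp -- hypothetical example, not part of this patch.
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  // The runtime reads KMP_HW_SUBSET during its first initialization, so the
  // value must be in place before the first OpenMP construct or API call.
  setenv("KMP_HW_SUBSET", "1s,2c,2t", 1); // 1 socket, 2 cores, 2 threads/core
#pragma omp parallel
#pragma omp single
  printf("default team size under the subset: %d\n", omp_get_num_threads());
  return 0;
}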
+static kmp_hw_t __kmp_hw_subset_break_tie(const kmp_hw_t *possible, + size_t num_possible) { + for (size_t i = 0; i < num_possible; ++i) { + if (possible[i] == KMP_HW_THREAD) + return KMP_HW_THREAD; + else if (possible[i] == KMP_HW_CORE) + return KMP_HW_CORE; + else if (possible[i] == KMP_HW_SOCKET) + return KMP_HW_SOCKET; + } + return KMP_HW_UNKNOWN; +} + +// Return hardware type from string or HW_UNKNOWN if string cannot be parsed +// This algorithm is very forgiving to the user in that, the instant it can +// reduce the search space to one, it assumes that is the topology level the +// user wanted, even if it is misspelled later in the token. +static kmp_hw_t __kmp_stg_parse_hw_subset_name(char const *token) { + size_t index, num_possible, token_length; + kmp_hw_t possible[KMP_HW_LAST]; + const char *end; + + // Find the end of the hardware token string + end = token; + token_length = 0; + while (isalnum(*end) || *end == '_') { + token_length++; + end++; + } + + // Set the possibilities to all hardware types + num_possible = 0; + KMP_FOREACH_HW_TYPE(type) { possible[num_possible++] = type; } + + // Eliminate hardware types by comparing the front of the token + // with hardware names + // In most cases, the first letter in the token will indicate exactly + // which hardware type is parsed, e.g., 'C' = Core + index = 0; + while (num_possible > 1 && index < token_length) { + size_t n = num_possible; + char token_char = (char)toupper(token[index]); + for (size_t i = 0; i < n; ++i) { + const char *s; + kmp_hw_t type = possible[i]; + s = __kmp_hw_get_keyword(type, false); + if (index < KMP_STRLEN(s)) { + char c = (char)toupper(s[index]); + // Mark hardware types for removal when the characters do not match + if (c != token_char) { + possible[i] = KMP_HW_UNKNOWN; + num_possible--; + } + } + } + // Remove hardware types that this token cannot be + size_t start = 0; + for (size_t i = 0; i < n; ++i) { + if (possible[i] != KMP_HW_UNKNOWN) { + kmp_hw_t temp = possible[i]; + possible[i] = possible[start]; + possible[start] = temp; + start++; + } + } + KMP_ASSERT(start == num_possible); + index++; + } + + // Attempt to break a tie if user has very short token + // (e.g., is 'T' tile or thread?) + if (num_possible > 1) + return __kmp_hw_subset_break_tie(possible, num_possible); + if (num_possible == 1) + return possible[0]; + return KMP_HW_UNKNOWN; +} + +// The longest observable sequence of items can only be HW_LAST length +// The input string is usually short enough, let's use 512 limit for now +#define MAX_T_LEVEL KMP_HW_LAST +#define MAX_STR_LEN 512 +static void __kmp_stg_parse_hw_subset(char const *name, char const *value, + void *data) { + // Value example: 1s,5c@3,2T + // Which means "use 1 socket, 5 cores with offset 3, 2 threads per core" + kmp_setting_t **rivals = (kmp_setting_t **)data; + if (strcmp(name, "KMP_PLACE_THREADS") == 0) { + KMP_INFORM(EnvVarDeprecated, name, "KMP_HW_SUBSET"); + } + if (__kmp_stg_check_rivals(name, value, rivals)) { + return; + } + + char *components[MAX_T_LEVEL]; + char const *digits = "0123456789"; + char input[MAX_STR_LEN]; + size_t len = 0, mlen = MAX_STR_LEN; + int level = 0; + bool absolute = false; + // Canonicalize the string (remove spaces, unify delimiters, etc.) 
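+  // For example, a value of " 1s, 5co1 x 2t " is folded by this loop into
+  // "1S,5C@1,2T": spaces are dropped, letters are upcased, 'x'/'X' is
+  // rewritten to the level delimiter ',', and an 'o'/'O' immediately
+  // followed by a digit is rewritten to the offset marker '@'.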
+ char *pos = CCAST(char *, value); + while (*pos && mlen) { + if (*pos != ' ') { // skip spaces + if (len == 0 && *pos == ':') { + absolute = true; + } else { + input[len] = (char)(toupper(*pos)); + if (input[len] == 'X') + input[len] = ','; // unify delimiters of levels + if (input[len] == 'O' && strchr(digits, *(pos + 1))) + input[len] = '@'; // unify delimiters of offset + len++; + } + } + mlen--; + pos++; + } + if (len == 0 || mlen == 0) { + goto err; // contents is either empty or too long + } + input[len] = '\0'; + // Split by delimiter + pos = input; + components[level++] = pos; + while ((pos = strchr(pos, ','))) { + if (level >= MAX_T_LEVEL) + goto err; // too many components provided + *pos = '\0'; // modify input and avoid more copying + components[level++] = ++pos; // expect something after "," + } + + __kmp_hw_subset = kmp_hw_subset_t::allocate(); + if (absolute) + __kmp_hw_subset->set_absolute(); + + // Check each component + for (int i = 0; i < level; ++i) { + int core_level = 0; + char *core_components[MAX_T_LEVEL]; + // Split possible core components by '&' delimiter + pos = components[i]; + core_components[core_level++] = pos; + while ((pos = strchr(pos, '&'))) { + if (core_level >= MAX_T_LEVEL) + goto err; // too many different core types + *pos = '\0'; // modify input and avoid more copying + core_components[core_level++] = ++pos; // expect something after '&' + } + + for (int j = 0; j < core_level; ++j) { + char *offset_ptr; + char *attr_ptr; + int offset = 0; + kmp_hw_attr_t attr; + int num; + // components may begin with an optional count of the number of resources + if (isdigit(*core_components[j])) { + num = atoi(core_components[j]); + if (num <= 0) { + goto err; // only positive integers are valid for count + } + pos = core_components[j] + strspn(core_components[j], digits); + } else if (*core_components[j] == '*') { + num = kmp_hw_subset_t::USE_ALL; + pos = core_components[j] + 1; + } else { + num = kmp_hw_subset_t::USE_ALL; + pos = core_components[j]; + } + + offset_ptr = strchr(core_components[j], '@'); + attr_ptr = strchr(core_components[j], ':'); + + if (offset_ptr) { + offset = atoi(offset_ptr + 1); // save offset + *offset_ptr = '\0'; // cut the offset from the component + } + if (attr_ptr) { + attr.clear(); + // save the attribute +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_str_match("intel_core", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_CORE); + } else if (__kmp_str_match("intel_atom", -1, attr_ptr + 1)) { + attr.set_core_type(KMP_HW_CORE_TYPE_ATOM); + } else +#endif + if (__kmp_str_match("eff", 3, attr_ptr + 1)) { + const char *number = attr_ptr + 1; + // skip the eff[iciency] token + while (isalpha(*number)) + number++; + if (!isdigit(*number)) { + goto err; + } + int efficiency = atoi(number); + attr.set_core_eff(efficiency); + } else { + goto err; + } + *attr_ptr = '\0'; // cut the attribute from the component + } + // detect the component type + kmp_hw_t type = __kmp_stg_parse_hw_subset_name(pos); + if (type == KMP_HW_UNKNOWN) { + goto err; + } + // Only the core type can have attributes + if (attr && type != KMP_HW_CORE) + goto err; + // Must allow core be specified more than once + if (type != KMP_HW_CORE && __kmp_hw_subset->specified(type)) { + goto err; + } + __kmp_hw_subset->push_back(num, type, offset, attr); + } + } + return; +err: + KMP_WARNING(AffHWSubsetInvalid, name, value); + if (__kmp_hw_subset) { + kmp_hw_subset_t::deallocate(__kmp_hw_subset); + __kmp_hw_subset = nullptr; + } + return; +} + +static void 
__kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, + void *data) { + kmp_str_buf_t buf; + int depth; + if (!__kmp_hw_subset) + return; + __kmp_str_buf_init(&buf); + if (__kmp_env_format) + KMP_STR_BUF_PRINT_NAME_EX(name); + else + __kmp_str_buf_print(buffer, " %s='", name); + + depth = __kmp_hw_subset->get_depth(); + for (int i = 0; i < depth; ++i) { + const auto &item = __kmp_hw_subset->at(i); + if (i > 0) + __kmp_str_buf_print(&buf, "%c", ','); + for (int j = 0; j < item.num_attrs; ++j) { + __kmp_str_buf_print(&buf, "%s%d%s", (j > 0 ? "&" : ""), item.num[j], + __kmp_hw_get_keyword(item.type)); + if (item.attr[j].is_core_type_valid()) + __kmp_str_buf_print( + &buf, ":%s", + __kmp_hw_get_core_type_keyword(item.attr[j].get_core_type())); + if (item.attr[j].is_core_eff_valid()) + __kmp_str_buf_print(&buf, ":eff%d", item.attr[j].get_core_eff()); + if (item.offset[j]) + __kmp_str_buf_print(&buf, "@%d", item.offset[j]); + } + } + __kmp_str_buf_print(buffer, "%s'\n", buf.str); + __kmp_str_buf_free(&buf); +} + +#if USE_ITT_BUILD +// ----------------------------------------------------------------------------- +// KMP_FORKJOIN_FRAMES + +static void __kmp_stg_parse_forkjoin_frames(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_forkjoin_frames); +} // __kmp_stg_parse_forkjoin_frames + +static void __kmp_stg_print_forkjoin_frames(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_forkjoin_frames); +} // __kmp_stg_print_forkjoin_frames + +// ----------------------------------------------------------------------------- +// KMP_FORKJOIN_FRAMES_MODE + +static void __kmp_stg_parse_forkjoin_frames_mode(char const *name, + char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, 3, &__kmp_forkjoin_frames_mode); +} // __kmp_stg_parse_forkjoin_frames + +static void __kmp_stg_print_forkjoin_frames_mode(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_int(buffer, name, __kmp_forkjoin_frames_mode); +} // __kmp_stg_print_forkjoin_frames +#endif /* USE_ITT_BUILD */ + +// ----------------------------------------------------------------------------- +// KMP_ENABLE_TASK_THROTTLING + +static void __kmp_stg_parse_task_throttling(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling); +} // __kmp_stg_parse_task_throttling + +static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling); +} // __kmp_stg_print_task_throttling + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// ----------------------------------------------------------------------------- +// KMP_USER_LEVEL_MWAIT + +static void __kmp_stg_parse_user_level_mwait(char const *name, + char const *value, void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_user_level_mwait); +} // __kmp_stg_parse_user_level_mwait + +static void __kmp_stg_print_user_level_mwait(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_user_level_mwait); +} // __kmp_stg_print_user_level_mwait + +// ----------------------------------------------------------------------------- +// KMP_MWAIT_HINTS + +static void __kmp_stg_parse_mwait_hints(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_mwait_hints); +} // __kmp_stg_parse_mwait_hints + +static void 
__kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_mwait_hints); +} // __kmp_stg_print_mwait_hints + +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + +#if KMP_HAVE_UMWAIT +// ----------------------------------------------------------------------------- +// KMP_TPAUSE +// 0 = don't use TPAUSE, 1 = use C0.1 state, 2 = use C0.2 state + +static void __kmp_stg_parse_tpause(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_tpause_state); + if (__kmp_tpause_state != 0) { + // The actual hint passed to tpause is: 0 for C0.2 and 1 for C0.1 + if (__kmp_tpause_state == 2) // use C0.2 + __kmp_tpause_hint = 0; // default was set to 1 for C0.1 + } +} // __kmp_stg_parse_tpause + +static void __kmp_stg_print_tpause(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_tpause_state); +} // __kmp_stg_print_tpause +#endif // KMP_HAVE_UMWAIT + +// ----------------------------------------------------------------------------- +// OMP_DISPLAY_ENV + +static void __kmp_stg_parse_omp_display_env(char const *name, char const *value, + void *data) { + if (__kmp_str_match("VERBOSE", 1, value)) { + __kmp_display_env_verbose = TRUE; + } else { + __kmp_stg_parse_bool(name, value, &__kmp_display_env); + } +} // __kmp_stg_parse_omp_display_env + +static void __kmp_stg_print_omp_display_env(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_display_env_verbose) { + __kmp_stg_print_str(buffer, name, "VERBOSE"); + } else { + __kmp_stg_print_bool(buffer, name, __kmp_display_env); + } +} // __kmp_stg_print_omp_display_env + +static void __kmp_stg_parse_omp_cancellation(char const *name, + char const *value, void *data) { + if (TCR_4(__kmp_init_parallel)) { + KMP_WARNING(EnvParallelWarn, name); + return; + } // read value before first parallel only + __kmp_stg_parse_bool(name, value, &__kmp_omp_cancellation); +} // __kmp_stg_parse_omp_cancellation + +static void __kmp_stg_print_omp_cancellation(kmp_str_buf_t *buffer, + char const *name, void *data) { + __kmp_stg_print_bool(buffer, name, __kmp_omp_cancellation); +} // __kmp_stg_print_omp_cancellation + +#if OMPT_SUPPORT +int __kmp_tool = 1; + +static void __kmp_stg_parse_omp_tool(char const *name, char const *value, + void *data) { + __kmp_stg_parse_bool(name, value, &__kmp_tool); +} // __kmp_stg_parse_omp_tool + +static void __kmp_stg_print_omp_tool(kmp_str_buf_t *buffer, char const *name, + void *data) { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_BOOL_EX(name, __kmp_tool, "enabled", "disabled"); + } else { + __kmp_str_buf_print(buffer, " %s=%s\n", name, + __kmp_tool ? 
"enabled" : "disabled"); + } +} // __kmp_stg_print_omp_tool + +char *__kmp_tool_libraries = NULL; + +static void __kmp_stg_parse_omp_tool_libraries(char const *name, + char const *value, void *data) { + __kmp_stg_parse_str(name, value, &__kmp_tool_libraries); +} // __kmp_stg_parse_omp_tool_libraries + +static void __kmp_stg_print_omp_tool_libraries(kmp_str_buf_t *buffer, + char const *name, void *data) { + if (__kmp_tool_libraries) + __kmp_stg_print_str(buffer, name, __kmp_tool_libraries); + else { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } +} // __kmp_stg_print_omp_tool_libraries + +char *__kmp_tool_verbose_init = NULL; + +static void __kmp_stg_parse_omp_tool_verbose_init(char const *name, + char const *value, + void *data) { + __kmp_stg_parse_str(name, value, &__kmp_tool_verbose_init); +} // __kmp_stg_parse_omp_tool_libraries + +static void __kmp_stg_print_omp_tool_verbose_init(kmp_str_buf_t *buffer, + char const *name, + void *data) { + if (__kmp_tool_verbose_init) + __kmp_stg_print_str(buffer, name, __kmp_tool_verbose_init); + else { + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME; + } else { + __kmp_str_buf_print(buffer, " %s", name); + } + __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); + } +} // __kmp_stg_print_omp_tool_verbose_init + +#endif + +// Table. + +static kmp_setting_t __kmp_stg_table[] = { + + {"KMP_ALL_THREADS", __kmp_stg_parse_device_thread_limit, NULL, NULL, 0, 0}, + {"KMP_BLOCKTIME", __kmp_stg_parse_blocktime, __kmp_stg_print_blocktime, + NULL, 0, 0}, + {"KMP_USE_YIELD", __kmp_stg_parse_use_yield, __kmp_stg_print_use_yield, + NULL, 0, 0}, + {"KMP_DUPLICATE_LIB_OK", __kmp_stg_parse_duplicate_lib_ok, + __kmp_stg_print_duplicate_lib_ok, NULL, 0, 0}, + {"KMP_LIBRARY", __kmp_stg_parse_wait_policy, __kmp_stg_print_wait_policy, + NULL, 0, 0}, + {"KMP_DEVICE_THREAD_LIMIT", __kmp_stg_parse_device_thread_limit, + __kmp_stg_print_device_thread_limit, NULL, 0, 0}, +#if KMP_USE_MONITOR + {"KMP_MONITOR_STACKSIZE", __kmp_stg_parse_monitor_stacksize, + __kmp_stg_print_monitor_stacksize, NULL, 0, 0}, +#endif + {"KMP_SETTINGS", __kmp_stg_parse_settings, __kmp_stg_print_settings, NULL, + 0, 0}, + {"KMP_STACKOFFSET", __kmp_stg_parse_stackoffset, + __kmp_stg_print_stackoffset, NULL, 0, 0}, + {"KMP_STACKSIZE", __kmp_stg_parse_stacksize, __kmp_stg_print_stacksize, + NULL, 0, 0}, + {"KMP_STACKPAD", __kmp_stg_parse_stackpad, __kmp_stg_print_stackpad, NULL, + 0, 0}, + {"KMP_VERSION", __kmp_stg_parse_version, __kmp_stg_print_version, NULL, 0, + 0}, + {"KMP_WARNINGS", __kmp_stg_parse_warnings, __kmp_stg_print_warnings, NULL, + 0, 0}, + + {"KMP_NESTING_MODE", __kmp_stg_parse_nesting_mode, + __kmp_stg_print_nesting_mode, NULL, 0, 0}, + {"OMP_NESTED", __kmp_stg_parse_nested, __kmp_stg_print_nested, NULL, 0, 0}, + {"OMP_NUM_THREADS", __kmp_stg_parse_num_threads, + __kmp_stg_print_num_threads, NULL, 0, 0}, + {"OMP_STACKSIZE", __kmp_stg_parse_stacksize, __kmp_stg_print_stacksize, + NULL, 0, 0}, + + {"KMP_TASKING", __kmp_stg_parse_tasking, __kmp_stg_print_tasking, NULL, 0, + 0}, + {"KMP_TASK_STEALING_CONSTRAINT", __kmp_stg_parse_task_stealing, + __kmp_stg_print_task_stealing, NULL, 0, 0}, + {"OMP_MAX_ACTIVE_LEVELS", __kmp_stg_parse_max_active_levels, + __kmp_stg_print_max_active_levels, NULL, 0, 0}, + {"OMP_DEFAULT_DEVICE", __kmp_stg_parse_default_device, + __kmp_stg_print_default_device, NULL, 0, 0}, + {"OMP_TARGET_OFFLOAD", 
__kmp_stg_parse_target_offload, + __kmp_stg_print_target_offload, NULL, 0, 0}, + {"OMP_MAX_TASK_PRIORITY", __kmp_stg_parse_max_task_priority, + __kmp_stg_print_max_task_priority, NULL, 0, 0}, + {"KMP_TASKLOOP_MIN_TASKS", __kmp_stg_parse_taskloop_min_tasks, + __kmp_stg_print_taskloop_min_tasks, NULL, 0, 0}, + {"OMP_THREAD_LIMIT", __kmp_stg_parse_thread_limit, + __kmp_stg_print_thread_limit, NULL, 0, 0}, + {"KMP_TEAMS_THREAD_LIMIT", __kmp_stg_parse_teams_thread_limit, + __kmp_stg_print_teams_thread_limit, NULL, 0, 0}, + {"OMP_NUM_TEAMS", __kmp_stg_parse_nteams, __kmp_stg_print_nteams, NULL, 0, + 0}, + {"OMP_TEAMS_THREAD_LIMIT", __kmp_stg_parse_teams_th_limit, + __kmp_stg_print_teams_th_limit, NULL, 0, 0}, + {"OMP_WAIT_POLICY", __kmp_stg_parse_wait_policy, + __kmp_stg_print_wait_policy, NULL, 0, 0}, + {"KMP_DISP_NUM_BUFFERS", __kmp_stg_parse_disp_buffers, + __kmp_stg_print_disp_buffers, NULL, 0, 0}, +#if KMP_NESTED_HOT_TEAMS + {"KMP_HOT_TEAMS_MAX_LEVEL", __kmp_stg_parse_hot_teams_level, + __kmp_stg_print_hot_teams_level, NULL, 0, 0}, + {"KMP_HOT_TEAMS_MODE", __kmp_stg_parse_hot_teams_mode, + __kmp_stg_print_hot_teams_mode, NULL, 0, 0}, +#endif // KMP_NESTED_HOT_TEAMS + +#if KMP_HANDLE_SIGNALS + {"KMP_HANDLE_SIGNALS", __kmp_stg_parse_handle_signals, + __kmp_stg_print_handle_signals, NULL, 0, 0}, +#endif + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + {"KMP_INHERIT_FP_CONTROL", __kmp_stg_parse_inherit_fp_control, + __kmp_stg_print_inherit_fp_control, NULL, 0, 0}, +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#ifdef KMP_GOMP_COMPAT + {"GOMP_STACKSIZE", __kmp_stg_parse_stacksize, NULL, NULL, 0, 0}, +#endif + +#ifdef KMP_DEBUG + {"KMP_A_DEBUG", __kmp_stg_parse_a_debug, __kmp_stg_print_a_debug, NULL, 0, + 0}, + {"KMP_B_DEBUG", __kmp_stg_parse_b_debug, __kmp_stg_print_b_debug, NULL, 0, + 0}, + {"KMP_C_DEBUG", __kmp_stg_parse_c_debug, __kmp_stg_print_c_debug, NULL, 0, + 0}, + {"KMP_D_DEBUG", __kmp_stg_parse_d_debug, __kmp_stg_print_d_debug, NULL, 0, + 0}, + {"KMP_E_DEBUG", __kmp_stg_parse_e_debug, __kmp_stg_print_e_debug, NULL, 0, + 0}, + {"KMP_F_DEBUG", __kmp_stg_parse_f_debug, __kmp_stg_print_f_debug, NULL, 0, + 0}, + {"KMP_DEBUG", __kmp_stg_parse_debug, NULL, /* no print */ NULL, 0, 0}, + {"KMP_DEBUG_BUF", __kmp_stg_parse_debug_buf, __kmp_stg_print_debug_buf, + NULL, 0, 0}, + {"KMP_DEBUG_BUF_ATOMIC", __kmp_stg_parse_debug_buf_atomic, + __kmp_stg_print_debug_buf_atomic, NULL, 0, 0}, + {"KMP_DEBUG_BUF_CHARS", __kmp_stg_parse_debug_buf_chars, + __kmp_stg_print_debug_buf_chars, NULL, 0, 0}, + {"KMP_DEBUG_BUF_LINES", __kmp_stg_parse_debug_buf_lines, + __kmp_stg_print_debug_buf_lines, NULL, 0, 0}, + {"KMP_DIAG", __kmp_stg_parse_diag, __kmp_stg_print_diag, NULL, 0, 0}, + + {"KMP_PAR_RANGE", __kmp_stg_parse_par_range_env, + __kmp_stg_print_par_range_env, NULL, 0, 0}, +#endif // KMP_DEBUG + + {"KMP_ALIGN_ALLOC", __kmp_stg_parse_align_alloc, + __kmp_stg_print_align_alloc, NULL, 0, 0}, + + {"KMP_PLAIN_BARRIER", __kmp_stg_parse_barrier_branch_bit, + __kmp_stg_print_barrier_branch_bit, NULL, 0, 0}, + {"KMP_PLAIN_BARRIER_PATTERN", __kmp_stg_parse_barrier_pattern, + __kmp_stg_print_barrier_pattern, NULL, 0, 0}, + {"KMP_FORKJOIN_BARRIER", __kmp_stg_parse_barrier_branch_bit, + __kmp_stg_print_barrier_branch_bit, NULL, 0, 0}, + {"KMP_FORKJOIN_BARRIER_PATTERN", __kmp_stg_parse_barrier_pattern, + __kmp_stg_print_barrier_pattern, NULL, 0, 0}, +#if KMP_FAST_REDUCTION_BARRIER + {"KMP_REDUCTION_BARRIER", __kmp_stg_parse_barrier_branch_bit, + __kmp_stg_print_barrier_branch_bit, NULL, 0, 0}, + {"KMP_REDUCTION_BARRIER_PATTERN", 
__kmp_stg_parse_barrier_pattern, + __kmp_stg_print_barrier_pattern, NULL, 0, 0}, +#endif + + {"KMP_ABORT_DELAY", __kmp_stg_parse_abort_delay, + __kmp_stg_print_abort_delay, NULL, 0, 0}, + {"KMP_CPUINFO_FILE", __kmp_stg_parse_cpuinfo_file, + __kmp_stg_print_cpuinfo_file, NULL, 0, 0}, + {"KMP_FORCE_REDUCTION", __kmp_stg_parse_force_reduction, + __kmp_stg_print_force_reduction, NULL, 0, 0}, + {"KMP_DETERMINISTIC_REDUCTION", __kmp_stg_parse_force_reduction, + __kmp_stg_print_force_reduction, NULL, 0, 0}, + {"KMP_STORAGE_MAP", __kmp_stg_parse_storage_map, + __kmp_stg_print_storage_map, NULL, 0, 0}, + {"KMP_ALL_THREADPRIVATE", __kmp_stg_parse_all_threadprivate, + __kmp_stg_print_all_threadprivate, NULL, 0, 0}, + {"KMP_FOREIGN_THREADS_THREADPRIVATE", + __kmp_stg_parse_foreign_threads_threadprivate, + __kmp_stg_print_foreign_threads_threadprivate, NULL, 0, 0}, + +#if KMP_AFFINITY_SUPPORTED + {"KMP_AFFINITY", __kmp_stg_parse_affinity, __kmp_stg_print_affinity, NULL, + 0, 0}, + {"KMP_HIDDEN_HELPER_AFFINITY", __kmp_stg_parse_hh_affinity, + __kmp_stg_print_hh_affinity, NULL, 0, 0}, +#ifdef KMP_GOMP_COMPAT + {"GOMP_CPU_AFFINITY", __kmp_stg_parse_gomp_cpu_affinity, NULL, + /* no print */ NULL, 0, 0}, +#endif /* KMP_GOMP_COMPAT */ + {"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind, + NULL, 0, 0}, + {"KMP_TEAMS_PROC_BIND", __kmp_stg_parse_teams_proc_bind, + __kmp_stg_print_teams_proc_bind, NULL, 0, 0}, + {"OMP_PLACES", __kmp_stg_parse_places, __kmp_stg_print_places, NULL, 0, 0}, + {"KMP_TOPOLOGY_METHOD", __kmp_stg_parse_topology_method, + __kmp_stg_print_topology_method, NULL, 0, 0}, + +#else + + // KMP_AFFINITY is not supported on OS X*, nor is OMP_PLACES. + // OMP_PROC_BIND and proc-bind-var are supported, however. + {"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind, + NULL, 0, 0}, + +#endif // KMP_AFFINITY_SUPPORTED + {"OMP_DISPLAY_AFFINITY", __kmp_stg_parse_display_affinity, + __kmp_stg_print_display_affinity, NULL, 0, 0}, + {"OMP_AFFINITY_FORMAT", __kmp_stg_parse_affinity_format, + __kmp_stg_print_affinity_format, NULL, 0, 0}, + {"KMP_INIT_AT_FORK", __kmp_stg_parse_init_at_fork, + __kmp_stg_print_init_at_fork, NULL, 0, 0}, + {"KMP_SCHEDULE", __kmp_stg_parse_schedule, __kmp_stg_print_schedule, NULL, + 0, 0}, + {"OMP_SCHEDULE", __kmp_stg_parse_omp_schedule, __kmp_stg_print_omp_schedule, + NULL, 0, 0}, +#if KMP_USE_HIER_SCHED + {"KMP_DISP_HAND_THREAD", __kmp_stg_parse_kmp_hand_thread, + __kmp_stg_print_kmp_hand_thread, NULL, 0, 0}, +#endif + {"KMP_FORCE_MONOTONIC_DYNAMIC_SCHEDULE", + __kmp_stg_parse_kmp_force_monotonic, __kmp_stg_print_kmp_force_monotonic, + NULL, 0, 0}, + {"KMP_ATOMIC_MODE", __kmp_stg_parse_atomic_mode, + __kmp_stg_print_atomic_mode, NULL, 0, 0}, + {"KMP_CONSISTENCY_CHECK", __kmp_stg_parse_consistency_check, + __kmp_stg_print_consistency_check, NULL, 0, 0}, + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + {"KMP_ITT_PREPARE_DELAY", __kmp_stg_parse_itt_prepare_delay, + __kmp_stg_print_itt_prepare_delay, NULL, 0, 0}, +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + {"KMP_MALLOC_POOL_INCR", __kmp_stg_parse_malloc_pool_incr, + __kmp_stg_print_malloc_pool_incr, NULL, 0, 0}, + {"KMP_GTID_MODE", __kmp_stg_parse_gtid_mode, __kmp_stg_print_gtid_mode, + NULL, 0, 0}, + {"OMP_DYNAMIC", __kmp_stg_parse_omp_dynamic, __kmp_stg_print_omp_dynamic, + NULL, 0, 0}, + {"KMP_DYNAMIC_MODE", __kmp_stg_parse_kmp_dynamic_mode, + __kmp_stg_print_kmp_dynamic_mode, NULL, 0, 0}, + +#ifdef USE_LOAD_BALANCE + {"KMP_LOAD_BALANCE_INTERVAL", __kmp_stg_parse_ld_balance_interval, + 
__kmp_stg_print_ld_balance_interval, NULL, 0, 0}, +#endif + + {"KMP_NUM_LOCKS_IN_BLOCK", __kmp_stg_parse_lock_block, + __kmp_stg_print_lock_block, NULL, 0, 0}, + {"KMP_LOCK_KIND", __kmp_stg_parse_lock_kind, __kmp_stg_print_lock_kind, + NULL, 0, 0}, + {"KMP_SPIN_BACKOFF_PARAMS", __kmp_stg_parse_spin_backoff_params, + __kmp_stg_print_spin_backoff_params, NULL, 0, 0}, +#if KMP_USE_ADAPTIVE_LOCKS + {"KMP_ADAPTIVE_LOCK_PROPS", __kmp_stg_parse_adaptive_lock_props, + __kmp_stg_print_adaptive_lock_props, NULL, 0, 0}, +#if KMP_DEBUG_ADAPTIVE_LOCKS + {"KMP_SPECULATIVE_STATSFILE", __kmp_stg_parse_speculative_statsfile, + __kmp_stg_print_speculative_statsfile, NULL, 0, 0}, +#endif +#endif // KMP_USE_ADAPTIVE_LOCKS + {"KMP_PLACE_THREADS", __kmp_stg_parse_hw_subset, __kmp_stg_print_hw_subset, + NULL, 0, 0}, + {"KMP_HW_SUBSET", __kmp_stg_parse_hw_subset, __kmp_stg_print_hw_subset, + NULL, 0, 0}, +#if USE_ITT_BUILD + {"KMP_FORKJOIN_FRAMES", __kmp_stg_parse_forkjoin_frames, + __kmp_stg_print_forkjoin_frames, NULL, 0, 0}, + {"KMP_FORKJOIN_FRAMES_MODE", __kmp_stg_parse_forkjoin_frames_mode, + __kmp_stg_print_forkjoin_frames_mode, NULL, 0, 0}, +#endif + {"KMP_ENABLE_TASK_THROTTLING", __kmp_stg_parse_task_throttling, + __kmp_stg_print_task_throttling, NULL, 0, 0}, + + {"OMP_DISPLAY_ENV", __kmp_stg_parse_omp_display_env, + __kmp_stg_print_omp_display_env, NULL, 0, 0}, + {"OMP_CANCELLATION", __kmp_stg_parse_omp_cancellation, + __kmp_stg_print_omp_cancellation, NULL, 0, 0}, + {"OMP_ALLOCATOR", __kmp_stg_parse_allocator, __kmp_stg_print_allocator, + NULL, 0, 0}, + {"LIBOMP_USE_HIDDEN_HELPER_TASK", __kmp_stg_parse_use_hidden_helper, + __kmp_stg_print_use_hidden_helper, NULL, 0, 0}, + {"LIBOMP_NUM_HIDDEN_HELPER_THREADS", + __kmp_stg_parse_num_hidden_helper_threads, + __kmp_stg_print_num_hidden_helper_threads, NULL, 0, 0}, +#if OMPX_TASKGRAPH + {"KMP_MAX_TDGS", __kmp_stg_parse_max_tdgs, __kmp_std_print_max_tdgs, NULL, + 0, 0}, + {"KMP_TDG_DOT", __kmp_stg_parse_tdg_dot, __kmp_stg_print_tdg_dot, NULL, 0, 0}, +#endif + +#if OMPT_SUPPORT + {"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0, + 0}, + {"OMP_TOOL_LIBRARIES", __kmp_stg_parse_omp_tool_libraries, + __kmp_stg_print_omp_tool_libraries, NULL, 0, 0}, + {"OMP_TOOL_VERBOSE_INIT", __kmp_stg_parse_omp_tool_verbose_init, + __kmp_stg_print_omp_tool_verbose_init, NULL, 0, 0}, +#endif + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + {"KMP_USER_LEVEL_MWAIT", __kmp_stg_parse_user_level_mwait, + __kmp_stg_print_user_level_mwait, NULL, 0, 0}, + {"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints, + __kmp_stg_print_mwait_hints, NULL, 0, 0}, +#endif + +#if KMP_HAVE_UMWAIT + {"KMP_TPAUSE", __kmp_stg_parse_tpause, __kmp_stg_print_tpause, NULL, 0, 0}, +#endif + {"", NULL, NULL, NULL, 0, 0}}; // settings + +static int const __kmp_stg_count = + sizeof(__kmp_stg_table) / sizeof(kmp_setting_t); + +static inline kmp_setting_t *__kmp_stg_find(char const *name) { + + int i; + if (name != NULL) { + for (i = 0; i < __kmp_stg_count; ++i) { + if (strcmp(__kmp_stg_table[i].name, name) == 0) { + return &__kmp_stg_table[i]; + } + } + } + return NULL; + +} // __kmp_stg_find + +static int __kmp_stg_cmp(void const *_a, void const *_b) { + const kmp_setting_t *a = RCAST(const kmp_setting_t *, _a); + const kmp_setting_t *b = RCAST(const kmp_setting_t *, _b); + + // Process KMP_AFFINITY last. + // It needs to come after OMP_PLACES and GOMP_CPU_AFFINITY. 
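// Illustrative sketch, not part of the patch: __kmp_stg_cmp (whose body
// continues just below) pins KMP_AFFINITY to the end of the sorted table so it
// is processed after OMP_PLACES and GOMP_CPU_AFFINITY. The same "sort one key
// last" idiom on plain strings; the entries in main() are examples only.
#include <cstdio>
#include <cstdlib>
#include <cstring>

static int cmp_pin_affinity_last(const void *a, const void *b) {
  const char *x = *(const char *const *)a;
  const char *y = *(const char *const *)b;
  if (strcmp(x, "KMP_AFFINITY") == 0)
    return strcmp(y, "KMP_AFFINITY") == 0 ? 0 : 1; // x always sorts last
  if (strcmp(y, "KMP_AFFINITY") == 0)
    return -1;                                     // everything sorts before y
  return strcmp(x, y);                             // otherwise plain name order
}

int main() {
  const char *names[] = {"OMP_PLACES", "KMP_AFFINITY", "GOMP_CPU_AFFINITY"};
  qsort(names, sizeof(names) / sizeof(names[0]), sizeof(names[0]),
        cmp_pin_affinity_last);
  for (const char *n : names)
    printf("%s\n", n); // GOMP_CPU_AFFINITY, OMP_PLACES, KMP_AFFINITY
  return 0;
}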
+ if (strcmp(a->name, "KMP_AFFINITY") == 0) { + if (strcmp(b->name, "KMP_AFFINITY") == 0) { + return 0; + } + return 1; + } else if (strcmp(b->name, "KMP_AFFINITY") == 0) { + return -1; + } + return strcmp(a->name, b->name); +} // __kmp_stg_cmp + +static void __kmp_stg_init(void) { + + static int initialized = 0; + + if (!initialized) { + + // Sort table. + qsort(__kmp_stg_table, __kmp_stg_count - 1, sizeof(kmp_setting_t), + __kmp_stg_cmp); + + { // Initialize *_STACKSIZE data. + kmp_setting_t *kmp_stacksize = + __kmp_stg_find("KMP_STACKSIZE"); // 1st priority. +#ifdef KMP_GOMP_COMPAT + kmp_setting_t *gomp_stacksize = + __kmp_stg_find("GOMP_STACKSIZE"); // 2nd priority. +#endif + kmp_setting_t *omp_stacksize = + __kmp_stg_find("OMP_STACKSIZE"); // 3rd priority. + + // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround. + // !!! Compiler does not understand rivals is used and optimizes out + // assignments + // !!! rivals[ i ++ ] = ...; + static kmp_setting_t *volatile rivals[4]; + static kmp_stg_ss_data_t kmp_data = {1, CCAST(kmp_setting_t **, rivals)}; +#ifdef KMP_GOMP_COMPAT + static kmp_stg_ss_data_t gomp_data = {1024, + CCAST(kmp_setting_t **, rivals)}; +#endif + static kmp_stg_ss_data_t omp_data = {1024, + CCAST(kmp_setting_t **, rivals)}; + int i = 0; + + rivals[i++] = kmp_stacksize; +#ifdef KMP_GOMP_COMPAT + if (gomp_stacksize != NULL) { + rivals[i++] = gomp_stacksize; + } +#endif + rivals[i++] = omp_stacksize; + rivals[i++] = NULL; + + kmp_stacksize->data = &kmp_data; +#ifdef KMP_GOMP_COMPAT + if (gomp_stacksize != NULL) { + gomp_stacksize->data = &gomp_data; + } +#endif + omp_stacksize->data = &omp_data; + } + + { // Initialize KMP_LIBRARY and OMP_WAIT_POLICY data. + kmp_setting_t *kmp_library = + __kmp_stg_find("KMP_LIBRARY"); // 1st priority. + kmp_setting_t *omp_wait_policy = + __kmp_stg_find("OMP_WAIT_POLICY"); // 2nd priority. + + // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround. + static kmp_setting_t *volatile rivals[3]; + static kmp_stg_wp_data_t kmp_data = {0, CCAST(kmp_setting_t **, rivals)}; + static kmp_stg_wp_data_t omp_data = {1, CCAST(kmp_setting_t **, rivals)}; + int i = 0; + + rivals[i++] = kmp_library; + if (omp_wait_policy != NULL) { + rivals[i++] = omp_wait_policy; + } + rivals[i++] = NULL; + + kmp_library->data = &kmp_data; + if (omp_wait_policy != NULL) { + omp_wait_policy->data = &omp_data; + } + } + + { // Initialize KMP_DEVICE_THREAD_LIMIT and KMP_ALL_THREADS + kmp_setting_t *kmp_device_thread_limit = + __kmp_stg_find("KMP_DEVICE_THREAD_LIMIT"); // 1st priority. + kmp_setting_t *kmp_all_threads = + __kmp_stg_find("KMP_ALL_THREADS"); // 2nd priority. + + // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround. + static kmp_setting_t *volatile rivals[3]; + int i = 0; + + rivals[i++] = kmp_device_thread_limit; + rivals[i++] = kmp_all_threads; + rivals[i++] = NULL; + + kmp_device_thread_limit->data = CCAST(kmp_setting_t **, rivals); + kmp_all_threads->data = CCAST(kmp_setting_t **, rivals); + } + + { // Initialize KMP_HW_SUBSET and KMP_PLACE_THREADS + // 1st priority + kmp_setting_t *kmp_hw_subset = __kmp_stg_find("KMP_HW_SUBSET"); + // 2nd priority + kmp_setting_t *kmp_place_threads = __kmp_stg_find("KMP_PLACE_THREADS"); + + // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround. 
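// Illustrative sketch, not part of the patch: the NULL-terminated "rivals"
// lists wired up here encode priority (earlier entries win). The check that
// consumes them, __kmp_stg_check_rivals, appears further down; reduced to a
// toy struct it amounts to the following. All names and values are examples.
#include <cstdio>
#include <cstring>

struct setting {
  const char *name;
  int set; // 1 if this environment variable was present
};

// Returns 1 if `name` should be ignored because a higher-priority rival
// (listed before it) is already set. `name` must appear in the list.
static int rival_already_set(setting **rivals, const char *name) {
  for (int i = 0; strcmp(rivals[i]->name, name) != 0; ++i)
    if (rivals[i]->set)
      return 1;
  return 0;
}

int main() {
  setting kmp_stacksize = {"KMP_STACKSIZE", 1}; // suppose already parsed
  setting omp_stacksize = {"OMP_STACKSIZE", 0};
  setting *rivals[] = {&kmp_stacksize, &omp_stacksize, NULL};
  if (rival_already_set(rivals, "OMP_STACKSIZE"))
    printf("OMP_STACKSIZE ignored: KMP_STACKSIZE takes precedence\n");
  return 0;
}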
+ static kmp_setting_t *volatile rivals[3]; + int i = 0; + + rivals[i++] = kmp_hw_subset; + rivals[i++] = kmp_place_threads; + rivals[i++] = NULL; + + kmp_hw_subset->data = CCAST(kmp_setting_t **, rivals); + kmp_place_threads->data = CCAST(kmp_setting_t **, rivals); + } + +#if KMP_AFFINITY_SUPPORTED + { // Initialize KMP_AFFINITY, GOMP_CPU_AFFINITY, and OMP_PROC_BIND data. + kmp_setting_t *kmp_affinity = + __kmp_stg_find("KMP_AFFINITY"); // 1st priority. + KMP_DEBUG_ASSERT(kmp_affinity != NULL); + +#ifdef KMP_GOMP_COMPAT + kmp_setting_t *gomp_cpu_affinity = + __kmp_stg_find("GOMP_CPU_AFFINITY"); // 2nd priority. + KMP_DEBUG_ASSERT(gomp_cpu_affinity != NULL); +#endif + + kmp_setting_t *omp_proc_bind = + __kmp_stg_find("OMP_PROC_BIND"); // 3rd priority. + KMP_DEBUG_ASSERT(omp_proc_bind != NULL); + + // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround. + static kmp_setting_t *volatile rivals[4]; + int i = 0; + + rivals[i++] = kmp_affinity; + +#ifdef KMP_GOMP_COMPAT + rivals[i++] = gomp_cpu_affinity; + gomp_cpu_affinity->data = CCAST(kmp_setting_t **, rivals); +#endif + + rivals[i++] = omp_proc_bind; + omp_proc_bind->data = CCAST(kmp_setting_t **, rivals); + rivals[i++] = NULL; + + static kmp_setting_t *volatile places_rivals[4]; + i = 0; + + kmp_setting_t *omp_places = __kmp_stg_find("OMP_PLACES"); // 3rd priority. + KMP_DEBUG_ASSERT(omp_places != NULL); + + places_rivals[i++] = kmp_affinity; +#ifdef KMP_GOMP_COMPAT + places_rivals[i++] = gomp_cpu_affinity; +#endif + places_rivals[i++] = omp_places; + omp_places->data = CCAST(kmp_setting_t **, places_rivals); + places_rivals[i++] = NULL; + } +#else +// KMP_AFFINITY not supported, so OMP_PROC_BIND has no rivals. +// OMP_PLACES not supported yet. +#endif // KMP_AFFINITY_SUPPORTED + + { // Initialize KMP_DETERMINISTIC_REDUCTION and KMP_FORCE_REDUCTION data. + kmp_setting_t *kmp_force_red = + __kmp_stg_find("KMP_FORCE_REDUCTION"); // 1st priority. + kmp_setting_t *kmp_determ_red = + __kmp_stg_find("KMP_DETERMINISTIC_REDUCTION"); // 2nd priority. + + // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround. + static kmp_setting_t *volatile rivals[3]; + static kmp_stg_fr_data_t force_data = {1, + CCAST(kmp_setting_t **, rivals)}; + static kmp_stg_fr_data_t determ_data = {0, + CCAST(kmp_setting_t **, rivals)}; + int i = 0; + + rivals[i++] = kmp_force_red; + if (kmp_determ_red != NULL) { + rivals[i++] = kmp_determ_red; + } + rivals[i++] = NULL; + + kmp_force_red->data = &force_data; + if (kmp_determ_red != NULL) { + kmp_determ_red->data = &determ_data; + } + } + + initialized = 1; + } + + // Reset flags. + int i; + for (i = 0; i < __kmp_stg_count; ++i) { + __kmp_stg_table[i].set = 0; + } + +} // __kmp_stg_init + +static void __kmp_stg_parse(char const *name, char const *value) { + // On Windows* OS there are some nameless variables like "C:=C:\" (yeah, + // really nameless, they are presented in environment block as + // "=C:=C\\\x00=D:=D:\\\x00...", so let us skip them. + if (name[0] == 0) { + return; + } + + if (value != NULL) { + kmp_setting_t *setting = __kmp_stg_find(name); + if (setting != NULL) { + setting->parse(name, value, setting->data); + setting->defined = 1; + } + } + +} // __kmp_stg_parse + +static int __kmp_stg_check_rivals( // 0 -- Ok, 1 -- errors found. + char const *name, // Name of variable. + char const *value, // Value of the variable. + kmp_setting_t **rivals // List of rival settings (must include current one). 
+) { + + if (rivals == NULL) { + return 0; + } + + // Loop thru higher priority settings (listed before current). + int i = 0; + for (; strcmp(rivals[i]->name, name) != 0; i++) { + KMP_DEBUG_ASSERT(rivals[i] != NULL); + +#if KMP_AFFINITY_SUPPORTED + if (rivals[i] == __kmp_affinity_notype) { + // If KMP_AFFINITY is specified without a type name, + // it does not rival OMP_PROC_BIND or GOMP_CPU_AFFINITY. + continue; + } +#endif + + if (rivals[i]->set) { + KMP_WARNING(StgIgnored, name, rivals[i]->name); + return 1; + } + } + + ++i; // Skip current setting. + return 0; + +} // __kmp_stg_check_rivals + +static int __kmp_env_toPrint(char const *name, int flag) { + int rc = 0; + kmp_setting_t *setting = __kmp_stg_find(name); + if (setting != NULL) { + rc = setting->defined; + if (flag >= 0) { + setting->defined = flag; + } + } + return rc; +} + +#if defined(KMP_DEBUG) && KMP_AFFINITY_SUPPORTED +static void __kmp_print_affinity_settings(const kmp_affinity_t *affinity) { + K_DIAG(1, ("%s:\n", affinity->env_var)); + K_DIAG(1, (" type : %d\n", affinity->type)); + K_DIAG(1, (" compact : %d\n", affinity->compact)); + K_DIAG(1, (" offset : %d\n", affinity->offset)); + K_DIAG(1, (" verbose : %u\n", affinity->flags.verbose)); + K_DIAG(1, (" warnings : %u\n", affinity->flags.warnings)); + K_DIAG(1, (" respect : %u\n", affinity->flags.respect)); + K_DIAG(1, (" reset : %u\n", affinity->flags.reset)); + K_DIAG(1, (" dups : %u\n", affinity->flags.dups)); + K_DIAG(1, (" gran : %d\n", (int)affinity->gran)); + KMP_DEBUG_ASSERT(affinity->type != affinity_default); +} +#endif + +static void __kmp_aux_env_initialize(kmp_env_blk_t *block) { + + char const *value; + + /* OMP_NUM_THREADS */ + value = __kmp_env_blk_var(block, "OMP_NUM_THREADS"); + if (value) { + ompc_set_num_threads(__kmp_dflt_team_nth); + } + + /* KMP_BLOCKTIME */ + value = __kmp_env_blk_var(block, "KMP_BLOCKTIME"); + if (value) { + int gtid, tid; + kmp_info_t *thread; + + gtid = __kmp_entry_gtid(); + tid = __kmp_tid_from_gtid(gtid); + thread = __kmp_thread_from_gtid(gtid); + __kmp_aux_set_blocktime(__kmp_dflt_blocktime, thread, tid); + } + + /* OMP_NESTED */ + value = __kmp_env_blk_var(block, "OMP_NESTED"); + if (value) { + ompc_set_nested(__kmp_dflt_max_active_levels > 1); + } + + /* OMP_DYNAMIC */ + value = __kmp_env_blk_var(block, "OMP_DYNAMIC"); + if (value) { + ompc_set_dynamic(__kmp_global.g.g_dynamic); + } +} + +void __kmp_env_initialize(char const *string) { + + kmp_env_blk_t block; + int i; + + __kmp_stg_init(); + + // Hack!!! + if (string == NULL) { + // __kmp_max_nth = __kmp_sys_max_nth; + __kmp_threads_capacity = + __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); + } + __kmp_env_blk_init(&block, string); + + // update the set flag on all entries that have an env var + for (i = 0; i < block.count; ++i) { + if ((block.vars[i].name == NULL) || (*block.vars[i].name == '\0')) { + continue; + } + if (block.vars[i].value == NULL) { + continue; + } + kmp_setting_t *setting = __kmp_stg_find(block.vars[i].name); + if (setting != NULL) { + setting->set = 1; + } + } + + // We need to know if blocktime was set when processing OMP_WAIT_POLICY + blocktime_str = __kmp_env_blk_var(&block, "KMP_BLOCKTIME"); + + // Special case. If we parse environment, not a string, process KMP_WARNINGS + // first. + if (string == NULL) { + char const *name = "KMP_WARNINGS"; + char const *value = __kmp_env_blk_var(&block, name); + __kmp_stg_parse(name, value); + } + +#if KMP_AFFINITY_SUPPORTED + // Special case. 
KMP_AFFINITY is not a rival to other affinity env vars + // if no affinity type is specified. We want to allow + // KMP_AFFINITY=[no],verbose/[no]warnings/etc. to be enabled when + // specifying the affinity type via GOMP_CPU_AFFINITY or the OMP 4.0 + // affinity mechanism. + __kmp_affinity_notype = NULL; + char const *aff_str = __kmp_env_blk_var(&block, "KMP_AFFINITY"); + if (aff_str != NULL) { + // Check if the KMP_AFFINITY type is specified in the string. + // We just search the string for "compact", "scatter", etc. + // without really parsing the string. The syntax of the + // KMP_AFFINITY env var is such that none of the affinity + // type names can appear anywhere other that the type + // specifier, even as substrings. + // + // I can't find a case-insensitive version of strstr on Windows* OS. + // Use the case-sensitive version for now. AIX does the same. + +#if KMP_OS_WINDOWS || KMP_OS_AIX +#define FIND strstr +#else +#define FIND strcasestr +#endif + + if ((FIND(aff_str, "none") == NULL) && + (FIND(aff_str, "physical") == NULL) && + (FIND(aff_str, "logical") == NULL) && + (FIND(aff_str, "compact") == NULL) && + (FIND(aff_str, "scatter") == NULL) && + (FIND(aff_str, "explicit") == NULL) && + (FIND(aff_str, "balanced") == NULL) && + (FIND(aff_str, "disabled") == NULL)) { + __kmp_affinity_notype = __kmp_stg_find("KMP_AFFINITY"); + } else { + // A new affinity type is specified. + // Reset the affinity flags to their default values, + // in case this is called from kmp_set_defaults(). + __kmp_affinity.type = affinity_default; + __kmp_affinity.gran = KMP_HW_UNKNOWN; + __kmp_affinity_top_method = affinity_top_method_default; + __kmp_affinity.flags.respect = affinity_respect_mask_default; + } +#undef FIND + + // Also reset the affinity flags if OMP_PROC_BIND is specified. + aff_str = __kmp_env_blk_var(&block, "OMP_PROC_BIND"); + if (aff_str != NULL) { + __kmp_affinity.type = affinity_default; + __kmp_affinity.gran = KMP_HW_UNKNOWN; + __kmp_affinity_top_method = affinity_top_method_default; + __kmp_affinity.flags.respect = affinity_respect_mask_default; + } + } + +#endif /* KMP_AFFINITY_SUPPORTED */ + + // Set up the nested proc bind type vector. + if (__kmp_nested_proc_bind.bind_types == NULL) { + __kmp_nested_proc_bind.bind_types = + (kmp_proc_bind_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_proc_bind_t)); + if (__kmp_nested_proc_bind.bind_types == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + __kmp_nested_proc_bind.size = 1; + __kmp_nested_proc_bind.used = 1; +#if KMP_AFFINITY_SUPPORTED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_default; +#else + // default proc bind is false if affinity not supported + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; +#endif + } + + // Set up the affinity format ICV + // Grab the default affinity format string from the message catalog + kmp_msg_t m = + __kmp_msg_format(kmp_i18n_msg_AffFormatDefault, "%P", "%i", "%n", "%A"); + KMP_DEBUG_ASSERT(KMP_STRLEN(m.str) < KMP_AFFINITY_FORMAT_SIZE); + + if (__kmp_affinity_format == NULL) { + __kmp_affinity_format = + (char *)KMP_INTERNAL_MALLOC(sizeof(char) * KMP_AFFINITY_FORMAT_SIZE); + } + KMP_STRCPY_S(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, m.str); + __kmp_str_free(&m.str); + + // Now process all of the settings. + for (i = 0; i < block.count; ++i) { + __kmp_stg_parse(block.vars[i].name, block.vars[i].value); + } + + // If user locks have been allocated yet, don't reset the lock vptr table. 
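// Illustrative sketch, not part of the patch: the FIND macro used above falls
// back to case-sensitive strstr on Windows/AIX only because strcasestr is not
// available there. A portable case-insensitive substring test is short to
// write by hand; the KMP_AFFINITY value in main() is an example only.
#include <cctype>
#include <cstdio>

static const char *ci_strstr(const char *haystack, const char *needle) {
  if (!*needle)
    return haystack;
  for (; *haystack; ++haystack) {
    const char *h = haystack, *n = needle;
    while (*h && *n &&
           std::tolower((unsigned char)*h) == std::tolower((unsigned char)*n)) {
      ++h;
      ++n;
    }
    if (!*n)
      return haystack; // the whole needle matched at this position
  }
  return nullptr;
}

int main() {
  const char *aff = "granularity=fine,Compact,1,0"; // hypothetical value
  printf("type keyword present: %s\n",
         ci_strstr(aff, "compact") ? "yes" : "no");
  return 0;
}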
+ if (!__kmp_init_user_locks) { + if (__kmp_user_lock_kind == lk_default) { + __kmp_user_lock_kind = lk_queuing; + } +#if KMP_USE_DYNAMIC_LOCK + __kmp_init_dynamic_user_locks(); +#else + __kmp_set_user_lock_vptrs(__kmp_user_lock_kind); +#endif + } else { + KMP_DEBUG_ASSERT(string != NULL); // kmp_set_defaults() was called + KMP_DEBUG_ASSERT(__kmp_user_lock_kind != lk_default); +// Binds lock functions again to follow the transition between different +// KMP_CONSISTENCY_CHECK values. Calling this again is harmless as long +// as we do not allow lock kind changes after making a call to any +// user lock functions (true). +#if KMP_USE_DYNAMIC_LOCK + __kmp_init_dynamic_user_locks(); +#else + __kmp_set_user_lock_vptrs(__kmp_user_lock_kind); +#endif + } + +#if KMP_AFFINITY_SUPPORTED + + if (!TCR_4(__kmp_init_middle)) { +#if KMP_USE_HWLOC + // Force using hwloc when either tiles or numa nodes requested within + // KMP_HW_SUBSET or granularity setting and no other topology method + // is requested + if (__kmp_hw_subset && + __kmp_affinity_top_method == affinity_top_method_default) + if (__kmp_hw_subset->specified(KMP_HW_NUMA) || + __kmp_hw_subset->specified(KMP_HW_TILE) || + __kmp_affinity.gran == KMP_HW_TILE || + __kmp_affinity.gran == KMP_HW_NUMA) + __kmp_affinity_top_method = affinity_top_method_hwloc; + // Force using hwloc when tiles or numa nodes requested for OMP_PLACES + if (__kmp_affinity.gran == KMP_HW_NUMA || + __kmp_affinity.gran == KMP_HW_TILE) + __kmp_affinity_top_method = affinity_top_method_hwloc; +#endif + // Determine if the machine/OS is actually capable of supporting + // affinity. + const char *var = "KMP_AFFINITY"; + KMPAffinity::pick_api(); +#if KMP_USE_HWLOC + // If Hwloc topology discovery was requested but affinity was also disabled, + // then tell user that Hwloc request is being ignored and use default + // topology discovery method. + if (__kmp_affinity_top_method == affinity_top_method_hwloc && + __kmp_affinity_dispatch->get_api_type() != KMPAffinity::HWLOC) { + KMP_WARNING(AffIgnoringHwloc, var); + __kmp_affinity_top_method = affinity_top_method_all; + } +#endif + if (__kmp_affinity.type == affinity_disabled) { + KMP_AFFINITY_DISABLE(); + } else if (!KMP_AFFINITY_CAPABLE()) { + __kmp_affinity_dispatch->determine_capable(var); + if (!KMP_AFFINITY_CAPABLE()) { + if (__kmp_affinity.flags.verbose || + (__kmp_affinity.flags.warnings && + (__kmp_affinity.type != affinity_default) && + (__kmp_affinity.type != affinity_none) && + (__kmp_affinity.type != affinity_disabled))) { + KMP_WARNING(AffNotSupported, var); + } + __kmp_affinity.type = affinity_disabled; + __kmp_affinity.flags.respect = FALSE; + __kmp_affinity.gran = KMP_HW_THREAD; + } + } + + if (__kmp_affinity.type == affinity_disabled) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + } else if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_true) { + // OMP_PROC_BIND=true maps to OMP_PROC_BIND=spread. + __kmp_nested_proc_bind.bind_types[0] = proc_bind_spread; + } + + if (KMP_AFFINITY_CAPABLE()) { + +#if KMP_GROUP_AFFINITY + // This checks to see if the initial affinity mask is equal + // to a single windows processor group. If it is, then we do + // not respect the initial affinity mask and instead, use the + // entire machine. + bool exactly_one_group = false; + if (__kmp_num_proc_groups > 1) { + int group; + bool within_one_group; + // Get the initial affinity mask and determine if it is + // contained within a single group. 
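// Illustrative sketch, not part of the patch: the test being set up here boils
// down to "does the initial mask contain exactly as many set bits as the group
// has processors?". With a hypothetical 64-bit mask and group size it is just
// a popcount comparison; the real code walks a kmp_affin_mask_t instead.
#include <cstdint>
#include <cstdio>

static int popcount64(uint64_t m) {
  int n = 0;
  for (; m; m &= m - 1) // clear the lowest set bit each iteration
    ++n;
  return n;
}

int main() {
  uint64_t init_mask = 0xFFFFFFFFFFFFFFFFull; // hypothetical: all CPUs of one group
  int procs_in_group = 64;                    // hypothetical group size
  bool exactly_one_group = (popcount64(init_mask) == procs_in_group);
  printf("exactly_one_group = %d\n", (int)exactly_one_group);
  return 0;
}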
+ kmp_affin_mask_t *init_mask; + KMP_CPU_ALLOC(init_mask); + __kmp_get_system_affinity(init_mask, TRUE); + group = __kmp_get_proc_group(init_mask); + within_one_group = (group >= 0); + // If the initial affinity is within a single group, + // then determine if it is equal to that single group. + if (within_one_group) { + DWORD num_bits_in_group = __kmp_GetActiveProcessorCount(group); + DWORD num_bits_in_mask = 0; + for (int bit = init_mask->begin(); bit != init_mask->end(); + bit = init_mask->next(bit)) + num_bits_in_mask++; + exactly_one_group = (num_bits_in_group == num_bits_in_mask); + } + KMP_CPU_FREE(init_mask); + } + + // Handle the Win 64 group affinity stuff if there are multiple + // processor groups, or if the user requested it, and OMP 4.0 + // affinity is not in effect. + if (__kmp_num_proc_groups > 1 && + __kmp_affinity.type == affinity_default && + __kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { + // Do not respect the initial processor affinity mask if it is assigned + // exactly one Windows Processor Group since this is interpreted as the + // default OS assignment. Not respecting the mask allows the runtime to + // use all the logical processors in all groups. + if (__kmp_affinity.flags.respect == affinity_respect_mask_default && + exactly_one_group) { + __kmp_affinity.flags.respect = FALSE; + } + // Use compact affinity with anticipation of pinning to at least the + // group granularity since threads can only be bound to one group. + if (__kmp_affinity.type == affinity_default) { + __kmp_affinity.type = affinity_compact; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + } + if (__kmp_hh_affinity.type == affinity_default) + __kmp_hh_affinity.type = affinity_compact; + if (__kmp_affinity_top_method == affinity_top_method_default) + __kmp_affinity_top_method = affinity_top_method_all; + if (__kmp_affinity.gran == KMP_HW_UNKNOWN) + __kmp_affinity.gran = KMP_HW_PROC_GROUP; + if (__kmp_hh_affinity.gran == KMP_HW_UNKNOWN) + __kmp_hh_affinity.gran = KMP_HW_PROC_GROUP; + } else + +#endif /* KMP_GROUP_AFFINITY */ + + { + if (__kmp_affinity.flags.respect == affinity_respect_mask_default) { +#if KMP_GROUP_AFFINITY + if (__kmp_num_proc_groups > 1 && exactly_one_group) { + __kmp_affinity.flags.respect = FALSE; + } else +#endif /* KMP_GROUP_AFFINITY */ + { + __kmp_affinity.flags.respect = TRUE; + } + } + if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && + (__kmp_nested_proc_bind.bind_types[0] != proc_bind_default)) { + if (__kmp_affinity.type == affinity_default) { + __kmp_affinity.type = affinity_compact; + __kmp_affinity.flags.dups = FALSE; + } + } else if (__kmp_affinity.type == affinity_default) { +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type != non_mic) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; + } else +#endif + { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + } +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type != non_mic) { + __kmp_affinity.type = affinity_scatter; + } else +#endif + { + __kmp_affinity.type = affinity_none; + } + } + if (__kmp_hh_affinity.type == affinity_default) + __kmp_hh_affinity.type = affinity_none; + if ((__kmp_affinity.gran == KMP_HW_UNKNOWN) && + (__kmp_affinity.gran_levels < 0)) { +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type != non_mic) { + __kmp_affinity.gran = KMP_HW_THREAD; + } else +#endif + { + __kmp_affinity.gran = KMP_HW_CORE; + } + } + if ((__kmp_hh_affinity.gran == KMP_HW_UNKNOWN) && + (__kmp_hh_affinity.gran_levels < 0)) { +#if KMP_MIC_SUPPORTED + if (__kmp_mic_type != non_mic) { + 
__kmp_hh_affinity.gran = KMP_HW_THREAD; + } else +#endif + { + __kmp_hh_affinity.gran = KMP_HW_CORE; + } + } + if (__kmp_affinity_top_method == affinity_top_method_default) { + __kmp_affinity_top_method = affinity_top_method_all; + } + } + } else { + // If affinity is disabled, then still need to assign topology method + // to attempt machine detection and affinity types + if (__kmp_affinity_top_method == affinity_top_method_default) + __kmp_affinity_top_method = affinity_top_method_all; + if (__kmp_affinity.type == affinity_default) + __kmp_affinity.type = affinity_disabled; + if (__kmp_hh_affinity.type == affinity_default) + __kmp_hh_affinity.type = affinity_disabled; + } + +#ifdef KMP_DEBUG + for (const kmp_affinity_t *affinity : __kmp_affinities) + __kmp_print_affinity_settings(affinity); + KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.bind_types[0] != proc_bind_default); + K_DIAG(1, ("__kmp_nested_proc_bind.bind_types[0] == %d\n", + __kmp_nested_proc_bind.bind_types[0])); +#endif + } + +#endif /* KMP_AFFINITY_SUPPORTED */ + + // Post-initialization step: some env. vars need their value's further + // processing + if (string != NULL) { // kmp_set_defaults() was called + __kmp_aux_env_initialize(&block); + } + + __kmp_env_blk_free(&block); + + KMP_MB(); + +} // __kmp_env_initialize + +void __kmp_env_print() { + + kmp_env_blk_t block; + int i; + kmp_str_buf_t buffer; + + __kmp_stg_init(); + __kmp_str_buf_init(&buffer); + + __kmp_env_blk_init(&block, NULL); + __kmp_env_blk_sort(&block); + + // Print real environment values. + __kmp_str_buf_print(&buffer, "\n%s\n\n", KMP_I18N_STR(UserSettings)); + for (i = 0; i < block.count; ++i) { + char const *name = block.vars[i].name; + char const *value = block.vars[i].value; + if ((KMP_STRLEN(name) > 4 && strncmp(name, "KMP_", 4) == 0) || + strncmp(name, "OMP_", 4) == 0 +#ifdef KMP_GOMP_COMPAT + || strncmp(name, "GOMP_", 5) == 0 +#endif // KMP_GOMP_COMPAT + ) { + __kmp_str_buf_print(&buffer, " %s=%s\n", name, value); + } + } + __kmp_str_buf_print(&buffer, "\n"); + + // Print internal (effective) settings. 
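// Illustrative sketch, not part of the patch: the user-settings section printed
// by the loop above is just the process environment filtered by prefix. A
// standalone equivalent, assuming a POSIX-style `environ` array:
#include <cstdio>
#include <cstring>

extern char **environ;

int main() {
  for (char **e = environ; *e; ++e) {
    if (strncmp(*e, "KMP_", 4) == 0 || strncmp(*e, "OMP_", 4) == 0 ||
        strncmp(*e, "GOMP_", 5) == 0)
      printf("   %s\n", *e); // entries are already in NAME=value form
  }
  return 0;
}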
+ __kmp_str_buf_print(&buffer, "%s\n\n", KMP_I18N_STR(EffectiveSettings)); + for (int i = 0; i < __kmp_stg_count; ++i) { + if (__kmp_stg_table[i].print != NULL) { + __kmp_stg_table[i].print(&buffer, __kmp_stg_table[i].name, + __kmp_stg_table[i].data); + } + } + + __kmp_printf("%s", buffer.str); + + __kmp_env_blk_free(&block); + __kmp_str_buf_free(&buffer); + + __kmp_printf("\n"); + +} // __kmp_env_print + +void __kmp_env_print_2() { + __kmp_display_env_impl(__kmp_display_env, __kmp_display_env_verbose); +} // __kmp_env_print_2 + +void __kmp_display_env_impl(int display_env, int display_env_verbose) { + kmp_env_blk_t block; + kmp_str_buf_t buffer; + + __kmp_env_format = 1; + + __kmp_stg_init(); + __kmp_str_buf_init(&buffer); + + __kmp_env_blk_init(&block, NULL); + __kmp_env_blk_sort(&block); + + __kmp_str_buf_print(&buffer, "\n%s\n", KMP_I18N_STR(DisplayEnvBegin)); + __kmp_str_buf_print(&buffer, " _OPENMP='%d'\n", __kmp_openmp_version); + + for (int i = 0; i < __kmp_stg_count; ++i) { + if (__kmp_stg_table[i].print != NULL && + ((display_env && strncmp(__kmp_stg_table[i].name, "OMP_", 4) == 0) || + display_env_verbose)) { + __kmp_stg_table[i].print(&buffer, __kmp_stg_table[i].name, + __kmp_stg_table[i].data); + } + } + + __kmp_str_buf_print(&buffer, "%s\n", KMP_I18N_STR(DisplayEnvEnd)); + __kmp_str_buf_print(&buffer, "\n"); + + __kmp_printf("%s", buffer.str); + + __kmp_env_blk_free(&block); + __kmp_str_buf_free(&buffer); + + __kmp_printf("\n"); +} + +#if OMPD_SUPPORT +// Dump environment variables for OMPD +void __kmp_env_dump() { + + kmp_env_blk_t block; + kmp_str_buf_t buffer, env, notdefined; + + __kmp_stg_init(); + __kmp_str_buf_init(&buffer); + __kmp_str_buf_init(&env); + __kmp_str_buf_init(¬defined); + + __kmp_env_blk_init(&block, NULL); + __kmp_env_blk_sort(&block); + + __kmp_str_buf_print(¬defined, ": %s", KMP_I18N_STR(NotDefined)); + + for (int i = 0; i < __kmp_stg_count; ++i) { + if (__kmp_stg_table[i].print == NULL) + continue; + __kmp_str_buf_clear(&env); + __kmp_stg_table[i].print(&env, __kmp_stg_table[i].name, + __kmp_stg_table[i].data); + if (env.used < 4) // valid definition must have indents (3) and a new line + continue; + if (strstr(env.str, notdefined.str)) + // normalize the string + __kmp_str_buf_print(&buffer, "%s=undefined\n", __kmp_stg_table[i].name); + else + __kmp_str_buf_cat(&buffer, env.str + 3, env.used - 3); + } + + ompd_env_block = (char *)__kmp_allocate(buffer.used + 1); + KMP_MEMCPY(ompd_env_block, buffer.str, buffer.used + 1); + ompd_env_block_size = (ompd_size_t)KMP_STRLEN(ompd_env_block); + + __kmp_env_blk_free(&block); + __kmp_str_buf_free(&buffer); + __kmp_str_buf_free(&env); + __kmp_str_buf_free(¬defined); +} +#endif // OMPD_SUPPORT + +// end of file diff --git a/third_party/openmp/kmp_settings.h b/third_party/openmp/kmp_settings.h new file mode 100644 index 000000000..92bbcff52 --- /dev/null +++ b/third_party/openmp/kmp_settings.h @@ -0,0 +1,69 @@ +/* + * kmp_settings.h -- Initialize environment variables + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
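// Illustrative sketch, not part of the patch: __kmp_display_env_impl above is
// what OMP_DISPLAY_ENV ultimately drives. Any OpenMP program exercises it;
// built with -fopenmp and run as
//   OMP_DISPLAY_ENV=VERBOSE ./a.out
// the runtime typically prints the ICV block (bracketed by the
// DisplayEnvBegin/End catalog strings) during initialization, before the
// program's own output.
#include <omp.h>
#include <cstdio>

int main() {
#pragma omp parallel
  {
    // empty region: forces runtime (and thus settings) initialization
  }
  printf("max threads: %d\n", omp_get_max_threads());
  return 0;
}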
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_SETTINGS_H +#define KMP_SETTINGS_H + +void __kmp_reset_global_vars(void); +void __kmp_env_initialize(char const *); +void __kmp_env_print(); +void __kmp_env_print_2(); +void __kmp_display_env_impl(int display_env, int display_env_verbose); +#if OMPD_SUPPORT +void __kmp_env_dump(); +#endif + +int __kmp_initial_threads_capacity(int req_nproc); +void __kmp_init_dflt_team_nth(); +int __kmp_default_tp_capacity(int, int, int); + +#if KMP_MIC +#define KMP_STR_BUF_PRINT_NAME \ + __kmp_str_buf_print(buffer, " %s %s", KMP_I18N_STR(Device), name) +#define KMP_STR_BUF_PRINT_NAME_EX(x) \ + __kmp_str_buf_print(buffer, " %s %s='", KMP_I18N_STR(Device), x) +#define KMP_STR_BUF_PRINT_BOOL_EX(n, v, t, f) \ + __kmp_str_buf_print(buffer, " %s %s='%s'\n", KMP_I18N_STR(Device), n, \ + (v) ? t : f) +#define KMP_STR_BUF_PRINT_BOOL \ + KMP_STR_BUF_PRINT_BOOL_EX(name, value, "TRUE", "FALSE") +#define KMP_STR_BUF_PRINT_INT \ + __kmp_str_buf_print(buffer, " %s %s='%d'\n", KMP_I18N_STR(Device), name, \ + value) +#define KMP_STR_BUF_PRINT_UINT64 \ + __kmp_str_buf_print(buffer, " %s %s='%" KMP_UINT64_SPEC "'\n", \ + KMP_I18N_STR(Device), name, value); +#define KMP_STR_BUF_PRINT_STR \ + __kmp_str_buf_print(buffer, " %s %s='%s'\n", KMP_I18N_STR(Device), name, \ + value) +#else +#define KMP_STR_BUF_PRINT_NAME \ + __kmp_str_buf_print(buffer, " %s %s", KMP_I18N_STR(Host), name) +#define KMP_STR_BUF_PRINT_NAME_EX(x) \ + __kmp_str_buf_print(buffer, " %s %s='", KMP_I18N_STR(Host), x) +#define KMP_STR_BUF_PRINT_BOOL_EX(n, v, t, f) \ + __kmp_str_buf_print(buffer, " %s %s='%s'\n", KMP_I18N_STR(Host), n, \ + (v) ? t : f) +#define KMP_STR_BUF_PRINT_BOOL \ + KMP_STR_BUF_PRINT_BOOL_EX(name, value, "TRUE", "FALSE") +#define KMP_STR_BUF_PRINT_INT \ + __kmp_str_buf_print(buffer, " %s %s='%d'\n", KMP_I18N_STR(Host), name, value) +#define KMP_STR_BUF_PRINT_UINT64 \ + __kmp_str_buf_print(buffer, " %s %s='%" KMP_UINT64_SPEC "'\n", \ + KMP_I18N_STR(Host), name, value); +#define KMP_STR_BUF_PRINT_STR \ + __kmp_str_buf_print(buffer, " %s %s='%s'\n", KMP_I18N_STR(Host), name, value) +#endif + +#endif // KMP_SETTINGS_H + +// end of file // diff --git a/third_party/openmp/kmp_stats.cpp b/third_party/openmp/kmp_stats.cpp new file mode 100644 index 000000000..7f973130c --- /dev/null +++ b/third_party/openmp/kmp_stats.cpp @@ -0,0 +1,929 @@ +/** @file kmp_stats.cpp + * Statistics gathering and processing. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
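// Illustrative sketch, not part of the patch: the KMP_STR_BUF_PRINT_* macros in
// kmp_settings.h above differ only in the "Host"/"Device" prefix taken from the
// message catalog. Mocking __kmp_str_buf_print with snprintf shows the shape of
// one KMP_SETTINGS output line; the setting name and value are examples only.
#include <cstdio>

int main() {
  char line[128];
  const char *prefix = "Host";       // KMP_I18N_STR(Host) on non-MIC builds
  const char *name = "KMP_SETTINGS"; // example setting
  int value = 1;                     // example boolean value
  snprintf(line, sizeof(line), "   %s %s='%s'\n", prefix, name,
           value ? "TRUE" : "FALSE"); // mirrors KMP_STR_BUF_PRINT_BOOL
  fputs(line, stdout);                // "   Host KMP_SETTINGS='TRUE'"
  return 0;
}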
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_lock.h" +#include "kmp_stats.h" +#include "kmp_str.h" + +#include +#include +#include +#include +#include // for atexit +#include + +#if LIBOMP_STATS + +#define STRINGIZE2(x) #x +#define STRINGIZE(x) STRINGIZE2(x) + +#define expandName(name, flags, ignore) {STRINGIZE(name), flags}, +statInfo timeStat::timerInfo[] = { + KMP_FOREACH_TIMER(expandName, 0){"TIMER_LAST", 0}}; +const statInfo counter::counterInfo[] = { + KMP_FOREACH_COUNTER(expandName, 0){"COUNTER_LAST", 0}}; +#undef expandName + +#define expandName(ignore1, ignore2, ignore3) {0.0, 0.0, 0.0}, +kmp_stats_output_module::rgb_color kmp_stats_output_module::timerColorInfo[] = { + KMP_FOREACH_TIMER(expandName, 0){0.0, 0.0, 0.0}}; +#undef expandName + +const kmp_stats_output_module::rgb_color + kmp_stats_output_module::globalColorArray[] = { + {1.0, 0.0, 0.0}, // red + {1.0, 0.6, 0.0}, // orange + {1.0, 1.0, 0.0}, // yellow + {0.0, 1.0, 0.0}, // green + {0.0, 0.0, 1.0}, // blue + {0.6, 0.2, 0.8}, // purple + {1.0, 0.0, 1.0}, // magenta + {0.0, 0.4, 0.2}, // dark green + {1.0, 1.0, 0.6}, // light yellow + {0.6, 0.4, 0.6}, // dirty purple + {0.0, 1.0, 1.0}, // cyan + {1.0, 0.4, 0.8}, // pink + {0.5, 0.5, 0.5}, // grey + {0.8, 0.7, 0.5}, // brown + {0.6, 0.6, 1.0}, // light blue + {1.0, 0.7, 0.5}, // peach + {0.8, 0.5, 1.0}, // lavender + {0.6, 0.0, 0.0}, // dark red + {0.7, 0.6, 0.0}, // gold + {0.0, 0.0, 0.0} // black +}; + +// Ensure that the atexit handler only runs once. +static uint32_t statsPrinted = 0; + +// output interface +static kmp_stats_output_module *__kmp_stats_global_output = NULL; + +double logHistogram::binMax[] = {1.e1l, 1.e2l, 1.e3l, 1.e4l, 1.e5l, 1.e6l, + 1.e7l, 1.e8l, 1.e9l, 1.e10l, 1.e11l, 1.e12l, + 1.e13l, 1.e14l, 1.e15l, 1.e16l, 1.e17l, 1.e18l, + 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l, + 1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l, + // Always have infinity be the last value + std::numeric_limits::infinity()}; + +/* ************* statistic member functions ************* */ + +void statistic::addSample(double sample) { + sample -= offset; + KMP_DEBUG_ASSERT(std::isfinite(sample)); + + double delta = sample - meanVal; + + sampleCount = sampleCount + 1; + meanVal = meanVal + delta / sampleCount; + m2 = m2 + delta * (sample - meanVal); + + minVal = std::min(minVal, sample); + maxVal = std::max(maxVal, sample); + if (collectingHist) + hist.addSample(sample); +} + +statistic &statistic::operator+=(const statistic &other) { + if (other.sampleCount == 0) + return *this; + + if (sampleCount == 0) { + *this = other; + return *this; + } + + uint64_t newSampleCount = sampleCount + other.sampleCount; + double dnsc = double(newSampleCount); + double dsc = double(sampleCount); + double dscBydnsc = dsc / dnsc; + double dosc = double(other.sampleCount); + double delta = other.meanVal - meanVal; + + // Try to order these calculations to avoid overflows. If this were Fortran, + // then the compiler would not be able to re-order over brackets. In C++ it + // may be legal to do that (we certainly hope it doesn't, and CC+ Programming + // Language 2nd edition suggests it shouldn't, since it says that exploitation + // of associativity can only be made if the operation really is associative + // (which floating addition isn't...)). 
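// Illustrative sketch, not part of the patch: statistic::addSample above is
// Welford's online update (count, mean and M2 carried per statistic), and the
// operator+= whose body follows is the standard merge of two such accumulators.
// Stripped of the histogram and offset machinery it looks like this:
#include <cmath>
#include <cstdio>
#include <initializer_list>

struct Welford {
  unsigned long long n = 0;
  double mean = 0.0, m2 = 0.0;
  void add(double x) {
    ++n;
    double delta = x - mean;
    mean += delta / n;
    m2 += delta * (x - mean);
  }
  // One common convention; divide by (n - 1) instead for the sample variance.
  double variance() const { return n > 1 ? m2 / n : 0.0; }
};

int main() {
  Welford w;
  for (double x : {1.0, 2.0, 3.0, 4.0})
    w.add(x);
  printf("mean=%g sd=%g\n", w.mean, std::sqrt(w.variance())); // mean = 2.5
  return 0;
}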
+ meanVal = meanVal * dscBydnsc + other.meanVal * (1 - dscBydnsc); + m2 = m2 + other.m2 + dscBydnsc * dosc * delta * delta; + minVal = std::min(minVal, other.minVal); + maxVal = std::max(maxVal, other.maxVal); + sampleCount = newSampleCount; + if (collectingHist) + hist += other.hist; + + return *this; +} + +void statistic::scale(double factor) { + minVal = minVal * factor; + maxVal = maxVal * factor; + meanVal = meanVal * factor; + m2 = m2 * factor * factor; + return; +} + +std::string statistic::format(char unit, bool total) const { + std::string result = formatSI((double)sampleCount, 9, ' '); + + if (sampleCount == 0) { + result = result + std::string(", ") + formatSI(0.0, 9, unit); + result = result + std::string(", ") + formatSI(0.0, 9, unit); + result = result + std::string(", ") + formatSI(0.0, 9, unit); + if (total) + result = result + std::string(", ") + formatSI(0.0, 9, unit); + result = result + std::string(", ") + formatSI(0.0, 9, unit); + } else { + result = result + std::string(", ") + formatSI(minVal, 9, unit); + result = result + std::string(", ") + formatSI(meanVal, 9, unit); + result = result + std::string(", ") + formatSI(maxVal, 9, unit); + if (total) + result = + result + std::string(", ") + formatSI(meanVal * sampleCount, 9, unit); + result = result + std::string(", ") + formatSI(getSD(), 9, unit); + } + return result; +} + +/* ************* histogram member functions ************* */ + +// Lowest bin that has anything in it +int logHistogram::minBin() const { + for (int i = 0; i < numBins; i++) { + if (bins[i].count != 0) + return i - logOffset; + } + return -logOffset; +} + +// Highest bin that has anything in it +int logHistogram::maxBin() const { + for (int i = numBins - 1; i >= 0; i--) { + if (bins[i].count != 0) + return i - logOffset; + } + return -logOffset; +} + +// Which bin does this sample belong in ? +uint32_t logHistogram::findBin(double sample) { + double v = std::fabs(sample); + // Simply loop up looking which bin to put it in. 
+ // According to a micro-architect this is likely to be faster than a binary + // search, since + // it will only have one branch mis-predict + for (int b = 0; b < numBins - 1; b++) + if (binMax[b] > v) + return b; + return numBins - 1; +} + +void logHistogram::addSample(double sample) { + if (sample == 0.0) { + zeroCount += 1; +#ifdef KMP_DEBUG + _total++; + check(); +#endif + return; + } + KMP_DEBUG_ASSERT(std::isfinite(sample)); + uint32_t bin = findBin(sample); + KMP_DEBUG_ASSERT(0 <= bin && bin < numBins); + + bins[bin].count += 1; + bins[bin].total += sample; +#ifdef KMP_DEBUG + _total++; + check(); +#endif +} + +// This may not be the format we want, but it'll do for now +std::string logHistogram::format(char unit) const { + std::stringstream result; + + result << "Bin, Count, Total\n"; + if (zeroCount) { + result << "0, " << formatSI(zeroCount, 9, ' ') << ", ", + formatSI(0.0, 9, unit); + if (count(minBin()) == 0) + return result.str(); + result << "\n"; + } + for (int i = minBin(); i <= maxBin(); i++) { + result << "10**" << i << "<=v<"; + if (i + 1 == numBins - 1) + result << "infinity, "; + else + result << "10**" << (i + 1) << ", "; + result << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit); + if (i != maxBin()) + result << "\n"; + } + + return result.str(); +} + +/* ************* explicitTimer member functions ************* */ + +void explicitTimer::start(tsc_tick_count tick) { + startTime = tick; + totalPauseTime = 0; + if (timeStat::logEvent(timerEnumValue)) { + __kmp_stats_thread_ptr->incrementNestValue(); + } + return; +} + +void explicitTimer::stop(tsc_tick_count tick, + kmp_stats_list *stats_ptr /* = nullptr */) { + if (startTime.getValue() == 0) + return; + + stat->addSample(((tick - startTime) - totalPauseTime).ticks()); + + if (timeStat::logEvent(timerEnumValue)) { + if (!stats_ptr) + stats_ptr = __kmp_stats_thread_ptr; + stats_ptr->push_event( + startTime.getValue() - __kmp_stats_start_time.getValue(), + tick.getValue() - __kmp_stats_start_time.getValue(), + __kmp_stats_thread_ptr->getNestValue(), timerEnumValue); + stats_ptr->decrementNestValue(); + } + + /* We accept the risk that we drop a sample because it really did start at + t==0. */ + startTime = 0; + return; +} + +/* ************* partitionedTimers member functions ************* */ +partitionedTimers::partitionedTimers() { timer_stack.reserve(8); } + +// initialize the partitioned timers to an initial timer +void partitionedTimers::init(explicitTimer timer) { + KMP_DEBUG_ASSERT(this->timer_stack.size() == 0); + timer_stack.push_back(timer); + timer_stack.back().start(tsc_tick_count::now()); +} + +// stop/save the current timer, and start the new timer (timer_pair) +// There is a special condition where if the current timer is equal to +// the one you are trying to push, then it only manipulates the stack, +// and it won't stop/start the currently running timer. 
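// Illustrative sketch, not part of the patch: findBin above classifies a sample
// by scanning the binMax power-of-ten limits. The same decade classification
// can be written with log10 directly; the layout below is illustrative and not
// the runtime's exact logOffset/numBins arrangement.
#include <climits>
#include <cmath>
#include <cstdio>
#include <initializer_list>

static int decade_bin(double v) {
  v = std::fabs(v);
  if (v == 0.0)
    return INT_MIN; // the runtime counts zeros separately (zeroCount)
  return (int)std::floor(std::log10(v)); // 1..9 -> 0, 10..99 -> 1, 0.1.. -> -1
}

int main() {
  for (double v : {0.5, 5.0, 50.0, 5000.0})
    printf("%g -> decade %d\n", v, decade_bin(v));
  return 0;
}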
+void partitionedTimers::push(explicitTimer timer) { + // get the current timer + // pause current timer + // push new timer + // start the new timer + explicitTimer *current_timer, *new_timer; + size_t stack_size; + KMP_DEBUG_ASSERT(this->timer_stack.size() > 0); + timer_stack.push_back(timer); + stack_size = timer_stack.size(); + current_timer = &(timer_stack[stack_size - 2]); + new_timer = &(timer_stack[stack_size - 1]); + tsc_tick_count tick = tsc_tick_count::now(); + current_timer->pause(tick); + new_timer->start(tick); +} + +// stop/discard the current timer, and start the previously saved timer +void partitionedTimers::pop() { + // get the current timer + // stop current timer (record event/sample) + // pop current timer + // get the new current timer and resume + explicitTimer *old_timer, *new_timer; + size_t stack_size = timer_stack.size(); + KMP_DEBUG_ASSERT(stack_size > 1); + old_timer = &(timer_stack[stack_size - 1]); + new_timer = &(timer_stack[stack_size - 2]); + tsc_tick_count tick = tsc_tick_count::now(); + old_timer->stop(tick); + new_timer->resume(tick); + timer_stack.pop_back(); +} + +void partitionedTimers::exchange(explicitTimer timer) { + // get the current timer + // stop current timer (record event/sample) + // push new timer + // start the new timer + explicitTimer *current_timer, *new_timer; + size_t stack_size; + KMP_DEBUG_ASSERT(this->timer_stack.size() > 0); + tsc_tick_count tick = tsc_tick_count::now(); + stack_size = timer_stack.size(); + current_timer = &(timer_stack[stack_size - 1]); + current_timer->stop(tick); + timer_stack.pop_back(); + timer_stack.push_back(timer); + new_timer = &(timer_stack[stack_size - 1]); + new_timer->start(tick); +} + +// Wind up all the currently running timers. +// This pops off all the timers from the stack and clears the stack +// After this is called, init() must be run again to initialize the +// stack of timers +void partitionedTimers::windup() { + while (timer_stack.size() > 1) { + this->pop(); + } + // Pop the timer from the init() call + if (timer_stack.size() > 0) { + timer_stack.back().stop(tsc_tick_count::now()); + timer_stack.pop_back(); + } +} + +/* ************* kmp_stats_event_vector member functions ************* */ + +void kmp_stats_event_vector::deallocate() { + __kmp_free(events); + internal_size = 0; + allocated_size = 0; + events = NULL; +} + +// This function is for qsort() which requires the compare function to return +// either a negative number if event1 < event2, a positive number if event1 > +// event2 or zero if event1 == event2. This sorts by start time (lowest to +// highest). 
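// Illustrative sketch, not part of the patch: partitionedTimers above keeps a
// stack in which only the top timer accumulates time - push() pauses the
// current top and starts the new timer, pop() stops the top and resumes the
// one underneath. A toy version of that bookkeeping with std::chrono:
#include <chrono>
#include <cstdio>
#include <vector>

using clk = std::chrono::steady_clock;

struct ToyTimer {
  const char *name;
  double total;             // seconds accumulated while on top of the stack
  clk::time_point started;  // valid only while this timer is on top
};

struct ToyPartitioned {
  std::vector<ToyTimer> stack;
  void push(const char *name) {
    clk::time_point now = clk::now();
    if (!stack.empty()) // pause the current top
      stack.back().total +=
          std::chrono::duration<double>(now - stack.back().started).count();
    stack.push_back({name, 0.0, now});
  }
  void pop() {
    clk::time_point now = clk::now();
    ToyTimer done = stack.back();
    done.total += std::chrono::duration<double>(now - done.started).count();
    printf("%s: %.6fs\n", done.name, done.total);
    stack.pop_back();
    if (!stack.empty())
      stack.back().started = now; // resume the timer underneath
  }
};

int main() {
  ToyPartitioned t;
  t.push("serial");
  t.push("parallel_region"); // "serial" is paused while this is on top
  t.pop();                   // prints parallel_region time, resumes "serial"
  t.pop();                   // prints serial time, excluding the nested region
  return 0;
}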
+int compare_two_events(const void *event1, const void *event2) { + const kmp_stats_event *ev1 = RCAST(const kmp_stats_event *, event1); + const kmp_stats_event *ev2 = RCAST(const kmp_stats_event *, event2); + + if (ev1->getStart() < ev2->getStart()) + return -1; + else if (ev1->getStart() > ev2->getStart()) + return 1; + else + return 0; +} + +void kmp_stats_event_vector::sort() { + qsort(events, internal_size, sizeof(kmp_stats_event), compare_two_events); +} + +/* ************* kmp_stats_list member functions ************* */ + +// returns a pointer to newly created stats node +kmp_stats_list *kmp_stats_list::push_back(int gtid) { + kmp_stats_list *newnode = + (kmp_stats_list *)__kmp_allocate(sizeof(kmp_stats_list)); + // placement new, only requires space and pointer and initializes (so + // __kmp_allocate instead of C++ new[] is used) + new (newnode) kmp_stats_list(); + newnode->setGtid(gtid); + newnode->prev = this->prev; + newnode->next = this; + newnode->prev->next = newnode; + newnode->next->prev = newnode; + return newnode; +} +void kmp_stats_list::deallocate() { + kmp_stats_list *ptr = this->next; + kmp_stats_list *delptr = this->next; + while (ptr != this) { + delptr = ptr; + ptr = ptr->next; + // placement new means we have to explicitly call destructor. + delptr->_event_vector.deallocate(); + delptr->~kmp_stats_list(); + __kmp_free(delptr); + } +} +kmp_stats_list::iterator kmp_stats_list::begin() { + kmp_stats_list::iterator it; + it.ptr = this->next; + return it; +} +kmp_stats_list::iterator kmp_stats_list::end() { + kmp_stats_list::iterator it; + it.ptr = this; + return it; +} +int kmp_stats_list::size() { + int retval; + kmp_stats_list::iterator it; + for (retval = 0, it = begin(); it != end(); it++, retval++) { + } + return retval; +} + +/* ************* kmp_stats_list::iterator member functions ************* */ + +kmp_stats_list::iterator::iterator() : ptr(NULL) {} +kmp_stats_list::iterator::~iterator() {} +kmp_stats_list::iterator kmp_stats_list::iterator::operator++() { + this->ptr = this->ptr->next; + return *this; +} +kmp_stats_list::iterator kmp_stats_list::iterator::operator++(int dummy) { + this->ptr = this->ptr->next; + return *this; +} +kmp_stats_list::iterator kmp_stats_list::iterator::operator--() { + this->ptr = this->ptr->prev; + return *this; +} +kmp_stats_list::iterator kmp_stats_list::iterator::operator--(int dummy) { + this->ptr = this->ptr->prev; + return *this; +} +bool kmp_stats_list::iterator::operator!=(const kmp_stats_list::iterator &rhs) { + return this->ptr != rhs.ptr; +} +bool kmp_stats_list::iterator::operator==(const kmp_stats_list::iterator &rhs) { + return this->ptr == rhs.ptr; +} +kmp_stats_list *kmp_stats_list::iterator::operator*() const { + return this->ptr; +} + +/* ************* kmp_stats_output_module functions ************** */ + +const char *kmp_stats_output_module::eventsFileName = NULL; +const char *kmp_stats_output_module::plotFileName = NULL; +int kmp_stats_output_module::printPerThreadFlag = 0; +int kmp_stats_output_module::printPerThreadEventsFlag = 0; + +static char const *lastName(char *name) { + int l = (int)strlen(name); + for (int i = l - 1; i >= 0; --i) { + if (name[i] == '.') + name[i] = '_'; + if (name[i] == '/') + return name + i + 1; + } + return name; +} + +/* Read the name of the executable from /proc/self/cmdline */ +static char const *getImageName(char *buffer, size_t buflen) { + FILE *f = fopen("/proc/self/cmdline", "r"); + buffer[0] = char(0); + if (!f) + return buffer; + + // The file contains char(0) delimited 
words from the commandline. + // This just returns the last filename component of the first word on the + // line. + size_t n = fread(buffer, 1, buflen, f); + if (n == 0) { + fclose(f); + KMP_CHECK_SYSFAIL("fread", 1) + } + fclose(f); + buffer[buflen - 1] = char(0); + return lastName(buffer); +} + +static void getTime(char *buffer, size_t buflen, bool underscores = false) { + time_t timer; + + time(&timer); + + struct tm *tm_info = localtime(&timer); + if (underscores) + strftime(buffer, buflen, "%Y-%m-%d_%H%M%S", tm_info); + else + strftime(buffer, buflen, "%Y-%m-%d %H%M%S", tm_info); +} + +/* Generate a stats file name, expanding prototypes */ +static std::string generateFilename(char const *prototype, + char const *imageName) { + std::string res; + + for (int i = 0; prototype[i] != char(0); i++) { + char ch = prototype[i]; + + if (ch == '%') { + i++; + if (prototype[i] == char(0)) + break; + + switch (prototype[i]) { + case 't': // Insert time and date + { + char date[26]; + getTime(date, sizeof(date), true); + res += date; + } break; + case 'e': // Insert executable name + res += imageName; + break; + case 'p': // Insert pid + { + std::stringstream ss; + ss << getpid(); + res += ss.str(); + } break; + default: + res += prototype[i]; + break; + } + } else + res += ch; + } + return res; +} + +// init() is called very near the beginning of execution time in the constructor +// of __kmp_stats_global_output +void kmp_stats_output_module::init() { + + char *statsFileName = getenv("KMP_STATS_FILE"); + eventsFileName = getenv("KMP_STATS_EVENTS_FILE"); + plotFileName = getenv("KMP_STATS_PLOT_FILE"); + char *threadStats = getenv("KMP_STATS_THREADS"); + char *threadEvents = getenv("KMP_STATS_EVENTS"); + + // set the stats output filenames based on environment variables and defaults + if (statsFileName) { + char imageName[1024]; + // Process any escapes (e.g., %p, %e, %t) in the name + outputFileName = generateFilename( + statsFileName, getImageName(&imageName[0], sizeof(imageName))); + } + eventsFileName = eventsFileName ? eventsFileName : "events.dat"; + plotFileName = plotFileName ? plotFileName : "events.plt"; + + // set the flags based on environment variables matching: true, on, 1, .true. + // , .t. , yes + printPerThreadFlag = __kmp_str_match_true(threadStats); + printPerThreadEventsFlag = __kmp_str_match_true(threadEvents); + + if (printPerThreadEventsFlag) { + // assigns a color to each timer for printing + setupEventColors(); + } else { + // will clear flag so that no event will be logged + timeStat::clearEventFlags(); + } +} + +void kmp_stats_output_module::setupEventColors() { + int i; + int globalColorIndex = 0; + int numGlobalColors = sizeof(globalColorArray) / sizeof(rgb_color); + for (i = 0; i < TIMER_LAST; i++) { + if (timeStat::logEvent((timer_e)i)) { + timerColorInfo[i] = globalColorArray[globalColorIndex]; + globalColorIndex = (globalColorIndex + 1) % numGlobalColors; + } + } +} + +void kmp_stats_output_module::printTimerStats(FILE *statsOut, + statistic const *theStats, + statistic const *totalStats) { + fprintf(statsOut, + "Timer, SampleCount, Min, " + "Mean, Max, Total, SD\n"); + for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) { + statistic const *stat = &theStats[s]; + char tag = timeStat::noUnits(s) ? ' ' : 'T'; + + fprintf(statsOut, "%-35s, %s\n", timeStat::name(s), + stat->format(tag, true).c_str()); + } + // Also print the Total_ versions of times. + for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) { + char tag = timeStat::noUnits(s) ? 
' ' : 'T';
+    if (totalStats && !timeStat::noTotal(s))
+      fprintf(statsOut, "Total_%-29s, %s\n", timeStat::name(s),
+              totalStats[s].format(tag, true).c_str());
+  }
+
+  // Print histogram of statistics
+  if (theStats[0].haveHist()) {
+    fprintf(statsOut, "\nTimer distributions\n");
+    for (int s = 0; s < TIMER_LAST; s++) {
+      statistic const *stat = &theStats[s];
+
+      if (stat->getCount() != 0) {
+        char tag = timeStat::noUnits(timer_e(s)) ? ' ' : 'T';
+
+        fprintf(statsOut, "%s\n", timeStat::name(timer_e(s)));
+        fprintf(statsOut, "%s\n", stat->getHist()->format(tag).c_str());
+      }
+    }
+  }
+}
+
+void kmp_stats_output_module::printCounterStats(FILE *statsOut,
+                                                statistic const *theStats) {
+  fprintf(statsOut, "Counter, ThreadCount, Min, Mean, "
+                    " Max, Total, SD\n");
+  for (int s = 0; s < COUNTER_LAST; s++) {
+    statistic const *stat = &theStats[s];
+    fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(s)),
+            stat->format(' ', true).c_str());
+  }
+  // Print histogram of counters
+  if (theStats[0].haveHist()) {
+    fprintf(statsOut, "\nCounter distributions\n");
+    for (int s = 0; s < COUNTER_LAST; s++) {
+      statistic const *stat = &theStats[s];
+
+      if (stat->getCount() != 0) {
+        fprintf(statsOut, "%s\n", counter::name(counter_e(s)));
+        fprintf(statsOut, "%s\n", stat->getHist()->format(' ').c_str());
+      }
+    }
+  }
+}
+
+void kmp_stats_output_module::printCounters(FILE *statsOut,
+                                            counter const *theCounters) {
+  // We print all the counters even if they are zero.
+  // That makes it easier to slice them into a spreadsheet if you need to.
+  fprintf(statsOut, "\nCounter, Count\n");
+  for (int c = 0; c < COUNTER_LAST; c++) {
+    counter const *stat = &theCounters[c];
+    fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(c)),
+            formatSI((double)stat->getValue(), 9, ' ').c_str());
+  }
+}
+
+void kmp_stats_output_module::printEvents(FILE *eventsOut,
+                                          kmp_stats_event_vector *theEvents,
+                                          int gtid) {
+  // sort by start time before printing
+  theEvents->sort();
+  for (int i = 0; i < theEvents->size(); i++) {
+    kmp_stats_event ev = theEvents->at(i);
+    rgb_color color = getEventColor(ev.getTimerName());
+    fprintf(eventsOut, "%d %llu %llu %1.1f rgb(%1.1f,%1.1f,%1.1f) %s\n", gtid,
+            static_cast<unsigned long long>(ev.getStart()),
+            static_cast<unsigned long long>(ev.getStop()),
+            1.2 - (ev.getNestLevel() * 0.2), color.r, color.g, color.b,
+            timeStat::name(ev.getTimerName()));
+  }
+  return;
+}
+
+void kmp_stats_output_module::windupExplicitTimers() {
+  // Wind up any explicit timers. We assume that it's fair at this point to just
+  // walk all the explicit timers in all threads and say "it's over".
+  // If the timer wasn't running, this won't record anything anyway.
+ kmp_stats_list::iterator it; + for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) { + kmp_stats_list *ptr = *it; + ptr->getPartitionedTimers()->windup(); + ptr->endLife(); + } +} + +void kmp_stats_output_module::printPloticusFile() { + int i; + int size = __kmp_stats_list->size(); + kmp_safe_raii_file_t plotOut(plotFileName, "w+"); + fprintf(plotOut, "#proc page\n" + " pagesize: 15 10\n" + " scale: 1.0\n\n"); + + fprintf(plotOut, + "#proc getdata\n" + " file: %s\n\n", + eventsFileName); + + fprintf(plotOut, + "#proc areadef\n" + " title: OpenMP Sampling Timeline\n" + " titledetails: align=center size=16\n" + " rectangle: 1 1 13 9\n" + " xautorange: datafield=2,3\n" + " yautorange: -1 %d\n\n", + size); + + fprintf(plotOut, "#proc xaxis\n" + " stubs: inc\n" + " stubdetails: size=12\n" + " label: Time (ticks)\n" + " labeldetails: size=14\n\n"); + + fprintf(plotOut, + "#proc yaxis\n" + " stubs: inc 1\n" + " stubrange: 0 %d\n" + " stubdetails: size=12\n" + " label: Thread #\n" + " labeldetails: size=14\n\n", + size - 1); + + fprintf(plotOut, "#proc bars\n" + " exactcolorfield: 5\n" + " axis: x\n" + " locfield: 1\n" + " segmentfields: 2 3\n" + " barwidthfield: 4\n\n"); + + // create legend entries corresponding to the timer color + for (i = 0; i < TIMER_LAST; i++) { + if (timeStat::logEvent((timer_e)i)) { + rgb_color c = getEventColor((timer_e)i); + fprintf(plotOut, + "#proc legendentry\n" + " sampletype: color\n" + " label: %s\n" + " details: rgb(%1.1f,%1.1f,%1.1f)\n\n", + timeStat::name((timer_e)i), c.r, c.g, c.b); + } + } + + fprintf(plotOut, "#proc legend\n" + " format: down\n" + " location: max max\n\n"); + return; +} + +static void outputEnvVariable(FILE *statsOut, char const *name) { + char const *value = getenv(name); + fprintf(statsOut, "# %s = %s\n", name, value ? value : "*unspecified*"); +} + +/* Print some useful information about + * the date and time this experiment ran. + * the machine on which it ran. + We output all of this as stylised comments, though we may decide to parse + some of it. */ +void kmp_stats_output_module::printHeaderInfo(FILE *statsOut) { + std::time_t now = std::time(0); + char buffer[40]; + char hostName[80]; + + std::strftime(&buffer[0], sizeof(buffer), "%c", std::localtime(&now)); + fprintf(statsOut, "# Time of run: %s\n", &buffer[0]); + if (gethostname(&hostName[0], sizeof(hostName)) == 0) + fprintf(statsOut, "# Hostname: %s\n", &hostName[0]); +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + fprintf(statsOut, "# CPU: %s\n", &__kmp_cpuinfo.name[0]); + fprintf(statsOut, "# Family: %d, Model: %d, Stepping: %d\n", + __kmp_cpuinfo.family, __kmp_cpuinfo.model, __kmp_cpuinfo.stepping); + if (__kmp_cpuinfo.frequency == 0) + fprintf(statsOut, "# Nominal frequency: Unknown\n"); + else + fprintf(statsOut, "# Nominal frequency: %sz\n", + formatSI(double(__kmp_cpuinfo.frequency), 9, 'H').c_str()); + outputEnvVariable(statsOut, "KMP_HW_SUBSET"); + outputEnvVariable(statsOut, "KMP_AFFINITY"); + outputEnvVariable(statsOut, "KMP_BLOCKTIME"); + outputEnvVariable(statsOut, "KMP_LIBRARY"); + fprintf(statsOut, "# Production runtime built " __DATE__ " " __TIME__ "\n"); +#endif +} + +void kmp_stats_output_module::outputStats(const char *heading) { + // Stop all the explicit timers in all threads + // Do this before declaring the local statistics because thay have + // constructors so will take time to create. 
+ windupExplicitTimers(); + + statistic allStats[TIMER_LAST]; + statistic totalStats[TIMER_LAST]; /* Synthesized, cross threads versions of + normal timer stats */ + statistic allCounters[COUNTER_LAST]; + + kmp_safe_raii_file_t statsOut; + if (!outputFileName.empty()) { + statsOut.open(outputFileName.c_str(), "a+"); + } else { + statsOut.set_stderr(); + } + + kmp_safe_raii_file_t eventsOut; + if (eventPrintingEnabled()) { + eventsOut.open(eventsFileName, "w+"); + } + + printHeaderInfo(statsOut); + fprintf(statsOut, "%s\n", heading); + // Accumulate across threads. + kmp_stats_list::iterator it; + for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) { + int t = (*it)->getGtid(); + // Output per thread stats if requested. + if (printPerThreadFlag) { + fprintf(statsOut, "Thread %d\n", t); + printTimerStats(statsOut, (*it)->getTimers(), 0); + printCounters(statsOut, (*it)->getCounters()); + fprintf(statsOut, "\n"); + } + // Output per thread events if requested. + if (eventPrintingEnabled()) { + kmp_stats_event_vector events = (*it)->getEventVector(); + printEvents(eventsOut, &events, t); + } + + // Accumulate timers. + for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) { + // See if we should ignore this timer when aggregating + if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on + // primary thread and this thread is worker + (timeStat::workerOnly(s) && (t == 0)) // Timer only valid on worker + // and this thread is the primary thread + ) { + continue; + } + + statistic *threadStat = (*it)->getTimer(s); + allStats[s] += *threadStat; + + // Add Total stats for timers that are valid in more than one thread + if (!timeStat::noTotal(s)) + totalStats[s].addSample(threadStat->getTotal()); + } + + // Accumulate counters. + for (counter_e c = counter_e(0); c < COUNTER_LAST; c = counter_e(c + 1)) { + if (counter::masterOnly(c) && t != 0) + continue; + allCounters[c].addSample((double)(*it)->getCounter(c)->getValue()); + } + } + + if (eventPrintingEnabled()) { + printPloticusFile(); + } + + fprintf(statsOut, "Aggregate for all threads\n"); + printTimerStats(statsOut, &allStats[0], &totalStats[0]); + fprintf(statsOut, "\n"); + printCounterStats(statsOut, &allCounters[0]); +} + +/* ************* exported C functions ************** */ + +// no name mangling for these functions, we want the c files to be able to get +// at these functions +extern "C" { + +void __kmp_reset_stats() { + kmp_stats_list::iterator it; + for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) { + timeStat *timers = (*it)->getTimers(); + counter *counters = (*it)->getCounters(); + + for (int t = 0; t < TIMER_LAST; t++) + timers[t].reset(); + + for (int c = 0; c < COUNTER_LAST; c++) + counters[c].reset(); + + // reset the event vector so all previous events are "erased" + (*it)->resetEventVector(); + } +} + +// This function will reset all stats and stop all threads' explicit timers if +// they haven't been stopped already. +void __kmp_output_stats(const char *heading) { + __kmp_stats_global_output->outputStats(heading); + __kmp_reset_stats(); +} + +void __kmp_accumulate_stats_at_exit(void) { + // Only do this once. 
+ if (KMP_XCHG_FIXED32(&statsPrinted, 1) != 0) + return; + + __kmp_output_stats("Statistics on exit"); +} + +void __kmp_stats_init(void) { + __kmp_init_tas_lock(&__kmp_stats_lock); + __kmp_stats_start_time = tsc_tick_count::now(); + __kmp_stats_global_output = new kmp_stats_output_module(); + __kmp_stats_list = new kmp_stats_list(); +} + +void __kmp_stats_fini(void) { + __kmp_accumulate_stats_at_exit(); + __kmp_stats_list->deallocate(); + delete __kmp_stats_global_output; + delete __kmp_stats_list; +} + +} // extern "C" + +#endif // LIBOMP_STATS diff --git a/third_party/openmp/kmp_stats.h b/third_party/openmp/kmp_stats.h new file mode 100644 index 000000000..f7f8f5f92 --- /dev/null +++ b/third_party/openmp/kmp_stats.h @@ -0,0 +1,1021 @@ +#ifndef KMP_STATS_H +#define KMP_STATS_H + +/** @file kmp_stats.h + * Functions for collecting statistics. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_config.h" +#include "kmp_debug.h" + +#if KMP_STATS_ENABLED +/* Statistics accumulator. + Accumulates number of samples and computes min, max, mean, standard deviation + on the fly. + + Online variance calculation algorithm from + http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm + */ + +#include "kmp_stats_timing.h" +#include +#include +#include // placement new +#include +#include +#include + +/* Enable developer statistics here if you want them. They are more detailed + than is useful for application characterisation and are intended for the + runtime library developer. */ +#define KMP_DEVELOPER_STATS 0 + +/* Enable/Disable histogram output */ +#define KMP_STATS_HIST 0 + +/*! + * @ingroup STATS_GATHERING + * \brief flags to describe the statistic (timer or counter) + * + */ +enum stats_flags_e { + noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic + onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread + noUnits = 1 << 2, //!< statistic doesn't need units printed next to it + notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads + logEvent = 1 << 4 //!< statistic can be logged on the event timeline when + //! KMP_STATS_EVENTS is on (valid only for timers) +}; + +/*! + * @ingroup STATS_GATHERING + * \brief the states which a thread can be in + * + */ +enum stats_state_e { + IDLE, + SERIAL_REGION, + FORK_JOIN_BARRIER, + PLAIN_BARRIER, + TASKWAIT, + TASKYIELD, + TASKGROUP, + IMPLICIT_TASK, + EXPLICIT_TASK, + TEAMS_REGION +}; + +/*! + * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h + * + * @param macro a user defined macro that takes three arguments - + * macro(COUNTER_NAME, flags, arg) + * @param arg a user defined argument to send to the user defined macro + * + * \details A counter counts the occurrence of some event. Each thread + * accumulates its own count, at the end of execution the counts are aggregated + * treating each thread as a separate measurement. (Unless onlyInMaster is set, + * in which case there's only a single measurement). The min,mean,max are + * therefore the values for the threads. Adding the counter here and then + * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you + * need to do. 
All of the tables and printing is generated from this macro. + * Format is "macro(name, flags, arg)" + * + * @ingroup STATS_GATHERING + */ +// clang-format off +#define KMP_FOREACH_COUNTER(macro, arg) \ + macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \ + macro(OMP_NESTED_PARALLEL, 0, arg) \ + macro(OMP_LOOP_STATIC, 0, arg) \ + macro(OMP_LOOP_STATIC_STEAL, 0, arg) \ + macro(OMP_LOOP_DYNAMIC, 0, arg) \ + macro(OMP_DISTRIBUTE, 0, arg) \ + macro(OMP_BARRIER, 0, arg) \ + macro(OMP_CRITICAL, 0, arg) \ + macro(OMP_SINGLE, 0, arg) \ + macro(OMP_SECTIONS, 0, arg) \ + macro(OMP_MASTER, 0, arg) \ + macro(OMP_MASKED, 0, arg) \ + macro(OMP_TEAMS, 0, arg) \ + macro(OMP_set_lock, 0, arg) \ + macro(OMP_test_lock, 0, arg) \ + macro(REDUCE_wait, 0, arg) \ + macro(REDUCE_nowait, 0, arg) \ + macro(OMP_TASKYIELD, 0, arg) \ + macro(OMP_TASKLOOP, 0, arg) \ + macro(TASK_executed, 0, arg) \ + macro(TASK_cancelled, 0, arg) \ + macro(TASK_stolen, 0, arg) +// clang-format on + +/*! + * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h + * + * @param macro a user defined macro that takes three arguments - + * macro(TIMER_NAME, flags, arg) + * @param arg a user defined argument to send to the user defined macro + * + * \details A timer collects multiple samples of some count in each thread and + * then finally aggregates all of the samples from all of the threads. For most + * timers the printing code also provides an aggregation over the thread totals. + * These are printed as TOTAL_foo. The count is normally a time (in ticks), + * hence the name "timer". (But can be any value, so we use this for "number of + * arguments passed to fork" as well). For timers the threads are not + * significant, it's the individual observations that count, so the statistics + * are at that level. 
Format is "macro(name, flags, arg)" + * + * @ingroup STATS_GATHERING2 + */ +// clang-format off +#define KMP_FOREACH_TIMER(macro, arg) \ + macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ + macro (OMP_parallel, stats_flags_e::logEvent, arg) \ + macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \ + macro (OMP_teams, stats_flags_e::logEvent, arg) \ + macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \ + macro (OMP_loop_static, 0, arg) \ + macro (OMP_loop_static_scheduling, 0, arg) \ + macro (OMP_loop_dynamic, 0, arg) \ + macro (OMP_loop_dynamic_scheduling, 0, arg) \ + macro (OMP_distribute, 0, arg) \ + macro (OMP_distribute_scheduling, 0, arg) \ + macro (OMP_critical, 0, arg) \ + macro (OMP_critical_wait, 0, arg) \ + macro (OMP_single, 0, arg) \ + macro (OMP_sections, 0, arg) \ + macro (OMP_sections_overhead, 0, arg) \ + macro (OMP_master, 0, arg) \ + macro (OMP_masked, 0, arg) \ + macro (OMP_task_immediate, 0, arg) \ + macro (OMP_task_taskwait, 0, arg) \ + macro (OMP_task_taskyield, 0, arg) \ + macro (OMP_task_taskgroup, 0, arg) \ + macro (OMP_task_join_bar, 0, arg) \ + macro (OMP_task_plain_bar, 0, arg) \ + macro (OMP_taskloop_scheduling, 0, arg) \ + macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ + macro (OMP_idle, stats_flags_e::logEvent, arg) \ + macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ + macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ + macro (OMP_serial, stats_flags_e::logEvent, arg) \ + macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \ + arg) \ + macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ + arg) \ + macro (OMP_loop_static_iterations, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + macro (OMP_loop_static_total_iterations, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + macro (OMP_loop_dynamic_iterations, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + macro (OMP_loop_dynamic_total_iterations, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + macro (OMP_distribute_iterations, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + KMP_FOREACH_DEVELOPER_TIMER(macro, arg) +// clang-format on + +// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either +// initializing OpenMP or being created by a primary +// thread) until the thread is destroyed +// OMP_parallel -- Time thread spends executing work directly +// within a #pragma omp parallel +// OMP_parallel_overhead -- Time thread spends setting up a parallel region +// OMP_loop_static -- Time thread spends executing loop iterations from +// a statically scheduled loop +// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations +// from a statically scheduled loop +// OMP_loop_dynamic -- Time thread spends executing loop iterations from +// a dynamically scheduled loop +// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations +// from a dynamically scheduled loop +// OMP_critical -- Time thread spends executing critical section +// OMP_critical_wait -- Time thread spends waiting to enter +// a critical section +// OMP_single -- Time spent executing a "single" region +// OMP_master -- Time spent executing a "master" region +// OMP_masked -- Time spent executing a "masked" region +// OMP_task_immediate -- Time spent executing non-deferred tasks +// OMP_task_taskwait -- Time spent executing tasks inside a taskwait +// construct +// OMP_task_taskyield -- Time spent executing tasks inside a 
taskyield +// construct +// OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup +// construct +// OMP_task_join_bar -- Time spent executing tasks inside a join barrier +// OMP_task_plain_bar -- Time spent executing tasks inside a barrier +// construct +// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop +// construct +// OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or +// inside implicit barrier at end of worksharing +// construct +// OMP_idle -- Time worker threads spend waiting for next +// parallel region +// OMP_fork_barrier -- Time spent in a the fork barrier surrounding a +// parallel region +// OMP_join_barrier -- Time spent in a the join barrier surrounding a +// parallel region +// OMP_serial -- Time thread zero spends executing serial code +// OMP_set_numthreads -- Values passed to omp_set_num_threads +// OMP_PARALLEL_args -- Number of arguments passed to a parallel region +// OMP_loop_static_iterations -- Number of iterations thread is assigned for +// statically scheduled loops +// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for +// dynamically scheduled loops + +#if (KMP_DEVELOPER_STATS) +// Timers which are of interest to runtime library developers, not end users. +// These have to be explicitly enabled in addition to the other stats. + +// KMP_fork_barrier -- time in __kmp_fork_barrier +// KMP_join_barrier -- time in __kmp_join_barrier +// KMP_barrier -- time in __kmp_barrier +// KMP_end_split_barrier -- time in __kmp_end_split_barrier +// KMP_setup_icv_copy -- time in __kmp_setup_icv_copy +// KMP_icv_copy -- start/stop timer for any ICV copying +// KMP_linear_gather -- time in __kmp_linear_barrier_gather +// KMP_linear_release -- time in __kmp_linear_barrier_release +// KMP_tree_gather -- time in __kmp_tree_barrier_gather +// KMP_tree_release -- time in __kmp_tree_barrier_release +// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather +// KMP_hyper_release -- time in __kmp_hyper_barrier_release +// KMP_dist_gather -- time in __kmp_dist_barrier_gather +// KMP_dist_release -- time in __kmp_dist_barrier_release +// clang-format off +#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ + macro(KMP_fork_call, 0, arg) \ + macro(KMP_join_call, 0, arg) \ + macro(KMP_end_split_barrier, 0, arg) \ + macro(KMP_hier_gather, 0, arg) \ + macro(KMP_hier_release, 0, arg) \ + macro(KMP_hyper_gather, 0, arg) \ + macro(KMP_hyper_release, 0, arg) \ + macro(KMP_dist_gather, 0, arg) \ + macro(KMP_dist_release, 0, arg) \ + macro(KMP_linear_gather, 0, arg) \ + macro(KMP_linear_release, 0, arg) \ + macro(KMP_tree_gather, 0, arg) \ + macro(KMP_tree_release, 0, arg) \ + macro(USER_resume, 0, arg) \ + macro(USER_suspend, 0, arg) \ + macro(USER_mwait, 0, arg) \ + macro(KMP_allocate_team, 0, arg) \ + macro(KMP_setup_icv_copy, 0, arg) \ + macro(USER_icv_copy, 0, arg) \ + macro (FOR_static_steal_stolen, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ + macro (FOR_static_steal_chunks, \ + stats_flags_e::noUnits | stats_flags_e::noTotal, arg) +#else +#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) +#endif +// clang-format on + +/*! + * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. + * + * @param macro a user defined macro that takes three arguments - + * macro(TIMER_NAME, flags, arg) + * @param arg a user defined argument to send to the user defined macro + * + * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE + * BAD THINGS WILL HAPPEN! 
+ * + * \details Explicit timers are ones where we need to allocate a timer itself + * (as well as the accumulated timing statistics). We allocate these on a + * per-thread basis, and explicitly start and stop them. Block timers just + * allocate the timer itself on the stack, and use the destructor to notice + * block exit; they don't need to be defined here. The name here should be the + * same as that of a timer above. + * + * @ingroup STATS_GATHERING + */ +#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg) + +#define ENUMERATE(name, ignore, prefix) prefix##name, +enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; + +enum explicit_timer_e { + KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST +}; + +enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; +#undef ENUMERATE + +/* + * A logarithmic histogram. It accumulates the number of values in each power of + * ten bin. So 1<=x<10, 10<=x<100, ... + * Mostly useful where we have some big outliers and want to see information + * about them. + */ +class logHistogram { + enum { + numBins = 31, /* Number of powers of 10. If this changes you need to change + * the initializer for binMax */ + + /* + * If you want to use this to analyse values that may be less than 1, (for + * instance times in s), then the logOffset gives you negative powers. + * In our case here, we're just looking at times in ticks, or counts, so we + * can never see values with magnitude < 1 (other than zero), so we can set + * it to 0. As above change the initializer if you change this. + */ + logOffset = 0 + }; + uint32_t KMP_ALIGN_CACHE zeroCount; + struct { + uint32_t count; + double total; + } bins[numBins]; + + static double binMax[numBins]; + +#ifdef KMP_DEBUG + uint64_t _total; + + void check() const { + uint64_t t = zeroCount; + for (int i = 0; i < numBins; i++) + t += bins[i].count; + KMP_DEBUG_ASSERT(t == _total); + } +#else + void check() const {} +#endif + +public: + logHistogram() { reset(); } + + logHistogram(logHistogram const &o) { + for (int i = 0; i < numBins; i++) + bins[i] = o.bins[i]; +#ifdef KMP_DEBUG + _total = o._total; +#endif + } + + void reset() { + zeroCount = 0; + for (int i = 0; i < numBins; i++) { + bins[i].count = 0; + bins[i].total = 0; + } + +#ifdef KMP_DEBUG + _total = 0; +#endif + } + uint32_t count(int b) const { return bins[b + logOffset].count; } + double total(int b) const { return bins[b + logOffset].total; } + static uint32_t findBin(double sample); + + logHistogram &operator+=(logHistogram const &o) { + zeroCount += o.zeroCount; + for (int i = 0; i < numBins; i++) { + bins[i].count += o.bins[i].count; + bins[i].total += o.bins[i].total; + } +#ifdef KMP_DEBUG + _total += o._total; + check(); +#endif + + return *this; + } + + void addSample(double sample); + int minBin() const; + int maxBin() const; + + std::string format(char) const; +}; + +class statistic { + double KMP_ALIGN_CACHE minVal; + double maxVal; + double meanVal; + double m2; + uint64_t sampleCount; + double offset; + bool collectingHist; + logHistogram hist; + +public: + statistic(bool doHist = bool(KMP_STATS_HIST)) { + reset(); + collectingHist = doHist; + } + statistic(statistic const &o) + : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), + sampleCount(o.sampleCount), offset(o.offset), + collectingHist(o.collectingHist), hist(o.hist) {} + statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) + : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * 
sd * sc),
+        sampleCount(sc), offset(0.0), collectingHist(false) {}
+  bool haveHist() const { return collectingHist; }
+  double getMin() const { return minVal; }
+  double getMean() const { return meanVal; }
+  double getMax() const { return maxVal; }
+  uint64_t getCount() const { return sampleCount; }
+  double getSD() const { return sqrt(m2 / sampleCount); }
+  double getTotal() const { return sampleCount * meanVal; }
+  logHistogram const *getHist() const { return &hist; }
+  void setOffset(double d) { offset = d; }
+
+  void reset() {
+    minVal = (std::numeric_limits<double>::max)();
+    maxVal = -minVal;
+    meanVal = 0.0;
+    m2 = 0.0;
+    sampleCount = 0;
+    offset = 0.0;
+    hist.reset();
+  }
+  void addSample(double sample);
+  void scale(double factor);
+  void scaleDown(double f) { scale(1. / f); }
+  void forceCount(uint64_t count) { sampleCount = count; }
+  statistic &operator+=(statistic const &other);
+
+  std::string format(char unit, bool total = false) const;
+  std::string formatHist(char unit) const { return hist.format(unit); }
+};
+
+struct statInfo {
+  const char *name;
+  uint32_t flags;
+};
+
+class timeStat : public statistic {
+  static statInfo timerInfo[];
+
+public:
+  timeStat() : statistic() {}
+  static const char *name(timer_e e) { return timerInfo[e].name; }
+  static bool noTotal(timer_e e) {
+    return timerInfo[e].flags & stats_flags_e::noTotal;
+  }
+  static bool masterOnly(timer_e e) {
+    return timerInfo[e].flags & stats_flags_e::onlyInMaster;
+  }
+  static bool workerOnly(timer_e e) {
+    return timerInfo[e].flags & stats_flags_e::notInMaster;
+  }
+  static bool noUnits(timer_e e) {
+    return timerInfo[e].flags & stats_flags_e::noUnits;
+  }
+  static bool logEvent(timer_e e) {
+    return timerInfo[e].flags & stats_flags_e::logEvent;
+  }
+  static void clearEventFlags() {
+    for (int i = 0; i < TIMER_LAST; i++) {
+      timerInfo[i].flags &= (~(stats_flags_e::logEvent));
+    }
+  }
+};
+
+// Where we need explicitly to start and end the timer, this version can be used
+// Since these timers normally aren't nicely scoped, so don't have a good place
+// to live on the stack of the thread, they're more work to use.
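
Before the class itself, here is a rough, standalone analogue of the explicit start/stop pattern described in the comment above. It is a sketch only: std::chrono stands in for the TSC-based tsc_tick_count, and every name in it is hypothetical rather than part of this runtime.

    #include <chrono>
    #include <cstdio>

    struct toy_stat {
      double total = 0.0;
      unsigned long long samples = 0;
      void addSample(double v) {
        total += v;
        ++samples;
      }
    };

    struct toy_explicit_timer {
      toy_stat *stat;
      std::chrono::steady_clock::time_point startTime;
      void start() { startTime = std::chrono::steady_clock::now(); }
      void stop() { // fold the elapsed interval into the owning statistic
        std::chrono::duration<double> d =
            std::chrono::steady_clock::now() - startTime;
        stat->addSample(d.count());
      }
    };

    int main() {
      toy_stat s;
      toy_explicit_timer t{&s};
      t.start(); // explicitly started...
      // ... region being measured ...
      t.stop();  // ...and explicitly stopped
      std::printf("%llu sample(s), %.9f s total\n", s.samples, s.total);
      return 0;
    }

The scoped block timers defined further down (blockPartitionedTimer) get the same effect without the explicit stop, by relying on the destructor at scope exit.
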
+class explicitTimer {
+  timeStat *stat;
+  timer_e timerEnumValue;
+  tsc_tick_count startTime;
+  tsc_tick_count pauseStartTime;
+  tsc_tick_count::tsc_interval_t totalPauseTime;
+
+public:
+  explicitTimer(timeStat *s, timer_e te)
+      : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
+        totalPauseTime() {}
+
+  // void setStat(timeStat *s) { stat = s; }
+  void start(tsc_tick_count tick);
+  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
+  void resume(tsc_tick_count tick) {
+    totalPauseTime += (tick - pauseStartTime);
+  }
+  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
+  void reset() {
+    startTime = 0;
+    pauseStartTime = 0;
+    totalPauseTime = 0;
+  }
+  timer_e get_type() const { return timerEnumValue; }
+};
+
+// Where you need to partition a thread's clock ticks into separate states
+// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
+// DOING_NOTHING would render these conditions:
+// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
+// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
+// versa
+class partitionedTimers {
+private:
+  std::vector<explicitTimer> timer_stack;
+
+public:
+  partitionedTimers();
+  void init(explicitTimer timer);
+  void exchange(explicitTimer timer);
+  void push(explicitTimer timer);
+  void pop();
+  void windup();
+};
+
+// Special wrapper around the partitioned timers to aid timing code blocks
+// It avoids the need to have an explicit end, leaving the scope suffices.
+class blockPartitionedTimer {
+  partitionedTimers *part_timers;
+
+public:
+  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
+      : part_timers(pt) {
+    part_timers->push(timer);
+  }
+  ~blockPartitionedTimer() { part_timers->pop(); }
+};
+
+// Special wrapper around the thread state to aid in keeping state in code
+// blocks. It avoids the need to have an explicit end, leaving the scope
+// suffices.
+class blockThreadState {
+  stats_state_e *state_pointer;
+  stats_state_e old_state;
+
+public:
+  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
+      : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
+    *state_pointer = new_state;
+  }
+  ~blockThreadState() { *state_pointer = old_state; }
+};
+
+// If all you want is a count, then you can use this...
+// The individual per-thread counts will be aggregated into a statistic at
+// program exit.
+class counter {
+  uint64_t value;
+  static const statInfo counterInfo[];
+
+public:
+  counter() : value(0) {}
+  void increment() { value++; }
+  uint64_t getValue() const { return value; }
+  void reset() { value = 0; }
+  static const char *name(counter_e e) { return counterInfo[e].name; }
+  static bool masterOnly(counter_e e) {
+    return counterInfo[e].flags & stats_flags_e::onlyInMaster;
+  }
+};
+
+/* ****************************************************************
+   Class to implement an event
+
+   There are four components to an event: start time, stop time,
+   nest_level, and timer_name.
+   The start and stop time should be obvious (recorded in clock ticks).
+   The nest_level relates to the bar width in the timeline graph.
+   The timer_name is used to determine which timer event triggered this event.
+ + the interface to this class is through four read-only operations: + 1) getStart() -- returns the start time as 64 bit integer + 2) getStop() -- returns the stop time as 64 bit integer + 3) getNestLevel() -- returns the nest level of the event + 4) getTimerName() -- returns the timer name that triggered event + + *MORE ON NEST_LEVEL* + The nest level is used in the bar graph that represents the timeline. + Its main purpose is for showing how events are nested inside eachother. + For example, say events, A, B, and C are recorded. If the timeline + looks like this: + +Begin -------------------------------------------------------------> Time + | | | | | | + A B C C B A + start start start end end end + + Then A, B, C will have a nest level of 1, 2, 3 respectively. + These values are then used to calculate the barwidth so you can + see that inside A, B has occurred, and inside B, C has occurred. + Currently, this is shown with A's bar width being larger than B's + bar width, and B's bar width being larger than C's bar width. + +**************************************************************** */ +class kmp_stats_event { + uint64_t start; + uint64_t stop; + int nest_level; + timer_e timer_name; + +public: + kmp_stats_event() + : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} + kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) + : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} + inline uint64_t getStart() const { return start; } + inline uint64_t getStop() const { return stop; } + inline int getNestLevel() const { return nest_level; } + inline timer_e getTimerName() const { return timer_name; } +}; + +/* **************************************************************** + Class to implement a dynamically expandable array of events + + --------------------------------------------------------- + | event 1 | event 2 | event 3 | event 4 | ... | event N | + --------------------------------------------------------- + + An event is pushed onto the back of this array at every + explicitTimer->stop() call. The event records the thread #, + start time, stop time, and nest level related to the bar width. + + The event vector starts at size INIT_SIZE and grows (doubles in size) + if needed. An implication of this behavior is that log(N) + reallocations are needed (where N is number of events). If you want + to avoid reallocations, then set INIT_SIZE to a large value. 
+ + the interface to this class is through six operations: + 1) reset() -- sets the internal_size back to 0 but does not deallocate any + memory + 2) size() -- returns the number of valid elements in the vector + 3) push_back(start, stop, nest, timer_name) -- pushes an event onto + the back of the array + 4) deallocate() -- frees all memory associated with the vector + 5) sort() -- sorts the vector by start time + 6) operator[index] or at(index) -- returns event reference at that index +**************************************************************** */ +class kmp_stats_event_vector { + kmp_stats_event *events; + int internal_size; + int allocated_size; + static const int INIT_SIZE = 1024; + +public: + kmp_stats_event_vector() { + events = + (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); + internal_size = 0; + allocated_size = INIT_SIZE; + } + ~kmp_stats_event_vector() {} + inline void reset() { internal_size = 0; } + inline int size() const { return internal_size; } + void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, + timer_e name) { + int i; + if (internal_size == allocated_size) { + kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( + sizeof(kmp_stats_event) * allocated_size * 2); + for (i = 0; i < internal_size; i++) + tmp[i] = events[i]; + __kmp_free(events); + events = tmp; + allocated_size *= 2; + } + events[internal_size] = + kmp_stats_event(start_time, stop_time, nest_level, name); + internal_size++; + return; + } + void deallocate(); + void sort(); + const kmp_stats_event &operator[](int index) const { return events[index]; } + kmp_stats_event &operator[](int index) { return events[index]; } + const kmp_stats_event &at(int index) const { return events[index]; } + kmp_stats_event &at(int index) { return events[index]; } +}; + +/* **************************************************************** + Class to implement a doubly-linked, circular, statistics list + + |---| ---> |---| ---> |---| ---> |---| ---> ... next + | | | | | | | | + |---| <--- |---| <--- |---| <--- |---| <--- ... prev + Sentinel first second third + Node node node node + + The Sentinel Node is the user handle on the list. + The first node corresponds to thread 0's statistics. + The second node corresponds to thread 1's statistics and so on... + + Each node has a _timers, _counters, and _explicitTimers array to hold that + thread's statistics. The _explicitTimers point to the correct _timer and + update its statistics at every stop() call. The explicitTimers' pointers are + set up in the constructor. Each node also has an event vector to hold that + thread's timing events. The event vector expands as necessary and records + the start-stop times for each timer. + + The nestLevel variable is for plotting events and is related + to the bar width in the timeline graph. + + Every thread will have a thread local pointer to its node in + the list. The sentinel node is used by the primary thread to + store "dummy" statistics before __kmp_create_worker() is called. 
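
   As a usage sketch, hedged and illustrative only (it simply mirrors the
   loops this patch already uses above, e.g. in outputStats() and
   __kmp_reset_stats()):

     kmp_stats_list::iterator it;
     for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
       kmp_stats_list *node = *it;             // one node per OpenMP thread
       int gtid = node->getGtid();             // that thread's global id
       uint64_t forks =
           node->getCounter(COUNTER_OMP_PARALLEL)->getValue();
       // aggregate or print gtid / forks / node->getTimers() as needed
     }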
+**************************************************************** */ +class kmp_stats_list { + int gtid; + timeStat _timers[TIMER_LAST + 1]; + counter _counters[COUNTER_LAST + 1]; + explicitTimer thread_life_timer; + partitionedTimers _partitionedTimers; + int _nestLevel; // one per thread + kmp_stats_event_vector _event_vector; + kmp_stats_list *next; + kmp_stats_list *prev; + stats_state_e state; + int thread_is_idle_flag; + +public: + kmp_stats_list() + : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], + TIMER_OMP_worker_thread_life), + _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), + thread_is_idle_flag(0) {} + ~kmp_stats_list() {} + inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } + inline counter *getCounter(counter_e idx) { return &_counters[idx]; } + inline partitionedTimers *getPartitionedTimers() { + return &_partitionedTimers; + } + inline timeStat *getTimers() { return _timers; } + inline counter *getCounters() { return _counters; } + inline kmp_stats_event_vector &getEventVector() { return _event_vector; } + inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } + inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } + inline void resetEventVector() { _event_vector.reset(); } + inline void incrementNestValue() { _nestLevel++; } + inline int getNestValue() { return _nestLevel; } + inline void decrementNestValue() { _nestLevel--; } + inline int getGtid() const { return gtid; } + inline void setGtid(int newgtid) { gtid = newgtid; } + inline void setState(stats_state_e newstate) { state = newstate; } + inline stats_state_e getState() const { return state; } + inline stats_state_e *getStatePointer() { return &state; } + inline bool isIdle() { return thread_is_idle_flag == 1; } + inline void setIdleFlag() { thread_is_idle_flag = 1; } + inline void resetIdleFlag() { thread_is_idle_flag = 0; } + kmp_stats_list *push_back(int gtid); // returns newly created list node + inline void push_event(uint64_t start_time, uint64_t stop_time, + int nest_level, timer_e name) { + _event_vector.push_back(start_time, stop_time, nest_level, name); + } + void deallocate(); + class iterator; + kmp_stats_list::iterator begin(); + kmp_stats_list::iterator end(); + int size(); + class iterator { + kmp_stats_list *ptr; + friend kmp_stats_list::iterator kmp_stats_list::begin(); + friend kmp_stats_list::iterator kmp_stats_list::end(); + + public: + iterator(); + ~iterator(); + iterator operator++(); + iterator operator++(int dummy); + iterator operator--(); + iterator operator--(int dummy); + bool operator!=(const iterator &rhs); + bool operator==(const iterator &rhs); + kmp_stats_list *operator*() const; // dereference operator + }; +}; + +/* **************************************************************** + Class to encapsulate all output functions and the environment variables + + This module holds filenames for various outputs (normal stats, events, plot + file), as well as coloring information for the plot file. + + The filenames and flags variables are read from environment variables. + These are read once by the constructor of the global variable + __kmp_stats_output which calls init(). + + During this init() call, event flags for the timeStat::timerInfo[] global + array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). + + The only interface function that is public is outputStats(heading). 
This + function should print out everything it needs to, either to files or stderr, + depending on the environment variables described below + + ENVIRONMENT VARIABLES: + KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this + file, otherwise, print to stderr + KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to + either KMP_STATS_FILE or stderr + KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, + otherwise, the plot file is sent to "events.plt" + KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log + events + KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, + otherwise, output is sent to "events.dat" +**************************************************************** */ +class kmp_stats_output_module { + +public: + struct rgb_color { + float r; + float g; + float b; + }; + +private: + std::string outputFileName; + static const char *eventsFileName; + static const char *plotFileName; + static int printPerThreadFlag; + static int printPerThreadEventsFlag; + static const rgb_color globalColorArray[]; + static rgb_color timerColorInfo[]; + + void init(); + static void setupEventColors(); + static void printPloticusFile(); + static void printHeaderInfo(FILE *statsOut); + static void printTimerStats(FILE *statsOut, statistic const *theStats, + statistic const *totalStats); + static void printCounterStats(FILE *statsOut, statistic const *theStats); + static void printCounters(FILE *statsOut, counter const *theCounters); + static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, + int gtid); + static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } + static void windupExplicitTimers(); + bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } + +public: + kmp_stats_output_module() { init(); } + void outputStats(const char *heading); +}; + +#ifdef __cplusplus +extern "C" { +#endif +void __kmp_stats_init(); +void __kmp_stats_fini(); +void __kmp_reset_stats(); +void __kmp_output_stats(const char *); +void __kmp_accumulate_stats_at_exit(void); +// thread local pointer to stats node within list +extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; +// head to stats list. +extern kmp_stats_list *__kmp_stats_list; +// lock for __kmp_stats_list +extern kmp_tas_lock_t __kmp_stats_lock; +// reference start time +extern tsc_tick_count __kmp_stats_start_time; +// interface to output +extern kmp_stats_output_module __kmp_stats_output; + +#ifdef __cplusplus +} +#endif + +// Simple, standard interfaces that drop out completely if stats aren't enabled + +/*! + * \brief Adds value to specified timer (name). + * + * @param name timer name as specified under the KMP_FOREACH_TIMER() macro + * @param value double precision sample value to add to statistics for the timer + * + * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to + * a timer statistics. + * + * @ingroup STATS_GATHERING + */ +#define KMP_COUNT_VALUE(name, value) \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value) + +/*! + * \brief Increments specified counter (name). + * + * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro + * + * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics + * counter for the executing thread. + * + * @ingroup STATS_GATHERING + */ +#define KMP_COUNT_BLOCK(name) \ + __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() + +/*! 
+ * \brief Outputs the current thread statistics and reset them. + * + * @param heading_string heading put above the final stats output + * + * \details Explicitly stops all timers and outputs all stats. Environment + * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a + * filename instead of stderr. Environment variable, + * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific + * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be + * defined with any value, which will print out thread specific stats, or it can + * be undefined (not specified in the environment) and thread specific stats + * won't be printed. It should be noted that all statistics are reset when this + * macro is called. + * + * @ingroup STATS_GATHERING + */ +#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) + +/*! + * \brief Initializes the partitioned timers to begin with name. + * + * @param name timer which you want this thread to begin with + * + * @ingroup STATS_GATHERING + */ +#define KMP_INIT_PARTITIONED_TIMERS(name) \ + __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) + +#define KMP_TIME_PARTITIONED_BLOCK(name) \ + blockPartitionedTimer __PBLOCKTIME__( \ + __kmp_stats_thread_ptr->getPartitionedTimers(), \ + explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ + TIMER_##name)) + +#define KMP_PUSH_PARTITIONED_TIMER(name) \ + __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) + +#define KMP_POP_PARTITIONED_TIMER() \ + __kmp_stats_thread_ptr->getPartitionedTimers()->pop() + +#define KMP_EXCHANGE_PARTITIONED_TIMER(name) \ + __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) + +#define KMP_SET_THREAD_STATE(state_name) \ + __kmp_stats_thread_ptr->setState(state_name) + +#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState() + +#define KMP_SET_THREAD_STATE_BLOCK(state_name) \ + blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \ + state_name) + +/*! + * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) + * + * \details Reset all stats for all threads. 
+ * + * @ingroup STATS_GATHERING + */ +#define KMP_RESET_STATS() __kmp_reset_stats() + +#if (KMP_DEVELOPER_STATS) +#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v) +#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n) +#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n) +#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n) +#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n) +#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \ + KMP_EXCHANGE_PARTITIONED_TIMER(n) +#else +// Null definitions +#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) +#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) +#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) +#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) +#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) +#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) +#endif + +#else // KMP_STATS_ENABLED + +// Null definitions +#define KMP_COUNT_VALUE(n, v) ((void)0) +#define KMP_COUNT_BLOCK(n) ((void)0) + +#define KMP_OUTPUT_STATS(heading_string) ((void)0) +#define KMP_RESET_STATS() ((void)0) + +#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) +#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) +#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) +#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) +#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) +#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) +#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0) +#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0) +#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0) +#define KMP_POP_PARTITIONED_TIMER() ((void)0) +#define KMP_SET_THREAD_STATE(state_name) ((void)0) +#define KMP_GET_THREAD_STATE() ((void)0) +#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0) +#endif // KMP_STATS_ENABLED + +#endif // KMP_STATS_H diff --git a/third_party/openmp/kmp_stats_timing.cpp b/third_party/openmp/kmp_stats_timing.cpp new file mode 100644 index 000000000..bdfe68c3f --- /dev/null +++ b/third_party/openmp/kmp_stats_timing.cpp @@ -0,0 +1,130 @@ +/** @file kmp_stats_timing.cpp + * Timing functions + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "kmp.h" +#include "kmp_stats_timing.h" + +using namespace std; + +#if KMP_HAVE_TICK_TIME +#if KMP_MIC +double tsc_tick_count::tick_time() { + // pretty bad assumption of 1GHz clock for MIC + return 1 / ((double)1000 * 1.e6); +} +#elif KMP_ARCH_X86 || KMP_ARCH_X86_64 +#include +// Extract the value from the CPUID information +double tsc_tick_count::tick_time() { + static double result = 0.0; + + if (result == 0.0) { + kmp_cpuid_t cpuinfo; + char brand[256]; + + __kmp_x86_cpuid(0x80000000, 0, &cpuinfo); + memset(brand, 0, sizeof(brand)); + int ids = cpuinfo.eax; + + for (unsigned int i = 2; i < (ids ^ 0x80000000) + 2; i++) + __kmp_x86_cpuid(i | 0x80000000, 0, + (kmp_cpuid_t *)(brand + (i - 2) * sizeof(kmp_cpuid_t))); + + char *start = &brand[0]; + for (; *start == ' '; start++) + ; + + char *end = brand + KMP_STRLEN(brand) - 3; + uint64_t multiplier; + + if (*end == 'M') + multiplier = 1000LL * 1000LL; + else if (*end == 'G') + multiplier = 1000LL * 1000LL * 1000LL; + else if (*end == 'T') + multiplier = 1000LL * 1000LL * 1000LL * 1000LL; + else { + cout << "Error determining multiplier '" << *end << "'\n"; + exit(-1); + } + *end = 0; + while (*end != ' ') + end--; + end++; + + double freq = strtod(end, &start); + if (freq == 0.0) { + cout << "Error calculating frequency " << end << "\n"; + exit(-1); + } + + result = ((double)1.0) / (freq * multiplier); + } + return result; +} +#endif +#endif + +static bool useSI = true; + +// Return a formatted string after normalising the value into +// engineering style and using a suitable unit prefix (e.g. ms, us, ns). +std::string formatSI(double interval, int width, char unit) { + std::stringstream os; + + if (useSI) { + // Preserve accuracy for small numbers, since we only multiply and the + // positive powers of ten are precisely representable. + static struct { + double scale; + char prefix; + } ranges[] = {{1.e21, 'y'}, {1.e18, 'z'}, {1.e15, 'a'}, {1.e12, 'f'}, + {1.e9, 'p'}, {1.e6, 'n'}, {1.e3, 'u'}, {1.0, 'm'}, + {1.e-3, ' '}, {1.e-6, 'k'}, {1.e-9, 'M'}, {1.e-12, 'G'}, + {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'}, {1.e-24, 'Z'}, + {1.e-27, 'Y'}}; + + if (interval == 0.0) { + os << std::setw(width - 3) << std::right << "0.00" << std::setw(3) + << unit; + return os.str(); + } + + bool negative = false; + if (interval < 0.0) { + negative = true; + interval = -interval; + } + + for (int i = 0; i < (int)(sizeof(ranges) / sizeof(ranges[0])); i++) { + if (interval * ranges[i].scale < 1.e0) { + interval = interval * 1000.e0 * ranges[i].scale; + os << std::fixed << std::setprecision(2) << std::setw(width - 3) + << std::right << (negative ? -interval : interval) << std::setw(2) + << ranges[i].prefix << std::setw(1) << unit; + + return os.str(); + } + } + } + os << std::setprecision(2) << std::fixed << std::right << std::setw(width - 3) + << interval << std::setw(3) << unit; + + return os.str(); +} diff --git a/third_party/openmp/kmp_stats_timing.h b/third_party/openmp/kmp_stats_timing.h new file mode 100644 index 000000000..3feafbe34 --- /dev/null +++ b/third_party/openmp/kmp_stats_timing.h @@ -0,0 +1,116 @@ +#ifndef KMP_STATS_TIMING_H +#define KMP_STATS_TIMING_H + +/** @file kmp_stats_timing.h + * Access to real time clock and timers. 
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_os.h"
+#include <limits>
+#include <stdint.h>
+#include <string>
+#if KMP_HAVE_X86INTRIN_H
+#include <x86intrin.h>
+#endif
+#include "libc/nexgen32e/rdtsc.h"
+
+class tsc_tick_count {
+private:
+  int64_t my_count;
+
+public:
+  class tsc_interval_t {
+    int64_t value;
+    explicit tsc_interval_t(int64_t _value) : value(_value) {}
+
+  public:
+    tsc_interval_t() : value(0) {} // Construct 0 time duration
+#if KMP_HAVE_TICK_TIME
+    double seconds() const; // Return the length of a time interval in seconds
+#endif
+    double ticks() const { return double(value); }
+    int64_t getValue() const { return value; }
+    tsc_interval_t &operator=(int64_t nvalue) {
+      value = nvalue;
+      return *this;
+    }
+
+    friend class tsc_tick_count;
+
+    friend tsc_interval_t operator-(const tsc_tick_count &t1,
+                                    const tsc_tick_count &t0);
+    friend tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t &i1,
+                                    const tsc_tick_count::tsc_interval_t &i0);
+    friend tsc_interval_t &operator+=(tsc_tick_count::tsc_interval_t &i1,
+                                      const tsc_tick_count::tsc_interval_t &i0);
+  };
+
+#if KMP_HAVE___BUILTIN_READCYCLECOUNTER
+  tsc_tick_count()
+      : my_count(static_cast<int64_t>(__builtin_readcyclecounter())) {}
+#elif KMP_HAVE___RDTSC
+  tsc_tick_count() : my_count(static_cast<int64_t>(rdtsc())) {}
+#else
+#error Must have high resolution timer defined
+#endif
+  tsc_tick_count(int64_t value) : my_count(value) {}
+  int64_t getValue() const { return my_count; }
+  tsc_tick_count later(tsc_tick_count const other) const {
+    return my_count > other.my_count ? (*this) : other;
+  }
+  tsc_tick_count earlier(tsc_tick_count const other) const {
+    return my_count < other.my_count ?
(*this) : other; + } +#if KMP_HAVE_TICK_TIME + static double tick_time(); // returns seconds per cycle (period) of clock +#endif + static tsc_tick_count now() { + return tsc_tick_count(); + } // returns the rdtsc register value + friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count &t1, + const tsc_tick_count &t0); +}; + +inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count &t1, + const tsc_tick_count &t0) { + return tsc_tick_count::tsc_interval_t(t1.my_count - t0.my_count); +} + +inline tsc_tick_count::tsc_interval_t +operator-(const tsc_tick_count::tsc_interval_t &i1, + const tsc_tick_count::tsc_interval_t &i0) { + return tsc_tick_count::tsc_interval_t(i1.value - i0.value); +} + +inline tsc_tick_count::tsc_interval_t & +operator+=(tsc_tick_count::tsc_interval_t &i1, + const tsc_tick_count::tsc_interval_t &i0) { + i1.value += i0.value; + return i1; +} + +#if KMP_HAVE_TICK_TIME +inline double tsc_tick_count::tsc_interval_t::seconds() const { + return value * tick_time(); +} +#endif + +extern std::string formatSI(double interval, int width, char unit); + +inline std::string formatSeconds(double interval, int width) { + return formatSI(interval, width, 'S'); +} + +inline std::string formatTicks(double interval, int width) { + return formatSI(interval, width, 'T'); +} + +#endif // KMP_STATS_TIMING_H diff --git a/third_party/openmp/kmp_str.cpp b/third_party/openmp/kmp_str.cpp new file mode 100644 index 000000000..6ee2df724 --- /dev/null +++ b/third_party/openmp/kmp_str.cpp @@ -0,0 +1,840 @@ +/* + * kmp_str.cpp -- String manipulation routines. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_str.h" + +#include // va_* +#include // vsnprintf() +#include // malloc(), realloc() + +#include "kmp.h" +#include "kmp_i18n.h" + +/* String buffer. + + Usage: + + // Declare buffer and initialize it. + kmp_str_buf_t buffer; + __kmp_str_buf_init( & buffer ); + + // Print to buffer. + __kmp_str_buf_print(& buffer, "Error in file \"%s\" line %d\n", "foo.c", 12); + __kmp_str_buf_print(& buffer, " <%s>\n", line); + + // Use buffer contents. buffer.str is a pointer to data, buffer.used is a + // number of printed characters (not including terminating zero). + write( fd, buffer.str, buffer.used ); + + // Free buffer. + __kmp_str_buf_free( & buffer ); + + // Alternatively, you can detach allocated memory from buffer: + __kmp_str_buf_detach( & buffer ); + return buffer.str; // That memory should be freed eventually. + + Notes: + + * Buffer users may use buffer.str and buffer.used. Users should not change + any fields of buffer directly. + * buffer.str is never NULL. If buffer is empty, buffer.str points to empty + string (""). + * For performance reasons, buffer uses stack memory (buffer.bulk) first. If + stack memory is exhausted, buffer allocates memory on heap by malloc(), and + reallocates it by realloc() as amount of used memory grows. + * Buffer doubles amount of allocated memory each time it is exhausted. +*/ + +// TODO: __kmp_str_buf_print() can use thread local memory allocator. 
+ +#define KMP_STR_BUF_INVARIANT(b) \ + { \ + KMP_DEBUG_ASSERT((b)->str != NULL); \ + KMP_DEBUG_ASSERT((b)->size >= sizeof((b)->bulk)); \ + KMP_DEBUG_ASSERT((b)->size % sizeof((b)->bulk) == 0); \ + KMP_DEBUG_ASSERT((unsigned)(b)->used < (b)->size); \ + KMP_DEBUG_ASSERT( \ + (b)->size == sizeof((b)->bulk) ? (b)->str == &(b)->bulk[0] : 1); \ + KMP_DEBUG_ASSERT((b)->size > sizeof((b)->bulk) ? (b)->str != &(b)->bulk[0] \ + : 1); \ + } + +void __kmp_str_buf_clear(kmp_str_buf_t *buffer) { + KMP_STR_BUF_INVARIANT(buffer); + if (buffer->used > 0) { + buffer->used = 0; + buffer->str[0] = 0; + } + KMP_STR_BUF_INVARIANT(buffer); +} // __kmp_str_buf_clear + +void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, size_t size) { + KMP_STR_BUF_INVARIANT(buffer); + KMP_DEBUG_ASSERT(size >= 0); + + if (buffer->size < (unsigned int)size) { + // Calculate buffer size. + do { + buffer->size *= 2; + } while (buffer->size < (unsigned int)size); + + // Enlarge buffer. + if (buffer->str == &buffer->bulk[0]) { + buffer->str = (char *)KMP_INTERNAL_MALLOC(buffer->size); + if (buffer->str == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + KMP_MEMCPY_S(buffer->str, buffer->size, buffer->bulk, buffer->used + 1); + } else { + buffer->str = (char *)KMP_INTERNAL_REALLOC(buffer->str, buffer->size); + if (buffer->str == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + } + } + + KMP_DEBUG_ASSERT(buffer->size > 0); + KMP_DEBUG_ASSERT(buffer->size >= (unsigned)size); + KMP_STR_BUF_INVARIANT(buffer); +} // __kmp_str_buf_reserve + +void __kmp_str_buf_detach(kmp_str_buf_t *buffer) { + KMP_STR_BUF_INVARIANT(buffer); + + // If internal bulk is used, allocate memory and copy it. + if (buffer->size <= sizeof(buffer->bulk)) { + buffer->str = (char *)KMP_INTERNAL_MALLOC(buffer->size); + if (buffer->str == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + KMP_MEMCPY_S(buffer->str, buffer->size, buffer->bulk, buffer->used + 1); + } +} // __kmp_str_buf_detach + +void __kmp_str_buf_free(kmp_str_buf_t *buffer) { + KMP_STR_BUF_INVARIANT(buffer); + if (buffer->size > sizeof(buffer->bulk)) { + KMP_INTERNAL_FREE(buffer->str); + } + buffer->str = buffer->bulk; + buffer->size = sizeof(buffer->bulk); + buffer->used = 0; + KMP_STR_BUF_INVARIANT(buffer); +} // __kmp_str_buf_free + +void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, size_t len) { + KMP_STR_BUF_INVARIANT(buffer); + KMP_DEBUG_ASSERT(str != NULL); + KMP_DEBUG_ASSERT(len >= 0); + + __kmp_str_buf_reserve(buffer, buffer->used + len + 1); + buffer->str[buffer->used] = '\0'; + KMP_STRNCAT_S(buffer->str + buffer->used, len + 1, str, len); + __kmp_type_convert(buffer->used + len, &(buffer->used)); + KMP_STR_BUF_INVARIANT(buffer); +} // __kmp_str_buf_cat + +void __kmp_str_buf_catbuf(kmp_str_buf_t *dest, const kmp_str_buf_t *src) { + KMP_DEBUG_ASSERT(dest); + KMP_DEBUG_ASSERT(src); + KMP_STR_BUF_INVARIANT(dest); + KMP_STR_BUF_INVARIANT(src); + if (!src->str || !src->used) + return; + __kmp_str_buf_reserve(dest, dest->used + src->used + 1); + dest->str[dest->used] = '\0'; + KMP_STRNCAT_S(dest->str + dest->used, src->used + 1, src->str, src->used); + dest->used += src->used; + KMP_STR_BUF_INVARIANT(dest); +} // __kmp_str_buf_catbuf + +// Return the number of characters written +int __kmp_str_buf_vprint(kmp_str_buf_t *buffer, char const *format, + va_list args) { + int rc; + KMP_STR_BUF_INVARIANT(buffer); + + for (;;) { + int const free = buffer->size - buffer->used; + int size; + + // Try to format string. 
+ { + /* On Linux* OS Intel(R) 64, vsnprintf() modifies args argument, so + vsnprintf() crashes if it is called for the second time with the same + args. To prevent the crash, we have to pass a fresh intact copy of args + to vsnprintf() on each iteration. + + Unfortunately, standard va_copy() macro is not available on Windows* + OS. However, it seems vsnprintf() does not modify args argument on + Windows* OS. + */ + +#if !KMP_OS_WINDOWS + va_list _args; + va_copy(_args, args); // Make copy of args. +#define args _args // Substitute args with its copy, _args. +#endif // KMP_OS_WINDOWS + rc = KMP_VSNPRINTF(buffer->str + buffer->used, free, format, args); +#if !KMP_OS_WINDOWS +#undef args // Remove substitution. + va_end(_args); +#endif // KMP_OS_WINDOWS + } + + // No errors, string has been formatted. + if (rc >= 0 && rc < free) { + buffer->used += rc; + break; + } + + // Error occurred, buffer is too small. + if (rc >= 0) { + // C99-conforming implementation of vsnprintf returns required buffer size + size = buffer->used + rc + 1; + } else { + // Older implementations just return -1. Double buffer size. + size = buffer->size * 2; + } + + // Enlarge buffer. + __kmp_str_buf_reserve(buffer, size); + + // And try again. + } + + KMP_DEBUG_ASSERT(buffer->size > 0); + KMP_STR_BUF_INVARIANT(buffer); + return rc; +} // __kmp_str_buf_vprint + +// Return the number of characters written +int __kmp_str_buf_print(kmp_str_buf_t *buffer, char const *format, ...) { + int rc; + va_list args; + va_start(args, format); + rc = __kmp_str_buf_vprint(buffer, format, args); + va_end(args); + return rc; +} // __kmp_str_buf_print + +/* The function prints specified size to buffer. Size is expressed using biggest + possible unit, for example 1024 is printed as "1k". */ +void __kmp_str_buf_print_size(kmp_str_buf_t *buf, size_t size) { + char const *names[] = {"", "k", "M", "G", "T", "P", "E", "Z", "Y"}; + int const units = sizeof(names) / sizeof(char const *); + int u = 0; + if (size > 0) { + while ((size % 1024 == 0) && (u + 1 < units)) { + size = size / 1024; + ++u; + } + } + + __kmp_str_buf_print(buf, "%" KMP_SIZE_T_SPEC "%s", size, names[u]); +} // __kmp_str_buf_print_size + +void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path) { + fname->path = NULL; + fname->dir = NULL; + fname->base = NULL; + + if (path != NULL) { + char *slash = NULL; // Pointer to the last character of dir. + char *base = NULL; // Pointer to the beginning of basename. + fname->path = __kmp_str_format("%s", path); + // Original code used strdup() function to copy a string, but on Windows* OS + // Intel(R) 64 it causes assertion id debug heap, so I had to replace + // strdup with __kmp_str_format(). + if (KMP_OS_WINDOWS) { + __kmp_str_replace(fname->path, '\\', '/'); + } + fname->dir = __kmp_str_format("%s", fname->path); + slash = strrchr(fname->dir, '/'); + if (KMP_OS_WINDOWS && + slash == NULL) { // On Windows* OS, if slash not found, + char first = (char)TOLOWER(fname->dir[0]); // look for drive. + if ('a' <= first && first <= 'z' && fname->dir[1] == ':') { + slash = &fname->dir[1]; + } + } + base = (slash == NULL ? fname->dir : slash + 1); + fname->base = __kmp_str_format("%s", base); // Copy basename + *base = 0; // and truncate dir. 
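+    // Illustrative example (added commentary): for path "/tmp/lib/foo.c" the
+    // code above leaves fname->dir == "/tmp/lib/" and fname->base == "foo.c";
+    // on Windows a drive-only path such as "c:foo.c" is split after the ':'
+    // by the drive-letter fallback, giving dir "c:" and base "foo.c".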
+ } + +} // kmp_str_fname_init + +void __kmp_str_fname_free(kmp_str_fname_t *fname) { + __kmp_str_free(&fname->path); + __kmp_str_free(&fname->dir); + __kmp_str_free(&fname->base); +} // kmp_str_fname_free + +int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern) { + int dir_match = 1; + int base_match = 1; + + if (pattern != NULL) { + kmp_str_fname_t ptrn; + __kmp_str_fname_init(&ptrn, pattern); + dir_match = strcmp(ptrn.dir, "*/") == 0 || + (fname->dir != NULL && __kmp_str_eqf(fname->dir, ptrn.dir)); + base_match = strcmp(ptrn.base, "*") == 0 || + (fname->base != NULL && __kmp_str_eqf(fname->base, ptrn.base)); + __kmp_str_fname_free(&ptrn); + } + + return dir_match && base_match; +} // __kmp_str_fname_match + +// Get the numeric fields from source location string. +// For clang these fields are Line/Col of the start of the construct. +// For icc these are LineBegin/LineEnd of the construct. +// Function is fast as it does not duplicate string (which involves memory +// allocation), and parses the string in place. +void __kmp_str_loc_numbers(char const *Psource, int *LineBeg, + int *LineEndOrCol) { + char *Str; + KMP_DEBUG_ASSERT(LineBeg); + KMP_DEBUG_ASSERT(LineEndOrCol); + // Parse Psource string ";file;func;line;line_end_or_column;;" to get + // numbers only, skipping string fields "file" and "func". + + // Find 1-st semicolon. + KMP_DEBUG_ASSERT(Psource); +#ifdef __cplusplus + Str = strchr(CCAST(char *, Psource), ';'); +#else + Str = strchr(Psource, ';'); +#endif + // Check returned pointer to see if the format of Psource is broken. + if (Str) { + // Find 2-nd semicolon. + Str = strchr(Str + 1, ';'); + } + if (Str) { + // Find 3-rd semicolon. + Str = strchr(Str + 1, ';'); + } + if (Str) { + // Read begin line number. + *LineBeg = atoi(Str + 1); + // Find 4-th semicolon. + Str = strchr(Str + 1, ';'); + } else { + // Broken format of input string, cannot read the number. + *LineBeg = 0; + } + if (Str) { + // Read end line or column number. + *LineEndOrCol = atoi(Str + 1); + } else { + // Broken format of input string, cannot read the number. + *LineEndOrCol = 0; + } +} + +kmp_str_loc_t __kmp_str_loc_init(char const *psource, bool init_fname) { + kmp_str_loc_t loc; + + loc._bulk = NULL; + loc.file = NULL; + loc.func = NULL; + loc.line = 0; + loc.col = 0; + + if (psource != NULL) { + char *str = NULL; + char *dummy = NULL; + char *line = NULL; + char *col = NULL; + + // Copy psource to keep it intact. + loc._bulk = __kmp_str_format("%s", psource); + + // Parse psource string: ";file;func;line;col;;" + str = loc._bulk; + __kmp_str_split(str, ';', &dummy, &str); + __kmp_str_split(str, ';', &loc.file, &str); + __kmp_str_split(str, ';', &loc.func, &str); + __kmp_str_split(str, ';', &line, &str); + __kmp_str_split(str, ';', &col, &str); + + // Convert line and col into numberic values. + if (line != NULL) { + loc.line = atoi(line); + if (loc.line < 0) { + loc.line = 0; + } + } + if (col != NULL) { + loc.col = atoi(col); + if (loc.col < 0) { + loc.col = 0; + } + } + } + + __kmp_str_fname_init(&loc.fname, init_fname ? loc.file : NULL); + + return loc; +} // kmp_str_loc_init + +void __kmp_str_loc_free(kmp_str_loc_t *loc) { + __kmp_str_fname_free(&loc->fname); + __kmp_str_free(&(loc->_bulk)); + loc->file = NULL; + loc->func = NULL; +} // kmp_str_loc_free + +/* This function is intended to compare file names. On Windows* OS file names + are case-insensitive, so functions performs case-insensitive comparison. On + Linux* OS it performs case-sensitive comparison. 
Note: The function returns + *true* if strings are *equal*. */ +int __kmp_str_eqf( // True, if strings are equal, false otherwise. + char const *lhs, // First string. + char const *rhs // Second string. +) { + int result; +#if KMP_OS_WINDOWS + result = (_stricmp(lhs, rhs) == 0); +#else + result = (strcmp(lhs, rhs) == 0); +#endif + return result; +} // __kmp_str_eqf + +/* This function is like sprintf, but it *allocates* new buffer, which must be + freed eventually by __kmp_str_free(). The function is very convenient for + constructing strings, it successfully replaces strdup(), strcat(), it frees + programmer from buffer allocations and helps to avoid buffer overflows. + Examples: + + str = __kmp_str_format("%s", orig); //strdup() doesn't care about buffer size + __kmp_str_free( & str ); + str = __kmp_str_format( "%s%s", orig1, orig2 ); // strcat(), doesn't care + // about buffer size. + __kmp_str_free( & str ); + str = __kmp_str_format( "%s/%s.txt", path, file ); // constructing string. + __kmp_str_free( & str ); + + Performance note: + This function allocates memory with malloc() calls, so do not call it from + performance-critical code. In performance-critical code consider using + kmp_str_buf_t instead, since it uses stack-allocated buffer for short + strings. + + Why does this function use malloc()? + 1. __kmp_allocate() returns cache-aligned memory allocated with malloc(). + There are no reasons in using __kmp_allocate() for strings due to extra + overhead while cache-aligned memory is not necessary. + 2. __kmp_thread_malloc() cannot be used because it requires pointer to thread + structure. We need to perform string operations during library startup + (for example, in __kmp_register_library_startup()) when no thread + structures are allocated yet. + So standard malloc() is the only available option. +*/ + +char *__kmp_str_format( // Allocated string. + char const *format, // Format string. + ... // Other parameters. +) { + va_list args; + int size = 512; + char *buffer = NULL; + int rc; + + // Allocate buffer. + buffer = (char *)KMP_INTERNAL_MALLOC(size); + if (buffer == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + + for (;;) { + // Try to format string. + va_start(args, format); + rc = KMP_VSNPRINTF(buffer, size, format, args); + va_end(args); + + // No errors, string has been formatted. + if (rc >= 0 && rc < size) { + break; + } + + // Error occurred, buffer is too small. + if (rc >= 0) { + // C99-conforming implementation of vsnprintf returns required buffer + // size. + size = rc + 1; + } else { + // Older implementations just return -1. + size = size * 2; + } + + // Enlarge buffer and try again. + buffer = (char *)KMP_INTERNAL_REALLOC(buffer, size); + if (buffer == NULL) { + KMP_FATAL(MemoryAllocFailed); + } + } + + return buffer; +} // func __kmp_str_format + +void __kmp_str_free(char **str) { + KMP_DEBUG_ASSERT(str != NULL); + KMP_INTERNAL_FREE(*str); + *str = NULL; +} // func __kmp_str_free + +/* If len is zero, returns true iff target and data have exact case-insensitive + match. If len is negative, returns true iff target is a case-insensitive + substring of data. If len is positive, returns true iff target is a + case-insensitive substring of data or vice versa, and neither is shorter than + len. */ +int __kmp_str_match(char const *target, int len, char const *data) { + int i; + if (target == NULL || data == NULL) { + return FALSE; + } + for (i = 0; target[i] && data[i]; ++i) { + if (TOLOWER(target[i]) != TOLOWER(data[i])) { + return FALSE; + } + } + return ((len > 0) ? 
i >= len : (!target[i] && (len || !data[i]))); +} // __kmp_str_match + +// If data contains all of target, returns true, otherwise returns false. +// len should be the length of target +bool __kmp_str_contains(char const *target, int len, char const *data) { + int i = 0, j = 0, start = 0; + if (target == NULL || data == NULL) { + return FALSE; + } + while (target[i]) { + if (!data[j]) + return FALSE; + if (TOLOWER(target[i]) != TOLOWER(data[j])) { + j = start + 1; + start = j; + i = 0; + } else { + if (i == 0) + start = j; + j++; + i++; + } + } + + return i == len; +} // __kmp_str_contains + +int __kmp_str_match_false(char const *data) { + int result = + __kmp_str_match("false", 1, data) || __kmp_str_match("off", 2, data) || + __kmp_str_match("0", 1, data) || __kmp_str_match(".false.", 2, data) || + __kmp_str_match(".f.", 2, data) || __kmp_str_match("no", 1, data) || + __kmp_str_match("disabled", 0, data); + return result; +} // __kmp_str_match_false + +int __kmp_str_match_true(char const *data) { + int result = + __kmp_str_match("true", 1, data) || __kmp_str_match("on", 2, data) || + __kmp_str_match("1", 1, data) || __kmp_str_match(".true.", 2, data) || + __kmp_str_match(".t.", 2, data) || __kmp_str_match("yes", 1, data) || + __kmp_str_match("enabled", 0, data); + return result; +} // __kmp_str_match_true + +void __kmp_str_replace(char *str, char search_for, char replace_with) { + char *found = NULL; + + found = strchr(str, search_for); + while (found) { + *found = replace_with; + found = strchr(found + 1, search_for); + } +} // __kmp_str_replace + +void __kmp_str_split(char *str, // I: String to split. + char delim, // I: Character to split on. + char **head, // O: Pointer to head (may be NULL). + char **tail // O: Pointer to tail (may be NULL). +) { + char *h = str; + char *t = NULL; + if (str != NULL) { + char *ptr = strchr(str, delim); + if (ptr != NULL) { + *ptr = 0; + t = ptr + 1; + } + } + if (head != NULL) { + *head = h; + } + if (tail != NULL) { + *tail = t; + } +} // __kmp_str_split + +/* strtok_r() is not available on Windows* OS. This function reimplements + strtok_r(). */ +char *__kmp_str_token( + char *str, // String to split into tokens. Note: String *is* modified! + char const *delim, // Delimiters. + char **buf // Internal buffer. +) { + char *token = NULL; +#if KMP_OS_WINDOWS + // On Windows* OS there is no strtok_r() function. Let us implement it. + if (str != NULL) { + *buf = str; // First call, initialize buf. + } + *buf += strspn(*buf, delim); // Skip leading delimiters. + if (**buf != 0) { // Rest of the string is not yet empty. + token = *buf; // Use it as result. + *buf += strcspn(*buf, delim); // Skip non-delimiters. + if (**buf != 0) { // Rest of the string is not yet empty. + **buf = 0; // Terminate token here. + *buf += 1; // Advance buf to start with the next token next time. + } + } +#else + // On Linux* OS and OS X*, strtok_r() is available. Let us use it. 
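+  // Illustrative usage sketch (added commentary; "list" stands for any
+  // writable string): callers follow the strtok_r() protocol, passing the
+  // string on the first call and NULL afterwards, e.g.
+  //   char *buf;
+  //   char *tok = __kmp_str_token(list, ",", &buf);
+  //   while (tok != NULL) { /* use tok */ tok = __kmp_str_token(NULL, ",", &buf); }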
+ token = strtok_r(str, delim, buf); +#endif + return token; +} // __kmp_str_token + +int __kmp_basic_str_to_int(char const *str) { + int result; + char const *t; + + result = 0; + + for (t = str; *t != '\0'; ++t) { + if (*t < '0' || *t > '9') + break; + result = (result * 10) + (*t - '0'); + } + + return result; +} + +int __kmp_str_to_int(char const *str, char sentinel) { + int result, factor; + char const *t; + + result = 0; + + for (t = str; *t != '\0'; ++t) { + if (*t < '0' || *t > '9') + break; + result = (result * 10) + (*t - '0'); + } + + switch (*t) { + case '\0': /* the current default for no suffix is bytes */ + factor = 1; + break; + case 'b': + case 'B': /* bytes */ + ++t; + factor = 1; + break; + case 'k': + case 'K': /* kilo-bytes */ + ++t; + factor = 1024; + break; + case 'm': + case 'M': /* mega-bytes */ + ++t; + factor = (1024 * 1024); + break; + default: + if (*t != sentinel) + return (-1); + t = ""; + factor = 1; + } + + if (result > (INT_MAX / factor)) + result = INT_MAX; + else + result *= factor; + + return (*t != 0 ? 0 : result); +} // __kmp_str_to_int + +/* The routine parses input string. It is expected it is a unsigned integer with + optional unit. Units are: "b" for bytes, "kb" or just "k" for kilobytes, "mb" + or "m" for megabytes, ..., "yb" or "y" for yottabytes. :-) Unit name is + case-insensitive. The routine returns 0 if everything is ok, or error code: + -1 in case of overflow, -2 in case of unknown unit. *size is set to parsed + value. In case of overflow *size is set to KMP_SIZE_T_MAX, in case of unknown + unit *size is set to zero. */ +void __kmp_str_to_size( // R: Error code. + char const *str, // I: String of characters, unsigned number and unit ("b", + // "kb", etc). + size_t *out, // O: Parsed number. + size_t dfactor, // I: The factor if none of the letters specified. + char const **error // O: Null if everything is ok, error message otherwise. +) { + + size_t value = 0; + size_t factor = 0; + int overflow = 0; + int i = 0; + int digit; + + KMP_DEBUG_ASSERT(str != NULL); + + // Skip spaces. + while (str[i] == ' ' || str[i] == '\t') { + ++i; + } + + // Parse number. + if (str[i] < '0' || str[i] > '9') { + *error = KMP_I18N_STR(NotANumber); + return; + } + do { + digit = str[i] - '0'; + overflow = overflow || (value > (KMP_SIZE_T_MAX - digit) / 10); + value = (value * 10) + digit; + ++i; + } while (str[i] >= '0' && str[i] <= '9'); + + // Skip spaces. + while (str[i] == ' ' || str[i] == '\t') { + ++i; + } + +// Parse unit. +#define _case(ch, exp) \ + case ch: \ + case ch - ('a' - 'A'): { \ + size_t shift = (exp)*10; \ + ++i; \ + if (shift < sizeof(size_t) * 8) { \ + factor = (size_t)(1) << shift; \ + } else { \ + overflow = 1; \ + } \ + } break; + switch (str[i]) { + _case('k', 1); // Kilo + _case('m', 2); // Mega + _case('g', 3); // Giga + _case('t', 4); // Tera + _case('p', 5); // Peta + _case('e', 6); // Exa + _case('z', 7); // Zetta + _case('y', 8); // Yotta + // Oops. No more units... + } +#undef _case + if (str[i] == 'b' || str[i] == 'B') { // Skip optional "b". + if (factor == 0) { + factor = 1; + } + ++i; + } + if (!(str[i] == ' ' || str[i] == '\t' || str[i] == 0)) { // Bad unit + *error = KMP_I18N_STR(BadUnit); + return; + } + + if (factor == 0) { + factor = dfactor; + } + + // Apply factor. + overflow = overflow || (value > (KMP_SIZE_T_MAX / factor)); + value *= factor; + + // Skip spaces. 
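+  // Worked example (added commentary): at this point the input " 16 k " has
+  // produced value == 16384 (16 * 2^10), "8Mb" has produced 8 * 2^20, and a
+  // bare "32" falls back to the dfactor argument, giving value == 32 * dfactor.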
+ while (str[i] == ' ' || str[i] == '\t') { + ++i; + } + + if (str[i] != 0) { + *error = KMP_I18N_STR(IllegalCharacters); + return; + } + + if (overflow) { + *error = KMP_I18N_STR(ValueTooLarge); + *out = KMP_SIZE_T_MAX; + return; + } + + *error = NULL; + *out = value; +} // __kmp_str_to_size + +void __kmp_str_to_uint( // R: Error code. + char const *str, // I: String of characters, unsigned number. + kmp_uint64 *out, // O: Parsed number. + char const **error // O: Null if everything is ok, error message otherwise. +) { + size_t value = 0; + int overflow = 0; + int i = 0; + int digit; + + KMP_DEBUG_ASSERT(str != NULL); + + // Skip spaces. + while (str[i] == ' ' || str[i] == '\t') { + ++i; + } + + // Parse number. + if (str[i] < '0' || str[i] > '9') { + *error = KMP_I18N_STR(NotANumber); + return; + } + do { + digit = str[i] - '0'; + overflow = overflow || (value > (KMP_SIZE_T_MAX - digit) / 10); + value = (value * 10) + digit; + ++i; + } while (str[i] >= '0' && str[i] <= '9'); + + // Skip spaces. + while (str[i] == ' ' || str[i] == '\t') { + ++i; + } + + if (str[i] != 0) { + *error = KMP_I18N_STR(IllegalCharacters); + return; + } + + if (overflow) { + *error = KMP_I18N_STR(ValueTooLarge); + *out = (kmp_uint64)-1; + return; + } + + *error = NULL; + *out = value; +} // __kmp_str_to_unit + +// end of file // diff --git a/third_party/openmp/kmp_str.h b/third_party/openmp/kmp_str.h new file mode 100644 index 000000000..11f633cd8 --- /dev/null +++ b/third_party/openmp/kmp_str.h @@ -0,0 +1,128 @@ +/* + * kmp_str.h -- String manipulation routines. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_STR_H +#define KMP_STR_H + +#include +#include + +#include "kmp_os.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#if KMP_OS_WINDOWS +#define strdup _strdup +#endif + +/* some macros to replace ctype.h functions */ +#define TOLOWER(c) ((((c) >= 'A') && ((c) <= 'Z')) ? ((c) + 'a' - 'A') : (c)) + +struct kmp_str_buf { + char *str; // Pointer to buffer content, read only. + unsigned int size; // Do not change this field! + int used; // Number of characters printed to buffer, read only. + char bulk[512]; // Do not use this field! +}; // struct kmp_str_buf +typedef struct kmp_str_buf kmp_str_buf_t; + +#define __kmp_str_buf_init(b) \ + { \ + (b)->str = (b)->bulk; \ + (b)->size = sizeof((b)->bulk); \ + (b)->used = 0; \ + (b)->bulk[0] = 0; \ + } + +void __kmp_str_buf_clear(kmp_str_buf_t *buffer); +void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, size_t size); +void __kmp_str_buf_detach(kmp_str_buf_t *buffer); +void __kmp_str_buf_free(kmp_str_buf_t *buffer); +void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, size_t len); +void __kmp_str_buf_catbuf(kmp_str_buf_t *dest, const kmp_str_buf_t *src); +int __kmp_str_buf_vprint(kmp_str_buf_t *buffer, char const *format, + va_list args); +int __kmp_str_buf_print(kmp_str_buf_t *buffer, char const *format, ...); +void __kmp_str_buf_print_size(kmp_str_buf_t *buffer, size_t size); + +/* File name parser. + Usage: + + kmp_str_fname_t fname = __kmp_str_fname_init( path ); + // Use fname.path (copy of original path ), fname.dir, fname.base. 
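+   // Illustrative values (added commentary): for path "/tmp/lib/foo.c",
+   // fname.dir is "/tmp/lib/" and fname.base is "foo.c".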
+ // Note fname.dir concatenated with fname.base gives exact copy of path. + __kmp_str_fname_free( & fname ); +*/ +struct kmp_str_fname { + char *path; + char *dir; + char *base; +}; // struct kmp_str_fname +typedef struct kmp_str_fname kmp_str_fname_t; +void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path); +void __kmp_str_fname_free(kmp_str_fname_t *fname); +// Compares file name with specified pattern. If pattern is NULL, any fname +// matched. +int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern); + +/* The compiler provides source locations in string form + ";file;func;line;col;;". It is not convenient for manipulation. This + structure keeps source location in more convenient form. + Usage: + + kmp_str_loc_t loc = __kmp_str_loc_init(ident->psource, false); + // use loc.file, loc.func, loc.line, loc.col. + // loc.fname is available if second argument of __kmp_str_loc_init is true. + __kmp_str_loc_free( & loc ); + + If psource is NULL or does not follow format above, file and/or func may be + NULL pointers. +*/ +struct kmp_str_loc { + char *_bulk; // Do not use thid field. + kmp_str_fname_t fname; // Will be initialized if init_fname is true. + char *file; + char *func; + int line; + int col; +}; // struct kmp_str_loc +typedef struct kmp_str_loc kmp_str_loc_t; +kmp_str_loc_t __kmp_str_loc_init(char const *psource, bool init_fname); +void __kmp_str_loc_numbers(char const *Psource, int *Line, int *Col); +void __kmp_str_loc_free(kmp_str_loc_t *loc); + +int __kmp_str_eqf(char const *lhs, char const *rhs); +char *__kmp_str_format(char const *format, ...); +void __kmp_str_free(char **str); +int __kmp_str_match(char const *target, int len, char const *data); +bool __kmp_str_contains(char const *target, int len, char const *data); +int __kmp_str_match_false(char const *data); +int __kmp_str_match_true(char const *data); +void __kmp_str_replace(char *str, char search_for, char replace_with); +void __kmp_str_split(char *str, char delim, char **head, char **tail); +char *__kmp_str_token(char *str, char const *delim, char **buf); +int __kmp_basic_str_to_int(char const *str); +int __kmp_str_to_int(char const *str, char sentinel); + +void __kmp_str_to_size(char const *str, size_t *out, size_t dfactor, + char const **error); +void __kmp_str_to_uint(char const *str, kmp_uint64 *out, char const **error); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // KMP_STR_H + +// end of file // diff --git a/third_party/openmp/kmp_stub.h b/third_party/openmp/kmp_stub.h new file mode 100644 index 000000000..caaf783fe --- /dev/null +++ b/third_party/openmp/kmp_stub.h @@ -0,0 +1,55 @@ +/* + * kmp_stub.h + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_STUB_H +#define KMP_STUB_H + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +void __kmps_set_blocktime(int arg); +int __kmps_get_blocktime(void); +void __kmps_set_dynamic(int arg); +int __kmps_get_dynamic(void); +void __kmps_set_library(int arg); +int __kmps_get_library(void); +void __kmps_set_nested(int arg); +int __kmps_get_nested(void); +void __kmps_set_stacksize(size_t arg); +size_t __kmps_get_stacksize(); + +#ifndef KMP_SCHED_TYPE_DEFINED +#define KMP_SCHED_TYPE_DEFINED +typedef enum kmp_sched { + kmp_sched_static = 1, // mapped to kmp_sch_static_chunked (33) + kmp_sched_dynamic = 2, // mapped to kmp_sch_dynamic_chunked (35) + kmp_sched_guided = 3, // mapped to kmp_sch_guided_chunked (36) + kmp_sched_auto = 4, // mapped to kmp_sch_auto (38) + kmp_sched_default = kmp_sched_static // default scheduling +} kmp_sched_t; +#endif +void __kmps_set_schedule(kmp_sched_t kind, int modifier); +void __kmps_get_schedule(kmp_sched_t *kind, int *modifier); + +kmp_proc_bind_t __kmps_get_proc_bind(void); + +double __kmps_get_wtime(); +double __kmps_get_wtick(); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // KMP_STUB_H + +// end of file // diff --git a/third_party/openmp/kmp_taskdeps.cpp b/third_party/openmp/kmp_taskdeps.cpp new file mode 100644 index 000000000..f75294813 --- /dev/null +++ b/third_party/openmp/kmp_taskdeps.cpp @@ -0,0 +1,1039 @@ +/* + * kmp_taskdeps.cpp + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//#define KMP_SUPPORT_GRAPH_OUTPUT 1 + +#include "kmp.h" +#include "kmp_io.h" +#include "kmp_wait_release.h" +#include "kmp_taskdeps.h" +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +// TODO: Improve memory allocation? keep a list of pre-allocated structures? +// allocate in blocks? re-use list finished list entries? +// TODO: don't use atomic ref counters for stack-allocated nodes. +// TODO: find an alternate to atomic refs for heap-allocated nodes? +// TODO: Finish graph output support +// TODO: kmp_lock_t seems a tad to big (and heavy weight) for this. Check other +// runtime locks +// TODO: Any ITT support needed? 
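+
+/* Illustrative overview (added commentary, not upstream documentation): for a
+   construct such as
+
+     #pragma omp task depend(in: x) depend(out: y)
+
+   the compiler emits one kmp_depend_info_t per list item (base_addr = &x or
+   &y, flags.in / flags.out set accordingly) and calls
+   __kmpc_omp_task_with_deps().  That entry point allocates a kmp_depnode_t
+   for the new task, looks each address up in the parent task's dephash via
+   __kmp_dephash_find(), links the node as a successor of the conflicting
+   tasks recorded there (__kmp_depnode_link_successor), and only lets the
+   task be queued once its npredecessors count drops to zero. */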
+ +#ifdef KMP_SUPPORT_GRAPH_OUTPUT +static std::atomic kmp_node_id_seed = 0; +#endif + +static void __kmp_init_node(kmp_depnode_t *node) { + node->dn.successors = NULL; + node->dn.task = NULL; // will point to the right task + // once dependences have been processed + for (int i = 0; i < MAX_MTX_DEPS; ++i) + node->dn.mtx_locks[i] = NULL; + node->dn.mtx_num_locks = 0; + __kmp_init_lock(&node->dn.lock); + KMP_ATOMIC_ST_RLX(&node->dn.nrefs, 1); // init creates the first reference +#ifdef KMP_SUPPORT_GRAPH_OUTPUT + node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed); +#endif +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_create(node, "OMP task dep node", NULL, 0); +#endif +} + +static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) { + KMP_ATOMIC_INC(&node->dn.nrefs); + return node; +} + +enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 }; + +size_t sizes[] = {997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029}; +const size_t MAX_GEN = 8; + +static inline size_t __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) { + // TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) % + // m_num_sets ); + return ((addr >> 6) ^ (addr >> 2)) % hsize; +} + +static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread, + kmp_dephash_t *current_dephash) { + kmp_dephash_t *h; + + size_t gen = current_dephash->generation + 1; + if (gen >= MAX_GEN) + return current_dephash; + size_t new_size = sizes[gen]; + + size_t size_to_allocate = + new_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t); + +#if USE_FAST_MEMORY + h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size_to_allocate); +#else + h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size_to_allocate); +#endif + + h->size = new_size; + h->nelements = current_dephash->nelements; + h->buckets = (kmp_dephash_entry **)(h + 1); + h->generation = gen; + h->nconflicts = 0; + h->last_all = current_dephash->last_all; + + // make sure buckets are properly initialized + for (size_t i = 0; i < new_size; i++) { + h->buckets[i] = NULL; + } + + // insert existing elements in the new table + for (size_t i = 0; i < current_dephash->size; i++) { + kmp_dephash_entry_t *next, *entry; + for (entry = current_dephash->buckets[i]; entry; entry = next) { + next = entry->next_in_bucket; + // Compute the new hash using the new size, and insert the entry in + // the new bucket. 
+ size_t new_bucket = __kmp_dephash_hash(entry->addr, h->size); + entry->next_in_bucket = h->buckets[new_bucket]; + if (entry->next_in_bucket) { + h->nconflicts++; + } + h->buckets[new_bucket] = entry; + } + } + + // Free old hash table +#if USE_FAST_MEMORY + __kmp_fast_free(thread, current_dephash); +#else + __kmp_thread_free(thread, current_dephash); +#endif + + return h; +} + +static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread, + kmp_taskdata_t *current_task) { + kmp_dephash_t *h; + + size_t h_size; + + if (current_task->td_flags.tasktype == TASK_IMPLICIT) + h_size = KMP_DEPHASH_MASTER_SIZE; + else + h_size = KMP_DEPHASH_OTHER_SIZE; + + size_t size = h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t); + +#if USE_FAST_MEMORY + h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size); +#else + h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size); +#endif + h->size = h_size; + + h->generation = 0; + h->nelements = 0; + h->nconflicts = 0; + h->buckets = (kmp_dephash_entry **)(h + 1); + h->last_all = NULL; + + for (size_t i = 0; i < h_size; i++) + h->buckets[i] = 0; + + return h; +} + +static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread, + kmp_dephash_t **hash, + kmp_intptr_t addr) { + kmp_dephash_t *h = *hash; + if (h->nelements != 0 && h->nconflicts / h->size >= 1) { + *hash = __kmp_dephash_extend(thread, h); + h = *hash; + } + size_t bucket = __kmp_dephash_hash(addr, h->size); + + kmp_dephash_entry_t *entry; + for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket) + if (entry->addr == addr) + break; + + if (entry == NULL) { +// create entry. This is only done by one thread so no locking required +#if USE_FAST_MEMORY + entry = (kmp_dephash_entry_t *)__kmp_fast_allocate( + thread, sizeof(kmp_dephash_entry_t)); +#else + entry = (kmp_dephash_entry_t *)__kmp_thread_malloc( + thread, sizeof(kmp_dephash_entry_t)); +#endif + entry->addr = addr; + if (!h->last_all) // no predecessor task with omp_all_memory dependence + entry->last_out = NULL; + else // else link the omp_all_memory depnode to the new entry + entry->last_out = __kmp_node_ref(h->last_all); + entry->last_set = NULL; + entry->prev_set = NULL; + entry->last_flag = 0; + entry->mtx_lock = NULL; + entry->next_in_bucket = h->buckets[bucket]; + h->buckets[bucket] = entry; + h->nelements++; + if (entry->next_in_bucket) + h->nconflicts++; + } + return entry; +} + +static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread, + kmp_depnode_list_t *list, + kmp_depnode_t *node) { + kmp_depnode_list_t *new_head; + +#if USE_FAST_MEMORY + new_head = (kmp_depnode_list_t *)__kmp_fast_allocate( + thread, sizeof(kmp_depnode_list_t)); +#else + new_head = (kmp_depnode_list_t *)__kmp_thread_malloc( + thread, sizeof(kmp_depnode_list_t)); +#endif + + new_head->node = __kmp_node_ref(node); + new_head->next = list; + + return new_head; +} + +static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source, + kmp_depnode_t *sink, + kmp_task_t *sink_task) { +#if OMPX_TASKGRAPH + kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); + kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); + if (source->dn.task && sink_task) { + // Not supporting dependency between two tasks that one is within the TDG + // and the other is not + KMP_ASSERT(task_source->is_taskgraph == task_sink->is_taskgraph); + } + if (task_sink->is_taskgraph && + __kmp_tdg_is_recording(task_sink->tdg->tdg_status)) { + kmp_node_info_t *source_info = + &task_sink->tdg->record_map[task_source->td_task_id]; + 
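+    // The scan below records the edge source -> sink in the TDG record_map,
+    // skipping duplicate successor entries and doubling the successors array
+    // when it is full.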
bool exists = false; + for (int i = 0; i < source_info->nsuccessors; i++) { + if (source_info->successors[i] == task_sink->td_task_id) { + exists = true; + break; + } + } + if (!exists) { + if (source_info->nsuccessors >= source_info->successors_size) { + source_info->successors_size = 2 * source_info->successors_size; + kmp_int32 *old_succ_ids = source_info->successors; + kmp_int32 *new_succ_ids = (kmp_int32 *)__kmp_allocate( + source_info->successors_size * sizeof(kmp_int32)); + source_info->successors = new_succ_ids; + __kmp_free(old_succ_ids); + } + + source_info->successors[source_info->nsuccessors] = task_sink->td_task_id; + source_info->nsuccessors++; + + kmp_node_info_t *sink_info = + &(task_sink->tdg->record_map[task_sink->td_task_id]); + sink_info->npredecessors++; + } + } +#endif +#ifdef KMP_SUPPORT_GRAPH_OUTPUT + kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); + // do not use sink->dn.task as that is only filled after the dependences + // are already processed! + kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); + + __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id, + task_source->td_ident->psource, sink->dn.id, + task_sink->td_ident->psource); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + /* OMPT tracks dependences between task (a=source, b=sink) in which + task a blocks the execution of b through the ompt_new_dependence_callback + */ + if (ompt_enabled.ompt_callback_task_dependence) { + kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); + ompt_data_t *sink_data; + if (sink_task) + sink_data = &(KMP_TASK_TO_TASKDATA(sink_task)->ompt_task_info.task_data); + else + sink_data = &__kmp_threads[gtid]->th.ompt_thread_info.task_data; + + ompt_callbacks.ompt_callback(ompt_callback_task_dependence)( + &(task_source->ompt_task_info.task_data), sink_data); + } +#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ +} + +kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + return td->td_depnode ? 
&(td->td_depnode->dn) : NULL; +} + +kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + return td->td_depnode->dn.successors; +} + +static inline kmp_int32 +__kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, + kmp_task_t *task, kmp_depnode_t *node, + kmp_depnode_list_t *plist) { + if (!plist) + return 0; + kmp_int32 npredecessors = 0; + // link node as successor of list elements + for (kmp_depnode_list_t *p = plist; p; p = p->next) { + kmp_depnode_t *dep = p->node; +#if OMPX_TASKGRAPH + kmp_tdg_status tdg_status = KMP_TDG_NONE; + if (task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + if (td->is_taskgraph) + tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status; + if (__kmp_tdg_is_recording(tdg_status)) + __kmp_track_dependence(gtid, dep, node, task); + } +#endif + if (dep->dn.task) { + KMP_ACQUIRE_DEPNODE(gtid, dep); + if (dep->dn.task) { + if (!dep->dn.successors || dep->dn.successors->node != node) { +#if OMPX_TASKGRAPH + if (!(__kmp_tdg_is_recording(tdg_status)) && task) +#endif + __kmp_track_dependence(gtid, dep, node, task); + dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); + KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " + "%p\n", + gtid, KMP_TASK_TO_TASKDATA(dep->dn.task), + KMP_TASK_TO_TASKDATA(task))); + npredecessors++; + } + } + KMP_RELEASE_DEPNODE(gtid, dep); + } + } + return npredecessors; +} + +// Add the edge 'sink' -> 'source' in the task dependency graph +static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, + kmp_info_t *thread, + kmp_task_t *task, + kmp_depnode_t *source, + kmp_depnode_t *sink) { + if (!sink) + return 0; + kmp_int32 npredecessors = 0; +#if OMPX_TASKGRAPH + kmp_tdg_status tdg_status = KMP_TDG_NONE; + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + if (task) { + if (td->is_taskgraph) + tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status; + if (__kmp_tdg_is_recording(tdg_status) && sink->dn.task) + __kmp_track_dependence(gtid, sink, source, task); + } +#endif + if (sink->dn.task) { + // synchronously add source to sink' list of successors + KMP_ACQUIRE_DEPNODE(gtid, sink); + if (sink->dn.task) { + if (!sink->dn.successors || sink->dn.successors->node != source) { +#if OMPX_TASKGRAPH + if (!(__kmp_tdg_is_recording(tdg_status)) && task) +#endif + __kmp_track_dependence(gtid, sink, source, task); + sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); + KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " + "%p\n", + gtid, KMP_TASK_TO_TASKDATA(sink->dn.task), + KMP_TASK_TO_TASKDATA(task))); +#if OMPX_TASKGRAPH + if (__kmp_tdg_is_recording(tdg_status)) { + kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task); + if (tdd->is_taskgraph) { + if (tdd->td_flags.onced) + // decrement npredecessors if sink->dn.task belongs to a taskgraph + // and + // 1) the task is reset to its initial state (by kmp_free_task) or + // 2) the task is complete but not yet reset + npredecessors--; + } + } +#endif + npredecessors++; + } + } + KMP_RELEASE_DEPNODE(gtid, sink); + } + return npredecessors; +} + +static inline kmp_int32 +__kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h, + bool dep_barrier, kmp_task_t *task) { + KA_TRACE(30, ("__kmp_process_dep_all: T#%d processing dep_all, " + "dep_barrier = %d\n", + gtid, dep_barrier)); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_int32 npredecessors = 0; + + // process previous omp_all_memory node if any 
+ npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, h->last_all); + __kmp_node_deref(thread, h->last_all); + if (!dep_barrier) { + h->last_all = __kmp_node_ref(node); + } else { + // if this is a sync point in the serial sequence, then the previous + // outputs are guaranteed to be completed after the execution of this + // task so the previous output nodes can be cleared. + h->last_all = NULL; + } + + // process all regular dependences + for (size_t i = 0; i < h->size; i++) { + kmp_dephash_entry_t *info = h->buckets[i]; + if (!info) // skip empty slots in dephash + continue; + for (; info; info = info->next_in_bucket) { + // for each entry the omp_all_memory works as OUT dependence + kmp_depnode_t *last_out = info->last_out; + kmp_depnode_list_t *last_set = info->last_set; + kmp_depnode_list_t *prev_set = info->prev_set; + if (last_set) { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_set); + __kmp_depnode_list_free(thread, last_set); + __kmp_depnode_list_free(thread, prev_set); + info->last_set = NULL; + info->prev_set = NULL; + info->last_flag = 0; // no sets in this dephash entry + } else { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); + } + __kmp_node_deref(thread, last_out); + if (!dep_barrier) { + info->last_out = __kmp_node_ref(node); + } else { + info->last_out = NULL; + } + } + } + KA_TRACE(30, ("__kmp_process_dep_all: T#%d found %d predecessors\n", gtid, + npredecessors)); + return npredecessors; +} + +template +static inline kmp_int32 +__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash, + bool dep_barrier, kmp_int32 ndeps, + kmp_depend_info_t *dep_list, kmp_task_t *task) { + KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependences : " + "dep_barrier = %d\n", + filter, gtid, ndeps, dep_barrier)); + + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_int32 npredecessors = 0; + for (kmp_int32 i = 0; i < ndeps; i++) { + const kmp_depend_info_t *dep = &dep_list[i]; + + if (filter && dep->base_addr == 0) + continue; // skip filtered entries + + kmp_dephash_entry_t *info = + __kmp_dephash_find(thread, hash, dep->base_addr); + kmp_depnode_t *last_out = info->last_out; + kmp_depnode_list_t *last_set = info->last_set; + kmp_depnode_list_t *prev_set = info->prev_set; + + if (dep->flags.out) { // out or inout --> clean lists if any + if (last_set) { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_set); + __kmp_depnode_list_free(thread, last_set); + __kmp_depnode_list_free(thread, prev_set); + info->last_set = NULL; + info->prev_set = NULL; + info->last_flag = 0; // no sets in this dephash entry + } else { + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); + } + __kmp_node_deref(thread, last_out); + if (!dep_barrier) { + info->last_out = __kmp_node_ref(node); + } else { + // if this is a sync point in the serial sequence, then the previous + // outputs are guaranteed to be completed after the execution of this + // task so the previous output nodes can be cleared. 
+ info->last_out = NULL; + } + } else { // either IN or MTX or SET + if (info->last_flag == 0 || info->last_flag == dep->flag) { + // last_set either didn't exist or of same dep kind + // link node as successor of the last_out if any + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_out); + // link node as successor of all nodes in the prev_set if any + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, prev_set); + if (dep_barrier) { + // clean last_out and prev_set if any; don't touch last_set + __kmp_node_deref(thread, last_out); + info->last_out = NULL; + __kmp_depnode_list_free(thread, prev_set); + info->prev_set = NULL; + } + } else { // last_set is of different dep kind, make it prev_set + // link node as successor of all nodes in the last_set + npredecessors += + __kmp_depnode_link_successor(gtid, thread, task, node, last_set); + // clean last_out if any + __kmp_node_deref(thread, last_out); + info->last_out = NULL; + // clean prev_set if any + __kmp_depnode_list_free(thread, prev_set); + if (!dep_barrier) { + // move last_set to prev_set, new last_set will be allocated + info->prev_set = last_set; + } else { + info->prev_set = NULL; + info->last_flag = 0; + } + info->last_set = NULL; + } + // for dep_barrier last_flag value should remain: + // 0 if last_set is empty, unchanged otherwise + if (!dep_barrier) { + info->last_flag = dep->flag; // store dep kind of the last_set + info->last_set = __kmp_add_node(thread, info->last_set, node); + } + // check if we are processing MTX dependency + if (dep->flag == KMP_DEP_MTX) { + if (info->mtx_lock == NULL) { + info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); + __kmp_init_lock(info->mtx_lock); + } + KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS); + kmp_int32 m; + // Save lock in node's array + for (m = 0; m < MAX_MTX_DEPS; ++m) { + // sort pointers in decreasing order to avoid potential livelock + if (node->dn.mtx_locks[m] < info->mtx_lock) { + KMP_DEBUG_ASSERT(!node->dn.mtx_locks[node->dn.mtx_num_locks]); + for (int n = node->dn.mtx_num_locks; n > m; --n) { + // shift right all lesser non-NULL pointers + KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL); + node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1]; + } + node->dn.mtx_locks[m] = info->mtx_lock; + break; + } + } + KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop + node->dn.mtx_num_locks++; + } + } + } + KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter, + gtid, npredecessors)); + return npredecessors; +} + +#define NO_DEP_BARRIER (false) +#define DEP_BARRIER (true) + +// returns true if the task has any outstanding dependence +static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, + kmp_task_t *task, kmp_dephash_t **hash, + bool dep_barrier, kmp_int32 ndeps, + kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list) { + int i, n_mtxs = 0, dep_all = 0; +#if KMP_DEBUG + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); +#endif + KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependences for task %p : %d " + "possibly aliased dependences, %d non-aliased dependences : " + "dep_barrier=%d .\n", + gtid, taskdata, ndeps, ndeps_noalias, dep_barrier)); + + // Filter deps in dep_list + // TODO: Different algorithm for large dep_list ( > 10 ? 
) + for (i = 0; i < ndeps; i++) { + if (dep_list[i].base_addr != 0 && + dep_list[i].base_addr != (kmp_intptr_t)KMP_SIZE_T_MAX) { + KMP_DEBUG_ASSERT( + dep_list[i].flag == KMP_DEP_IN || dep_list[i].flag == KMP_DEP_OUT || + dep_list[i].flag == KMP_DEP_INOUT || + dep_list[i].flag == KMP_DEP_MTX || dep_list[i].flag == KMP_DEP_SET); + for (int j = i + 1; j < ndeps; j++) { + if (dep_list[i].base_addr == dep_list[j].base_addr) { + if (dep_list[i].flag != dep_list[j].flag) { + // two different dependences on same address work identical to OUT + dep_list[i].flag = KMP_DEP_OUT; + } + dep_list[j].base_addr = 0; // Mark j element as void + } + } + if (dep_list[i].flag == KMP_DEP_MTX) { + // limit number of mtx deps to MAX_MTX_DEPS per node + if (n_mtxs < MAX_MTX_DEPS && task != NULL) { + ++n_mtxs; + } else { + dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout + } + } + } else if (dep_list[i].flag == KMP_DEP_ALL || + dep_list[i].base_addr == (kmp_intptr_t)KMP_SIZE_T_MAX) { + // omp_all_memory dependence can be marked by compiler by either + // (addr=0 && flag=0x80) (flag KMP_DEP_ALL), or (addr=-1). + // omp_all_memory overrides all other dependences if any + dep_all = 1; + break; + } + } + + // doesn't need to be atomic as no other thread is going to be accessing this + // node just yet. + // npredecessors is set -1 to ensure that none of the releasing tasks queues + // this task before we have finished processing all the dependences + node->dn.npredecessors = -1; + + // used to pack all npredecessors additions into a single atomic operation at + // the end + int npredecessors; + + if (!dep_all) { // regular dependences + npredecessors = __kmp_process_deps(gtid, node, hash, dep_barrier, + ndeps, dep_list, task); + npredecessors += __kmp_process_deps( + gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task); + } else { // omp_all_memory dependence + npredecessors = __kmp_process_dep_all(gtid, node, *hash, dep_barrier, task); + } + + node->dn.task = task; + KMP_MB(); + + // Account for our initial fake value + npredecessors++; + + // Update predecessors and obtain current value to check if there are still + // any outstanding dependences (some tasks may have finished while we + // processed the dependences) + npredecessors = + node->dn.npredecessors.fetch_add(npredecessors) + npredecessors; + + KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n", + gtid, npredecessors, taskdata)); + + // beyond this point the task could be queued (and executed) by a releasing + // task... + return npredecessors > 0 ? true : false; +} + +/*! 
+@ingroup TASKING +@param loc_ref location of the original task directive +@param gtid Global Thread ID of encountering thread +@param new_task task thunk allocated by __kmp_omp_task_alloc() for the ''new +task'' +@param ndeps Number of depend items with possible aliasing +@param dep_list List of depend items with possible aliasing +@param ndeps_noalias Number of depend items with no aliasing +@param noalias_dep_list List of depend items with no aliasing + +@return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not +suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued + +Schedule a non-thread-switchable task with dependences for execution +*/ +kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task, kmp_int32 ndeps, + kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list) { + + kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); + KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", gtid, + loc_ref, new_taskdata)); + __kmp_assert_valid_gtid(gtid); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *current_task = thread->th.th_current_task; + +#if OMPX_TASKGRAPH + // record TDG with deps + if (new_taskdata->is_taskgraph && + __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) { + kmp_tdg_info_t *tdg = new_taskdata->tdg; + // extend record_map if needed + if (new_taskdata->td_task_id >= tdg->map_size) { + __kmp_acquire_bootstrap_lock(&tdg->graph_lock); + if (new_taskdata->td_task_id >= tdg->map_size) { + kmp_uint old_size = tdg->map_size; + kmp_uint new_size = old_size * 2; + kmp_node_info_t *old_record = tdg->record_map; + kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate( + new_size * sizeof(kmp_node_info_t)); + KMP_MEMCPY(new_record, tdg->record_map, + old_size * sizeof(kmp_node_info_t)); + tdg->record_map = new_record; + + __kmp_free(old_record); + + for (kmp_int i = old_size; i < new_size; i++) { + kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate( + __kmp_successors_size * sizeof(kmp_int32)); + new_record[i].task = nullptr; + new_record[i].successors = successorsList; + new_record[i].nsuccessors = 0; + new_record[i].npredecessors = 0; + new_record[i].successors_size = __kmp_successors_size; + KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0); + } + // update the size at the end, so that we avoid other + // threads use old_record while map_size is already updated + tdg->map_size = new_size; + } + __kmp_release_bootstrap_lock(&tdg->graph_lock); + } + tdg->record_map[new_taskdata->td_task_id].task = new_task; + tdg->record_map[new_taskdata->td_task_id].parent_task = + new_taskdata->td_parent; + KMP_ATOMIC_INC(&tdg->num_tasks); + } +#endif +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + if (!current_task->ompt_task_info.frame.enter_frame.ptr) + current_task->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + if (ompt_enabled.ompt_callback_task_create) { + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + &(current_task->ompt_task_info.task_data), + &(current_task->ompt_task_info.frame), + &(new_taskdata->ompt_task_info.task_data), + ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1, + OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid)); + } + + new_taskdata->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + } + +#if OMPT_OPTIONAL + /* OMPT grab all dependences if requested by the tool */ + if (ndeps + ndeps_noalias > 0 && 
ompt_enabled.ompt_callback_dependences) { + kmp_int32 i; + + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + + KMP_ASSERT(ompt_deps != NULL); + + for (i = 0; i < ndeps; i++) { + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; + if (dep_list[i].base_addr == KMP_SIZE_T_MAX) + ompt_deps[i].dependence_type = ompt_dependence_type_out_all_memory; + else if (dep_list[i].flags.in && dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_inout; + else if (dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_out; + else if (dep_list[i].flags.in) + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset; + else if (dep_list[i].flags.set) + ompt_deps[i].dependence_type = ompt_dependence_type_inoutset; + else if (dep_list[i].flags.all) + ompt_deps[i].dependence_type = ompt_dependence_type_out_all_memory; + } + for (i = 0; i < ndeps_noalias; i++) { + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; + if (noalias_dep_list[i].base_addr == KMP_SIZE_T_MAX) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_out_all_memory; + else if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout; + else if (noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; + else if (noalias_dep_list[i].flags.in) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + else if (noalias_dep_list[i].flags.set) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset; + else if (noalias_dep_list[i].flags.all) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_out_all_memory; + } + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + &(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps); + /* We can now free the allocated memory for the dependences */ + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); + } +#endif /* OMPT_OPTIONAL */ +#endif /* OMPT_SUPPORT */ + + bool serial = current_task->td_flags.team_serial || + current_task->td_flags.tasking_ser || + current_task->td_flags.final; + kmp_task_team_t *task_team = thread->th.th_task_team; + serial = serial && + !(task_team && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)); + + if (!serial && (ndeps > 0 || ndeps_noalias > 0)) { + /* if no dependences have been tracked yet, create the dependence hash */ + if (current_task->td_dephash == NULL) + current_task->td_dephash = __kmp_dephash_create(thread, current_task); + +#if USE_FAST_MEMORY + kmp_depnode_t *node = + (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t)); +#else + kmp_depnode_t *node = + (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t)); +#endif + + __kmp_init_node(node); + new_taskdata->td_depnode = node; + + if (__kmp_check_deps(gtid, node, new_task, ¤t_task->td_dephash, + NO_DEP_BARRIER, ndeps, dep_list, ndeps_noalias, + noalias_dep_list)) { + KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking " + "dependences: " + "loc=%p task=%p, return: 
TASK_CURRENT_NOT_QUEUED\n", + gtid, loc_ref, new_taskdata)); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + current_task->ompt_task_info.frame.enter_frame = ompt_data_none; + } +#endif + return TASK_CURRENT_NOT_QUEUED; + } + } else { + KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependences " + "for task (serialized) loc=%p task=%p\n", + gtid, loc_ref, new_taskdata)); + } + + KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking " + "dependences : " + "loc=%p task=%p, transferring to __kmp_omp_task\n", + gtid, loc_ref, new_taskdata)); + + kmp_int32 ret = __kmp_omp_task(gtid, new_task, true); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + current_task->ompt_task_info.frame.enter_frame = ompt_data_none; + } +#endif + return ret; +} + +#if OMPT_SUPPORT +void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task, + ompt_data_t *taskwait_task_data) { + if (ompt_enabled.ompt_callback_task_schedule) { + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + taskwait_task_data, ompt_taskwait_complete, NULL); + } + current_task->ompt_task_info.frame.enter_frame.ptr = NULL; + *taskwait_task_data = ompt_data_none; +} +#endif /* OMPT_SUPPORT */ + +/*! +@ingroup TASKING +@param loc_ref location of the original task directive +@param gtid Global Thread ID of encountering thread +@param ndeps Number of depend items with possible aliasing +@param dep_list List of depend items with possible aliasing +@param ndeps_noalias Number of depend items with no aliasing +@param noalias_dep_list List of depend items with no aliasing + +Blocks the current task until all specifies dependences have been fulfilled. +*/ +void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, + kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list) { + __kmpc_omp_taskwait_deps_51(loc_ref, gtid, ndeps, dep_list, ndeps_noalias, + noalias_dep_list, false); +} + +/* __kmpc_omp_taskwait_deps_51 : Function for OpenMP 5.1 nowait clause. + Placeholder for taskwait with nowait clause. + Earlier code of __kmpc_omp_wait_deps() is now + in this function. 
+*/ +void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 ndeps, kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list, + kmp_int32 has_no_wait) { + KA_TRACE(10, ("__kmpc_omp_taskwait_deps(enter): T#%d loc=%p nowait#%d\n", + gtid, loc_ref, has_no_wait)); + if (ndeps == 0 && ndeps_noalias == 0) { + KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no dependences to " + "wait upon : loc=%p\n", + gtid, loc_ref)); + return; + } + __kmp_assert_valid_gtid(gtid); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *current_task = thread->th.th_current_task; + +#if OMPT_SUPPORT + // this function represents a taskwait construct with depend clause + // We signal 4 events: + // - creation of the taskwait task + // - dependences of the taskwait task + // - schedule and finish of the taskwait task + ompt_data_t *taskwait_task_data = &thread->th.ompt_thread_info.task_data; + KMP_ASSERT(taskwait_task_data->ptr == NULL); + if (ompt_enabled.enabled) { + if (!current_task->ompt_task_info.frame.enter_frame.ptr) + current_task->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + if (ompt_enabled.ompt_callback_task_create) { + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + &(current_task->ompt_task_info.task_data), + &(current_task->ompt_task_info.frame), taskwait_task_data, + ompt_task_taskwait | ompt_task_undeferred | ompt_task_mergeable, 1, + OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid)); + } + } + +#if OMPT_OPTIONAL + /* OMPT grab all dependences if requested by the tool */ + if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) { + kmp_int32 i; + + int ompt_ndeps = ndeps + ndeps_noalias; + ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( + thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); + + KMP_ASSERT(ompt_deps != NULL); + + for (i = 0; i < ndeps; i++) { + ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr; + if (dep_list[i].flags.in && dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_inout; + else if (dep_list[i].flags.out) + ompt_deps[i].dependence_type = ompt_dependence_type_out; + else if (dep_list[i].flags.in) + ompt_deps[i].dependence_type = ompt_dependence_type_in; + else if (dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + else if (dep_list[i].flags.set) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset; + } + for (i = 0; i < ndeps_noalias; i++) { + ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; + if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout; + else if (noalias_dep_list[i].flags.out) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out; + else if (noalias_dep_list[i].flags.in) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in; + else if (noalias_dep_list[i].flags.mtx) + ompt_deps[ndeps + i].dependence_type = + ompt_dependence_type_mutexinoutset; + else if (noalias_dep_list[i].flags.set) + ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inoutset; + } + ompt_callbacks.ompt_callback(ompt_callback_dependences)( + taskwait_task_data, ompt_deps, ompt_ndeps); + /* We can now free the allocated memory for the dependences */ + /* For OMPD we might want to delay the free until end of this function */ + KMP_OMPT_DEPS_FREE(thread, ompt_deps); + ompt_deps = NULL; + } +#endif /* 
OMPT_OPTIONAL */
+#endif /* OMPT_SUPPORT */
+
+  // We can return immediately as:
+  // - dependences are not computed in serial teams (except with proxy tasks)
+  // - if the dephash is not yet created it means we have nothing to wait for
+  bool ignore = current_task->td_flags.team_serial ||
+                current_task->td_flags.tasking_ser ||
+                current_task->td_flags.final;
+  ignore =
+      ignore && thread->th.th_task_team != NULL &&
+      thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE &&
+      thread->th.th_task_team->tt.tt_hidden_helper_task_encountered == FALSE;
+  ignore = ignore || current_task->td_dephash == NULL;
+
+  if (ignore) {
+    KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
+                  "dependences : loc=%p\n",
+                  gtid, loc_ref));
+#if OMPT_SUPPORT
+    __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
+#endif /* OMPT_SUPPORT */
+    return;
+  }
+
+  kmp_depnode_t node = {0};
+  __kmp_init_node(&node);
+
+  if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
+                        DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
+                        noalias_dep_list)) {
+    KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
+                  "dependences : loc=%p\n",
+                  gtid, loc_ref));
+#if OMPT_SUPPORT
+    __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
+#endif /* OMPT_SUPPORT */
+    return;
+  }
+
+  int thread_finished = FALSE;
+  kmp_flag_32<false, false> flag(
+      (std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
+  while (node.dn.npredecessors > 0) {
+    flag.execute_tasks(thread, gtid, FALSE,
+                       &thread_finished USE_ITT_BUILD_ARG(NULL),
+                       __kmp_task_stealing_constraint);
+  }
+
+#if OMPT_SUPPORT
+  __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
+#endif /* OMPT_SUPPORT */
+  KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d finished waiting : loc=%p\
+ \n",
+                gtid, loc_ref));
+}
diff --git a/third_party/openmp/kmp_taskdeps.h b/third_party/openmp/kmp_taskdeps.h
new file mode 100644
index 000000000..d2ab51515
--- /dev/null
+++ b/third_party/openmp/kmp_taskdeps.h
@@ -0,0 +1,209 @@
+/*
+ * kmp_taskdeps.h
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_TASKDEPS_H +#define KMP_TASKDEPS_H + +#include "kmp.h" + +#define KMP_ACQUIRE_DEPNODE(gtid, n) __kmp_acquire_lock(&(n)->dn.lock, (gtid)) +#define KMP_RELEASE_DEPNODE(gtid, n) __kmp_release_lock(&(n)->dn.lock, (gtid)) + +static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) { + if (!node) + return; + + kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1; + KMP_DEBUG_ASSERT(n >= 0); + if (n == 0) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_destroy(node); +#endif + KMP_ASSERT(node->dn.nrefs == 0); +#if USE_FAST_MEMORY + __kmp_fast_free(thread, node); +#else + __kmp_thread_free(thread, node); +#endif + } +} + +static inline void __kmp_depnode_list_free(kmp_info_t *thread, + kmp_depnode_list *list) { + kmp_depnode_list *next; + + for (; list; list = next) { + next = list->next; + + __kmp_node_deref(thread, list->node); +#if USE_FAST_MEMORY + __kmp_fast_free(thread, list); +#else + __kmp_thread_free(thread, list); +#endif + } +} + +static inline void __kmp_dephash_free_entries(kmp_info_t *thread, + kmp_dephash_t *h) { + for (size_t i = 0; i < h->size; i++) { + if (h->buckets[i]) { + kmp_dephash_entry_t *next; + for (kmp_dephash_entry_t *entry = h->buckets[i]; entry; entry = next) { + next = entry->next_in_bucket; + __kmp_depnode_list_free(thread, entry->last_set); + __kmp_depnode_list_free(thread, entry->prev_set); + __kmp_node_deref(thread, entry->last_out); + if (entry->mtx_lock) { + __kmp_destroy_lock(entry->mtx_lock); + __kmp_free(entry->mtx_lock); + } +#if USE_FAST_MEMORY + __kmp_fast_free(thread, entry); +#else + __kmp_thread_free(thread, entry); +#endif + } + h->buckets[i] = 0; + } + } + __kmp_node_deref(thread, h->last_all); + h->last_all = NULL; +} + +static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) { + __kmp_dephash_free_entries(thread, h); +#if USE_FAST_MEMORY + __kmp_fast_free(thread, h); +#else + __kmp_thread_free(thread, h); +#endif +} + +extern void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start); + +static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) { + +#if OMPX_TASKGRAPH + if (task->is_taskgraph && !(__kmp_tdg_is_recording(task->tdg->tdg_status))) { + kmp_node_info_t *TaskInfo = &(task->tdg->record_map[task->td_task_id]); + + for (int i = 0; i < TaskInfo->nsuccessors; i++) { + kmp_int32 successorNumber = TaskInfo->successors[i]; + kmp_node_info_t *successor = &(task->tdg->record_map[successorNumber]); + kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->npredecessors_counter) - 1; + if (successor->task != nullptr && npredecessors == 0) { + __kmp_omp_task(gtid, successor->task, false); + } + } + return; + } +#endif + + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_depnode_t *node = task->td_depnode; + + // Check mutexinoutset dependencies, release locks + if (UNLIKELY(node && (node->dn.mtx_num_locks < 0))) { + // negative num_locks means all locks were acquired + node->dn.mtx_num_locks = -node->dn.mtx_num_locks; + for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) { + KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); + __kmp_release_lock(node->dn.mtx_locks[i], gtid); + } + } + + if (task->td_dephash) { + KA_TRACE( + 40, ("__kmp_release_deps: T#%d freeing dependencies hash of task %p.\n", + gtid, task)); + __kmp_dephash_free(thread, task->td_dephash); + task->td_dephash = NULL; + } + + if (!node) + return; + + KA_TRACE(20, 
("__kmp_release_deps: T#%d notifying successors of task %p.\n", + gtid, task)); + + KMP_ACQUIRE_DEPNODE(gtid, node); +#if OMPX_TASKGRAPH + if (!task->is_taskgraph || + (task->is_taskgraph && !__kmp_tdg_is_recording(task->tdg->tdg_status))) +#endif + node->dn.task = + NULL; // mark this task as finished, so no new dependencies are generated + KMP_RELEASE_DEPNODE(gtid, node); + + kmp_depnode_list_t *next; + kmp_taskdata_t *next_taskdata; + for (kmp_depnode_list_t *p = node->dn.successors; p; p = next) { + kmp_depnode_t *successor = p->node; +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_releasing(successor); +#endif + kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->dn.npredecessors) - 1; + + // successor task can be NULL for wait_depends or because deps are still + // being processed + if (npredecessors == 0) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + __itt_sync_acquired(successor); +#endif + KMP_MB(); + if (successor->dn.task) { + KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled " + "for execution.\n", + gtid, successor->dn.task, task)); + // If a regular task depending on a hidden helper task, when the + // hidden helper task is done, the regular task should be executed by + // its encountering team. + if (KMP_HIDDEN_HELPER_THREAD(gtid)) { + // Hidden helper thread can only execute hidden helper tasks + KMP_ASSERT(task->td_flags.hidden_helper); + next_taskdata = KMP_TASK_TO_TASKDATA(successor->dn.task); + // If the dependent task is a regular task, we need to push to its + // encountering thread's queue; otherwise, it can be pushed to its own + // queue. + if (!next_taskdata->td_flags.hidden_helper) { + kmp_int32 encountering_gtid = + next_taskdata->td_alloc_thread->th.th_info.ds.ds_gtid; + kmp_int32 encountering_tid = __kmp_tid_from_gtid(encountering_gtid); + __kmpc_give_task(successor->dn.task, encountering_tid); + } else { + __kmp_omp_task(gtid, successor->dn.task, false); + } + } else { + __kmp_omp_task(gtid, successor->dn.task, false); + } + } + } + + next = p->next; + __kmp_node_deref(thread, p->node); +#if USE_FAST_MEMORY + __kmp_fast_free(thread, p); +#else + __kmp_thread_free(thread, p); +#endif + } + + __kmp_node_deref(thread, node); + + KA_TRACE( + 20, + ("__kmp_release_deps: T#%d all successors of %p notified of completion\n", + gtid, task)); +} + +#endif // KMP_TASKDEPS_H diff --git a/third_party/openmp/kmp_tasking.cpp b/third_party/openmp/kmp_tasking.cpp new file mode 100644 index 000000000..6e8b948ef --- /dev/null +++ b/third_party/openmp/kmp_tasking.cpp @@ -0,0 +1,5717 @@ +/* + * kmp_tasking.cpp -- OpenMP 3.0 tasking support. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" +#include "kmp_stats.h" +#include "kmp_wait_release.h" +#include "kmp_taskdeps.h" + +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +#if ENABLE_LIBOMPTARGET +static void (*tgt_target_nowait_query)(void **); + +void __kmp_init_target_task() { + *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query"); +} +#endif + +/* forward declaration */ +static void __kmp_enable_tasking(kmp_task_team_t *task_team, + kmp_info_t *this_thr); +static void __kmp_alloc_task_deque(kmp_info_t *thread, + kmp_thread_data_t *thread_data); +static int __kmp_realloc_task_threads_data(kmp_info_t *thread, + kmp_task_team_t *task_team); +static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); +#if OMPX_TASKGRAPH +static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id); +int __kmp_taskloop_task(int gtid, void *ptask); +#endif + +#ifdef BUILD_TIED_TASK_STACK + +// __kmp_trace_task_stack: print the tied tasks from the task stack in order +// from top do bottom +// +// gtid: global thread identifier for thread containing stack +// thread_data: thread data for task team thread containing stack +// threshold: value above which the trace statement triggers +// location: string identifying call site of this function (for trace) +static void __kmp_trace_task_stack(kmp_int32 gtid, + kmp_thread_data_t *thread_data, + int threshold, char *location) { + kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; + kmp_taskdata_t **stack_top = task_stack->ts_top; + kmp_int32 entries = task_stack->ts_entries; + kmp_taskdata_t *tied_task; + + KA_TRACE( + threshold, + ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " + "first_block = %p, stack_top = %p \n", + location, gtid, entries, task_stack->ts_first_block, stack_top)); + + KMP_DEBUG_ASSERT(stack_top != NULL); + KMP_DEBUG_ASSERT(entries > 0); + + while (entries != 0) { + KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); + // fix up ts_top if we need to pop from previous block + if (entries & TASK_STACK_INDEX_MASK == 0) { + kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); + + stack_block = stack_block->sb_prev; + stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; + } + + // finish bookkeeping + stack_top--; + entries--; + + tied_task = *stack_top; + + KMP_DEBUG_ASSERT(tied_task != NULL); + KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); + + KA_TRACE(threshold, + ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " + "stack_top=%p, tied_task=%p\n", + location, gtid, entries, stack_top, tied_task)); + } + KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); + + KA_TRACE(threshold, + ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", + location, gtid)); +} + +// __kmp_init_task_stack: initialize the task stack for the first time +// after a thread_data structure is created. +// It should not be necessary to do this again (assuming the stack works). 
+// +// gtid: global thread identifier of calling thread +// thread_data: thread data for task team thread containing stack +static void __kmp_init_task_stack(kmp_int32 gtid, + kmp_thread_data_t *thread_data) { + kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; + kmp_stack_block_t *first_block; + + // set up the first block of the stack + first_block = &task_stack->ts_first_block; + task_stack->ts_top = (kmp_taskdata_t **)first_block; + memset((void *)first_block, '\0', + TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); + + // initialize the stack to be empty + task_stack->ts_entries = TASK_STACK_EMPTY; + first_block->sb_next = NULL; + first_block->sb_prev = NULL; +} + +// __kmp_free_task_stack: free the task stack when thread_data is destroyed. +// +// gtid: global thread identifier for calling thread +// thread_data: thread info for thread containing stack +static void __kmp_free_task_stack(kmp_int32 gtid, + kmp_thread_data_t *thread_data) { + kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; + kmp_stack_block_t *stack_block = &task_stack->ts_first_block; + + KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); + // free from the second block of the stack + while (stack_block != NULL) { + kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; + + stack_block->sb_next = NULL; + stack_block->sb_prev = NULL; + if (stack_block != &task_stack->ts_first_block) { + __kmp_thread_free(thread, + stack_block); // free the block, if not the first + } + stack_block = next_block; + } + // initialize the stack to be empty + task_stack->ts_entries = 0; + task_stack->ts_top = NULL; +} + +// __kmp_push_task_stack: Push the tied task onto the task stack. +// Grow the stack if necessary by allocating another block. 
+// +// gtid: global thread identifier for calling thread +// thread: thread info for thread containing stack +// tied_task: the task to push on the stack +static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, + kmp_taskdata_t *tied_task) { + // GEH - need to consider what to do if tt_threads_data not allocated yet + kmp_thread_data_t *thread_data = + &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; + kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; + + if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { + return; // Don't push anything on stack if team or team tasks are serialized + } + + KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); + KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); + + KA_TRACE(20, + ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", + gtid, thread, tied_task)); + // Store entry + *(task_stack->ts_top) = tied_task; + + // Do bookkeeping for next push + task_stack->ts_top++; + task_stack->ts_entries++; + + if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { + // Find beginning of this task block + kmp_stack_block_t *stack_block = + (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); + + // Check if we already have a block + if (stack_block->sb_next != + NULL) { // reset ts_top to beginning of next block + task_stack->ts_top = &stack_block->sb_next->sb_block[0]; + } else { // Alloc new block and link it up + kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( + thread, sizeof(kmp_stack_block_t)); + + task_stack->ts_top = &new_block->sb_block[0]; + stack_block->sb_next = new_block; + new_block->sb_prev = stack_block; + new_block->sb_next = NULL; + + KA_TRACE( + 30, + ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", + gtid, tied_task, new_block)); + } + } + KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, + tied_task)); +} + +// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return +// the task, just check to make sure it matches the ending task passed in. 
+// +// gtid: global thread identifier for the calling thread +// thread: thread info structure containing stack +// tied_task: the task popped off the stack +// ending_task: the task that is ending (should match popped task) +static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, + kmp_taskdata_t *ending_task) { + // GEH - need to consider what to do if tt_threads_data not allocated yet + kmp_thread_data_t *thread_data = + &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; + kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; + kmp_taskdata_t *tied_task; + + if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { + // Don't pop anything from stack if team or team tasks are serialized + return; + } + + KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); + KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); + + KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, + thread)); + + // fix up ts_top if we need to pop from previous block + if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { + kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); + + stack_block = stack_block->sb_prev; + task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; + } + + // finish bookkeeping + task_stack->ts_top--; + task_stack->ts_entries--; + + tied_task = *(task_stack->ts_top); + + KMP_DEBUG_ASSERT(tied_task != NULL); + KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); + KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly + + KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, + tied_task)); + return; +} +#endif /* BUILD_TIED_TASK_STACK */ + +// returns 1 if new task is allowed to execute, 0 otherwise +// checks Task Scheduling constraint (if requested) and +// mutexinoutset dependencies if any +static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, + const kmp_taskdata_t *tasknew, + const kmp_taskdata_t *taskcurr) { + if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) { + // Check if the candidate obeys the Task Scheduling Constraints (TSC) + // only descendant of all deferred tied tasks can be scheduled, checking + // the last one is enough, as it in turn is the descendant of all others + kmp_taskdata_t *current = taskcurr->td_last_tied; + KMP_DEBUG_ASSERT(current != NULL); + // check if the task is not suspended on barrier + if (current->td_flags.tasktype == TASK_EXPLICIT || + current->td_taskwait_thread > 0) { // <= 0 on barrier + kmp_int32 level = current->td_level; + kmp_taskdata_t *parent = tasknew->td_parent; + while (parent != current && parent->td_level > level) { + // check generation up to the level of the current task + parent = parent->td_parent; + KMP_DEBUG_ASSERT(parent != NULL); + } + if (parent != current) + return false; + } + } + // Check mutexinoutset dependencies, acquire locks + kmp_depnode_t *node = tasknew->td_depnode; +#if OMPX_TASKGRAPH + if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) { +#else + if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) { +#endif + for (int i = 0; i < node->dn.mtx_num_locks; ++i) { + KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); + if (__kmp_test_lock(node->dn.mtx_locks[i], gtid)) + continue; + // could not get the lock, release previous locks + for (int j = i - 1; j >= 0; --j) + __kmp_release_lock(node->dn.mtx_locks[j], gtid); + return false; + } + // negative num_locks means all locks acquired successfully + 
node->dn.mtx_num_locks = -node->dn.mtx_num_locks; + } + return true; +} + +// __kmp_realloc_task_deque: +// Re-allocates a task deque for a particular thread, copies the content from +// the old deque and adjusts the necessary data structures relating to the +// deque. This operation must be done with the deque_lock being held +static void __kmp_realloc_task_deque(kmp_info_t *thread, + kmp_thread_data_t *thread_data) { + kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); + KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size); + kmp_int32 new_size = 2 * size; + + KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " + "%d] for thread_data %p\n", + __kmp_gtid_from_thread(thread), size, new_size, thread_data)); + + kmp_taskdata_t **new_deque = + (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *)); + + int i, j; + for (i = thread_data->td.td_deque_head, j = 0; j < size; + i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++) + new_deque[j] = thread_data->td.td_deque[i]; + + __kmp_free(thread_data->td.td_deque); + + thread_data->td.td_deque_head = 0; + thread_data->td.td_deque_tail = size; + thread_data->td.td_deque = new_deque; + thread_data->td.td_deque_size = new_size; +} + +static kmp_task_pri_t *__kmp_alloc_task_pri_list() { + kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t)); + kmp_thread_data_t *thread_data = &l->td; + __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock); + thread_data->td.td_deque_last_stolen = -1; + KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] " + "for thread_data %p\n", + __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data)); + thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate( + INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *)); + thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE; + return l; +} + +// The function finds the deque of priority tasks with given priority, or +// allocates a new deque and put it into sorted (high -> low) list of deques. +// Deques of non-default priority tasks are shared between all threads in team, +// as opposed to per-thread deques of tasks with default priority. +// The function is called under the lock task_team->tt.tt_task_pri_lock. +static kmp_thread_data_t * +__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) { + kmp_thread_data_t *thread_data; + kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list; + if (lst->priority == pri) { + // Found queue of tasks with given priority. + thread_data = &lst->td; + } else if (lst->priority < pri) { + // All current priority queues contain tasks with lower priority. + // Allocate new one for given priority tasks. + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = lst; + task_team->tt.tt_task_pri_list = list; + } else { // task_team->tt.tt_task_pri_list->priority > pri + kmp_task_pri_t *next_queue = lst->next; + while (next_queue && next_queue->priority > pri) { + lst = next_queue; + next_queue = lst->next; + } + // lst->priority > pri && (next == NULL || pri >= next->priority) + if (next_queue == NULL) { + // No queue with pri priority, need to allocate new one. + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = NULL; + lst->next = list; + } else if (next_queue->priority == pri) { + // Found queue of tasks with given priority. 
+ thread_data = &next_queue->td; + } else { // lst->priority > pri > next->priority + // insert newly allocated between existed queues + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = next_queue; + lst->next = list; + } + } + return thread_data; +} + +// __kmp_push_priority_task: Add a task to the team's priority task deque +static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread, + kmp_taskdata_t *taskdata, + kmp_task_team_t *task_team, + kmp_int32 pri) { + kmp_thread_data_t *thread_data = NULL; + KA_TRACE(20, + ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n", + gtid, taskdata, pri)); + + // Find task queue specific to priority value + kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list; + if (UNLIKELY(lst == NULL)) { + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + if (task_team->tt.tt_task_pri_list == NULL) { + // List of queues is still empty, allocate one. + kmp_task_pri_t *list = __kmp_alloc_task_pri_list(); + thread_data = &list->td; + list->priority = pri; + list->next = NULL; + task_team->tt.tt_task_pri_list = list; + } else { + // Other thread initialized a queue. Check if it fits and get thread_data. + thread_data = __kmp_get_priority_deque_data(task_team, pri); + } + __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + } else { + if (lst->priority == pri) { + // Found queue of tasks with given priority. + thread_data = &lst->td; + } else { + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + thread_data = __kmp_get_priority_deque_data(task_team, pri); + __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + } + } + KMP_DEBUG_ASSERT(thread_data); + + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + // Check if deque is full + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + if (__kmp_enable_task_throttling && + __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + thread->th.th_current_task)) { + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning " + "TASK_NOT_PUSHED for task %p\n", + gtid, taskdata)); + return TASK_NOT_PUSHED; + } else { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } + } + KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < + TASK_DEQUE_SIZE(thread_data->td)); + // Push taskdata. + thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; + // Wrap index. 
+ thread_data->td.td_deque_tail = + (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); + TCW_4(thread_data->td.td_deque_ntasks, + TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count + KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self + KMP_FSYNC_RELEASING(taskdata); // releasing child + KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning " + "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n", + gtid, taskdata, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + task_team->tt.tt_num_task_pri++; // atomic inc + return TASK_SUCCESSFULLY_PUSHED; +} + +// __kmp_push_task: Add a task to the thread's deque +static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + + // If we encounter a hidden helper task, and the current thread is not a + // hidden helper thread, we have to give the task to any hidden helper thread + // starting from its shadow one. + if (UNLIKELY(taskdata->td_flags.hidden_helper && + !KMP_HIDDEN_HELPER_THREAD(gtid))) { + kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid); + __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid)); + // Signal the hidden helper threads. + __kmp_hidden_helper_worker_thread_signal(); + return TASK_SUCCESSFULLY_PUSHED; + } + + kmp_task_team_t *task_team = thread->th.th_task_team; + kmp_int32 tid = __kmp_tid_from_gtid(gtid); + kmp_thread_data_t *thread_data; + + KA_TRACE(20, + ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); + + if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { + // untied task needs to increment counter so that the task structure is not + // freed prematurely + kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); + KMP_DEBUG_USE_VAR(counter); + KA_TRACE( + 20, + ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n", + gtid, counter, taskdata)); + } + + // The first check avoids building task_team thread data if serialized + if (UNLIKELY(taskdata->td_flags.task_serial)) { + KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " + "TASK_NOT_PUSHED for task %p\n", + gtid, taskdata)); + return TASK_NOT_PUSHED; + } + + // Now that serialized tasks have returned, we can assume that we are not in + // immediate exec mode + KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) { + __kmp_enable_tasking(task_team, thread); + } + KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); + KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); + + if (taskdata->td_flags.priority_specified && task->data2.priority > 0 && + __kmp_max_task_priority > 0) { + int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority); + return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri); + } + + // Find tasking deque specific to encountering thread + thread_data = &task_team->tt.tt_threads_data[tid]; + + // No lock needed since only owner can allocate. If the task is hidden_helper, + // we don't need it either because we have initialized the dequeue for hidden + // helper thread data. 
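+  // Illustrative note on the mechanics used below (not additional logic): the
+  // deque is a power-of-two ring buffer, so a push wraps the tail with a mask
+  // rather than a modulo; e.g. assuming TASK_DEQUE_SIZE(thread_data->td) == 8
+  // and td_deque_tail == 7, "(tail + 1) & TASK_DEQUE_MASK(...)" wraps the tail
+  // back to 0.  When the deque is full and throttling applies, the push is
+  // refused with TASK_NOT_PUSHED and the caller executes the task immediately
+  // instead of deferring it.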
+ if (UNLIKELY(thread_data->td.td_deque == NULL)) { + __kmp_alloc_task_deque(thread, thread_data); + } + + int locked = 0; + // Check if deque is full + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + if (__kmp_enable_task_throttling && + __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + thread->th.th_current_task)) { + KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " + "TASK_NOT_PUSHED for task %p\n", + gtid, taskdata)); + return TASK_NOT_PUSHED; + } else { + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + locked = 1; + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } + } + } + // Lock the deque for the task push operation + if (!locked) { + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + // Need to recheck as we can get a proxy task from thread outside of OpenMP + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + if (__kmp_enable_task_throttling && + __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, + thread->th.th_current_task)) { + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; " + "returning TASK_NOT_PUSHED for task %p\n", + gtid, taskdata)); + return TASK_NOT_PUSHED; + } else { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } + } + } + // Must have room since no thread can add tasks but calling thread + KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < + TASK_DEQUE_SIZE(thread_data->td)); + + thread_data->td.td_deque[thread_data->td.td_deque_tail] = + taskdata; // Push taskdata + // Wrap index. + thread_data->td.td_deque_tail = + (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); + TCW_4(thread_data->td.td_deque_ntasks, + TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count + KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self + KMP_FSYNC_RELEASING(taskdata); // releasing child + KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " + "task=%p ntasks=%d head=%u tail=%u\n", + gtid, taskdata, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + + return TASK_SUCCESSFULLY_PUSHED; +} + +// __kmp_pop_current_task_from_thread: set up current task from called thread +// when team ends +// +// this_thr: thread structure to set current_task in. 
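+//
+// Illustrative example (a sketch of typical use, not a new code path): for a
+// team of two threads forked by an encountering task E,
+// __kmp_push_current_task_to_thread() makes each worker's implicit task a
+// child of E at region entry, and this routine undoes it at region end:
+//
+//   push:  current(t0) = implicit[0], implicit[0].td_parent = E
+//          current(t1) = implicit[1], implicit[1].td_parent = E
+//   pop:   current = current->td_parent   // back to E
+//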
+void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) { + KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d " + "this_thread=%p, curtask=%p, " + "curtask_parent=%p\n", + 0, this_thr, this_thr->th.th_current_task, + this_thr->th.th_current_task->td_parent)); + + this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent; + + KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d " + "this_thread=%p, curtask=%p, " + "curtask_parent=%p\n", + 0, this_thr, this_thr->th.th_current_task, + this_thr->th.th_current_task->td_parent)); +} + +// __kmp_push_current_task_to_thread: set up current task in called thread for a +// new team +// +// this_thr: thread structure to set up +// team: team for implicit task data +// tid: thread within team to set up +void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team, + int tid) { + // current task of the thread is a parent of the new just created implicit + // tasks of new team + KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p " + "curtask=%p " + "parent_task=%p\n", + tid, this_thr, this_thr->th.th_current_task, + team->t.t_implicit_task_taskdata[tid].td_parent)); + + KMP_DEBUG_ASSERT(this_thr != NULL); + + if (tid == 0) { + if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) { + team->t.t_implicit_task_taskdata[0].td_parent = + this_thr->th.th_current_task; + this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0]; + } + } else { + team->t.t_implicit_task_taskdata[tid].td_parent = + team->t.t_implicit_task_taskdata[0].td_parent; + this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid]; + } + + KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p " + "curtask=%p " + "parent_task=%p\n", + tid, this_thr, this_thr->th.th_current_task, + team->t.t_implicit_task_taskdata[tid].td_parent)); +} + +// __kmp_task_start: bookkeeping for a task starting execution +// +// GTID: global thread id of calling thread +// task: task starting execution +// current_task: task suspending +static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task, + kmp_taskdata_t *current_task) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + kmp_info_t *thread = __kmp_threads[gtid]; + + KA_TRACE(10, + ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n", + gtid, taskdata, current_task)); + + KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); + + // mark currently executing task as suspended + // TODO: GEH - make sure root team implicit task is initialized properly. + // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 ); + current_task->td_flags.executing = 0; + +// Add task to stack if tied +#ifdef BUILD_TIED_TASK_STACK + if (taskdata->td_flags.tiedness == TASK_TIED) { + __kmp_push_task_stack(gtid, thread, taskdata); + } +#endif /* BUILD_TIED_TASK_STACK */ + + // mark starting task as executing and as current task + thread->th.th_current_task = taskdata; + + KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 || + taskdata->td_flags.tiedness == TASK_UNTIED); + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 || + taskdata->td_flags.tiedness == TASK_UNTIED); + taskdata->td_flags.started = 1; + taskdata->td_flags.executing = 1; + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + + // GEH TODO: shouldn't we pass some sort of location identifier here? + // APT: yes, we will pass location here. 
+ // need to store current thread state (in a thread or taskdata structure) + // before setting work_state, otherwise wrong state is set after end of task + + KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata)); + + return; +} + +#if OMPT_SUPPORT +//------------------------------------------------------------------------------ +// __ompt_task_init: +// Initialize OMPT fields maintained by a task. This will only be called after +// ompt_start_tool, so we already know whether ompt is enabled or not. + +static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { + // The calls to __ompt_task_init already have the ompt_enabled condition. + task->ompt_task_info.task_data.value = 0; + task->ompt_task_info.frame.exit_frame = ompt_data_none; + task->ompt_task_info.frame.enter_frame = ompt_data_none; + task->ompt_task_info.frame.exit_frame_flags = + ompt_frame_runtime | ompt_frame_framepointer; + task->ompt_task_info.frame.enter_frame_flags = + ompt_frame_runtime | ompt_frame_framepointer; + task->ompt_task_info.dispatch_chunk.start = 0; + task->ompt_task_info.dispatch_chunk.iterations = 0; +} + +// __ompt_task_start: +// Build and trigger task-begin event +static inline void __ompt_task_start(kmp_task_t *task, + kmp_taskdata_t *current_task, + kmp_int32 gtid) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + ompt_task_status_t status = ompt_task_switch; + if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) { + status = ompt_task_yield; + __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0; + } + /* let OMPT know that we're about to run this task */ + if (ompt_enabled.ompt_callback_task_schedule) { + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + &(current_task->ompt_task_info.task_data), status, + &(taskdata->ompt_task_info.task_data)); + } + taskdata->ompt_task_info.scheduling_parent = current_task; +} + +// __ompt_task_finish: +// Build and trigger final task-schedule event +static inline void __ompt_task_finish(kmp_task_t *task, + kmp_taskdata_t *resumed_task, + ompt_task_status_t status) { + if (ompt_enabled.ompt_callback_task_schedule) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + if (__kmp_omp_cancellation && taskdata->td_taskgroup && + taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { + status = ompt_task_cancel; + } + + /* let OMPT know that we're returning to the callee task */ + ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( + &(taskdata->ompt_task_info.task_data), status, + (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL)); + } +} +#endif + +template +static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task, + void *frame_address, + void *return_address) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; + + KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p " + "current_task=%p\n", + gtid, loc_ref, taskdata, current_task)); + + if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { + // untied task needs to increment counter so that the task structure is not + // freed prematurely + kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); + KMP_DEBUG_USE_VAR(counter); + KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) " + "incremented for task %p\n", + gtid, counter, taskdata)); + } + + taskdata->td_flags.task_serial = + 1; // Execute this task immediately, not deferred. 
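+  // Hedged sketch of the calling sequence this template serves: the exact
+  // code is whatever the compiler emits, but an undeferred construct such as
+  //
+  //   #pragma omp task if (0)
+  //   { work(); }                        // work() is a stand-in for the body
+  //
+  // is expected to arrive here roughly as
+  //
+  //   t = __kmpc_omp_task_alloc(loc, gtid, flags, sz, shareds_sz, entry);
+  //   __kmpc_omp_task_begin_if0(loc, gtid, t);
+  //   entry(gtid, t);                    // task body runs inline, undeferred
+  //   __kmpc_omp_task_complete_if0(loc, gtid, t);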
+ __kmp_task_start(gtid, task, current_task); + +#if OMPT_SUPPORT + if (ompt) { + if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) { + current_task->ompt_task_info.frame.enter_frame.ptr = + taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address; + current_task->ompt_task_info.frame.enter_frame_flags = + taskdata->ompt_task_info.frame.exit_frame_flags = + ompt_frame_application | ompt_frame_framepointer; + } + if (ompt_enabled.ompt_callback_task_create) { + ompt_task_info_t *parent_info = &(current_task->ompt_task_info); + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + &(parent_info->task_data), &(parent_info->frame), + &(taskdata->ompt_task_info.task_data), + ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0, + return_address); + } + __ompt_task_start(task, current_task, gtid); + } +#endif // OMPT_SUPPORT + + KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid, + loc_ref, taskdata)); +} + +#if OMPT_SUPPORT +OMPT_NOINLINE +static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task, + void *frame_address, + void *return_address) { + __kmpc_omp_task_begin_if0_template(loc_ref, gtid, task, frame_address, + return_address); +} +#endif // OMPT_SUPPORT + +// __kmpc_omp_task_begin_if0: report that a given serialized task has started +// execution +// +// loc_ref: source location information; points to beginning of task block. +// gtid: global thread number. +// task: task thunk for the started task. +#ifdef __s390x__ +// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x. +// In order for it to work correctly, the caller also needs to be compiled with +// backchain. If a caller is compiled without backchain, +// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not +// crash. +__attribute__((target("backchain"))) +#endif +void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task) { +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) { + OMPT_STORE_RETURN_ADDRESS(gtid); + __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task, + OMPT_GET_FRAME_ADDRESS(1), + OMPT_LOAD_RETURN_ADDRESS(gtid)); + return; + } +#endif + __kmpc_omp_task_begin_if0_template(loc_ref, gtid, task, NULL, NULL); +} + +#ifdef TASK_UNUSED +// __kmpc_omp_task_begin: report that a given task has started execution +// NEVER GENERATED BY COMPILER, DEPRECATED!!! 
+void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) { + kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; + + KA_TRACE( + 10, + ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n", + gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task)); + + __kmp_task_start(gtid, task, current_task); + + KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid, + loc_ref, KMP_TASK_TO_TASKDATA(task))); + return; +} +#endif // TASK_UNUSED + +// __kmp_free_task: free the current task space and the space for shareds +// +// gtid: Global thread ID of calling thread +// taskdata: task to free +// thread: thread data structure of caller +static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, + kmp_info_t *thread) { + KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid, + taskdata)); + + // Check to make sure all flags and counters have the correct values + KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 || + taskdata->td_flags.task_serial == 1); + KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0); + kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata); + // Clear data to not be re-used later by mistake. + task->data1.destructors = NULL; + task->data2.priority = 0; + + taskdata->td_flags.freed = 1; +#if OMPX_TASKGRAPH + // do not free tasks in taskgraph + if (!taskdata->is_taskgraph) { +#endif +// deallocate the taskdata and shared variable blocks associated with this task +#if USE_FAST_MEMORY + __kmp_fast_free(thread, taskdata); +#else /* ! USE_FAST_MEMORY */ + __kmp_thread_free(thread, taskdata); +#endif +#if OMPX_TASKGRAPH + } else { + taskdata->td_flags.complete = 0; + taskdata->td_flags.started = 0; + taskdata->td_flags.freed = 0; + taskdata->td_flags.executing = 0; + taskdata->td_flags.task_serial = + (taskdata->td_parent->td_flags.final || + taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser); + + // taskdata->td_allow_completion_event.pending_events_count = 1; + KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); + KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); + // start at one because counts current task and children + KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); + } +#endif + + KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata)); +} + +// __kmp_free_task_and_ancestors: free the current task and ancestors without +// children +// +// gtid: Global thread ID of calling thread +// taskdata: task to free +// thread: thread data structure of caller +static void __kmp_free_task_and_ancestors(kmp_int32 gtid, + kmp_taskdata_t *taskdata, + kmp_info_t *thread) { + // Proxy tasks must always be allowed to free their parents + // because they can be run in background even in serial mode. + kmp_int32 team_serial = + (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) && + !taskdata->td_flags.proxy; + KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); + + kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; + KMP_DEBUG_ASSERT(children >= 0); + + // Now, go up the ancestor tree to see if any ancestors can now be freed. 
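+  // Worked example (illustrative only): td_allocated_child_tasks counts the
+  // task itself (1) plus each allocated child, and KMP_ATOMIC_DEC returns the
+  // value before decrementing, so the "- 1" above yields the count remaining.
+  // A completed task that still has two allocated children goes 3 -> 2 and is
+  // not freed; only the decrement that leaves 0 enters the loop below, frees
+  // the task, and then repeats the same test on its parent.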
+ while (children == 0) { + kmp_taskdata_t *parent_taskdata = taskdata->td_parent; + + KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete " + "and freeing itself\n", + gtid, taskdata)); + + // --- Deallocate my ancestor task --- + __kmp_free_task(gtid, taskdata, thread); + + taskdata = parent_taskdata; + + if (team_serial) + return; + // Stop checking ancestors at implicit task instead of walking up ancestor + // tree to avoid premature deallocation of ancestors. + if (taskdata->td_flags.tasktype == TASK_IMPLICIT) { + if (taskdata->td_dephash) { // do we need to cleanup dephash? + int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks); + kmp_tasking_flags_t flags_old = taskdata->td_flags; + if (children == 0 && flags_old.complete == 1) { + kmp_tasking_flags_t flags_new = flags_old; + flags_new.complete = 0; + if (KMP_COMPARE_AND_STORE_ACQ32( + RCAST(kmp_int32 *, &taskdata->td_flags), + *RCAST(kmp_int32 *, &flags_old), + *RCAST(kmp_int32 *, &flags_new))) { + KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans " + "dephash of implicit task %p\n", + gtid, taskdata)); + // cleanup dephash of finished implicit task + __kmp_dephash_free_entries(thread, taskdata->td_dephash); + } + } + } + return; + } + // Predecrement simulated by "- 1" calculation + children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; + KMP_DEBUG_ASSERT(children >= 0); + } + + KA_TRACE( + 20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; " + "not freeing it yet\n", + gtid, taskdata, children)); +} + +// Only need to keep track of child task counts if any of the following: +// 1. team parallel and tasking not serialized; +// 2. it is a proxy or detachable or hidden helper task +// 3. the children counter of its parent task is greater than 0. +// The reason for the 3rd one is for serialized team that found detached task, +// hidden helper task, T. In this case, the execution of T is still deferred, +// and it is also possible that a regular task depends on T. In this case, if we +// don't track the children, task synchronization will be broken. +static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) { + kmp_tasking_flags_t flags = taskdata->td_flags; + bool ret = !(flags.team_serial || flags.tasking_ser); + ret = ret || flags.proxy == TASK_PROXY || + flags.detachable == TASK_DETACHABLE || flags.hidden_helper; + ret = ret || + KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0; +#if OMPX_TASKGRAPH + if (taskdata->td_taskgroup && taskdata->is_taskgraph) + ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0; +#endif + return ret; +} + +// __kmp_task_finish: bookkeeping to do when a task finishes execution +// +// gtid: global thread ID for calling thread +// task: task to be finished +// resumed_task: task to be resumed. (may be NULL if task is serialized) +// +// template: effectively ompt_enabled.enabled!=0 +// the version with ompt=false is inlined, allowing to optimize away all ompt +// code in this case +template +static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, + kmp_taskdata_t *resumed_task) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_task_team_t *task_team = + thread->th.th_task_team; // might be NULL for serial teams... 
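+  // For orientation (user-level view; start_async_io() is a hypothetical
+  // stand-in): the TASK_DETACHABLE handling further down corresponds to an
+  // OpenMP 5.0 detached task, whose completion needs both the end of the task
+  // body and a later omp_fulfill_event() call:
+  //
+  //   omp_event_handle_t ev;
+  //   #pragma omp task detach(ev)
+  //   { start_async_io(ev); }        // body ends, task not yet complete
+  //   ...
+  //   omp_fulfill_event(ev);         // dependences and taskwait now release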
+#if OMPX_TASKGRAPH + // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop + bool is_taskgraph; +#endif +#if KMP_DEBUG + kmp_int32 children = 0; +#endif + KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming " + "task %p\n", + gtid, taskdata, resumed_task)); + + KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); + +#if OMPX_TASKGRAPH + is_taskgraph = taskdata->is_taskgraph; +#endif + +// Pop task from stack if tied +#ifdef BUILD_TIED_TASK_STACK + if (taskdata->td_flags.tiedness == TASK_TIED) { + __kmp_pop_task_stack(gtid, thread, taskdata); + } +#endif /* BUILD_TIED_TASK_STACK */ + + if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { + // untied task needs to check the counter so that the task structure is not + // freed prematurely + kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1; + KA_TRACE( + 20, + ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n", + gtid, counter, taskdata)); + if (counter > 0) { + // untied task is not done, to be continued possibly by other thread, do + // not free it now + if (resumed_task == NULL) { + KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial); + resumed_task = taskdata->td_parent; // In a serialized task, the resumed + // task is the parent + } + thread->th.th_current_task = resumed_task; // restore current_task + resumed_task->td_flags.executing = 1; // resume previous task + KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, " + "resuming task %p\n", + gtid, taskdata, resumed_task)); + return; + } + } + + // bookkeeping for resuming task: + // GEH - note tasking_ser => task_serial + KMP_DEBUG_ASSERT( + (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == + taskdata->td_flags.task_serial); + if (taskdata->td_flags.task_serial) { + if (resumed_task == NULL) { + resumed_task = taskdata->td_parent; // In a serialized task, the resumed + // task is the parent + } + } else { + KMP_DEBUG_ASSERT(resumed_task != + NULL); // verify that resumed task is passed as argument + } + + /* If the tasks' destructor thunk flag has been set, we need to invoke the + destructor thunk that has been generated by the compiler. The code is + placed here, since at this point other tasks might have been released + hence overlapping the destructor invocations with some other work in the + released tasks. The OpenMP spec is not specific on when the destructors + are invoked, so we should be free to choose. */ + if (UNLIKELY(taskdata->td_flags.destructors_thunk)) { + kmp_routine_entry_t destr_thunk = task->data1.destructors; + KMP_ASSERT(destr_thunk); + destr_thunk(gtid, task); + } + + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + + bool completed = true; + if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) { + if (taskdata->td_allow_completion_event.type == + KMP_EVENT_ALLOW_COMPLETION) { + // event hasn't been fulfilled yet. Try to detach task. 
+ __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); + if (taskdata->td_allow_completion_event.type == + KMP_EVENT_ALLOW_COMPLETION) { + // task finished execution + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task + +#if OMPT_SUPPORT + // For a detached task, which is not completed, we switch back + // the omp_fulfill_event signals completion + // locking is necessary to avoid a race with ompt_task_late_fulfill + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_detach); +#endif + + // no access to taskdata after this point! + // __kmp_fulfill_event might free taskdata at any time from now + + taskdata->td_flags.proxy = TASK_PROXY; // proxify! + completed = false; + } + __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); + } + } + + // Tasks with valid target async handles must be re-enqueued. + if (taskdata->td_target_data.async_handle != NULL) { + // Note: no need to translate gtid to its shadow. If the current thread is a + // hidden helper one, then the gtid is already correct. Otherwise, hidden + // helper threads are disabled, and gtid refers to a OpenMP thread. + __kmpc_give_task(task, __kmp_tid_from_gtid(gtid)); + if (KMP_HIDDEN_HELPER_THREAD(gtid)) + __kmp_hidden_helper_worker_thread_signal(); + completed = false; + } + + if (completed) { + taskdata->td_flags.complete = 1; // mark the task as completed +#if OMPX_TASKGRAPH + taskdata->td_flags.onced = 1; // mark the task as ran once already +#endif + +#if OMPT_SUPPORT + // This is not a detached task, we are done here + if (ompt) + __ompt_task_finish(task, resumed_task, ompt_task_complete); +#endif + // TODO: What would be the balance between the conditions in the function + // and an atomic operation? + if (__kmp_track_children_task(taskdata)) { + __kmp_release_deps(gtid, taskdata); + // Predecrement simulated by "- 1" calculation +#if KMP_DEBUG + children = -1 + +#endif + KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); + KMP_DEBUG_ASSERT(children >= 0); +#if OMPX_TASKGRAPH + if (taskdata->td_taskgroup && !taskdata->is_taskgraph) +#else + if (taskdata->td_taskgroup) +#endif + KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); + } else if (task_team && (task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered)) { + // if we found proxy or hidden helper tasks there could exist a dependency + // chain with the proxy task as origin + __kmp_release_deps(gtid, taskdata); + } + // td_flags.executing must be marked as 0 after __kmp_release_deps has been + // called. Othertwise, if a task is executed immediately from the + // release_deps code, the flag will be reset to 1 again by this same + // function + KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); + taskdata->td_flags.executing = 0; // suspend the finishing task + + // Decrement the counter of hidden helper tasks to be executed. + if (taskdata->td_flags.hidden_helper) { + // Hidden helper tasks can only be executed by hidden helper threads. + KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid)); + KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks); + } + } + + KA_TRACE( + 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", + gtid, taskdata, children)); + + // Free this task and then ancestor tasks if they have no children. 
+ // Restore th_current_task first as suggested by John: + // johnmc: if an asynchronous inquiry peers into the runtime system + // it doesn't see the freed task as the current task. + thread->th.th_current_task = resumed_task; + if (completed) + __kmp_free_task_and_ancestors(gtid, taskdata, thread); + + // TODO: GEH - make sure root team implicit task is initialized properly. + // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 ); + resumed_task->td_flags.executing = 1; // resume previous task + +#if OMPX_TASKGRAPH + if (is_taskgraph && __kmp_track_children_task(taskdata) && + taskdata->td_taskgroup) { + // TDG: we only release taskgroup barrier here because + // free_task_and_ancestors will call + // __kmp_free_task, which resets all task parameters such as + // taskdata->started, etc. If we release the barrier earlier, these + // parameters could be read before being reset. This is not an issue for + // non-TDG implementation because we never reuse a task(data) structure + KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); + } +#endif + + KA_TRACE( + 10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n", + gtid, taskdata, resumed_task)); + + return; +} + +template +static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref, + kmp_int32 gtid, + kmp_task_t *task) { + KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n", + gtid, loc_ref, KMP_TASK_TO_TASKDATA(task))); + KMP_DEBUG_ASSERT(gtid >= 0); + // this routine will provide task to resume + __kmp_task_finish(gtid, task, NULL); + + KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n", + gtid, loc_ref, KMP_TASK_TO_TASKDATA(task))); + +#if OMPT_SUPPORT + if (ompt) { + ompt_frame_t *ompt_frame; + __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); + ompt_frame->enter_frame = ompt_data_none; + ompt_frame->enter_frame_flags = + ompt_frame_runtime | ompt_frame_framepointer; + } +#endif + + return; +} + +#if OMPT_SUPPORT +OMPT_NOINLINE +void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task) { + __kmpc_omp_task_complete_if0_template(loc_ref, gtid, task); +} +#endif // OMPT_SUPPORT + +// __kmpc_omp_task_complete_if0: report that a task has completed execution +// +// loc_ref: source location information; points to end of task block. +// gtid: global thread number. +// task: task thunk for the completed task. +void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task) { +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) { + __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task); + return; + } +#endif + __kmpc_omp_task_complete_if0_template(loc_ref, gtid, task); +} + +#ifdef TASK_UNUSED +// __kmpc_omp_task_complete: report that a task has completed execution +// NEVER GENERATED BY COMPILER, DEPRECATED!!! 
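+// (Illustrative note on the supported entry point __kmpc_omp_task_complete_if0
+// defined above; names such as `loc`, `gtid` and `entry` are placeholders.
+// For an undeferred task, e.g. `#pragma omp task if(0)`, the compiler
+// conceptually emits
+//
+//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1,
+//                                         sizeof_task, sizeof_shareds, &entry);
+//   __kmpc_omp_task_begin_if0(&loc, gtid, t);
+//   entry(gtid, t);                              // task body runs inline
+//   __kmpc_omp_task_complete_if0(&loc, gtid, t);
+//
+// so the completion bookkeeping above runs on the encountering thread.)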
+void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *task) { + KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid, + loc_ref, KMP_TASK_TO_TASKDATA(task))); + + __kmp_task_finish(gtid, task, + NULL); // Not sure how to find task to resume + + KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid, + loc_ref, KMP_TASK_TO_TASKDATA(task))); + return; +} +#endif // TASK_UNUSED + +// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit +// task for a given thread +// +// loc_ref: reference to source location of parallel region +// this_thr: thread data structure corresponding to implicit task +// team: team for this_thr +// tid: thread id of given thread within team +// set_curr_task: TRUE if need to push current task to thread +// NOTE: Routine does not set up the implicit task ICVS. This is assumed to +// have already been done elsewhere. +// TODO: Get better loc_ref. Value passed in may be NULL +void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, + kmp_team_t *team, int tid, int set_curr_task) { + kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid]; + + KF_TRACE( + 10, + ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n", + tid, team, task, set_curr_task ? "TRUE" : "FALSE")); + + task->td_task_id = KMP_GEN_TASK_ID(); + task->td_team = team; + // task->td_parent = NULL; // fix for CQ230101 (broken parent task info + // in debugger) + task->td_ident = loc_ref; + task->td_taskwait_ident = NULL; + task->td_taskwait_counter = 0; + task->td_taskwait_thread = 0; + + task->td_flags.tiedness = TASK_TIED; + task->td_flags.tasktype = TASK_IMPLICIT; + task->td_flags.proxy = TASK_FULL; + + // All implicit tasks are executed immediately, not deferred + task->td_flags.task_serial = 1; + task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); + task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; + + task->td_flags.started = 1; + task->td_flags.executing = 1; + task->td_flags.complete = 0; + task->td_flags.freed = 0; +#if OMPX_TASKGRAPH + task->td_flags.onced = 0; +#endif + + task->td_depnode = NULL; + task->td_last_tied = task; + task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED; + + if (set_curr_task) { // only do this init first time thread is created + KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0); + // Not used: don't need to deallocate implicit task + KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0); + task->td_taskgroup = NULL; // An implicit task does not have taskgroup + task->td_dephash = NULL; + __kmp_push_current_task_to_thread(this_thr, team, tid); + } else { + KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); + KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0); + } + +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_init(task, tid); +#endif + + KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid, + team, task)); +} + +// __kmp_finish_implicit_task: Release resources associated to implicit tasks +// at the end of parallel regions. Some resources are kept for reuse in the next +// parallel region. 
+// +// thread: thread data structure corresponding to implicit task +void __kmp_finish_implicit_task(kmp_info_t *thread) { + kmp_taskdata_t *task = thread->th.th_current_task; + if (task->td_dephash) { + int children; + task->td_flags.complete = 1; +#if OMPX_TASKGRAPH + task->td_flags.onced = 1; +#endif + children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks); + kmp_tasking_flags_t flags_old = task->td_flags; + if (children == 0 && flags_old.complete == 1) { + kmp_tasking_flags_t flags_new = flags_old; + flags_new.complete = 0; + if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags), + *RCAST(kmp_int32 *, &flags_old), + *RCAST(kmp_int32 *, &flags_new))) { + KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans " + "dephash of implicit task %p\n", + thread->th.th_info.ds.ds_gtid, task)); + __kmp_dephash_free_entries(thread, task->td_dephash); + } + } + } +} + +// __kmp_free_implicit_task: Release resources associated to implicit tasks +// when these are destroyed regions +// +// thread: thread data structure corresponding to implicit task +void __kmp_free_implicit_task(kmp_info_t *thread) { + kmp_taskdata_t *task = thread->th.th_current_task; + if (task && task->td_dephash) { + __kmp_dephash_free(thread, task->td_dephash); + task->td_dephash = NULL; + } +} + +// Round up a size to a power of two specified by val: Used to insert padding +// between structures co-allocated using a single malloc() call +static size_t __kmp_round_up_to_val(size_t size, size_t val) { + if (size & (val - 1)) { + size &= ~(val - 1); + if (size <= KMP_SIZE_T_MAX - val) { + size += val; // Round up if there is no overflow. + } + } + return size; +} // __kmp_round_up_to_va + +// __kmp_task_alloc: Allocate the taskdata and task data structures for a task +// +// loc_ref: source location information +// gtid: global thread number. +// flags: include tiedness & task type (explicit vs. implicit) of the ''new'' +// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine. +// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including +// private vars accessed in task. +// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed +// in task. +// task_entry: Pointer to task code entry point generated by compiler. +// returns: a pointer to the allocated kmp_task_t structure (task). +kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, + kmp_tasking_flags_t *flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + kmp_routine_entry_t task_entry) { + kmp_task_t *task; + kmp_taskdata_t *taskdata; + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_team_t *team = thread->th.th_team; + kmp_taskdata_t *parent_task = thread->th.th_current_task; + size_t shareds_offset; + + if (UNLIKELY(!TCR_4(__kmp_init_middle))) + __kmp_middle_initialize(); + + if (flags->hidden_helper) { + if (__kmp_enable_hidden_helper) { + if (!TCR_4(__kmp_init_hidden_helper)) + __kmp_hidden_helper_initialize(); + } else { + // If the hidden helper task is not enabled, reset the flag to FALSE. 
+ flags->hidden_helper = FALSE; + } + } + + KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) " + "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", + gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t, + sizeof_shareds, task_entry)); + + KMP_DEBUG_ASSERT(parent_task); + if (parent_task->td_flags.final) { + if (flags->merged_if0) { + } + flags->final = 1; + } + + if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) { + // Untied task encountered causes the TSC algorithm to check entire deque of + // the victim thread. If no untied task encountered, then checking the head + // of the deque should be enough. + KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1); + } + + // Detachable tasks are not proxy tasks yet but could be in the future. Doing + // the tasking setup + // when that happens is too late. + if (UNLIKELY(flags->proxy == TASK_PROXY || + flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) { + if (flags->proxy == TASK_PROXY) { + flags->tiedness = TASK_UNTIED; + flags->merged_if0 = 1; + } + /* are we running in a sequential parallel or tskm_immediate_exec... we need + tasking support enabled */ + if ((thread->th.th_task_team) == NULL) { + /* This should only happen if the team is serialized + setup a task team and propagate it to the thread */ + KMP_DEBUG_ASSERT(team->t.t_serialized); + KA_TRACE(30, + ("T#%d creating task team in __kmp_task_alloc for proxy task\n", + gtid)); + // 1 indicates setup the current team regardless of nthreads + __kmp_task_team_setup(thread, team, 1); + thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state]; + } + kmp_task_team_t *task_team = thread->th.th_task_team; + + /* tasking must be enabled now as the task might not be pushed */ + if (!KMP_TASKING_ENABLED(task_team)) { + KA_TRACE( + 30, + ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid)); + __kmp_enable_tasking(task_team, thread); + kmp_int32 tid = thread->th.th_info.ds.ds_tid; + kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; + // No lock needed since only owner can allocate + if (thread_data->td.td_deque == NULL) { + __kmp_alloc_task_deque(thread, thread_data); + } + } + + if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) && + task_team->tt.tt_found_proxy_tasks == FALSE) + TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE); + if (flags->hidden_helper && + task_team->tt.tt_hidden_helper_task_encountered == FALSE) + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE); + } + + // Calculate shared structure offset including padding after kmp_task_t struct + // to align pointers in shared struct + shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t; + shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *)); + + // Allocate a kmp_taskdata_t block and a kmp_task_t block. + KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid, + shareds_offset)); + KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid, + sizeof_shareds)); + + // Avoid double allocation here by combining shareds with taskdata +#if USE_FAST_MEMORY + taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset + + sizeof_shareds); +#else /* ! 
USE_FAST_MEMORY */ + taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset + + sizeof_shareds); +#endif /* USE_FAST_MEMORY */ + + task = KMP_TASKDATA_TO_TASK(taskdata); + +// Make sure task & taskdata are aligned appropriately +#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD + KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0); + KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0); +#else + KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0); + KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0); +#endif + if (sizeof_shareds > 0) { + // Avoid double allocation here by combining shareds with taskdata + task->shareds = &((char *)taskdata)[shareds_offset]; + // Make sure shareds struct is aligned to pointer size + KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == + 0); + } else { + task->shareds = NULL; + } + task->routine = task_entry; + task->part_id = 0; // AC: Always start with 0 part id + + taskdata->td_task_id = KMP_GEN_TASK_ID(); + taskdata->td_team = thread->th.th_team; + taskdata->td_alloc_thread = thread; + taskdata->td_parent = parent_task; + taskdata->td_level = parent_task->td_level + 1; // increment nesting level + KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); + taskdata->td_ident = loc_ref; + taskdata->td_taskwait_ident = NULL; + taskdata->td_taskwait_counter = 0; + taskdata->td_taskwait_thread = 0; + KMP_DEBUG_ASSERT(taskdata->td_parent != NULL); + // avoid copying icvs for proxy tasks + if (flags->proxy == TASK_FULL) + copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); + + taskdata->td_flags = *flags; + taskdata->td_task_team = thread->th.th_task_team; + taskdata->td_size_alloc = shareds_offset + sizeof_shareds; + taskdata->td_flags.tasktype = TASK_EXPLICIT; + // If it is hidden helper task, we need to set the team and task team + // correspondingly. + if (flags->hidden_helper) { + kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; + taskdata->td_team = shadow_thread->th.th_team; + taskdata->td_task_team = shadow_thread->th.th_task_team; + } + + // GEH - TODO: fix this to copy parent task's value of tasking_ser flag + taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); + + // GEH - TODO: fix this to copy parent task's value of team_serial flag + taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; + + // GEH - Note we serialize the task if the team is serialized to make sure + // implicit parallel region tasks are not left until program termination to + // execute. Also, it helps locality to execute immediately. 
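+  //
+  // For illustration (hypothetical user code, not part of the runtime):
+  // setting task_serial below means the task is executed immediately by the
+  // encountering thread rather than deferred, e.g. when the team is
+  // serialized or when the parent is a final task:
+  //
+  //   #pragma omp task final(1)
+  //   {
+  //     #pragma omp task   // descendant of a final task: runs undeferred
+  //     do_work();
+  //   }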
+ + taskdata->td_flags.task_serial = + (parent_task->td_flags.final || taskdata->td_flags.team_serial || + taskdata->td_flags.tasking_ser || flags->merged_if0); + + taskdata->td_flags.started = 0; + taskdata->td_flags.executing = 0; + taskdata->td_flags.complete = 0; + taskdata->td_flags.freed = 0; +#if OMPX_TASKGRAPH + taskdata->td_flags.onced = 0; +#endif + KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); + // start at one because counts current task and children + KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); + taskdata->td_taskgroup = + parent_task->td_taskgroup; // task inherits taskgroup from the parent task + taskdata->td_dephash = NULL; + taskdata->td_depnode = NULL; + taskdata->td_target_data.async_handle = NULL; + if (flags->tiedness == TASK_UNTIED) + taskdata->td_last_tied = NULL; // will be set when the task is scheduled + else + taskdata->td_last_tied = taskdata; + taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED; +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_init(taskdata, gtid); +#endif + // TODO: What would be the balance between the conditions in the function and + // an atomic operation? + if (__kmp_track_children_task(taskdata)) { + KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); + if (parent_task->td_taskgroup) + KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); + // Only need to keep track of allocated child tasks for explicit tasks since + // implicit not deallocated + if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) { + KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); + } + if (flags->hidden_helper) { + taskdata->td_flags.task_serial = FALSE; + // Increment the number of hidden helper tasks to be executed + KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks); + } + } + +#if OMPX_TASKGRAPH + kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) && + (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) { + taskdata->is_taskgraph = 1; + taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; + taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id); + } +#endif + KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n", + gtid, taskdata, taskdata->td_parent)); + + return task; +} + +kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 flags, size_t sizeof_kmp_task_t, + size_t sizeof_shareds, + kmp_routine_entry_t task_entry) { + kmp_task_t *retval; + kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; + __kmp_assert_valid_gtid(gtid); + input_flags->native = FALSE; + // __kmp_task_alloc() sets up all other runtime flags + KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) " + "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", + gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", + input_flags->proxy ? "proxy" : "", + input_flags->detachable ? 
"detachable" : "", sizeof_kmp_task_t, + sizeof_shareds, task_entry)); + + retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t, + sizeof_shareds, task_entry); + + KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval)); + + return retval; +} + +kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 flags, + size_t sizeof_kmp_task_t, + size_t sizeof_shareds, + kmp_routine_entry_t task_entry, + kmp_int64 device_id) { + auto &input_flags = reinterpret_cast(flags); + // target task is untied defined in the specification + input_flags.tiedness = TASK_UNTIED; + + if (__kmp_enable_hidden_helper) + input_flags.hidden_helper = TRUE; + + return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, + sizeof_shareds, task_entry); +} + +/*! +@ingroup TASKING +@param loc_ref location of the original task directive +@param gtid Global Thread ID of encountering thread +@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new +task'' +@param naffins Number of affinity items +@param affin_list List of affinity items +@return Returns non-zero if registering affinity information was not successful. + Returns 0 if registration was successful +This entry registers the affinity information attached to a task with the task +thunk structure kmp_taskdata_t. +*/ +kmp_int32 +__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task, kmp_int32 naffins, + kmp_task_affinity_info_t *affin_list) { + return 0; +} + +// __kmp_invoke_task: invoke the specified task +// +// gtid: global thread ID of caller +// task: the task to invoke +// current_task: the task to resume after task invocation +#ifdef __s390x__ +__attribute__((target("backchain"))) +#endif +static void +__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, + kmp_taskdata_t *current_task) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + kmp_info_t *thread; + int discard = 0 /* false */; + KA_TRACE( + 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", + gtid, taskdata, current_task)); + KMP_DEBUG_ASSERT(task); + if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY && + taskdata->td_flags.complete == 1)) { + // This is a proxy task that was already completed but it needs to run + // its bottom-half finish + KA_TRACE( + 30, + ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n", + gtid, taskdata)); + + __kmp_bottom_half_finish_proxy(gtid, task); + + KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for " + "proxy task %p, resuming task %p\n", + gtid, taskdata, current_task)); + + return; + } + +#if OMPT_SUPPORT + // For untied tasks, the first task executed only calls __kmpc_omp_task and + // does not execute code. + ompt_thread_info_t oldInfo; + if (UNLIKELY(ompt_enabled.enabled)) { + // Store the threads states and restore them after the task + thread = __kmp_threads[gtid]; + oldInfo = thread->th.ompt_thread_info; + thread->th.ompt_thread_info.wait_id = 0; + thread->th.ompt_thread_info.state = (thread->th.th_team_serialized) + ? 
ompt_state_work_serial + : ompt_state_work_parallel; + taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + } +#endif + + // Proxy tasks are not handled by the runtime + if (taskdata->td_flags.proxy != TASK_PROXY) { + __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded + } + + // TODO: cancel tasks if the parallel region has also been cancelled + // TODO: check if this sequence can be hoisted above __kmp_task_start + // if cancellation has been enabled for this run ... + if (UNLIKELY(__kmp_omp_cancellation)) { + thread = __kmp_threads[gtid]; + kmp_team_t *this_team = thread->th.th_team; + kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; + if ((taskgroup && taskgroup->cancel_request) || + (this_team->t.t_cancel_request == cancel_parallel)) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_data_t *task_data; + if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); + ompt_callbacks.ompt_callback(ompt_callback_cancel)( + task_data, + ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup + : ompt_cancel_parallel) | + ompt_cancel_discarded_task, + NULL); + } +#endif + KMP_COUNT_BLOCK(TASK_cancelled); + // this task belongs to a task group and we need to cancel it + discard = 1 /* true */; + } + } + + // Invoke the task routine and pass in relevant data. + // Thunks generated by gcc take a different argument list. + if (!discard) { + if (taskdata->td_flags.tiedness == TASK_UNTIED) { + taskdata->td_last_tied = current_task->td_last_tied; + KMP_DEBUG_ASSERT(taskdata->td_last_tied); + } +#if KMP_STATS_ENABLED + KMP_COUNT_BLOCK(TASK_executed); + switch (KMP_GET_THREAD_STATE()) { + case FORK_JOIN_BARRIER: + KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); + break; + case PLAIN_BARRIER: + KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); + break; + case TASKYIELD: + KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); + break; + case TASKWAIT: + KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); + break; + case TASKGROUP: + KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); + break; + default: + KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); + break; + } +#endif // KMP_STATS_ENABLED + +// OMPT task begin +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_start(task, current_task, gtid); +#endif +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (UNLIKELY(ompt_enabled.ompt_callback_dispatch && + taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) { + ompt_data_t instance = ompt_data_none; + instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk); + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_callbacks.ompt_callback(ompt_callback_dispatch)( + &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data), + ompt_dispatch_taskloop_chunk, instance); + taskdata->ompt_task_info.dispatch_chunk = {0, 0}; + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL + +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_task_begin(); +#endif + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + kmp_uint64 cur_time; + kmp_int32 kmp_itt_count_task = + __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial && + current_task->td_flags.tasktype == TASK_IMPLICIT; + if (kmp_itt_count_task) { + thread = __kmp_threads[gtid]; + // Time outer level explicit task on barrier for adjusting imbalance time + if (thread->th.th_bar_arrive_time) + cur_time = __itt_get_timestamp(); + else + kmp_itt_count_task = 0; // thread is not on a barrier - skip timing + } + 
KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task) +#endif + +#if ENABLE_LIBOMPTARGET + if (taskdata->td_target_data.async_handle != NULL) { + // If we have a valid target async handle, that means that we have already + // executed the task routine once. We must query for the handle completion + // instead of re-executing the routine. + KMP_ASSERT(tgt_target_nowait_query); + tgt_target_nowait_query(&taskdata->td_target_data.async_handle); + } else +#endif + if (task->routine != NULL) { +#ifdef KMP_GOMP_COMPAT + if (taskdata->td_flags.native) { + ((void (*)(void *))(*(task->routine)))(task->shareds); + } else +#endif /* KMP_GOMP_COMPAT */ + { + (*(task->routine))(gtid, task); + } + } + KMP_POP_PARTITIONED_TIMER(); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (kmp_itt_count_task) { + // Barrier imbalance - adjust arrive time with the task duration + thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); + } + KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed) + KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent +#endif + } + +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_task_end(); +#endif + + // Proxy tasks are not handled by the runtime + if (taskdata->td_flags.proxy != TASK_PROXY) { +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) { + thread->th.ompt_thread_info = oldInfo; + if (taskdata->td_flags.tiedness == TASK_TIED) { + taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; + } + __kmp_task_finish(gtid, task, current_task); + } else +#endif + __kmp_task_finish(gtid, task, current_task); + } + + KA_TRACE( + 30, + ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", + gtid, taskdata, current_task)); + return; +} + +// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution +// +// loc_ref: location of original task pragma (ignored) +// gtid: Global Thread ID of encountering thread +// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' +// Returns: +// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to +// be resumed later. +// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be +// resumed later. +kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task) { + kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); + + KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, + loc_ref, new_taskdata)); + +#if OMPT_SUPPORT + kmp_taskdata_t *parent; + if (UNLIKELY(ompt_enabled.enabled)) { + parent = new_taskdata->td_parent; + if (ompt_enabled.ompt_callback_task_create) { + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), + &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, + OMPT_GET_RETURN_ADDRESS(0)); + } + } +#endif + + /* Should we execute the new task or queue it? For now, let's just always try + to queue it. If the queue fills up, then we'll execute it. 
*/ + + if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer + { // Execute this task immediately + kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; + new_taskdata->td_flags.task_serial = 1; + __kmp_invoke_task(gtid, new_task, current_task); + } + + KA_TRACE( + 10, + ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " + "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", + gtid, loc_ref, new_taskdata)); + +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) { + parent->ompt_task_info.frame.enter_frame = ompt_data_none; + } +#endif + return TASK_CURRENT_NOT_QUEUED; +} + +// __kmp_omp_task: Schedule a non-thread-switchable task for execution +// +// gtid: Global Thread ID of encountering thread +// new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() +// serialize_immediate: if TRUE then if the task is executed immediately its +// execution will be serialized +// Returns: +// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to +// be resumed later. +// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be +// resumed later. +kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, + bool serialize_immediate) { + kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); + +#if OMPX_TASKGRAPH + if (new_taskdata->is_taskgraph && + __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) { + kmp_tdg_info_t *tdg = new_taskdata->tdg; + // extend the record_map if needed + if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) { + __kmp_acquire_bootstrap_lock(&tdg->graph_lock); + // map_size could have been updated by another thread if recursive + // taskloop + if (new_taskdata->td_task_id >= tdg->map_size) { + kmp_uint old_size = tdg->map_size; + kmp_uint new_size = old_size * 2; + kmp_node_info_t *old_record = tdg->record_map; + kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate( + new_size * sizeof(kmp_node_info_t)); + + KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t)); + tdg->record_map = new_record; + + __kmp_free(old_record); + + for (kmp_int i = old_size; i < new_size; i++) { + kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate( + __kmp_successors_size * sizeof(kmp_int32)); + new_record[i].task = nullptr; + new_record[i].successors = successorsList; + new_record[i].nsuccessors = 0; + new_record[i].npredecessors = 0; + new_record[i].successors_size = __kmp_successors_size; + KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0); + } + // update the size at the end, so that we avoid other + // threads use old_record while map_size is already updated + tdg->map_size = new_size; + } + __kmp_release_bootstrap_lock(&tdg->graph_lock); + } + // record a task + if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) { + tdg->record_map[new_taskdata->td_task_id].task = new_task; + tdg->record_map[new_taskdata->td_task_id].parent_task = + new_taskdata->td_parent; + KMP_ATOMIC_INC(&tdg->num_tasks); + } + } +#endif + + /* Should we execute the new task or queue it? For now, let's just always try + to queue it. If the queue fills up, then we'll execute it. 
*/ + if (new_taskdata->td_flags.proxy == TASK_PROXY || + __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer + { // Execute this task immediately + kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; + if (serialize_immediate) + new_taskdata->td_flags.task_serial = 1; + __kmp_invoke_task(gtid, new_task, current_task); + } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && + __kmp_wpolicy_passive) { + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + kmp_int32 nthreads = this_thr->th.th_team_nproc; + for (int i = 0; i < nthreads; ++i) { + kmp_info_t *thread = team->t.t_threads[i]; + if (thread == this_thr) + continue; + if (thread->th.th_sleep_loc != NULL) { + __kmp_null_resume_wrapper(thread); + break; // awake one thread at a time + } + } + } + return TASK_CURRENT_NOT_QUEUED; +} + +// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a +// non-thread-switchable task from the parent thread only! +// +// loc_ref: location of original task pragma (ignored) +// gtid: Global Thread ID of encountering thread +// new_task: non-thread-switchable task thunk allocated by +// __kmp_omp_task_alloc() +// Returns: +// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to +// be resumed later. +// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be +// resumed later. +kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task) { + kmp_int32 res; + KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); + +#if KMP_DEBUG || OMPT_SUPPORT + kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); +#endif + KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, + new_taskdata)); + __kmp_assert_valid_gtid(gtid); + +#if OMPT_SUPPORT + kmp_taskdata_t *parent = NULL; + if (UNLIKELY(ompt_enabled.enabled)) { + if (!new_taskdata->td_flags.started) { + OMPT_STORE_RETURN_ADDRESS(gtid); + parent = new_taskdata->td_parent; + if (!parent->ompt_task_info.frame.enter_frame.ptr) { + parent->ompt_task_info.frame.enter_frame.ptr = + OMPT_GET_FRAME_ADDRESS(0); + } + if (ompt_enabled.ompt_callback_task_create) { + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + &(parent->ompt_task_info.task_data), + &(parent->ompt_task_info.frame), + &(new_taskdata->ompt_task_info.task_data), + ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, + OMPT_LOAD_RETURN_ADDRESS(gtid)); + } + } else { + // We are scheduling the continuation of an UNTIED task. + // Scheduling back to the parent task. 
+ __ompt_task_finish(new_task, + new_taskdata->ompt_task_info.scheduling_parent, + ompt_task_switch); + new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; + } + } +#endif + + res = __kmp_omp_task(gtid, new_task, true); + + KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " + "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", + gtid, loc_ref, new_taskdata)); +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { + parent->ompt_task_info.frame.enter_frame = ompt_data_none; + } +#endif + return res; +} + +// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule +// a taskloop task with the correct OMPT return address +// +// loc_ref: location of original task pragma (ignored) +// gtid: Global Thread ID of encountering thread +// new_task: non-thread-switchable task thunk allocated by +// __kmp_omp_task_alloc() +// codeptr_ra: return address for OMPT callback +// Returns: +// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to +// be resumed later. +// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be +// resumed later. +kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task, void *codeptr_ra) { + kmp_int32 res; + KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); + +#if KMP_DEBUG || OMPT_SUPPORT + kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); +#endif + KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, + new_taskdata)); + +#if OMPT_SUPPORT + kmp_taskdata_t *parent = NULL; + if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { + parent = new_taskdata->td_parent; + if (!parent->ompt_task_info.frame.enter_frame.ptr) + parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); + if (ompt_enabled.ompt_callback_task_create) { + ompt_callbacks.ompt_callback(ompt_callback_task_create)( + &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), + &(new_taskdata->ompt_task_info.task_data), + ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, + codeptr_ra); + } + } +#endif + + res = __kmp_omp_task(gtid, new_task, true); + + KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " + "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", + gtid, loc_ref, new_taskdata)); +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { + parent->ompt_task_info.frame.enter_frame = ompt_data_none; + } +#endif + return res; +} + +template +static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, + void *frame_address, + void *return_address) { + kmp_taskdata_t *taskdata = nullptr; + kmp_info_t *thread; + int thread_finished = FALSE; + KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); + + KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); + KMP_DEBUG_ASSERT(gtid >= 0); + + if (__kmp_tasking_mode != tskm_immediate_exec) { + thread = __kmp_threads[gtid]; + taskdata = thread->th.th_current_task; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_data_t *my_task_data; + ompt_data_t *my_parallel_data; + + if (ompt) { + my_task_data = &(taskdata->ompt_task_info.task_data); + my_parallel_data = OMPT_CUR_TEAM_DATA(thread); + + taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; + + if (ompt_enabled.ompt_callback_sync_region) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, + my_task_data, return_address); + } + + if (ompt_enabled.ompt_callback_sync_region_wait) { + 
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, + my_task_data, return_address); + } + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL + +// Debugger: The taskwait is active. Store location and thread encountered the +// taskwait. +#if USE_ITT_BUILD +// Note: These values are used by ITT events as well. +#endif /* USE_ITT_BUILD */ + taskdata->td_taskwait_counter += 1; + taskdata->td_taskwait_ident = loc_ref; + taskdata->td_taskwait_thread = gtid + 1; + +#if USE_ITT_BUILD + void *itt_sync_obj = NULL; +#if USE_ITT_NOTIFY + KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); +#endif /* USE_ITT_NOTIFY */ +#endif /* USE_ITT_BUILD */ + + bool must_wait = + !taskdata->td_flags.team_serial && !taskdata->td_flags.final; + + must_wait = must_wait || (thread->th.th_task_team != NULL && + thread->th.th_task_team->tt.tt_found_proxy_tasks); + // If hidden helper thread is encountered, we must enable wait here. + must_wait = + must_wait || + (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL && + thread->th.th_task_team->tt.tt_hidden_helper_task_encountered); + + if (must_wait) { + kmp_flag_32 flag( + RCAST(std::atomic *, + &(taskdata->td_incomplete_child_tasks)), + 0U); + while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { + flag.execute_tasks(thread, gtid, FALSE, + &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), + __kmp_task_stealing_constraint); + } + } +#if USE_ITT_BUILD + KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); + KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children +#endif /* USE_ITT_BUILD */ + + // Debugger: The taskwait is completed. Location remains, but thread is + // negated. + taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt) { + if (ompt_enabled.ompt_callback_sync_region_wait) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, + my_task_data, return_address); + } + if (ompt_enabled.ompt_callback_sync_region) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, + my_task_data, return_address); + } + taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; + } +#endif // OMPT_SUPPORT && OMPT_OPTIONAL + } + + KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " + "returning TASK_CURRENT_NOT_QUEUED\n", + gtid, taskdata)); + + return TASK_CURRENT_NOT_QUEUED; +} + +#if OMPT_SUPPORT && OMPT_OPTIONAL +OMPT_NOINLINE +static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, + void *frame_address, + void *return_address) { + return __kmpc_omp_taskwait_template(loc_ref, gtid, frame_address, + return_address); +} +#endif // OMPT_SUPPORT && OMPT_OPTIONAL + +// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are +// complete +kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (UNLIKELY(ompt_enabled.enabled)) { + OMPT_STORE_RETURN_ADDRESS(gtid); + return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), + OMPT_LOAD_RETURN_ADDRESS(gtid)); + } +#endif + return __kmpc_omp_taskwait_template(loc_ref, gtid, NULL, NULL); +} + +// __kmpc_omp_taskyield: switch to a different task +kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { + kmp_taskdata_t *taskdata = NULL; + kmp_info_t *thread; + int thread_finished = FALSE; + + 
KMP_COUNT_BLOCK(OMP_TASKYIELD); + KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); + + KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", + gtid, loc_ref, end_part)); + __kmp_assert_valid_gtid(gtid); + + if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { + thread = __kmp_threads[gtid]; + taskdata = thread->th.th_current_task; +// Should we model this as a task wait or not? +// Debugger: The taskwait is active. Store location and thread encountered the +// taskwait. +#if USE_ITT_BUILD +// Note: These values are used by ITT events as well. +#endif /* USE_ITT_BUILD */ + taskdata->td_taskwait_counter += 1; + taskdata->td_taskwait_ident = loc_ref; + taskdata->td_taskwait_thread = gtid + 1; + +#if USE_ITT_BUILD + void *itt_sync_obj = NULL; +#if USE_ITT_NOTIFY + KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); +#endif /* USE_ITT_NOTIFY */ +#endif /* USE_ITT_BUILD */ + if (!taskdata->td_flags.team_serial) { + kmp_task_team_t *task_team = thread->th.th_task_team; + if (task_team != NULL) { + if (KMP_TASKING_ENABLED(task_team)) { +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) + thread->th.ompt_thread_info.ompt_task_yielded = 1; +#endif + __kmp_execute_tasks_32( + thread, gtid, (kmp_flag_32<> *)NULL, FALSE, + &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), + __kmp_task_stealing_constraint); +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) + thread->th.ompt_thread_info.ompt_task_yielded = 0; +#endif + } + } + } +#if USE_ITT_BUILD + KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); +#endif /* USE_ITT_BUILD */ + + // Debugger: The taskwait is completed. Location remains, but thread is + // negated. + taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; + } + + KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " + "returning TASK_CURRENT_NOT_QUEUED\n", + gtid, taskdata)); + + return TASK_CURRENT_NOT_QUEUED; +} + +// Task Reduction implementation +// +// Note: initial implementation didn't take into account the possibility +// to specify omp_orig for initializer of the UDR (user defined reduction). +// Corrected implementation takes into account the omp_orig object. +// Compiler is free to use old implementation if omp_orig is not specified. + +/*! +@ingroup BASIC_TYPES +@{ +*/ + +/*! +Flags for special info per task reduction item. +*/ +typedef struct kmp_taskred_flags { + /*! 1 - use lazy alloc/init (e.g. big objects, num tasks < num threads) */ + unsigned lazy_priv : 1; + unsigned reserved31 : 31; +} kmp_taskred_flags_t; + +/*! +Internal struct for reduction data item related info set up by compiler. +*/ +typedef struct kmp_task_red_input { + void *reduce_shar; /**< shared between tasks item to reduce into */ + size_t reduce_size; /**< size of data item in bytes */ + // three compiler-generated routines (init, fini are optional): + void *reduce_init; /**< data initialization routine (single parameter) */ + void *reduce_fini; /**< data finalization routine */ + void *reduce_comb; /**< data combiner routine */ + kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ +} kmp_task_red_input_t; + +/*! +Internal struct for reduction data item related info saved by the library. 
+*/ +typedef struct kmp_taskred_data { + void *reduce_shar; /**< shared between tasks item to reduce into */ + size_t reduce_size; /**< size of data item */ + kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ + void *reduce_priv; /**< array of thread specific items */ + void *reduce_pend; /**< end of private data for faster comparison op */ + // three compiler-generated routines (init, fini are optional): + void *reduce_comb; /**< data combiner routine */ + void *reduce_init; /**< data initialization routine (two parameters) */ + void *reduce_fini; /**< data finalization routine */ + void *reduce_orig; /**< original item (can be used in UDR initializer) */ +} kmp_taskred_data_t; + +/*! +Internal struct for reduction data item related info set up by compiler. + +New interface: added reduce_orig field to provide omp_orig for UDR initializer. +*/ +typedef struct kmp_taskred_input { + void *reduce_shar; /**< shared between tasks item to reduce into */ + void *reduce_orig; /**< original reduction item used for initialization */ + size_t reduce_size; /**< size of data item */ + // three compiler-generated routines (init, fini are optional): + void *reduce_init; /**< data initialization routine (two parameters) */ + void *reduce_fini; /**< data finalization routine */ + void *reduce_comb; /**< data combiner routine */ + kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ +} kmp_taskred_input_t; +/*! +@} +*/ + +template void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); +template <> +void __kmp_assign_orig(kmp_taskred_data_t &item, + kmp_task_red_input_t &src) { + item.reduce_orig = NULL; +} +template <> +void __kmp_assign_orig(kmp_taskred_data_t &item, + kmp_taskred_input_t &src) { + if (src.reduce_orig != NULL) { + item.reduce_orig = src.reduce_orig; + } else { + item.reduce_orig = src.reduce_shar; + } // non-NULL reduce_orig means new interface used +} + +template void __kmp_call_init(kmp_taskred_data_t &item, size_t j); +template <> +void __kmp_call_init(kmp_taskred_data_t &item, + size_t offset) { + ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); +} +template <> +void __kmp_call_init(kmp_taskred_data_t &item, + size_t offset) { + ((void (*)(void *, void *))item.reduce_init)( + (char *)(item.reduce_priv) + offset, item.reduce_orig); +} + +template +void *__kmp_task_reduction_init(int gtid, int num, T *data) { + __kmp_assert_valid_gtid(gtid); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; + kmp_uint32 nth = thread->th.th_team_nproc; + kmp_taskred_data_t *arr; + + // check input data just in case + KMP_ASSERT(tg != NULL); + KMP_ASSERT(data != NULL); + KMP_ASSERT(num > 0); + if (nth == 1 && !__kmp_enable_hidden_helper) { + KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", + gtid, tg)); + return (void *)tg; + } + KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", + gtid, tg, num)); + arr = (kmp_taskred_data_t *)__kmp_thread_malloc( + thread, num * sizeof(kmp_taskred_data_t)); + for (int i = 0; i < num; ++i) { + size_t size = data[i].reduce_size - 1; + // round the size up to cache line per thread-specific item + size += CACHE_LINE - size % CACHE_LINE; + KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory + arr[i].reduce_shar = data[i].reduce_shar; + arr[i].reduce_size = size; + arr[i].flags = data[i].flags; + arr[i].reduce_comb = data[i].reduce_comb; + arr[i].reduce_init = 
data[i].reduce_init; + arr[i].reduce_fini = data[i].reduce_fini; + __kmp_assign_orig(arr[i], data[i]); + if (!arr[i].flags.lazy_priv) { + // allocate cache-line aligned block and fill it with zeros + arr[i].reduce_priv = __kmp_allocate(nth * size); + arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; + if (arr[i].reduce_init != NULL) { + // initialize all thread-specific items + for (size_t j = 0; j < nth; ++j) { + __kmp_call_init(arr[i], j * size); + } + } + } else { + // only allocate space for pointers now, + // objects will be lazily allocated/initialized if/when requested + // note that __kmp_allocate zeroes the allocated memory + arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); + } + } + tg->reduce_data = (void *)arr; + tg->reduce_num_data = num; + return (void *)tg; +} + +/*! +@ingroup TASKING +@param gtid Global thread ID +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for the taskgroup. + +Note: this entry supposes the optional compiler-generated initializer routine +has single parameter - pointer to object to be initialized. That means +the reduction either does not use omp_orig object, or the omp_orig is accessible +without help of the runtime library. +*/ +void *__kmpc_task_reduction_init(int gtid, int num, void *data) { +#if OMPX_TASKGRAPH + kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) { + kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; + this_tdg->rec_taskred_data = + __kmp_allocate(sizeof(kmp_task_red_input_t) * num); + this_tdg->rec_num_taskred = num; + KMP_MEMCPY(this_tdg->rec_taskred_data, data, + sizeof(kmp_task_red_input_t) * num); + } +#endif + return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); +} + +/*! +@ingroup TASKING +@param gtid Global thread ID +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for the taskgroup. + +Note: this entry supposes the optional compiler-generated initializer routine +has two parameters, pointer to object to be initialized and pointer to omp_orig +*/ +void *__kmpc_taskred_init(int gtid, int num, void *data) { +#if OMPX_TASKGRAPH + kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) { + kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; + this_tdg->rec_taskred_data = + __kmp_allocate(sizeof(kmp_task_red_input_t) * num); + this_tdg->rec_num_taskred = num; + KMP_MEMCPY(this_tdg->rec_taskred_data, data, + sizeof(kmp_task_red_input_t) * num); + } +#endif + return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); +} + +// Copy task reduction data (except for shared pointers). 
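+//
+// A rough sketch of how the compiler drives these reduction entries
+// (identifiers such as `loc`, `gtid` and `red_input` are placeholders,
+// not upstream code):
+//
+//   #pragma omp taskgroup task_reduction(+ : x)
+//   { /* tasks created here use in_reduction(+ : x) */ }
+//
+// becomes, conceptually,
+//
+//   __kmpc_taskgroup(&loc, gtid);
+//   void *tg = __kmpc_taskred_init(gtid, 1, red_input); // kmp_taskred_input_t[1]
+//   // each participating task fetches its private copy of x:
+//   void *p = __kmpc_task_reduction_get_th_data(gtid, tg, &x);
+//   __kmpc_end_taskgroup(&loc, gtid); // combines private copies back into x
+//
+// The copy helper below shares this metadata with the other threads of the
+// team in the reduction-modifier (parallel/worksharing) case.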
+template +void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, + kmp_taskgroup_t *tg, void *reduce_data) { + kmp_taskred_data_t *arr; + KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," + " from data %p\n", + thr, tg, reduce_data)); + arr = (kmp_taskred_data_t *)__kmp_thread_malloc( + thr, num * sizeof(kmp_taskred_data_t)); + // threads will share private copies, thunk routines, sizes, flags, etc.: + KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); + for (int i = 0; i < num; ++i) { + arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers + } + tg->reduce_data = (void *)arr; + tg->reduce_num_data = num; +} + +/*! +@ingroup TASKING +@param gtid Global thread ID +@param tskgrp The taskgroup ID (optional) +@param data Shared location of the item +@return The pointer to per-thread data + +Get thread-specific location of data item +*/ +void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { + __kmp_assert_valid_gtid(gtid); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_int32 nth = thread->th.th_team_nproc; + if (nth == 1) + return data; // nothing to do + + kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; + if (tg == NULL) + tg = thread->th.th_current_task->td_taskgroup; + KMP_ASSERT(tg != NULL); + kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); + kmp_int32 num = tg->reduce_num_data; + kmp_int32 tid = thread->th.th_info.ds.ds_tid; + +#if OMPX_TASKGRAPH + if ((thread->th.th_current_task->is_taskgraph) && + (!__kmp_tdg_is_recording( + __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) { + tg = thread->th.th_current_task->td_taskgroup; + KMP_ASSERT(tg != NULL); + KMP_ASSERT(tg->reduce_data != NULL); + arr = (kmp_taskred_data_t *)(tg->reduce_data); + num = tg->reduce_num_data; + } +#endif + + KMP_ASSERT(data != NULL); + while (tg != NULL) { + for (int i = 0; i < num; ++i) { + if (!arr[i].flags.lazy_priv) { + if (data == arr[i].reduce_shar || + (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) + return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; + } else { + // check shared location first + void **p_priv = (void **)(arr[i].reduce_priv); + if (data == arr[i].reduce_shar) + goto found; + // check if we get some thread specific location as parameter + for (int j = 0; j < nth; ++j) + if (data == p_priv[j]) + goto found; + continue; // not found, continue search + found: + if (p_priv[tid] == NULL) { + // allocate thread specific object lazily + p_priv[tid] = __kmp_allocate(arr[i].reduce_size); + if (arr[i].reduce_init != NULL) { + if (arr[i].reduce_orig != NULL) { // new interface + ((void (*)(void *, void *))arr[i].reduce_init)( + p_priv[tid], arr[i].reduce_orig); + } else { // old interface (single parameter) + ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); + } + } + } + return p_priv[tid]; + } + } + KMP_ASSERT(tg->parent); + tg = tg->parent; + arr = (kmp_taskred_data_t *)(tg->reduce_data); + num = tg->reduce_num_data; + } + KMP_ASSERT2(0, "Unknown task reduction item"); + return NULL; // ERROR, this line never executed +} + +// Finalize task reduction. 
+// Called from __kmpc_end_taskgroup() +static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { + kmp_int32 nth = th->th.th_team_nproc; + KMP_DEBUG_ASSERT( + nth > 1 || + __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we + // are using hidden helper threads + kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; + kmp_int32 num = tg->reduce_num_data; + for (int i = 0; i < num; ++i) { + void *sh_data = arr[i].reduce_shar; + void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); + void (*f_comb)(void *, void *) = + (void (*)(void *, void *))(arr[i].reduce_comb); + if (!arr[i].flags.lazy_priv) { + void *pr_data = arr[i].reduce_priv; + size_t size = arr[i].reduce_size; + for (int j = 0; j < nth; ++j) { + void *priv_data = (char *)pr_data + j * size; + f_comb(sh_data, priv_data); // combine results + if (f_fini) + f_fini(priv_data); // finalize if needed + } + } else { + void **pr_data = (void **)(arr[i].reduce_priv); + for (int j = 0; j < nth; ++j) { + if (pr_data[j] != NULL) { + f_comb(sh_data, pr_data[j]); // combine results + if (f_fini) + f_fini(pr_data[j]); // finalize if needed + __kmp_free(pr_data[j]); + } + } + } + __kmp_free(arr[i].reduce_priv); + } + __kmp_thread_free(th, arr); + tg->reduce_data = NULL; + tg->reduce_num_data = 0; +} + +// Cleanup task reduction data for parallel or worksharing, +// do not touch task private data other threads still working with. +// Called from __kmpc_end_taskgroup() +static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { + __kmp_thread_free(th, tg->reduce_data); + tg->reduce_data = NULL; + tg->reduce_num_data = 0; +} + +template +void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, + int num, T *data) { + __kmp_assert_valid_gtid(gtid); + kmp_info_t *thr = __kmp_threads[gtid]; + kmp_int32 nth = thr->th.th_team_nproc; + __kmpc_taskgroup(loc, gtid); // form new taskgroup first + if (nth == 1) { + KA_TRACE(10, + ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", + gtid, thr->th.th_current_task->td_taskgroup)); + return (void *)thr->th.th_current_task->td_taskgroup; + } + kmp_team_t *team = thr->th.th_team; + void *reduce_data; + kmp_taskgroup_t *tg; + reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); + if (reduce_data == NULL && + __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, + (void *)1)) { + // single thread enters this block to initialize common reduction data + KMP_DEBUG_ASSERT(reduce_data == NULL); + // first initialize own data, then make a copy other threads can use + tg = (kmp_taskgroup_t *)__kmp_task_reduction_init(gtid, num, data); + reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); + KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); + // fini counters should be 0 at this point + KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); + KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); + KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); + } else { + while ( + (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == + (void *)1) { // wait for task reduction initialization + KMP_CPU_PAUSE(); + } + KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here + tg = thr->th.th_current_task->td_taskgroup; + __kmp_task_reduction_init_copy(thr, num, data, tg, reduce_data); + } + return tg; +} + +/*! 
+@ingroup TASKING +@param loc Source location info +@param gtid Global thread ID +@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for a parallel or worksharing. + +Note: this entry supposes the optional compiler-generated initializer routine +has single parameter - pointer to object to be initialized. That means +the reduction either does not use omp_orig object, or the omp_orig is accessible +without help of the runtime library. +*/ +void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, + int num, void *data) { + return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, + (kmp_task_red_input_t *)data); +} + +/*! +@ingroup TASKING +@param loc Source location info +@param gtid Global thread ID +@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise +@param num Number of data items to reduce +@param data Array of data for reduction +@return The taskgroup identifier + +Initialize task reduction for a parallel or worksharing. + +Note: this entry supposes the optional compiler-generated initializer routine +has two parameters, pointer to object to be initialized and pointer to omp_orig +*/ +void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, + void *data) { + return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, + (kmp_taskred_input_t *)data); +} + +/*! +@ingroup TASKING +@param loc Source location info +@param gtid Global thread ID +@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise + +Finalize task reduction for a parallel or worksharing. +*/ +void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) { + __kmpc_end_taskgroup(loc, gtid); +} + +// __kmpc_taskgroup: Start a new taskgroup +void __kmpc_taskgroup(ident_t *loc, int gtid) { + __kmp_assert_valid_gtid(gtid); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *taskdata = thread->th.th_current_task; + kmp_taskgroup_t *tg_new = + (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); + KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); + KMP_ATOMIC_ST_RLX(&tg_new->count, 0); + KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); + tg_new->parent = taskdata->td_taskgroup; + tg_new->reduce_data = NULL; + tg_new->reduce_num_data = 0; + tg_new->gomp_data = NULL; + taskdata->td_taskgroup = tg_new; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { + void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + kmp_team_t *team = thread->th.th_team; + ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; + // FIXME: I think this is wrong for lwt! 
+ ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; + + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), + &(my_task_data), codeptr); + } +#endif +} + +// __kmpc_end_taskgroup: Wait until all tasks generated by the current task +// and its descendants are complete +void __kmpc_end_taskgroup(ident_t *loc, int gtid) { + __kmp_assert_valid_gtid(gtid); + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *taskdata = thread->th.th_current_task; + kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; + int thread_finished = FALSE; + +#if OMPT_SUPPORT && OMPT_OPTIONAL + kmp_team_t *team; + ompt_data_t my_task_data; + ompt_data_t my_parallel_data; + void *codeptr = nullptr; + if (UNLIKELY(ompt_enabled.enabled)) { + team = thread->th.th_team; + my_task_data = taskdata->ompt_task_info.task_data; + // FIXME: I think this is wrong for lwt! + my_parallel_data = team->t.ompt_team_info.parallel_data; + codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); + if (!codeptr) + codeptr = OMPT_GET_RETURN_ADDRESS(0); + } +#endif + + KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); + KMP_DEBUG_ASSERT(taskgroup != NULL); + KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); + + if (__kmp_tasking_mode != tskm_immediate_exec) { + // mark task as waiting not on a barrier + taskdata->td_taskwait_counter += 1; + taskdata->td_taskwait_ident = loc; + taskdata->td_taskwait_thread = gtid + 1; +#if USE_ITT_BUILD + // For ITT the taskgroup wait is similar to taskwait until we need to + // distinguish them + void *itt_sync_obj = NULL; +#if USE_ITT_NOTIFY + KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); +#endif /* USE_ITT_NOTIFY */ +#endif /* USE_ITT_BUILD */ + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), + &(my_task_data), codeptr); + } +#endif + + if (!taskdata->td_flags.team_serial || + (thread->th.th_task_team != NULL && + (thread->th.th_task_team->tt.tt_found_proxy_tasks || + thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) { + kmp_flag_32 flag( + RCAST(std::atomic *, &(taskgroup->count)), 0U); + while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { + flag.execute_tasks(thread, gtid, FALSE, + &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), + __kmp_task_stealing_constraint); + } + } + taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), + &(my_task_data), codeptr); + } +#endif + +#if USE_ITT_BUILD + KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); + KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants +#endif /* USE_ITT_BUILD */ + } + KMP_DEBUG_ASSERT(taskgroup->count == 0); + + if (taskgroup->reduce_data != NULL && + !taskgroup->gomp_data) { // need to reduce? 
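+    // The reduction buffer may be the team-wide copy installed by a
+    // parallel/worksharing reduction modifier, or private to this taskgroup;
+    // comparing the first item's reduce_priv against the team's copies below
+    // decides which case applies, and only the last thread to arrive
+    // finalizes and frees the shared data.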
+ int cnt; + void *reduce_data; + kmp_team_t *t = thread->th.th_team; + kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data; + // check if data of the first reduction variable shared for the team + void *priv0 = arr[0].reduce_priv; + if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL && + ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { + // finishing task reduction on parallel + cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]); + if (cnt == thread->th.th_team_nproc - 1) { + // we are the last thread passing __kmpc_reduction_modifier_fini() + // finalize task reduction: + __kmp_task_reduction_fini(thread, taskgroup); + // cleanup fields in the team structure: + // TODO: is relaxed store enough here (whole barrier should follow)? + __kmp_thread_free(thread, reduce_data); + KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL); + KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0); + } else { + // we are not the last thread passing __kmpc_reduction_modifier_fini(), + // so do not finalize reduction, just clean own copy of the data + __kmp_task_reduction_clean(thread, taskgroup); + } + } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) != + NULL && + ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { + // finishing task reduction on worksharing + cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]); + if (cnt == thread->th.th_team_nproc - 1) { + // we are the last thread passing __kmpc_reduction_modifier_fini() + __kmp_task_reduction_fini(thread, taskgroup); + // cleanup fields in team structure: + // TODO: is relaxed store enough here (whole barrier should follow)? + __kmp_thread_free(thread, reduce_data); + KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL); + KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0); + } else { + // we are not the last thread passing __kmpc_reduction_modifier_fini(), + // so do not finalize reduction, just clean own copy of the data + __kmp_task_reduction_clean(thread, taskgroup); + } + } else { + // finishing task reduction on taskgroup + __kmp_task_reduction_fini(thread, taskgroup); + } + } + // Restore parent taskgroup for the current task + taskdata->td_taskgroup = taskgroup->parent; + __kmp_thread_free(thread, taskgroup); + + KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", + gtid, taskdata)); + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), + &(my_task_data), codeptr); + } +#endif +} + +static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid, + kmp_task_team_t *task_team, + kmp_int32 is_constrained) { + kmp_task_t *task = NULL; + kmp_taskdata_t *taskdata; + kmp_taskdata_t *current; + kmp_thread_data_t *thread_data; + int ntasks = task_team->tt.tt_num_task_pri; + if (ntasks == 0) { + KA_TRACE( + 20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid)); + return NULL; + } + do { + // decrement num_tasks to "reserve" one task to get for execution + if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks, + ntasks - 1)) + break; + ntasks = task_team->tt.tt_num_task_pri; + } while (ntasks > 0); + if (ntasks == 0) { + KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n", + __kmp_get_gtid())); + return NULL; + } + // We got a "ticket" to get a "reserved" priority task + int deque_ntasks; + kmp_task_pri_t *list = task_team->tt.tt_task_pri_list; 
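+  // Walk the list of per-priority deques until one that still holds a task
+  // is found; the "ticket" reserved above guarantees at least one exists.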
+ do { + KMP_ASSERT(list != NULL); + thread_data = &list->td; + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + deque_ntasks = thread_data->td.td_deque_ntasks; + if (deque_ntasks == 0) { + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n", + __kmp_get_gtid(), thread_data)); + list = list->next; + } + } while (deque_ntasks == 0); + KMP_DEBUG_ASSERT(deque_ntasks); + int target = thread_data->td.td_deque_head; + current = __kmp_threads[gtid]->th.th_current_task; + taskdata = thread_data->td.td_deque[target]; + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { + // Bump head pointer and Wrap. + thread_data->td.td_deque_head = + (target + 1) & TASK_DEQUE_MASK(thread_data->td); + } else { + if (!task_team->tt.tt_untied_task_encountered) { + // The TSC does not allow to steal victim task + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task " + "from %p: task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, thread_data, task_team, deque_ntasks, target, + thread_data->td.td_deque_tail)); + task_team->tt.tt_num_task_pri++; // atomic inc, restore value + return NULL; + } + int i; + // walk through the deque trying to steal any task + taskdata = NULL; + for (i = 1; i < deque_ntasks; ++i) { + target = (target + 1) & TASK_DEQUE_MASK(thread_data->td); + taskdata = thread_data->td.td_deque[target]; + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { + break; // found task to execute + } else { + taskdata = NULL; + } + } + if (taskdata == NULL) { + // No appropriate candidate found to execute + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE( + 10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from " + "%p: task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, thread_data, task_team, deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + task_team->tt.tt_num_task_pri++; // atomic inc, restore value + return NULL; + } + int prev = target; + for (i = i + 1; i < deque_ntasks; ++i) { + // shift remaining tasks in the deque left by 1 + target = (target + 1) & TASK_DEQUE_MASK(thread_data->td); + thread_data->td.td_deque[prev] = thread_data->td.td_deque[target]; + prev = target; + } + KMP_DEBUG_ASSERT( + thread_data->td.td_deque_tail == + (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td))); + thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)) + } + thread_data->td.td_deque_ntasks = deque_ntasks - 1; + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + task = KMP_TASKDATA_TO_TASK(taskdata); + return task; +} + +// __kmp_remove_my_task: remove a task from my own deque +static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, + kmp_task_team_t *task_team, + kmp_int32 is_constrained) { + kmp_task_t *task; + kmp_taskdata_t *taskdata; + kmp_thread_data_t *thread_data; + kmp_uint32 tail; + + KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != + NULL); // Caller should check this condition + + thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; + + KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", + gtid, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + + if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { + KA_TRACE(10, + 
("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " + "ntasks=%d head=%u tail=%u\n", + gtid, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + return NULL; + } + + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + + if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(10, + ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " + "ntasks=%d head=%u tail=%u\n", + gtid, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + return NULL; + } + + tail = (thread_data->td.td_deque_tail - 1) & + TASK_DEQUE_MASK(thread_data->td); // Wrap index. + taskdata = thread_data->td.td_deque[tail]; + + if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, + thread->th.th_current_task)) { + // The TSC does not allow to steal victim task + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + KA_TRACE(10, + ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " + "ntasks=%d head=%u tail=%u\n", + gtid, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + return NULL; + } + + thread_data->td.td_deque_tail = tail; + TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); + + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + + KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " + "ntasks=%d head=%u tail=%u\n", + gtid, taskdata, thread_data->td.td_deque_ntasks, + thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); + + task = KMP_TASKDATA_TO_TASK(taskdata); + return task; +} + +// __kmp_steal_task: remove a task from another thread's deque +// Assume that calling thread has already checked existence of +// task_team thread_data before calling this routine. 
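
Taken together, __kmp_remove_my_task() above and __kmp_steal_task() below implement the usual work-stealing discipline: the owning thread pushes and pops at the tail of its deque (LIFO), while thieves take from the head (FIFO), and indices wrap with TASK_DEQUE_MASK() because the capacity is a power of two. A minimal standalone sketch of that discipline, assuming one lock per deque is acceptable; toy_deque and its members are illustrative names, not part of this patch:

#include <array>
#include <cstdint>
#include <mutex>
#include <optional>

// Toy work-stealing deque: power-of-two capacity so indices wrap with a
// mask (mirroring TASK_DEQUE_MASK); one mutex stands in for td_deque_lock.
struct toy_deque {
  static constexpr uint32_t kSize = 8; // must stay a power of two
  std::array<int, kSize> slots{};
  uint32_t head = 0, tail = 0, ntasks = 0;
  std::mutex lock;

  bool push_own(int task) { // owner enqueues at the tail
    std::lock_guard<std::mutex> g(lock);
    if (ntasks == kSize)
      return false; // full; the real runtime grows the deque instead
    slots[tail] = task;
    tail = (tail + 1) & (kSize - 1);
    ++ntasks;
    return true;
  }
  std::optional<int> pop_own() { // owner dequeues at the tail (LIFO)
    std::lock_guard<std::mutex> g(lock);
    if (ntasks == 0)
      return std::nullopt;
    tail = (tail - 1) & (kSize - 1); // unsigned wrap-around is intentional
    --ntasks;
    return slots[tail];
  }
  std::optional<int> steal() { // thief dequeues at the head (FIFO)
    std::lock_guard<std::mutex> g(lock);
    if (ntasks == 0)
      return std::nullopt;
    int task = slots[head];
    head = (head + 1) & (kSize - 1);
    --ntasks;
    return task;
  }
};

The runtime adds one wrinkle the sketch omits: __kmp_task_is_allowed() may reject the head task, in which case the thief scans the rest of the deque and compacts it, as seen in the function that follows.
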
+static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, + kmp_task_team_t *task_team, + std::atomic *unfinished_threads, + int *thread_finished, + kmp_int32 is_constrained) { + kmp_task_t *task; + kmp_taskdata_t *taskdata; + kmp_taskdata_t *current; + kmp_thread_data_t *victim_td, *threads_data; + kmp_int32 target; + kmp_int32 victim_tid; + + KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + + threads_data = task_team->tt.tt_threads_data; + KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition + + victim_tid = victim_thr->th.th_info.ds.ds_tid; + victim_td = &threads_data[victim_tid]; + + KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " + "task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, __kmp_gtid_from_thread(victim_thr), task_team, + victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, + victim_td->td.td_deque_tail)); + + if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { + KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " + "task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, __kmp_gtid_from_thread(victim_thr), task_team, + victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, + victim_td->td.td_deque_tail)); + return NULL; + } + + __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); + + int ntasks = TCR_4(victim_td->td.td_deque_ntasks); + // Check again after we acquire the lock + if (ntasks == 0) { + __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); + KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " + "task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, + victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); + return NULL; + } + + KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); + current = __kmp_threads[gtid]->th.th_current_task; + taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { + // Bump head pointer and Wrap. 
+ victim_td->td.td_deque_head = + (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); + } else { + if (!task_team->tt.tt_untied_task_encountered) { + // The TSC does not allow to steal victim task + __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); + KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from " + "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, + victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); + return NULL; + } + int i; + // walk through victim's deque trying to steal any task + target = victim_td->td.td_deque_head; + taskdata = NULL; + for (i = 1; i < ntasks; ++i) { + target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); + taskdata = victim_td->td.td_deque[target]; + if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { + break; // found victim task + } else { + taskdata = NULL; + } + } + if (taskdata == NULL) { + // No appropriate candidate to steal found + __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); + KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from " + "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, + victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); + return NULL; + } + int prev = target; + for (i = i + 1; i < ntasks; ++i) { + // shift remaining tasks in the deque left by 1 + target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); + victim_td->td.td_deque[prev] = victim_td->td.td_deque[target]; + prev = target; + } + KMP_DEBUG_ASSERT( + victim_td->td.td_deque_tail == + (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td))); + victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)) + } + if (*thread_finished) { + // We need to un-mark this victim as a finished victim. This must be done + // before releasing the lock, or else other threads (starting with the + // primary thread victim) might be prematurely released from the barrier!!! +#if KMP_DEBUG + kmp_int32 count = +#endif + KMP_ATOMIC_INC(unfinished_threads); + KA_TRACE( + 20, + ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", + gtid, count + 1, task_team)); + *thread_finished = FALSE; + } + TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); + + __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); + + KMP_COUNT_BLOCK(TASK_stolen); + KA_TRACE(10, + ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: " + "task_team=%p ntasks=%d head=%u tail=%u\n", + gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team, + ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); + + task = KMP_TASKDATA_TO_TASK(taskdata); + return task; +} + +// __kmp_execute_tasks_template: Choose and execute tasks until either the +// condition is statisfied (return true) or there are none left (return false). +// +// final_spin is TRUE if this is the spin at the release barrier. +// thread_finished indicates whether the thread is finished executing all +// the tasks it has on its deque, and is at the release barrier. +// spinner is the location on which to spin. +// spinner == NULL means only execute a single task and return. +// checker is the value to check to terminate the spin. 
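
A condensed sketch of the contract just described: keep pulling work (priority deques, then the thread's own deque, then stealing), execute it, and return as soon as the wait condition holds or the task source is exhausted. The helper below is illustrative only; toy_flag, toy_execute_tasks and the callables passed in are not part of this patch, and the real template additionally handles ITT instrumentation, sleeping victims, and proxy tasks:

#include <functional>

// Driver-loop skeleton: 'flag == nullptr' mimics the "spinner == NULL"
// single-task case, and 'final_spin' suppresses the early done_check()
// exactly as described in the comment above.
struct toy_flag {
  bool done = false;
  bool done_check() const { return done; }
};

inline bool toy_execute_tasks(toy_flag *flag, bool final_spin,
                              const std::function<int()> &get_own_task,
                              const std::function<int()> &steal_any_task) {
  for (;;) {
    int task = get_own_task(); // 0 means "own deque empty"
    if (!task)
      task = steal_any_task(); // 0 means "nothing left to steal"
    if (!task)
      return false; // task source exhausted
    // ... execute 'task' here ...
    if (!flag)
      return true; // only execute a single task and return
    if (!final_spin && flag->done_check())
      return true; // spin condition satisfied
  }
}
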
+template +static inline int __kmp_execute_tasks_template( + kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + kmp_task_team_t *task_team = thread->th.th_task_team; + kmp_thread_data_t *threads_data; + kmp_task_t *task; + kmp_info_t *other_thread; + kmp_taskdata_t *current_task = thread->th.th_current_task; + std::atomic *unfinished_threads; + kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, + tid = thread->th.th_info.ds.ds_tid; + + KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); + + if (task_team == NULL || current_task == NULL) + return FALSE; + + KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " + "*thread_finished=%d\n", + gtid, final_spin, *thread_finished)); + + thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); + + KMP_DEBUG_ASSERT(threads_data != NULL); + + nthreads = task_team->tt.tt_nproc; + unfinished_threads = &(task_team->tt.tt_unfinished_threads); + KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks || + task_team->tt.tt_hidden_helper_task_encountered); + KMP_DEBUG_ASSERT(*unfinished_threads >= 0); + + while (1) { // Outer loop keeps trying to find tasks in case of single thread + // getting tasks from target constructs + while (1) { // Inner loop to find a task and execute it + task = NULL; + if (task_team->tt.tt_num_task_pri) { // get priority task first + task = __kmp_get_priority_task(gtid, task_team, is_constrained); + } + if (task == NULL && use_own_tasks) { // check own queue next + task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); + } + if ((task == NULL) && (nthreads > 1)) { // Steal a task finally + int asleep = 1; + use_own_tasks = 0; + // Try to steal from the last place I stole from successfully. + if (victim_tid == -2) { // haven't stolen anything yet + victim_tid = threads_data[tid].td.td_deque_last_stolen; + if (victim_tid != + -1) // if we have a last stolen from victim, get the thread + other_thread = threads_data[victim_tid].td.td_thr; + } + if (victim_tid != -1) { // found last victim + asleep = 0; + } else if (!new_victim) { // no recent steals and we haven't already + // used a new victim; select a random thread + do { // Find a different thread to steal work from. + // Pick a random thread. Initial plan was to cycle through all the + // threads, and only return if we tried to steal from every thread, + // and failed. Arch says that's not such a great idea. + victim_tid = __kmp_get_random(thread) % (nthreads - 1); + if (victim_tid >= tid) { + ++victim_tid; // Adjusts random distribution to exclude self + } + // Found a potential victim + other_thread = threads_data[victim_tid].td.td_thr; + // There is a slight chance that __kmp_enable_tasking() did not wake + // up all threads waiting at the barrier. If victim is sleeping, + // then wake it up. Since we were going to pay the cache miss + // penalty for referencing another thread's kmp_info_t struct + // anyway, + // the check shouldn't cost too much performance at this point. In + // extra barrier mode, tasks do not sleep at the separate tasking + // barrier, so this isn't a problem. 
+ asleep = 0; + if ((__kmp_tasking_mode == tskm_task_teams) && + (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && + (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != + NULL)) { + asleep = 1; + __kmp_null_resume_wrapper(other_thread); + // A sleeping thread should not have any tasks on it's queue. + // There is a slight possibility that it resumes, steals a task + // from another thread, which spawns more tasks, all in the time + // that it takes this thread to check => don't write an assertion + // that the victim's queue is empty. Try stealing from a + // different thread. + } + } while (asleep); + } + + if (!asleep) { + // We have a victim to try to steal from + task = __kmp_steal_task(other_thread, gtid, task_team, + unfinished_threads, thread_finished, + is_constrained); + } + if (task != NULL) { // set last stolen to victim + if (threads_data[tid].td.td_deque_last_stolen != victim_tid) { + threads_data[tid].td.td_deque_last_stolen = victim_tid; + // The pre-refactored code did not try more than 1 successful new + // vicitm, unless the last one generated more local tasks; + // new_victim keeps track of this + new_victim = 1; + } + } else { // No tasks found; unset last_stolen + KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1); + victim_tid = -2; // no successful victim found + } + } + + if (task == NULL) + break; // break out of tasking loop + +// Found a task; execute it +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (itt_sync_obj == NULL) { // we are at fork barrier where we could not + // get the object reliably + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + } + __kmp_itt_task_starting(itt_sync_obj); + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + __kmp_invoke_task(gtid, task, current_task); +#if USE_ITT_BUILD + if (itt_sync_obj != NULL) + __kmp_itt_task_finished(itt_sync_obj); +#endif /* USE_ITT_BUILD */ + // If this thread is only partway through the barrier and the condition is + // met, then return now, so that the barrier gather/release pattern can + // proceed. If this thread is in the last spin loop in the barrier, + // waiting to be released, we know that the termination condition will not + // be satisfied, so don't waste any cycles checking it. + if (flag == NULL || (!final_spin && flag->done_check())) { + KA_TRACE( + 15, + ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", + gtid)); + return TRUE; + } + if (thread->th.th_task_team == NULL) { + break; + } + KMP_YIELD(__kmp_library == library_throughput); // Yield before next task + // If execution of a stolen task results in more tasks being placed on our + // run queue, reset use_own_tasks + if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { + KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned " + "other tasks, restart\n", + gtid)); + use_own_tasks = 1; + new_victim = 0; + } + } + + // The task source has been exhausted. If in final spin loop of barrier, + // check if termination condition is satisfied. The work queue may be empty + // but there might be proxy tasks still executing. + if (final_spin && + KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks) == 0) { + // First, decrement the #unfinished threads, if that has not already been + // done. This decrement might be to the spin location, and result in the + // termination condition being satisfied. 
+ if (!*thread_finished) { +#if KMP_DEBUG + kmp_int32 count = -1 + +#endif + KMP_ATOMIC_DEC(unfinished_threads); + KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec " + "unfinished_threads to %d task_team=%p\n", + gtid, count, task_team)); + *thread_finished = TRUE; + } + + // It is now unsafe to reference thread->th.th_team !!! + // Decrementing task_team->tt.tt_unfinished_threads can allow the primary + // thread to pass through the barrier, where it might reset each thread's + // th.th_team field for the next parallel region. If we can steal more + // work, we know that this has not happened yet. + if (flag != NULL && flag->done_check()) { + KA_TRACE( + 15, + ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", + gtid)); + return TRUE; + } + } + + // If this thread's task team is NULL, primary thread has recognized that + // there are no more tasks; bail out + if (thread->th.th_task_team == NULL) { + KA_TRACE(15, + ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid)); + return FALSE; + } + + // Check the flag again to see if it has already done in case to be trapped + // into infinite loop when a if0 task depends on a hidden helper task + // outside any parallel region. Detached tasks are not impacted in this case + // because the only thread executing this function has to execute the proxy + // task so it is in another code path that has the same check. + if (flag == NULL || (!final_spin && flag->done_check())) { + KA_TRACE(15, + ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", + gtid)); + return TRUE; + } + + // We could be getting tasks from target constructs; if this is the only + // thread, keep trying to execute tasks from own queue + if (nthreads == 1 && + KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks)) + use_own_tasks = 1; + else { + KA_TRACE(15, + ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid)); + return FALSE; + } + } +} + +template +int __kmp_execute_tasks_32( + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_template( + thread, gtid, flag, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + +template +int __kmp_execute_tasks_64( + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_template( + thread, gtid, flag, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + +template +int __kmp_atomic_execute_tasks_64( + kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64 *flag, + int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_template( + thread, gtid, flag, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + +int __kmp_execute_tasks_oncore( + kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_template( + thread, gtid, flag, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + +template int +__kmp_execute_tasks_32(kmp_info_t *, kmp_int32, + kmp_flag_32 *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 
*, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); + +template int __kmp_execute_tasks_64(kmp_info_t *, kmp_int32, + kmp_flag_64 *, + int, + int *USE_ITT_BUILD_ARG(void *), + kmp_int32); + +template int __kmp_atomic_execute_tasks_64( + kmp_info_t *, kmp_int32, kmp_atomic_flag_64 *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +template int __kmp_atomic_execute_tasks_64( + kmp_info_t *, kmp_int32, kmp_atomic_flag_64 *, int, + int *USE_ITT_BUILD_ARG(void *), kmp_int32); + +// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the +// next barrier so they can assist in executing enqueued tasks. +// First thread in allocates the task team atomically. +static void __kmp_enable_tasking(kmp_task_team_t *task_team, + kmp_info_t *this_thr) { + kmp_thread_data_t *threads_data; + int nthreads, i, is_init_thread; + + KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n", + __kmp_gtid_from_thread(this_thr))); + + KMP_DEBUG_ASSERT(task_team != NULL); + KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL); + + nthreads = task_team->tt.tt_nproc; + KMP_DEBUG_ASSERT(nthreads > 0); + KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc); + + // Allocate or increase the size of threads_data if necessary + is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team); + + if (!is_init_thread) { + // Some other thread already set up the array. + KA_TRACE( + 20, + ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n", + __kmp_gtid_from_thread(this_thr))); + return; + } + threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); + KMP_DEBUG_ASSERT(threads_data != NULL); + + if (__kmp_tasking_mode == tskm_task_teams && + (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) { + // Release any threads sleeping at the barrier, so that they can steal + // tasks and execute them. In extra barrier mode, tasks do not sleep + // at the separate tasking barrier, so this isn't a problem. + for (i = 0; i < nthreads; i++) { + void *sleep_loc; + kmp_info_t *thread = threads_data[i].td.td_thr; + + if (i == this_thr->th.th_info.ds.ds_tid) { + continue; + } + // Since we haven't locked the thread's suspend mutex lock at this + // point, there is a small window where a thread might be putting + // itself to sleep, but hasn't set the th_sleep_loc field yet. + // To work around this, __kmp_execute_tasks_template() periodically checks + // see if other threads are sleeping (using the same random mechanism that + // is used for task stealing) and awakens them if they are. + if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != + NULL) { + KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n", + __kmp_gtid_from_thread(this_thr), + __kmp_gtid_from_thread(thread))); + __kmp_null_resume_wrapper(thread); + } else { + KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n", + __kmp_gtid_from_thread(this_thr), + __kmp_gtid_from_thread(thread))); + } + } + } + + KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n", + __kmp_gtid_from_thread(this_thr))); +} + +/* // TODO: Check the comment consistency + * Utility routines for "task teams". A task team (kmp_task_t) is kind of + * like a shadow of the kmp_team_t data struct, with a different lifetime. 
+ * After a child thread checks into a barrier and calls __kmp_release() from
+ * the particular variant of __kmp_*_barrier_gather(), it can no
+ * longer assume that the kmp_team_t structure is intact (at any moment, the
+ * primary thread may exit the barrier code and free the team data structure,
+ * and return the threads to the thread pool).
+ *
+ * This does not work with the tasking code, as the thread is still
+ * expected to participate in the execution of any tasks that may have been
+ * spawned by a member of the team, and the thread still needs access to
+ * each of the other threads in the team, so that it can steal work from them.
+ *
+ * Enter the existence of the kmp_task_team_t struct. It employs a reference
+ * counting mechanism, and is allocated by the primary thread before calling
+ * __kmp_*_release, and then is released by the last thread to
+ * exit __kmp_*_release at the next barrier. I.e. the lifetimes
+ * of the kmp_task_team_t structs for consecutive barriers can overlap
+ * (and will, unless the primary thread is the last thread to exit the barrier
+ * release phase, which is not typical). The existence of such a struct is
+ * useful outside the context of tasking.
+ *
+ * We currently use the existence of the threads array as an indicator that
+ * tasks were spawned since the last barrier. If the structure is to be
+ * useful outside the context of tasking, then this will have to change, but
+ * not setting the field minimizes the performance impact of tasking on
+ * barriers, when no explicit tasks were spawned (pushed, actually).
+ */
+
+static kmp_task_team_t *__kmp_free_task_teams =
+    NULL; // Free list for task_team data structures
+// Lock for task team data structures
+kmp_bootstrap_lock_t __kmp_task_team_lock =
+    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
+
+// __kmp_alloc_task_deque:
+// Allocates a task deque for a particular thread, and initializes the
+// necessary data structures relating to the deque. This only happens once per
+// thread per task team since task teams are recycled. No lock is needed during
+// allocation since each thread allocates its own deque.
+static void __kmp_alloc_task_deque(kmp_info_t *thread,
+                                   kmp_thread_data_t *thread_data) {
+  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
+  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
+
+  // Initialize last stolen task field to "none"
+  thread_data->td.td_deque_last_stolen = -1;
+
+  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
+  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
+  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
+
+  KE_TRACE(
+      10,
+      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
+       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
+  // Allocate space for task deque, and zero the deque
+  // Cannot use __kmp_thread_calloc() because threads not around for
+  // kmp_reap_task_team( ).
+  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
+      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
+}
+
+// __kmp_free_task_deque:
+// Deallocates a task deque for a particular thread. Happens at library
+// deallocation, so there is no need to reset all thread data fields.
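
The deque allocated above always has a power-of-two capacity (INITIAL_TASK_DEQUE_SIZE here, and the reallocation path used later by __kmp_give_task() appears to preserve that), which is why head/tail arithmetic throughout this file wraps with TASK_DEQUE_MASK(). A hedged sketch of growing such a ring while preserving task order; toy_grow_ring and its parameters are illustrative and are not the runtime's __kmp_realloc_task_deque():

#include <cstdint>
#include <cstdlib>

// Doubles a power-of-two ring buffer, unwrapping head..head+ntasks into the
// new storage so the surviving entries start again at index 0.
// (Allocation-failure handling omitted for brevity.)
static void **toy_grow_ring(void **old_ring, uint32_t old_size, uint32_t head,
                            uint32_t ntasks, uint32_t *new_size_out) {
  uint32_t new_size = old_size * 2; // stays a power of two
  void **new_ring = (void **)calloc(new_size, sizeof(void *));
  for (uint32_t i = 0; i < ntasks; ++i)
    new_ring[i] = old_ring[(head + i) & (old_size - 1)];
  free(old_ring);
  *new_size_out = new_size;
  return new_ring;
}
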
+static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { + if (thread_data->td.td_deque != NULL) { + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + TCW_4(thread_data->td.td_deque_ntasks, 0); + __kmp_free(thread_data->td.td_deque); + thread_data->td.td_deque = NULL; + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + } + +#ifdef BUILD_TIED_TASK_STACK + // GEH: Figure out what to do here for td_susp_tied_tasks + if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { + __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); + } +#endif // BUILD_TIED_TASK_STACK +} + +// __kmp_realloc_task_threads_data: +// Allocates a threads_data array for a task team, either by allocating an +// initial array or enlarging an existing array. Only the first thread to get +// the lock allocs or enlarges the array and re-initializes the array elements. +// That thread returns "TRUE", the rest return "FALSE". +// Assumes that the new array size is given by task_team -> tt.tt_nproc. +// The current size is given by task_team -> tt.tt_max_threads. +static int __kmp_realloc_task_threads_data(kmp_info_t *thread, + kmp_task_team_t *task_team) { + kmp_thread_data_t **threads_data_p; + kmp_int32 nthreads, maxthreads; + int is_init_thread = FALSE; + + if (TCR_4(task_team->tt.tt_found_tasks)) { + // Already reallocated and initialized. + return FALSE; + } + + threads_data_p = &task_team->tt.tt_threads_data; + nthreads = task_team->tt.tt_nproc; + maxthreads = task_team->tt.tt_max_threads; + + // All threads must lock when they encounter the first task of the implicit + // task region to make sure threads_data fields are (re)initialized before + // used. + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); + + if (!TCR_4(task_team->tt.tt_found_tasks)) { + // first thread to enable tasking + kmp_team_t *team = thread->th.th_team; + int i; + + is_init_thread = TRUE; + if (maxthreads < nthreads) { + + if (*threads_data_p != NULL) { + kmp_thread_data_t *old_data = *threads_data_p; + kmp_thread_data_t *new_data = NULL; + + KE_TRACE( + 10, + ("__kmp_realloc_task_threads_data: T#%d reallocating " + "threads data for task_team %p, new_size = %d, old_size = %d\n", + __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); + // Reallocate threads_data to have more elements than current array + // Cannot use __kmp_thread_realloc() because threads not around for + // kmp_reap_task_team( ). Note all new array entries are initialized + // to zero by __kmp_allocate(). 
+ new_data = (kmp_thread_data_t *)__kmp_allocate( + nthreads * sizeof(kmp_thread_data_t)); + // copy old data to new data + KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), + (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); + +#ifdef BUILD_TIED_TASK_STACK + // GEH: Figure out if this is the right thing to do + for (i = maxthreads; i < nthreads; i++) { + kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; + __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); + } +#endif // BUILD_TIED_TASK_STACK + // Install the new data and free the old data + (*threads_data_p) = new_data; + __kmp_free(old_data); + } else { + KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " + "threads data for task_team %p, size = %d\n", + __kmp_gtid_from_thread(thread), task_team, nthreads)); + // Make the initial allocate for threads_data array, and zero entries + // Cannot use __kmp_thread_calloc() because threads not around for + // kmp_reap_task_team( ). + *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( + nthreads * sizeof(kmp_thread_data_t)); +#ifdef BUILD_TIED_TASK_STACK + // GEH: Figure out if this is the right thing to do + for (i = 0; i < nthreads; i++) { + kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; + __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); + } +#endif // BUILD_TIED_TASK_STACK + } + task_team->tt.tt_max_threads = nthreads; + } else { + // If array has (more than) enough elements, go ahead and use it + KMP_DEBUG_ASSERT(*threads_data_p != NULL); + } + + // initialize threads_data pointers back to thread_info structures + for (i = 0; i < nthreads; i++) { + kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; + thread_data->td.td_thr = team->t.t_threads[i]; + + if (thread_data->td.td_deque_last_stolen >= nthreads) { + // The last stolen field survives across teams / barrier, and the number + // of threads may have changed. It's possible (likely?) that a new + // parallel region will exhibit the same behavior as previous region. + thread_data->td.td_deque_last_stolen = -1; + } + } + + KMP_MB(); + TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); + } + + __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); + return is_init_thread; +} + +// __kmp_free_task_threads_data: +// Deallocates a threads_data array for a task team, including any attached +// tasking deques. Only occurs at library shutdown. +static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); + if (task_team->tt.tt_threads_data != NULL) { + int i; + for (i = 0; i < task_team->tt.tt_max_threads; i++) { + __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); + } + __kmp_free(task_team->tt.tt_threads_data); + task_team->tt.tt_threads_data = NULL; + } + __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); +} + +// __kmp_free_task_pri_list: +// Deallocates tasking deques used for priority tasks. +// Only occurs at library shutdown. 
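
__kmp_realloc_task_threads_data() above follows a check / lock / re-check pattern: tt_found_tasks is read without the lock for the fast path, and only the first thread through the lock builds or grows the shared array before publishing the flag. A minimal sketch of the same idea with standard C++ primitives; toy_init_once and the globals it uses are illustrative names, not part of this patch:

#include <atomic>
#include <mutex>
#include <vector>

static std::atomic<bool> g_ready{false};
static std::mutex g_lock;
static std::vector<int> g_shared;

// Returns true only for the one thread that performed the initialization,
// mirroring the TRUE/FALSE contract of __kmp_realloc_task_threads_data().
static bool toy_init_once(int nthreads) {
  if (g_ready.load(std::memory_order_acquire))
    return false; // fast path: already initialized, no lock taken
  std::lock_guard<std::mutex> g(g_lock);
  if (g_ready.load(std::memory_order_relaxed))
    return false; // another thread won the race while we waited
  g_shared.assign(nthreads, 0); // build the shared per-thread data
  g_ready.store(true, std::memory_order_release);
  return true;
}
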
+static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) { + __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock); + if (task_team->tt.tt_task_pri_list != NULL) { + kmp_task_pri_t *list = task_team->tt.tt_task_pri_list; + while (list != NULL) { + kmp_task_pri_t *next = list->next; + __kmp_free_task_deque(&list->td); + __kmp_free(list); + list = next; + } + task_team->tt.tt_task_pri_list = NULL; + } + __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); +} + +// __kmp_allocate_task_team: +// Allocates a task team associated with a specific team, taking it from +// the global task team free list if possible. Also initializes data +// structures. +static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, + kmp_team_t *team) { + kmp_task_team_t *task_team = NULL; + int nthreads; + + KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", + (thread ? __kmp_gtid_from_thread(thread) : -1), team)); + + if (TCR_PTR(__kmp_free_task_teams) != NULL) { + // Take a task team from the task team pool + __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); + if (__kmp_free_task_teams != NULL) { + task_team = __kmp_free_task_teams; + TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); + task_team->tt.tt_next = NULL; + } + __kmp_release_bootstrap_lock(&__kmp_task_team_lock); + } + + if (task_team == NULL) { + KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " + "task team for team %p\n", + __kmp_gtid_from_thread(thread), team)); + // Allocate a new task team if one is not available. Cannot use + // __kmp_thread_malloc because threads not around for kmp_reap_task_team. + task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); + __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); + __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock); +#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG + // suppress race conditions detection on synchronization flags in debug mode + // this helps to analyze library internals eliminating false positives + __itt_suppress_mark_range( + __itt_suppress_range, __itt_suppress_threading_errors, + &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks)); + __itt_suppress_mark_range(__itt_suppress_range, + __itt_suppress_threading_errors, + CCAST(kmp_uint32 *, &task_team->tt.tt_active), + sizeof(task_team->tt.tt_active)); +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ + // Note: __kmp_allocate zeroes returned memory, othewise we would need: + // task_team->tt.tt_threads_data = NULL; + // task_team->tt.tt_max_threads = 0; + // task_team->tt.tt_next = NULL; + } + + TCW_4(task_team->tt.tt_found_tasks, FALSE); + TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); + task_team->tt.tt_nproc = nthreads = team->t.t_nproc; + + KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); + TCW_4(task_team->tt.tt_active, TRUE); + + KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " + "unfinished_threads init'd to %d\n", + (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, + KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); + return task_team; +} + +// __kmp_free_task_team: +// Frees the task team associated with a specific thread, and adds it +// to the global task team free list. 
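
__kmp_allocate_task_team() above pops recycled task teams from __kmp_free_task_teams, and __kmp_free_task_team() below pushes them back; the list is intrusive (chained through tt_next) and guarded by __kmp_task_team_lock. A hedged sketch of that object pool; toy_node and toy_pool are illustrative stand-ins for the bootstrap-lock-protected globals:

#include <mutex>

// Intrusive free list: recycled objects are chained through their own
// 'next' pointer, so keeping the pool costs no extra allocations.
struct toy_node {
  toy_node *next = nullptr;
  // ... payload lives here ...
};

struct toy_pool {
  toy_node *free_list = nullptr;
  std::mutex lock;

  toy_node *acquire() { // cf. __kmp_allocate_task_team()
    {
      std::lock_guard<std::mutex> g(lock);
      if (free_list) {
        toy_node *n = free_list;
        free_list = n->next;
        n->next = nullptr;
        return n;
      }
    }
    return new toy_node(); // pool empty: fall back to a fresh allocation
  }
  void release(toy_node *n) { // cf. __kmp_free_task_team()
    std::lock_guard<std::mutex> g(lock);
    n->next = free_list;
    free_list = n;
  }
};
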
+void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { + KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", + thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); + + // Put task team back on free list + __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); + + KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); + task_team->tt.tt_next = __kmp_free_task_teams; + TCW_PTR(__kmp_free_task_teams, task_team); + + __kmp_release_bootstrap_lock(&__kmp_task_team_lock); +} + +// __kmp_reap_task_teams: +// Free all the task teams on the task team free list. +// Should only be done during library shutdown. +// Cannot do anything that needs a thread structure or gtid since they are +// already gone. +void __kmp_reap_task_teams(void) { + kmp_task_team_t *task_team; + + if (TCR_PTR(__kmp_free_task_teams) != NULL) { + // Free all task_teams on the free list + __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); + while ((task_team = __kmp_free_task_teams) != NULL) { + __kmp_free_task_teams = task_team->tt.tt_next; + task_team->tt.tt_next = NULL; + + // Free threads_data if necessary + if (task_team->tt.tt_threads_data != NULL) { + __kmp_free_task_threads_data(task_team); + } + if (task_team->tt.tt_task_pri_list != NULL) { + __kmp_free_task_pri_list(task_team); + } + __kmp_free(task_team); + } + __kmp_release_bootstrap_lock(&__kmp_task_team_lock); + } +} + +// __kmp_wait_to_unref_task_teams: +// Some threads could still be in the fork barrier release code, possibly +// trying to steal tasks. Wait for each thread to unreference its task team. +void __kmp_wait_to_unref_task_teams(void) { + kmp_info_t *thread; + kmp_uint32 spins; + kmp_uint64 time; + int done; + + KMP_INIT_YIELD(spins); + KMP_INIT_BACKOFF(time); + + for (;;) { + done = TRUE; + + // TODO: GEH - this may be is wrong because some sync would be necessary + // in case threads are added to the pool during the traversal. Need to + // verify that lock for thread pool is held when calling this routine. + for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; + thread = thread->th.th_next_pool) { +#if KMP_OS_WINDOWS + DWORD exit_val; +#endif + if (TCR_PTR(thread->th.th_task_team) == NULL) { + KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", + __kmp_gtid_from_thread(thread))); + continue; + } +#if KMP_OS_WINDOWS + // TODO: GEH - add this check for Linux* OS / OS X* as well? + if (!__kmp_is_thread_alive(thread, &exit_val)) { + thread->th.th_task_team = NULL; + continue; + } +#endif + + done = FALSE; // Because th_task_team pointer is not NULL for this thread + + KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " + "unreference task_team\n", + __kmp_gtid_from_thread(thread))); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + void *sleep_loc; + // If the thread is sleeping, awaken it. + if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != + NULL) { + KA_TRACE( + 10, + ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", + __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); + __kmp_null_resume_wrapper(thread); + } + } + } + if (done) { + break; + } + + // If oversubscribed or have waited a bit, yield. 
+ KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); + } +} + +void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) { + // Shift values from th_task_state_top+1 to task_state_stack_sz + if (this_thr->th.th_task_state_top + 1 >= + this_thr->th.th_task_state_stack_sz) { // increase size + kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz; + kmp_uint8 *old_stack, *new_stack; + kmp_uint32 i; + new_stack = (kmp_uint8 *)__kmp_allocate(new_size); + for (i = 0; i <= this_thr->th.th_task_state_top; ++i) { + new_stack[i] = this_thr->th.th_task_state_memo_stack[i]; + } + // If we need to reallocate do the shift at the same time. + for (; i < this_thr->th.th_task_state_stack_sz; ++i) { + new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i]; + } + for (i = this_thr->th.th_task_state_stack_sz; i < new_size; + ++i) { // zero-init rest of stack + new_stack[i] = 0; + } + old_stack = this_thr->th.th_task_state_memo_stack; + this_thr->th.th_task_state_memo_stack = new_stack; + this_thr->th.th_task_state_stack_sz = new_size; + __kmp_free(old_stack); + } else { + kmp_uint8 *end; + kmp_uint32 i; + + end = &this_thr->th + .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz]; + + for (i = this_thr->th.th_task_state_stack_sz - 1; + i > this_thr->th.th_task_state_top; i--, end--) + end[0] = end[-1]; + } + this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] = + value; +} + +// __kmp_task_team_setup: Create a task_team for the current team, but use +// an already created, unused one if it already exists. +void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { + KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + + // If this task_team hasn't been created yet, allocate it. It will be used in + // the region after the next. + // If it exists, it is the current task team and shouldn't be touched yet as + // it may still be in use. + if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && + (always || team->t.t_nproc > 1)) { + team->t.t_task_team[this_thr->th.th_task_state] = + __kmp_allocate_task_team(this_thr, team); + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p" + " for team %d at parity=%d\n", + __kmp_gtid_from_thread(this_thr), + team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id, + this_thr->th.th_task_state)); + } + if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) { + // fix task state stack to adjust for proxy and helper tasks + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack" + " for team %d at parity=%d\n", + __kmp_gtid_from_thread(this_thr), team->t.t_id, + this_thr->th.th_task_state)); + __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state); + } + + // After threads exit the release, they will call sync, and then point to this + // other task_team; make sure it is allocated and properly initialized. As + // threads spin in the barrier release phase, they will continue to use the + // previous task_team struct(above), until they receive the signal to stop + // checking for tasks (they can't safely reference the kmp_team_t struct, + // which could be reallocated by the primary thread). No task teams are formed + // for serialized teams. 
+ if (team->t.t_nproc > 1) { + int other_team = 1 - this_thr->th.th_task_state; + KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2); + if (team->t.t_task_team[other_team] == NULL) { // setup other team as well + team->t.t_task_team[other_team] = + __kmp_allocate_task_team(this_thr, team); + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new " + "task_team %p for team %d at parity=%d\n", + __kmp_gtid_from_thread(this_thr), + team->t.t_task_team[other_team], team->t.t_id, other_team)); + } else { // Leave the old task team struct in place for the upcoming region; + // adjust as needed + kmp_task_team_t *task_team = team->t.t_task_team[other_team]; + if (!task_team->tt.tt_active || + team->t.t_nproc != task_team->tt.tt_nproc) { + TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); + TCW_4(task_team->tt.tt_found_tasks, FALSE); + TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); + KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, + team->t.t_nproc); + TCW_4(task_team->tt.tt_active, TRUE); + } + // if team size has changed, the first thread to enable tasking will + // realloc threads_data if necessary + KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team " + "%p for team %d at parity=%d\n", + __kmp_gtid_from_thread(this_thr), + team->t.t_task_team[other_team], team->t.t_id, other_team)); + } + } + + // For regular thread, task enabling should be called when the task is going + // to be pushed to a dequeue. However, for the hidden helper thread, we need + // it ahead of time so that some operations can be performed without race + // condition. + if (this_thr == __kmp_hidden_helper_main_thread) { + for (int i = 0; i < 2; ++i) { + kmp_task_team_t *task_team = team->t.t_task_team[i]; + if (KMP_TASKING_ENABLED(task_team)) { + continue; + } + __kmp_enable_tasking(task_team, this_thr); + for (int j = 0; j < task_team->tt.tt_nproc; ++j) { + kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j]; + if (thread_data->td.td_deque == NULL) { + __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data); + } + } + } + } +} + +// __kmp_task_team_sync: Propagation of task team data from team to threads +// which happens just after the release phase of a team barrier. This may be +// called by any thread, but only for teams with # threads > 1. +void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { + KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + + // Toggle the th_task_state field, to switch which task_team this thread + // refers to + this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state); + + // It is now safe to propagate the task team pointer from the team struct to + // the current thread. + TCW_PTR(this_thr->th.th_task_team, + team->t.t_task_team[this_thr->th.th_task_state]); + KA_TRACE(20, + ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " + "%p from Team #%d (parity=%d)\n", + __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, + team->t.t_id, this_thr->th.th_task_state)); +} + +// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the +// barrier gather phase. Only called by primary thread if #threads in team > 1 +// or if proxy tasks were created. +// +// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off +// by passing in 0 optionally as the last argument. When wait is zero, primary +// thread does not wait for unfinished_threads to reach 0. 
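
__kmp_task_team_setup() and __kmp_task_team_sync() above double-buffer task teams: t_task_team[0] and t_task_team[1] trade roles every barrier, and each thread flips its th_task_state parity bit so it starts referring to the "other" slot while the previous task team drains. A small sketch of that toggle; toy_double_buffer, toy_thread, and toy_sync are illustrative names only:

// Double-buffered resource selected by a per-thread parity bit: slot
// '1 - parity' can be (re)initialized for the next region while slot
// 'parity' is still draining; flipping the bit switches the thread over.
struct toy_double_buffer {
  void *slot[2] = {nullptr, nullptr};
};

struct toy_thread {
  unsigned parity = 0;     // like th_task_state: always 0 or 1
  void *current = nullptr; // like th_task_team
};

inline void toy_sync(toy_thread &thr, toy_double_buffer &team) {
  thr.parity = 1u - thr.parity;        // toggle at the barrier release
  thr.current = team.slot[thr.parity]; // now refer to the other task team
}
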
+void __kmp_task_team_wait( + kmp_info_t *this_thr, + kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { + kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; + + KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); + KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); + + if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { + if (wait) { + KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks " + "(for unfinished_threads to reach 0) on task_team = %p\n", + __kmp_gtid_from_thread(this_thr), task_team)); + // Worker threads may have dropped through to release phase, but could + // still be executing tasks. Wait here for tasks to complete. To avoid + // memory contention, only primary thread checks termination condition. + kmp_flag_32 flag( + RCAST(std::atomic *, + &task_team->tt.tt_unfinished_threads), + 0U); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + } + // Deactivate the old task team, so that the worker threads will stop + // referencing it while spinning. + KA_TRACE( + 20, + ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: " + "setting active to false, setting local and team's pointer to NULL\n", + __kmp_gtid_from_thread(this_thr), task_team)); + KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || + task_team->tt.tt_found_proxy_tasks == TRUE || + task_team->tt.tt_hidden_helper_task_encountered == TRUE); + TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); + TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); + KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); + TCW_SYNC_4(task_team->tt.tt_active, FALSE); + KMP_MB(); + + TCW_PTR(this_thr->th.th_task_team, NULL); + } +} + +// __kmp_tasking_barrier: +// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. +// Internal function to execute all tasks prior to a regular barrier or a join +// barrier. It is a full barrier itself, which unfortunately turns regular +// barriers into double barriers and join barriers into 1 1/2 barriers. +void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { + std::atomic *spin = RCAST( + std::atomic *, + &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); + int flag = FALSE; + KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); + +#if USE_ITT_BUILD + KMP_FSYNC_SPIN_INIT(spin, NULL); +#endif /* USE_ITT_BUILD */ + kmp_flag_32 spin_flag(spin, 0U); + while (!spin_flag.execute_tasks(thread, gtid, TRUE, + &flag USE_ITT_BUILD_ARG(NULL), 0)) { +#if USE_ITT_BUILD + // TODO: What about itt_sync_obj?? + KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); +#endif /* USE_ITT_BUILD */ + + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } + KMP_YIELD(TRUE); + } +#if USE_ITT_BUILD + KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); +#endif /* USE_ITT_BUILD */ +} + +// __kmp_give_task puts a task into a given thread queue if: +// - the queue for that thread was created +// - there's space in that queue +// Because of this, __kmp_push_task needs to check if there's space after +// getting the lock +static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, + kmp_int32 pass) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + kmp_task_team_t *task_team = taskdata->td_task_team; + + KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", + taskdata, tid)); + + // If task_team is NULL something went really bad... 
+ KMP_DEBUG_ASSERT(task_team != NULL); + + bool result = false; + kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; + + if (thread_data->td.td_deque == NULL) { + // There's no queue in this thread, go find another one + // We're guaranteed that at least one thread has a queue + KA_TRACE(30, + ("__kmp_give_task: thread %d has no queue while giving task %p.\n", + tid, taskdata)); + return result; + } + + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + KA_TRACE( + 30, + ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", + taskdata, tid)); + + // if this deque is bigger than the pass ratio give a chance to another + // thread + if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) + return result; + + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + // expand deque to push the task which is not allowed to execute + __kmp_realloc_task_deque(thread, thread_data); + } + + } else { + + __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); + + if (TCR_4(thread_data->td.td_deque_ntasks) >= + TASK_DEQUE_SIZE(thread_data->td)) { + KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " + "thread %d.\n", + taskdata, tid)); + + // if this deque is bigger than the pass ratio give a chance to another + // thread + if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) + goto release_and_exit; + + __kmp_realloc_task_deque(thread, thread_data); + } + } + + // lock is held here, and there is space in the deque + + thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; + // Wrap index. + thread_data->td.td_deque_tail = + (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); + TCW_4(thread_data->td.td_deque_ntasks, + TCR_4(thread_data->td.td_deque_ntasks) + 1); + + result = true; + KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", + taskdata, tid)); + +release_and_exit: + __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); + + return result; +} + +#define PROXY_TASK_FLAG 0x40000000 +/* The finish of the proxy tasks is divided in two pieces: + - the top half is the one that can be done from a thread outside the team + - the bottom half must be run from a thread within the team + + In order to run the bottom half the task gets queued back into one of the + threads of the team. Once the td_incomplete_child_task counter of the parent + is decremented the threads can leave the barriers. So, the bottom half needs + to be queued before the counter is decremented. The top half is therefore + divided in two parts: + - things that can be run before queuing the bottom half + - things that must be run after queuing the bottom half + + This creates a second race as the bottom half can free the task before the + second top half is executed. To avoid this we use the + td_incomplete_child_task of the proxy task to synchronize the top and bottom + half. 
*/ +static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { + KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); + KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); + KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); + + taskdata->td_flags.complete = 1; // mark the task as completed +#if OMPX_TASKGRAPH + taskdata->td_flags.onced = 1; +#endif + + if (taskdata->td_taskgroup) + KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); + + // Create an imaginary children for this task so the bottom half cannot + // release the task before we have completed the second top half + KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG); +} + +static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { +#if KMP_DEBUG + kmp_int32 children = 0; + // Predecrement simulated by "- 1" calculation + children = -1 + +#endif + KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); + KMP_DEBUG_ASSERT(children >= 0); + + // Remove the imaginary children + KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG); +} + +static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); + kmp_info_t *thread = __kmp_threads[gtid]; + + KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); + KMP_DEBUG_ASSERT(taskdata->td_flags.complete == + 1); // top half must run before bottom half + + // We need to wait to make sure the top half is finished + // Spinning here should be ok as this should happen quickly + while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) & + PROXY_TASK_FLAG) > 0) + ; + + __kmp_release_deps(gtid, taskdata); + __kmp_free_task_and_ancestors(gtid, taskdata, thread); +} + +/*! +@ingroup TASKING +@param gtid Global Thread ID of encountering thread +@param ptask Task which execution is completed + +Execute the completion of a proxy task from a thread of that is part of the +team. Run first and bottom halves directly. 
+*/ +void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { + KMP_DEBUG_ASSERT(ptask != NULL); + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); + KA_TRACE( + 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", + gtid, taskdata)); + __kmp_assert_valid_gtid(gtid); + KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); + + __kmp_first_top_half_finish_proxy(taskdata); + __kmp_second_top_half_finish_proxy(taskdata); + __kmp_bottom_half_finish_proxy(gtid, ptask); + + KA_TRACE(10, + ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", + gtid, taskdata)); +} + +void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) { + KMP_DEBUG_ASSERT(ptask != NULL); + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); + + // Enqueue task to complete bottom half completion from a thread within the + // corresponding team + kmp_team_t *team = taskdata->td_team; + kmp_int32 nthreads = team->t.t_nproc; + kmp_info_t *thread; + + // This should be similar to start_k = __kmp_get_random( thread ) % nthreads + // but we cannot use __kmp_get_random here + kmp_int32 start_k = start % nthreads; + kmp_int32 pass = 1; + kmp_int32 k = start_k; + + do { + // For now we're just linearly trying to find a thread + thread = team->t.t_threads[k]; + k = (k + 1) % nthreads; + + // we did a full pass through all the threads + if (k == start_k) + pass = pass << 1; + + } while (!__kmp_give_task(thread, k, ptask, pass)); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) { + // awake at least one thread to execute given task + for (int i = 0; i < nthreads; ++i) { + thread = team->t.t_threads[i]; + if (thread->th.th_sleep_loc != NULL) { + __kmp_null_resume_wrapper(thread); + break; + } + } + } +} + +/*! +@ingroup TASKING +@param ptask Task which execution is completed + +Execute the completion of a proxy task from a thread that could not belong to +the team. +*/ +void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { + KMP_DEBUG_ASSERT(ptask != NULL); + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); + + KA_TRACE( + 10, + ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", + taskdata)); + + KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); + + __kmp_first_top_half_finish_proxy(taskdata); + + __kmpc_give_task(ptask); + + __kmp_second_top_half_finish_proxy(taskdata); + + KA_TRACE( + 10, + ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", + taskdata)); +} + +kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, + kmp_task_t *task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) { + td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION; + td->td_allow_completion_event.ed.task = task; + __kmp_init_tas_lock(&td->td_allow_completion_event.lock); + } + return &td->td_allow_completion_event; +} + +void __kmp_fulfill_event(kmp_event_t *event) { + if (event->type == KMP_EVENT_ALLOW_COMPLETION) { + kmp_task_t *ptask = event->ed.task; + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); + bool detached = false; + int gtid = __kmp_get_gtid(); + + // The associated task might have completed or could be completing at this + // point. 
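/* Editor's illustration (not part of the patch): the allow-completion event
   machinery in this file is what backs the OpenMP 5.0 `detach` clause and
   omp_fulfill_event(). The minimal user-level sketch below assumes
   omp_fulfill_event() is routed by the runtime into __kmp_fulfill_event;
   fulfilling before the task body has finished exercises the "early fulfill"
   case, fulfilling afterwards the "late fulfill" case distinguished in the
   code that follows.

   // build with an OpenMP 5.0 compiler, e.g.: clang -fopenmp detach_demo.c
   #include <omp.h>
   #include <stdio.h>

   int main(void) {
     omp_event_handle_t ev;
   #pragma omp parallel
   #pragma omp single
     {
   #pragma omp task detach(ev)
       printf("task body ran; completion still waits on the event\n");

       omp_fulfill_event(ev); // an "early" fulfill if the task is still deferred

   #pragma omp taskwait // returns only once the detached task is fulfilled and done
     }
     return 0;
   }
*/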
+ // We need to take the lock to avoid races + __kmp_acquire_tas_lock(&event->lock, gtid); + if (taskdata->td_flags.proxy == TASK_PROXY) { + detached = true; + } else { +#if OMPT_SUPPORT + // The OMPT event must occur under mutual exclusion, + // otherwise the tool might access ptask after free + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); +#endif + } + event->type = KMP_EVENT_UNINITIALIZED; + __kmp_release_tas_lock(&event->lock, gtid); + + if (detached) { +#if OMPT_SUPPORT + // We free ptask afterwards and know the task is finished, + // so locking is not necessary + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); +#endif + // If the task detached complete the proxy task + if (gtid >= 0) { + kmp_team_t *team = taskdata->td_team; + kmp_info_t *thread = __kmp_get_thread(); + if (thread->th.th_team == team) { + __kmpc_proxy_task_completed(gtid, ptask); + return; + } + } + + // fallback + __kmpc_proxy_task_completed_ooo(ptask); + } + } +} + +// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task +// for taskloop +// +// thread: allocating thread +// task_src: pointer to source task to be duplicated +// taskloop_recur: used only when dealing with taskgraph, +// indicating whether we need to update task->td_task_id +// returns: a pointer to the allocated kmp_task_t structure (task). +kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src +#if OMPX_TASKGRAPH + , int taskloop_recur +#endif +) { + kmp_task_t *task; + kmp_taskdata_t *taskdata; + kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); + kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task + size_t shareds_offset; + size_t task_size; + + KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, + task_src)); + KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == + TASK_FULL); // it should not be proxy task + KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); + task_size = taskdata_src->td_size_alloc; + + // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
+ KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, + task_size)); +#if USE_FAST_MEMORY + taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); +#else + taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); +#endif /* USE_FAST_MEMORY */ + KMP_MEMCPY(taskdata, taskdata_src, task_size); + + task = KMP_TASKDATA_TO_TASK(taskdata); + + // Initialize new task (only specific fields not affected by memcpy) +#if OMPX_TASKGRAPH + if (!taskdata->is_taskgraph || taskloop_recur) + taskdata->td_task_id = KMP_GEN_TASK_ID(); + else if (taskdata->is_taskgraph && + __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status)) + taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id); +#else + taskdata->td_task_id = KMP_GEN_TASK_ID(); +#endif + if (task->shareds != NULL) { // need setup shareds pointer + shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; + task->shareds = &((char *)taskdata)[shareds_offset]; + KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == + 0); + } + taskdata->td_alloc_thread = thread; + taskdata->td_parent = parent_task; + // task inherits the taskgroup from the parent task + taskdata->td_taskgroup = parent_task->td_taskgroup; + // tied task needs to initialize the td_last_tied at creation, + // untied one does this when it is scheduled for execution + if (taskdata->td_flags.tiedness == TASK_TIED) + taskdata->td_last_tied = taskdata; + + // Only need to keep track of child task counts if team parallel and tasking + // not serialized + if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { + KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); + if (parent_task->td_taskgroup) + KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); + // Only need to keep track of allocated child tasks for explicit tasks since + // implicit not deallocated + if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) + KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); + } + + KA_TRACE(20, + ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", + thread, taskdata, taskdata->td_parent)); +#if OMPT_SUPPORT + if (UNLIKELY(ompt_enabled.enabled)) + __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); +#endif + return task; +} + +// Routine optionally generated by the compiler for setting the lastprivate flag +// and calling needed constructors for private/firstprivate objects +// (used to form taskloop tasks from pattern task) +// Parameters: dest task, src task, lastprivate flag. +typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); + +KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); + +// class to encapsulate manipulating loop bounds in a taskloop task. +// this abstracts away the Intel vs GOMP taskloop interface for setting/getting +// the loop bound variables. 
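/* Editor's illustration (not part of the patch): everything from the bounds
   class below through __kmp_taskloop implements the user-level `taskloop`
   construct. As a concrete instance of the case-1 ("grainsize provided",
   non-strict) arithmetic used later: for tc = 1000 iterations and
   grainsize(64), num_tasks = 1000 / 64 = 15, the grainsize is rebalanced to
   1000 / 15 = 66, and extras = 1000 % 15 = 10, so 10 tasks run 67 iterations
   and 5 run 66 (10*67 + 5*66 = 1000). A minimal program that reaches this
   code path:

   // build with: clang -fopenmp taskloop_demo.c
   #include <omp.h>
   #include <stdio.h>

   #define N 1000

   int main(void) {
     static double a[N];
   #pragma omp parallel
   #pragma omp single
     {
   #pragma omp taskloop grainsize(64)
       for (int i = 0; i < N; i++)
         a[i] = 2.0 * i;
     }
     printf("a[N-1] = %g\n", a[N - 1]);
     return 0;
   }
*/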
+class kmp_taskloop_bounds_t { + kmp_task_t *task; + const kmp_taskdata_t *taskdata; + size_t lower_offset; + size_t upper_offset; + +public: + kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) + : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), + lower_offset((char *)lb - (char *)task), + upper_offset((char *)ub - (char *)task) { + KMP_DEBUG_ASSERT((char *)lb > (char *)_task); + KMP_DEBUG_ASSERT((char *)ub > (char *)_task); + } + kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) + : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), + lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} + size_t get_lower_offset() const { return lower_offset; } + size_t get_upper_offset() const { return upper_offset; } + kmp_uint64 get_lb() const { + kmp_int64 retval; +#if defined(KMP_GOMP_COMPAT) + // Intel task just returns the lower bound normally + if (!taskdata->td_flags.native) { + retval = *(kmp_int64 *)((char *)task + lower_offset); + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); + retval = (kmp_int64)*lb; + } else { + kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); + retval = (kmp_int64)*lb; + } + } +#else + (void)taskdata; + retval = *(kmp_int64 *)((char *)task + lower_offset); +#endif // defined(KMP_GOMP_COMPAT) + return retval; + } + kmp_uint64 get_ub() const { + kmp_int64 retval; +#if defined(KMP_GOMP_COMPAT) + // Intel task just returns the upper bound normally + if (!taskdata->td_flags.native) { + retval = *(kmp_int64 *)((char *)task + upper_offset); + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; + retval = (kmp_int64)*ub; + } else { + kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; + retval = (kmp_int64)*ub; + } + } +#else + retval = *(kmp_int64 *)((char *)task + upper_offset); +#endif // defined(KMP_GOMP_COMPAT) + return retval; + } + void set_lb(kmp_uint64 lb) { +#if defined(KMP_GOMP_COMPAT) + // Intel task just sets the lower bound normally + if (!taskdata->td_flags.native) { + *(kmp_uint64 *)((char *)task + lower_offset) = lb; + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); + *lower = (kmp_uint32)lb; + } else { + kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); + *lower = (kmp_uint64)lb; + } + } +#else + *(kmp_uint64 *)((char *)task + lower_offset) = lb; +#endif // defined(KMP_GOMP_COMPAT) + } + void set_ub(kmp_uint64 ub) { +#if defined(KMP_GOMP_COMPAT) + // Intel task just sets the upper bound normally + if (!taskdata->td_flags.native) { + *(kmp_uint64 *)((char *)task + upper_offset) = ub; + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; + *upper = (kmp_uint32)ub; + } else { + kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; + *upper = (kmp_uint64)ub; + } + } +#else + *(kmp_uint64 *)((char *)task + upper_offset) = ub; +#endif // defined(KMP_GOMP_COMPAT) + } +}; + +// __kmp_taskloop_linear: Start tasks of the taskloop linearly +// +// loc Source location information +// gtid Global thread ID +// task Pattern task, exposes the loop iteration range +// lb Pointer to loop lower bound in task structure +// ub 
Pointer to loop upper bound in task structure +// st Loop stride +// ub_glob Global upper bound (used for lastprivate check) +// num_tasks Number of tasks to execute +// grainsize Number of loop iterations per task +// extras Number of chunks with grainsize+1 iterations +// last_chunk Reduction of grainsize for last task +// tc Iterations count +// task_dup Tasks duplication routine +// codeptr_ra Return address for OMPT events +void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + kmp_uint64 ub_glob, kmp_uint64 num_tasks, + kmp_uint64 grainsize, kmp_uint64 extras, + kmp_int64 last_chunk, kmp_uint64 tc, +#if OMPT_SUPPORT + void *codeptr_ra, +#endif + void *task_dup) { + KMP_COUNT_BLOCK(OMP_TASKLOOP); + KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); + p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; + // compiler provides global bounds here + kmp_taskloop_bounds_t task_bounds(task, lb, ub); + kmp_uint64 lower = task_bounds.get_lb(); + kmp_uint64 upper = task_bounds.get_ub(); + kmp_uint64 i; + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *current_task = thread->th.th_current_task; + kmp_task_t *next_task; + kmp_int32 lastpriv = 0; + + KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + + (last_chunk < 0 ? last_chunk : extras)); + KMP_DEBUG_ASSERT(num_tasks > extras); + KMP_DEBUG_ASSERT(num_tasks > 0); + KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " + "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n", + gtid, num_tasks, grainsize, extras, last_chunk, lower, upper, + ub_glob, st, task_dup)); + + // Launch num_tasks tasks, assign grainsize iterations each task + for (i = 0; i < num_tasks; ++i) { + kmp_uint64 chunk_minus_1; + if (extras == 0) { + chunk_minus_1 = grainsize - 1; + } else { + chunk_minus_1 = grainsize; + --extras; // first extras iterations get bigger chunk (grainsize+1) + } + upper = lower + st * chunk_minus_1; + if (upper > *ub) { + upper = *ub; + } + if (i == num_tasks - 1) { + // schedule the last task, set lastprivate flag if needed + if (st == 1) { // most common case + KMP_DEBUG_ASSERT(upper == *ub); + if (upper == ub_glob) + lastpriv = 1; + } else if (st > 0) { // positive loop stride + KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); + if ((kmp_uint64)st > ub_glob - upper) + lastpriv = 1; + } else { // negative loop stride + KMP_DEBUG_ASSERT(upper + st < *ub); + if (upper - ub_glob < (kmp_uint64)(-st)) + lastpriv = 1; + } + } + +#if OMPX_TASKGRAPH + next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0); +#else + next_task = __kmp_task_dup_alloc(thread, task); // allocate new task +#endif + + kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); + kmp_taskloop_bounds_t next_task_bounds = + kmp_taskloop_bounds_t(next_task, task_bounds); + + // adjust task-specific bounds + next_task_bounds.set_lb(lower); + if (next_taskdata->td_flags.native) { + next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); + } else { + next_task_bounds.set_ub(upper); + } + if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, + // etc. 
+ ptask_dup(next_task, task, lastpriv); + KA_TRACE(40, + ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " + "upper %lld stride %lld, (offsets %p %p)\n", + gtid, i, next_task, lower, upper, st, + next_task_bounds.get_lower_offset(), + next_task_bounds.get_upper_offset())); +#if OMPT_SUPPORT + __kmp_omp_taskloop_task(NULL, gtid, next_task, + codeptr_ra); // schedule new task +#if OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_dispatch) { + OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk, + lower, upper, st); + } +#endif // OMPT_OPTIONAL +#else + __kmp_omp_task(gtid, next_task, true); // schedule new task +#endif + lower = upper + st; // adjust lower bound for the next iteration + } + // free the pattern task and exit + __kmp_task_start(gtid, task, current_task); // make internal bookkeeping + // do not execute the pattern task, just do internal bookkeeping + __kmp_task_finish(gtid, task, current_task); +} + +// Structure to keep taskloop parameters for auxiliary task +// kept in the shareds of the task structure. +typedef struct __taskloop_params { + kmp_task_t *task; + kmp_uint64 *lb; + kmp_uint64 *ub; + void *task_dup; + kmp_int64 st; + kmp_uint64 ub_glob; + kmp_uint64 num_tasks; + kmp_uint64 grainsize; + kmp_uint64 extras; + kmp_int64 last_chunk; + kmp_uint64 tc; + kmp_uint64 num_t_min; +#if OMPT_SUPPORT + void *codeptr_ra; +#endif +} __taskloop_params_t; + +void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, + kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, + kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, + kmp_uint64, +#if OMPT_SUPPORT + void *, +#endif + void *); + +// Execute part of the taskloop submitted as a task. +int __kmp_taskloop_task(int gtid, void *ptask) { + __taskloop_params_t *p = + (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; + kmp_task_t *task = p->task; + kmp_uint64 *lb = p->lb; + kmp_uint64 *ub = p->ub; + void *task_dup = p->task_dup; + // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; + kmp_int64 st = p->st; + kmp_uint64 ub_glob = p->ub_glob; + kmp_uint64 num_tasks = p->num_tasks; + kmp_uint64 grainsize = p->grainsize; + kmp_uint64 extras = p->extras; + kmp_int64 last_chunk = p->last_chunk; + kmp_uint64 tc = p->tc; + kmp_uint64 num_t_min = p->num_t_min; +#if OMPT_SUPPORT + void *codeptr_ra = p->codeptr_ra; +#endif +#if KMP_DEBUG + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + KMP_DEBUG_ASSERT(task != NULL); + KA_TRACE(20, + ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" + " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", + gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, + st, task_dup)); +#endif + KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); + if (num_tasks > num_t_min) + __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, + grainsize, extras, last_chunk, tc, num_t_min, +#if OMPT_SUPPORT + codeptr_ra, +#endif + task_dup); + else + __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, + grainsize, extras, last_chunk, tc, +#if OMPT_SUPPORT + codeptr_ra, +#endif + task_dup); + + KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); + return 0; +} + +// Schedule part of the taskloop as a task, +// execute the rest of the taskloop. 
+// +// loc Source location information +// gtid Global thread ID +// task Pattern task, exposes the loop iteration range +// lb Pointer to loop lower bound in task structure +// ub Pointer to loop upper bound in task structure +// st Loop stride +// ub_glob Global upper bound (used for lastprivate check) +// num_tasks Number of tasks to execute +// grainsize Number of loop iterations per task +// extras Number of chunks with grainsize+1 iterations +// last_chunk Reduction of grainsize for last task +// tc Iterations count +// num_t_min Threshold to launch tasks recursively +// task_dup Tasks duplication routine +// codeptr_ra Return address for OMPT events +void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + kmp_uint64 ub_glob, kmp_uint64 num_tasks, + kmp_uint64 grainsize, kmp_uint64 extras, + kmp_int64 last_chunk, kmp_uint64 tc, + kmp_uint64 num_t_min, +#if OMPT_SUPPORT + void *codeptr_ra, +#endif + void *task_dup) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + KMP_DEBUG_ASSERT(task != NULL); + KMP_DEBUG_ASSERT(num_tasks > num_t_min); + KA_TRACE(20, + ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" + " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", + gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, + st, task_dup)); + p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; + kmp_uint64 lower = *lb; + kmp_info_t *thread = __kmp_threads[gtid]; + // kmp_taskdata_t *current_task = thread->th.th_current_task; + kmp_task_t *next_task; + size_t lower_offset = + (char *)lb - (char *)task; // remember offset of lb in the task structure + size_t upper_offset = + (char *)ub - (char *)task; // remember offset of ub in the task structure + + KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + + (last_chunk < 0 ? last_chunk : extras)); + KMP_DEBUG_ASSERT(num_tasks > extras); + KMP_DEBUG_ASSERT(num_tasks > 0); + + // split the loop in two halves + kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; + kmp_int64 last_chunk0 = 0, last_chunk1 = 0; + kmp_uint64 gr_size0 = grainsize; + kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute + kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task + if (last_chunk < 0) { + ext0 = ext1 = 0; + last_chunk1 = last_chunk; + tc0 = grainsize * n_tsk0; + tc1 = tc - tc0; + } else if (n_tsk0 <= extras) { + gr_size0++; // integrate extras into grainsize + ext0 = 0; // no extra iters in 1st half + ext1 = extras - n_tsk0; // remaining extras + tc0 = gr_size0 * n_tsk0; + tc1 = tc - tc0; + } else { // n_tsk0 > extras + ext1 = 0; // no extra iters in 2nd half + ext0 = extras; + tc1 = grainsize * n_tsk1; + tc0 = tc - tc1; + } + ub0 = lower + st * (tc0 - 1); + lb1 = ub0 + st; + + // create pattern task for 2nd half of the loop +#if OMPX_TASKGRAPH + next_task = __kmp_task_dup_alloc(thread, task, + /* taskloop_recur */ 1); +#else + next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task +#endif + // adjust lower bound (upper bound is not changed) for the 2nd half + *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; + if (ptask_dup != NULL) // construct firstprivates, etc. 
+ ptask_dup(next_task, task, 0); + *ub = ub0; // adjust upper bound for the 1st half + + // create auxiliary task for 2nd half of the loop + // make sure new task has same parent task as the pattern task + kmp_taskdata_t *current_task = thread->th.th_current_task; + thread->th.th_current_task = taskdata->td_parent; + kmp_task_t *new_task = + __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), + sizeof(__taskloop_params_t), &__kmp_taskloop_task); + // restore current task + thread->th.th_current_task = current_task; + __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; + p->task = next_task; + p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); + p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); + p->task_dup = task_dup; + p->st = st; + p->ub_glob = ub_glob; + p->num_tasks = n_tsk1; + p->grainsize = grainsize; + p->extras = ext1; + p->last_chunk = last_chunk1; + p->tc = tc1; + p->num_t_min = num_t_min; +#if OMPT_SUPPORT + p->codeptr_ra = codeptr_ra; +#endif + +#if OMPX_TASKGRAPH + kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task); + new_task_data->tdg = taskdata->tdg; + new_task_data->is_taskgraph = 0; +#endif + +#if OMPT_SUPPORT + // schedule new task with correct return address for OMPT events + __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); +#else + __kmp_omp_task(gtid, new_task, true); // schedule new task +#endif + + // execute the 1st half of current subrange + if (n_tsk0 > num_t_min) + __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, + ext0, last_chunk0, tc0, num_t_min, +#if OMPT_SUPPORT + codeptr_ra, +#endif + task_dup); + else + __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, + gr_size0, ext0, last_chunk0, tc0, +#if OMPT_SUPPORT + codeptr_ra, +#endif + task_dup); + + KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid)); +} + +static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_uint64 grainsize, + int modifier, void *task_dup) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + KMP_DEBUG_ASSERT(task != NULL); + if (nogroup == 0) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_taskgroup(loc, gtid); + } + +#if OMPX_TASKGRAPH + KMP_ATOMIC_DEC(&__kmp_tdg_task_id); +#endif + // ========================================================================= + // calculate loop parameters + kmp_taskloop_bounds_t task_bounds(task, lb, ub); + kmp_uint64 tc; + // compiler provides global bounds here + kmp_uint64 lower = task_bounds.get_lb(); + kmp_uint64 upper = task_bounds.get_ub(); + kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag + kmp_uint64 num_tasks = 0, extras = 0; + kmp_int64 last_chunk = + 0; // reduce grainsize of last task by last_chunk in strict mode + kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *current_task = thread->th.th_current_task; + + KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " + "grain %llu(%d, %d), dup %p\n", + gtid, taskdata, lower, upper, st, grainsize, sched, modifier, + task_dup)); + + // compute trip count + if (st == 1) { // most common case + tc = upper - lower + 1; + } else if (st < 0) { + tc = (lower - upper) / (-st) + 1; + } else { // st > 0 + tc = (upper - lower) / st + 1; + } + if (tc == 0) { + KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid)); + // free the 
pattern task and exit + __kmp_task_start(gtid, task, current_task); + // do not execute anything for zero-trip loop + __kmp_task_finish(gtid, task, current_task); + return; + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), + &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + + if (num_tasks_min == 0) + // TODO: can we choose better default heuristic? + num_tasks_min = + KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); + + // compute num_tasks/grainsize based on the input provided + switch (sched) { + case 0: // no schedule clause specified, we can choose the default + // let's try to schedule (team_size*10) tasks + grainsize = thread->th.th_team_nproc * 10; + KMP_FALLTHROUGH(); + case 2: // num_tasks provided + if (grainsize > tc) { + num_tasks = tc; // too big num_tasks requested, adjust values + grainsize = 1; + extras = 0; + } else { + num_tasks = grainsize; + grainsize = tc / num_tasks; + extras = tc % num_tasks; + } + break; + case 1: // grainsize provided + if (grainsize > tc) { + num_tasks = 1; + grainsize = tc; // too big grainsize requested, adjust values + extras = 0; + } else { + if (modifier) { + num_tasks = (tc + grainsize - 1) / grainsize; + last_chunk = tc - (num_tasks * grainsize); + extras = 0; + } else { + num_tasks = tc / grainsize; + // adjust grainsize for balanced distribution of iterations + grainsize = tc / num_tasks; + extras = tc % num_tasks; + } + } + break; + default: + KMP_ASSERT2(0, "unknown scheduling of taskloop"); + } + + KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + + (last_chunk < 0 ? 
last_chunk : extras)); + KMP_DEBUG_ASSERT(num_tasks > extras); + KMP_DEBUG_ASSERT(num_tasks > 0); + // ========================================================================= + + // check if clause value first + // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) + if (if_val == 0) { // if(0) specified, mark task as serial + taskdata->td_flags.task_serial = 1; + taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied + // always start serial tasks linearly + __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, + grainsize, extras, last_chunk, tc, +#if OMPT_SUPPORT + OMPT_GET_RETURN_ADDRESS(0), +#endif + task_dup); + // !taskdata->td_flags.native => currently force linear spawning of tasks + // for GOMP_taskloop + } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { + KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" + "(%lld), grain %llu, extras %llu, last_chunk %lld\n", + gtid, tc, num_tasks, num_tasks_min, grainsize, extras, + last_chunk)); + __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, + grainsize, extras, last_chunk, tc, num_tasks_min, +#if OMPT_SUPPORT + OMPT_GET_RETURN_ADDRESS(0), +#endif + task_dup); + } else { + KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu" + "(%lld), grain %llu, extras %llu, last_chunk %lld\n", + gtid, tc, num_tasks, num_tasks_min, grainsize, extras, + last_chunk)); + __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, + grainsize, extras, last_chunk, tc, +#if OMPT_SUPPORT + OMPT_GET_RETURN_ADDRESS(0), +#endif + task_dup); + } + +#if OMPT_SUPPORT && OMPT_OPTIONAL + if (ompt_enabled.ompt_callback_work) { + ompt_callbacks.ompt_callback(ompt_callback_work)( + ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), + &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); + } +#endif + + if (nogroup == 0) { +#if OMPT_SUPPORT && OMPT_OPTIONAL + OMPT_STORE_RETURN_ADDRESS(gtid); +#endif + __kmpc_end_taskgroup(loc, gtid); + } + KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid)); +} + +/*! +@ingroup TASKING +@param loc Source location information +@param gtid Global thread ID +@param task Task structure +@param if_val Value of the if clause +@param lb Pointer to loop lower bound in task structure +@param ub Pointer to loop upper bound in task structure +@param st Loop stride +@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise +@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks +@param grainsize Schedule value if specified +@param task_dup Tasks duplication routine + +Execute the taskloop construct. +*/ +void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, + int sched, kmp_uint64 grainsize, void *task_dup) { + __kmp_assert_valid_gtid(gtid); + KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid)); + __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, + 0, task_dup); + KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); +} + +/*! 
+@ingroup TASKING +@param loc Source location information +@param gtid Global thread ID +@param task Task structure +@param if_val Value of the if clause +@param lb Pointer to loop lower bound in task structure +@param ub Pointer to loop upper bound in task structure +@param st Loop stride +@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise +@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks +@param grainsize Schedule value if specified +@param modifier Modifier 'strict' for sched, 1 if present, 0 otherwise +@param task_dup Tasks duplication routine + +Execute the taskloop construct. +*/ +void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_uint64 grainsize, + int modifier, void *task_dup) { + __kmp_assert_valid_gtid(gtid); + KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid)); + __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, + modifier, task_dup); + KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid)); +} + +/*! +@ingroup TASKING +@param gtid Global Thread ID of current thread +@return Returns a pointer to the thread's current task async handle. If no task +is present or gtid is invalid, returns NULL. + +Acqurires a pointer to the target async handle from the current task. +*/ +void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) { + if (gtid == KMP_GTID_DNE) + return NULL; + + kmp_info_t *thread = __kmp_thread_from_gtid(gtid); + kmp_taskdata_t *taskdata = thread->th.th_current_task; + + if (!taskdata) + return NULL; + + return &taskdata->td_target_data.async_handle; +} + +/*! +@ingroup TASKING +@param gtid Global Thread ID of current thread +@return Returns TRUE if the current task being executed of the given thread has +a task team allocated to it. Otherwise, returns FALSE. + +Checks if the current thread has a task team. 
+*/ +bool __kmpc_omp_has_task_team(kmp_int32 gtid) { + if (gtid == KMP_GTID_DNE) + return FALSE; + + kmp_info_t *thread = __kmp_thread_from_gtid(gtid); + kmp_taskdata_t *taskdata = thread->th.th_current_task; + + if (!taskdata) + return FALSE; + + return taskdata->td_task_team != NULL; +} + +#if OMPX_TASKGRAPH +// __kmp_find_tdg: identify a TDG through its ID +// gtid: Global Thread ID +// tdg_id: ID of the TDG +// returns: If a TDG corresponding to this ID is found and not +// its initial state, return the pointer to it, otherwise nullptr +static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) { + kmp_tdg_info_t *res = nullptr; + if (__kmp_max_tdgs == 0) + return res; + + if (__kmp_global_tdgs == NULL) + __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate( + sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs); + + if ((__kmp_global_tdgs[tdg_id]) && + (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE)) + res = __kmp_global_tdgs[tdg_id]; + return res; +} + +// __kmp_print_tdg_dot: prints the TDG to a dot file +// tdg: ID of the TDG +void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) { + kmp_int32 tdg_id = tdg->tdg_id; + KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id)); + + char file_name[20]; + sprintf(file_name, "tdg_%d.dot", tdg_id); + kmp_safe_raii_file_t tdg_file(file_name, "w"); + + kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks); + fprintf(tdg_file, + "digraph TDG {\n" + " compound=true\n" + " subgraph cluster {\n" + " label=TDG_%d\n", + tdg_id); + for (kmp_int32 i = 0; i < num_tasks; i++) { + fprintf(tdg_file, " %d[style=bold]\n", i); + } + fprintf(tdg_file, " }\n"); + for (kmp_int32 i = 0; i < num_tasks; i++) { + kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors; + kmp_int32 *successors = tdg->record_map[i].successors; + if (nsuccessors > 0) { + for (kmp_int32 j = 0; j < nsuccessors; j++) + fprintf(tdg_file, " %d -> %d \n", i, successors[j]); + } + } + fprintf(tdg_file, "}"); + KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id)); +} + +// __kmp_start_record: launch the execution of a previous +// recorded TDG +// gtid: Global Thread ID +// tdg: ID of the TDG +void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) { + KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY); + KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid, + tdg->tdg_id, tdg->num_roots)); + kmp_node_info_t *this_record_map = tdg->record_map; + kmp_int32 *this_root_tasks = tdg->root_tasks; + kmp_int32 this_num_roots = tdg->num_roots; + kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks); + + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *parent_task = thread->th.th_current_task; + + if (tdg->rec_taskred_data) { + __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data); + } + + for (kmp_int32 j = 0; j < this_num_tasks; j++) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task); + + td->td_parent = parent_task; + this_record_map[j].parent_task = parent_task; + + kmp_taskgroup_t *parent_taskgroup = + this_record_map[j].parent_task->td_taskgroup; + + KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter, + this_record_map[j].npredecessors); + KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks); + + if (parent_taskgroup) { + KMP_ATOMIC_INC(&parent_taskgroup->count); + // The taskgroup is different so we must update it + td->td_taskgroup = parent_taskgroup; + } else if (td->td_taskgroup != nullptr) { + // If the parent doesnt have a taskgroup, remove it 
from the task + td->td_taskgroup = nullptr; + } + if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT) + KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks); + } + + for (kmp_int32 j = 0; j < this_num_roots; ++j) { + __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true); + } + KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid, + tdg->tdg_id, tdg->num_roots)); +} + +// __kmp_start_record: set up a TDG structure and turn the +// recording flag to true +// gtid: Global Thread ID of the encountering thread +// input_flags: Flags associated with the TDG +// tdg_id: ID of the TDG to record +static inline void __kmp_start_record(kmp_int32 gtid, + kmp_taskgraph_flags_t *flags, + kmp_int32 tdg_id) { + kmp_tdg_info_t *tdg = + (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t)); + __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg; + // Initializing the TDG structure + tdg->tdg_id = tdg_id; + tdg->map_size = INIT_MAPSIZE; + tdg->num_roots = -1; + tdg->root_tasks = nullptr; + tdg->tdg_status = KMP_TDG_RECORDING; + tdg->rec_num_taskred = 0; + tdg->rec_taskred_data = nullptr; + KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0); + + // Initializing the list of nodes in this TDG + kmp_node_info_t *this_record_map = + (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t)); + for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) { + kmp_int32 *successorsList = + (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32)); + this_record_map[i].task = nullptr; + this_record_map[i].successors = successorsList; + this_record_map[i].nsuccessors = 0; + this_record_map[i].npredecessors = 0; + this_record_map[i].successors_size = __kmp_successors_size; + KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0); + } + + __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map; +} + +// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark +// the beginning of the record process of a task region +// loc_ref: Location of TDG, not used yet +// gtid: Global Thread ID of the encountering thread +// input_flags: Flags associated with the TDG +// tdg_id: ID of the TDG to record, for now, incremental integer +// returns: 1 if we record, otherwise, 0 +kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 input_flags, kmp_int32 tdg_id) { + + kmp_int32 res; + kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags; + KA_TRACE(10, + ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n", + gtid, loc_ref, input_flags, tdg_id)); + + if (__kmp_max_tdgs == 0) { + KA_TRACE( + 10, + ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, " + "__kmp_max_tdgs = 0\n", + gtid, loc_ref, input_flags, tdg_id)); + return 1; + } + + __kmpc_taskgroup(loc_ref, gtid); + if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) { + // TODO: use re_record flag + __kmp_exec_tdg(gtid, tdg); + res = 0; + } else { + __kmp_curr_tdg_idx = tdg_id; + KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs); + __kmp_start_record(gtid, flags, tdg_id); + __kmp_num_tdg++; + res = 1; + } + KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n", + gtid, tdg_id, res ? 
"record" : "execute")); + return res; +} + +// __kmp_end_record: set up a TDG after recording it +// gtid: Global thread ID +// tdg: Pointer to the TDG +void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) { + // Store roots + kmp_node_info_t *this_record_map = tdg->record_map; + kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks); + kmp_int32 *this_root_tasks = + (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32)); + kmp_int32 this_map_size = tdg->map_size; + kmp_int32 this_num_roots = 0; + kmp_info_t *thread = __kmp_threads[gtid]; + + for (kmp_int32 i = 0; i < this_num_tasks; i++) { + if (this_record_map[i].npredecessors == 0) { + this_root_tasks[this_num_roots++] = i; + } + } + + // Update with roots info and mapsize + tdg->map_size = this_map_size; + tdg->num_roots = this_num_roots; + tdg->root_tasks = this_root_tasks; + KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING); + tdg->tdg_status = KMP_TDG_READY; + + if (thread->th.th_current_task->td_dephash) { + __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash); + thread->th.th_current_task->td_dephash = NULL; + } + + // Reset predecessor counter + for (kmp_int32 i = 0; i < this_num_tasks; i++) { + KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, + this_record_map[i].npredecessors); + } + KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0); + + if (__kmp_tdg_dot) + __kmp_print_tdg_dot(tdg); +} + +// __kmpc_end_record_task: wrapper around __kmp_end_record to mark +// the end of recording phase +// +// loc_ref: Source location information +// gtid: Global thread ID +// input_flags: Flags attached to the graph +// tdg_id: ID of the TDG just finished recording +void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 input_flags, kmp_int32 tdg_id) { + kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id); + + KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording" + " tdg=%d with flags=%d\n", + gtid, loc_ref, tdg_id, input_flags)); + if (__kmp_max_tdgs) { + // TODO: use input_flags->nowait + __kmpc_end_taskgroup(loc_ref, gtid); + if (__kmp_tdg_is_recording(tdg->tdg_status)) + __kmp_end_record(gtid, tdg); + } + KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording" + " tdg=%d, its status is now READY\n", + gtid, loc_ref, tdg_id)); +} +#endif diff --git a/third_party/openmp/kmp_threadprivate.cpp b/third_party/openmp/kmp_threadprivate.cpp new file mode 100644 index 000000000..b79ac7d6d --- /dev/null +++ b/third_party/openmp/kmp_threadprivate.cpp @@ -0,0 +1,798 @@ +/* + * kmp_threadprivate.cpp -- OpenMP threadprivate support library + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_i18n.h" +#include "kmp_itt.h" + +#define USE_CHECKS_COMMON + +#define KMP_INLINE_SUBR 1 + +void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr, + void *data_addr, size_t pc_size); +struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr, + void *data_addr, + size_t pc_size); + +struct shared_table __kmp_threadprivate_d_table; + +static +#ifdef KMP_INLINE_SUBR + __forceinline +#endif + struct private_common * + __kmp_threadprivate_find_task_common(struct common_table *tbl, int gtid, + void *pc_addr) + +{ + struct private_common *tn; + +#ifdef KMP_TASK_COMMON_DEBUG + KC_TRACE(10, ("__kmp_threadprivate_find_task_common: thread#%d, called with " + "address %p\n", + gtid, pc_addr)); + dump_list(); +#endif + + for (tn = tbl->data[KMP_HASH(pc_addr)]; tn; tn = tn->next) { + if (tn->gbl_addr == pc_addr) { +#ifdef KMP_TASK_COMMON_DEBUG + KC_TRACE(10, ("__kmp_threadprivate_find_task_common: thread#%d, found " + "node %p on list\n", + gtid, pc_addr)); +#endif + return tn; + } + } + return 0; +} + +static +#ifdef KMP_INLINE_SUBR + __forceinline +#endif + struct shared_common * + __kmp_find_shared_task_common(struct shared_table *tbl, int gtid, + void *pc_addr) { + struct shared_common *tn; + + for (tn = tbl->data[KMP_HASH(pc_addr)]; tn; tn = tn->next) { + if (tn->gbl_addr == pc_addr) { +#ifdef KMP_TASK_COMMON_DEBUG + KC_TRACE( + 10, + ("__kmp_find_shared_task_common: thread#%d, found node %p on list\n", + gtid, pc_addr)); +#endif + return tn; + } + } + return 0; +} + +// Create a template for the data initialized storage. Either the template is +// NULL indicating zero fill, or the template is a copy of the original data. +static struct private_data *__kmp_init_common_data(void *pc_addr, + size_t pc_size) { + struct private_data *d; + size_t i; + char *p; + + d = (struct private_data *)__kmp_allocate(sizeof(struct private_data)); + /* + d->data = 0; // AC: commented out because __kmp_allocate zeroes the + memory + d->next = 0; + */ + d->size = pc_size; + d->more = 1; + + p = (char *)pc_addr; + + for (i = pc_size; i > 0; --i) { + if (*p++ != '\0') { + d->data = __kmp_allocate(pc_size); + KMP_MEMCPY(d->data, pc_addr, pc_size); + break; + } + } + + return d; +} + +// Initialize the data area from the template. +static void __kmp_copy_common_data(void *pc_addr, struct private_data *d) { + char *addr = (char *)pc_addr; + + for (size_t offset = 0; d != 0; d = d->next) { + for (int i = d->more; i > 0; --i) { + if (d->data == 0) + memset(&addr[offset], '\0', d->size); + else + KMP_MEMCPY(&addr[offset], d->data, d->size); + offset += d->size; + } + } +} + +/* we are called from __kmp_serial_initialize() with __kmp_initz_lock held. 
*/ +void __kmp_common_initialize(void) { + if (!TCR_4(__kmp_init_common)) { + int q; +#ifdef KMP_DEBUG + int gtid; +#endif + + __kmp_threadpriv_cache_list = NULL; + +#ifdef KMP_DEBUG + /* verify the uber masters were initialized */ + for (gtid = 0; gtid < __kmp_threads_capacity; gtid++) + if (__kmp_root[gtid]) { + KMP_DEBUG_ASSERT(__kmp_root[gtid]->r.r_uber_thread); + for (q = 0; q < KMP_HASH_TABLE_SIZE; ++q) + KMP_DEBUG_ASSERT( + !__kmp_root[gtid]->r.r_uber_thread->th.th_pri_common->data[q]); + /* __kmp_root[ gitd ]-> r.r_uber_thread -> + * th.th_pri_common -> data[ q ] = 0;*/ + } +#endif /* KMP_DEBUG */ + + for (q = 0; q < KMP_HASH_TABLE_SIZE; ++q) + __kmp_threadprivate_d_table.data[q] = 0; + + TCW_4(__kmp_init_common, TRUE); + } +} + +/* Call all destructors for threadprivate data belonging to all threads. + Currently unused! */ +void __kmp_common_destroy(void) { + if (TCR_4(__kmp_init_common)) { + int q; + + TCW_4(__kmp_init_common, FALSE); + + for (q = 0; q < KMP_HASH_TABLE_SIZE; ++q) { + int gtid; + struct private_common *tn; + struct shared_common *d_tn; + + /* C++ destructors need to be called once per thread before exiting. + Don't call destructors for primary thread though unless we used copy + constructor */ + + for (d_tn = __kmp_threadprivate_d_table.data[q]; d_tn; + d_tn = d_tn->next) { + if (d_tn->is_vec) { + if (d_tn->dt.dtorv != 0) { + for (gtid = 0; gtid < __kmp_all_nth; ++gtid) { + if (__kmp_threads[gtid]) { + if ((__kmp_foreign_tp) ? (!KMP_INITIAL_GTID(gtid)) + : (!KMP_UBER_GTID(gtid))) { + tn = __kmp_threadprivate_find_task_common( + __kmp_threads[gtid]->th.th_pri_common, gtid, + d_tn->gbl_addr); + if (tn) { + (*d_tn->dt.dtorv)(tn->par_addr, d_tn->vec_len); + } + } + } + } + if (d_tn->obj_init != 0) { + (*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len); + } + } + } else { + if (d_tn->dt.dtor != 0) { + for (gtid = 0; gtid < __kmp_all_nth; ++gtid) { + if (__kmp_threads[gtid]) { + if ((__kmp_foreign_tp) ? (!KMP_INITIAL_GTID(gtid)) + : (!KMP_UBER_GTID(gtid))) { + tn = __kmp_threadprivate_find_task_common( + __kmp_threads[gtid]->th.th_pri_common, gtid, + d_tn->gbl_addr); + if (tn) { + (*d_tn->dt.dtor)(tn->par_addr); + } + } + } + } + if (d_tn->obj_init != 0) { + (*d_tn->dt.dtor)(d_tn->obj_init); + } + } + } + } + __kmp_threadprivate_d_table.data[q] = 0; + } + } +} + +/* Call all destructors for threadprivate data belonging to this thread */ +void __kmp_common_destroy_gtid(int gtid) { + struct private_common *tn; + struct shared_common *d_tn; + + if (!TCR_4(__kmp_init_gtid)) { + // This is possible when one of multiple roots initiates early library + // termination in a sequential region while other teams are active, and its + // child threads are about to end. + return; + } + + KC_TRACE(10, ("__kmp_common_destroy_gtid: T#%d called\n", gtid)); + if ((__kmp_foreign_tp) ? 
(!KMP_INITIAL_GTID(gtid)) : (!KMP_UBER_GTID(gtid))) { + + if (TCR_4(__kmp_init_common)) { + + /* Cannot do this here since not all threads have destroyed their data */ + /* TCW_4(__kmp_init_common, FALSE); */ + + for (tn = __kmp_threads[gtid]->th.th_pri_head; tn; tn = tn->link) { + + d_tn = __kmp_find_shared_task_common(&__kmp_threadprivate_d_table, gtid, + tn->gbl_addr); + if (d_tn == NULL) + continue; + if (d_tn->is_vec) { + if (d_tn->dt.dtorv != 0) { + (void)(*d_tn->dt.dtorv)(tn->par_addr, d_tn->vec_len); + } + if (d_tn->obj_init != 0) { + (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len); + } + } else { + if (d_tn->dt.dtor != 0) { + (void)(*d_tn->dt.dtor)(tn->par_addr); + } + if (d_tn->obj_init != 0) { + (void)(*d_tn->dt.dtor)(d_tn->obj_init); + } + } + } + KC_TRACE(30, ("__kmp_common_destroy_gtid: T#%d threadprivate destructors " + "complete\n", + gtid)); + } + } +} + +#ifdef KMP_TASK_COMMON_DEBUG +static void dump_list(void) { + int p, q; + + for (p = 0; p < __kmp_all_nth; ++p) { + if (!__kmp_threads[p]) + continue; + for (q = 0; q < KMP_HASH_TABLE_SIZE; ++q) { + if (__kmp_threads[p]->th.th_pri_common->data[q]) { + struct private_common *tn; + + KC_TRACE(10, ("\tdump_list: gtid:%d addresses\n", p)); + + for (tn = __kmp_threads[p]->th.th_pri_common->data[q]; tn; + tn = tn->next) { + KC_TRACE(10, + ("\tdump_list: THREADPRIVATE: Serial %p -> Parallel %p\n", + tn->gbl_addr, tn->par_addr)); + } + } + } + } +} +#endif /* KMP_TASK_COMMON_DEBUG */ + +// NOTE: this routine is to be called only from the serial part of the program. +void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr, + void *data_addr, size_t pc_size) { + struct shared_common **lnk_tn, *d_tn; + KMP_DEBUG_ASSERT(__kmp_threads[gtid] && + __kmp_threads[gtid]->th.th_root->r.r_active == 0); + + d_tn = __kmp_find_shared_task_common(&__kmp_threadprivate_d_table, gtid, + pc_addr); + + if (d_tn == 0) { + d_tn = (struct shared_common *)__kmp_allocate(sizeof(struct shared_common)); + + d_tn->gbl_addr = pc_addr; + d_tn->pod_init = __kmp_init_common_data(data_addr, pc_size); + /* + d_tn->obj_init = 0; // AC: commented out because __kmp_allocate + zeroes the memory + d_tn->ct.ctor = 0; + d_tn->cct.cctor = 0;; + d_tn->dt.dtor = 0; + d_tn->is_vec = FALSE; + d_tn->vec_len = 0L; + */ + d_tn->cmn_size = pc_size; + + __kmp_acquire_lock(&__kmp_global_lock, gtid); + + lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(pc_addr)]); + + d_tn->next = *lnk_tn; + *lnk_tn = d_tn; + + __kmp_release_lock(&__kmp_global_lock, gtid); + } +} + +struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr, + void *data_addr, + size_t pc_size) { + struct private_common *tn, **tt; + struct shared_common *d_tn; + + /* +++++++++ START OF CRITICAL SECTION +++++++++ */ + __kmp_acquire_lock(&__kmp_global_lock, gtid); + + tn = (struct private_common *)__kmp_allocate(sizeof(struct private_common)); + + tn->gbl_addr = pc_addr; + + d_tn = __kmp_find_shared_task_common( + &__kmp_threadprivate_d_table, gtid, + pc_addr); /* Only the MASTER data table exists. */ + + if (d_tn != 0) { + /* This threadprivate variable has already been seen. 
*/ + + if (d_tn->pod_init == 0 && d_tn->obj_init == 0) { + d_tn->cmn_size = pc_size; + + if (d_tn->is_vec) { + if (d_tn->ct.ctorv != 0) { + /* Construct from scratch so no prototype exists */ + d_tn->obj_init = 0; + } else if (d_tn->cct.cctorv != 0) { + /* Now data initialize the prototype since it was previously + * registered */ + d_tn->obj_init = (void *)__kmp_allocate(d_tn->cmn_size); + (void)(*d_tn->cct.cctorv)(d_tn->obj_init, pc_addr, d_tn->vec_len); + } else { + d_tn->pod_init = __kmp_init_common_data(data_addr, d_tn->cmn_size); + } + } else { + if (d_tn->ct.ctor != 0) { + /* Construct from scratch so no prototype exists */ + d_tn->obj_init = 0; + } else if (d_tn->cct.cctor != 0) { + /* Now data initialize the prototype since it was previously + registered */ + d_tn->obj_init = (void *)__kmp_allocate(d_tn->cmn_size); + (void)(*d_tn->cct.cctor)(d_tn->obj_init, pc_addr); + } else { + d_tn->pod_init = __kmp_init_common_data(data_addr, d_tn->cmn_size); + } + } + } + } else { + struct shared_common **lnk_tn; + + d_tn = (struct shared_common *)__kmp_allocate(sizeof(struct shared_common)); + d_tn->gbl_addr = pc_addr; + d_tn->cmn_size = pc_size; + d_tn->pod_init = __kmp_init_common_data(data_addr, pc_size); + /* + d_tn->obj_init = 0; // AC: commented out because __kmp_allocate + zeroes the memory + d_tn->ct.ctor = 0; + d_tn->cct.cctor = 0; + d_tn->dt.dtor = 0; + d_tn->is_vec = FALSE; + d_tn->vec_len = 0L; + */ + lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(pc_addr)]); + + d_tn->next = *lnk_tn; + *lnk_tn = d_tn; + } + + tn->cmn_size = d_tn->cmn_size; + + if ((__kmp_foreign_tp) ? (KMP_INITIAL_GTID(gtid)) : (KMP_UBER_GTID(gtid))) { + tn->par_addr = (void *)pc_addr; + } else { + tn->par_addr = (void *)__kmp_allocate(tn->cmn_size); + } + + __kmp_release_lock(&__kmp_global_lock, gtid); + /* +++++++++ END OF CRITICAL SECTION +++++++++ */ + +#ifdef USE_CHECKS_COMMON + if (pc_size > d_tn->cmn_size) { + KC_TRACE( + 10, ("__kmp_threadprivate_insert: THREADPRIVATE: %p (%" KMP_UINTPTR_SPEC + " ,%" KMP_UINTPTR_SPEC ")\n", + pc_addr, pc_size, d_tn->cmn_size)); + KMP_FATAL(TPCommonBlocksInconsist); + } +#endif /* USE_CHECKS_COMMON */ + + tt = &(__kmp_threads[gtid]->th.th_pri_common->data[KMP_HASH(pc_addr)]); + +#ifdef KMP_TASK_COMMON_DEBUG + if (*tt != 0) { + KC_TRACE( + 10, + ("__kmp_threadprivate_insert: WARNING! thread#%d: collision on %p\n", + gtid, pc_addr)); + } +#endif + tn->next = *tt; + *tt = tn; + +#ifdef KMP_TASK_COMMON_DEBUG + KC_TRACE(10, + ("__kmp_threadprivate_insert: thread#%d, inserted node %p on list\n", + gtid, pc_addr)); + dump_list(); +#endif + + /* Link the node into a simple list */ + + tn->link = __kmp_threads[gtid]->th.th_pri_head; + __kmp_threads[gtid]->th.th_pri_head = tn; + + if ((__kmp_foreign_tp) ? 
(KMP_INITIAL_GTID(gtid)) : (KMP_UBER_GTID(gtid))) + return tn; + + /* if C++ object with copy constructor, use it; + * else if C++ object with constructor, use it for the non-primary thread + copies only; + * else use pod_init and memcpy + * + * C++ constructors need to be called once for each non-primary thread on + * allocate + * C++ copy constructors need to be called once for each thread on allocate */ + + /* C++ object with constructors/destructors; don't call constructors for + primary thread though */ + if (d_tn->is_vec) { + if (d_tn->ct.ctorv != 0) { + (void)(*d_tn->ct.ctorv)(tn->par_addr, d_tn->vec_len); + } else if (d_tn->cct.cctorv != 0) { + (void)(*d_tn->cct.cctorv)(tn->par_addr, d_tn->obj_init, d_tn->vec_len); + } else if (tn->par_addr != tn->gbl_addr) { + __kmp_copy_common_data(tn->par_addr, d_tn->pod_init); + } + } else { + if (d_tn->ct.ctor != 0) { + (void)(*d_tn->ct.ctor)(tn->par_addr); + } else if (d_tn->cct.cctor != 0) { + (void)(*d_tn->cct.cctor)(tn->par_addr, d_tn->obj_init); + } else if (tn->par_addr != tn->gbl_addr) { + __kmp_copy_common_data(tn->par_addr, d_tn->pod_init); + } + } + /* !BUILD_OPENMP_C + if (tn->par_addr != tn->gbl_addr) + __kmp_copy_common_data( tn->par_addr, d_tn->pod_init ); */ + + return tn; +} + +/* ------------------------------------------------------------------------ */ +/* We are currently parallel, and we know the thread id. */ +/* ------------------------------------------------------------------------ */ + +/*! + @ingroup THREADPRIVATE + + @param loc source location information + @param data pointer to data being privatized + @param ctor pointer to constructor function for data + @param cctor pointer to copy constructor function for data + @param dtor pointer to destructor function for data + + Register constructors and destructors for thread private data. + This function is called when executing in parallel, when we know the thread id. +*/ +void __kmpc_threadprivate_register(ident_t *loc, void *data, kmpc_ctor ctor, + kmpc_cctor cctor, kmpc_dtor dtor) { + struct shared_common *d_tn, **lnk_tn; + + KC_TRACE(10, ("__kmpc_threadprivate_register: called\n")); + +#ifdef USE_CHECKS_COMMON + /* copy constructor must be zero for current code gen (Nov 2002 - jph) */ + KMP_ASSERT(cctor == 0); +#endif /* USE_CHECKS_COMMON */ + + /* Only the global data table exists. 
*/ + d_tn = __kmp_find_shared_task_common(&__kmp_threadprivate_d_table, -1, data); + + if (d_tn == 0) { + d_tn = (struct shared_common *)__kmp_allocate(sizeof(struct shared_common)); + d_tn->gbl_addr = data; + + d_tn->ct.ctor = ctor; + d_tn->cct.cctor = cctor; + d_tn->dt.dtor = dtor; + /* + d_tn->is_vec = FALSE; // AC: commented out because __kmp_allocate + zeroes the memory + d_tn->vec_len = 0L; + d_tn->obj_init = 0; + d_tn->pod_init = 0; + */ + lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(data)]); + + d_tn->next = *lnk_tn; + *lnk_tn = d_tn; + } +} + +void *__kmpc_threadprivate(ident_t *loc, kmp_int32 global_tid, void *data, + size_t size) { + void *ret; + struct private_common *tn; + + KC_TRACE(10, ("__kmpc_threadprivate: T#%d called\n", global_tid)); + +#ifdef USE_CHECKS_COMMON + if (!__kmp_init_serial) + KMP_FATAL(RTLNotInitialized); +#endif /* USE_CHECKS_COMMON */ + + if (!__kmp_threads[global_tid]->th.th_root->r.r_active && !__kmp_foreign_tp) { + /* The parallel address will NEVER overlap with the data_address */ + /* dkp: 3rd arg to kmp_threadprivate_insert_private_data() is the + * data_address; use data_address = data */ + + KC_TRACE(20, ("__kmpc_threadprivate: T#%d inserting private data\n", + global_tid)); + kmp_threadprivate_insert_private_data(global_tid, data, data, size); + + ret = data; + } else { + KC_TRACE( + 50, + ("__kmpc_threadprivate: T#%d try to find private data at address %p\n", + global_tid, data)); + tn = __kmp_threadprivate_find_task_common( + __kmp_threads[global_tid]->th.th_pri_common, global_tid, data); + + if (tn) { + KC_TRACE(20, ("__kmpc_threadprivate: T#%d found data\n", global_tid)); +#ifdef USE_CHECKS_COMMON + if ((size_t)size > tn->cmn_size) { + KC_TRACE(10, ("THREADPRIVATE: %p (%" KMP_UINTPTR_SPEC + " ,%" KMP_UINTPTR_SPEC ")\n", + data, size, tn->cmn_size)); + KMP_FATAL(TPCommonBlocksInconsist); + } +#endif /* USE_CHECKS_COMMON */ + } else { + /* The parallel address will NEVER overlap with the data_address */ + /* dkp: 3rd arg to kmp_threadprivate_insert() is the data_address; use + * data_address = data */ + KC_TRACE(20, ("__kmpc_threadprivate: T#%d inserting data\n", global_tid)); + tn = kmp_threadprivate_insert(global_tid, data, data, size); + } + + ret = tn->par_addr; + } + KC_TRACE(10, ("__kmpc_threadprivate: T#%d exiting; return value = %p\n", + global_tid, ret)); + + return ret; +} + +static kmp_cached_addr_t *__kmp_find_cache(void *data) { + kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list; + while (ptr && ptr->data != data) + ptr = ptr->next; + return ptr; +} + +/*! + @ingroup THREADPRIVATE + @param loc source location information + @param global_tid global thread number + @param data pointer to data to privatize + @param size size of data to privatize + @param cache pointer to cache + @return pointer to private storage + + Allocate private storage for threadprivate data. +*/ +void * +__kmpc_threadprivate_cached(ident_t *loc, + kmp_int32 global_tid, // gtid. + void *data, // Pointer to original global variable. + size_t size, // Size of original global variable. 
+ void ***cache) { + KC_TRACE(10, ("__kmpc_threadprivate_cached: T#%d called with cache: %p, " + "address: %p, size: %" KMP_SIZE_T_SPEC "\n", + global_tid, *cache, data, size)); + + if (TCR_PTR(*cache) == 0) { + __kmp_acquire_lock(&__kmp_global_lock, global_tid); + + if (TCR_PTR(*cache) == 0) { + __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); + // Compiler often passes in NULL cache, even if it's already been created + void **my_cache; + kmp_cached_addr_t *tp_cache_addr; + // Look for an existing cache + tp_cache_addr = __kmp_find_cache(data); + if (!tp_cache_addr) { // Cache was never created; do it now + __kmp_tp_cached = 1; + KMP_ITT_IGNORE(my_cache = (void **)__kmp_allocate( + sizeof(void *) * __kmp_tp_capacity + + sizeof(kmp_cached_addr_t));); + // No need to zero the allocated memory; __kmp_allocate does that. + KC_TRACE(50, ("__kmpc_threadprivate_cached: T#%d allocated cache at " + "address %p\n", + global_tid, my_cache)); + /* TODO: free all this memory in __kmp_common_destroy using + * __kmp_threadpriv_cache_list */ + /* Add address of mycache to linked list for cleanup later */ + tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity]; + tp_cache_addr->addr = my_cache; + tp_cache_addr->data = data; + tp_cache_addr->compiler_cache = cache; + tp_cache_addr->next = __kmp_threadpriv_cache_list; + __kmp_threadpriv_cache_list = tp_cache_addr; + } else { // A cache was already created; use it + my_cache = tp_cache_addr->addr; + tp_cache_addr->compiler_cache = cache; + } + KMP_MB(); + + TCW_PTR(*cache, my_cache); + __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); + + KMP_MB(); + } + __kmp_release_lock(&__kmp_global_lock, global_tid); + } + + void *ret; + if ((ret = TCR_PTR((*cache)[global_tid])) == 0) { + ret = __kmpc_threadprivate(loc, global_tid, data, (size_t)size); + + TCW_PTR((*cache)[global_tid], ret); + } + KC_TRACE(10, + ("__kmpc_threadprivate_cached: T#%d exiting; return value = %p\n", + global_tid, ret)); + return ret; +} + +// This function should only be called when both __kmp_tp_cached_lock and +// kmp_forkjoin_lock are held. +void __kmp_threadprivate_resize_cache(int newCapacity) { + KC_TRACE(10, ("__kmp_threadprivate_resize_cache: called with size: %d\n", + newCapacity)); + + kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list; + + while (ptr) { + if (ptr->data) { // this location has an active cache; resize it + void **my_cache; + KMP_ITT_IGNORE(my_cache = + (void **)__kmp_allocate(sizeof(void *) * newCapacity + + sizeof(kmp_cached_addr_t));); + // No need to zero the allocated memory; __kmp_allocate does that. + KC_TRACE(50, ("__kmp_threadprivate_resize_cache: allocated cache at %p\n", + my_cache)); + // Now copy old cache into new cache + void **old_cache = ptr->addr; + for (int i = 0; i < __kmp_tp_capacity; ++i) { + my_cache[i] = old_cache[i]; + } + + // Add address of new my_cache to linked list for cleanup later + kmp_cached_addr_t *tp_cache_addr; + tp_cache_addr = (kmp_cached_addr_t *)&my_cache[newCapacity]; + tp_cache_addr->addr = my_cache; + tp_cache_addr->data = ptr->data; + tp_cache_addr->compiler_cache = ptr->compiler_cache; + tp_cache_addr->next = __kmp_threadpriv_cache_list; + __kmp_threadpriv_cache_list = tp_cache_addr; + + // Copy new cache to compiler's location: We can copy directly + // to (*compiler_cache) if compiler guarantees it will keep + // using the same location for the cache. 
This is not yet true + // for some compilers, in which case we have to check if + // compiler_cache is still pointing at old cache, and if so, we + // can point it at the new cache with an atomic compare&swap + // operation. (Old method will always work, but we should shift + // to new method (commented line below) when Intel and Clang + // compilers use new method.) + (void)KMP_COMPARE_AND_STORE_PTR(tp_cache_addr->compiler_cache, old_cache, + my_cache); + // TCW_PTR(*(tp_cache_addr->compiler_cache), my_cache); + + // If the store doesn't happen here, the compiler's old behavior will + // inevitably call __kmpc_threadprivate_cache with a new location for the + // cache, and that function will store the resized cache there at that + // point. + + // Nullify old cache's data pointer so we skip it next time + ptr->data = NULL; + } + ptr = ptr->next; + } + // After all caches are resized, update __kmp_tp_capacity to the new size + *(volatile int *)&__kmp_tp_capacity = newCapacity; +} + +/*! + @ingroup THREADPRIVATE + @param loc source location information + @param data pointer to data being privatized + @param ctor pointer to constructor function for data + @param cctor pointer to copy constructor function for data + @param dtor pointer to destructor function for data + @param vector_length length of the vector (bytes or elements?) + Register vector constructors and destructors for thread private data. +*/ +void __kmpc_threadprivate_register_vec(ident_t *loc, void *data, + kmpc_ctor_vec ctor, kmpc_cctor_vec cctor, + kmpc_dtor_vec dtor, + size_t vector_length) { + struct shared_common *d_tn, **lnk_tn; + + KC_TRACE(10, ("__kmpc_threadprivate_register_vec: called\n")); + +#ifdef USE_CHECKS_COMMON + /* copy constructor must be zero for current code gen (Nov 2002 - jph) */ + KMP_ASSERT(cctor == 0); +#endif /* USE_CHECKS_COMMON */ + + d_tn = __kmp_find_shared_task_common( + &__kmp_threadprivate_d_table, -1, + data); /* Only the global data table exists. */ + + if (d_tn == 0) { + d_tn = (struct shared_common *)__kmp_allocate(sizeof(struct shared_common)); + d_tn->gbl_addr = data; + + d_tn->ct.ctorv = ctor; + d_tn->cct.cctorv = cctor; + d_tn->dt.dtorv = dtor; + d_tn->is_vec = TRUE; + d_tn->vec_len = (size_t)vector_length; + // d_tn->obj_init = 0; // AC: __kmp_allocate zeroes the memory + // d_tn->pod_init = 0; + lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(data)]); + + d_tn->next = *lnk_tn; + *lnk_tn = d_tn; + } +} + +void __kmp_cleanup_threadprivate_caches() { + kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list; + + while (ptr) { + void **cache = ptr->addr; + __kmp_threadpriv_cache_list = ptr->next; + if (*ptr->compiler_cache) + *ptr->compiler_cache = NULL; + ptr->compiler_cache = NULL; + ptr->data = NULL; + ptr->addr = NULL; + ptr->next = NULL; + // Threadprivate data pointed at by cache entries are destroyed at end of + // __kmp_launch_thread with __kmp_common_destroy_gtid. + __kmp_free(cache); // implicitly frees ptr too + ptr = __kmp_threadpriv_cache_list; + } +} diff --git a/third_party/openmp/kmp_utility.cpp b/third_party/openmp/kmp_utility.cpp new file mode 100644 index 000000000..f901eaca9 --- /dev/null +++ b/third_party/openmp/kmp_utility.cpp @@ -0,0 +1,420 @@ +/* + * kmp_utility.cpp -- Utility routines for the OpenMP support library. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_i18n.h" +#include "kmp_str.h" +#include "kmp_wrapper_getpid.h" +#include + +static const char *unknown = "unknown"; + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +/* NOTE: If called before serial_initialize (i.e. from runtime_initialize), then + the debugging package has not been initialized yet, and only "0" will print + debugging output since the environment variables have not been read. */ + +#ifdef KMP_DEBUG +static int trace_level = 5; +#endif + +/* LOG_ID_BITS = ( 1 + floor( log_2( max( log_per_phy - 1, 1 )))) + * APIC_ID = (PHY_ID << LOG_ID_BITS) | LOG_ID + * PHY_ID = APIC_ID >> LOG_ID_BITS + */ +int __kmp_get_physical_id(int log_per_phy, int apic_id) { + int index_lsb, index_msb, temp; + + if (log_per_phy > 1) { + index_lsb = 0; + index_msb = 31; + + temp = log_per_phy; + while ((temp & 1) == 0) { + temp >>= 1; + index_lsb++; + } + + temp = log_per_phy; + while ((temp & 0x80000000) == 0) { + temp <<= 1; + index_msb--; + } + + /* If >1 bits were set in log_per_phy, choose next higher power of 2 */ + if (index_lsb != index_msb) + index_msb++; + + return ((int)(apic_id >> index_msb)); + } + + return apic_id; +} + +/* + * LOG_ID_BITS = ( 1 + floor( log_2( max( log_per_phy - 1, 1 )))) + * APIC_ID = (PHY_ID << LOG_ID_BITS) | LOG_ID + * LOG_ID = APIC_ID & (( 1 << LOG_ID_BITS ) - 1 ) + */ +int __kmp_get_logical_id(int log_per_phy, int apic_id) { + unsigned current_bit; + int bits_seen; + + if (log_per_phy <= 1) + return (0); + + bits_seen = 0; + + for (current_bit = 1; log_per_phy != 0; current_bit <<= 1) { + if (log_per_phy & current_bit) { + log_per_phy &= ~current_bit; + bits_seen++; + } + } + + /* If exactly 1 bit was set in log_per_phy, choose next lower power of 2 */ + if (bits_seen == 1) { + current_bit >>= 1; + } + + return ((int)((current_bit - 1) & apic_id)); +} + +static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz. + char const *frequency // I: Float number and unit: MHz, GHz, or TGz. +) { + + double value = 0.0; + char *unit = NULL; + kmp_uint64 result = 0; /* Zero is a better unknown value than all ones. */ + + if (frequency == NULL) { + return result; + } + value = strtod(frequency, &unit); + if (0 < value && + value <= DBL_MAX) { // Good value (not overflow, underflow, etc). + if (strcmp(unit, "MHz") == 0) { + value = value * 1.0E+6; + } else if (strcmp(unit, "GHz") == 0) { + value = value * 1.0E+9; + } else if (strcmp(unit, "THz") == 0) { + value = value * 1.0E+12; + } else { // Wrong unit. + return result; + } + result = (kmp_uint64)value; // rounds down + } + return result; + +} // func __kmp_parse_cpu_frequency + +void __kmp_query_cpuid(kmp_cpuinfo_t *p) { + struct kmp_cpuid buf; + int max_arg; + int log_per_phy; +#ifdef KMP_DEBUG + int cflush_size; +#endif + + p->initialized = 1; + + p->flags.sse2 = 1; // Assume SSE2 by default. 
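The LOG_ID_BITS arithmetic above is easiest to see with concrete numbers. The snippet below is an editorial illustration only (not part of the patch); it forward-declares the two helpers defined above and walks one example: with four logical processors per package, LOG_ID_BITS is 2, so APIC ID 0x1B (binary 11011) splits into physical ID 6 and logical ID 3.

#include <assert.h>

int __kmp_get_physical_id(int log_per_phy, int apic_id);
int __kmp_get_logical_id(int log_per_phy, int apic_id);

int main(void) {
  // log_per_phy = 4  =>  LOG_ID_BITS = 2
  // APIC_ID 0x1B = 0b110_11  =>  PHY_ID = 0b110 = 6, LOG_ID = 0b11 = 3
  assert(__kmp_get_physical_id(4, 0x1B) == 6);
  assert(__kmp_get_logical_id(4, 0x1B) == 3);
  // A non-power-of-two count such as 3 is rounded up to the next power of
  // two, so the physical ID is still APIC_ID >> 2.
  assert(__kmp_get_physical_id(3, 0x1B) == 6);
  return 0;
}

In __kmp_query_cpuid below, the same helpers are fed log_per_phy from CPUID leaf 1 (EBX bits 23:16) and the initial APIC ID (EBX bits 31:24).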
+ + __kmp_x86_cpuid(0, 0, &buf); + + KA_TRACE(trace_level, + ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n", 0, + buf.eax, buf.ebx, buf.ecx, buf.edx)); + + max_arg = buf.eax; + + p->apic_id = -1; + + if (max_arg >= 1) { + int i; + kmp_uint32 t, data[4]; + + __kmp_x86_cpuid(1, 0, &buf); + KA_TRACE(trace_level, + ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n", + 1, buf.eax, buf.ebx, buf.ecx, buf.edx)); + + { +#define get_value(reg, lo, mask) (((reg) >> (lo)) & (mask)) + + p->signature = buf.eax; + p->family = get_value(buf.eax, 20, 0xff) + get_value(buf.eax, 8, 0x0f); + p->model = + (get_value(buf.eax, 16, 0x0f) << 4) + get_value(buf.eax, 4, 0x0f); + p->stepping = get_value(buf.eax, 0, 0x0f); + +#undef get_value + + KA_TRACE(trace_level, (" family = %d, model = %d, stepping = %d\n", + p->family, p->model, p->stepping)); + } + + for (t = buf.ebx, i = 0; i < 4; t >>= 8, ++i) { + data[i] = (t & 0xff); + } + + p->flags.sse2 = (buf.edx >> 26) & 1; + +#ifdef KMP_DEBUG + + if ((buf.edx >> 4) & 1) { + /* TSC - Timestamp Counter Available */ + KA_TRACE(trace_level, (" TSC")); + } + if ((buf.edx >> 8) & 1) { + /* CX8 - CMPXCHG8B Instruction Available */ + KA_TRACE(trace_level, (" CX8")); + } + if ((buf.edx >> 9) & 1) { + /* APIC - Local APIC Present (multi-processor operation support */ + KA_TRACE(trace_level, (" APIC")); + } + if ((buf.edx >> 15) & 1) { + /* CMOV - Conditional MOVe Instruction Available */ + KA_TRACE(trace_level, (" CMOV")); + } + if ((buf.edx >> 18) & 1) { + /* PSN - Processor Serial Number Available */ + KA_TRACE(trace_level, (" PSN")); + } + if ((buf.edx >> 19) & 1) { + /* CLFLUSH - Cache Flush Instruction Available */ + cflush_size = + data[1] * 8; /* Bits 15-08: CLFLUSH line size = 8 (64 bytes) */ + KA_TRACE(trace_level, (" CLFLUSH(%db)", cflush_size)); + } + if ((buf.edx >> 21) & 1) { + /* DTES - Debug Trace & EMON Store */ + KA_TRACE(trace_level, (" DTES")); + } + if ((buf.edx >> 22) & 1) { + /* ACPI - ACPI Support Available */ + KA_TRACE(trace_level, (" ACPI")); + } + if ((buf.edx >> 23) & 1) { + /* MMX - Multimedia Extensions */ + KA_TRACE(trace_level, (" MMX")); + } + if ((buf.edx >> 25) & 1) { + /* SSE - SSE Instructions */ + KA_TRACE(trace_level, (" SSE")); + } + if ((buf.edx >> 26) & 1) { + /* SSE2 - SSE2 Instructions */ + KA_TRACE(trace_level, (" SSE2")); + } + if ((buf.edx >> 27) & 1) { + /* SLFSNP - Self-Snooping Cache */ + KA_TRACE(trace_level, (" SLFSNP")); + } +#endif /* KMP_DEBUG */ + + if ((buf.edx >> 28) & 1) { + /* Bits 23-16: Logical Processors per Physical Processor (1 for P4) */ + log_per_phy = data[2]; + p->apic_id = data[3]; /* Bits 31-24: Processor Initial APIC ID (X) */ + KA_TRACE(trace_level, (" HT(%d TPUs)", log_per_phy)); + p->physical_id = __kmp_get_physical_id(log_per_phy, p->apic_id); + p->logical_id = __kmp_get_logical_id(log_per_phy, p->apic_id); + } +#ifdef KMP_DEBUG + if ((buf.edx >> 29) & 1) { + /* ATHROTL - Automatic Throttle Control */ + KA_TRACE(trace_level, (" ATHROTL")); + } + KA_TRACE(trace_level, (" ]\n")); + + for (i = 2; i <= max_arg; ++i) { + __kmp_x86_cpuid(i, 0, &buf); + KA_TRACE(trace_level, + ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n", + i, buf.eax, buf.ebx, buf.ecx, buf.edx)); + } +#endif + p->flags.rtm = 0; + p->flags.hybrid = 0; + if (max_arg > 7) { + /* RTM bit CPUID.07:EBX, bit 11 */ + /* HYRBID bit CPUID.07:EDX, bit 15 */ + __kmp_x86_cpuid(7, 0, &buf); + p->flags.rtm = (buf.ebx >> 11) & 1; + p->flags.hybrid = (buf.edx >> 15) & 1; + if (p->flags.rtm) { + 
KA_TRACE(trace_level, (" RTM")); + } + if (p->flags.hybrid) { + KA_TRACE(trace_level, (" HYBRID")); + } + } + } + + { // Parse CPU brand string for frequency, saving the string for later. + int i; + kmp_cpuid_t *base = (kmp_cpuid_t *)&p->name[0]; + + // Get CPU brand string. + for (i = 0; i < 3; ++i) { + __kmp_x86_cpuid(0x80000002 + i, 0, base + i); + } + p->name[sizeof(p->name) - 1] = 0; // Just in case. ;-) + KA_TRACE(trace_level, ("cpu brand string: \"%s\"\n", &p->name[0])); + + // Parse frequency. + p->frequency = __kmp_parse_frequency(strrchr(&p->name[0], ' ')); + KA_TRACE(trace_level, + ("cpu frequency from brand string: %" KMP_UINT64_SPEC "\n", + p->frequency)); + } +} + +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +void __kmp_expand_host_name(char *buffer, size_t size) { + KMP_DEBUG_ASSERT(size >= sizeof(unknown)); +#if KMP_OS_WINDOWS + { + DWORD s = size; + + if (!GetComputerNameA(buffer, &s)) + KMP_STRCPY_S(buffer, size, unknown); + } +#elif KMP_OS_WASI + KMP_STRCPY_S(buffer, size, unknown); +#else + buffer[size - 2] = 0; + if (gethostname(buffer, size) || buffer[size - 2] != 0) + KMP_STRCPY_S(buffer, size, unknown); +#endif +} + +/* Expand the meta characters in the filename: + * Currently defined characters are: + * %H the hostname + * %P the number of threads used. + * %I the unique identifier for this run. + */ + +void __kmp_expand_file_name(char *result, size_t rlen, char *pattern) { + char *pos = result, *end = result + rlen - 1; + char buffer[256]; + int default_cpu_width = 1; + int snp_result; + + KMP_DEBUG_ASSERT(rlen > 0); + *end = 0; + { + int i; + for (i = __kmp_xproc; i >= 10; i /= 10, ++default_cpu_width) + ; + } + + if (pattern != NULL) { + while (*pattern != '\0' && pos < end) { + if (*pattern != '%') { + *pos++ = *pattern++; + } else { + char *old_pattern = pattern; + int width = 1; + int cpu_width = default_cpu_width; + + ++pattern; + + if (*pattern >= '0' && *pattern <= '9') { + width = 0; + do { + width = (width * 10) + *pattern++ - '0'; + } while (*pattern >= '0' && *pattern <= '9'); + if (width < 0 || width > 1024) + width = 1; + + cpu_width = width; + } + + switch (*pattern) { + case 'H': + case 'h': { + __kmp_expand_host_name(buffer, sizeof(buffer)); + KMP_STRNCPY(pos, buffer, end - pos + 1); + if (*end == 0) { + while (*pos) + ++pos; + ++pattern; + } else + pos = end; + } break; + case 'P': + case 'p': { + snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*d", cpu_width, + __kmp_dflt_team_nth); + if (snp_result >= 0 && snp_result <= end - pos) { + while (*pos) + ++pos; + ++pattern; + } else + pos = end; + } break; + case 'I': + case 'i': { + pid_t id = getpid(); +#if (KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && defined(__MINGW32__) + snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*lld", width, id); +#else + snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*d", width, id); +#endif + if (snp_result >= 0 && snp_result <= end - pos) { + while (*pos) + ++pos; + ++pattern; + } else + pos = end; + break; + } + case '%': { + *pos++ = '%'; + ++pattern; + break; + } + default: { + *pos++ = '%'; + pattern = old_pattern + 1; + break; + } + } + } + } + /* TODO: How do we get rid of this? 
*/ + if (*pattern != '\0') + KMP_FATAL(FileNameTooLong); + } + + *pos = '\0'; +} + +#if !OMPT_SUPPORT +extern "C" { +typedef struct ompt_start_tool_result_t ompt_start_tool_result_t; +// Define symbols expected by VERSION script +ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, + const char *runtime_version) { + return nullptr; +} + +void ompt_libomp_connect(ompt_start_tool_result_t *result) { result = nullptr; } +} +#endif diff --git a/third_party/openmp/kmp_utils.h b/third_party/openmp/kmp_utils.h new file mode 100644 index 000000000..a557f929e --- /dev/null +++ b/third_party/openmp/kmp_utils.h @@ -0,0 +1,55 @@ +/* + * kmp_utils.h -- Utilities that used internally + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef __KMP_UTILS_H__ +#define __KMP_UTILS_H__ + +#include + +#include "kmp.h" + +/// A simple pure header implementation of VLA that aims to replace uses of +/// actual VLA, which can cause compile warning. This class by default creates a +/// stack buffer that can accomodate \p N elements. If the number of elements is +/// greater than \p N, then a heap buffer will be allocated and used to +/// accomodate the elements. Similar to the actual VLA, we don't check boundary +/// (for now), so we will not store the number of elements. We can always revise +/// it later. +template class SimpleVLA final { + T StackBuffer[N]; + T *HeapBuffer = nullptr; + T *Ptr = StackBuffer; + +public: + SimpleVLA() = delete; + SimpleVLA(const SimpleVLA &) = delete; + SimpleVLA(SimpleVLA &&) = delete; + SimpleVLA &operator=(const SimpleVLA &) = delete; + SimpleVLA &operator=(SimpleVLA &&) = delete; + + explicit SimpleVLA(unsigned NumOfElements) noexcept { + if (NumOfElements > N) { + HeapBuffer = + reinterpret_cast(__kmp_allocate(NumOfElements * sizeof(T))); + Ptr = HeapBuffer; + } + } + + ~SimpleVLA() { + if (HeapBuffer) + __kmp_free(HeapBuffer); + } + + operator T *() noexcept { return Ptr; } + operator const T *() const noexcept { return Ptr; } +}; + +#endif diff --git a/third_party/openmp/kmp_version.cpp b/third_party/openmp/kmp_version.cpp new file mode 100644 index 000000000..39d0f6084 --- /dev/null +++ b/third_party/openmp/kmp_version.cpp @@ -0,0 +1,204 @@ +/* + * kmp_version.cpp + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_io.h" +#include "kmp_version.h" + +// Replace with snapshot date YYYYMMDD for promotion build. +#define KMP_VERSION_BUILD 20140926 + +// Helper macros to convert value of macro to string literal. +#define _stringer(x) #x +#define stringer(x) _stringer(x) + +// Detect compiler. 
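The stringer/_stringer pair above is the usual two-level stringizing idiom: the extra indirection lets a macro argument expand before '#' turns it into a string literal, whereas stringizing directly captures the macro's name. A minimal standalone illustration (editorial, not part of the patch), reusing the definitions just above:

#include <assert.h>
#include <string.h>

#define KMP_VERSION_BUILD 20140926
#define _stringer(x) #x
#define stringer(x) _stringer(x)

int main(void) {
  // Two-level expansion: KMP_VERSION_BUILD -> 20140926 -> "20140926".
  assert(strcmp(stringer(KMP_VERSION_BUILD), "20140926") == 0);
  // A single level stringizes the unexpanded identifier instead.
  assert(strcmp(_stringer(KMP_VERSION_BUILD), "KMP_VERSION_BUILD") == 0);
  return 0;
}

The compiler-detection chain that follows, and the __kmp_version_* strings further down, rely on this to splice numeric macros such as __INTEL_COMPILER, __GNUC__, and KMP_VERSION_MAJOR into readable version strings.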
+#if KMP_COMPILER_ICX +#define KMP_COMPILER __VERSION__ +#elif KMP_COMPILER_ICC +#if __INTEL_COMPILER == 1010 +#define KMP_COMPILER "Intel(R) C++ Compiler 10.1" +#elif __INTEL_COMPILER == 1100 +#define KMP_COMPILER "Intel(R) C++ Compiler 11.0" +#elif __INTEL_COMPILER == 1110 +#define KMP_COMPILER "Intel(R) C++ Compiler 11.1" +#elif __INTEL_COMPILER == 1200 +#define KMP_COMPILER "Intel(R) C++ Compiler 12.0" +#elif __INTEL_COMPILER == 1210 +#define KMP_COMPILER "Intel(R) C++ Compiler 12.1" +#elif __INTEL_COMPILER == 1300 +#define KMP_COMPILER "Intel(R) C++ Compiler 13.0" +#elif __INTEL_COMPILER == 1310 +#define KMP_COMPILER "Intel(R) C++ Compiler 13.1" +#elif __INTEL_COMPILER == 1400 +#define KMP_COMPILER "Intel(R) C++ Compiler 14.0" +#elif __INTEL_COMPILER == 1410 +#define KMP_COMPILER "Intel(R) C++ Compiler 14.1" +#elif __INTEL_COMPILER == 1500 +#define KMP_COMPILER "Intel(R) C++ Compiler 15.0" +#elif __INTEL_COMPILER == 1600 +#define KMP_COMPILER "Intel(R) C++ Compiler 16.0" +#elif __INTEL_COMPILER == 1700 +#define KMP_COMPILER "Intel(R) C++ Compiler 17.0" +#elif __INTEL_COMPILER == 1800 +#define KMP_COMPILER "Intel(R) C++ Compiler 18.0" +#elif __INTEL_COMPILER == 1900 +#define KMP_COMPILER "Intel(R) C++ Compiler 19.0" +#elif __INTEL_COMPILER == 1910 +#define KMP_COMPILER "Intel(R) C++ Compiler 19.1" +#elif __INTEL_COMPILER > 1910 +#define KMP_COMPILER \ + "Intel(R) C++ Compiler Classic " stringer(__INTEL_COMPILER) "." stringer( \ + __INTEL_COMPILER_UPDATE) +#endif +#elif KMP_COMPILER_CLANG +#define KMP_COMPILER \ + "Clang " stringer(__clang_major__) "." stringer(__clang_minor__) +#elif KMP_COMPILER_GCC +#define KMP_COMPILER "GCC " stringer(__GNUC__) "." stringer(__GNUC_MINOR__) +#elif KMP_COMPILER_MSVC +#define KMP_COMPILER "MSVC " stringer(_MSC_FULL_VER) +#endif +#ifndef KMP_COMPILER +#warning "Unknown compiler" +#define KMP_COMPILER "unknown compiler" +#endif + +// Detect librray type (perf, stub). +#ifdef KMP_STUB +#define KMP_LIB_TYPE "stub" +#else +#define KMP_LIB_TYPE "performance" +#endif // KMP_LIB_TYPE + +// Detect link type (static, dynamic). +#if KMP_DYNAMIC_LIB +#define KMP_LINK_TYPE "dynamic" +#else +#define KMP_LINK_TYPE "static" +#endif // KMP_LINK_TYPE + +// Finally, define strings. +#define KMP_LIBRARY KMP_LIB_TYPE " library (" KMP_LINK_TYPE ")" +#define KMP_COPYRIGHT "" + +int const __kmp_version_major = KMP_VERSION_MAJOR; +int const __kmp_version_minor = KMP_VERSION_MINOR; +int const __kmp_version_build = KMP_VERSION_BUILD; +int const __kmp_openmp_version = 201611; + +/* Do NOT change the format of this string! Intel(R) Thread Profiler checks for + a specific format some changes in the recognition routine there need to be + made before this is changed. */ +char const __kmp_copyright[] = KMP_VERSION_PREFIX KMP_LIBRARY + " ver. " stringer(KMP_VERSION_MAJOR) "." stringer( + KMP_VERSION_MINOR) "." stringer(KMP_VERSION_BUILD) " " KMP_COPYRIGHT; + +char const __kmp_version_copyright[] = KMP_VERSION_PREFIX KMP_COPYRIGHT; +char const __kmp_version_lib_ver[] = + KMP_VERSION_PREFIX "version: " stringer(KMP_VERSION_MAJOR) "." stringer( + KMP_VERSION_MINOR) "." 
stringer(KMP_VERSION_BUILD); +char const __kmp_version_lib_type[] = + KMP_VERSION_PREFIX "library type: " KMP_LIB_TYPE; +char const __kmp_version_link_type[] = + KMP_VERSION_PREFIX "link type: " KMP_LINK_TYPE; +char const __kmp_version_build_time[] = KMP_VERSION_PREFIX "build time: " + "no_timestamp"; +#if KMP_MIC2 +char const __kmp_version_target_env[] = + KMP_VERSION_PREFIX "target environment: MIC2"; +#endif +char const __kmp_version_build_compiler[] = + KMP_VERSION_PREFIX "build compiler: " KMP_COMPILER; + +// Called at serial initialization time. +static int __kmp_version_1_printed = FALSE; + +void __kmp_print_version_1(void) { + if (__kmp_version_1_printed) { + return; + } + __kmp_version_1_printed = TRUE; + +#ifndef KMP_STUB + kmp_str_buf_t buffer; + __kmp_str_buf_init(&buffer); + // Print version strings skipping initial magic. + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]); + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_lib_type[KMP_VERSION_MAGIC_LEN]); + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_link_type[KMP_VERSION_MAGIC_LEN]); + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_build_time[KMP_VERSION_MAGIC_LEN]); +#if KMP_MIC + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_target_env[KMP_VERSION_MAGIC_LEN]); +#endif + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_build_compiler[KMP_VERSION_MAGIC_LEN]); +#if defined(KMP_GOMP_COMPAT) + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_alt_comp[KMP_VERSION_MAGIC_LEN]); +#endif /* defined(KMP_GOMP_COMPAT) */ + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_omp_api[KMP_VERSION_MAGIC_LEN]); + __kmp_str_buf_print(&buffer, "%sdynamic error checking: %s\n", + KMP_VERSION_PREF_STR, + (__kmp_env_consistency_check ? "yes" : "no")); +#ifdef KMP_DEBUG + for (int i = bs_plain_barrier; i < bs_last_barrier; ++i) { + __kmp_str_buf_print( + &buffer, "%s%s barrier branch bits: gather=%u, release=%u\n", + KMP_VERSION_PREF_STR, __kmp_barrier_type_name[i], + __kmp_barrier_gather_branch_bits[i], + __kmp_barrier_release_branch_bits[i]); // __kmp_str_buf_print + } + for (int i = bs_plain_barrier; i < bs_last_barrier; ++i) { + __kmp_str_buf_print( + &buffer, "%s%s barrier pattern: gather=%s, release=%s\n", + KMP_VERSION_PREF_STR, __kmp_barrier_type_name[i], + __kmp_barrier_pattern_name[__kmp_barrier_gather_pattern[i]], + __kmp_barrier_pattern_name + [__kmp_barrier_release_pattern[i]]); // __kmp_str_buf_print + } + __kmp_str_buf_print(&buffer, "%s\n", + &__kmp_version_lock[KMP_VERSION_MAGIC_LEN]); +#endif + __kmp_str_buf_print( + &buffer, "%sthread affinity support: %s\n", KMP_VERSION_PREF_STR, +#if KMP_AFFINITY_SUPPORTED + (KMP_AFFINITY_CAPABLE() + ? (__kmp_affinity.type == affinity_none ? "not used" : "yes") + : "no") +#else + "no" +#endif + ); + __kmp_printf("%s", buffer.str); + __kmp_str_buf_free(&buffer); + K_DIAG(1, ("KMP_VERSION is true\n")); +#endif // KMP_STUB +} // __kmp_print_version_1 + +// Called at parallel initialization time. 
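__kmp_print_version_1 above always prints from offset KMP_VERSION_MAGIC_LEN, skipping the six-byte "\x00@(#) " magic that kmp_version.h (below) prepends to every version string. A standalone illustration (editorial, not part of the patch; the "library type" payload is just an example assembled the same way __kmp_version_lib_type is):

#include <stdio.h>
#include <string.h>

#define KMP_VERSION_MAGIC_STR "\x00@(#) " /* as defined in kmp_version.h below */
#define KMP_VERSION_MAGIC_LEN 6
#define KMP_VERSION_PREF_STR "LLVM OMP "

int main(void) {
  const char ver[] =
      KMP_VERSION_MAGIC_STR KMP_VERSION_PREF_STR "library type: performance";
  // The leading '\0' keeps the text out of ordinary C string handling
  // (strlen sees an empty string) while `strings libomp.so | grep "@(#)"`
  // still finds it in the binary.
  printf("%d\n", (int)strlen(ver));   // prints: 0
  puts(&ver[KMP_VERSION_MAGIC_LEN]);  // prints: LLVM OMP library type: performance
  return 0;
}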
+static int __kmp_version_2_printed = FALSE; + +void __kmp_print_version_2(void) { + if (__kmp_version_2_printed) { + return; + } + __kmp_version_2_printed = TRUE; +} // __kmp_print_version_2 + +// end of file // diff --git a/third_party/openmp/kmp_version.h b/third_party/openmp/kmp_version.h new file mode 100644 index 000000000..6ce40eecb --- /dev/null +++ b/third_party/openmp/kmp_version.h @@ -0,0 +1,66 @@ +/* + * kmp_version.h -- version number for this release + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_VERSION_H +#define KMP_VERSION_H + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#ifndef KMP_VERSION_MAJOR +#error KMP_VERSION_MAJOR macro is not defined. +#endif +#define KMP_VERSION_MINOR 0 +/* Using "magic" prefix in all the version strings is rather convenient to get + static version info from binaries by using standard utilities "strings" and + "grep", e. g.: + $ strings libomp.so | grep "@(#)" + gives clean list of all version strings in the library. Leading zero helps + to keep version string separate from printable characters which may occurs + just before version string. */ +#define KMP_VERSION_MAGIC_STR "\x00@(#) " +#define KMP_VERSION_MAGIC_LEN 6 // Length of KMP_VERSION_MAGIC_STR. +#define KMP_VERSION_PREF_STR "LLVM OMP " +#define KMP_VERSION_PREFIX KMP_VERSION_MAGIC_STR KMP_VERSION_PREF_STR + +/* declare all the version string constants for KMP_VERSION env. variable */ +extern int const __kmp_version_major; +extern int const __kmp_version_minor; +extern int const __kmp_version_build; +extern int const __kmp_openmp_version; +extern char const + __kmp_copyright[]; // Old variable, kept for compatibility with ITC and ITP. +extern char const __kmp_version_copyright[]; +extern char const __kmp_version_lib_ver[]; +extern char const __kmp_version_lib_type[]; +extern char const __kmp_version_link_type[]; +extern char const __kmp_version_build_time[]; +extern char const __kmp_version_target_env[]; +extern char const __kmp_version_build_compiler[]; +extern char const __kmp_version_alt_comp[]; +extern char const __kmp_version_omp_api[]; +// ??? extern char const __kmp_version_debug[]; +extern char const __kmp_version_lock[]; +extern char const __kmp_version_nested_stats_reporting[]; +extern char const __kmp_version_ftnstdcall[]; +extern char const __kmp_version_ftncdecl[]; +extern char const __kmp_version_ftnextra[]; + +void __kmp_print_version_1(void); +void __kmp_print_version_2(void); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif /* KMP_VERSION_H */ diff --git a/third_party/openmp/kmp_wait_release.cpp b/third_party/openmp/kmp_wait_release.cpp new file mode 100644 index 000000000..d41ddf231 --- /dev/null +++ b/third_party/openmp/kmp_wait_release.cpp @@ -0,0 +1,51 @@ +/* + * kmp_wait_release.cpp -- Wait/Release implementation + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_wait_release.h" + +void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64<> *flag, + int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + if (final_spin) + __kmp_wait_template, TRUE>( + this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); + else + __kmp_wait_template, FALSE>( + this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj)); +} + +void __kmp_release_64(kmp_flag_64<> *flag) { __kmp_release_template(flag); } + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +template +void __kmp_mwait_32(int th_gtid, kmp_flag_32 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +template +void __kmp_mwait_64(int th_gtid, kmp_flag_64 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +template +void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64 *flag) { + __kmp_mwait_template(th_gtid, flag); +} +void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag) { + __kmp_mwait_template(th_gtid, flag); +} + +template void __kmp_mwait_32(int, kmp_flag_32 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +template void __kmp_mwait_64(int, kmp_flag_64 *); +template void +__kmp_atomic_mwait_64(int, kmp_atomic_flag_64 *); +template void +__kmp_atomic_mwait_64(int, kmp_atomic_flag_64 *); +#endif diff --git a/third_party/openmp/kmp_wait_release.h b/third_party/openmp/kmp_wait_release.h new file mode 100644 index 000000000..12d5d0677 --- /dev/null +++ b/third_party/openmp/kmp_wait_release.h @@ -0,0 +1,1051 @@ +/* + * kmp_wait_release.h -- Wait/Release implementation + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_WAIT_RELEASE_H +#define KMP_WAIT_RELEASE_H + +#include "kmp.h" +#include "kmp_itt.h" +#include "kmp_stats.h" +#if OMPT_SUPPORT +#include "ompt-specific.h" +#endif + +/*! +@defgroup WAIT_RELEASE Wait/Release operations + +The definitions and functions here implement the lowest level thread +synchronizations of suspending a thread and awaking it. They are used to build +higher level operations such as barriers and fork/join. +*/ + +/*! 
+@ingroup WAIT_RELEASE +@{ +*/ + +struct flag_properties { + unsigned int type : 16; + unsigned int reserved : 16; +}; + +template struct flag_traits {}; + +template <> struct flag_traits { + typedef kmp_uint32 flag_t; + static const flag_type t = flag32; + static inline flag_t tcr(flag_t f) { return TCR_4(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR32(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND32(f, v); + } +}; + +template <> struct flag_traits { + typedef kmp_uint64 flag_t; + static const flag_type t = atomic_flag64; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +template <> struct flag_traits { + typedef kmp_uint64 flag_t; + static const flag_type t = flag64; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +template <> struct flag_traits { + typedef kmp_uint64 flag_t; + static const flag_type t = flag_oncore; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { + return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); + } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_OR64(f, v); + } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { + return KMP_TEST_THEN_AND64(f, v); + } +}; + +/*! Base class for all flags */ +template class kmp_flag { +protected: + flag_properties t; /**< "Type" of the flag in loc */ + kmp_info_t *waiting_threads[1]; /**< Threads sleeping on this thread. */ + kmp_uint32 num_waiting_threads; /**< Num threads sleeping on this thread. */ + std::atomic *sleepLoc; + +public: + typedef flag_traits traits_type; + kmp_flag() : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(nullptr) {} + kmp_flag(int nwaiters) + : t({FlagType, 0U}), num_waiting_threads(nwaiters), sleepLoc(nullptr) {} + kmp_flag(std::atomic *sloc) + : t({FlagType, 0U}), num_waiting_threads(0), sleepLoc(sloc) {} + /*! @result the flag_type */ + flag_type get_type() { return (flag_type)(t.type); } + + /*! param i in index into waiting_threads + * @result the thread that is waiting at index i */ + kmp_info_t *get_waiter(kmp_uint32 i) { + KMP_DEBUG_ASSERT(i < num_waiting_threads); + return waiting_threads[i]; + } + /*! @result num_waiting_threads */ + kmp_uint32 get_num_waiters() { return num_waiting_threads; } + /*! @param thr in the thread which is now waiting + * Insert a waiting thread at index 0. */ + void set_waiter(kmp_info_t *thr) { + waiting_threads[0] = thr; + num_waiting_threads = 1; + } + enum barrier_type get_bt() { return bs_last_barrier; } +}; + +/*! 
Base class for wait/release volatile flag */ +template +class kmp_flag_native : public kmp_flag { +protected: + volatile PtrType *loc; + PtrType checker; /**< When flag==checker, it has been released. */ + typedef flag_traits traits_type; + +public: + typedef PtrType flag_t; + kmp_flag_native(volatile PtrType *p) : kmp_flag(), loc(p) {} + kmp_flag_native(volatile PtrType *p, kmp_info_t *thr) + : kmp_flag(1), loc(p) { + this->waiting_threads[0] = thr; + } + kmp_flag_native(volatile PtrType *p, PtrType c) + : kmp_flag(), loc(p), checker(c) {} + kmp_flag_native(volatile PtrType *p, PtrType c, std::atomic *sloc) + : kmp_flag(sloc), loc(p), checker(c) {} + virtual ~kmp_flag_native() {} + void *operator new(size_t size) { return __kmp_allocate(size); } + void operator delete(void *p) { __kmp_free(p); } + volatile PtrType *get() { return loc; } + void *get_void_p() { return RCAST(void *, CCAST(PtrType *, loc)); } + void set(volatile PtrType *new_loc) { loc = new_loc; } + PtrType load() { return *loc; } + void store(PtrType val) { *loc = val; } + /*! @result true if the flag object has been released. */ + virtual bool done_check() { + if (Sleepable && !(this->sleepLoc)) + return (traits_type::tcr(*(this->get())) & ~KMP_BARRIER_SLEEP_STATE) == + checker; + else + return traits_type::tcr(*(this->get())) == checker; + } + /*! @param old_loc in old value of flag + * @result true if the flag's old value indicates it was released. */ + virtual bool done_check_val(PtrType old_loc) { return old_loc == checker; } + /*! @result true if the flag object is not yet released. + * Used in __kmp_wait_template like: + * @code + * while (flag.notdone_check()) { pause(); } + * @endcode */ + virtual bool notdone_check() { + return traits_type::tcr(*(this->get())) != checker; + } + /*! @result Actual flag value before release was applied. + * Trigger all waiting threads to run by modifying flag to release state. */ + void internal_release() { + (void)traits_type::test_then_add4((volatile PtrType *)this->get()); + } + /*! @result Actual flag value before sleep bit(s) set. + * Notes that there is at least one thread sleeping on the flag by setting + * sleep bit(s). */ + PtrType set_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(true); + return *(this->get()); + } + return traits_type::test_then_or((volatile PtrType *)this->get(), + KMP_BARRIER_SLEEP_STATE); + } + /*! @result Actual flag value before sleep bit(s) cleared. + * Notes that there are no longer threads sleeping on the flag by clearing + * sleep bit(s). */ + void unset_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(false); + return; + } + traits_type::test_then_and((volatile PtrType *)this->get(), + ~KMP_BARRIER_SLEEP_STATE); + } + /*! @param old_loc in old value of flag + * Test if there are threads sleeping on the flag's old value in old_loc. */ + bool is_sleeping_val(PtrType old_loc) { + if (this->sleepLoc) + return this->sleepLoc->load(); + return old_loc & KMP_BARRIER_SLEEP_STATE; + } + /*! Test whether there are threads sleeping on the flag. */ + bool is_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(*(this->get())); + } + bool is_any_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(*(this->get())); + } + kmp_uint8 *get_stolen() { return NULL; } +}; + +/*! 
Base class for wait/release atomic flag */ +template +class kmp_flag_atomic : public kmp_flag { +protected: + std::atomic *loc; /**< Pointer to flag location to wait on */ + PtrType checker; /**< Flag == checker means it has been released. */ +public: + typedef flag_traits traits_type; + typedef PtrType flag_t; + kmp_flag_atomic(std::atomic *p) : kmp_flag(), loc(p) {} + kmp_flag_atomic(std::atomic *p, kmp_info_t *thr) + : kmp_flag(1), loc(p) { + this->waiting_threads[0] = thr; + } + kmp_flag_atomic(std::atomic *p, PtrType c) + : kmp_flag(), loc(p), checker(c) {} + kmp_flag_atomic(std::atomic *p, PtrType c, std::atomic *sloc) + : kmp_flag(sloc), loc(p), checker(c) {} + /*! @result the pointer to the actual flag */ + std::atomic *get() { return loc; } + /*! @result void* pointer to the actual flag */ + void *get_void_p() { return RCAST(void *, loc); } + /*! @param new_loc in set loc to point at new_loc */ + void set(std::atomic *new_loc) { loc = new_loc; } + /*! @result flag value */ + PtrType load() { return loc->load(std::memory_order_acquire); } + /*! @param val the new flag value to be stored */ + void store(PtrType val) { loc->store(val, std::memory_order_release); } + /*! @result true if the flag object has been released. */ + bool done_check() { + if (Sleepable && !(this->sleepLoc)) + return (this->load() & ~KMP_BARRIER_SLEEP_STATE) == checker; + else + return this->load() == checker; + } + /*! @param old_loc in old value of flag + * @result true if the flag's old value indicates it was released. */ + bool done_check_val(PtrType old_loc) { return old_loc == checker; } + /*! @result true if the flag object is not yet released. + * Used in __kmp_wait_template like: + * @code + * while (flag.notdone_check()) { pause(); } + * @endcode */ + bool notdone_check() { return this->load() != checker; } + /*! @result Actual flag value before release was applied. + * Trigger all waiting threads to run by modifying flag to release state. */ + void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); } + /*! @result Actual flag value before sleep bit(s) set. + * Notes that there is at least one thread sleeping on the flag by setting + * sleep bit(s). */ + PtrType set_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(true); + return *(this->get()); + } + return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE); + } + /*! @result Actual flag value before sleep bit(s) cleared. + * Notes that there are no longer threads sleeping on the flag by clearing + * sleep bit(s). */ + void unset_sleeping() { + if (this->sleepLoc) { + this->sleepLoc->store(false); + return; + } + KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE); + } + /*! @param old_loc in old value of flag + * Test whether there are threads sleeping on flag's old value in old_loc. */ + bool is_sleeping_val(PtrType old_loc) { + if (this->sleepLoc) + return this->sleepLoc->load(); + return old_loc & KMP_BARRIER_SLEEP_STATE; + } + /*! Test whether there are threads sleeping on the flag. 
*/ + bool is_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(this->load()); + } + bool is_any_sleeping() { + if (this->sleepLoc) + return this->sleepLoc->load(); + return is_sleeping_val(this->load()); + } + kmp_uint8 *get_stolen() { return NULL; } +}; + +#if OMPT_SUPPORT +OMPT_NOINLINE +static void __ompt_implicit_task_end(kmp_info_t *this_thr, + ompt_state_t ompt_state, + ompt_data_t *tId) { + int ds_tid = this_thr->th.th_info.ds.ds_tid; + if (ompt_state == ompt_state_wait_barrier_implicit) { + this_thr->th.ompt_thread_info.state = ompt_state_overhead; +#if OMPT_OPTIONAL + void *codeptr = NULL; + if (ompt_enabled.ompt_callback_sync_region_wait) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId, + codeptr); + } + if (ompt_enabled.ompt_callback_sync_region) { + ompt_callbacks.ompt_callback(ompt_callback_sync_region)( + ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId, + codeptr); + } +#endif + if (!KMP_MASTER_TID(ds_tid)) { + if (ompt_enabled.ompt_callback_implicit_task) { + int flags = this_thr->th.ompt_thread_info.parallel_flags; + flags = (flags & ompt_parallel_league) ? ompt_task_initial + : ompt_task_implicit; + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, tId, 0, ds_tid, flags); + } + // return to idle state + this_thr->th.ompt_thread_info.state = ompt_state_idle; + } else { + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } + } +} +#endif + +/* Spin wait loop that first does pause/yield, then sleep. A thread that calls + __kmp_wait_* must make certain that another thread calls __kmp_release + to wake it back up to prevent deadlocks! + + NOTE: We may not belong to a team at this point. */ +template +static inline bool +__kmp_wait_template(kmp_info_t *this_thr, + C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + volatile void *spin = flag->get(); +#endif + kmp_uint32 spins; + int th_gtid; + int tasks_completed = FALSE; +#if !KMP_USE_MONITOR + kmp_uint64 poll_count; + kmp_uint64 hibernate_goal; +#else + kmp_uint32 hibernate; +#endif + kmp_uint64 time; + + KMP_FSYNC_SPIN_INIT(spin, NULL); + if (flag->done_check()) { + KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin)); + return false; + } + th_gtid = this_thr->th.th_info.ds.ds_gtid; + if (Cancellable) { + kmp_team_t *team = this_thr->th.th_team; + if (team && team->t.t_cancel_request == cancel_parallel) + return true; + } +#if KMP_OS_UNIX + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); +#endif + KA_TRACE(20, + ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag)); +#if KMP_STATS_ENABLED + stats_state_e thread_state = KMP_GET_THREAD_STATE(); +#endif + +/* OMPT Behavior: +THIS function is called from + __kmp_barrier (2 times) (implicit or explicit barrier in parallel regions) + these have join / fork behavior + + In these cases, we don't change the state or trigger events in THIS +function. 
+ Events are triggered in the calling code (__kmp_barrier): + + state := ompt_state_overhead + barrier-begin + barrier-wait-begin + state := ompt_state_wait_barrier + call join-barrier-implementation (finally arrive here) + {} + call fork-barrier-implementation (finally arrive here) + {} + state := ompt_state_overhead + barrier-wait-end + barrier-end + state := ompt_state_work_parallel + + + __kmp_fork_barrier (after thread creation, before executing implicit task) + call fork-barrier-implementation (finally arrive here) + {} // worker arrive here with state = ompt_state_idle + + + __kmp_join_barrier (implicit barrier at end of parallel region) + state := ompt_state_barrier_implicit + barrier-begin + barrier-wait-begin + call join-barrier-implementation (finally arrive here +final_spin=FALSE) + { + } + __kmp_fork_barrier (implicit barrier at end of parallel region) + call fork-barrier-implementation (finally arrive here final_spin=TRUE) + + Worker after task-team is finished: + barrier-wait-end + barrier-end + implicit-task-end + idle-begin + state := ompt_state_idle + + Before leaving, if state = ompt_state_idle + idle-end + state := ompt_state_overhead +*/ +#if OMPT_SUPPORT + ompt_state_t ompt_entry_state; + ompt_data_t *tId; + if (ompt_enabled.enabled) { + ompt_entry_state = this_thr->th.ompt_thread_info.state; + if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit || + KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) { + ompt_lw_taskteam_t *team = NULL; + if (this_thr->th.th_team) + team = this_thr->th.th_team->t.ompt_serialized_team_info; + if (team) { + tId = &(team->ompt_task_info.task_data); + } else { + tId = OMPT_CUR_TASK_DATA(this_thr); + } + } else { + tId = &(this_thr->th.ompt_thread_info.task_data); + } + if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec || + this_thr->th.th_task_team == NULL)) { + // implicit task is done. Either no taskqueue, or task-team finished + __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); + } + } +#endif + + KMP_INIT_YIELD(spins); // Setup for waiting + KMP_INIT_BACKOFF(time); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || + __kmp_pause_status == kmp_soft_paused) { +#if KMP_USE_MONITOR +// The worker threads cannot rely on the team struct existing at this point. +// Use the bt values cached in the thread struct instead. +#ifdef KMP_ADJUST_BLOCKTIME + if (__kmp_pause_status == kmp_soft_paused || + (__kmp_zero_bt && !this_thr->th.th_team_bt_set)) + // Force immediate suspend if not set by user and more threads than + // available procs + hibernate = 0; + else + hibernate = this_thr->th.th_team_bt_intervals; +#else + hibernate = this_thr->th.th_team_bt_intervals; +#endif /* KMP_ADJUST_BLOCKTIME */ + + /* If the blocktime is nonzero, we want to make sure that we spin wait for + the entirety of the specified #intervals, plus up to one interval more. + This increment make certain that this thread doesn't go to sleep too + soon. */ + if (hibernate != 0) + hibernate++; + + // Add in the current time value. 
+ hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value); + KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n", + th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate, + hibernate - __kmp_global.g.g_time.dt.t_value)); +#else + if (__kmp_pause_status == kmp_soft_paused) { + // Force immediate suspend + hibernate_goal = KMP_NOW(); + } else + hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals; + poll_count = 0; + (void)poll_count; +#endif // KMP_USE_MONITOR + } + + KMP_MB(); + + // Main wait spin loop + while (flag->notdone_check()) { + kmp_task_team_t *task_team = NULL; + if (__kmp_tasking_mode != tskm_immediate_exec) { + task_team = this_thr->th.th_task_team; + /* If the thread's task team pointer is NULL, it means one of 3 things: + 1) A newly-created thread is first being released by + __kmp_fork_barrier(), and its task team has not been set up yet. + 2) All tasks have been executed to completion. + 3) Tasking is off for this region. This could be because we are in a + serialized region (perhaps the outer one), or else tasking was manually + disabled (KMP_TASKING=0). */ + if (task_team != NULL) { + if (TCR_SYNC_4(task_team->tt.tt_active)) { + if (KMP_TASKING_ENABLED(task_team)) { + flag->execute_tasks( + this_thr, th_gtid, final_spin, + &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } else + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } else { + KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)); +#if OMPT_SUPPORT + // task-team is done now, other cases should be catched above + if (final_spin && ompt_enabled.enabled) + __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); +#endif + this_thr->th.th_task_team = NULL; + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } + } else { + this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; + } // if + } // if + + KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin)); + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } + + // If we are oversubscribed, or have waited a bit (and + // KMP_LIBRARY=throughput), then yield + KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); + +#if KMP_STATS_ENABLED + // Check if thread has been signalled to idle state + // This indicates that the logical "join-barrier" has finished + if (this_thr->th.th_stats->isIdle() && + KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) { + KMP_SET_THREAD_STATE(IDLE); + KMP_PUSH_PARTITIONED_TIMER(OMP_idle); + } +#endif + // Check if the barrier surrounding this wait loop has been cancelled + if (Cancellable) { + kmp_team_t *team = this_thr->th.th_team; + if (team && team->t.t_cancel_request == cancel_parallel) + break; + } + + // For hidden helper thread, if task_team is nullptr, it means the main + // thread has not released the barrier. We cannot wait here because once the + // main thread releases all children barriers, all hidden helper threads are + // still sleeping. This leads to a problem that following configuration, + // such as task team sync, will not be performed such that this thread does + // not have task team. Usually it is not bad. However, a corner case is, + // when the first task encountered is an untied task, the check in + // __kmp_task_alloc will crash because it uses the task team pointer without + // checking whether it is nullptr. It is probably under some kind of + // assumption. 
+ if (task_team && KMP_HIDDEN_HELPER_WORKER_THREAD(th_gtid) && + !TCR_4(__kmp_hidden_helper_team_done)) { + // If there is still hidden helper tasks to be executed, the hidden helper + // thread will not enter a waiting status. + if (KMP_ATOMIC_LD_ACQ(&__kmp_unexecuted_hidden_helper_tasks) == 0) { + __kmp_hidden_helper_worker_thread_wait(); + } + continue; + } + + // Don't suspend if KMP_BLOCKTIME is set to "infinite" + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + __kmp_pause_status != kmp_soft_paused) + continue; + + // Don't suspend if there is a likelihood of new tasks being spawned. + if (task_team != NULL && TCR_4(task_team->tt.tt_found_tasks) && + !__kmp_wpolicy_passive) + continue; + +#if KMP_USE_MONITOR + // If we have waited a bit more, fall asleep + if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate) + continue; +#else + if (KMP_BLOCKING(hibernate_goal, poll_count++)) + continue; +#endif + // Don't suspend if wait loop designated non-sleepable + // in template parameters + if (!Sleepable) + continue; + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + if (__kmp_mwait_enabled || __kmp_umwait_enabled) { + KF_TRACE(50, ("__kmp_wait_sleep: T#%d using monitor/mwait\n", th_gtid)); + flag->mwait(th_gtid); + } else { +#endif + KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); +#if KMP_OS_UNIX + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); +#endif + flag->suspend(th_gtid); +#if KMP_OS_UNIX + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); +#endif +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + } +#endif + + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } else if (__kmp_tasking_mode != tskm_immediate_exec && + this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { + this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; + } + // TODO: If thread is done with work and times out, disband/free + } + +#if OMPT_SUPPORT + ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state; + if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) { +#if OMPT_OPTIONAL + if (final_spin) { + __ompt_implicit_task_end(this_thr, ompt_exit_state, tId); + ompt_exit_state = this_thr->th.ompt_thread_info.state; + } +#endif + if (ompt_exit_state == ompt_state_idle) { + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } + } +#endif +#if KMP_STATS_ENABLED + // If we were put into idle state, pop that off the state stack + if (KMP_GET_THREAD_STATE() == IDLE) { + KMP_POP_PARTITIONED_TIMER(); + KMP_SET_THREAD_STATE(thread_state); + this_thr->th.th_stats->resetIdleFlag(); + } +#endif + +#if KMP_OS_UNIX + if (final_spin) + KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); +#endif + KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin)); + if (Cancellable) { + kmp_team_t *team = this_thr->th.th_team; + if (team && team->t.t_cancel_request == cancel_parallel) { + if (tasks_completed) { + // undo the previous decrement of unfinished_threads so that the + // thread can decrement at the join barrier with no problem + kmp_task_team_t *task_team = this_thr->th.th_task_team; + std::atomic *unfinished_threads = + &(task_team->tt.tt_unfinished_threads); + KMP_ATOMIC_INC(unfinished_threads); + } + return true; + } + } + return false; +} + +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT +// Set up a monitor on the flag variable causing the calling thread to wait in +// a less active state until the flag variable is modified. 
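Before the full template, here is a stripped-down sketch of the handshake it performs (editorial illustration only, not part of the patch; it assumes the surrounding runtime headers, KMP_HAVE_UMWAIT, and that user-level umwait is enabled). The helper name and the bare std::atomic<bool> flag are made up for the example; the real code works through the kmp_flag classes.

// Sketch only: wait until *released becomes true, using umonitor/umwait.
static inline void sketch_umwait_until_released(std::atomic<bool> *released) {
  // Arm a monitor on the cache line that holds the flag.
  void *cacheline = (void *)(kmp_uintptr_t(released) & ~(CACHE_LINE - 1));
  __kmp_umonitor(cacheline);
  // Re-check after arming: a release that landed between the caller's last
  // check and __kmp_umonitor would otherwise never trigger a wake-up.
  if (!released->load(std::memory_order_acquire))
    __kmp_umwait(1, 100); // returns when the line is written or on timeout
}

The template below wraps this core in the suspend mutex, marks the thread as sleeping, publishes th_sleep_loc so a releasing thread can resume it, and falls back to __kmp_mm_monitor/__kmp_mm_mwait when only classic MWAIT is available.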
+template +static inline void __kmp_mwait_template(int th_gtid, C *flag) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_mwait); + kmp_info_t *th = __kmp_threads[th_gtid]; + + KF_TRACE(30, ("__kmp_mwait_template: T#%d enter for flag = %p\n", th_gtid, + flag->get())); + + // User-level mwait is available + KMP_DEBUG_ASSERT(__kmp_mwait_enabled || __kmp_umwait_enabled); + + __kmp_suspend_initialize_thread(th); + __kmp_lock_suspend_mx(th); + + volatile void *spin = flag->get(); + void *cacheline = (void *)(kmp_uintptr_t(spin) & ~(CACHE_LINE - 1)); + + if (!flag->done_check()) { + // Mark thread as no longer active + th->th.th_active = FALSE; + if (th->th.th_active_in_pool) { + th->th.th_active_in_pool = FALSE; + KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); + KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); + } + flag->set_sleeping(); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling monitor\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umonitor(cacheline); + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_monitor(cacheline, 0, 0); + } +#endif + // To avoid a race, check flag between 'monitor' and 'mwait'. A write to + // the address could happen after the last time we checked and before + // monitoring started, in which case monitor can't detect the change. + if (flag->done_check()) + flag->unset_sleeping(); + else { + // if flag changes here, wake-up happens immediately + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + th->th.th_sleep_loc_type = flag->get_type(); + __kmp_unlock_suspend_mx(th); + KF_TRACE(50, ("__kmp_mwait_template: T#%d calling mwait\n", th_gtid)); +#if KMP_HAVE_UMWAIT + if (__kmp_umwait_enabled) { + __kmp_umwait(1, 100); // to do: enable ctrl via hints, backoff counter + } +#elif KMP_HAVE_MWAIT + if (__kmp_mwait_enabled) { + __kmp_mm_mwait(0, __kmp_mwait_hints); + } +#endif + KF_TRACE(50, ("__kmp_mwait_template: T#%d mwait done\n", th_gtid)); + __kmp_lock_suspend_mx(th); + // Clean up sleep info; doesn't matter how/why this thread stopped waiting + if (flag->is_sleeping()) + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + } + // Mark thread as active again + th->th.th_active = TRUE; + if (TCR_4(th->th.th_in_pool)) { + KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); + th->th.th_active_in_pool = TRUE; + } + } // Drop out to main wait loop to check flag, handle tasks, etc. + __kmp_unlock_suspend_mx(th); + KF_TRACE(30, ("__kmp_mwait_template: T#%d exit\n", th_gtid)); +} +#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + +/* Release any threads specified as waiting on the flag by releasing the flag + and resume the waiting thread if indicated by the sleep bit(s). A thread that + calls __kmp_wait_template must call this function to wake up the potentially + sleeping thread and prevent deadlocks! */ +template static inline void __kmp_release_template(C *flag) { +#ifdef KMP_DEBUG + int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; +#endif + KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%x)\n", gtid, flag->get())); + KMP_DEBUG_ASSERT(flag->get()); + KMP_FSYNC_RELEASING(flag->get_void_p()); + + flag->internal_release(); + + KF_TRACE(100, ("__kmp_release: T#%d set new spin=%d\n", gtid, flag->get(), + flag->load())); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Only need to check sleep stuff if infinite block time not set. + // Are *any* threads waiting on flag sleeping? 
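    // (Editorial note summarizing the two-sided handshake: the waiter sets the
    //  flag's sleep bit and re-checks the flag value before actually
    //  suspending, while the releaser first publishes the new value via
    //  internal_release() above and only then resumes sleepers below, so a
    //  waiter either observes the new value on its final re-check or is woken
    //  by resume().)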
+ if (flag->is_any_sleeping()) { + for (unsigned int i = 0; i < flag->get_num_waiters(); ++i) { + // if sleeping waiter exists at i, sets current_waiter to i inside flag + kmp_info_t *waiter = flag->get_waiter(i); + if (waiter) { + int wait_gtid = waiter->th.th_info.ds.ds_gtid; + // Wake up thread if needed + KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep " + "flag(%p) set\n", + gtid, wait_gtid, flag->get())); + flag->resume(wait_gtid); // unsets flag's current_waiter when done + } + } + } + } +} + +template +class kmp_flag_32 : public kmp_flag_atomic { +public: + kmp_flag_32(std::atomic *p) + : kmp_flag_atomic(p) {} + kmp_flag_32(std::atomic *p, kmp_info_t *thr) + : kmp_flag_atomic(p, thr) {} + kmp_flag_32(std::atomic *p, kmp_uint32 c) + : kmp_flag_atomic(p, c) {} + void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_32(th_gtid, this); } +#endif + void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_32( + this_thr, gtid, this, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } + bool wait(kmp_info_t *this_thr, + int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + if (final_spin) + return __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + else + return __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } + flag_type get_ptr_type() { return flag32; } +}; + +template +class kmp_flag_64 : public kmp_flag_native { +public: + kmp_flag_64(volatile kmp_uint64 *p) + : kmp_flag_native(p) {} + kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) + : kmp_flag_native(p, thr) {} + kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) + : kmp_flag_native(p, c) {} + kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c, std::atomic *loc) + : kmp_flag_native(p, c, loc) {} + void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_64(th_gtid, this); } +#endif + void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_execute_tasks_64( + this_thr, gtid, this, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } + bool wait(kmp_info_t *this_thr, + int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + if (final_spin) + return __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + else + return __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } + flag_type get_ptr_type() { return flag64; } +}; + +template +class kmp_atomic_flag_64 + : public kmp_flag_atomic { +public: + kmp_atomic_flag_64(std::atomic *p) + : kmp_flag_atomic(p) {} + kmp_atomic_flag_64(std::atomic *p, kmp_info_t *thr) + : kmp_flag_atomic(p, thr) {} + kmp_atomic_flag_64(std::atomic *p, kmp_uint64 c) + : kmp_flag_atomic(p, c) {} + kmp_atomic_flag_64(std::atomic *p, kmp_uint64 c, + std::atomic *loc) + : kmp_flag_atomic(p, c, loc) {} + void suspend(int th_gtid) { __kmp_atomic_suspend_64(th_gtid, this); } + void mwait(int 
th_gtid) { __kmp_atomic_mwait_64(th_gtid, this); } + void resume(int th_gtid) { __kmp_atomic_resume_64(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { + return __kmp_atomic_execute_tasks_64( + this_thr, gtid, this, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } + bool wait(kmp_info_t *this_thr, + int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + if (final_spin) + return __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + else + return __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } + flag_type get_ptr_type() { return atomic_flag64; } +}; + +// Hierarchical 64-bit on-core barrier instantiation +class kmp_flag_oncore : public kmp_flag_native { + kmp_uint32 offset; /**< Portion of flag of interest for an operation. */ + bool flag_switch; /**< Indicates a switch in flag location. */ + enum barrier_type bt; /**< Barrier type. */ + kmp_info_t *this_thr; /**< Thread to redirect to different flag location. */ +#if USE_ITT_BUILD + void *itt_sync_obj; /**< ITT object to pass to new flag location. */ +#endif + unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) { + return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset]; + } + +public: + kmp_flag_oncore(volatile kmp_uint64 *p) + : kmp_flag_native(p), flag_switch(false) { + } + kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx) + : kmp_flag_native(p), offset(idx), + flag_switch(false), + bt(bs_last_barrier) USE_ITT_BUILD_ARG(itt_sync_obj(nullptr)) {} + kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx, + enum barrier_type bar_t, + kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt)) + : kmp_flag_native(p, c), offset(idx), + flag_switch(false), bt(bar_t), + this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {} + virtual ~kmp_flag_oncore() override {} + void *operator new(size_t size) { return __kmp_allocate(size); } + void operator delete(void *p) { __kmp_free(p); } + bool done_check_val(kmp_uint64 old_loc) override { + return byteref(&old_loc, offset) == checker; + } + bool done_check() override { return done_check_val(*get()); } + bool notdone_check() override { + // Calculate flag_switch + if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG) + flag_switch = true; + if (byteref(get(), offset) != 1 && !flag_switch) + return true; + else if (flag_switch) { + this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING; + kmp_flag_64<> flag(&this_thr->th.th_bar[bt].bb.b_go, + (kmp_uint64)KMP_BARRIER_STATE_BUMP); + __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + } + return false; + } + void internal_release() { + // Other threads can write their own bytes simultaneously. 
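    // (Editorial note: kmp_flag_oncore packs several threads' flags, one byte
    //  each, into a single kmp_uint64, and byteref() selects this thread's
    //  byte. When threads may be sleeping (finite blocktime), the release ORs
    //  in a mask whose only nonzero byte is ours, e.g. offset == 2 on a
    //  little-endian target yields mask == 0x0000000000010000, so concurrent
    //  releases by different threads cannot clobber each other's bytes.)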
+ if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { + byteref(get(), offset) = 1; + } else { + kmp_uint64 mask = 0; + byteref(&mask, offset) = 1; + KMP_TEST_THEN_OR64(get(), mask); + } + } + void wait(kmp_info_t *this_thr, int final_spin) { + if (final_spin) + __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + else + __kmp_wait_template( + this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } + void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); } +#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT + void mwait(int th_gtid) { __kmp_mwait_oncore(th_gtid, this); } +#endif + void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, + int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), + kmp_int32 is_constrained) { +#if OMPD_SUPPORT + int ret = __kmp_execute_tasks_oncore( + this_thr, gtid, this, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + if (ompd_state & OMPD_ENABLE_BP) + ompd_bp_task_end(); + return ret; +#else + return __kmp_execute_tasks_oncore( + this_thr, gtid, this, final_spin, + thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +#endif + } + enum barrier_type get_bt() { return bt; } + flag_type get_ptr_type() { return flag_oncore; } +}; + +static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { + int gtid = __kmp_gtid_from_thread(thr); + void *flag = CCAST(void *, thr->th.th_sleep_loc); + flag_type type = thr->th.th_sleep_loc_type; + if (!flag) + return; + // Attempt to wake up a thread: examine its type and call appropriate template + switch (type) { + case flag32: + __kmp_resume_32(gtid, RCAST(kmp_flag_32<> *, flag)); + break; + case flag64: + __kmp_resume_64(gtid, RCAST(kmp_flag_64<> *, flag)); + break; + case atomic_flag64: + __kmp_atomic_resume_64(gtid, RCAST(kmp_atomic_flag_64<> *, flag)); + break; + case flag_oncore: + __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag)); + break; + case flag_unset: + KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type)); + break; + } +} + +/*! +@} +*/ + +#endif // KMP_WAIT_RELEASE_H diff --git a/third_party/openmp/kmp_wrapper_getpid.h b/third_party/openmp/kmp_wrapper_getpid.h new file mode 100644 index 000000000..3527f22fe --- /dev/null +++ b/third_party/openmp/kmp_wrapper_getpid.h @@ -0,0 +1,85 @@ +/* + * kmp_wrapper_getpid.h -- getpid() declaration. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef KMP_WRAPPER_GETPID_H +#define KMP_WRAPPER_GETPID_H + +#if KMP_OS_UNIX + +// On Unix-like systems (Linux* OS and OS X*) getpid() is declared in standard +// headers. 
+#if !defined(KMP_OS_AIX)
+#include <sys/syscall.h>
+#endif
+#include <sys/types.h>
+#include <unistd.h>
+#if KMP_OS_DARWIN
+// OS X
+#define __kmp_gettid() pthread_mach_thread_np(pthread_self())
+#elif defined(__COSMOPOLITAN__)
+#include "libc/calls/calls.h"
+#define __kmp_gettid() gettid()
+#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
+#include <pthread_np.h>
+#define __kmp_gettid() pthread_getthreadid_np()
+#elif KMP_OS_NETBSD
+#include <lwp.h>
+#define __kmp_gettid() _lwp_self()
+#elif KMP_OS_OPENBSD
+#define __kmp_gettid() getthrid()
+#elif KMP_OS_AIX
+#include <pthread.h>
+#define __kmp_gettid() pthread_self()
+#elif defined(SYS_gettid)
+// Hopefully other Unix systems define SYS_gettid syscall for getting os thread
+// id
+#define __kmp_gettid() syscall(SYS_gettid)
+#else
+#warning No gettid found, use getpid instead
+#define __kmp_gettid() getpid()
+#endif
+
+#elif KMP_OS_WINDOWS
+
+// On Windows* OS _getpid() returns int (not pid_t) and is declared in
+// "process.h".
+#include <process.h>
+// Let us simulate Unix.
+#if KMP_MSVC_COMPAT
+typedef int pid_t;
+#endif
+#define getpid _getpid
+#define __kmp_gettid() GetCurrentThreadId()
+
+#else
+
+#error Unknown or unsupported OS.
+
+#endif
+
+/* TODO: All the libomp source code uses pid_t type for storing the result of
+   getpid(), it is good. But often it printed as "%d", that is not good, because
+   it ignores pid_t definition (may pid_t be longer that int?). It seems all pid
+   prints should be rewritten as:
+
+   printf( "%" KMP_UINT64_SPEC, (kmp_uint64) pid );
+
+   or (at least) as
+
+   printf( "%" KMP_UINT32_SPEC, (kmp_uint32) pid );
+
+   (kmp_uint32, kmp_uint64, KMP_UINT64_SPEC, and KMP_UNIT32_SPEC are defined in
+   "kmp_os.h".) */
+
+#endif // KMP_WRAPPER_GETPID_H
+
+// end of file //
diff --git a/third_party/openmp/kmp_wrapper_malloc.h b/third_party/openmp/kmp_wrapper_malloc.h
new file mode 100644
index 000000000..1f75e88a2
--- /dev/null
+++ b/third_party/openmp/kmp_wrapper_malloc.h
@@ -0,0 +1,196 @@
+/*
+ * kmp_wrapper_malloc.h -- Wrappers for memory allocation routines
+ *                         (malloc(), free(), and others).
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_WRAPPER_MALLOC_H
+#define KMP_WRAPPER_MALLOC_H
+
+/* This header serves for 3 purposes:
+   1. Declaring standard memory allocation routines in OS-independent way.
+   2. Passing source location info through memory allocation wrappers.
+   3. Enabling native memory debugging capabilities.
+
+   1. Declaring standard memory allocation routines in OS-independent way.
+   -----------------------------------------------------------------------
+   On Linux* OS, alloca() function is declared in <alloca.h> header, while on
+   Windows* OS there is no <alloca.h> header, function _alloca() (note
+   underscore!) is declared in <malloc.h>. This header eliminates these
+   differences, so client code including "kmp_wrapper_malloc.h" can rely on
+   following routines:
+
+        malloc
+        calloc
+        realloc
+        free
+        alloca
+
+   in OS-independent way. It also enables memory tracking capabilities in debug
+   build. (Currently it is available only on Windows* OS.)
+
+   2. Passing source location info through memory allocation wrappers.
+   -------------------------------------------------------------------
+   Some tools may help debugging memory errors, for example, report memory
+   leaks.
However, memory allocation wrappers may hinder source location. + For example: + + void * aligned_malloc( int size ) { + void * ptr = malloc( size ); // All the memory leaks will be reported at + // this line. + // some adjustments... + return ptr; + }; + + ptr = aligned_malloc( size ); // Memory leak will *not* be detected here. :-( + + To overcome the problem, information about original source location should + be passed through all the memory allocation wrappers, for example: + + void * aligned_malloc( int size, char const * file, int line ) { + void * ptr = _malloc_dbg( size, file, line ); + // some adjustments... + return ptr; + }; + void * ptr = aligned_malloc( size, __FILE__, __LINE__ ); + + This is a good idea for debug, but passing additional arguments impacts + performance. Disabling extra arguments in release version of the software + introduces too many conditional compilation, which makes code unreadable. + This header defines few macros and functions facilitating it: + + void * _aligned_malloc( int size KMP_SRC_LOC_DECL ) { + void * ptr = malloc_src_loc( size KMP_SRC_LOC_PARM ); + // some adjustments... + return ptr; + }; + #define aligned_malloc( size ) _aligned_malloc( (size) KMP_SRC_LOC_CURR ) + // Use macro instead of direct call to function. + + void * ptr = aligned_malloc( size ); // Bingo! Memory leak will be + // reported at this line. + + 3. Enabling native memory debugging capabilities. + ------------------------------------------------- + Some platforms may offer memory debugging capabilities. For example, debug + version of Microsoft RTL tracks all memory allocations and can report memory + leaks. This header enables this, and makes report more useful (see "Passing + source location info through memory allocation wrappers"). +*/ + +#include + +#include "kmp_os.h" + +// Include alloca() declaration. +#if KMP_OS_WINDOWS +#include // Windows* OS: _alloca() declared in "malloc.h". +#if KMP_MSVC_COMPAT +#define alloca _alloca // Allow to use alloca() with no underscore. +#endif +#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD +// Declared in "stdlib.h". +#elif KMP_OS_UNIX +#include // Linux* OS and OS X*: alloc() declared in "alloca". +#else +#error Unknown or unsupported OS. +#endif + +/* KMP_SRC_LOC_DECL -- Declaring source location parameters, to be used in + function declaration. + KMP_SRC_LOC_PARM -- Source location parameters, to be used to pass + parameters to underlying levels. + KMP_SRC_LOC_CURR -- Source location arguments describing current location, + to be used at top-level. + + Typical usage: + void * _aligned_malloc( int size KMP_SRC_LOC_DECL ) { + // Note: Comma is missed before KMP_SRC_LOC_DECL. + KE_TRACE( 25, ( "called from %s:%d\n", KMP_SRC_LOC_PARM ) ); + ... + } + #define aligned_malloc( size ) _aligned_malloc( (size) KMP_SRC_LOC_CURR ) + // Use macro instead of direct call to function -- macro passes info + // about current source location to the func. +*/ +#if KMP_DEBUG +#define KMP_SRC_LOC_DECL , char const *_file_, int _line_ +#define KMP_SRC_LOC_PARM , _file_, _line_ +#define KMP_SRC_LOC_CURR , __FILE__, __LINE__ +#else +#define KMP_SRC_LOC_DECL +#define KMP_SRC_LOC_PARM +#define KMP_SRC_LOC_CURR +#endif // KMP_DEBUG + +/* malloc_src_loc() and free_src_loc() are pseudo-functions (really macros) + with accepts extra arguments (source location info) in debug mode. They + should be used in place of malloc() and free(), this allows enabling native + memory debugging capabilities (if any). 
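   (Editorial illustration, not from the upstream comment: with KMP_DEBUG
   defined, the "outside of wrapper" form shown below,
       ptr = malloc_src_loc( size KMP_SRC_LOC_CURR );
   expands to _malloc_src_loc( size, __FILE__, __LINE__ ), which in turn becomes
   _malloc_dbg(...) where the Microsoft debug RTL is available and plain
   malloc( size ) elsewhere; in a release build both forms collapse to
   malloc( size ).)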
+ + Typical usage: + ptr = malloc_src_loc( size KMP_SRC_LOC_PARM ); + // Inside memory allocation wrapper, or + ptr = malloc_src_loc( size KMP_SRC_LOC_CURR ); + // Outside of memory allocation wrapper. +*/ +#define malloc_src_loc(args) _malloc_src_loc(args) +#define free_src_loc(args) _free_src_loc(args) +/* Depending on build mode (debug or release), malloc_src_loc is declared with + 1 or 3 parameters, but calls to malloc_src_loc() are always the same: + + ... malloc_src_loc( size KMP_SRC_LOC_PARM ); // or KMP_SRC_LOC_CURR + + Compiler issues warning/error "too few arguments in macro invocation". + Declaring two macros, malloc_src_loc() and _malloc_src_loc(), overcomes the + problem. */ + +#if KMP_DEBUG + +#if KMP_OS_WINDOWS && _DEBUG && !defined(__MINGW32__) +// KMP_DEBUG != _DEBUG. MS debug RTL is available only if _DEBUG is defined. + +// Windows* OS has native memory debugging capabilities. Enable them. + +#include + +#define KMP_MEM_BLOCK _CLIENT_BLOCK +#define malloc(size) _malloc_dbg((size), KMP_MEM_BLOCK, __FILE__, __LINE__) +#define calloc(num, size) \ + _calloc_dbg((num), (size), KMP_MEM_BLOCK, __FILE__, __LINE__) +#define realloc(ptr, size) \ + _realloc_dbg((ptr), (size), KMP_MEM_BLOCK, __FILE__, __LINE__) +#define free(ptr) _free_dbg((ptr), KMP_MEM_BLOCK) + +#define _malloc_src_loc(size, file, line) \ + _malloc_dbg((size), KMP_MEM_BLOCK, (file), (line)) +#define _free_src_loc(ptr, file, line) _free_dbg((ptr), KMP_MEM_BLOCK) + +#else + +// Linux* OS, OS X*, or non-debug Windows* OS. + +#define _malloc_src_loc(size, file, line) malloc((size)) +#define _free_src_loc(ptr, file, line) free((ptr)) + +#endif + +#else + +// In release build malloc_src_loc() and free_src_loc() do not have extra +// parameters. +#define _malloc_src_loc(size) malloc((size)) +#define _free_src_loc(ptr) free((ptr)) + +#endif // KMP_DEBUG + +#endif // KMP_WRAPPER_MALLOC_H + +// end of file // diff --git a/third_party/openmp/omp-tools.h b/third_party/openmp/omp-tools.h new file mode 100644 index 000000000..a3ec0309d --- /dev/null +++ b/third_party/openmp/omp-tools.h @@ -0,0 +1,1417 @@ +/* + * include/omp-tools.h.var + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPT__ +#define __OMPT__ + +/***************************************************************************** + * system include files + *****************************************************************************/ + +#include +#include + +#ifdef DEPRECATION_WARNINGS +# ifdef __cplusplus +# define DEPRECATED_51 [[deprecated("as of 5.1")]] +# else +# define DEPRECATED_51 __attribute__((deprecated("as of 5.1"))) +#endif +#else +#define DEPRECATED_51 +#endif + +/***************************************************************************** + * iteration macros + *****************************************************************************/ + +#define FOREACH_OMPT_INQUIRY_FN(macro) \ + macro (ompt_enumerate_states) \ + macro (ompt_enumerate_mutex_impls) \ + \ + macro (ompt_set_callback) \ + macro (ompt_get_callback) \ + \ + macro (ompt_get_state) \ + \ + macro (ompt_get_parallel_info) \ + macro (ompt_get_task_info) \ + macro (ompt_get_task_memory) \ + macro (ompt_get_thread_data) \ + macro (ompt_get_unique_id) \ + macro (ompt_finalize_tool) \ + \ + macro(ompt_get_num_procs) \ + macro(ompt_get_num_places) \ + macro(ompt_get_place_proc_ids) \ + macro(ompt_get_place_num) \ + macro(ompt_get_partition_place_nums) \ + macro(ompt_get_proc_id) \ + \ + macro(ompt_get_target_info) \ + macro(ompt_get_num_devices) + +#define FOREACH_OMPT_STATE(macro) \ + \ + /* first available state */ \ + macro (ompt_state_undefined, 0x102) /* undefined thread state */ \ + \ + /* work states (0..15) */ \ + macro (ompt_state_work_serial, 0x000) /* working outside parallel */ \ + macro (ompt_state_work_parallel, 0x001) /* working within parallel */ \ + macro (ompt_state_work_reduction, 0x002) /* performing a reduction */ \ + \ + /* barrier wait states (16..31) */ \ + macro (ompt_state_wait_barrier, 0x010) /* waiting at a barrier */ \ + macro (ompt_state_wait_barrier_implicit_parallel, 0x011) \ + /* implicit barrier at the end of parallel region */\ + macro (ompt_state_wait_barrier_implicit_workshare, 0x012) \ + /* implicit barrier at the end of worksharing */ \ + macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \ + macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \ + \ + /* task wait states (32..63) */ \ + macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \ + macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \ + \ + /* mutex wait states (64..127) */ \ + macro (ompt_state_wait_mutex, 0x040) \ + macro (ompt_state_wait_lock, 0x041) /* waiting for lock */ \ + macro (ompt_state_wait_critical, 0x042) /* waiting for critical */ \ + macro (ompt_state_wait_atomic, 0x043) /* waiting for atomic */ \ + macro (ompt_state_wait_ordered, 0x044) /* waiting for ordered */ \ + \ + /* target wait states (128..255) */ \ + macro (ompt_state_wait_target, 0x080) /* waiting for target region */ \ + macro (ompt_state_wait_target_map, 0x081) /* waiting for target data mapping operation */ \ + macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */ \ + \ + /* misc (256..511) */ \ + macro (ompt_state_idle, 0x100) /* waiting for work */ \ + macro (ompt_state_overhead, 0x101) /* overhead excluding wait states */ \ + \ + /* implementation-specific states (512..) 
*/ + + +#define FOREACH_KMP_MUTEX_IMPL(macro) \ + macro (kmp_mutex_impl_none, 0) /* unknown implementation */ \ + macro (kmp_mutex_impl_spin, 1) /* based on spin */ \ + macro (kmp_mutex_impl_queuing, 2) /* based on some fair policy */ \ + macro (kmp_mutex_impl_speculative, 3) /* based on HW-supported speculation */ + +#define FOREACH_OMPT_HOST_EVENT(macro) \ + \ + /*--- Mandatory Events ---*/ \ + macro (ompt_callback_thread_begin, ompt_callback_thread_begin_t, 1) /* thread begin */ \ + macro (ompt_callback_thread_end, ompt_callback_thread_end_t, 2) /* thread end */ \ + \ + macro (ompt_callback_parallel_begin, ompt_callback_parallel_begin_t, 3) /* parallel begin */ \ + macro (ompt_callback_parallel_end, ompt_callback_parallel_end_t, 4) /* parallel end */ \ + \ + macro (ompt_callback_task_create, ompt_callback_task_create_t, 5) /* task begin */ \ + macro (ompt_callback_task_schedule, ompt_callback_task_schedule_t, 6) /* task schedule */ \ + macro (ompt_callback_implicit_task, ompt_callback_implicit_task_t, 7) /* implicit task */ \ + \ + macro (ompt_callback_control_tool, ompt_callback_control_tool_t, 11) /* control tool */ \ + \ + /* Optional Events */ \ + macro (ompt_callback_sync_region_wait, ompt_callback_sync_region_t, 16) /* sync region wait begin or end */ \ + \ + macro (ompt_callback_mutex_released, ompt_callback_mutex_t, 17) /* mutex released */ \ + \ + macro (ompt_callback_dependences, ompt_callback_dependences_t, 18) /* report task dependences */ \ + macro (ompt_callback_task_dependence, ompt_callback_task_dependence_t, 19) /* report task dependence */ \ + \ + macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \ + \ + macro (ompt_callback_masked, ompt_callback_masked_t, 21) /* task at masked begin or end */ \ + \ + macro (ompt_callback_sync_region, ompt_callback_sync_region_t, 23) /* sync region begin or end */ \ + \ + macro (ompt_callback_lock_init, ompt_callback_mutex_acquire_t, 24) /* lock init */ \ + macro (ompt_callback_lock_destroy, ompt_callback_mutex_t, 25) /* lock destroy */ \ + \ + macro (ompt_callback_mutex_acquire, ompt_callback_mutex_acquire_t, 26) /* mutex acquire */ \ + macro (ompt_callback_mutex_acquired, ompt_callback_mutex_t, 27) /* mutex acquired */ \ + \ + macro (ompt_callback_nest_lock, ompt_callback_nest_lock_t, 28) /* nest lock */ \ + \ + macro (ompt_callback_flush, ompt_callback_flush_t, 29) /* after executing flush */ \ + \ + macro (ompt_callback_cancel, ompt_callback_cancel_t, 30) /* cancel innermost binding region */ \ + \ + macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \ + \ + macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */ \ + macro (ompt_callback_error, ompt_callback_error_t, 37) /* error */ + +#define FOREACH_OMPT_DEVICE_EVENT(macro) \ + /*--- Mandatory Events ---*/ \ + macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \ + macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \ + \ + macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \ + macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */ + +#define FOREACH_OMPT_NOEMI_EVENT(macro) \ + /*--- Mandatory Events ---*/ \ + macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \ + macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \ + macro (ompt_callback_target_submit, 
ompt_callback_target_submit_t, 10) /* target submit */ \ + /* Optional Events */ \ + macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ + +#define FOREACH_OMPT_EMI_EVENT(macro) \ + /*--- Mandatory Events ---*/ \ + macro (ompt_callback_target_emi, ompt_callback_target_emi_t, 33) /* target */ \ + macro (ompt_callback_target_data_op_emi,ompt_callback_target_data_op_emi_t,34) /* target data op */ \ + macro (ompt_callback_target_submit_emi, ompt_callback_target_submit_emi_t, 35) /* target submit */ \ + /* Optional Events */ \ + macro (ompt_callback_target_map_emi, ompt_callback_target_map_emi_t, 36) /* target map */ + +#define FOREACH_OMPT_50_TARGET_EVENT(macro) \ + FOREACH_OMPT_DEVICE_EVENT(macro) \ + FOREACH_OMPT_NOEMI_EVENT(macro) + +#define FOREACH_OMPT_51_TARGET_EVENT(macro) \ + FOREACH_OMPT_DEVICE_EVENT(macro) \ + FOREACH_OMPT_EMI_EVENT(macro) + +#define FOREACH_OMPT_EVENT(macro) \ + FOREACH_OMPT_HOST_EVENT(macro) \ + FOREACH_OMPT_DEVICE_EVENT(macro) \ + FOREACH_OMPT_NOEMI_EVENT(macro) \ + FOREACH_OMPT_EMI_EVENT(macro) + +#define FOREACH_OMPT_51_EVENT(macro) \ + FOREACH_OMPT_HOST_EVENT(macro) \ + FOREACH_OMPT_DEVICE_EVENT(macro) \ + FOREACH_OMPT_EMI_EVENT(macro) + +/***************************************************************************** + * implementation specific types + *****************************************************************************/ + +typedef enum kmp_mutex_impl_t { +#define kmp_mutex_impl_macro(impl, code) impl = code, + FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro) +#undef kmp_mutex_impl_macro +} kmp_mutex_impl_t; + +/***************************************************************************** + * definitions generated from spec + *****************************************************************************/ + +typedef enum ompt_callbacks_t { + ompt_callback_thread_begin = 1, + ompt_callback_thread_end = 2, + ompt_callback_parallel_begin = 3, + ompt_callback_parallel_end = 4, + ompt_callback_task_create = 5, + ompt_callback_task_schedule = 6, + ompt_callback_implicit_task = 7, + ompt_callback_target = 8, + ompt_callback_target_data_op = 9, + ompt_callback_target_submit = 10, + ompt_callback_control_tool = 11, + ompt_callback_device_initialize = 12, + ompt_callback_device_finalize = 13, + ompt_callback_device_load = 14, + ompt_callback_device_unload = 15, + ompt_callback_sync_region_wait = 16, + ompt_callback_mutex_released = 17, + ompt_callback_dependences = 18, + ompt_callback_task_dependence = 19, + ompt_callback_work = 20, + ompt_callback_master DEPRECATED_51 = 21, + ompt_callback_masked = 21, + ompt_callback_target_map = 22, + ompt_callback_sync_region = 23, + ompt_callback_lock_init = 24, + ompt_callback_lock_destroy = 25, + ompt_callback_mutex_acquire = 26, + ompt_callback_mutex_acquired = 27, + ompt_callback_nest_lock = 28, + ompt_callback_flush = 29, + ompt_callback_cancel = 30, + ompt_callback_reduction = 31, + ompt_callback_dispatch = 32, + ompt_callback_target_emi = 33, + ompt_callback_target_data_op_emi = 34, + ompt_callback_target_submit_emi = 35, + ompt_callback_target_map_emi = 36, + ompt_callback_error = 37 +} ompt_callbacks_t; + +typedef enum ompt_record_t { + ompt_record_ompt = 1, + ompt_record_native = 2, + ompt_record_invalid = 3 +} ompt_record_t; + +typedef enum ompt_record_native_t { + ompt_record_native_info = 1, + ompt_record_native_event = 2 +} ompt_record_native_t; + +typedef enum ompt_set_result_t { + ompt_set_error = 0, + ompt_set_never = 1, + ompt_set_impossible = 2, + ompt_set_sometimes = 3, + 
ompt_set_sometimes_paired = 4, + ompt_set_always = 5 +} ompt_set_result_t; + +typedef uint64_t ompt_id_t; + +typedef uint64_t ompt_device_time_t; + +typedef uint64_t ompt_buffer_cursor_t; + +typedef enum ompt_thread_t { + ompt_thread_initial = 1, + ompt_thread_worker = 2, + ompt_thread_other = 3, + ompt_thread_unknown = 4 +} ompt_thread_t; + +typedef enum ompt_scope_endpoint_t { + ompt_scope_begin = 1, + ompt_scope_end = 2, + ompt_scope_beginend = 3 +} ompt_scope_endpoint_t; + +typedef enum ompt_dispatch_t { + ompt_dispatch_iteration = 1, + ompt_dispatch_section = 2, + ompt_dispatch_ws_loop_chunk = 3, + ompt_dispatch_taskloop_chunk = 4, + ompt_dispatch_distribute_chunk = 5 +} ompt_dispatch_t; + +typedef enum ompt_sync_region_t { + ompt_sync_region_barrier DEPRECATED_51 = 1, + ompt_sync_region_barrier_implicit DEPRECATED_51 = 2, + ompt_sync_region_barrier_explicit = 3, + ompt_sync_region_barrier_implementation = 4, + ompt_sync_region_taskwait = 5, + ompt_sync_region_taskgroup = 6, + ompt_sync_region_reduction = 7, + ompt_sync_region_barrier_implicit_workshare = 8, + ompt_sync_region_barrier_implicit_parallel = 9, + ompt_sync_region_barrier_teams = 10 +} ompt_sync_region_t; + +typedef enum ompt_target_data_op_t { + ompt_target_data_alloc = 1, + ompt_target_data_transfer_to_device = 2, + ompt_target_data_transfer_from_device = 3, + ompt_target_data_delete = 4, + ompt_target_data_associate = 5, + ompt_target_data_disassociate = 6, + ompt_target_data_alloc_async = 17, + ompt_target_data_transfer_to_device_async = 18, + ompt_target_data_transfer_from_device_async = 19, + ompt_target_data_delete_async = 20 +} ompt_target_data_op_t; + +typedef enum ompt_work_t { + ompt_work_loop = 1, + ompt_work_sections = 2, + ompt_work_single_executor = 3, + ompt_work_single_other = 4, + ompt_work_workshare = 5, + ompt_work_distribute = 6, + ompt_work_taskloop = 7, + ompt_work_scope = 8, + ompt_work_loop_static = 10, + ompt_work_loop_dynamic = 11, + ompt_work_loop_guided = 12, + ompt_work_loop_other = 13 +} ompt_work_t; + +typedef enum ompt_mutex_t { + ompt_mutex_lock = 1, + ompt_mutex_test_lock = 2, + ompt_mutex_nest_lock = 3, + ompt_mutex_test_nest_lock = 4, + ompt_mutex_critical = 5, + ompt_mutex_atomic = 6, + ompt_mutex_ordered = 7 +} ompt_mutex_t; + +typedef enum ompt_native_mon_flag_t { + ompt_native_data_motion_explicit = 0x01, + ompt_native_data_motion_implicit = 0x02, + ompt_native_kernel_invocation = 0x04, + ompt_native_kernel_execution = 0x08, + ompt_native_driver = 0x10, + ompt_native_runtime = 0x20, + ompt_native_overhead = 0x40, + ompt_native_idleness = 0x80 +} ompt_native_mon_flag_t; + +typedef enum ompt_task_flag_t { + ompt_task_initial = 0x00000001, + ompt_task_implicit = 0x00000002, + ompt_task_explicit = 0x00000004, + ompt_task_target = 0x00000008, + ompt_task_taskwait = 0x00000010, + ompt_task_undeferred = 0x08000000, + ompt_task_untied = 0x10000000, + ompt_task_final = 0x20000000, + ompt_task_mergeable = 0x40000000, + ompt_task_merged = 0x80000000 +} ompt_task_flag_t; + +typedef enum ompt_task_status_t { + ompt_task_complete = 1, + ompt_task_yield = 2, + ompt_task_cancel = 3, + ompt_task_detach = 4, + ompt_task_early_fulfill = 5, + ompt_task_late_fulfill = 6, + ompt_task_switch = 7, + ompt_taskwait_complete = 8 +} ompt_task_status_t; + +typedef enum ompt_target_t { + ompt_target = 1, + ompt_target_enter_data = 2, + ompt_target_exit_data = 3, + ompt_target_update = 4, + ompt_target_nowait = 9, + ompt_target_enter_data_nowait = 10, + ompt_target_exit_data_nowait = 11, + 
ompt_target_update_nowait = 12 +} ompt_target_t; + +typedef enum ompt_parallel_flag_t { + ompt_parallel_invoker_program = 0x00000001, + ompt_parallel_invoker_runtime = 0x00000002, + ompt_parallel_league = 0x40000000, + ompt_parallel_team = 0x80000000 +} ompt_parallel_flag_t; + +typedef enum ompt_target_map_flag_t { + ompt_target_map_flag_to = 0x01, + ompt_target_map_flag_from = 0x02, + ompt_target_map_flag_alloc = 0x04, + ompt_target_map_flag_release = 0x08, + ompt_target_map_flag_delete = 0x10, + ompt_target_map_flag_implicit = 0x20 +} ompt_target_map_flag_t; + +typedef enum ompt_dependence_type_t { + ompt_dependence_type_in = 1, + ompt_dependence_type_out = 2, + ompt_dependence_type_inout = 3, + ompt_dependence_type_mutexinoutset = 4, + ompt_dependence_type_source = 5, + ompt_dependence_type_sink = 6, + ompt_dependence_type_inoutset = 7, + ompt_dependence_type_out_all_memory = 34, + ompt_dependence_type_inout_all_memory = 35 +} ompt_dependence_type_t; + +typedef enum ompt_severity_t { + ompt_warning = 1, + ompt_fatal = 2 +} ompt_severity_t; + +typedef enum ompt_cancel_flag_t { + ompt_cancel_parallel = 0x01, + ompt_cancel_sections = 0x02, + ompt_cancel_loop = 0x04, + ompt_cancel_taskgroup = 0x08, + ompt_cancel_activated = 0x10, + ompt_cancel_detected = 0x20, + ompt_cancel_discarded_task = 0x40 +} ompt_cancel_flag_t; + +typedef uint64_t ompt_hwid_t; + +typedef uint64_t ompt_wait_id_t; + +typedef enum ompt_frame_flag_t { + ompt_frame_runtime = 0x00, + ompt_frame_application = 0x01, + ompt_frame_cfa = 0x10, + ompt_frame_framepointer = 0x20, + ompt_frame_stackaddress = 0x30 +} ompt_frame_flag_t; + +typedef enum ompt_state_t { + ompt_state_work_serial = 0x000, + ompt_state_work_parallel = 0x001, + ompt_state_work_reduction = 0x002, + + ompt_state_wait_barrier DEPRECATED_51 = 0x010, + ompt_state_wait_barrier_implicit_parallel = 0x011, + ompt_state_wait_barrier_implicit_workshare = 0x012, + ompt_state_wait_barrier_implicit DEPRECATED_51 = 0x013, + ompt_state_wait_barrier_explicit = 0x014, + ompt_state_wait_barrier_implementation = 0x015, + ompt_state_wait_barrier_teams = 0x016, + + ompt_state_wait_taskwait = 0x020, + ompt_state_wait_taskgroup = 0x021, + + ompt_state_wait_mutex = 0x040, + ompt_state_wait_lock = 0x041, + ompt_state_wait_critical = 0x042, + ompt_state_wait_atomic = 0x043, + ompt_state_wait_ordered = 0x044, + + ompt_state_wait_target = 0x080, + ompt_state_wait_target_map = 0x081, + ompt_state_wait_target_update = 0x082, + + ompt_state_idle = 0x100, + ompt_state_overhead = 0x101, + ompt_state_undefined = 0x102 +} ompt_state_t; + +typedef uint64_t (*ompt_get_unique_id_t) (void); + +typedef uint64_t ompd_size_t; + +typedef uint64_t ompd_wait_id_t; + +typedef uint64_t ompd_addr_t; +typedef int64_t ompd_word_t; +typedef uint64_t ompd_seg_t; + +typedef uint64_t ompd_device_t; + +typedef uint64_t ompd_thread_id_t; + +typedef enum ompd_scope_t { + ompd_scope_global = 1, + ompd_scope_address_space = 2, + ompd_scope_thread = 3, + ompd_scope_parallel = 4, + ompd_scope_implicit_task = 5, + ompd_scope_task = 6 +} ompd_scope_t; + +typedef uint64_t ompd_icv_id_t; + +typedef enum ompd_rc_t { + ompd_rc_ok = 0, + ompd_rc_unavailable = 1, + ompd_rc_stale_handle = 2, + ompd_rc_bad_input = 3, + ompd_rc_error = 4, + ompd_rc_unsupported = 5, + ompd_rc_needs_state_tracking = 6, + ompd_rc_incompatible = 7, + ompd_rc_device_read_error = 8, + ompd_rc_device_write_error = 9, + ompd_rc_nomem = 10, + ompd_rc_incomplete = 11, + ompd_rc_callback_error = 12 +} ompd_rc_t; + +typedef void (*ompt_interface_fn_t) 
(void); + +typedef ompt_interface_fn_t (*ompt_function_lookup_t) ( + const char *interface_function_name +); + +typedef union ompt_data_t { + uint64_t value; + void *ptr; +} ompt_data_t; + +typedef struct ompt_frame_t { + ompt_data_t exit_frame; + ompt_data_t enter_frame; + int exit_frame_flags; + int enter_frame_flags; +} ompt_frame_t; + +typedef void (*ompt_callback_t) (void); + +typedef void ompt_device_t; + +typedef void ompt_buffer_t; + +typedef void (*ompt_callback_buffer_request_t) ( + int device_num, + ompt_buffer_t **buffer, + size_t *bytes +); + +typedef void (*ompt_callback_buffer_complete_t) ( + int device_num, + ompt_buffer_t *buffer, + size_t bytes, + ompt_buffer_cursor_t begin, + int buffer_owned +); + +typedef void (*ompt_finalize_t) ( + ompt_data_t *tool_data +); + +typedef int (*ompt_initialize_t) ( + ompt_function_lookup_t lookup, + int initial_device_num, + ompt_data_t *tool_data +); + +typedef struct ompt_start_tool_result_t { + ompt_initialize_t initialize; + ompt_finalize_t finalize; + ompt_data_t tool_data; +} ompt_start_tool_result_t; + +typedef struct ompt_record_abstract_t { + ompt_record_native_t rclass; + const char *type; + ompt_device_time_t start_time; + ompt_device_time_t end_time; + ompt_hwid_t hwid; +} ompt_record_abstract_t; + +typedef struct ompt_dependence_t { + ompt_data_t variable; + ompt_dependence_type_t dependence_type; +} ompt_dependence_t; + +typedef struct ompt_dispatch_chunk_t { + uint64_t start; + uint64_t iterations; +} ompt_dispatch_chunk_t; + +typedef int (*ompt_enumerate_states_t) ( + int current_state, + int *next_state, + const char **next_state_name +); + +typedef int (*ompt_enumerate_mutex_impls_t) ( + int current_impl, + int *next_impl, + const char **next_impl_name +); + +typedef ompt_set_result_t (*ompt_set_callback_t) ( + ompt_callbacks_t event, + ompt_callback_t callback +); + +typedef int (*ompt_get_callback_t) ( + ompt_callbacks_t event, + ompt_callback_t *callback +); + +typedef ompt_data_t *(*ompt_get_thread_data_t) (void); + +typedef int (*ompt_get_num_procs_t) (void); + +typedef int (*ompt_get_num_places_t) (void); + +typedef int (*ompt_get_place_proc_ids_t) ( + int place_num, + int ids_size, + int *ids +); + +typedef int (*ompt_get_place_num_t) (void); + +typedef int (*ompt_get_partition_place_nums_t) ( + int place_nums_size, + int *place_nums +); + +typedef int (*ompt_get_proc_id_t) (void); + +typedef int (*ompt_get_state_t) ( + ompt_wait_id_t *wait_id +); + +typedef int (*ompt_get_parallel_info_t) ( + int ancestor_level, + ompt_data_t **parallel_data, + int *team_size +); + +typedef int (*ompt_get_task_info_t) ( + int ancestor_level, + int *flags, + ompt_data_t **task_data, + ompt_frame_t **task_frame, + ompt_data_t **parallel_data, + int *thread_num +); + +typedef int (*ompt_get_task_memory_t)( + void **addr, + size_t *size, + int block +); + +typedef int (*ompt_get_target_info_t) ( + uint64_t *device_num, + ompt_id_t *target_id, + ompt_id_t *host_op_id +); + +typedef int (*ompt_get_num_devices_t) (void); + +typedef void (*ompt_finalize_tool_t) (void); + +typedef int (*ompt_get_device_num_procs_t) ( + ompt_device_t *device +); + +typedef ompt_device_time_t (*ompt_get_device_time_t) ( + ompt_device_t *device +); + +typedef double (*ompt_translate_time_t) ( + ompt_device_t *device, + ompt_device_time_t time +); + +typedef ompt_set_result_t (*ompt_set_trace_ompt_t) ( + ompt_device_t *device, + unsigned int enable, + unsigned int etype +); + +typedef ompt_set_result_t (*ompt_set_trace_native_t) ( + ompt_device_t *device, + 
int enable, + int flags +); + +typedef int (*ompt_start_trace_t) ( + ompt_device_t *device, + ompt_callback_buffer_request_t request, + ompt_callback_buffer_complete_t complete +); + +typedef int (*ompt_pause_trace_t) ( + ompt_device_t *device, + int begin_pause +); + +typedef int (*ompt_flush_trace_t) ( + ompt_device_t *device +); + +typedef int (*ompt_stop_trace_t) ( + ompt_device_t *device +); + +typedef int (*ompt_advance_buffer_cursor_t) ( + ompt_device_t *device, + ompt_buffer_t *buffer, + size_t size, + ompt_buffer_cursor_t current, + ompt_buffer_cursor_t *next +); + +typedef ompt_record_t (*ompt_get_record_type_t) ( + ompt_buffer_t *buffer, + ompt_buffer_cursor_t current +); + +typedef void *(*ompt_get_record_native_t) ( + ompt_buffer_t *buffer, + ompt_buffer_cursor_t current, + ompt_id_t *host_op_id +); + +typedef ompt_record_abstract_t * +(*ompt_get_record_abstract_t) ( + void *native_record +); + +typedef void (*ompt_callback_thread_begin_t) ( + ompt_thread_t thread_type, + ompt_data_t *thread_data +); + +typedef struct ompt_record_thread_begin_t { + ompt_thread_t thread_type; +} ompt_record_thread_begin_t; + +typedef void (*ompt_callback_thread_end_t) ( + ompt_data_t *thread_data +); + +typedef void (*ompt_callback_parallel_begin_t) ( + ompt_data_t *encountering_task_data, + const ompt_frame_t *encountering_task_frame, + ompt_data_t *parallel_data, + unsigned int requested_parallelism, + int flags, + const void *codeptr_ra +); + +typedef struct ompt_record_parallel_begin_t { + ompt_id_t encountering_task_id; + ompt_id_t parallel_id; + unsigned int requested_parallelism; + int flags; + const void *codeptr_ra; +} ompt_record_parallel_begin_t; + +typedef void (*ompt_callback_parallel_end_t) ( + ompt_data_t *parallel_data, + ompt_data_t *encountering_task_data, + int flags, + const void *codeptr_ra +); + +typedef struct ompt_record_parallel_end_t { + ompt_id_t parallel_id; + ompt_id_t encountering_task_id; + int flags; + const void *codeptr_ra; +} ompt_record_parallel_end_t; + +typedef void (*ompt_callback_work_t) ( + ompt_work_t work_type, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + uint64_t count, + const void *codeptr_ra +); + +typedef struct ompt_record_work_t { + ompt_work_t work_type; + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + uint64_t count; + const void *codeptr_ra; +} ompt_record_work_t; + +typedef void (*ompt_callback_dispatch_t) ( + ompt_data_t *parallel_data, + ompt_data_t *task_data, + ompt_dispatch_t kind, + ompt_data_t instance +); + +typedef struct ompt_record_dispatch_t { + ompt_id_t parallel_id; + ompt_id_t task_id; + ompt_dispatch_t kind; + ompt_data_t instance; +} ompt_record_dispatch_t; + +typedef void (*ompt_callback_task_create_t) ( + ompt_data_t *encountering_task_data, + const ompt_frame_t *encountering_task_frame, + ompt_data_t *new_task_data, + int flags, + int has_dependences, + const void *codeptr_ra +); + +typedef struct ompt_record_task_create_t { + ompt_id_t encountering_task_id; + ompt_id_t new_task_id; + int flags; + int has_dependences; + const void *codeptr_ra; +} ompt_record_task_create_t; + +typedef void (*ompt_callback_dependences_t) ( + ompt_data_t *task_data, + const ompt_dependence_t *deps, + int ndeps +); + +typedef struct ompt_record_dependences_t { + ompt_id_t task_id; + ompt_dependence_t dep; + int ndeps; +} ompt_record_dependences_t; + +typedef void (*ompt_callback_task_dependence_t) ( + ompt_data_t *src_task_data, + ompt_data_t *sink_task_data +); + 
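Editorial aside, not part of omp-tools.h: the typedefs above are the surface a first-party OMPT tool implements. The sketch below is illustrative only; the symbol ompt_start_tool, the lookup string "ompt_set_callback", and the callback enum values come from the OMPT interface declared in this header, while the include path and everything named tool_* or on_* are assumptions made for the example.

#include <stdio.h>
#include "third_party/openmp/omp-tools.h"

static ompt_set_callback_t set_callback;

// Runs whenever the runtime starts an OpenMP thread.
static void on_thread_begin(ompt_thread_t thread_type, ompt_data_t *thread_data) {
  fprintf(stderr, "ompt: thread of type %d began\n", (int)thread_type);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
  if (!set_callback)
    return 0;  // returning 0 deactivates the tool
  set_callback(ompt_callback_thread_begin, (ompt_callback_t)on_thread_begin);
  return 1;    // keep the tool active
}

static void tool_finalize(ompt_data_t *tool_data) {}

// The runtime looks this symbol up during startup to activate the tool.
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {tool_initialize, tool_finalize, {0}};
  return &result;
}

Such a tool is either linked into the application or loaded via the OMP_TOOL_LIBRARIES environment variable; returning NULL from ompt_start_tool leaves the runtime uninstrumented.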
+typedef struct ompt_record_task_dependence_t { + ompt_id_t src_task_id; + ompt_id_t sink_task_id; +} ompt_record_task_dependence_t; + +typedef void (*ompt_callback_task_schedule_t) ( + ompt_data_t *prior_task_data, + ompt_task_status_t prior_task_status, + ompt_data_t *next_task_data +); + +typedef struct ompt_record_task_schedule_t { + ompt_id_t prior_task_id; + ompt_task_status_t prior_task_status; + ompt_id_t next_task_id; +} ompt_record_task_schedule_t; + +typedef void (*ompt_callback_implicit_task_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + unsigned int actual_parallelism, + unsigned int index, + int flags +); + +typedef struct ompt_record_implicit_task_t { + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + unsigned int actual_parallelism; + unsigned int index; + int flags; +} ompt_record_implicit_task_t; + +typedef void (*ompt_callback_masked_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra +); + +typedef ompt_callback_masked_t ompt_callback_master_t DEPRECATED_51; + +typedef struct ompt_record_masked_t { + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + const void *codeptr_ra; +} ompt_record_masked_t; + +typedef void (*ompt_callback_sync_region_t) ( + ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra +); + +typedef struct ompt_record_sync_region_t { + ompt_sync_region_t kind; + ompt_scope_endpoint_t endpoint; + ompt_id_t parallel_id; + ompt_id_t task_id; + const void *codeptr_ra; +} ompt_record_sync_region_t; + +typedef void (*ompt_callback_mutex_acquire_t) ( + ompt_mutex_t kind, + unsigned int hint, + unsigned int impl, + ompt_wait_id_t wait_id, + const void *codeptr_ra +); + +typedef struct ompt_record_mutex_acquire_t { + ompt_mutex_t kind; + unsigned int hint; + unsigned int impl; + ompt_wait_id_t wait_id; + const void *codeptr_ra; +} ompt_record_mutex_acquire_t; + +typedef void (*ompt_callback_mutex_t) ( + ompt_mutex_t kind, + ompt_wait_id_t wait_id, + const void *codeptr_ra +); + +typedef struct ompt_record_mutex_t { + ompt_mutex_t kind; + ompt_wait_id_t wait_id; + const void *codeptr_ra; +} ompt_record_mutex_t; + +typedef void (*ompt_callback_nest_lock_t) ( + ompt_scope_endpoint_t endpoint, + ompt_wait_id_t wait_id, + const void *codeptr_ra +); + +typedef struct ompt_record_nest_lock_t { + ompt_scope_endpoint_t endpoint; + ompt_wait_id_t wait_id; + const void *codeptr_ra; +} ompt_record_nest_lock_t; + +typedef void (*ompt_callback_flush_t) ( + ompt_data_t *thread_data, + const void *codeptr_ra +); + +typedef struct ompt_record_flush_t { + const void *codeptr_ra; +} ompt_record_flush_t; + +typedef void (*ompt_callback_cancel_t) ( + ompt_data_t *task_data, + int flags, + const void *codeptr_ra +); + +typedef struct ompt_record_cancel_t { + ompt_id_t task_id; + int flags; + const void *codeptr_ra; +} ompt_record_cancel_t; + +typedef void (*ompt_callback_device_initialize_t) ( + int device_num, + const char *type, + ompt_device_t *device, + ompt_function_lookup_t lookup, + const char *documentation +); + +typedef void (*ompt_callback_device_finalize_t) ( + int device_num +); + +typedef void (*ompt_callback_device_load_t) ( + int device_num, + const char *filename, + int64_t offset_in_file, + void *vma_in_file, + size_t bytes, + void *host_addr, + void *device_addr, + uint64_t module_id +); + +typedef void 
(*ompt_callback_device_unload_t) ( + int device_num, + uint64_t module_id +); + +typedef void (*ompt_callback_target_data_op_emi_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *target_task_data, + ompt_data_t *target_data, + ompt_id_t *host_op_id, + ompt_target_data_op_t optype, + void *src_addr, + int src_device_num, + void *dest_addr, + int dest_device_num, + size_t bytes, + const void *codeptr_ra +); + +typedef void (*ompt_callback_target_data_op_t) ( + ompt_id_t target_id, + ompt_id_t host_op_id, + ompt_target_data_op_t optype, + void *src_addr, + int src_device_num, + void *dest_addr, + int dest_device_num, + size_t bytes, + const void *codeptr_ra +); + +typedef struct ompt_record_target_data_op_t { + ompt_id_t host_op_id; + ompt_target_data_op_t optype; + void *src_addr; + int src_device_num; + void *dest_addr; + int dest_device_num; + size_t bytes; + ompt_device_time_t end_time; + const void *codeptr_ra; +} ompt_record_target_data_op_t; + +typedef void (*ompt_callback_target_emi_t) ( + ompt_target_t kind, + ompt_scope_endpoint_t endpoint, + int device_num, + ompt_data_t *task_data, + ompt_data_t *target_task_data, + ompt_data_t *target_data, + const void *codeptr_ra +); + +typedef void (*ompt_callback_target_t) ( + ompt_target_t kind, + ompt_scope_endpoint_t endpoint, + int device_num, + ompt_data_t *task_data, + ompt_id_t target_id, + const void *codeptr_ra +); + +typedef struct ompt_record_target_t { + ompt_target_t kind; + ompt_scope_endpoint_t endpoint; + int device_num; + ompt_id_t task_id; + ompt_id_t target_id; + const void *codeptr_ra; +} ompt_record_target_t; + +typedef void (*ompt_callback_target_map_emi_t) ( + ompt_data_t *target_data, + unsigned int nitems, + void **host_addr, + void **device_addr, + size_t *bytes, + unsigned int *mapping_flags, + const void *codeptr_ra +); + +typedef void (*ompt_callback_target_map_t) ( + ompt_id_t target_id, + unsigned int nitems, + void **host_addr, + void **device_addr, + size_t *bytes, + unsigned int *mapping_flags, + const void *codeptr_ra +); + +typedef struct ompt_record_target_map_t { + ompt_id_t target_id; + unsigned int nitems; + void **host_addr; + void **device_addr; + size_t *bytes; + unsigned int *mapping_flags; + const void *codeptr_ra; +} ompt_record_target_map_t; + +typedef void (*ompt_callback_target_submit_emi_t) ( + ompt_scope_endpoint_t endpoint, + ompt_data_t *target_data, + ompt_id_t *host_op_id, + unsigned int requested_num_teams +); + +typedef void (*ompt_callback_target_submit_t) ( + ompt_id_t target_id, + ompt_id_t host_op_id, + unsigned int requested_num_teams +); + +typedef struct ompt_record_target_kernel_t { + ompt_id_t host_op_id; + unsigned int requested_num_teams; + unsigned int granted_num_teams; + ompt_device_time_t end_time; +} ompt_record_target_kernel_t; + +typedef int (*ompt_callback_control_tool_t) ( + uint64_t command, + uint64_t modifier, + void *arg, + const void *codeptr_ra +); + +typedef struct ompt_record_control_tool_t { + uint64_t command; + uint64_t modifier; + const void *codeptr_ra; +} ompt_record_control_tool_t; + +typedef void (*ompt_callback_error_t) ( + ompt_severity_t severity, + const char *message, size_t length, + const void *codeptr_ra +); + +typedef struct ompt_record_error_t { + ompt_severity_t severity; + const char *message; + size_t length; + const void *codeptr_ra; +} ompt_record_error_t; + +typedef struct ompd_address_t { + ompd_seg_t segment; + ompd_addr_t address; +} ompd_address_t; + +typedef struct ompd_frame_info_t { + ompd_address_t frame_address; + 
ompd_word_t frame_flag; +} ompd_frame_info_t; + +typedef struct _ompd_aspace_handle ompd_address_space_handle_t; +typedef struct _ompd_thread_handle ompd_thread_handle_t; +typedef struct _ompd_parallel_handle ompd_parallel_handle_t; +typedef struct _ompd_task_handle ompd_task_handle_t; + +typedef struct _ompd_aspace_cont ompd_address_space_context_t; +typedef struct _ompd_thread_cont ompd_thread_context_t; + +typedef struct ompd_device_type_sizes_t { + uint8_t sizeof_char; + uint8_t sizeof_short; + uint8_t sizeof_int; + uint8_t sizeof_long; + uint8_t sizeof_long_long; + uint8_t sizeof_pointer; +} ompd_device_type_sizes_t; + +void ompd_dll_locations_valid(void); + +typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)(ompd_size_t nbytes, + void **ptr); + +typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)(void *ptr); + +typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)( + ompd_address_space_context_t *address_space_context, ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_context_t **thread_context); + +typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)( + ompd_address_space_context_t *address_space_context, + ompd_device_type_sizes_t *sizes); + +typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)( + ompd_address_space_context_t *address_space_context, + ompd_thread_context_t *thread_context, const char *symbol_name, + ompd_address_t *symbol_addr, const char *file_name); + +typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)( + ompd_address_space_context_t *address_space_context, + ompd_thread_context_t *thread_context, const ompd_address_t *addr, + ompd_size_t nbytes, void *buffer); + +typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)( + ompd_address_space_context_t *address_space_context, + ompd_thread_context_t *thread_context, const ompd_address_t *addr, + ompd_size_t nbytes, const void *buffer); + +typedef ompd_rc_t (*ompd_callback_device_host_fn_t)( + ompd_address_space_context_t *address_space_context, const void *input, + ompd_size_t unit_size, ompd_size_t count, void *output); + +typedef ompd_rc_t (*ompd_callback_print_string_fn_t)(const char *string, + int category); + +typedef struct ompd_callbacks_t { + ompd_callback_memory_alloc_fn_t alloc_memory; + ompd_callback_memory_free_fn_t free_memory; + ompd_callback_print_string_fn_t print_string; + ompd_callback_sizeof_fn_t sizeof_type; + ompd_callback_symbol_addr_fn_t symbol_addr_lookup; + ompd_callback_memory_read_fn_t read_memory; + ompd_callback_memory_write_fn_t write_memory; + ompd_callback_memory_read_fn_t read_string; + ompd_callback_device_host_fn_t device_to_host; + ompd_callback_device_host_fn_t host_to_device; + ompd_callback_get_thread_context_for_thread_id_fn_t + get_thread_context_for_thread_id; +} ompd_callbacks_t; + +void ompd_bp_parallel_begin(void); + +void ompd_bp_parallel_end(void); + +void ompd_bp_task_begin(void); + +void ompd_bp_task_end(void); + +void ompd_bp_thread_begin(void); + +void ompd_bp_thread_end(void); + +void ompd_bp_device_begin(void); + +void ompd_bp_device_end(void); + +ompd_rc_t ompd_initialize(ompd_word_t api_version, + const ompd_callbacks_t *callbacks); + +ompd_rc_t ompd_get_api_version(ompd_word_t *version); + +ompd_rc_t ompd_get_version_string(const char **string); + +ompd_rc_t ompd_finalize(void); + +ompd_rc_t ompd_process_initialize(ompd_address_space_context_t *context, + ompd_address_space_handle_t **handle); + +ompd_rc_t ompd_device_initialize(ompd_address_space_handle_t *process_handle, + ompd_address_space_context_t 
*device_context, + ompd_device_t kind, ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle); + +ompd_rc_t ompd_rel_address_space_handle(ompd_address_space_handle_t *handle); + +ompd_rc_t ompd_get_omp_version(ompd_address_space_handle_t *address_space, + ompd_word_t *omp_version); + +ompd_rc_t +ompd_get_omp_version_string(ompd_address_space_handle_t *address_space, + const char **string); + +ompd_rc_t ompd_get_thread_in_parallel(ompd_parallel_handle_t *parallel_handle, + int thread_num, + ompd_thread_handle_t **thread_handle); + +ompd_rc_t ompd_get_thread_handle(ompd_address_space_handle_t *handle, + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, + const void *thread_id, + ompd_thread_handle_t **thread_handle); + +ompd_rc_t ompd_rel_thread_handle(ompd_thread_handle_t *thread_handle); + +ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, + ompd_thread_handle_t *thread_handle_2, + int *cmp_value); + +ompd_rc_t ompd_get_thread_id(ompd_thread_handle_t *thread_handle, + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, void *thread_id); + +ompd_rc_t +ompd_get_curr_parallel_handle(ompd_thread_handle_t *thread_handle, + ompd_parallel_handle_t **parallel_handle); + +ompd_rc_t ompd_get_enclosing_parallel_handle( + ompd_parallel_handle_t *parallel_handle, + ompd_parallel_handle_t **enclosing_parallel_handle); + +ompd_rc_t +ompd_get_task_parallel_handle(ompd_task_handle_t *task_handle, + ompd_parallel_handle_t **task_parallel_handle); + +ompd_rc_t ompd_rel_parallel_handle(ompd_parallel_handle_t *parallel_handle); + +ompd_rc_t +ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value); + +ompd_rc_t ompd_get_curr_task_handle(ompd_thread_handle_t *thread_handle, + ompd_task_handle_t **task_handle); + +ompd_rc_t +ompd_get_generating_task_handle(ompd_task_handle_t *task_handle, + ompd_task_handle_t **generating_task_handle); + +ompd_rc_t +ompd_get_scheduling_task_handle(ompd_task_handle_t *task_handle, + ompd_task_handle_t **scheduling_task_handle); + +ompd_rc_t ompd_get_task_in_parallel(ompd_parallel_handle_t *parallel_handle, + int thread_num, + ompd_task_handle_t **task_handle); + +ompd_rc_t ompd_rel_task_handle(ompd_task_handle_t *task_handle); + +ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value); + +ompd_rc_t ompd_get_task_function(ompd_task_handle_t *task_handle, + ompd_address_t *entry_point); + +ompd_rc_t ompd_get_task_frame(ompd_task_handle_t *task_handle, + ompd_frame_info_t *exit_frame, + ompd_frame_info_t *enter_frame); + +ompd_rc_t +ompd_enumerate_states(ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, ompd_word_t *next_state, + const char **next_state_name, ompd_word_t *more_enums); + +ompd_rc_t ompd_get_state(ompd_thread_handle_t *thread_handle, + ompd_word_t *state, ompd_wait_id_t *wait_id); + +ompd_rc_t +ompd_get_display_control_vars(ompd_address_space_handle_t *address_space_handle, + const char *const **control_vars); + +ompd_rc_t ompd_rel_display_control_vars(const char *const **control_vars); + +ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, + ompd_icv_id_t current, ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, int *more); + +ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, ompd_word_t *icv_value); + +ompd_rc_t 
ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, + const char **icv_string); + +ompd_rc_t ompd_get_tool_data(void *handle, ompd_scope_t scope, + ompd_word_t *value, ompd_address_t *ptr); + +typedef struct ompt_record_ompt_t { + ompt_callbacks_t type; + ompt_device_time_t time; + ompt_id_t thread_id; + ompt_id_t target_id; + union { + ompt_record_thread_begin_t thread_begin; + ompt_record_parallel_begin_t parallel_begin; + ompt_record_parallel_end_t parallel_end; + ompt_record_work_t work; + ompt_record_dispatch_t dispatch; + ompt_record_task_create_t task_create; + ompt_record_dependences_t dependences; + ompt_record_task_dependence_t task_dependence; + ompt_record_task_schedule_t task_schedule; + ompt_record_implicit_task_t implicit_task; + ompt_record_masked_t masked; + ompt_record_sync_region_t sync_region; + ompt_record_mutex_acquire_t mutex_acquire; + ompt_record_mutex_t mutex; + ompt_record_nest_lock_t nest_lock; + ompt_record_flush_t flush; + ompt_record_cancel_t cancel; + ompt_record_target_t target; + ompt_record_target_data_op_t target_data_op; + ompt_record_target_map_t target_map; + ompt_record_target_kernel_t target_kernel; + ompt_record_control_tool_t control_tool; + } record; +} ompt_record_ompt_t; + +typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) ( + ompt_buffer_t *buffer, + ompt_buffer_cursor_t current +); + +#define ompt_id_none 0 +#define ompt_data_none {0} +#define ompt_time_none 0 +#define ompt_hwid_none 0 +#define ompt_addr_none ~0 +#define ompt_mutex_impl_none 0 +#define ompt_wait_id_none 0 + +#define ompd_segment_none 0 + +#endif /* __OMPT__ */ diff --git a/third_party/openmp/omp.h b/third_party/openmp/omp.h new file mode 100644 index 000000000..5821bf10a --- /dev/null +++ b/third_party/openmp/omp.h @@ -0,0 +1,521 @@ +/* + * include/omp.h.var + */ + + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +#ifndef __OMP_H +# define __OMP_H + +# include <stddef.h> +# include <stdlib.h> +# include <stdint.h> + +# define KMP_VERSION_MAJOR 5 +# define KMP_VERSION_MINOR 0 +# define KMP_VERSION_BUILD 31337 +# define KMP_BUILD_DATE "2024-01-28" + +# ifdef __cplusplus + extern "C" { +# endif + +# define omp_set_affinity_format ompc_set_affinity_format +# define omp_get_affinity_format ompc_get_affinity_format +# define omp_display_affinity ompc_display_affinity +# define omp_capture_affinity ompc_capture_affinity + +# if defined(_WIN32) +# define __KAI_KMPC_CONVENTION __cdecl +# ifndef __KMP_IMP +# define __KMP_IMP __declspec(dllimport) +# endif +# else +# define __KAI_KMPC_CONVENTION +# ifndef __KMP_IMP +# define __KMP_IMP +# endif +# endif + + /* schedule kind constants */ + typedef enum omp_sched_t { + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4, + omp_sched_monotonic = 0x80000000 + } omp_sched_t; + + /* set API functions */ + extern void __KAI_KMPC_CONVENTION omp_set_num_threads (int); + extern void __KAI_KMPC_CONVENTION omp_set_dynamic (int); + extern void __KAI_KMPC_CONVENTION omp_set_nested (int); + extern void __KAI_KMPC_CONVENTION omp_set_max_active_levels (int); + extern void __KAI_KMPC_CONVENTION omp_set_schedule (omp_sched_t, int); + + /* query API functions */ + extern int __KAI_KMPC_CONVENTION omp_get_num_threads (void); + extern int __KAI_KMPC_CONVENTION omp_get_dynamic (void); + extern int __KAI_KMPC_CONVENTION omp_get_nested (void); + extern int __KAI_KMPC_CONVENTION omp_get_max_threads (void); + extern int __KAI_KMPC_CONVENTION omp_get_thread_num (void); + extern int __KAI_KMPC_CONVENTION omp_get_num_procs (void); + extern int __KAI_KMPC_CONVENTION omp_in_parallel (void); + extern int __KAI_KMPC_CONVENTION omp_in_final (void); + extern int __KAI_KMPC_CONVENTION omp_get_active_level (void); + extern int __KAI_KMPC_CONVENTION omp_get_level (void); + extern int __KAI_KMPC_CONVENTION omp_get_ancestor_thread_num (int); + extern int __KAI_KMPC_CONVENTION omp_get_team_size (int); + extern int __KAI_KMPC_CONVENTION omp_get_thread_limit (void); + extern int __KAI_KMPC_CONVENTION omp_get_max_active_levels (void); + extern void __KAI_KMPC_CONVENTION omp_get_schedule (omp_sched_t *, int *); + extern int __KAI_KMPC_CONVENTION omp_get_max_task_priority (void); + + /* lock API functions */ + typedef struct omp_lock_t { + void * _lk; + } omp_lock_t; + + extern void __KAI_KMPC_CONVENTION omp_init_lock (omp_lock_t *); + extern void __KAI_KMPC_CONVENTION omp_set_lock (omp_lock_t *); + extern void __KAI_KMPC_CONVENTION omp_unset_lock (omp_lock_t *); + extern void __KAI_KMPC_CONVENTION omp_destroy_lock (omp_lock_t *); + extern int __KAI_KMPC_CONVENTION omp_test_lock (omp_lock_t *); + + /* nested lock API functions */ + typedef struct omp_nest_lock_t { + void * _lk; + } omp_nest_lock_t; + + extern void __KAI_KMPC_CONVENTION omp_init_nest_lock (omp_nest_lock_t *); + extern void __KAI_KMPC_CONVENTION omp_set_nest_lock (omp_nest_lock_t *); + extern void __KAI_KMPC_CONVENTION omp_unset_nest_lock (omp_nest_lock_t *); + extern void __KAI_KMPC_CONVENTION omp_destroy_nest_lock (omp_nest_lock_t *); + extern int __KAI_KMPC_CONVENTION omp_test_nest_lock (omp_nest_lock_t *); + + /* OpenMP 5.0 Synchronization hints*/ + typedef enum omp_sync_hint_t { + omp_sync_hint_none = 0, + omp_lock_hint_none = omp_sync_hint_none, + omp_sync_hint_uncontended = 1, + 
omp_lock_hint_uncontended = omp_sync_hint_uncontended, + omp_sync_hint_contended = (1<<1), + omp_lock_hint_contended = omp_sync_hint_contended, + omp_sync_hint_nonspeculative = (1<<2), + omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative, + omp_sync_hint_speculative = (1<<3), + omp_lock_hint_speculative = omp_sync_hint_speculative, + kmp_lock_hint_hle = (1<<16), + kmp_lock_hint_rtm = (1<<17), + kmp_lock_hint_adaptive = (1<<18) + } omp_sync_hint_t; + + /* lock hint type for dynamic user lock */ + typedef omp_sync_hint_t omp_lock_hint_t; + + /* hinted lock initializers */ + extern void __KAI_KMPC_CONVENTION omp_init_lock_with_hint(omp_lock_t *, omp_lock_hint_t); + extern void __KAI_KMPC_CONVENTION omp_init_nest_lock_with_hint(omp_nest_lock_t *, omp_lock_hint_t); + + /* time API functions */ + extern double __KAI_KMPC_CONVENTION omp_get_wtime (void); + extern double __KAI_KMPC_CONVENTION omp_get_wtick (void); + + /* OpenMP 4.0 */ + extern int __KAI_KMPC_CONVENTION omp_get_default_device (void); + extern void __KAI_KMPC_CONVENTION omp_set_default_device (int); + extern int __KAI_KMPC_CONVENTION omp_is_initial_device (void); + extern int __KAI_KMPC_CONVENTION omp_get_num_devices (void); + extern int __KAI_KMPC_CONVENTION omp_get_num_teams (void); + extern int __KAI_KMPC_CONVENTION omp_get_team_num (void); + extern int __KAI_KMPC_CONVENTION omp_get_cancellation (void); + + /* OpenMP 4.5 */ + extern int __KAI_KMPC_CONVENTION omp_get_initial_device (void); + extern void* __KAI_KMPC_CONVENTION omp_target_alloc(size_t, int); + extern void __KAI_KMPC_CONVENTION omp_target_free(void *, int); + extern int __KAI_KMPC_CONVENTION omp_target_is_present(const void *, int); + extern int __KAI_KMPC_CONVENTION omp_target_memcpy(void *, const void *, size_t, size_t, size_t, int, int); + extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect(void *, const void *, size_t, int, const size_t *, + const size_t *, const size_t *, const size_t *, const size_t *, int, int); + extern int __KAI_KMPC_CONVENTION omp_target_associate_ptr(const void *, const void *, size_t, size_t, int); + extern int __KAI_KMPC_CONVENTION omp_target_disassociate_ptr(const void *, int); + + /* OpenMP 5.0 */ + extern int __KAI_KMPC_CONVENTION omp_get_device_num (void); + typedef void * omp_depend_t; + + /* OpenMP 5.1 interop */ + typedef intptr_t omp_intptr_t; + + /* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined properties */ + typedef enum omp_interop_property { + omp_ipr_fr_id = -1, + omp_ipr_fr_name = -2, + omp_ipr_vendor = -3, + omp_ipr_vendor_name = -4, + omp_ipr_device_num = -5, + omp_ipr_platform = -6, + omp_ipr_device = -7, + omp_ipr_device_context = -8, + omp_ipr_targetsync = -9, + omp_ipr_first = -9 + } omp_interop_property_t; + + #define omp_interop_none 0 + + typedef enum omp_interop_rc { + omp_irc_no_value = 1, + omp_irc_success = 0, + omp_irc_empty = -1, + omp_irc_out_of_range = -2, + omp_irc_type_int = -3, + omp_irc_type_ptr = -4, + omp_irc_type_str = -5, + omp_irc_other = -6 + } omp_interop_rc_t; + + typedef enum omp_interop_fr { + omp_ifr_cuda = 1, + omp_ifr_cuda_driver = 2, + omp_ifr_opencl = 3, + omp_ifr_sycl = 4, + omp_ifr_hip = 5, + omp_ifr_level_zero = 6, + omp_ifr_last = 7 + } omp_interop_fr_t; + + typedef void * omp_interop_t; + + /*! + * The `omp_get_num_interop_properties` routine retrieves the number of implementation-defined properties available for an `omp_interop_t` object. + */ + extern int __KAI_KMPC_CONVENTION omp_get_num_interop_properties(const omp_interop_t); + /*! 
+ * The `omp_get_interop_int` routine retrieves an integer property from an `omp_interop_t` object. + */ + extern omp_intptr_t __KAI_KMPC_CONVENTION omp_get_interop_int(const omp_interop_t, omp_interop_property_t, int *); + /*! + * The `omp_get_interop_ptr` routine retrieves a pointer property from an `omp_interop_t` object. + */ + extern void * __KAI_KMPC_CONVENTION omp_get_interop_ptr(const omp_interop_t, omp_interop_property_t, int *); + /*! + * The `omp_get_interop_str` routine retrieves a string property from an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_str(const omp_interop_t, omp_interop_property_t, int *); + /*! + * The `omp_get_interop_name` routine retrieves a property name from an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_name(const omp_interop_t, omp_interop_property_t); + /*! + * The `omp_get_interop_type_desc` routine retrieves a description of the type of a property associated with an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_type_desc(const omp_interop_t, omp_interop_property_t); + /*! + * The `omp_get_interop_rc_desc` routine retrieves a description of the return code associated with an `omp_interop_t` object. + */ + extern const char * __KAI_KMPC_CONVENTION omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t); + + /* OpenMP 5.1 device memory routines */ + + /*! + * The `omp_target_memcpy_async` routine asynchronously performs a copy between any combination of host and device pointers. + */ + extern int __KAI_KMPC_CONVENTION omp_target_memcpy_async(void *, const void *, size_t, size_t, size_t, int, + int, int, omp_depend_t *); + /*! + * The `omp_target_memcpy_rect_async` routine asynchronously performs a copy between any combination of host and device pointers. + */ + extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *, + const size_t *, const size_t *, const size_t *, const size_t *, int, int, + int, omp_depend_t *); + + /* OpenMP 6.0 device memory routines */ + extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int); + extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *); + + /*! + * The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device. 
+ */ + extern void * __KAI_KMPC_CONVENTION omp_get_mapped_ptr(const void *, int); + extern int __KAI_KMPC_CONVENTION omp_target_is_accessible(const void *, size_t, int); + + /* kmp API functions */ + extern int __KAI_KMPC_CONVENTION kmp_get_stacksize (void); + extern void __KAI_KMPC_CONVENTION kmp_set_stacksize (int); + extern size_t __KAI_KMPC_CONVENTION kmp_get_stacksize_s (void); + extern void __KAI_KMPC_CONVENTION kmp_set_stacksize_s (size_t); + extern int __KAI_KMPC_CONVENTION kmp_get_blocktime (void); + extern int __KAI_KMPC_CONVENTION kmp_get_library (void); + extern void __KAI_KMPC_CONVENTION kmp_set_blocktime (int); + extern void __KAI_KMPC_CONVENTION kmp_set_library (int); + extern void __KAI_KMPC_CONVENTION kmp_set_library_serial (void); + extern void __KAI_KMPC_CONVENTION kmp_set_library_turnaround (void); + extern void __KAI_KMPC_CONVENTION kmp_set_library_throughput (void); + extern void __KAI_KMPC_CONVENTION kmp_set_defaults (char const *); + extern void __KAI_KMPC_CONVENTION kmp_set_disp_num_buffers (int); + + /* Intel affinity API */ + typedef void * kmp_affinity_mask_t; + + extern int __KAI_KMPC_CONVENTION kmp_set_affinity (kmp_affinity_mask_t *); + extern int __KAI_KMPC_CONVENTION kmp_get_affinity (kmp_affinity_mask_t *); + extern int __KAI_KMPC_CONVENTION kmp_get_affinity_max_proc (void); + extern void __KAI_KMPC_CONVENTION kmp_create_affinity_mask (kmp_affinity_mask_t *); + extern void __KAI_KMPC_CONVENTION kmp_destroy_affinity_mask (kmp_affinity_mask_t *); + extern int __KAI_KMPC_CONVENTION kmp_set_affinity_mask_proc (int, kmp_affinity_mask_t *); + extern int __KAI_KMPC_CONVENTION kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *); + extern int __KAI_KMPC_CONVENTION kmp_get_affinity_mask_proc (int, kmp_affinity_mask_t *); + + /* OpenMP 4.0 affinity API */ + typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 + } omp_proc_bind_t; + + extern omp_proc_bind_t __KAI_KMPC_CONVENTION omp_get_proc_bind (void); + + /* OpenMP 4.5 affinity API */ + extern int __KAI_KMPC_CONVENTION omp_get_num_places (void); + extern int __KAI_KMPC_CONVENTION omp_get_place_num_procs (int); + extern void __KAI_KMPC_CONVENTION omp_get_place_proc_ids (int, int *); + extern int __KAI_KMPC_CONVENTION omp_get_place_num (void); + extern int __KAI_KMPC_CONVENTION omp_get_partition_num_places (void); + extern void __KAI_KMPC_CONVENTION omp_get_partition_place_nums (int *); + + extern void * __KAI_KMPC_CONVENTION kmp_malloc (size_t); + extern void * __KAI_KMPC_CONVENTION kmp_aligned_malloc (size_t, size_t); + extern void * __KAI_KMPC_CONVENTION kmp_calloc (size_t, size_t); + extern void * __KAI_KMPC_CONVENTION kmp_realloc (void *, size_t); + extern void __KAI_KMPC_CONVENTION kmp_free (void *); + + extern void __KAI_KMPC_CONVENTION kmp_set_warnings_on(void); + extern void __KAI_KMPC_CONVENTION kmp_set_warnings_off(void); + + /* OpenMP 5.0 Tool Control */ + typedef enum omp_control_tool_result_t { + omp_control_tool_notool = -2, + omp_control_tool_nocallback = -1, + omp_control_tool_success = 0, + omp_control_tool_ignored = 1 + } omp_control_tool_result_t; + + typedef enum omp_control_tool_t { + omp_control_tool_start = 1, + omp_control_tool_pause = 2, + omp_control_tool_flush = 3, + omp_control_tool_end = 4 + } omp_control_tool_t; + + extern int __KAI_KMPC_CONVENTION omp_control_tool(int, int, void*); + + /* OpenMP 5.0 Memory Management */ + typedef uintptr_t omp_uintptr_t; + + typedef 
enum { + omp_atk_sync_hint = 1, + omp_atk_alignment = 2, + omp_atk_access = 3, + omp_atk_pool_size = 4, + omp_atk_fallback = 5, + omp_atk_fb_data = 6, + omp_atk_pinned = 7, + omp_atk_partition = 8 + } omp_alloctrait_key_t; + + typedef enum { + omp_atv_false = 0, + omp_atv_true = 1, + omp_atv_contended = 3, + omp_atv_uncontended = 4, + omp_atv_serialized = 5, + omp_atv_sequential = omp_atv_serialized, // (deprecated) + omp_atv_private = 6, + omp_atv_all = 7, + omp_atv_thread = 8, + omp_atv_pteam = 9, + omp_atv_cgroup = 10, + omp_atv_default_mem_fb = 11, + omp_atv_null_fb = 12, + omp_atv_abort_fb = 13, + omp_atv_allocator_fb = 14, + omp_atv_environment = 15, + omp_atv_nearest = 16, + omp_atv_blocked = 17, + omp_atv_interleaved = 18 + } omp_alloctrait_value_t; + #define omp_atv_default ((omp_uintptr_t)-1) + + typedef struct { + omp_alloctrait_key_t key; + omp_uintptr_t value; + } omp_alloctrait_t; + +# if defined(_WIN32) + // On Windows cl and icl do not support 64-bit enum, let's use integer then. + typedef omp_uintptr_t omp_allocator_handle_t; + extern __KMP_IMP omp_allocator_handle_t const omp_null_allocator; + extern __KMP_IMP omp_allocator_handle_t const omp_default_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const omp_large_cap_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const omp_const_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const omp_high_bw_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const omp_low_lat_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const omp_cgroup_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const omp_pteam_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const omp_thread_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_host_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc; + extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_device_mem_alloc; + + typedef omp_uintptr_t omp_memspace_handle_t; + extern __KMP_IMP omp_memspace_handle_t const omp_default_mem_space; + extern __KMP_IMP omp_memspace_handle_t const omp_large_cap_mem_space; + extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space; + extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space; + extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space; + extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space; + extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space; + extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space; +# else +# if __cplusplus >= 201103 + typedef enum omp_allocator_handle_t : omp_uintptr_t +# else + typedef enum omp_allocator_handle_t +# endif + { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + llvm_omp_target_host_mem_alloc = 100, + llvm_omp_target_shared_mem_alloc = 101, + llvm_omp_target_device_mem_alloc = 102, + KMP_ALLOCATOR_MAX_HANDLE = UINTPTR_MAX + } omp_allocator_handle_t; +# if __cplusplus >= 201103 + typedef enum omp_memspace_handle_t : omp_uintptr_t +# else + typedef enum omp_memspace_handle_t +# endif + { + omp_default_mem_space = 0, + omp_large_cap_mem_space = 1, + omp_const_mem_space = 2, + omp_high_bw_mem_space = 3, + omp_low_lat_mem_space = 4, + llvm_omp_target_host_mem_space = 100, + llvm_omp_target_shared_mem_space = 101, + 
llvm_omp_target_device_mem_space = 102, + KMP_MEMSPACE_MAX_HANDLE = UINTPTR_MAX + } omp_memspace_handle_t; +# endif + extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_init_allocator(omp_memspace_handle_t m, + int ntraits, omp_alloctrait_t traits[]); + extern void __KAI_KMPC_CONVENTION omp_destroy_allocator(omp_allocator_handle_t allocator); + + extern void __KAI_KMPC_CONVENTION omp_set_default_allocator(omp_allocator_handle_t a); + extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_get_default_allocator(void); +# ifdef __cplusplus + extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, + omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t nmemb, size_t size, + omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, + omp_allocator_handle_t allocator = omp_null_allocator, + omp_allocator_handle_t free_allocator = omp_null_allocator); + extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); +# else + extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, + omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_aligned_calloc(size_t align, size_t nmemb, size_t size, + omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, + omp_allocator_handle_t free_allocator); + extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); +# endif + + /* OpenMP 5.0 Affinity Format */ + extern void __KAI_KMPC_CONVENTION omp_set_affinity_format(char const *); + extern size_t __KAI_KMPC_CONVENTION omp_get_affinity_format(char *, size_t); + extern void __KAI_KMPC_CONVENTION omp_display_affinity(char const *); + extern size_t __KAI_KMPC_CONVENTION omp_capture_affinity(char *, size_t, char const *); + + /* OpenMP 5.0 events */ +# if defined(_WIN32) + // On Windows cl and icl do not support 64-bit enum, let's use integer then. 
+ typedef omp_uintptr_t omp_event_handle_t; +# else + typedef enum omp_event_handle_t { KMP_EVENT_MAX_HANDLE = UINTPTR_MAX } omp_event_handle_t; +# endif + extern void __KAI_KMPC_CONVENTION omp_fulfill_event ( omp_event_handle_t event ); + + /* OpenMP 5.0 Pause Resources */ + typedef enum omp_pause_resource_t { + omp_pause_resume = 0, + omp_pause_soft = 1, + omp_pause_hard = 2 + } omp_pause_resource_t; + extern int __KAI_KMPC_CONVENTION omp_pause_resource(omp_pause_resource_t, int); + extern int __KAI_KMPC_CONVENTION omp_pause_resource_all(omp_pause_resource_t); + + extern int __KAI_KMPC_CONVENTION omp_get_supported_active_levels(void); + + /* OpenMP 5.1 */ + extern void __KAI_KMPC_CONVENTION omp_set_num_teams(int num_teams); + extern int __KAI_KMPC_CONVENTION omp_get_max_teams(void); + extern void __KAI_KMPC_CONVENTION omp_set_teams_thread_limit(int limit); + extern int __KAI_KMPC_CONVENTION omp_get_teams_thread_limit(void); + + /* OpenMP 5.1 Display Environment */ + extern void omp_display_env(int verbose); + +# if defined(_OPENMP) && _OPENMP >= 201811 + #pragma omp begin declare variant match(device={kind(host)}) + static inline int omp_is_initial_device(void) { return 1; } + #pragma omp end declare variant + #pragma omp begin declare variant match(device={kind(nohost)}) + static inline int omp_is_initial_device(void) { return 0; } + #pragma omp end declare variant +# endif + + /* OpenMP 5.2 */ + extern int __KAI_KMPC_CONVENTION omp_in_explicit_task(void); + + /* LLVM Extensions */ + extern void *llvm_omp_target_dynamic_shared_alloc(void); + +# undef __KAI_KMPC_CONVENTION +# undef __KMP_IMP + + /* Warning: + The following typedefs are not standard, deprecated and will be removed in a future release. + */ + typedef int omp_int_t; + typedef double omp_wtime_t; + +# ifdef __cplusplus + } +# endif + +#endif /* __OMP_H */ diff --git a/third_party/openmp/ompd-specific.cpp b/third_party/openmp/ompd-specific.cpp new file mode 100644 index 000000000..c4018789e --- /dev/null +++ b/third_party/openmp/ompd-specific.cpp @@ -0,0 +1,154 @@ +/* + * ompd-specific.cpp -- OpenMP debug support + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ompd-specific.h" + +#if OMPD_SUPPORT + +/** + * Declaration of symbols to hold struct size and member offset information + */ + +#define ompd_declare_access(t, m) uint64_t ompd_access__##t##__##m; +OMPD_FOREACH_ACCESS(ompd_declare_access) +#undef ompd_declare_access + +#define ompd_declare_sizeof_member(t, m) uint64_t ompd_sizeof__##t##__##m; +OMPD_FOREACH_ACCESS(ompd_declare_sizeof_member) +#undef ompd_declare_sizeof_member + +#define ompd_declare_bitfield(t, m) uint64_t ompd_bitfield__##t##__##m; +OMPD_FOREACH_BITFIELD(ompd_declare_bitfield) +#undef ompd_declare_bitfield + +#define ompd_declare_sizeof(t) uint64_t ompd_sizeof__##t; +OMPD_FOREACH_SIZEOF(ompd_declare_sizeof) +#undef ompd_declare_sizeof + +volatile const char **ompd_dll_locations = NULL; +uint64_t ompd_state = 0; + +char *ompd_env_block = NULL; +ompd_size_t ompd_env_block_size = 0; + +void ompd_init() { + + static int ompd_initialized = 0; + + if (ompd_initialized) + return; + + /** + * Calculate member offsets for structs and unions + */ + +#define ompd_init_access(t, m) \ + ompd_access__##t##__##m = (uint64_t) & (((t *)0)->m); + OMPD_FOREACH_ACCESS(ompd_init_access) +#undef ompd_init_access + + /** + * Create bit mask for bitfield access + */ + +#define ompd_init_bitfield(t, m) \ + ompd_bitfield__##t##__##m = 0; \ + ((t *)(&ompd_bitfield__##t##__##m))->m = 1; + OMPD_FOREACH_BITFIELD(ompd_init_bitfield) +#undef ompd_init_bitfield + + /** + * Calculate type size information + */ + +#define ompd_init_sizeof_member(t, m) \ + ompd_sizeof__##t##__##m = sizeof(((t *)0)->m); + OMPD_FOREACH_ACCESS(ompd_init_sizeof_member) +#undef ompd_init_sizeof_member + +#define ompd_init_sizeof(t) ompd_sizeof__##t = sizeof(t); + OMPD_FOREACH_SIZEOF(ompd_init_sizeof) +#undef ompd_init_sizeof + + char *libname = NULL; + +#if KMP_OS_UNIX + // Find the location of libomp.so thru dladdr and replace the libomp with + // libompd to get the full path of libompd + Dl_info dl_info; + int ret = dladdr((void *)ompd_init, &dl_info); + if (!ret) { + fprintf(stderr, "%s\n", dlerror()); + } + int lib_path_length; + if (strrchr(dl_info.dli_fname, '/')) { + lib_path_length = strrchr(dl_info.dli_fname, '/') - dl_info.dli_fname; + libname = + (char *)malloc(lib_path_length + 12 /*for '/libompd.so' and '\0'*/); + strncpy(libname, dl_info.dli_fname, lib_path_length); + memcpy(libname + lib_path_length, "/libompd.so\0", 12); + } +#endif + + const char *ompd_env_var = getenv("OMP_DEBUG"); + if (ompd_env_var && !strcmp(ompd_env_var, "enabled")) { + fprintf(stderr, "OMP_OMPD active\n"); + ompt_enabled.enabled = 1; + ompd_state |= OMPD_ENABLE_BP; + } + + ompd_initialized = 1; + ompd_dll_locations = (volatile const char **)malloc(3 * sizeof(const char *)); + ompd_dll_locations[0] = "libompd.so"; + ompd_dll_locations[1] = libname; + ompd_dll_locations[2] = NULL; + ompd_dll_locations_valid(); +} + +void __attribute__((noinline)) ompd_dll_locations_valid(void) { + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm(""); +} + +void ompd_bp_parallel_begin(void) { + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm(""); +} +void ompd_bp_parallel_end(void) { + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? 
*/ + asm(""); +} +void ompd_bp_task_begin(void) { + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm(""); +} +void ompd_bp_task_end(void) { + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm(""); +} +void ompd_bp_thread_begin(void) { + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm(""); +} +void ompd_bp_thread_end(void) { + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm(""); +} + +#endif /* OMPD_SUPPORT */ diff --git a/third_party/openmp/ompd-specific.h b/third_party/openmp/ompd-specific.h new file mode 100644 index 000000000..21809ef52 --- /dev/null +++ b/third_party/openmp/ompd-specific.h @@ -0,0 +1,154 @@ +/* + * ompd-specific.h -- OpenMP debug support + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "omp-tools.h" +#include + +#ifndef __OMPD_SPECIFIC_H__ +#define __OMPD_SPECIFIC_H__ + +#if OMPD_SUPPORT + +void ompd_init(); + +#ifdef __cplusplus +extern "C" { +#endif +extern char *ompd_env_block; +extern ompd_size_t ompd_env_block_size; +extern char *__kmp_tool_verbose_init; +#ifdef __cplusplus +} /* extern "C" */ +#endif + +extern uint64_t ompd_state; +#define OMPD_ENABLE_BP 0x1 + +#define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ + OMPD_ACCESS(kmp_base_info_t, th_current_task) \ + OMPD_ACCESS(kmp_base_info_t, th_team) \ + OMPD_ACCESS(kmp_base_info_t, th_info) \ + OMPD_ACCESS(kmp_base_info_t, ompt_thread_info) \ + \ + OMPD_ACCESS(kmp_base_root_t, r_in_parallel) \ + \ + OMPD_ACCESS(kmp_base_team_t, ompt_team_info) \ + OMPD_ACCESS(kmp_base_team_t, ompt_serialized_team_info) \ + OMPD_ACCESS(kmp_base_team_t, t_active_level) \ + OMPD_ACCESS(kmp_base_team_t, t_implicit_task_taskdata) \ + OMPD_ACCESS(kmp_base_team_t, t_master_tid) \ + OMPD_ACCESS(kmp_base_team_t, t_nproc) \ + OMPD_ACCESS(kmp_base_team_t, t_level) \ + OMPD_ACCESS(kmp_base_team_t, t_parent) \ + OMPD_ACCESS(kmp_base_team_t, t_pkfn) \ + OMPD_ACCESS(kmp_base_team_t, t_threads) \ + \ + OMPD_ACCESS(kmp_desc_t, ds) \ + \ + OMPD_ACCESS(kmp_desc_base_t, ds_thread) \ + OMPD_ACCESS(kmp_desc_base_t, ds_tid) \ + \ + OMPD_ACCESS(kmp_info_t, th) \ + \ + OMPD_ACCESS(kmp_r_sched_t, r_sched_type) \ + OMPD_ACCESS(kmp_r_sched_t, chunk) \ + \ + OMPD_ACCESS(kmp_root_t, r) \ + \ + OMPD_ACCESS(kmp_internal_control_t, dynamic) \ + OMPD_ACCESS(kmp_internal_control_t, max_active_levels) \ + OMPD_ACCESS(kmp_internal_control_t, nproc) \ + OMPD_ACCESS(kmp_internal_control_t, proc_bind) \ + OMPD_ACCESS(kmp_internal_control_t, sched) \ + OMPD_ACCESS(kmp_internal_control_t, default_device) \ + OMPD_ACCESS(kmp_internal_control_t, thread_limit) \ + \ + OMPD_ACCESS(kmp_taskdata_t, ompt_task_info) \ + OMPD_ACCESS(kmp_taskdata_t, td_flags) \ + OMPD_ACCESS(kmp_taskdata_t, td_icvs) \ + OMPD_ACCESS(kmp_taskdata_t, td_parent) \ + OMPD_ACCESS(kmp_taskdata_t, td_team) \ + \ + OMPD_ACCESS(kmp_task_t, routine) \ + \ + OMPD_ACCESS(kmp_team_p, t) \ + \ + OMPD_ACCESS(kmp_nested_nthreads_t, used) \ + OMPD_ACCESS(kmp_nested_nthreads_t, nth) \ + \ + 
OMPD_ACCESS(kmp_nested_proc_bind_t, used) \ + OMPD_ACCESS(kmp_nested_proc_bind_t, bind_types) \ + \ + OMPD_ACCESS(ompt_task_info_t, frame) \ + OMPD_ACCESS(ompt_task_info_t, scheduling_parent) \ + OMPD_ACCESS(ompt_task_info_t, task_data) \ + \ + OMPD_ACCESS(ompt_team_info_t, parallel_data) \ + \ + OMPD_ACCESS(ompt_thread_info_t, state) \ + OMPD_ACCESS(ompt_thread_info_t, wait_id) \ + OMPD_ACCESS(ompt_thread_info_t, thread_data) \ + \ + OMPD_ACCESS(ompt_data_t, value) \ + OMPD_ACCESS(ompt_data_t, ptr) \ + \ + OMPD_ACCESS(ompt_frame_t, exit_frame) \ + OMPD_ACCESS(ompt_frame_t, enter_frame) \ + \ + OMPD_ACCESS(ompt_lw_taskteam_t, parent) \ + OMPD_ACCESS(ompt_lw_taskteam_t, ompt_team_info) \ + OMPD_ACCESS(ompt_lw_taskteam_t, ompt_task_info) + +#define OMPD_FOREACH_BITFIELD(OMPD_BITFIELD) \ + OMPD_BITFIELD(kmp_tasking_flags_t, final) \ + OMPD_BITFIELD(kmp_tasking_flags_t, tiedness) \ + OMPD_BITFIELD(kmp_tasking_flags_t, tasktype) \ + OMPD_BITFIELD(kmp_tasking_flags_t, task_serial) \ + OMPD_BITFIELD(kmp_tasking_flags_t, tasking_ser) \ + OMPD_BITFIELD(kmp_tasking_flags_t, team_serial) \ + OMPD_BITFIELD(kmp_tasking_flags_t, started) \ + OMPD_BITFIELD(kmp_tasking_flags_t, executing) \ + OMPD_BITFIELD(kmp_tasking_flags_t, complete) \ + OMPD_BITFIELD(kmp_tasking_flags_t, freed) \ + OMPD_BITFIELD(kmp_tasking_flags_t, native) + +#define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ + OMPD_SIZEOF(kmp_info_t) \ + OMPD_SIZEOF(kmp_taskdata_t) \ + OMPD_SIZEOF(kmp_task_t) \ + OMPD_SIZEOF(kmp_tasking_flags_t) \ + OMPD_SIZEOF(kmp_thread_t) \ + OMPD_SIZEOF(ompt_data_t) \ + OMPD_SIZEOF(ompt_id_t) \ + OMPD_SIZEOF(__kmp_avail_proc) \ + OMPD_SIZEOF(__kmp_max_nth) \ + OMPD_SIZEOF(__kmp_stksize) \ + OMPD_SIZEOF(__kmp_omp_cancellation) \ + OMPD_SIZEOF(__kmp_max_task_priority) \ + OMPD_SIZEOF(__kmp_display_affinity) \ + OMPD_SIZEOF(__kmp_affinity_format) \ + OMPD_SIZEOF(__kmp_tool_libraries) \ + OMPD_SIZEOF(__kmp_tool_verbose_init) \ + OMPD_SIZEOF(__kmp_tool) \ + OMPD_SIZEOF(ompd_state) \ + OMPD_SIZEOF(kmp_nested_nthreads_t) \ + OMPD_SIZEOF(__kmp_nested_nth) \ + OMPD_SIZEOF(kmp_nested_proc_bind_t) \ + OMPD_SIZEOF(__kmp_nested_proc_bind) \ + OMPD_SIZEOF(int) \ + OMPD_SIZEOF(char) \ + OMPD_SIZEOF(__kmp_gtid) \ + OMPD_SIZEOF(__kmp_nth) + +#endif /* OMPD_SUPPORT */ +#endif diff --git a/third_party/openmp/ompt-event-specific.h b/third_party/openmp/ompt-event-specific.h new file mode 100644 index 000000000..7736ba853 --- /dev/null +++ b/third_party/openmp/ompt-event-specific.h @@ -0,0 +1,110 @@ +/****************************************************************************** + * File: ompt-event-specific.h + * + * Description: + * + * specify which of the OMPT events are implemented by this runtime system + * and the level of their implementation by a runtime system. + *****************************************************************************/ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPT_EVENT_SPECIFIC_H__ +#define __OMPT_EVENT_SPECIFIC_H__ + +#define _ompt_tokenpaste_helper(x, y) x##y +#define _ompt_tokenpaste(x, y) _ompt_tokenpaste_helper(x, y) +#define ompt_event_implementation_status(e) _ompt_tokenpaste(e, _implemented) + +/*---------------------------------------------------------------------------- + | Specify whether an event may occur or not, and whether event callbacks + | never, sometimes, or always occur. + | + | The values for these constants are defined in section 6.1.2 of + | the OMPT TR. They are exposed to tools through ompt_set_callback. + +--------------------------------------------------------------------------*/ + +#define ompt_event_UNIMPLEMENTED ompt_set_never +#define ompt_event_MAY_CONVENIENT ompt_set_sometimes +#define ompt_event_MAY_ALWAYS ompt_set_always + +#if OMPT_OPTIONAL +#define ompt_event_MAY_ALWAYS_OPTIONAL ompt_event_MAY_ALWAYS +#else +#define ompt_event_MAY_ALWAYS_OPTIONAL ompt_event_UNIMPLEMENTED +#endif + +/*---------------------------------------------------------------------------- + | Mandatory Events + +--------------------------------------------------------------------------*/ + +#define ompt_callback_thread_begin_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_thread_end_implemented ompt_event_MAY_ALWAYS + +#define ompt_callback_parallel_begin_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_parallel_end_implemented ompt_event_MAY_ALWAYS + +#define ompt_callback_task_create_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_task_schedule_implemented ompt_event_MAY_ALWAYS + +#define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS + +#define ompt_callback_target_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_emi_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_data_op_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_data_op_emi_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_submit_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_submit_emi_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS + +#define ompt_callback_device_initialize_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_device_finalize_implemented ompt_event_MAY_ALWAYS + +#define ompt_callback_device_load_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_device_unload_implemented ompt_event_UNIMPLEMENTED + +/*---------------------------------------------------------------------------- + | Optional Events + +--------------------------------------------------------------------------*/ + +#define ompt_callback_sync_region_wait_implemented \ + ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_mutex_released_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_dependences_implemented ompt_event_MAY_ALWAYS_OPTIONAL +#define ompt_callback_task_dependence_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_work_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_masked_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_target_map_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_target_map_emi_implemented ompt_event_UNIMPLEMENTED + +#define ompt_callback_sync_region_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_lock_init_implemented 
ompt_event_MAY_ALWAYS_OPTIONAL +#define ompt_callback_lock_destroy_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_mutex_acquire_implemented ompt_event_MAY_ALWAYS_OPTIONAL +#define ompt_callback_mutex_acquired_implemented ompt_event_MAY_ALWAYS_OPTIONAL +#define ompt_callback_nest_lock_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_flush_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_dispatch_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#define ompt_callback_error_implemented ompt_event_MAY_ALWAYS_OPTIONAL + +#endif diff --git a/third_party/openmp/ompt-general.cpp b/third_party/openmp/ompt-general.cpp new file mode 100644 index 000000000..79db1c5d5 --- /dev/null +++ b/third_party/openmp/ompt-general.cpp @@ -0,0 +1,938 @@ +/* + * ompt-general.cpp -- OMPT implementation of interface functions + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/***************************************************************************** + * system include files + ****************************************************************************/ + +#include <assert.h> + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#if KMP_OS_UNIX +#include <dlfcn.h> +#endif + +// #define dlsym {{NONONONO}} + +/***************************************************************************** + * ompt include files + ****************************************************************************/ + +#include "ompt-specific.cpp" + +/***************************************************************************** + * macros + ****************************************************************************/ + +#define ompt_get_callback_success 1 +#define ompt_get_callback_failure 0 + +#define no_tool_present 0 + +#define OMPT_API_ROUTINE static + +#ifndef OMPT_STR_MATCH +#define OMPT_STR_MATCH(haystack, needle) (!strcasecmp(haystack, needle)) +#endif + +// prints for an enabled OMP_TOOL_VERBOSE_INIT. +// In the future a prefix could be added in the first define, the second define +// omits the prefix to allow for continued lines. Example: "PREFIX: Start +// tool... Success." instead of "PREFIX: Start tool... PREFIX: Success." +#define OMPT_VERBOSE_INIT_PRINT(...) \ + if (verbose_init) \ + fprintf(verbose_file, __VA_ARGS__) +#define OMPT_VERBOSE_INIT_CONTINUED_PRINT(...) 
\ + if (verbose_init) \ + fprintf(verbose_file, __VA_ARGS__) + +static FILE *verbose_file; +static int verbose_init; + +/***************************************************************************** + * types + ****************************************************************************/ + +typedef struct { + const char *state_name; + ompt_state_t state_id; +} ompt_state_info_t; + +typedef struct { + const char *name; + kmp_mutex_impl_t id; +} kmp_mutex_impl_info_t; + +enum tool_setting_e { + omp_tool_error, + omp_tool_unset, + omp_tool_disabled, + omp_tool_enabled +}; + +/***************************************************************************** + * global variables + ****************************************************************************/ + +ompt_callbacks_active_t ompt_enabled; + +ompt_state_info_t ompt_state_info[] = { +#define ompt_state_macro(state, code) {#state, state}, + FOREACH_OMPT_STATE(ompt_state_macro) +#undef ompt_state_macro +}; + +kmp_mutex_impl_info_t kmp_mutex_impl_info[] = { +#define kmp_mutex_impl_macro(name, id) {#name, name}, + FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro) +#undef kmp_mutex_impl_macro +}; + +ompt_callbacks_internal_t ompt_callbacks; + +static ompt_start_tool_result_t *ompt_start_tool_result = NULL; + +#if KMP_OS_WINDOWS +static HMODULE ompt_tool_module = NULL; +#define OMPT_DLCLOSE(Lib) FreeLibrary(Lib) +#else +static void *ompt_tool_module = NULL; +#define OMPT_DLCLOSE(Lib) dlclose(Lib) +#endif + +/// Used to track the initializer and the finalizer provided by libomptarget +static ompt_start_tool_result_t *libomptarget_ompt_result = NULL; + +/***************************************************************************** + * forward declarations + ****************************************************************************/ + +static ompt_interface_fn_t ompt_fn_lookup(const char *s); + +OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void); + +/***************************************************************************** + * initialization and finalization (private operations) + ****************************************************************************/ + +typedef ompt_start_tool_result_t *(*ompt_start_tool_t)(unsigned int, + const char *); + +#if KMP_OS_DARWIN + +// While Darwin supports weak symbols, the library that wishes to provide a new +// implementation has to link against this runtime which defeats the purpose +// of having tools that are agnostic of the underlying runtime implementation. +// +// Fortunately, the linker includes all symbols of an executable in the global +// symbol table by default so dlsym() even finds static implementations of +// ompt_start_tool. For this to work on Linux, -Wl,--export-dynamic needs to be +// passed when building the application which we don't want to rely on. + +static ompt_start_tool_result_t *ompt_tool_darwin(unsigned int omp_version, + const char *runtime_version) { + ompt_start_tool_result_t *ret = NULL; + // Search symbol in the current address space. + ompt_start_tool_t start_tool = + (ompt_start_tool_t)dlsym(RTLD_DEFAULT, "ompt_start_tool"); + if (start_tool) { + ret = start_tool(omp_version, runtime_version); + } + return ret; +} + +#elif OMPT_HAVE_WEAK_ATTRIBUTE + +// On Unix-like systems that support weak symbols the following implementation +// of ompt_start_tool() will be used in case no tool-supplied implementation of +// this function is present in the address space of a process. 
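For context, the symbol all of this discovery machinery searches for is supplied by the tool, not by the runtime. Below is a minimal sketch of such a tool (hypothetical example, not part of this patch; the file layout, function names, and the omp-tools.h include path are illustrative only). It uses the standard OMPT entry point from the omp-tools.h added earlier in this patch and registers a single ompt_callback_thread_begin callback; the runtime's weak fallback definition of ompt_start_tool follows immediately below.

#include <stdio.h>
#include <omp-tools.h> /* assumed include path; declarations come from the omp-tools.h in this patch */

static ompt_set_callback_t set_callback; /* resolved through the runtime's lookup function */

static void on_thread_begin(ompt_thread_t thread_type, ompt_data_t *thread_data) {
  fprintf(stderr, "tool: thread of type %d began\n", (int)thread_type);
}

/* Returning nonzero keeps OMPT enabled; returning 0 makes ompt_post_init()
   zero out ompt_enabled again. */
static int tool_initialize(ompt_function_lookup_t lookup, int initial_device_num,
                           ompt_data_t *tool_data) {
  set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
  if (!set_callback)
    return 0;
  set_callback(ompt_callback_thread_begin, (ompt_callback_t)on_thread_begin);
  return 1;
}

static void tool_finalize(ompt_data_t *tool_data) {
  fprintf(stderr, "tool: OpenMP runtime shut down\n");
}

/* The strong symbol that the dlsym()/weak-symbol/EnumProcessModules lookups
   in this file try to resolve. */
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {tool_initialize, tool_finalize,
                                            ompt_data_none};
  return &result;
}

Such a tool can be activated either by linking it into the application, so its strong ompt_start_tool wins over the weak fallback below, or by building it as a shared object and naming it in OMP_TOOL_LIBRARIES, which ompt_try_start_tool() scans further down in this file.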
+ +_OMP_EXTERN OMPT_WEAK_ATTRIBUTE ompt_start_tool_result_t * +ompt_start_tool(unsigned int omp_version, const char *runtime_version) { + ompt_start_tool_result_t *ret = NULL; + // Search next symbol in the current address space. This can happen if the + // runtime library is linked before the tool. Since glibc 2.2 strong symbols + // don't override weak symbols that have been found before unless the user + // sets the environment variable LD_DYNAMIC_WEAK. + ompt_start_tool_t next_tool = + (ompt_start_tool_t)dlsym(RTLD_NEXT, "ompt_start_tool"); + if (next_tool) { + ret = next_tool(omp_version, runtime_version); + } + return ret; +} + +#elif OMPT_HAVE_PSAPI + +// On Windows, the ompt_tool_windows function is used to find the +// ompt_start_tool symbol across all modules loaded by a process. If +// ompt_start_tool is found, ompt_start_tool's return value is used to +// initialize the tool. Otherwise, NULL is returned and OMPT won't be enabled. + +#include +#pragma comment(lib, "psapi.lib") + +// The number of loaded modules to start enumeration with EnumProcessModules() +#define NUM_MODULES 128 + +static ompt_start_tool_result_t * +ompt_tool_windows(unsigned int omp_version, const char *runtime_version) { + int i; + DWORD needed, new_size; + HMODULE *modules; + HANDLE process = GetCurrentProcess(); + modules = (HMODULE *)malloc(NUM_MODULES * sizeof(HMODULE)); + ompt_start_tool_t ompt_tool_p = NULL; + +#if OMPT_DEBUG + printf("ompt_tool_windows(): looking for ompt_start_tool\n"); +#endif + if (!EnumProcessModules(process, modules, NUM_MODULES * sizeof(HMODULE), + &needed)) { + // Regardless of the error reason use the stub initialization function + free(modules); + return NULL; + } + // Check if NUM_MODULES is enough to list all modules + new_size = needed / sizeof(HMODULE); + if (new_size > NUM_MODULES) { +#if OMPT_DEBUG + printf("ompt_tool_windows(): resize buffer to %d bytes\n", needed); +#endif + modules = (HMODULE *)realloc(modules, needed); + // If resizing failed use the stub function. + if (!EnumProcessModules(process, modules, needed, &needed)) { + free(modules); + return NULL; + } + } + for (i = 0; i < new_size; ++i) { + (FARPROC &)ompt_tool_p = GetProcAddress(modules[i], "ompt_start_tool"); + if (ompt_tool_p) { +#if OMPT_DEBUG + TCHAR modName[MAX_PATH]; + if (GetModuleFileName(modules[i], modName, MAX_PATH)) + printf("ompt_tool_windows(): ompt_start_tool found in module %s\n", + modName); +#endif + free(modules); + return (*ompt_tool_p)(omp_version, runtime_version); + } +#if OMPT_DEBUG + else { + TCHAR modName[MAX_PATH]; + if (GetModuleFileName(modules[i], modName, MAX_PATH)) + printf("ompt_tool_windows(): ompt_start_tool not found in module %s\n", + modName); + } +#endif + } + free(modules); + return NULL; +} +#else +#error Activation of OMPT is not supported on this platform. +#endif + +static ompt_start_tool_result_t * +ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { + ompt_start_tool_result_t *ret = NULL; + ompt_start_tool_t start_tool = NULL; +#if KMP_OS_WINDOWS + // Cannot use colon to describe a list of absolute paths on Windows + const char *sep = ";"; +#else + const char *sep = ":"; +#endif + + OMPT_VERBOSE_INIT_PRINT("----- START LOGGING OF TOOL REGISTRATION -----\n"); + OMPT_VERBOSE_INIT_PRINT("Search for OMP tool in current address space... 
"); + +#if KMP_OS_DARWIN + // Try in the current address space + ret = ompt_tool_darwin(omp_version, runtime_version); +#elif OMPT_HAVE_WEAK_ATTRIBUTE + ret = ompt_start_tool(omp_version, runtime_version); +#elif OMPT_HAVE_PSAPI + ret = ompt_tool_windows(omp_version, runtime_version); +#else +#error Activation of OMPT is not supported on this platform. +#endif + if (ret) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n"); + OMPT_VERBOSE_INIT_PRINT( + "Tool was started and is using the OMPT interface.\n"); + OMPT_VERBOSE_INIT_PRINT("----- END LOGGING OF TOOL REGISTRATION -----\n"); + return ret; + } + + // Try tool-libraries-var ICV + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed.\n"); + const char *tool_libs = getenv("OMP_TOOL_LIBRARIES"); + if (tool_libs) { + OMPT_VERBOSE_INIT_PRINT("Searching tool libraries...\n"); + OMPT_VERBOSE_INIT_PRINT("OMP_TOOL_LIBRARIES = %s\n", tool_libs); + char *libs = __kmp_str_format("%s", tool_libs); + char *buf; + char *fname = __kmp_str_token(libs, sep, &buf); + // Reset dl-error + dlerror(); + + while (fname) { +#if KMP_OS_UNIX + OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); + void *h = dlopen(fname, RTLD_LAZY); + if (!h) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror()); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n"); + OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ", + fname); + dlerror(); // Clear any existing error + start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool"); + if (!start_tool) { + char *error = dlerror(); + if (error != NULL) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", error); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", + "ompt_start_tool = NULL"); + } + } else +#elif KMP_OS_WINDOWS + OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); + HMODULE h = LoadLibrary(fname); + if (!h) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %u\n", + (unsigned)GetLastError()); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success. \n"); + OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... ", + fname); + start_tool = (ompt_start_tool_t)GetProcAddress(h, "ompt_start_tool"); + if (!start_tool) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: Error %u\n", + (unsigned)GetLastError()); + } else +#else +#error Activation of OMPT is not supported on this platform. +#endif + { // if (start_tool) + ret = (*start_tool)(omp_version, runtime_version); + if (ret) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n"); + OMPT_VERBOSE_INIT_PRINT( + "Tool was started and is using the OMPT interface.\n"); + ompt_tool_module = h; + break; + } + OMPT_VERBOSE_INIT_CONTINUED_PRINT( + "Found but not using the OMPT interface.\n"); + OMPT_VERBOSE_INIT_PRINT("Continuing search...\n"); + } + OMPT_DLCLOSE(h); + } + fname = __kmp_str_token(NULL, sep, &buf); + } + __kmp_str_free(&libs); + } else { + OMPT_VERBOSE_INIT_PRINT("No OMP_TOOL_LIBRARIES defined.\n"); + } + + // usable tool found in tool-libraries + if (ret) { + OMPT_VERBOSE_INIT_PRINT("----- END LOGGING OF TOOL REGISTRATION -----\n"); + return ret; + } + +#if KMP_OS_UNIX + { // Non-standard: load archer tool if application is built with TSan + const char *fname = "libarcher.so"; + OMPT_VERBOSE_INIT_PRINT( + "...searching tool libraries failed. Using archer tool.\n"); + OMPT_VERBOSE_INIT_PRINT("Opening %s... ", fname); + void *h = dlopen(fname, RTLD_LAZY); + if (h) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n"); + OMPT_VERBOSE_INIT_PRINT("Searching for ompt_start_tool in %s... 
", fname); + start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool"); + if (start_tool) { + ret = (*start_tool)(omp_version, runtime_version); + if (ret) { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Success.\n"); + OMPT_VERBOSE_INIT_PRINT( + "Tool was started and is using the OMPT interface.\n"); + OMPT_VERBOSE_INIT_PRINT( + "----- END LOGGING OF TOOL REGISTRATION -----\n"); + return ret; + } + OMPT_VERBOSE_INIT_CONTINUED_PRINT( + "Found but not using the OMPT interface.\n"); + } else { + OMPT_VERBOSE_INIT_CONTINUED_PRINT("Failed: %s\n", dlerror()); + } + } + } +#endif + OMPT_VERBOSE_INIT_PRINT("No OMP tool loaded.\n"); + OMPT_VERBOSE_INIT_PRINT("----- END LOGGING OF TOOL REGISTRATION -----\n"); + return ret; +} + +void ompt_pre_init() { + //-------------------------------------------------- + // Execute the pre-initialization logic only once. + //-------------------------------------------------- + static int ompt_pre_initialized = 0; + + if (ompt_pre_initialized) + return; + + ompt_pre_initialized = 1; + + //-------------------------------------------------- + // Use a tool iff a tool is enabled and available. + //-------------------------------------------------- + const char *ompt_env_var = getenv("OMP_TOOL"); + tool_setting_e tool_setting = omp_tool_error; + + if (!ompt_env_var || !strcmp(ompt_env_var, "")) + tool_setting = omp_tool_unset; + else if (OMPT_STR_MATCH(ompt_env_var, "disabled")) + tool_setting = omp_tool_disabled; + else if (OMPT_STR_MATCH(ompt_env_var, "enabled")) + tool_setting = omp_tool_enabled; + + const char *ompt_env_verbose_init = getenv("OMP_TOOL_VERBOSE_INIT"); + // possible options: disabled | stdout | stderr | + // if set, not empty and not disabled -> prepare for logging + if (ompt_env_verbose_init && strcmp(ompt_env_verbose_init, "") && + !OMPT_STR_MATCH(ompt_env_verbose_init, "disabled")) { + verbose_init = 1; + if (OMPT_STR_MATCH(ompt_env_verbose_init, "STDERR")) + verbose_file = stderr; + else if (OMPT_STR_MATCH(ompt_env_verbose_init, "STDOUT")) + verbose_file = stdout; + else + verbose_file = fopen(ompt_env_verbose_init, "w"); + } else + verbose_init = 0; + +#if OMPT_DEBUG + printf("ompt_pre_init(): tool_setting = %d\n", tool_setting); +#endif + switch (tool_setting) { + case omp_tool_disabled: + OMPT_VERBOSE_INIT_PRINT("OMP tool disabled. \n"); + break; + + case omp_tool_unset: + case omp_tool_enabled: + + //-------------------------------------------------- + // Load tool iff specified in environment variable + //-------------------------------------------------- + ompt_start_tool_result = + ompt_try_start_tool(__kmp_openmp_version, ompt_get_runtime_version()); + + memset(&ompt_enabled, 0, sizeof(ompt_enabled)); + break; + + case omp_tool_error: + fprintf(stderr, + "Warning: OMP_TOOL has invalid value \"%s\".\n" + " legal values are (NULL,\"\",\"disabled\"," + "\"enabled\").\n", + ompt_env_var); + break; + } + if (verbose_init && verbose_file != stderr && verbose_file != stdout) + fclose(verbose_file); +#if OMPT_DEBUG + printf("ompt_pre_init(): ompt_enabled = %d\n", ompt_enabled.enabled); +#endif +} + +extern "C" int omp_get_initial_device(void); + +void ompt_post_init() { + //-------------------------------------------------- + // Execute the post-initialization logic only once. + //-------------------------------------------------- + static int ompt_post_initialized = 0; + + if (ompt_post_initialized) + return; + + ompt_post_initialized = 1; + + //-------------------------------------------------- + // Initialize the tool if so indicated. 
+ //-------------------------------------------------- + if (ompt_start_tool_result) { + ompt_enabled.enabled = !!ompt_start_tool_result->initialize( + ompt_fn_lookup, 0, // [jart] omp_get_initial_device(), + &(ompt_start_tool_result->tool_data)); + + if (!ompt_enabled.enabled) { + // tool not enabled, zero out the bitmap, and done + memset(&ompt_enabled, 0, sizeof(ompt_enabled)); + return; + } + + kmp_info_t *root_thread = ompt_get_thread(); + + ompt_set_thread_state(root_thread, ompt_state_overhead); + + if (ompt_enabled.ompt_callback_thread_begin) { + ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( + ompt_thread_initial, __ompt_get_thread_data_internal()); + } + ompt_data_t *task_data = nullptr; + ompt_data_t *parallel_data = nullptr; + __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, + NULL); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); + } + + ompt_set_thread_state(root_thread, ompt_state_work_serial); + } +} + +void ompt_fini() { + if (ompt_enabled.enabled) { + if (ompt_start_tool_result && ompt_start_tool_result->finalize) { + ompt_start_tool_result->finalize(&(ompt_start_tool_result->tool_data)); + } + if (libomptarget_ompt_result && libomptarget_ompt_result->finalize) { + libomptarget_ompt_result->finalize(NULL); + } + } + + if (ompt_tool_module) + OMPT_DLCLOSE(ompt_tool_module); + memset(&ompt_enabled, 0, sizeof(ompt_enabled)); +} + +/***************************************************************************** + * interface operations + ****************************************************************************/ + +/***************************************************************************** + * state + ****************************************************************************/ + +OMPT_API_ROUTINE int ompt_enumerate_states(int current_state, int *next_state, + const char **next_state_name) { + const static int len = sizeof(ompt_state_info) / sizeof(ompt_state_info_t); + int i = 0; + + for (i = 0; i < len - 1; i++) { + if (ompt_state_info[i].state_id == current_state) { + *next_state = ompt_state_info[i + 1].state_id; + *next_state_name = ompt_state_info[i + 1].state_name; + return 1; + } + } + + return 0; +} + +OMPT_API_ROUTINE int ompt_enumerate_mutex_impls(int current_impl, + int *next_impl, + const char **next_impl_name) { + const static int len = + sizeof(kmp_mutex_impl_info) / sizeof(kmp_mutex_impl_info_t); + int i = 0; + for (i = 0; i < len - 1; i++) { + if (kmp_mutex_impl_info[i].id != current_impl) + continue; + *next_impl = kmp_mutex_impl_info[i + 1].id; + *next_impl_name = kmp_mutex_impl_info[i + 1].name; + return 1; + } + return 0; +} + +/***************************************************************************** + * callbacks + ****************************************************************************/ + +OMPT_API_ROUTINE ompt_set_result_t ompt_set_callback(ompt_callbacks_t which, + ompt_callback_t callback) { + switch (which) { + +#define ompt_event_macro(event_name, callback_type, event_id) \ + case event_name: \ + ompt_callbacks.ompt_callback(event_name) = (callback_type)callback; \ + ompt_enabled.event_name = (callback != 0); \ + if (callback) \ + return ompt_event_implementation_status(event_name); \ + else \ + return ompt_set_always; + + FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro + + default: + return ompt_set_error; + } +} + +OMPT_API_ROUTINE int 
ompt_get_callback(ompt_callbacks_t which, + ompt_callback_t *callback) { + if (!ompt_enabled.enabled) + return ompt_get_callback_failure; + + switch (which) { + +#define ompt_event_macro(event_name, callback_type, event_id) \ + case event_name: { \ + ompt_callback_t mycb = \ + (ompt_callback_t)ompt_callbacks.ompt_callback(event_name); \ + if (ompt_enabled.event_name && mycb) { \ + *callback = mycb; \ + return ompt_get_callback_success; \ + } \ + return ompt_get_callback_failure; \ + } + + FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro + + default: + return ompt_get_callback_failure; + } +} + +/***************************************************************************** + * parallel regions + ****************************************************************************/ + +OMPT_API_ROUTINE int ompt_get_parallel_info(int ancestor_level, + ompt_data_t **parallel_data, + int *team_size) { + if (!ompt_enabled.enabled) + return 0; + return __ompt_get_parallel_info_internal(ancestor_level, parallel_data, + team_size); +} + +OMPT_API_ROUTINE int ompt_get_state(ompt_wait_id_t *wait_id) { + if (!ompt_enabled.enabled) + return ompt_state_work_serial; + int thread_state = __ompt_get_state_internal(wait_id); + + if (thread_state == ompt_state_undefined) { + thread_state = ompt_state_work_serial; + } + + return thread_state; +} + +/***************************************************************************** + * tasks + ****************************************************************************/ + +OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void) { + if (!ompt_enabled.enabled) + return NULL; + return __ompt_get_thread_data_internal(); +} + +OMPT_API_ROUTINE int ompt_get_task_info(int ancestor_level, int *type, + ompt_data_t **task_data, + ompt_frame_t **task_frame, + ompt_data_t **parallel_data, + int *thread_num) { + if (!ompt_enabled.enabled) + return 0; + return __ompt_get_task_info_internal(ancestor_level, type, task_data, + task_frame, parallel_data, thread_num); +} + +OMPT_API_ROUTINE int ompt_get_task_memory(void **addr, size_t *size, + int block) { + return __ompt_get_task_memory_internal(addr, size, block); +} + +/***************************************************************************** + * num_procs + ****************************************************************************/ + +OMPT_API_ROUTINE int ompt_get_num_procs(void) { + // copied from kmp_ftn_entry.h (but modified: OMPT can only be called when + // runtime is initialized) + return __kmp_avail_proc; +} + +/***************************************************************************** + * places + ****************************************************************************/ + +OMPT_API_ROUTINE int ompt_get_num_places(void) { +// copied from kmp_ftn_entry.h (but modified) +#if !KMP_AFFINITY_SUPPORTED + return 0; +#else + if (!KMP_AFFINITY_CAPABLE()) + return 0; + return __kmp_affinity.num_masks; +#endif +} + +OMPT_API_ROUTINE int ompt_get_place_proc_ids(int place_num, int ids_size, + int *ids) { +// copied from kmp_ftn_entry.h (but modified) +#if !KMP_AFFINITY_SUPPORTED + return 0; +#else + int i, count; + int tmp_ids[ids_size]; + for (int j = 0; j < ids_size; j++) + tmp_ids[j] = 0; + if (!KMP_AFFINITY_CAPABLE()) + return 0; + if (place_num < 0 || place_num >= (int)__kmp_affinity.num_masks) + return 0; + /* TODO: Is this safe for asynchronous call from signal handler during runtime + * shutdown? 
*/ + kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity.masks, place_num); + count = 0; + KMP_CPU_SET_ITERATE(i, mask) { + if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) || + (!KMP_CPU_ISSET(i, mask))) { + continue; + } + if (count < ids_size) + tmp_ids[count] = i; + count++; + } + if (ids_size >= count) { + for (i = 0; i < count; i++) { + ids[i] = tmp_ids[i]; + } + } + return count; +#endif +} + +OMPT_API_ROUTINE int ompt_get_place_num(void) { +// copied from kmp_ftn_entry.h (but modified) +#if !KMP_AFFINITY_SUPPORTED + return -1; +#else + if (!ompt_enabled.enabled || __kmp_get_gtid() < 0) + return -1; + + int gtid; + kmp_info_t *thread; + if (!KMP_AFFINITY_CAPABLE()) + return -1; + gtid = __kmp_entry_gtid(); + thread = __kmp_thread_from_gtid(gtid); + if (thread == NULL || thread->th.th_current_place < 0) + return -1; + return thread->th.th_current_place; +#endif +} + +OMPT_API_ROUTINE int ompt_get_partition_place_nums(int place_nums_size, + int *place_nums) { +// copied from kmp_ftn_entry.h (but modified) +#if !KMP_AFFINITY_SUPPORTED + return 0; +#else + if (!ompt_enabled.enabled || __kmp_get_gtid() < 0) + return 0; + + int i, gtid, place_num, first_place, last_place, start, end; + kmp_info_t *thread; + if (!KMP_AFFINITY_CAPABLE()) + return 0; + gtid = __kmp_entry_gtid(); + thread = __kmp_thread_from_gtid(gtid); + if (thread == NULL) + return 0; + first_place = thread->th.th_first_place; + last_place = thread->th.th_last_place; + if (first_place < 0 || last_place < 0) + return 0; + if (first_place <= last_place) { + start = first_place; + end = last_place; + } else { + start = last_place; + end = first_place; + } + if (end - start <= place_nums_size) + for (i = 0, place_num = start; place_num <= end; ++place_num, ++i) { + place_nums[i] = place_num; + } + return end - start + 1; +#endif +} + +/***************************************************************************** + * places + ****************************************************************************/ + +OMPT_API_ROUTINE int ompt_get_proc_id(void) { + if (!ompt_enabled.enabled || __kmp_get_gtid() < 0) + return -1; +#if KMP_HAVE_SCHED_GETCPU + return sched_getcpu(); +#elif KMP_OS_WINDOWS + PROCESSOR_NUMBER pn; + GetCurrentProcessorNumberEx(&pn); + return 64 * pn.Group + pn.Number; +#else + return -1; +#endif +} + +/***************************************************************************** + * compatability + ****************************************************************************/ + +/* + * Currently unused function +OMPT_API_ROUTINE int ompt_get_ompt_version() { return OMPT_VERSION; } +*/ + +/***************************************************************************** + * application-facing API + ****************************************************************************/ + +/*---------------------------------------------------------------------------- + | control + ---------------------------------------------------------------------------*/ + +int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg) { + + if (ompt_enabled.enabled) { + if (ompt_enabled.ompt_callback_control_tool) { + return ompt_callbacks.ompt_callback(ompt_callback_control_tool)( + command, modifier, arg, OMPT_LOAD_RETURN_ADDRESS(__kmp_entry_gtid())); + } else { + return -1; + } + } else { + return -2; + } +} + +/***************************************************************************** + * misc + ****************************************************************************/ + +OMPT_API_ROUTINE uint64_t ompt_get_unique_id(void) { + 
return __ompt_get_unique_id_internal(); +} + +OMPT_API_ROUTINE void ompt_finalize_tool(void) { __kmp_internal_end_atexit(); } + +/***************************************************************************** + * Target + ****************************************************************************/ + +OMPT_API_ROUTINE int ompt_get_target_info(uint64_t *device_num, + ompt_id_t *target_id, + ompt_id_t *host_op_id) { + return 0; // thread is not in a target region +} + +OMPT_API_ROUTINE int ompt_get_num_devices(void) { + return 1; // only one device (the current device) is available +} + +/***************************************************************************** + * API inquiry for tool + ****************************************************************************/ + +static ompt_interface_fn_t ompt_fn_lookup(const char *s) { + +#define ompt_interface_fn(fn) \ + fn##_t fn##_f = fn; \ + if (strcmp(s, #fn) == 0) \ + return (ompt_interface_fn_t)fn##_f; + + FOREACH_OMPT_INQUIRY_FN(ompt_interface_fn) + +#undef ompt_interface_fn + + return NULL; +} + +static ompt_data_t *ompt_get_task_data() { return __ompt_get_task_data(); } + +static ompt_data_t *ompt_get_target_task_data() { + return __ompt_get_target_task_data(); +} + +/// Lookup function to query libomp callbacks registered by the tool +static ompt_interface_fn_t ompt_libomp_target_fn_lookup(const char *s) { +#define provide_fn(fn) \ + if (strcmp(s, #fn) == 0) \ + return (ompt_interface_fn_t)fn; + + provide_fn(ompt_get_callback); + provide_fn(ompt_get_task_data); + provide_fn(ompt_get_target_task_data); +#undef provide_fn + +#define ompt_interface_fn(fn, type, code) \ + if (strcmp(s, #fn) == 0) \ + return (ompt_interface_fn_t)ompt_callbacks.ompt_callback(fn); + + FOREACH_OMPT_DEVICE_EVENT(ompt_interface_fn) + FOREACH_OMPT_EMI_EVENT(ompt_interface_fn) + FOREACH_OMPT_NOEMI_EVENT(ompt_interface_fn) +#undef ompt_interface_fn + + return (ompt_interface_fn_t)0; +} + +/// This function is called by the libomptarget connector to assign +/// callbacks already registered with libomp. +_OMP_EXTERN void ompt_libomp_connect(ompt_start_tool_result_t *result) { + OMPT_VERBOSE_INIT_PRINT("libomp --> OMPT: Enter ompt_libomp_connect\n"); + + // Ensure libomp callbacks have been added if not already + __ompt_force_initialization(); + + if (ompt_enabled.enabled && + // Callbacks are initiated only if the device initialize callback + // has been registered by the tool + ompt_callbacks.ompt_callback(ompt_callback_device_initialize)) { + if (result) { + OMPT_VERBOSE_INIT_PRINT( + "libomp --> OMPT: Connecting with libomptarget\n"); + // Pass in the libomp lookup function so that the already registered + // functions can be extracted and assigned to the callbacks in + // libomptarget + result->initialize(ompt_libomp_target_fn_lookup, + /* initial_device_num */ 0, /* tool_data */ nullptr); + // Track the object provided by libomptarget so that the finalizer can be + // called during OMPT finalization + libomptarget_ompt_result = result; + } + } + OMPT_VERBOSE_INIT_PRINT("libomp --> OMPT: Exit ompt_libomp_connect\n"); +} diff --git a/third_party/openmp/ompt-internal.h b/third_party/openmp/ompt-internal.h new file mode 100644 index 000000000..0d77413d5 --- /dev/null +++ b/third_party/openmp/ompt-internal.h @@ -0,0 +1,127 @@ +/* + * ompt-internal.h - header of OMPT internal data structures + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPT_INTERNAL_H__ +#define __OMPT_INTERNAL_H__ + +#include "ompt-event-specific.h" +#include "omp-tools.h" + +#define OMPT_VERSION 1 + +#define _OMP_EXTERN extern "C" + +#define OMPT_INVOKER(x) \ + ((x == fork_context_gnu) ? ompt_parallel_invoker_program \ + : ompt_parallel_invoker_runtime) + +#define ompt_callback(e) e##_callback + +typedef struct ompt_callbacks_internal_s { +#define ompt_event_macro(event, callback, eventid) \ + callback ompt_callback(event); + + FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro +} ompt_callbacks_internal_t; + +typedef struct ompt_callbacks_active_s { + unsigned int enabled : 1; +#define ompt_event_macro(event, callback, eventid) unsigned int event : 1; + + FOREACH_OMPT_EVENT(ompt_event_macro) + +#undef ompt_event_macro +} ompt_callbacks_active_t; + +#define TASK_TYPE_DETAILS_FORMAT(info) \ + ((info->td_flags.task_serial || info->td_flags.tasking_ser) \ + ? ompt_task_undeferred \ + : 0x0) | \ + ((!(info->td_flags.tiedness)) ? ompt_task_untied : 0x0) | \ + (info->td_flags.final ? ompt_task_final : 0x0) | \ + (info->td_flags.merged_if0 ? ompt_task_mergeable : 0x0) + +typedef struct { + ompt_frame_t frame; + ompt_data_t task_data; + struct kmp_taskdata *scheduling_parent; + int thread_num; + ompt_dispatch_chunk_t dispatch_chunk; +} ompt_task_info_t; + +typedef struct { + ompt_data_t parallel_data; + void *master_return_address; +} ompt_team_info_t; + +typedef struct ompt_lw_taskteam_s { + ompt_team_info_t ompt_team_info; + ompt_task_info_t ompt_task_info; + int heap; + struct ompt_lw_taskteam_s *parent; +} ompt_lw_taskteam_t; + +typedef struct { + ompt_data_t thread_data; + ompt_data_t task_data; /* stored here from implicit barrier-begin until + implicit-task-end */ + ompt_data_t target_task_data; /* required by target support */ + void *return_address; /* stored here on entry of runtime */ + ompt_state_t state; + ompt_wait_id_t wait_id; + int ompt_task_yielded; + int parallel_flags; // information for the last parallel region invoked + void *idle_frame; +} ompt_thread_info_t; + +extern ompt_callbacks_internal_t ompt_callbacks; + +#if OMPT_SUPPORT && OMPT_OPTIONAL +#if USE_FAST_MEMORY +#define KMP_OMPT_DEPS_ALLOC __kmp_fast_allocate +#define KMP_OMPT_DEPS_FREE __kmp_fast_free +#else +#define KMP_OMPT_DEPS_ALLOC __kmp_thread_malloc +#define KMP_OMPT_DEPS_FREE __kmp_thread_free +#endif +#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ + +#ifdef __cplusplus +extern "C" { +#endif + +void ompt_pre_init(void); +void ompt_post_init(void); +void ompt_fini(void); + +#define OMPT_GET_RETURN_ADDRESS(level) __builtin_return_address(level) +#define OMPT_GET_FRAME_ADDRESS(level) __builtin_frame_address(level) + +int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg); + +extern ompt_callbacks_active_t ompt_enabled; + +#if KMP_OS_WINDOWS +#define UNLIKELY(x) (x) +#define OMPT_NOINLINE __declspec(noinline) +#else +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#define OMPT_NOINLINE __attribute__((noinline)) +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/third_party/openmp/ompt-specific.cpp b/third_party/openmp/ompt-specific.cpp new file mode 100644 index 000000000..9743f35d2 --- /dev/null +++ b/third_party/openmp/ompt-specific.cpp @@ -0,0 +1,526 @@ +/* + * ompt-specific.cpp -- OMPT internal functions + */ + 
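
ompt-internal.h above leans on the X-macro idiom: FOREACH_OMPT_EVENT expands a caller-supplied macro once per (event, callback-type, id) triple, so one event list produces both the table of typed callback pointers (ompt_callbacks_internal_t) and the one-bit-per-event bitfield (ompt_callbacks_active_t) that ompt_set_callback keeps in sync. Here is a reduced sketch of the same technique; FOREACH_DEMO_EVENT and all demo_* names are made up for illustration and are not part of the runtime.

#include <cstdio>

// Stand-in for FOREACH_OMPT_EVENT: apply macro m to every event triple.
typedef void (*demo_cb_a_t)(int);
typedef void (*demo_cb_b_t)(void);
#define FOREACH_DEMO_EVENT(m)                                                  \
  m(demo_event_a, demo_cb_a_t, 1)                                              \
  m(demo_event_b, demo_cb_b_t, 2)

// Expansion #1: one typed callback pointer per event
// (mirrors ompt_callbacks_internal_t).
typedef struct {
#define demo_macro(event, cb_type, id) cb_type event##_callback;
  FOREACH_DEMO_EVENT(demo_macro)
#undef demo_macro
} demo_callbacks_t;

// Expansion #2: a one-bit flag per event
// (mirrors ompt_callbacks_active_t).
typedef struct {
  unsigned int enabled : 1;
#define demo_macro(event, cb_type, id) unsigned int event : 1;
  FOREACH_DEMO_EVENT(demo_macro)
#undef demo_macro
} demo_active_t;

int main() {
  demo_callbacks_t cbs = {};
  demo_active_t active = {};
  active.enabled = 1;
  active.demo_event_a = (cbs.demo_event_a_callback != 0);
  std::printf("event a enabled: %u\n", active.demo_event_a);
}

The payoff is that adding an event to the list updates every derived structure and switch statement (ompt_set_callback, ompt_get_callback above) without further hand edits.
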
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//****************************************************************************** +// include files +//****************************************************************************** + +#include "kmp.h" +#include "ompt-specific.h" + +#if KMP_OS_UNIX +#include +#endif + +#if KMP_OS_WINDOWS +#define THREAD_LOCAL __declspec(thread) +#else +#define THREAD_LOCAL __thread +#endif + +#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE_INTERNAL + +//****************************************************************************** +// macros +//****************************************************************************** + +#define LWT_FROM_TEAM(team) (team)->t.ompt_serialized_team_info + +#define OMPT_THREAD_ID_BITS 16 + +//****************************************************************************** +// private operations +//****************************************************************************** + +//---------------------------------------------------------- +// traverse the team and task hierarchy +// note: __ompt_get_teaminfo and __ompt_get_task_info_object +// traverse the hierarchy similarly and need to be +// kept consistent +//---------------------------------------------------------- + +ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size) { + kmp_info_t *thr = ompt_get_thread(); + + if (thr) { + kmp_team *team = thr->th.th_team; + if (team == NULL) + return NULL; + + ompt_lw_taskteam_t *next_lwt = LWT_FROM_TEAM(team), *lwt = NULL; + + while (depth > 0) { + // next lightweight team (if any) + if (lwt) + lwt = lwt->parent; + + // next heavyweight team (if any) after + // lightweight teams are exhausted + if (!lwt && team) { + if (next_lwt) { + lwt = next_lwt; + next_lwt = NULL; + } else { + team = team->t.t_parent; + if (team) { + next_lwt = LWT_FROM_TEAM(team); + } + } + } + + depth--; + } + + if (lwt) { + // lightweight teams have one task + if (size) + *size = 1; + + // return team info for lightweight team + return &lwt->ompt_team_info; + } else if (team) { + // extract size from heavyweight team + if (size) + *size = team->t.t_nproc; + + // return team info for heavyweight team + return &team->t.ompt_team_info; + } + } + + return NULL; +} + +ompt_task_info_t *__ompt_get_task_info_object(int depth) { + ompt_task_info_t *info = NULL; + kmp_info_t *thr = ompt_get_thread(); + + if (thr) { + kmp_taskdata_t *taskdata = thr->th.th_current_task; + ompt_lw_taskteam_t *lwt = NULL, + *next_lwt = LWT_FROM_TEAM(taskdata->td_team); + + while (depth > 0) { + // next lightweight team (if any) + if (lwt) + lwt = lwt->parent; + + // next heavyweight team (if any) after + // lightweight teams are exhausted + if (!lwt && taskdata) { + if (next_lwt) { + lwt = next_lwt; + next_lwt = NULL; + } else { + taskdata = taskdata->td_parent; + if (taskdata) { + next_lwt = LWT_FROM_TEAM(taskdata->td_team); + } + } + } + depth--; + } + + if (lwt) { + info = &lwt->ompt_task_info; + } else if (taskdata) { + info = &taskdata->ompt_task_info; + } + } + + return info; +} + +ompt_task_info_t *__ompt_get_scheduling_taskinfo(int depth) { + ompt_task_info_t *info = NULL; + kmp_info_t *thr = ompt_get_thread(); + + if (thr) { + kmp_taskdata_t *taskdata 
= thr->th.th_current_task; + + ompt_lw_taskteam_t *lwt = NULL, + *next_lwt = LWT_FROM_TEAM(taskdata->td_team); + + while (depth > 0) { + // next lightweight team (if any) + if (lwt) + lwt = lwt->parent; + + // next heavyweight team (if any) after + // lightweight teams are exhausted + if (!lwt && taskdata) { + // first try scheduling parent (for explicit task scheduling) + if (taskdata->ompt_task_info.scheduling_parent) { + taskdata = taskdata->ompt_task_info.scheduling_parent; + } else if (next_lwt) { + lwt = next_lwt; + next_lwt = NULL; + } else { + // then go for implicit tasks + taskdata = taskdata->td_parent; + if (taskdata) { + next_lwt = LWT_FROM_TEAM(taskdata->td_team); + } + } + } + depth--; + } + + if (lwt) { + info = &lwt->ompt_task_info; + } else if (taskdata) { + info = &taskdata->ompt_task_info; + } + } + + return info; +} + +//****************************************************************************** +// interface operations +//****************************************************************************** +//---------------------------------------------------------- +// initialization support +//---------------------------------------------------------- + +void __ompt_force_initialization() { __kmp_serial_initialize(); } + +//---------------------------------------------------------- +// thread support +//---------------------------------------------------------- + +ompt_data_t *__ompt_get_thread_data_internal() { + if (__kmp_get_gtid() >= 0) { + kmp_info_t *thread = ompt_get_thread(); + if (thread == NULL) + return NULL; + return &(thread->th.ompt_thread_info.thread_data); + } + return NULL; +} + +//---------------------------------------------------------- +// state support +//---------------------------------------------------------- + +void __ompt_thread_assign_wait_id(void *variable) { + kmp_info_t *ti = ompt_get_thread(); + + if (ti) + ti->th.ompt_thread_info.wait_id = (ompt_wait_id_t)(uintptr_t)variable; +} + +int __ompt_get_state_internal(ompt_wait_id_t *omp_wait_id) { + kmp_info_t *ti = ompt_get_thread(); + + if (ti) { + if (omp_wait_id) + *omp_wait_id = ti->th.ompt_thread_info.wait_id; + return ti->th.ompt_thread_info.state; + } + return ompt_state_undefined; +} + +//---------------------------------------------------------- +// parallel region support +//---------------------------------------------------------- + +int __ompt_get_parallel_info_internal(int ancestor_level, + ompt_data_t **parallel_data, + int *team_size) { + if (__kmp_get_gtid() >= 0) { + ompt_team_info_t *info; + if (team_size) { + info = __ompt_get_teaminfo(ancestor_level, team_size); + } else { + info = __ompt_get_teaminfo(ancestor_level, NULL); + } + if (parallel_data) { + *parallel_data = info ? &(info->parallel_data) : NULL; + } + return info ? 
2 : 0; + } else { + return 0; + } +} + +//---------------------------------------------------------- +// lightweight task team support +//---------------------------------------------------------- + +void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid, + ompt_data_t *ompt_pid, void *codeptr) { + // initialize parallel_data with input, return address to parallel_data on + // exit + lwt->ompt_team_info.parallel_data = *ompt_pid; + lwt->ompt_team_info.master_return_address = codeptr; + lwt->ompt_task_info.task_data.value = 0; + lwt->ompt_task_info.frame.enter_frame = ompt_data_none; + lwt->ompt_task_info.frame.exit_frame = ompt_data_none; + lwt->ompt_task_info.scheduling_parent = NULL; + lwt->heap = 0; + lwt->parent = 0; +} + +void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, + int on_heap, bool always) { + ompt_lw_taskteam_t *link_lwt = lwt; + if (always || + thr->th.th_team->t.t_serialized > + 1) { // we already have a team, so link the new team and swap values + if (on_heap) { // the lw_taskteam cannot stay on stack, allocate it on heap + link_lwt = + (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t)); + } + link_lwt->heap = on_heap; + + // would be swap in the (on_stack) case. + ompt_team_info_t tmp_team = lwt->ompt_team_info; + link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); + *OMPT_CUR_TEAM_INFO(thr) = tmp_team; + + // link the taskteam into the list of taskteams: + ompt_lw_taskteam_t *my_parent = + thr->th.th_team->t.ompt_serialized_team_info; + link_lwt->parent = my_parent; + thr->th.th_team->t.ompt_serialized_team_info = link_lwt; +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) { + ompd_bp_parallel_begin(); + } +#endif + + ompt_task_info_t tmp_task = lwt->ompt_task_info; + link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); + *OMPT_CUR_TASK_INFO(thr) = tmp_task; + } else { + // this is the first serialized team, so we just store the values in the + // team and drop the taskteam-object + *OMPT_CUR_TEAM_INFO(thr) = lwt->ompt_team_info; +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) { + ompd_bp_parallel_begin(); + } +#endif + *OMPT_CUR_TASK_INFO(thr) = lwt->ompt_task_info; + } +} + +void __ompt_lw_taskteam_unlink(kmp_info_t *thr) { + ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info; + if (lwtask) { + ompt_task_info_t tmp_task = lwtask->ompt_task_info; + lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); + *OMPT_CUR_TASK_INFO(thr) = tmp_task; +#if OMPD_SUPPORT + if (ompd_state & OMPD_ENABLE_BP) { + ompd_bp_parallel_end(); + } +#endif + thr->th.th_team->t.ompt_serialized_team_info = lwtask->parent; + + ompt_team_info_t tmp_team = lwtask->ompt_team_info; + lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); + *OMPT_CUR_TEAM_INFO(thr) = tmp_team; + + if (lwtask->heap) { + __kmp_free(lwtask); + lwtask = NULL; + } + } + // return lwtask; +} + +//---------------------------------------------------------- +// task support +//---------------------------------------------------------- + +ompt_data_t *__ompt_get_task_data() { + kmp_info_t *thr = ompt_get_thread(); + ompt_data_t *task_data = thr ? 
OMPT_CUR_TASK_DATA(thr) : NULL; + return task_data; +} + +ompt_data_t *__ompt_get_target_task_data() { + return &__kmp_threads[__kmp_get_gtid()]->th.ompt_thread_info.target_task_data; +} + +int __ompt_get_task_info_internal(int ancestor_level, int *type, + ompt_data_t **task_data, + ompt_frame_t **task_frame, + ompt_data_t **parallel_data, + int *thread_num) { + if (__kmp_get_gtid() < 0) + return 0; + + if (ancestor_level < 0) + return 0; + + // copied from __ompt_get_scheduling_taskinfo + ompt_task_info_t *info = NULL; + ompt_team_info_t *team_info = NULL; + kmp_info_t *thr = ompt_get_thread(); + int level = ancestor_level; + + if (thr) { + kmp_taskdata_t *taskdata = thr->th.th_current_task; + if (taskdata == NULL) + return 0; + kmp_team *team = thr->th.th_team, *prev_team = NULL; + if (team == NULL) + return 0; + ompt_lw_taskteam_t *lwt = NULL, + *next_lwt = LWT_FROM_TEAM(taskdata->td_team); + + while (ancestor_level > 0) { + // next lightweight team (if any) + if (lwt) + lwt = lwt->parent; + + // next heavyweight team (if any) after + // lightweight teams are exhausted + if (!lwt && taskdata) { + // first try scheduling parent (for explicit task scheduling) + if (taskdata->ompt_task_info.scheduling_parent) { + taskdata = taskdata->ompt_task_info.scheduling_parent; + } else if (next_lwt) { + lwt = next_lwt; + next_lwt = NULL; + } else { + // then go for implicit tasks + taskdata = taskdata->td_parent; + if (team == NULL) + return 0; + prev_team = team; + team = team->t.t_parent; + if (taskdata) { + next_lwt = LWT_FROM_TEAM(taskdata->td_team); + } + } + } + ancestor_level--; + } + + if (lwt) { + info = &lwt->ompt_task_info; + team_info = &lwt->ompt_team_info; + if (type) { + *type = ompt_task_implicit; + } + } else if (taskdata) { + info = &taskdata->ompt_task_info; + team_info = &team->t.ompt_team_info; + if (type) { + if (taskdata->td_parent) { + *type = (taskdata->td_flags.tasktype ? ompt_task_explicit + : ompt_task_implicit) | + TASK_TYPE_DETAILS_FORMAT(taskdata); + } else { + *type = ompt_task_initial; + } + } + } + if (task_data) { + *task_data = info ? &info->task_data : NULL; + } + if (task_frame) { + // OpenMP spec asks for the scheduling task to be returned. + *task_frame = info ? &info->frame : NULL; + } + if (parallel_data) { + *parallel_data = team_info ? &(team_info->parallel_data) : NULL; + } + if (thread_num) { + if (level == 0) + *thread_num = __kmp_get_tid(); + else if (lwt) + *thread_num = 0; + else if (!prev_team) { + // The innermost parallel region contains at least one explicit task. + // The task at level > 0 is either an implicit task that + // corresponds to the mentioned region or one of the explicit tasks + // nested inside the same region. Note that the task isn't the + // innermost explicit tasks (because of condition level > 0). + // Since the task at this level still belongs to the innermost parallel + // region, thread_num is determined the same way as for level==0. + *thread_num = __kmp_get_tid(); + } else + *thread_num = prev_team->t.t_master_tid; + // *thread_num = team->t.t_master_tid; + } + return info ? 
2 : 0; + } + return 0; +} + +int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) { + *size = 0; + if (blocknum != 0) + return 0; // support only a single block + + kmp_info_t *thr = ompt_get_thread(); + if (!thr) + return 0; + + kmp_taskdata_t *taskdata = thr->th.th_current_task; + + if (taskdata->td_flags.tasktype != TASK_EXPLICIT) + return 0; // support only explicit task + + *addr = taskdata; + *size = taskdata->td_size_alloc; + return 0; +} + +//---------------------------------------------------------- +// team support +//---------------------------------------------------------- + +void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid) { + team->t.ompt_team_info.parallel_data = ompt_pid; +} + +//---------------------------------------------------------- +// misc +//---------------------------------------------------------- + +static uint64_t __ompt_get_unique_id_internal() { + static uint64_t thread = 1; + static THREAD_LOCAL uint64_t ID = 0; + if (ID == 0) { + uint64_t new_thread = KMP_TEST_THEN_INC64((kmp_int64 *)&thread); + ID = new_thread << (sizeof(uint64_t) * 8 - OMPT_THREAD_ID_BITS); + } + return ++ID; +} + +ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt, + kmp_info_t *thr) { + if (bt == bs_forkjoin_barrier) + return ompt_sync_region_barrier_implicit; + + if (bt != bs_plain_barrier) + return ompt_sync_region_barrier_implementation; + + if (!thr->th.th_ident) + return ompt_sync_region_barrier; + + kmp_int32 flags = thr->th.th_ident->flags; + + if ((flags & KMP_IDENT_BARRIER_EXPL) != 0) + return ompt_sync_region_barrier_explicit; + + if ((flags & KMP_IDENT_BARRIER_IMPL) != 0) + return ompt_sync_region_barrier_implicit; + + return ompt_sync_region_barrier_implementation; +} diff --git a/third_party/openmp/ompt-specific.h b/third_party/openmp/ompt-specific.h new file mode 100644 index 000000000..63c59c3fb --- /dev/null +++ b/third_party/openmp/ompt-specific.h @@ -0,0 +1,178 @@ +/* + * ompt-specific.h - header of OMPT internal functions implementation + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef OMPT_SPECIFIC_H +#define OMPT_SPECIFIC_H + +#include "kmp.h" + +#if OMPT_SUPPORT +/***************************************************************************** + * forward declarations + ****************************************************************************/ + +/// Entrypoint used by libomptarget to register callbacks in libomp, if not +/// done already +void __ompt_force_initialization(); + +void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid); +void __ompt_thread_assign_wait_id(void *variable); + +void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid, + ompt_data_t *ompt_pid, void *codeptr); + +void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, + int on_heap, bool always = false); + +void __ompt_lw_taskteam_unlink(kmp_info_t *thr); + +ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size); + +ompt_data_t *__ompt_get_task_data(); + +ompt_data_t *__ompt_get_target_task_data(); + +ompt_task_info_t *__ompt_get_task_info_object(int depth); + +int __ompt_get_parallel_info_internal(int ancestor_level, + ompt_data_t **parallel_data, + int *team_size); + +int __ompt_get_task_info_internal(int ancestor_level, int *type, + ompt_data_t **task_data, + ompt_frame_t **task_frame, + ompt_data_t **parallel_data, int *thread_num); + +ompt_data_t *__ompt_get_thread_data_internal(); + +/* + * Unused currently +static uint64_t __ompt_get_get_unique_id_internal(); +*/ + +ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type, kmp_info_t *); + +/***************************************************************************** + * macros + ****************************************************************************/ + +#define OMPT_CUR_TASK_INFO(thr) (&((thr)->th.th_current_task->ompt_task_info)) +#define OMPT_CUR_TASK_DATA(thr) \ + (&((thr)->th.th_current_task->ompt_task_info.task_data)) +#define OMPT_CUR_TEAM_INFO(thr) (&((thr)->th.th_team->t.ompt_team_info)) +#define OMPT_CUR_TEAM_DATA(thr) \ + (&((thr)->th.th_team->t.ompt_team_info.parallel_data)) + +#define OMPT_HAVE_WEAK_ATTRIBUTE KMP_HAVE_WEAK_ATTRIBUTE +#define OMPT_HAVE_PSAPI KMP_HAVE_PSAPI +#define OMPT_STR_MATCH(haystack, needle) __kmp_str_match(haystack, 0, needle) + +inline void *__ompt_load_return_address(int gtid) { + kmp_info_t *thr = __kmp_threads[gtid]; + void *return_address = thr->th.ompt_thread_info.return_address; + thr->th.ompt_thread_info.return_address = NULL; + return return_address; +} + +/*#define OMPT_STORE_RETURN_ADDRESS(gtid) \ + if (ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ + !__kmp_threads[gtid]->th.ompt_thread_info.return_address) \ + __kmp_threads[gtid]->th.ompt_thread_info.return_address = \ + __builtin_return_address(0)*/ +#define OMPT_STORE_RETURN_ADDRESS(gtid) \ + OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address(0)}; +#define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid) +#define OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid) \ + ((ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ + __kmp_threads[gtid]->th.ompt_thread_info.return_address) \ + ? 
__ompt_load_return_address(gtid) \ + : __builtin_return_address(0)) + +#define OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, incr) \ + do { \ + if (incr > 0) { \ + chunk.start = static_cast(lb); \ + chunk.iterations = static_cast(((ub) - (lb)) / (incr) + 1); \ + } else { \ + chunk.start = static_cast(ub); \ + chunk.iterations = static_cast(((lb) - (ub)) / -(incr) + 1); \ + } \ + } while (0) + +//****************************************************************************** +// inline functions +//****************************************************************************** + +inline kmp_info_t *ompt_get_thread_gtid(int gtid) { + return (gtid >= 0) ? __kmp_thread_from_gtid(gtid) : NULL; +} + +inline kmp_info_t *ompt_get_thread() { + int gtid = __kmp_get_gtid(); + return ompt_get_thread_gtid(gtid); +} + +inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) { + if (thread) + thread->th.ompt_thread_info.state = state; +} + +inline const char *ompt_get_runtime_version() { + return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]; +} + +class OmptReturnAddressGuard { +private: + bool SetAddress{false}; + int Gtid; + +public: + OmptReturnAddressGuard(int Gtid, void *ReturnAddress) : Gtid(Gtid) { + if (ompt_enabled.enabled && Gtid >= 0 && __kmp_threads[Gtid] && + !__kmp_threads[Gtid]->th.ompt_thread_info.return_address) { + SetAddress = true; + __kmp_threads[Gtid]->th.ompt_thread_info.return_address = ReturnAddress; + } + } + ~OmptReturnAddressGuard() { + if (SetAddress) + __kmp_threads[Gtid]->th.ompt_thread_info.return_address = NULL; + } +}; + +#endif // OMPT_SUPPORT + +// macros providing the OMPT callbacks for reduction clause +#if OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) \ + ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr); \ + ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); \ + void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); +#define OMPT_REDUCTION_BEGIN \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data, \ + my_task_data, return_address); \ + } +#define OMPT_REDUCTION_END \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_end, my_parallel_data, \ + my_task_data, return_address); \ + } +#else // OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) +#define OMPT_REDUCTION_BEGIN +#define OMPT_REDUCTION_END +#endif // ! OMPT_SUPPORT && OMPT_OPTIONAL + +#endif diff --git a/third_party/openmp/ompx.h b/third_party/openmp/ompx.h new file mode 100644 index 000000000..5dd8e8355 --- /dev/null +++ b/third_party/openmp/ompx.h @@ -0,0 +1,165 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPX_H +#define __OMPX_H + +#ifdef __cplusplus +extern "C" { +#endif + +int omp_get_ancestor_thread_num(int); +int omp_get_team_size(int); + +#ifdef __cplusplus +} +#endif + +/// Target kernel language extensions +/// +/// These extensions exist for the host to allow fallback implementations, +/// however, they cannot be arbitrarily composed with OpenMP. If the rules of +/// the kernel language are followed, the host fallbacks should behave as +/// expected since the kernel is represented as 3 sequential outer loops, one +/// for each grid dimension, and three (nested) parallel loops, one for each +/// block dimension. This fallback is not supposed to be optimal and should be +/// configurable by the user. +/// +///{ + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + ompx_relaxed = __ATOMIC_RELAXED, + ompx_aquire = __ATOMIC_ACQUIRE, + ompx_release = __ATOMIC_RELEASE, + ompx_acq_rel = __ATOMIC_ACQ_REL, + ompx_seq_cst = __ATOMIC_SEQ_CST, +}; + +enum { + ompx_dim_x = 0, + ompx_dim_y = 1, + ompx_dim_z = 2, +}; + +/// ompx_{thread,block}_{id,dim} +///{ +#pragma omp begin declare variant match(device = {kind(cpu)}) +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE) \ + static inline int ompx_##NAME(int Dim) { return VALUE; } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(thread_id, + omp_get_ancestor_thread_num(Dim + 1)) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(block_dim, omp_get_team_size(Dim + 1)) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(block_id, 0) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(grid_dim, 1) +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C +///} + +/// ompx_{sync_block}_{,divergent} +///{ +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(RETTY, NAME, ARGS, BODY) \ + static inline RETTY ompx_##NAME(ARGS) { BODY; } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block, int Ordering, + _Pragma("omp barrier")); +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_acq_rel, void, + ompx_sync_block(ompx_acq_rel)); +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering, + ompx_sync_block(Ordering)); +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C +///} + +#pragma omp end declare variant + +/// ompx_{sync_block}_{,divergent} +///{ +#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \ + RETTY ompx_##NAME(ARGS); + +_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering); +_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_acq_rel, void); +_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering); +#undef _TGT_KERNEL_LANGUAGE_DECL_SYNC_C +///} + +/// ompx_{thread,block}_{id,dim}_{x,y,z} +///{ +#define _TGT_KERNEL_LANGUAGE_DECL_GRID_C(NAME) \ + int ompx_##NAME(int Dim); \ + static inline int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); } \ + static inline int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); } \ + static inline int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); } + +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(thread_id) +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_dim) +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_id) +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim) +#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C +///} + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus + +namespace ompx { + +enum { + dim_x = ompx_dim_x, + dim_y = ompx_dim_y, + dim_z = ompx_dim_z, +}; + +enum { + relaxed = ompx_relaxed , + aquire = ompx_aquire, + release = ompx_release, + acc_rel = ompx_acq_rel, 
+ seq_cst = ompx_seq_cst, +}; + +/// ompx::{thread,block}_{id,dim}_{,x,y,z} +///{ +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(NAME) \ + static inline int NAME(int Dim) noexcept { return ompx_##NAME(Dim); } \ + static inline int NAME##_x() noexcept { return NAME(ompx_dim_x); } \ + static inline int NAME##_y() noexcept { return NAME(ompx_dim_y); } \ + static inline int NAME##_z() noexcept { return NAME(ompx_dim_z); } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(thread_id) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_dim) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_id) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(grid_dim) +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX +///} + +/// ompx_{sync_block}_{,divergent} +///{ +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(RETTY, NAME, ARGS, CALL_ARGS) \ + static inline RETTY NAME(ARGS) { \ + return ompx_##NAME(CALL_ARGS); \ + } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block, int Ordering = acc_rel, + Ordering); +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent, + int Ordering = acc_rel, Ordering); +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX +///} + +} // namespace ompx +#endif + +///} + +#endif /* __OMPX_H */ diff --git a/third_party/openmp/util1.cpp b/third_party/openmp/util1.cpp new file mode 100644 index 000000000..1e668d644 --- /dev/null +++ b/third_party/openmp/util1.cpp @@ -0,0 +1,2822 @@ +/* + * z_Linux_util.cpp -- platform specific routines. + */ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp.h" +#include "kmp_affinity.h" +#include "kmp_i18n.h" +#include "kmp_io.h" +#include "kmp_itt.h" +#include "kmp_lock.h" +#include "kmp_stats.h" +#include "kmp_str.h" +#include "kmp_wait_release.h" +#include "libc/intrin/kprintf.h" +#include "kmp_wrapper_getpid.h" + +#if !KMP_OS_DRAGONFLY && !KMP_OS_FREEBSD && !KMP_OS_NETBSD && !KMP_OS_OPENBSD +#include +#endif +#include // HUGE_VAL. +#if KMP_OS_LINUX +#include +#endif // KMP_OS_LINUX +#include +#if !KMP_OS_AIX +#include +#endif +#include +#include +#include + +#if KMP_OS_LINUX +#include +#if KMP_USE_FUTEX +// We should really include , but that causes compatibility problems on +// different Linux* OS distributions that either require that you include (or +// break when you try to include) . Since all we need is the two +// macros below (which are part of the kernel ABI, so can't change) we just +// define the constants here and don't include +#ifndef FUTEX_WAIT +#define FUTEX_WAIT 0 +#endif +#ifndef FUTEX_WAKE +#define FUTEX_WAKE 1 +#endif +#endif +#elif KMP_OS_DARWIN +#include +#include +#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD +#include +//#include +#include +#include +#elif KMP_OS_NETBSD || KMP_OS_OPENBSD +#include +#include +#elif KMP_OS_SOLARIS +#include +#endif + +#include +#include +#include + +struct kmp_sys_timer { + struct timespec start; +}; + +#ifndef TIMEVAL_TO_TIMESPEC +// Convert timeval to timespec. +#define TIMEVAL_TO_TIMESPEC(tv, ts) \ + do { \ + (ts)->tv_sec = (tv)->tv_sec; \ + (ts)->tv_nsec = (tv)->tv_usec * 1000; \ + } while (0) +#endif + +// Convert timespec to nanoseconds. 
+#define TS2NS(timespec) \ + (((timespec).tv_sec * (long int)1e9) + (timespec).tv_nsec) + +static struct kmp_sys_timer __kmp_sys_timer_data; + +#if KMP_HANDLE_SIGNALS +typedef void (*sig_func_t)(int); +STATIC_EFI2_WORKAROUND struct sigaction __kmp_sighldrs[NSIG]; +static sigset_t __kmp_sigset; +#endif + +static int __kmp_init_runtime = FALSE; + +static int __kmp_fork_count = 0; + +static pthread_condattr_t __kmp_suspend_cond_attr; +static pthread_mutexattr_t __kmp_suspend_mutex_attr; + +static kmp_cond_align_t __kmp_wait_cv; +static kmp_mutex_align_t __kmp_wait_mx; + +kmp_uint64 __kmp_ticks_per_msec = 1000000; +kmp_uint64 __kmp_ticks_per_usec = 1000; + +#ifdef DEBUG_SUSPEND +static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) { + KMP_SNPRINTF(buffer, 128, "(cond (lock (%ld, %d)), (descr (%p)))", + cond->c_cond.__c_lock.__status, cond->c_cond.__c_lock.__spinlock, + cond->c_cond.__c_waiting); +} +#endif + +#if ((KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED) + +/* Affinity support */ + +void __kmp_affinity_bind_thread(int which) { + KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), + "Illegal set affinity operation when not capable"); + + kmp_affin_mask_t *mask; + KMP_CPU_ALLOC_ON_STACK(mask); + KMP_CPU_ZERO(mask); + KMP_CPU_SET(which, mask); + __kmp_set_system_affinity(mask, TRUE); + KMP_CPU_FREE_FROM_STACK(mask); +} + +/* Determine if we can access affinity functionality on this version of + * Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set + * __kmp_affin_mask_size to the appropriate value (0 means not capable). */ +void __kmp_affinity_determine_capable(const char *env_var) { + // Check and see if the OS supports thread affinity. + +#if KMP_OS_LINUX && !defined(__COSMOPOLITAN__) +#define KMP_CPU_SET_SIZE_LIMIT (1024 * 1024) +#define KMP_CPU_SET_TRY_SIZE CACHE_LINE +#elif KMP_OS_FREEBSD || defined(__COSMOPOLITAN__) +#define KMP_CPU_SET_SIZE_LIMIT (sizeof(cpuset_t)) +#endif + + int verbose = __kmp_affinity.flags.verbose; + int warnings = __kmp_affinity.flags.warnings; + enum affinity_type type = __kmp_affinity.type; + +#if KMP_OS_LINUX && !defined(__COSMOPOLITAN__) + long gCode; + unsigned char *buf; + buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT); + + // If the syscall returns a suggestion for the size, + // then we don't have to search for an appropriate size. + gCode = syscall(__NR_sched_getaffinity, 0, KMP_CPU_SET_TRY_SIZE, buf); + KA_TRACE(30, ("__kmp_affinity_determine_capable: " + "initial getaffinity call returned %ld errno = %d\n", + gCode, errno)); + + if (gCode < 0 && errno != EINVAL) { + // System call not supported + if (verbose || + (warnings && (type != affinity_none) && (type != affinity_default) && + (type != affinity_disabled))) { + int error = errno; + kmp_msg_t err_code = KMP_ERR(error); + __kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var), + err_code, __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } + KMP_AFFINITY_DISABLE(); + KMP_INTERNAL_FREE(buf); + return; + } else if (gCode > 0) { + // The optimal situation: the OS returns the size of the buffer it expects. + KMP_AFFINITY_ENABLE(gCode); + KA_TRACE(10, ("__kmp_affinity_determine_capable: " + "affinity supported (mask size %d)\n", + (int)__kmp_affin_mask_size)); + KMP_INTERNAL_FREE(buf); + return; + } + + // Call the getaffinity system call repeatedly with increasing set sizes + // until we succeed, or reach an upper bound on the search. 
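
The loop that follows is the portable trick here: the raw sched_getaffinity syscall keeps failing with EINVAL until the buffer is at least as large as the kernel's cpumask, and once it succeeds it returns that size in bytes. A standalone sketch of the same probe is shown below, assuming a Linux-style syscall interface; probe_affinity_mask_size is a made-up name, and the #elif KMP_OS_FREEBSD || defined(__COSMOPOLITAN__) branch below avoids the search entirely by passing a fixed sizeof(cpuset_t) to pthread_getaffinity_np.

#include <cerrno>
#include <cstddef>
#include <sys/syscall.h>
#include <unistd.h>
#include <vector>

// Hypothetical standalone probe (Linux-only sketch): grow the cpumask buffer
// until the kernel accepts it, and return the kernel's cpumask size in bytes
// (0 if the affinity syscall is unavailable).
static size_t probe_affinity_mask_size() {
  const size_t limit = 1024 * 1024; // mirrors KMP_CPU_SET_SIZE_LIMIT above
  for (size_t size = 64; size <= limit; size *= 2) {
    std::vector<unsigned char> buf(size);
    long rc = syscall(SYS_sched_getaffinity, 0 /* calling thread */, size,
                      buf.data());
    if (rc > 0)
      return (size_t)rc; // success: kernel reports the mask size it uses
    if (rc < 0 && errno != EINVAL)
      return 0; // syscall missing or blocked: affinity not supported
    // EINVAL: buffer still smaller than the kernel cpumask; double and retry.
  }
  return 0;
}
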
+ KA_TRACE(30, ("__kmp_affinity_determine_capable: " + "searching for proper set size\n")); + int size; + for (size = 1; size <= KMP_CPU_SET_SIZE_LIMIT; size *= 2) { + gCode = syscall(__NR_sched_getaffinity, 0, size, buf); + KA_TRACE(30, ("__kmp_affinity_determine_capable: " + "getaffinity for mask size %ld returned %ld errno = %d\n", + size, gCode, errno)); + + if (gCode < 0) { + if (errno == ENOSYS) { + // We shouldn't get here + KA_TRACE(30, ("__kmp_affinity_determine_capable: " + "inconsistent OS call behavior: errno == ENOSYS for mask " + "size %d\n", + size)); + if (verbose || + (warnings && (type != affinity_none) && + (type != affinity_default) && (type != affinity_disabled))) { + int error = errno; + kmp_msg_t err_code = KMP_ERR(error); + __kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var), + err_code, __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } + KMP_AFFINITY_DISABLE(); + KMP_INTERNAL_FREE(buf); + return; + } + continue; + } + + KMP_AFFINITY_ENABLE(gCode); + KA_TRACE(10, ("__kmp_affinity_determine_capable: " + "affinity supported (mask size %d)\n", + (int)__kmp_affin_mask_size)); + KMP_INTERNAL_FREE(buf); + return; + } +#elif KMP_OS_FREEBSD || defined(__COSMOPOLITAN__) + long gCode; + unsigned char *buf; + buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT); + gCode = pthread_getaffinity_np(pthread_self(), KMP_CPU_SET_SIZE_LIMIT, + reinterpret_cast(buf)); + KA_TRACE(30, ("__kmp_affinity_determine_capable: " + "initial getaffinity call returned %d errno = %d\n", + gCode, errno)); + if (gCode == 0) { + KMP_AFFINITY_ENABLE(KMP_CPU_SET_SIZE_LIMIT); + KA_TRACE(10, ("__kmp_affinity_determine_capable: " + "affinity supported (mask size %d)\n", + (int)__kmp_affin_mask_size)); + KMP_INTERNAL_FREE(buf); + return; + } +#endif + KMP_INTERNAL_FREE(buf); + + // Affinity is not supported + KMP_AFFINITY_DISABLE(); + KA_TRACE(10, ("__kmp_affinity_determine_capable: " + "cannot determine mask size - affinity not supported\n")); + if (verbose || (warnings && (type != affinity_none) && + (type != affinity_default) && (type != affinity_disabled))) { + KMP_WARNING(AffCantGetMaskSize, env_var); + } +} + +#endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED + +#if KMP_USE_FUTEX + +int __kmp_futex_determine_capable() { +#ifdef __COSMOPOLITAN__ + return 1; +#else + int loc = 0; + long rc = syscall(__NR_futex, &loc, FUTEX_WAKE, 1, NULL, NULL, 0); + int retval = (rc == 0) || (errno != ENOSYS); + + KA_TRACE(10, + ("__kmp_futex_determine_capable: rc = %d errno = %d\n", rc, errno)); + KA_TRACE(10, ("__kmp_futex_determine_capable: futex syscall%s supported\n", + retval ? 
"" : " not")); + + return retval; +#endif +} + +#endif // KMP_USE_FUTEX + +#if (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_WASM) && (!KMP_ASM_INTRINS) +/* Only 32-bit "add-exchange" instruction on IA-32 architecture causes us to + use compare_and_store for these routines */ + +kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 d) { + kmp_int8 old_value, new_value; + + old_value = TCR_1(*p); + new_value = old_value | d; + + while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_1(*p); + new_value = old_value | d; + } + return old_value; +} + +kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 d) { + kmp_int8 old_value, new_value; + + old_value = TCR_1(*p); + new_value = old_value & d; + + while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_1(*p); + new_value = old_value & d; + } + return old_value; +} + +kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 d) { + kmp_uint32 old_value, new_value; + + old_value = TCR_4(*p); + new_value = old_value | d; + + while (!KMP_COMPARE_AND_STORE_REL32(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_4(*p); + new_value = old_value | d; + } + return old_value; +} + +kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) { + kmp_uint32 old_value, new_value; + + old_value = TCR_4(*p); + new_value = old_value & d; + + while (!KMP_COMPARE_AND_STORE_REL32(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_4(*p); + new_value = old_value & d; + } + return old_value; +} + +#if KMP_ARCH_X86 || KMP_ARCH_WASM +kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 d) { + kmp_int8 old_value, new_value; + + old_value = TCR_1(*p); + new_value = old_value + d; + + while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_1(*p); + new_value = old_value + d; + } + return old_value; +} + +kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 d) { + kmp_int64 old_value, new_value; + + old_value = TCR_8(*p); + new_value = old_value + d; + + while (!KMP_COMPARE_AND_STORE_REL64(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_8(*p); + new_value = old_value + d; + } + return old_value; +} +#endif /* KMP_ARCH_X86 */ + +kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 d) { + kmp_uint64 old_value, new_value; + + old_value = TCR_8(*p); + new_value = old_value | d; + while (!KMP_COMPARE_AND_STORE_REL64(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_8(*p); + new_value = old_value | d; + } + return old_value; +} + +kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) { + kmp_uint64 old_value, new_value; + + old_value = TCR_8(*p); + new_value = old_value & d; + while (!KMP_COMPARE_AND_STORE_REL64(p, old_value, new_value)) { + KMP_CPU_PAUSE(); + old_value = TCR_8(*p); + new_value = old_value & d; + } + return old_value; +} + +#endif /* (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (! 
KMP_ASM_INTRINS) */ + +void __kmp_terminate_thread(int gtid) { + int status; + kmp_info_t *th = __kmp_threads[gtid]; + + if (!th) + return; + +#ifdef KMP_CANCEL_THREADS + KA_TRACE(10, ("__kmp_terminate_thread: kill (%d)\n", gtid)); + status = pthread_cancel(th->th.th_info.ds.ds_thread); + if (status != 0 && status != ESRCH) { + __kmp_fatal(KMP_MSG(CantTerminateWorkerThread), KMP_ERR(status), + __kmp_msg_null); + } +#endif + KMP_YIELD(TRUE); +} // + +/* Set thread stack info according to values returned by pthread_getattr_np(). + If values are unreasonable, assume call failed and use incremental stack + refinement method instead. Returns TRUE if the stack parameters could be + determined exactly, FALSE if incremental refinement is necessary. */ +static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) { + int stack_data; +#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX + pthread_attr_t attr; + int status; + size_t size = 0; + void *addr = 0; + + /* Always do incremental stack refinement for ubermaster threads since the + initial thread stack range can be reduced by sibling thread creation so + pthread_attr_getstack may cause thread gtid aliasing */ + if (!KMP_UBER_GTID(gtid)) { + + /* Fetch the real thread attributes */ + status = pthread_attr_init(&attr); + KMP_CHECK_SYSFAIL("pthread_attr_init", status); +#if (KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD) && !defined(__COSMOPOLITAN__) + status = pthread_attr_get_np(pthread_self(), &attr); + KMP_CHECK_SYSFAIL("pthread_attr_get_np", status); +#else + status = pthread_getattr_np(pthread_self(), &attr); + KMP_CHECK_SYSFAIL("pthread_getattr_np", status); +#endif + status = pthread_attr_getstack(&attr, &addr, &size); + KMP_CHECK_SYSFAIL("pthread_attr_getstack", status); + KA_TRACE(60, + ("__kmp_set_stack_info: T#%d pthread_attr_getstack returned size:" + " %lu, low addr: %p\n", + gtid, size, addr)); + status = pthread_attr_destroy(&attr); + KMP_CHECK_SYSFAIL("pthread_attr_destroy", status); + } + + if (size != 0 && addr != 0) { // was stack parameter determination successful? 
+ /* Store the correct base and size */ + TCW_PTR(th->th.th_info.ds.ds_stackbase, (((char *)addr) + size)); + TCW_PTR(th->th.th_info.ds.ds_stacksize, size); + TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE); + return TRUE; + } +#endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD \ + || KMP_OS_HURD || KMP_OS_SOLARIS */ + /* Use incremental refinement starting from initial conservative estimate */ + TCW_PTR(th->th.th_info.ds.ds_stacksize, 0); + TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data); + TCW_4(th->th.th_info.ds.ds_stackgrow, TRUE); + return FALSE; +} + +static void *__kmp_launch_worker(void *thr) { + int status, old_type, old_state; +#ifdef KMP_BLOCK_SIGNALS + sigset_t new_set, old_set; +#endif /* KMP_BLOCK_SIGNALS */ + void *exit_val; +#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS + void *volatile padding = 0; +#endif + int gtid; + + gtid = ((kmp_info_t *)thr)->th.th_info.ds.ds_gtid; + __kmp_gtid_set_specific(gtid); +#ifdef KMP_TDATA_GTID + __kmp_gtid = gtid; +#endif +#if KMP_STATS_ENABLED + // set thread local index to point to thread-specific stats + __kmp_stats_thread_ptr = ((kmp_info_t *)thr)->th.th_stats; + __kmp_stats_thread_ptr->startLife(); + KMP_SET_THREAD_STATE(IDLE); + KMP_INIT_PARTITIONED_TIMERS(OMP_idle); +#endif + +#if USE_ITT_BUILD + __kmp_itt_thread_name(gtid); +#endif /* USE_ITT_BUILD */ + +#if KMP_AFFINITY_SUPPORTED + __kmp_affinity_bind_init_mask(gtid); +#endif + +#ifdef KMP_CANCEL_THREADS + status = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old_type); + KMP_CHECK_SYSFAIL("pthread_setcanceltype", status); + // josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads? + status = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old_state); + KMP_CHECK_SYSFAIL("pthread_setcancelstate", status); +#endif + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + // Set FP control regs to be a copy of the parallel initialization thread's. + __kmp_clear_x87_fpu_status_word(); + __kmp_load_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); + __kmp_load_mxcsr(&__kmp_init_mxcsr); +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + +#ifdef KMP_BLOCK_SIGNALS + status = sigfillset(&new_set); + KMP_CHECK_SYSFAIL_ERRNO("sigfillset", status); + status = pthread_sigmask(SIG_BLOCK, &new_set, &old_set); + KMP_CHECK_SYSFAIL("pthread_sigmask", status); +#endif /* KMP_BLOCK_SIGNALS */ + +#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS + if (__kmp_stkoffset > 0 && gtid > 0) { + padding = KMP_ALLOCA(gtid * __kmp_stkoffset); + (void)padding; + } +#endif + + KMP_MB(); + __kmp_set_stack_info(gtid, (kmp_info_t *)thr); + + __kmp_check_stack_overlap((kmp_info_t *)thr); + + exit_val = __kmp_launch_thread((kmp_info_t *)thr); + +#ifdef KMP_BLOCK_SIGNALS + status = pthread_sigmask(SIG_SETMASK, &old_set, NULL); + KMP_CHECK_SYSFAIL("pthread_sigmask", status); +#endif /* KMP_BLOCK_SIGNALS */ + + return exit_val; +} + +#if KMP_USE_MONITOR +/* The monitor thread controls all of the threads in the complex */ + +static void *__kmp_launch_monitor(void *thr) { + int status, old_type, old_state; +#ifdef KMP_BLOCK_SIGNALS + sigset_t new_set; +#endif /* KMP_BLOCK_SIGNALS */ + struct timespec interval; + + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + + KA_TRACE(10, ("__kmp_launch_monitor: #1 launched\n")); + + /* register us as the monitor thread */ + __kmp_gtid_set_specific(KMP_GTID_MONITOR); +#ifdef KMP_TDATA_GTID + __kmp_gtid = KMP_GTID_MONITOR; +#endif + + KMP_MB(); + +#if USE_ITT_BUILD + // Instruct Intel(R) Threading Tools to ignore monitor thread. + __kmp_itt_thread_ignore(); +#endif /* USE_ITT_BUILD */ + + __kmp_set_stack_info(((kmp_info_t *)thr)->th.th_info.ds.ds_gtid, + (kmp_info_t *)thr); + + __kmp_check_stack_overlap((kmp_info_t *)thr); + +#ifdef KMP_CANCEL_THREADS + status = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old_type); + KMP_CHECK_SYSFAIL("pthread_setcanceltype", status); + // josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads? + status = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old_state); + KMP_CHECK_SYSFAIL("pthread_setcancelstate", status); +#endif + +#if KMP_REAL_TIME_FIX + // This is a potential fix which allows application with real-time scheduling + // policy work. However, decision about the fix is not made yet, so it is + // disabled by default. + { // Are program started with real-time scheduling policy? + int sched = sched_getscheduler(0); + if (sched == SCHED_FIFO || sched == SCHED_RR) { + // Yes, we are a part of real-time application. Try to increase the + // priority of the monitor. + struct sched_param param; + int max_priority = sched_get_priority_max(sched); + int rc; + KMP_WARNING(RealTimeSchedNotSupported); + sched_getparam(0, ¶m); + if (param.sched_priority < max_priority) { + param.sched_priority += 1; + rc = sched_setscheduler(0, sched, ¶m); + if (rc != 0) { + int error = errno; + kmp_msg_t err_code = KMP_ERR(error); + __kmp_msg(kmp_ms_warning, KMP_MSG(CantChangeMonitorPriority), + err_code, KMP_MSG(MonitorWillStarve), __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } + } else { + // We cannot abort here, because number of CPUs may be enough for all + // the threads, including the monitor thread, so application could + // potentially work... + __kmp_msg(kmp_ms_warning, KMP_MSG(RunningAtMaxPriority), + KMP_MSG(MonitorWillStarve), KMP_HNT(RunningAtMaxPriority), + __kmp_msg_null); + } + } + // AC: free thread that waits for monitor started + TCW_4(__kmp_global.g.g_time.dt.t_value, 0); + } +#endif // KMP_REAL_TIME_FIX + + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + + if (__kmp_monitor_wakeups == 1) { + interval.tv_sec = 1; + interval.tv_nsec = 0; + } else { + interval.tv_sec = 0; + interval.tv_nsec = (KMP_NSEC_PER_SEC / __kmp_monitor_wakeups); + } + + KA_TRACE(10, ("__kmp_launch_monitor: #2 monitor\n")); + + while (!TCR_4(__kmp_global.g.g_done)) { + struct timespec now; + struct timeval tval; + + /* This thread monitors the state of the system */ + + KA_TRACE(15, ("__kmp_launch_monitor: update\n")); + + status = gettimeofday(&tval, NULL); + KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); + TIMEVAL_TO_TIMESPEC(&tval, &now); + + now.tv_sec += interval.tv_sec; + now.tv_nsec += interval.tv_nsec; + + if (now.tv_nsec >= KMP_NSEC_PER_SEC) { + now.tv_sec += 1; + now.tv_nsec -= KMP_NSEC_PER_SEC; + } + + status = pthread_mutex_lock(&__kmp_wait_mx.m_mutex); + KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + // AC: the monitor should not fall asleep if g_done has been set + if (!TCR_4(__kmp_global.g.g_done)) { // check once more under mutex + status = pthread_cond_timedwait(&__kmp_wait_cv.c_cond, + &__kmp_wait_mx.m_mutex, &now); + if (status != 0) { + if (status != ETIMEDOUT && status != EINTR) { + KMP_SYSFAIL("pthread_cond_timedwait", status); + } + } + } + status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex); + KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + + TCW_4(__kmp_global.g.g_time.dt.t_value, + TCR_4(__kmp_global.g.g_time.dt.t_value) + 1); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + } + + KA_TRACE(10, ("__kmp_launch_monitor: #3 cleanup\n")); + +#ifdef KMP_BLOCK_SIGNALS + status = sigfillset(&new_set); + KMP_CHECK_SYSFAIL_ERRNO("sigfillset", status); + status = pthread_sigmask(SIG_UNBLOCK, &new_set, NULL); + KMP_CHECK_SYSFAIL("pthread_sigmask", status); +#endif /* KMP_BLOCK_SIGNALS */ + + KA_TRACE(10, ("__kmp_launch_monitor: #4 finished\n")); + + if (__kmp_global.g.g_abort != 0) { + /* now we need to terminate the worker threads */ + /* the value of t_abort is the signal we caught */ + + int gtid; + + KA_TRACE(10, ("__kmp_launch_monitor: #5 terminate sig=%d\n", + __kmp_global.g.g_abort)); + + /* terminate the OpenMP worker threads */ + /* TODO this is not valid for sibling threads!! + * the uber master might not be 0 anymore.. */ + for (gtid = 1; gtid < __kmp_threads_capacity; ++gtid) + __kmp_terminate_thread(gtid); + + __kmp_cleanup(); + + KA_TRACE(10, ("__kmp_launch_monitor: #6 raise sig=%d\n", + __kmp_global.g.g_abort)); + + if (__kmp_global.g.g_abort > 0) + raise(__kmp_global.g.g_abort); + } + + KA_TRACE(10, ("__kmp_launch_monitor: #7 exit\n")); + + return thr; +} +#endif // KMP_USE_MONITOR + +void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) { + pthread_t handle; + pthread_attr_t thread_attr; + int status; + + th->th.th_info.ds.ds_gtid = gtid; + +#if KMP_STATS_ENABLED + // sets up worker thread stats + __kmp_acquire_tas_lock(&__kmp_stats_lock, gtid); + + // th->th.th_stats is used to transfer thread-specific stats-pointer to + // __kmp_launch_worker. So when thread is created (goes into + // __kmp_launch_worker) it will set its thread local pointer to + // th->th.th_stats + if (!KMP_UBER_GTID(gtid)) { + th->th.th_stats = __kmp_stats_list->push_back(gtid); + } else { + // For root threads, __kmp_stats_thread_ptr is set in __kmp_register_root(), + // so set the th->th.th_stats field to it. 
+ th->th.th_stats = __kmp_stats_thread_ptr; + } + __kmp_release_tas_lock(&__kmp_stats_lock, gtid); + +#endif // KMP_STATS_ENABLED + + if (KMP_UBER_GTID(gtid)) { + KA_TRACE(10, ("__kmp_create_worker: uber thread (%d)\n", gtid)); + th->th.th_info.ds.ds_thread = pthread_self(); + __kmp_set_stack_info(gtid, th); + __kmp_check_stack_overlap(th); + return; + } + + KA_TRACE(10, ("__kmp_create_worker: try to create thread (%d)\n", gtid)); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + +#ifdef KMP_THREAD_ATTR + status = pthread_attr_init(&thread_attr); + if (status != 0) { + __kmp_fatal(KMP_MSG(CantInitThreadAttrs), KMP_ERR(status), __kmp_msg_null); + } + status = pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); + if (status != 0) { + __kmp_fatal(KMP_MSG(CantSetWorkerState), KMP_ERR(status), __kmp_msg_null); + } + + /* Set stack size for this thread now. + The multiple of 2 is there because on some machines, requesting an unusual + stacksize causes the thread to have an offset before the dummy alloca() + takes place to create the offset. Since we want the user to have a + sufficient stacksize AND support a stack offset, we alloca() twice the + offset so that the upcoming alloca() does not eliminate any premade offset, + and also gives the user the stack space they requested for all threads */ + stack_size += gtid * __kmp_stkoffset * 2; + + KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, " + "__kmp_stksize = %lu bytes, final stacksize = %lu bytes\n", + gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size)); + +#ifdef _POSIX_THREAD_ATTR_STACKSIZE + status = pthread_attr_setstacksize(&thread_attr, stack_size); +#ifdef KMP_BACKUP_STKSIZE + if (status != 0) { + if (!__kmp_env_stksize) { + stack_size = KMP_BACKUP_STKSIZE + gtid * __kmp_stkoffset; + __kmp_stksize = KMP_BACKUP_STKSIZE; + KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, " + "__kmp_stksize = %lu bytes, (backup) final stacksize = %lu " + "bytes\n", + gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size)); + status = pthread_attr_setstacksize(&thread_attr, stack_size); + } + } +#endif /* KMP_BACKUP_STKSIZE */ + if (status != 0) { + __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status), + KMP_HNT(ChangeWorkerStackSize), __kmp_msg_null); + } +#endif /* _POSIX_THREAD_ATTR_STACKSIZE */ + +#endif /* KMP_THREAD_ATTR */ + + status = + pthread_create(&handle, &thread_attr, __kmp_launch_worker, (void *)th); + if (status != 0 || !handle) { // ??? Why do we check handle?? 
+#ifdef _POSIX_THREAD_ATTR_STACKSIZE + if (status == EINVAL) { + __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status), + KMP_HNT(IncreaseWorkerStackSize), __kmp_msg_null); + } + if (status == ENOMEM) { + __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status), + KMP_HNT(DecreaseWorkerStackSize), __kmp_msg_null); + } +#endif /* _POSIX_THREAD_ATTR_STACKSIZE */ + if (status == EAGAIN) { + __kmp_fatal(KMP_MSG(NoResourcesForWorkerThread), KMP_ERR(status), + KMP_HNT(Decrease_NUM_THREADS), __kmp_msg_null); + } + KMP_SYSFAIL("pthread_create", status); + } + + th->th.th_info.ds.ds_thread = handle; + +#ifdef KMP_THREAD_ATTR + status = pthread_attr_destroy(&thread_attr); + if (status) { + kmp_msg_t err_code = KMP_ERR(status); + __kmp_msg(kmp_ms_warning, KMP_MSG(CantDestroyThreadAttrs), err_code, + __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } +#endif /* KMP_THREAD_ATTR */ + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KA_TRACE(10, ("__kmp_create_worker: done creating thread (%d)\n", gtid)); + +} // __kmp_create_worker + +#if KMP_USE_MONITOR +void __kmp_create_monitor(kmp_info_t *th) { + pthread_t handle; + pthread_attr_t thread_attr; + size_t size; + int status; + int auto_adj_size = FALSE; + + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { + // We don't need monitor thread in case of MAX_BLOCKTIME + KA_TRACE(10, ("__kmp_create_monitor: skipping monitor thread because of " + "MAX blocktime\n")); + th->th.th_info.ds.ds_tid = 0; // this makes reap_monitor no-op + th->th.th_info.ds.ds_gtid = 0; + return; + } + KA_TRACE(10, ("__kmp_create_monitor: try to create monitor\n")); + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + th->th.th_info.ds.ds_tid = KMP_GTID_MONITOR; + th->th.th_info.ds.ds_gtid = KMP_GTID_MONITOR; +#if KMP_REAL_TIME_FIX + TCW_4(__kmp_global.g.g_time.dt.t_value, + -1); // Will use it for synchronization a bit later. +#else + TCW_4(__kmp_global.g.g_time.dt.t_value, 0); +#endif // KMP_REAL_TIME_FIX + +#ifdef KMP_THREAD_ATTR + if (__kmp_monitor_stksize == 0) { + __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE; + auto_adj_size = TRUE; + } + status = pthread_attr_init(&thread_attr); + if (status != 0) { + __kmp_fatal(KMP_MSG(CantInitThreadAttrs), KMP_ERR(status), __kmp_msg_null); + } + status = pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); + if (status != 0) { + __kmp_fatal(KMP_MSG(CantSetMonitorState), KMP_ERR(status), __kmp_msg_null); + } + +#ifdef _POSIX_THREAD_ATTR_STACKSIZE + status = pthread_attr_getstacksize(&thread_attr, &size); + KMP_CHECK_SYSFAIL("pthread_attr_getstacksize", status); +#else + size = __kmp_sys_min_stksize; +#endif /* _POSIX_THREAD_ATTR_STACKSIZE */ +#endif /* KMP_THREAD_ATTR */ + + if (__kmp_monitor_stksize == 0) { + __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE; + } + if (__kmp_monitor_stksize < __kmp_sys_min_stksize) { + __kmp_monitor_stksize = __kmp_sys_min_stksize; + } + + KA_TRACE(10, ("__kmp_create_monitor: default stacksize = %lu bytes," + "requested stacksize = %lu bytes\n", + size, __kmp_monitor_stksize)); + +retry: + +/* Set stack size for this thread now. 
*/ +#ifdef _POSIX_THREAD_ATTR_STACKSIZE + KA_TRACE(10, ("__kmp_create_monitor: setting stacksize = %lu bytes,", + __kmp_monitor_stksize)); + status = pthread_attr_setstacksize(&thread_attr, __kmp_monitor_stksize); + if (status != 0) { + if (auto_adj_size) { + __kmp_monitor_stksize *= 2; + goto retry; + } + kmp_msg_t err_code = KMP_ERR(status); + __kmp_msg(kmp_ms_warning, // should this be fatal? BB + KMP_MSG(CantSetMonitorStackSize, (long int)__kmp_monitor_stksize), + err_code, KMP_HNT(ChangeMonitorStackSize), __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } +#endif /* _POSIX_THREAD_ATTR_STACKSIZE */ + + status = + pthread_create(&handle, &thread_attr, __kmp_launch_monitor, (void *)th); + + if (status != 0) { +#ifdef _POSIX_THREAD_ATTR_STACKSIZE + if (status == EINVAL) { + if (auto_adj_size && (__kmp_monitor_stksize < (size_t)0x40000000)) { + __kmp_monitor_stksize *= 2; + goto retry; + } + __kmp_fatal(KMP_MSG(CantSetMonitorStackSize, __kmp_monitor_stksize), + KMP_ERR(status), KMP_HNT(IncreaseMonitorStackSize), + __kmp_msg_null); + } + if (status == ENOMEM) { + __kmp_fatal(KMP_MSG(CantSetMonitorStackSize, __kmp_monitor_stksize), + KMP_ERR(status), KMP_HNT(DecreaseMonitorStackSize), + __kmp_msg_null); + } +#endif /* _POSIX_THREAD_ATTR_STACKSIZE */ + if (status == EAGAIN) { + __kmp_fatal(KMP_MSG(NoResourcesForMonitorThread), KMP_ERR(status), + KMP_HNT(DecreaseNumberOfThreadsInUse), __kmp_msg_null); + } + KMP_SYSFAIL("pthread_create", status); + } + + th->th.th_info.ds.ds_thread = handle; + +#if KMP_REAL_TIME_FIX + // Wait for the monitor thread is really started and set its *priority*. + KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == + sizeof(__kmp_global.g.g_time.dt.t_value)); + __kmp_wait_4((kmp_uint32 volatile *)&__kmp_global.g.g_time.dt.t_value, -1, + &__kmp_neq_4, NULL); +#endif // KMP_REAL_TIME_FIX + +#ifdef KMP_THREAD_ATTR + status = pthread_attr_destroy(&thread_attr); + if (status != 0) { + kmp_msg_t err_code = KMP_ERR(status); + __kmp_msg(kmp_ms_warning, KMP_MSG(CantDestroyThreadAttrs), err_code, + __kmp_msg_null); + if (__kmp_generate_warnings == kmp_warnings_off) { + __kmp_str_free(&err_code.str); + } + } +#endif + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KA_TRACE(10, ("__kmp_create_monitor: monitor created %#.8lx\n", + th->th.th_info.ds.ds_thread)); + +} // __kmp_create_monitor +#endif // KMP_USE_MONITOR + +void __kmp_exit_thread(int exit_status) { +#if KMP_OS_WASI +// TODO: the wasm32-wasi-threads target does not yet support pthread_exit. +#else + pthread_exit((void *)(intptr_t)exit_status); +#endif +} // __kmp_exit_thread + +#if KMP_USE_MONITOR +void __kmp_resume_monitor(); + +extern "C" void __kmp_reap_monitor(kmp_info_t *th) { + int status; + void *exit_val; + + KA_TRACE(10, ("__kmp_reap_monitor: try to reap monitor thread with handle" + " %#.8lx\n", + th->th.th_info.ds.ds_thread)); + + // If monitor has been created, its tid and gtid should be KMP_GTID_MONITOR. + // If both tid and gtid are 0, it means the monitor did not ever start. + // If both tid and gtid are KMP_GTID_DNE, the monitor has been shut down. + KMP_DEBUG_ASSERT(th->th.th_info.ds.ds_tid == th->th.th_info.ds.ds_gtid); + if (th->th.th_info.ds.ds_gtid != KMP_GTID_MONITOR) { + KA_TRACE(10, ("__kmp_reap_monitor: monitor did not start, returning\n")); + return; + } + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + /* First, check to see whether the monitor thread exists to wake it up. 
This + is to avoid performance problem when the monitor sleeps during + blocktime-size interval */ + + status = pthread_kill(th->th.th_info.ds.ds_thread, 0); + if (status != ESRCH) { + __kmp_resume_monitor(); // Wake up the monitor thread + } + KA_TRACE(10, ("__kmp_reap_monitor: try to join with monitor\n")); + status = pthread_join(th->th.th_info.ds.ds_thread, &exit_val); + if (exit_val != th) { + __kmp_fatal(KMP_MSG(ReapMonitorError), KMP_ERR(status), __kmp_msg_null); + } + + th->th.th_info.ds.ds_tid = KMP_GTID_DNE; + th->th.th_info.ds.ds_gtid = KMP_GTID_DNE; + + KA_TRACE(10, ("__kmp_reap_monitor: done reaping monitor thread with handle" + " %#.8lx\n", + th->th.th_info.ds.ds_thread)); + + KMP_MB(); /* Flush all pending memory write invalidates. */ +} +#else +// Empty symbol to export (see exports_so.txt) when +// monitor thread feature is disabled +extern "C" void __kmp_reap_monitor(kmp_info_t *th) { + (void)th; +} +#endif // KMP_USE_MONITOR + +void __kmp_reap_worker(kmp_info_t *th) { + int status; + void *exit_val; + + KMP_MB(); /* Flush all pending memory write invalidates. */ + + KA_TRACE( + 10, ("__kmp_reap_worker: try to reap T#%d\n", th->th.th_info.ds.ds_gtid)); + + status = pthread_join(th->th.th_info.ds.ds_thread, &exit_val); +#ifdef KMP_DEBUG + /* Don't expose these to the user until we understand when they trigger */ + if (status != 0) { + __kmp_fatal(KMP_MSG(ReapWorkerError), KMP_ERR(status), __kmp_msg_null); + } + if (exit_val != th) { + KA_TRACE(10, ("__kmp_reap_worker: worker T#%d did not reap properly, " + "exit_val = %p\n", + th->th.th_info.ds.ds_gtid, exit_val)); + } +#else + (void)status; // unused variable +#endif /* KMP_DEBUG */ + + KA_TRACE(10, ("__kmp_reap_worker: done reaping T#%d\n", + th->th.th_info.ds.ds_gtid)); + + KMP_MB(); /* Flush all pending memory write invalidates. */ +} + +#if KMP_HANDLE_SIGNALS + +static void __kmp_null_handler(int signo) { + // Do nothing, for doing SIG_IGN-type actions. +} // __kmp_null_handler + +static void __kmp_team_handler(int signo) { + if (__kmp_global.g.g_abort == 0) { +/* Stage 1 signal handler, let's shut down all of the threads */ +#ifdef KMP_DEBUG + __kmp_debug_printf("__kmp_team_handler: caught signal = %d\n", signo); +#endif + switch (signo) { + case SIGHUP: + case SIGINT: + case SIGQUIT: + case SIGILL: + case SIGABRT: + case SIGFPE: + case SIGBUS: + case SIGSEGV: +#ifdef SIGSYS + case SIGSYS: +#endif + case SIGTERM: + if (__kmp_debug_buf) { + __kmp_dump_debug_buffer(); + } + __kmp_unregister_library(); // cleanup shared memory + KMP_MB(); // Flush all pending memory write invalidates. + TCW_4(__kmp_global.g.g_abort, signo); + KMP_MB(); // Flush all pending memory write invalidates. + TCW_4(__kmp_global.g.g_done, TRUE); + KMP_MB(); // Flush all pending memory write invalidates. + break; + default: +#ifdef KMP_DEBUG + __kmp_debug_printf("__kmp_team_handler: unknown signal type"); +#endif + break; + } + } +} // __kmp_team_handler + +static void __kmp_sigaction(int signum, const struct sigaction *act, + struct sigaction *oldact) { + int rc = sigaction(signum, act, oldact); + KMP_CHECK_SYSFAIL_ERRNO("sigaction", rc); +} + +static void __kmp_install_one_handler(int sig, sig_func_t handler_func, + int parallel_init) { + KMP_MB(); // Flush all pending memory write invalidates. 
+ KB_TRACE(60, + ("__kmp_install_one_handler( %d, ..., %d )\n", sig, parallel_init)); + if (parallel_init) { + struct sigaction new_action; + struct sigaction old_action; + new_action.sa_handler = handler_func; + new_action.sa_flags = 0; + sigfillset(&new_action.sa_mask); + __kmp_sigaction(sig, &new_action, &old_action); + if (old_action.sa_handler == __kmp_sighldrs[sig].sa_handler) { + sigaddset(&__kmp_sigset, sig); + } else { + // Restore/keep user's handler if one previously installed. + __kmp_sigaction(sig, &old_action, NULL); + } + } else { + // Save initial/system signal handlers to see if user handlers installed. + __kmp_sigaction(sig, NULL, &__kmp_sighldrs[sig]); + } + KMP_MB(); // Flush all pending memory write invalidates. +} // __kmp_install_one_handler + +static void __kmp_remove_one_handler(int sig) { + KB_TRACE(60, ("__kmp_remove_one_handler( %d )\n", sig)); + if (sigismember(&__kmp_sigset, sig)) { + struct sigaction old; + KMP_MB(); // Flush all pending memory write invalidates. + __kmp_sigaction(sig, &__kmp_sighldrs[sig], &old); + if ((old.sa_handler != __kmp_team_handler) && + (old.sa_handler != __kmp_null_handler)) { + // Restore the users signal handler. + KB_TRACE(10, ("__kmp_remove_one_handler: oops, not our handler, " + "restoring: sig=%d\n", + sig)); + __kmp_sigaction(sig, &old, NULL); + } + sigdelset(&__kmp_sigset, sig); + KMP_MB(); // Flush all pending memory write invalidates. + } +} // __kmp_remove_one_handler + +void __kmp_install_signals(int parallel_init) { + KB_TRACE(10, ("__kmp_install_signals( %d )\n", parallel_init)); + if (__kmp_handle_signals || !parallel_init) { + // If ! parallel_init, we do not install handlers, just save original + // handlers. Let us do it even __handle_signals is 0. + sigemptyset(&__kmp_sigset); + __kmp_install_one_handler(SIGHUP, __kmp_team_handler, parallel_init); + __kmp_install_one_handler(SIGINT, __kmp_team_handler, parallel_init); + __kmp_install_one_handler(SIGQUIT, __kmp_team_handler, parallel_init); + __kmp_install_one_handler(SIGILL, __kmp_team_handler, parallel_init); + __kmp_install_one_handler(SIGABRT, __kmp_team_handler, parallel_init); + __kmp_install_one_handler(SIGFPE, __kmp_team_handler, parallel_init); + __kmp_install_one_handler(SIGBUS, __kmp_team_handler, parallel_init); + __kmp_install_one_handler(SIGSEGV, __kmp_team_handler, parallel_init); +#ifdef SIGSYS + __kmp_install_one_handler(SIGSYS, __kmp_team_handler, parallel_init); +#endif // SIGSYS + __kmp_install_one_handler(SIGTERM, __kmp_team_handler, parallel_init); +#ifdef SIGPIPE + __kmp_install_one_handler(SIGPIPE, __kmp_team_handler, parallel_init); +#endif // SIGPIPE + } +} // __kmp_install_signals + +void __kmp_remove_signals(void) { + int sig; + KB_TRACE(10, ("__kmp_remove_signals()\n")); + for (sig = 1; sig < NSIG; ++sig) { + __kmp_remove_one_handler(sig); + } +} // __kmp_remove_signals + +#endif // KMP_HANDLE_SIGNALS + +void __kmp_enable(int new_state) { +#ifdef KMP_CANCEL_THREADS + int status, old_state; + status = pthread_setcancelstate(new_state, &old_state); + KMP_CHECK_SYSFAIL("pthread_setcancelstate", status); + KMP_DEBUG_ASSERT(old_state == PTHREAD_CANCEL_DISABLE); +#endif +} + +void __kmp_disable(int *old_state) { +#ifdef KMP_CANCEL_THREADS + int status; + status = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, old_state); + KMP_CHECK_SYSFAIL("pthread_setcancelstate", status); +#endif +} + +static void __kmp_atfork_prepare(void) { + __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); + __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); +} + 
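The prepare handler above and the parent/child handlers that follow are wired together by pthread_atfork() in __kmp_register_atfork() further down. A minimal standalone sketch of that POSIX pattern, offered only as an illustration (g_lock, prepare, atfork_parent, atfork_child are hypothetical names, not part of this patch):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;

/* prepare runs in the parent just before fork(); it takes every lock the
   library owns so the child never inherits a mutex held by another thread. */
static void prepare(void) { pthread_mutex_lock(&g_lock); }

/* parent and child both run immediately after fork() and release the locks. */
static void atfork_parent(void) { pthread_mutex_unlock(&g_lock); }
static void atfork_child(void) { pthread_mutex_unlock(&g_lock); }

int main(void) {
  /* Register once; the handlers then run around every subsequent fork(). */
  if (pthread_atfork(prepare, atfork_parent, atfork_child) != 0) {
    perror("pthread_atfork");
    return 1;
  }
  pid_t pid = fork();
  if (pid == 0)
    _exit(0); /* the child sees g_lock in a consistent, unlocked state */
  return 0;
}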
+static void __kmp_atfork_parent(void) { + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); +} + +/* Reset the library so execution in the child starts "all over again" with + clean data structures in initial states. Don't worry about freeing memory + allocated by parent, just abandon it to be safe. */ +static void __kmp_atfork_child(void) { + __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); + __kmp_release_bootstrap_lock(&__kmp_initz_lock); + /* TODO make sure this is done right for nested/sibling */ + // ATT: Memory leaks are here? TODO: Check it and fix. + /* KMP_ASSERT( 0 ); */ + + ++__kmp_fork_count; + +#if KMP_AFFINITY_SUPPORTED +#if KMP_OS_LINUX || KMP_OS_FREEBSD + // reset the affinity in the child to the initial thread + // affinity in the parent + kmp_set_thread_affinity_mask_initial(); +#endif + // Set default not to bind threads tightly in the child (we're expecting + // over-subscription after the fork and this can improve things for + // scripting languages that use OpenMP inside process-parallel code). + if (__kmp_nested_proc_bind.bind_types != NULL) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; + } + for (kmp_affinity_t *affinity : __kmp_affinities) + *affinity = KMP_AFFINITY_INIT(affinity->env_var); + __kmp_affin_fullMask = nullptr; + __kmp_affin_origMask = nullptr; + __kmp_topology = nullptr; +#endif // KMP_AFFINITY_SUPPORTED + +#if KMP_USE_MONITOR + __kmp_init_monitor = 0; +#endif + __kmp_init_parallel = FALSE; + __kmp_init_middle = FALSE; + __kmp_init_serial = FALSE; + TCW_4(__kmp_init_gtid, FALSE); + __kmp_init_common = FALSE; + + TCW_4(__kmp_init_user_locks, FALSE); +#if !KMP_USE_DYNAMIC_LOCK + __kmp_user_lock_table.used = 1; + __kmp_user_lock_table.allocated = 0; + __kmp_user_lock_table.table = NULL; + __kmp_lock_blocks = NULL; +#endif + + __kmp_all_nth = 0; + TCW_4(__kmp_nth, 0); + + __kmp_thread_pool = NULL; + __kmp_thread_pool_insert_pt = NULL; + __kmp_team_pool = NULL; + + /* Must actually zero all the *cache arguments passed to __kmpc_threadprivate + here so threadprivate doesn't use stale data */ + KA_TRACE(10, ("__kmp_atfork_child: checking cache address list %p\n", + __kmp_threadpriv_cache_list)); + + while (__kmp_threadpriv_cache_list != NULL) { + + if (*__kmp_threadpriv_cache_list->addr != NULL) { + KC_TRACE(50, ("__kmp_atfork_child: zeroing cache at address %p\n", + &(*__kmp_threadpriv_cache_list->addr))); + + *__kmp_threadpriv_cache_list->addr = NULL; + } + __kmp_threadpriv_cache_list = __kmp_threadpriv_cache_list->next; + } + + __kmp_init_runtime = FALSE; + + /* reset statically initialized locks */ + __kmp_init_bootstrap_lock(&__kmp_initz_lock); + __kmp_init_bootstrap_lock(&__kmp_stdio_lock); + __kmp_init_bootstrap_lock(&__kmp_console_lock); + __kmp_init_bootstrap_lock(&__kmp_task_team_lock); + +#if USE_ITT_BUILD + __kmp_itt_reset(); // reset ITT's global state +#endif /* USE_ITT_BUILD */ + + { + // Child process often get terminated without any use of OpenMP. That might + // cause mapped shared memory file to be left unattended. Thus we postpone + // library registration till middle initialization in the child process. + __kmp_need_register_serial = FALSE; + __kmp_serial_initialize(); + } + + /* This is necessary to make sure no stale data is left around */ + /* AC: customers complain that we use unsafe routines in the atfork + handler. Mathworks: dlsym() is unsafe. We call dlsym and dlopen + in dynamic_link when check the presence of shared tbbmalloc library. 
+ Suggestion is to make the library initialization lazier, similar + to what done for __kmpc_begin(). */ + // TODO: synchronize all static initializations with regular library + // startup; look at kmp_global.cpp and etc. + //__kmp_internal_begin (); +} + +void __kmp_register_atfork(void) { + if (__kmp_need_register_atfork) { +#if !KMP_OS_WASI + int status = pthread_atfork(__kmp_atfork_prepare, __kmp_atfork_parent, + __kmp_atfork_child); + KMP_CHECK_SYSFAIL("pthread_atfork", status); +#endif + __kmp_need_register_atfork = FALSE; + } +} + +void __kmp_suspend_initialize(void) { + int status; + status = pthread_mutexattr_init(&__kmp_suspend_mutex_attr); + KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status); + status = pthread_condattr_init(&__kmp_suspend_cond_attr); + KMP_CHECK_SYSFAIL("pthread_condattr_init", status); +} + +void __kmp_suspend_initialize_thread(kmp_info_t *th) { + int old_value = KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count); + int new_value = __kmp_fork_count + 1; + // Return if already initialized + if (old_value == new_value) + return; + // Wait, then return if being initialized + if (old_value == -1 || !__kmp_atomic_compare_store( + &th->th.th_suspend_init_count, old_value, -1)) { + while (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) != new_value) { + KMP_CPU_PAUSE(); + } + } else { + // Claim to be the initializer and do initializations + int status; + status = pthread_cond_init(&th->th.th_suspend_cv.c_cond, + &__kmp_suspend_cond_attr); + KMP_CHECK_SYSFAIL("pthread_cond_init", status); + status = pthread_mutex_init(&th->th.th_suspend_mx.m_mutex, + &__kmp_suspend_mutex_attr); + KMP_CHECK_SYSFAIL("pthread_mutex_init", status); + KMP_ATOMIC_ST_REL(&th->th.th_suspend_init_count, new_value); + } +} + +void __kmp_suspend_uninitialize_thread(kmp_info_t *th) { + if (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) > __kmp_fork_count) { + /* this means we have initialize the suspension pthread objects for this + thread in this instance of the process */ + int status; + + status = pthread_cond_destroy(&th->th.th_suspend_cv.c_cond); + if (status != 0 && status != EBUSY) { + KMP_SYSFAIL("pthread_cond_destroy", status); + } + status = pthread_mutex_destroy(&th->th.th_suspend_mx.m_mutex); + if (status != 0 && status != EBUSY) { + KMP_SYSFAIL("pthread_mutex_destroy", status); + } + --th->th.th_suspend_init_count; + KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count) == + __kmp_fork_count); + } +} + +// return true if lock obtained, false otherwise +int __kmp_try_suspend_mx(kmp_info_t *th) { + return (pthread_mutex_trylock(&th->th.th_suspend_mx.m_mutex) == 0); +} + +void __kmp_lock_suspend_mx(kmp_info_t *th) { + int status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); + KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); +} + +void __kmp_unlock_suspend_mx(kmp_info_t *th) { + int status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); + KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); +} + +/* This routine puts the calling thread to sleep after setting the + sleep bit for the indicated flag variable to true. 
*/ +template +static inline void __kmp_suspend_template(int th_gtid, C *flag) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_suspend); + kmp_info_t *th = __kmp_threads[th_gtid]; + int status; + typename C::flag_t old_spin; + + KF_TRACE(30, ("__kmp_suspend_template: T#%d enter for flag = %p\n", th_gtid, + flag->get())); + + __kmp_suspend_initialize_thread(th); + + __kmp_lock_suspend_mx(th); + + KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n", + th_gtid, flag->get())); + + /* TODO: shouldn't this use release semantics to ensure that + __kmp_suspend_initialize_thread gets called first? */ + old_spin = flag->set_sleeping(); + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + th->th.th_sleep_loc_type = flag->get_type(); + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + __kmp_pause_status != kmp_soft_paused) { + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + __kmp_unlock_suspend_mx(th); + return; + } + KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%x," + " was %x\n", + th_gtid, flag->get(), flag->load(), old_spin)); + + if (flag->done_check_val(old_spin) || flag->done_check()) { + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + KF_TRACE(5, ("__kmp_suspend_template: T#%d false alarm, reset sleep bit " + "for spin(%p)\n", + th_gtid, flag->get())); + } else { + /* Encapsulate in a loop as the documentation states that this may + "with low probability" return when the condition variable has + not been signaled or broadcast */ + int deactivated = FALSE; + + while (flag->is_sleeping()) { +#ifdef DEBUG_SUSPEND + char buffer[128]; + __kmp_suspend_count++; + __kmp_print_cond(buffer, &th->th.th_suspend_cv); + __kmp_printf("__kmp_suspend_template: suspending T#%d: %s\n", th_gtid, + buffer); +#endif + // Mark the thread as no longer active (only in the first iteration of the + // loop). 
+ if (!deactivated) { + th->th.th_active = FALSE; + if (th->th.th_active_in_pool) { + th->th.th_active_in_pool = FALSE; + KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); + KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); + } + deactivated = TRUE; + } + + KMP_DEBUG_ASSERT(th->th.th_sleep_loc); + KMP_DEBUG_ASSERT(flag->get_type() == th->th.th_sleep_loc_type); + +#if USE_SUSPEND_TIMEOUT + struct timespec now; + struct timeval tval; + int msecs; + + status = gettimeofday(&tval, NULL); + KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); + TIMEVAL_TO_TIMESPEC(&tval, &now); + + msecs = (4 * __kmp_dflt_blocktime) + 200; + now.tv_sec += msecs / 1000; + now.tv_nsec += (msecs % 1000) * 1000; + + KF_TRACE(15, ("__kmp_suspend_template: T#%d about to perform " + "pthread_cond_timedwait\n", + th_gtid)); + status = pthread_cond_timedwait(&th->th.th_suspend_cv.c_cond, + &th->th.th_suspend_mx.m_mutex, &now); +#else + KF_TRACE(15, ("__kmp_suspend_template: T#%d about to perform" + " pthread_cond_wait\n", + th_gtid)); + status = pthread_cond_wait(&th->th.th_suspend_cv.c_cond, + &th->th.th_suspend_mx.m_mutex); +#endif // USE_SUSPEND_TIMEOUT + + if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) { + KMP_SYSFAIL("pthread_cond_wait", status); + } + + KMP_DEBUG_ASSERT(flag->get_type() == flag->get_ptr_type()); + + if (!flag->is_sleeping() && + ((status == EINTR) || (status == ETIMEDOUT))) { + // if interrupt or timeout, and thread is no longer sleeping, we need to + // make sure sleep_loc gets reset; however, this shouldn't be needed if + // we woke up with resume + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + } +#ifdef KMP_DEBUG + if (status == ETIMEDOUT) { + if (flag->is_sleeping()) { + KF_TRACE(100, + ("__kmp_suspend_template: T#%d timeout wakeup\n", th_gtid)); + } else { + KF_TRACE(2, ("__kmp_suspend_template: T#%d timeout wakeup, sleep bit " + "not set!\n", + th_gtid)); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + } + } else if (flag->is_sleeping()) { + KF_TRACE(100, + ("__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid)); + } +#endif + } // while + + // Mark the thread as active again (if it was previous marked as inactive) + if (deactivated) { + th->th.th_active = TRUE; + if (TCR_4(th->th.th_in_pool)) { + KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); + th->th.th_active_in_pool = TRUE; + } + } + } + // We may have had the loop variable set before entering the loop body; + // so we need to reset sleep_loc. 
+ TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + + KMP_DEBUG_ASSERT(!flag->is_sleeping()); + KMP_DEBUG_ASSERT(!th->th.th_sleep_loc); +#ifdef DEBUG_SUSPEND + { + char buffer[128]; + __kmp_print_cond(buffer, &th->th.th_suspend_cv); + __kmp_printf("__kmp_suspend_template: T#%d has awakened: %s\n", th_gtid, + buffer); + } +#endif + + __kmp_unlock_suspend_mx(th); + KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid)); +} + +template +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { + __kmp_suspend_template(th_gtid, flag); +} +template +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { + __kmp_suspend_template(th_gtid, flag); +} +template +void __kmp_atomic_suspend_64(int th_gtid, kmp_atomic_flag_64 *flag) { + __kmp_suspend_template(th_gtid, flag); +} +void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { + __kmp_suspend_template(th_gtid, flag); +} + +template void __kmp_suspend_32(int, kmp_flag_32 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void __kmp_suspend_64(int, kmp_flag_64 *); +template void +__kmp_atomic_suspend_64(int, kmp_atomic_flag_64 *); +template void +__kmp_atomic_suspend_64(int, kmp_atomic_flag_64 *); + +/* This routine signals the thread specified by target_gtid to wake up + after setting the sleep bit indicated by the flag argument to FALSE. + The target thread must already have called __kmp_suspend_template() */ +template +static inline void __kmp_resume_template(int target_gtid, C *flag) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume); + kmp_info_t *th = __kmp_threads[target_gtid]; + int status; + +#ifdef KMP_DEBUG + int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; +#endif + + KF_TRACE(30, ("__kmp_resume_template: T#%d wants to wakeup T#%d enter\n", + gtid, target_gtid)); + KMP_DEBUG_ASSERT(gtid != target_gtid); + + __kmp_suspend_initialize_thread(th); + + __kmp_lock_suspend_mx(th); + + if (!flag || flag != th->th.th_sleep_loc) { + // coming from __kmp_null_resume_wrapper, or thread is now sleeping on a + // different location; wake up at new location + flag = (C *)CCAST(void *, th->th.th_sleep_loc); + } + + // First, check if the flag is null or its type has changed. If so, someone + // else woke it up. + if (!flag) { // Thread doesn't appear to be sleeping on anything + KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " + "awake: flag(%p)\n", + gtid, target_gtid, (void *)NULL)); + __kmp_unlock_suspend_mx(th); + return; + } else if (flag->get_type() != th->th.th_sleep_loc_type) { + // Flag type does not appear to match this function template; possibly the + // thread is sleeping on something else. Try null resume again. 
+ KF_TRACE( + 5, + ("__kmp_resume_template: T#%d retrying, thread T#%d Mismatch flag(%p), " + "spin(%p) type=%d ptr_type=%d\n", + gtid, target_gtid, flag, flag->get(), flag->get_type(), + th->th.th_sleep_loc_type)); + __kmp_unlock_suspend_mx(th); + __kmp_null_resume_wrapper(th); + return; + } else { // if multiple threads are sleeping, flag should be internally + // referring to a specific thread here + if (!flag->is_sleeping()) { + KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already " + "awake: flag(%p): %u\n", + gtid, target_gtid, flag->get(), (unsigned int)flag->load())); + __kmp_unlock_suspend_mx(th); + return; + } + } + KMP_DEBUG_ASSERT(flag); + flag->unset_sleeping(); + TCW_PTR(th->th.th_sleep_loc, NULL); + th->th.th_sleep_loc_type = flag_unset; + + KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset " + "sleep bit for flag's loc(%p): %u\n", + gtid, target_gtid, flag->get(), (unsigned int)flag->load())); + +#ifdef DEBUG_SUSPEND + { + char buffer[128]; + __kmp_print_cond(buffer, &th->th.th_suspend_cv); + __kmp_printf("__kmp_resume_template: T#%d resuming T#%d: %s\n", gtid, + target_gtid, buffer); + } +#endif + status = pthread_cond_signal(&th->th.th_suspend_cv.c_cond); + KMP_CHECK_SYSFAIL("pthread_cond_signal", status); + __kmp_unlock_suspend_mx(th); + KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up" + " for T#%d\n", + gtid, target_gtid)); +} + +template +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { + __kmp_resume_template(target_gtid, flag); +} +template +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { + __kmp_resume_template(target_gtid, flag); +} +template +void __kmp_atomic_resume_64(int target_gtid, kmp_atomic_flag_64 *flag) { + __kmp_resume_template(target_gtid, flag); +} +void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { + __kmp_resume_template(target_gtid, flag); +} + +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_32(int, kmp_flag_32 *); +template void __kmp_resume_64(int, kmp_flag_64 *); +template void +__kmp_atomic_resume_64(int, kmp_atomic_flag_64 *); + +#if KMP_USE_MONITOR +void __kmp_resume_monitor() { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume); + int status; +#ifdef KMP_DEBUG + int gtid = TCR_4(__kmp_init_gtid) ? 
__kmp_get_gtid() : -1; + KF_TRACE(30, ("__kmp_resume_monitor: T#%d wants to wakeup T#%d enter\n", gtid, + KMP_GTID_MONITOR)); + KMP_DEBUG_ASSERT(gtid != KMP_GTID_MONITOR); +#endif + status = pthread_mutex_lock(&__kmp_wait_mx.m_mutex); + KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); +#ifdef DEBUG_SUSPEND + { + char buffer[128]; + __kmp_print_cond(buffer, &__kmp_wait_cv.c_cond); + __kmp_printf("__kmp_resume_monitor: T#%d resuming T#%d: %s\n", gtid, + KMP_GTID_MONITOR, buffer); + } +#endif + status = pthread_cond_signal(&__kmp_wait_cv.c_cond); + KMP_CHECK_SYSFAIL("pthread_cond_signal", status); + status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex); + KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); + KF_TRACE(30, ("__kmp_resume_monitor: T#%d exiting after signaling wake up" + " for T#%d\n", + gtid, KMP_GTID_MONITOR)); +} +#endif // KMP_USE_MONITOR + +void __kmp_yield() { sched_yield(); } + +void __kmp_gtid_set_specific(int gtid) { + if (__kmp_init_gtid) { + int status; + status = pthread_setspecific(__kmp_gtid_threadprivate_key, + (void *)(intptr_t)(gtid + 1)); + KMP_CHECK_SYSFAIL("pthread_setspecific", status); + } else { + KA_TRACE(50, ("__kmp_gtid_set_specific: runtime shutdown, returning\n")); + } +} + +int __kmp_gtid_get_specific() { + int gtid; + if (!__kmp_init_gtid) { + KA_TRACE(50, ("__kmp_gtid_get_specific: runtime shutdown, returning " + "KMP_GTID_SHUTDOWN\n")); + return KMP_GTID_SHUTDOWN; + } + gtid = (int)(size_t)pthread_getspecific(__kmp_gtid_threadprivate_key); + if (gtid == 0) { + gtid = KMP_GTID_DNE; + } else { + gtid--; + } + KA_TRACE(50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n", + __kmp_gtid_threadprivate_key, gtid)); + return gtid; +} + +double __kmp_read_cpu_time(void) { + /*clock_t t;*/ + struct tms buffer; + + /*t =*/times(&buffer); + + return (double)(buffer.tms_utime + buffer.tms_cutime) / + (double)CLOCKS_PER_SEC; +} + +int __kmp_read_system_info(struct kmp_sys_info *info) { + int status; + struct rusage r_usage; + + memset(info, 0, sizeof(*info)); + + status = getrusage(RUSAGE_SELF, &r_usage); + KMP_CHECK_SYSFAIL_ERRNO("getrusage", status); + +#if !KMP_OS_WASI + // The maximum resident set size utilized (in kilobytes) + info->maxrss = r_usage.ru_maxrss; + // The number of page faults serviced without any I/O + info->minflt = r_usage.ru_minflt; + // The number of page faults serviced that required I/O + info->majflt = r_usage.ru_majflt; + // The number of times a process was "swapped" out of memory + info->nswap = r_usage.ru_nswap; + // The number of times the file system had to perform input + info->inblock = r_usage.ru_inblock; + // The number of times the file system had to perform output + info->oublock = r_usage.ru_oublock; + // The number of times a context switch was voluntarily + info->nvcsw = r_usage.ru_nvcsw; + // The number of times a context switch was forced + info->nivcsw = r_usage.ru_nivcsw; +#endif + + return (status != 0); +} + +void __kmp_read_system_time(double *delta) { + double t_ns; + struct timeval tval; + struct timespec stop; + int status; + + status = gettimeofday(&tval, NULL); + KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); + TIMEVAL_TO_TIMESPEC(&tval, &stop); + t_ns = (double)(TS2NS(stop) - TS2NS(__kmp_sys_timer_data.start)); + *delta = (t_ns * 1e-9); +} + +void __kmp_clear_system_time(void) { + struct timeval tval; + int status; + status = gettimeofday(&tval, NULL); + KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); + TIMEVAL_TO_TIMESPEC(&tval, &__kmp_sys_timer_data.start); +} + +static int __kmp_get_xproc(void) { + + int r 
= 0; + +#if KMP_OS_LINUX + + __kmp_type_convert(sysconf(_SC_NPROCESSORS_CONF), &(r)); + +#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || \ + KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX + + __kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r)); + +#elif KMP_OS_DARWIN + + // Bug C77011 High "OpenMP Threads and number of active cores". + + // Find the number of available CPUs. + kern_return_t rc; + host_basic_info_data_t info; + mach_msg_type_number_t num = HOST_BASIC_INFO_COUNT; + rc = host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&info, &num); + if (rc == 0 && num == HOST_BASIC_INFO_COUNT) { + // Cannot use KA_TRACE() here because this code works before trace support + // is initialized. + r = info.avail_cpus; + } else { + KMP_WARNING(CantGetNumAvailCPU); + KMP_INFORM(AssumedNumCPU); + } + +#else + +#error "Unknown or unsupported OS." + +#endif + + return r > 0 ? r : 2; /* guess value of 2 if OS told us 0 */ + +} // __kmp_get_xproc + +int __kmp_read_from_file(char const *path, char const *format, ...) { + int result; + va_list args; + + va_start(args, format); + FILE *f = fopen(path, "rb"); + if (f == NULL) { + va_end(args); + return 0; + } + result = vfscanf(f, format, args); + fclose(f); + va_end(args); + + return result; +} + +void __kmp_runtime_initialize(void) { + int status; + pthread_mutexattr_t mutex_attr; + pthread_condattr_t cond_attr; + + if (__kmp_init_runtime) { + return; + } + +#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) + if (!__kmp_cpuinfo.initialized) { + __kmp_query_cpuid(&__kmp_cpuinfo); + } +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + + __kmp_xproc = __kmp_get_xproc(); + +#if !KMP_32_BIT_ARCH + struct rlimit rlim; + // read stack size of calling thread, save it as default for worker threads; + // this should be done before reading environment variables + status = getrlimit(RLIMIT_STACK, &rlim); + if (status == 0) { // success? + __kmp_stksize = rlim.rlim_cur; + __kmp_check_stksize(&__kmp_stksize); // check value and adjust if needed + } +#endif /* KMP_32_BIT_ARCH */ + + if (sysconf(_SC_THREADS)) { + + /* Query the maximum number of threads */ + __kmp_type_convert(sysconf(_SC_THREAD_THREADS_MAX), &(__kmp_sys_max_nth)); +#ifdef __ve__ + if (__kmp_sys_max_nth == -1) { + // VE's pthread supports only up to 64 threads per a VE process. + // So we use that KMP_MAX_NTH (predefined as 64) here. 
+ __kmp_sys_max_nth = KMP_MAX_NTH; + } +#else + if (__kmp_sys_max_nth == -1) { + /* Unlimited threads for NPTL */ + __kmp_sys_max_nth = INT_MAX; + } else if (__kmp_sys_max_nth <= 1) { + /* Can't tell, just use PTHREAD_THREADS_MAX */ + __kmp_sys_max_nth = KMP_MAX_NTH; + } +#endif + + /* Query the minimum stack size */ + __kmp_sys_min_stksize = sysconf(_SC_THREAD_STACK_MIN); + if (__kmp_sys_min_stksize <= 1) { + __kmp_sys_min_stksize = KMP_MIN_STKSIZE; + } + } + + /* Set up minimum number of threads to switch to TLS gtid */ + __kmp_tls_gtid_min = KMP_TLS_GTID_MIN; + + status = pthread_key_create(&__kmp_gtid_threadprivate_key, + __kmp_internal_end_dest); + KMP_CHECK_SYSFAIL("pthread_key_create", status); + status = pthread_mutexattr_init(&mutex_attr); + KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status); + status = pthread_mutex_init(&__kmp_wait_mx.m_mutex, &mutex_attr); + KMP_CHECK_SYSFAIL("pthread_mutex_init", status); + status = pthread_mutexattr_destroy(&mutex_attr); + KMP_CHECK_SYSFAIL("pthread_mutexattr_destroy", status); + status = pthread_condattr_init(&cond_attr); + KMP_CHECK_SYSFAIL("pthread_condattr_init", status); + status = pthread_cond_init(&__kmp_wait_cv.c_cond, &cond_attr); + KMP_CHECK_SYSFAIL("pthread_cond_init", status); + status = pthread_condattr_destroy(&cond_attr); + KMP_CHECK_SYSFAIL("pthread_condattr_destroy", status); +#if USE_ITT_BUILD + __kmp_itt_initialize(); +#endif /* USE_ITT_BUILD */ + + __kmp_init_runtime = TRUE; +} + +void __kmp_runtime_destroy(void) { + int status; + + if (!__kmp_init_runtime) { + return; // Nothing to do. + } + +#if USE_ITT_BUILD + __kmp_itt_destroy(); +#endif /* USE_ITT_BUILD */ + + status = pthread_key_delete(__kmp_gtid_threadprivate_key); + KMP_CHECK_SYSFAIL("pthread_key_delete", status); + + status = pthread_mutex_destroy(&__kmp_wait_mx.m_mutex); + if (status != 0 && status != EBUSY) { + KMP_SYSFAIL("pthread_mutex_destroy", status); + } + status = pthread_cond_destroy(&__kmp_wait_cv.c_cond); + if (status != 0 && status != EBUSY) { + KMP_SYSFAIL("pthread_cond_destroy", status); + } +#if KMP_AFFINITY_SUPPORTED + __kmp_affinity_uninitialize(); +#endif + + __kmp_init_runtime = FALSE; +} + +/* Put the thread to sleep for a time period */ +/* NOTE: not currently used anywhere */ +void __kmp_thread_sleep(int millis) { sleep((millis + 500) / 1000); } + +/* Calculate the elapsed wall clock time for the user */ +void __kmp_elapsed(double *t) { + int status; +#ifdef FIX_SGI_CLOCK + struct timespec ts; + + status = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + KMP_CHECK_SYSFAIL_ERRNO("clock_gettime", status); + *t = + (double)ts.tv_nsec * (1.0 / (double)KMP_NSEC_PER_SEC) + (double)ts.tv_sec; +#else + struct timeval tv; + + status = gettimeofday(&tv, NULL); + KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); + *t = + (double)tv.tv_usec * (1.0 / (double)KMP_USEC_PER_SEC) + (double)tv.tv_sec; +#endif +} + +/* Calculate the elapsed wall clock tick for the user */ +void __kmp_elapsed_tick(double *t) { *t = 1 / (double)CLOCKS_PER_SEC; } + +/* Return the current time stamp in nsec */ +kmp_uint64 __kmp_now_nsec() { + struct timeval t; + gettimeofday(&t, NULL); + kmp_uint64 nsec = (kmp_uint64)KMP_NSEC_PER_SEC * (kmp_uint64)t.tv_sec + + (kmp_uint64)1000 * (kmp_uint64)t.tv_usec; + return nsec; +} + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +/* Measure clock ticks per millisecond */ +void __kmp_initialize_system_tick() { + kmp_uint64 now, nsec2, diff; + kmp_uint64 delay = 1000000; // ~450 usec on most machines. 
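+  // Busy-wait below until the hardware timestamp advances by `delay` ticks,
+  // measure the elapsed wall-clock time with __kmp_now_nsec(), and derive
+  // __kmp_ticks_per_usec / __kmp_ticks_per_msec from the ratio.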
+ kmp_uint64 nsec = __kmp_now_nsec(); + kmp_uint64 goal = __kmp_hardware_timestamp() + delay; + while ((now = __kmp_hardware_timestamp()) < goal) + ; + nsec2 = __kmp_now_nsec(); + diff = nsec2 - nsec; + if (diff > 0) { + double tpus = 1000.0 * (double)(delay + (now - goal)) / (double)diff; + if (tpus > 0.0) { + __kmp_ticks_per_msec = (kmp_uint64)(tpus * 1000.0); + __kmp_ticks_per_usec = (kmp_uint64)tpus; + } + } +} +#endif + +/* Determine whether the given address is mapped into the current address + space. */ + +int __kmp_is_address_mapped(void *addr) { + + int found = 0; + int rc; + +#if defined(__COSMOPOLITAN__) + + (void)rc; + found = kisdangerous(addr); + +#elif KMP_OS_LINUX || KMP_OS_HURD + + /* On GNUish OSes, read the /proc//maps pseudo-file to get all the + address ranges mapped into the address space. */ + + char *name = __kmp_str_format("/proc/%d/maps", getpid()); + FILE *file = NULL; + + file = fopen(name, "r"); + KMP_ASSERT(file != NULL); + + for (;;) { + + void *beginning = NULL; + void *ending = NULL; + char perms[5]; + + rc = fscanf(file, "%p-%p %4s %*[^\n]\n", &beginning, &ending, perms); + if (rc == EOF) { + break; + } + KMP_ASSERT(rc == 3 && + KMP_STRLEN(perms) == 4); // Make sure all fields are read. + + // Ending address is not included in the region, but beginning is. + if ((addr >= beginning) && (addr < ending)) { + perms[2] = 0; // 3th and 4th character does not matter. + if (strcmp(perms, "rw") == 0) { + // Memory we are looking for should be readable and writable. + found = 1; + } + break; + } + } + + // Free resources. + fclose(file); + KMP_INTERNAL_FREE(name); +#elif KMP_OS_FREEBSD + char *buf; + size_t lstsz; + int mib[] = {CTL_KERN, KERN_PROC, KERN_PROC_VMMAP, getpid()}; + rc = sysctl(mib, 4, NULL, &lstsz, NULL, 0); + if (rc < 0) + return 0; + // We pass from number of vm entry's semantic + // to size of whole entry map list. + lstsz = lstsz * 4 / 3; + buf = reinterpret_cast(kmpc_malloc(lstsz)); + rc = sysctl(mib, 4, buf, &lstsz, NULL, 0); + if (rc < 0) { + kmpc_free(buf); + return 0; + } + + char *lw = buf; + char *up = buf + lstsz; + + while (lw < up) { + struct kinfo_vmentry *cur = reinterpret_cast(lw); + size_t cursz = cur->kve_structsize; + if (cursz == 0) + break; + void *start = reinterpret_cast(cur->kve_start); + void *end = reinterpret_cast(cur->kve_end); + // Readable/Writable addresses within current map entry + if ((addr >= start) && (addr < end)) { + if ((cur->kve_protection & KVME_PROT_READ) != 0 && + (cur->kve_protection & KVME_PROT_WRITE) != 0) { + found = 1; + break; + } + } + lw += cursz; + } + kmpc_free(buf); + +#elif KMP_OS_DARWIN + + /* On OS X*, /proc pseudo filesystem is not available. Try to read memory + using vm interface. */ + + int buffer; + vm_size_t count; + rc = vm_read_overwrite( + mach_task_self(), // Task to read memory of. + (vm_address_t)(addr), // Address to read from. + 1, // Number of bytes to be read. + (vm_address_t)(&buffer), // Address of buffer to save read bytes in. + &count // Address of var to save number of read bytes in. + ); + if (rc == 0) { + // Memory successfully read. 
+ found = 1; + } + +#elif KMP_OS_NETBSD + + int mib[5]; + mib[0] = CTL_VM; + mib[1] = VM_PROC; + mib[2] = VM_PROC_MAP; + mib[3] = getpid(); + mib[4] = sizeof(struct kinfo_vmentry); + + size_t size; + rc = sysctl(mib, __arraycount(mib), NULL, &size, NULL, 0); + KMP_ASSERT(!rc); + KMP_ASSERT(size); + + size = size * 4 / 3; + struct kinfo_vmentry *kiv = (struct kinfo_vmentry *)KMP_INTERNAL_MALLOC(size); + KMP_ASSERT(kiv); + + rc = sysctl(mib, __arraycount(mib), kiv, &size, NULL, 0); + KMP_ASSERT(!rc); + KMP_ASSERT(size); + + for (size_t i = 0; i < size; i++) { + if (kiv[i].kve_start >= (uint64_t)addr && + kiv[i].kve_end <= (uint64_t)addr) { + found = 1; + break; + } + } + KMP_INTERNAL_FREE(kiv); +#elif KMP_OS_OPENBSD + + int mib[3]; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC_VMMAP; + mib[2] = getpid(); + + size_t size; + uint64_t end; + rc = sysctl(mib, 3, NULL, &size, NULL, 0); + KMP_ASSERT(!rc); + KMP_ASSERT(size); + end = size; + + struct kinfo_vmentry kiv = {.kve_start = 0}; + + while ((rc = sysctl(mib, 3, &kiv, &size, NULL, 0)) == 0) { + KMP_ASSERT(size); + if (kiv.kve_end == end) + break; + + if (kiv.kve_start >= (uint64_t)addr && kiv.kve_end <= (uint64_t)addr) { + found = 1; + break; + } + kiv.kve_start += 1; + } +#elif KMP_OS_WASI + found = (int)addr < (__builtin_wasm_memory_size(0) * PAGESIZE); +#elif KMP_OS_DRAGONFLY || KMP_OS_SOLARIS || KMP_OS_AIX + + // FIXME(DragonFly, Solaris, AIX): Implement this + found = 1; + +#else + +#error "Unknown or unsupported OS" + +#endif + + return found; + +} // __kmp_is_address_mapped + +#ifdef USE_LOAD_BALANCE + +#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_SOLARIS + +// The function returns the rounded value of the system load average +// during given time interval which depends on the value of +// __kmp_load_balance_interval variable (default is 60 sec, other values +// may be 300 sec or 900 sec). +// It returns -1 in case of error. +int __kmp_get_load_balance(int max) { + double averages[3]; + int ret_avg = 0; + + int res = getloadavg(averages, 3); + + // Check __kmp_load_balance_interval to determine which of averages to use. + // getloadavg() may return the number of samples less than requested that is + // less than 3. + if (__kmp_load_balance_interval < 180 && (res >= 1)) { + ret_avg = (int)averages[0]; // 1 min + } else if ((__kmp_load_balance_interval >= 180 && + __kmp_load_balance_interval < 600) && + (res >= 2)) { + ret_avg = (int)averages[1]; // 5 min + } else if ((__kmp_load_balance_interval >= 600) && (res == 3)) { + ret_avg = (int)averages[2]; // 15 min + } else { // Error occurred + return -1; + } + + return ret_avg; +} + +#else // Linux* OS + +// The function returns number of running (not sleeping) threads, or -1 in case +// of error. Error could be reported if Linux* OS kernel too old (without +// "/proc" support). Counting running threads stops if max running threads +// encountered. +int __kmp_get_load_balance(int max) { + static int permanent_error = 0; + static int glb_running_threads = 0; // Saved count of the running threads for + // the thread balance algorithm + static double glb_call_time = 0; /* Thread balance algorithm call time */ + + int running_threads = 0; // Number of running threads in the system. + + DIR *proc_dir = NULL; // Handle of "/proc/" directory. + struct dirent *proc_entry = NULL; + + kmp_str_buf_t task_path; // "/proc//task//" path. + DIR *task_dir = NULL; // Handle of "/proc//task//" directory. 
+ struct dirent *task_entry = NULL;
+ int task_path_fixed_len;
+
+ kmp_str_buf_t stat_path; // "/proc/<pid>/task/<tid>/stat" path.
+ int stat_file = -1;
+ int stat_path_fixed_len;
+
+#ifdef KMP_DEBUG
+ int total_processes = 0; // Total number of processes in system.
+#endif
+
+ double call_time = 0.0;
+
+ __kmp_str_buf_init(&task_path);
+ __kmp_str_buf_init(&stat_path);
+
+ __kmp_elapsed(&call_time);
+
+ if (glb_call_time &&
+ (call_time - glb_call_time < __kmp_load_balance_interval)) {
+ running_threads = glb_running_threads;
+ goto finish;
+ }
+
+ glb_call_time = call_time;
+
+ // Do not spend time on scanning "/proc/" if we have a permanent error.
+ if (permanent_error) {
+ running_threads = -1;
+ goto finish;
+ }
+
+ if (max <= 0) {
+ max = INT_MAX;
+ }
+
+ // Open "/proc/" directory.
+ proc_dir = opendir("/proc");
+ if (proc_dir == NULL) {
+ // Cannot open "/proc/". Probably the kernel does not support it. Return an
+ // error now and in subsequent calls.
+ running_threads = -1;
+ permanent_error = 1;
+ goto finish;
+ }
+
+ // Initialize the fixed part of task_path. This part will not change.
+ __kmp_str_buf_cat(&task_path, "/proc/", 6);
+ task_path_fixed_len = task_path.used; // Remember number of used characters.
+
+ proc_entry = readdir(proc_dir);
+ while (proc_entry != NULL) {
+#if KMP_OS_AIX
+ // Proc entry name starts with a digit. Assume it is a process' directory.
+ if (isdigit(proc_entry->d_name[0])) {
+#else
+ // Proc entry is a directory and its name starts with a digit. Assume it is
+ // a process' directory.
+ if (proc_entry->d_type == DT_DIR && isdigit(proc_entry->d_name[0])) {
+#endif
+
+#ifdef KMP_DEBUG
+ ++total_processes;
+#endif
+ // Make sure the init process is the very first in "/proc", so we can
+ // replace strcmp( proc_entry->d_name, "1" ) == 0 with the simpler
+ // total_processes == 1. We are going to check that total_processes == 1
+ // => d_name == "1" is true (where "=>" is implication). Since C++ does
+ // not have an => operator, let us replace it with its equivalent:
+ // a => b == ! a || b.
+ KMP_DEBUG_ASSERT(total_processes != 1 ||
+ strcmp(proc_entry->d_name, "1") == 0);
+
+ // Construct task_path.
+ task_path.used = task_path_fixed_len; // Reset task_path to "/proc/".
+ __kmp_str_buf_cat(&task_path, proc_entry->d_name,
+ KMP_STRLEN(proc_entry->d_name));
+ __kmp_str_buf_cat(&task_path, "/task", 5);
+
+ task_dir = opendir(task_path.str);
+ if (task_dir == NULL) {
+ // A process can finish between reading the "/proc/" directory entry and
+ // opening the process' "task/" directory. So, in the general case we
+ // should not complain, but have to skip this process and read the next
+ // one. But on systems with no "task/" support we would spend a lot of
+ // time scanning the "/proc/" tree again and again without any benefit.
+ // The "init" process (its pid is 1) should always exist, so, if we
+ // cannot open the "/proc/1/task/" directory, it means "task/" is not
+ // supported by the kernel. Report an error now and in the future.
+ if (strcmp(proc_entry->d_name, "1") == 0) {
+ running_threads = -1;
+ permanent_error = 1;
+ goto finish;
+ }
+ } else {
+ // Construct the fixed part of the stat file path.
+ __kmp_str_buf_clear(&stat_path);
+ __kmp_str_buf_cat(&stat_path, task_path.str, task_path.used);
+ __kmp_str_buf_cat(&stat_path, "/", 1);
+ stat_path_fixed_len = stat_path.used;
+
+ task_entry = readdir(task_dir);
+ while (task_entry != NULL) {
+ // It is a directory and its name starts with a digit.
+#if KMP_OS_AIX
+ if (isdigit(task_entry->d_name[0])) {
+#else
+ if (proc_entry->d_type == DT_DIR && isdigit(task_entry->d_name[0])) {
+#endif
+
+ // Construct the complete stat file path. The easiest way would be:
+ // __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str,
+ // task_entry->d_name );
+ // but a series of __kmp_str_buf_cat calls works a bit faster.
+ stat_path.used =
+ stat_path_fixed_len; // Reset stat path to its fixed part.
+ __kmp_str_buf_cat(&stat_path, task_entry->d_name,
+ KMP_STRLEN(task_entry->d_name));
+ __kmp_str_buf_cat(&stat_path, "/stat", 5);
+
+ // Note: Low-level API (open/read/close) is used. High-level API
+ // (fopen/fclose) works ~ 30 % slower.
+ stat_file = open(stat_path.str, O_RDONLY);
+ if (stat_file == -1) {
+ // We cannot report an error because the task (thread) can terminate
+ // just before reading this file.
+ } else {
+ /* Content of the "stat" file looks like:
+ 24285 (program) S ...
+
+ It is a single line (if the program name does not include funny
+ symbols). The first number is a thread id, then the name of the
+ executable file in parentheses, then the state of the thread. We
+ need just the thread state.
+
+ Good news: Length of the program name is 15 characters max. Longer
+ names are truncated.
+
+ Thus, we need a rather short buffer: 15 chars for program name +
+ 2 parentheses, + 3 spaces + ~7 digits of pid = 37.
+
+ Bad news: The program name may contain special symbols like space,
+ closing parenthesis, or even new line. This makes parsing the
+ "stat" file not 100 % reliable. In case of funny program names
+ parsing may fail (report an incorrect thread state).
+
+ Parsing the "status" file looks more promising (due to a different
+ file structure and escaping of special symbols) but reading and
+ parsing of the "status" file works slower.
+ -- ln
+ */
+ char buffer[65];
+ ssize_t len;
+ len = read(stat_file, buffer, sizeof(buffer) - 1);
+ if (len >= 0) {
+ buffer[len] = 0;
+ // Using scanf:
+ // sscanf( buffer, "%*d (%*s) %c ", & state );
+ // looks very nice, but searching for a closing parenthesis
+ // works a bit faster.
+ char *close_parent = strstr(buffer, ") ");
+ if (close_parent != NULL) {
+ char state = *(close_parent + 2);
+ if (state == 'R') {
+ ++running_threads;
+ if (running_threads >= max) {
+ goto finish;
+ }
+ }
+ }
+ }
+ close(stat_file);
+ stat_file = -1;
+ }
+ }
+ task_entry = readdir(task_dir);
+ }
+ closedir(task_dir);
+ task_dir = NULL;
+ }
+ }
+ proc_entry = readdir(proc_dir);
+ }
+
+ // There _might_ be a timing hole where the thread executing this
+ // code gets skipped in the load balance, and running_threads is 0.
+ // Assert in the debug builds only!!!
+ KMP_DEBUG_ASSERT(running_threads > 0);
+ if (running_threads <= 0) {
+ running_threads = 1;
+ }
+
+finish: // Clean up and exit.
+ if (proc_dir != NULL) {
+ closedir(proc_dir);
+ }
+ __kmp_str_buf_free(&task_path);
+ if (task_dir != NULL) {
+ closedir(task_dir);
+ }
+ __kmp_str_buf_free(&stat_path);
+ if (stat_file != -1) {
+ close(stat_file);
+ }
+
+ glb_running_threads = running_threads;
+
+ return running_threads;
+
+} // __kmp_get_load_balance
+
+#endif // KMP_OS_DARWIN
+
+#endif // USE_LOAD_BALANCE
+
+#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \
+ ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \
+ KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF)
+
+// we really only need the case with 1 argument, because CLANG always builds
+// a struct of pointers to shared variables referenced in the outlined function
+int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+ void *p_argv[]
+#if OMPT_SUPPORT
+ ,
+ void **exit_frame_ptr
+#endif
+) {
+#if OMPT_SUPPORT
+ *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+#endif
+
+ switch (argc) {
+ default:
+ fprintf(stderr, "Too many args to microtask: %d!\n", argc);
+ fflush(stderr);
+ exit(-1);
+ case 0:
+ (*pkfn)(&gtid, &tid);
+ break;
+ case 1:
+ (*pkfn)(&gtid, &tid, p_argv[0]);
+ break;
+ case 2:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+ break;
+ case 3:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+ break;
+ case 4:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+ break;
+ case 5:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+ break;
+ case 6:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5]);
+ break;
+ case 7:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6]);
+ break;
+ case 8:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7]);
+ break;
+ case 9:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
+ break;
+ case 10:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
+ break;
+ case 11:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
+ break;
+ case 12:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11]);
+ break;
+ case 13:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12]);
+ break;
+ case 14:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12], p_argv[13]);
+ break;
+ case 15:
+ (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
+ p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
+ break;
+ }
+
+ return 1;
+}
+
+#endif
+
+#if KMP_OS_LINUX
+// Functions for hidden helper task
+namespace {
+// Condition variable for initializing hidden helper team
+pthread_cond_t hidden_helper_threads_initz_cond_var;
+pthread_mutex_t hidden_helper_threads_initz_lock;
+volatile int hidden_helper_initz_signaled = FALSE;
+
+// Condition variable for deinitializing hidden helper team
+pthread_cond_t hidden_helper_threads_deinitz_cond_var;
+pthread_mutex_t hidden_helper_threads_deinitz_lock;
+volatile int hidden_helper_deinitz_signaled = FALSE;
+
+// Condition variable for the wrapper function of main thread
+pthread_cond_t hidden_helper_main_thread_cond_var;
+pthread_mutex_t hidden_helper_main_thread_lock;
+volatile int hidden_helper_main_thread_signaled = FALSE;
+
+// Semaphore for worker threads. We don't use a condition variable here
+// because, when multiple signals are sent at the same time, only one thread
+// might be woken.
+sem_t hidden_helper_task_sem;
+} // namespace
+
+void __kmp_hidden_helper_worker_thread_wait() {
+ int status = sem_wait(&hidden_helper_task_sem);
+ KMP_CHECK_SYSFAIL("sem_wait", status);
+}
+
+void __kmp_do_initialize_hidden_helper_threads() {
+ // Initialize condition variables
+ int status =
+ pthread_cond_init(&hidden_helper_threads_initz_cond_var, nullptr);
+ KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+
+ status = pthread_cond_init(&hidden_helper_threads_deinitz_cond_var, nullptr);
+ KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+
+ status = pthread_cond_init(&hidden_helper_main_thread_cond_var, nullptr);
+ KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+
+ status = pthread_mutex_init(&hidden_helper_threads_initz_lock, nullptr);
+ KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+
+ status = pthread_mutex_init(&hidden_helper_threads_deinitz_lock, nullptr);
+ KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+
+ status = pthread_mutex_init(&hidden_helper_main_thread_lock, nullptr);
+ KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+
+ // Initialize the semaphore
+ status = sem_init(&hidden_helper_task_sem, 0, 0);
+ KMP_CHECK_SYSFAIL("sem_init", status);
+
+ // Create a new thread to finish initialization
+ pthread_t handle;
+ status = pthread_create(
+ &handle, nullptr,
+ [](void *) -> void * {
+ __kmp_hidden_helper_threads_initz_routine();
+ return nullptr;
+ },
+ nullptr);
+ KMP_CHECK_SYSFAIL("pthread_create", status);
+}
+
+void __kmp_hidden_helper_threads_initz_wait() {
+ // The initial thread waits here for the completion of the initialization.
+ // The condition variable will be notified by the main thread of the hidden
+ // helper teams.
+ int status = pthread_mutex_lock(&hidden_helper_threads_initz_lock);
+ KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+ if (!TCR_4(hidden_helper_initz_signaled)) {
+ status = pthread_cond_wait(&hidden_helper_threads_initz_cond_var,
+ &hidden_helper_threads_initz_lock);
+ KMP_CHECK_SYSFAIL("pthread_cond_wait", status);
+ }
+
+ status = pthread_mutex_unlock(&hidden_helper_threads_initz_lock);
+ KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_hidden_helper_initz_release() {
+ // After all initialization, reset __kmp_init_hidden_helper_threads to false.
+ int status = pthread_mutex_lock(&hidden_helper_threads_initz_lock);
+ KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+ status = pthread_cond_signal(&hidden_helper_threads_initz_cond_var);
+ KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+
+ TCW_SYNC_4(hidden_helper_initz_signaled, TRUE);
+
+ status = pthread_mutex_unlock(&hidden_helper_threads_initz_lock);
+ KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+}
+
+void __kmp_hidden_helper_main_thread_wait() {
+ // The main thread of the hidden helper team will be blocked here. The
+ // condition variable can only be signaled in the destructor of the RTL.
+ int status = pthread_mutex_lock(&hidden_helper_main_thread_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + + if (!TCR_4(hidden_helper_main_thread_signaled)) { + status = pthread_cond_wait(&hidden_helper_main_thread_cond_var, + &hidden_helper_main_thread_lock); + KMP_CHECK_SYSFAIL("pthread_cond_wait", status); + } + + status = pthread_mutex_unlock(&hidden_helper_main_thread_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); +} + +void __kmp_hidden_helper_main_thread_release() { + // The initial thread of OpenMP RTL should call this function to wake up the + // main thread of hidden helper team. + int status = pthread_mutex_lock(&hidden_helper_main_thread_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + + status = pthread_cond_signal(&hidden_helper_main_thread_cond_var); + KMP_CHECK_SYSFAIL("pthread_cond_signal", status); + + // The hidden helper team is done here + TCW_SYNC_4(hidden_helper_main_thread_signaled, TRUE); + + status = pthread_mutex_unlock(&hidden_helper_main_thread_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); +} + +void __kmp_hidden_helper_worker_thread_signal() { + int status = sem_post(&hidden_helper_task_sem); + KMP_CHECK_SYSFAIL("sem_post", status); +} + +void __kmp_hidden_helper_threads_deinitz_wait() { + // Initial thread waits here for the completion of the deinitialization. The + // condition variable will be notified by main thread of hidden helper teams. + int status = pthread_mutex_lock(&hidden_helper_threads_deinitz_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + + if (!TCR_4(hidden_helper_deinitz_signaled)) { + status = pthread_cond_wait(&hidden_helper_threads_deinitz_cond_var, + &hidden_helper_threads_deinitz_lock); + KMP_CHECK_SYSFAIL("pthread_cond_wait", status); + } + + status = pthread_mutex_unlock(&hidden_helper_threads_deinitz_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); +} + +void __kmp_hidden_helper_threads_deinitz_release() { + int status = pthread_mutex_lock(&hidden_helper_threads_deinitz_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); + + status = pthread_cond_signal(&hidden_helper_threads_deinitz_cond_var); + KMP_CHECK_SYSFAIL("pthread_cond_wait", status); + + TCW_SYNC_4(hidden_helper_deinitz_signaled, TRUE); + + status = pthread_mutex_unlock(&hidden_helper_threads_deinitz_lock); + KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); +} +#else // KMP_OS_LINUX +void __kmp_hidden_helper_worker_thread_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_do_initialize_hidden_helper_threads() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_hidden_helper_threads_initz_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_hidden_helper_initz_release() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_hidden_helper_main_thread_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_hidden_helper_main_thread_release() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_hidden_helper_worker_thread_signal() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_hidden_helper_threads_deinitz_wait() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} + +void __kmp_hidden_helper_threads_deinitz_release() { + KMP_ASSERT(0 && "Hidden helper task is not supported on this OS"); +} +#endif // 
KMP_OS_LINUX + +bool __kmp_detect_shm() { + DIR *dir = opendir("/dev/shm"); + if (dir) { // /dev/shm exists + closedir(dir); + return true; + } else if (ENOENT == errno) { // /dev/shm does not exist + return false; + } else { // opendir() failed + return false; + } +} + +bool __kmp_detect_tmp() { + DIR *dir = opendir("/tmp"); + if (dir) { // /tmp exists + closedir(dir); + return true; + } else if (ENOENT == errno) { // /tmp does not exist + return false; + } else { // opendir() failed + return false; + } +} + +// end of file // diff --git a/third_party/openmp/util2.S b/third_party/openmp/util2.S new file mode 100644 index 000000000..941a2c33e --- /dev/null +++ b/third_party/openmp/util2.S @@ -0,0 +1,2474 @@ +// z_Linux_asm.S: - microtasking routines specifically +// written for Intel platforms running Linux* OS + +// +////===----------------------------------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// +// + +// ----------------------------------------------------------------------- +// macros +// ----------------------------------------------------------------------- + +#include "kmp_config.h" + +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + +# if KMP_MIC +// the 'delay r16/r32/r64' should be used instead of the 'pause'. +// The delay operation has the effect of removing the current thread from +// the round-robin HT mechanism, and therefore speeds up the issue rate of +// the other threads on the same core. +// +// A value of 0 works fine for <= 2 threads per core, but causes the EPCC +// barrier time to increase greatly for 3 or more threads per core. +// +// A value of 100 works pretty well for up to 4 threads per core, but isn't +// quite as fast as 0 for 2 threads per core. +// +// We need to check what happens for oversubscription / > 4 threads per core. +// It is possible that we need to pass the delay value in as a parameter +// that the caller determines based on the total # threads / # cores. +// +//.macro pause_op +// mov $100, %rax +// delay %rax +//.endm +# else +# define pause_op .byte 0xf3,0x90 +# endif // KMP_MIC + +# if KMP_OS_DARWIN +# define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols +# define KMP_LABEL(x) L_##x // form the name of label +.macro KMP_CFI_DEF_OFFSET +.endmacro +.macro KMP_CFI_OFFSET +.endmacro +.macro KMP_CFI_REGISTER +.endmacro +.macro KMP_CFI_DEF +.endmacro +.macro ALIGN + .align $0 +.endmacro +.macro DEBUG_INFO +/* Not sure what .size does in icc, not sure if we need to do something + similar for OS X*. 
+*/ +.endmacro +.macro PROC + ALIGN 4 + .globl KMP_PREFIX_UNDERSCORE($0) +KMP_PREFIX_UNDERSCORE($0): +.endmacro +# else // KMP_OS_DARWIN +# define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols +// Format labels so that they don't override function names in gdb's backtraces +// MIC assembler doesn't accept .L syntax, the L works fine there (as well as +// on OS X*) +# if KMP_MIC +# define KMP_LABEL(x) L_##x // local label +# else +# define KMP_LABEL(x) .L_##x // local label hidden from backtraces +# endif // KMP_MIC +.macro ALIGN size + .align 1<<(\size) +.endm +.macro DEBUG_INFO proc + .cfi_endproc +// Not sure why we need .type and .size for the functions + .align 16 + .type \proc,@function + .size \proc,.-\proc +.endm +.macro PROC proc + ALIGN 4 + .globl KMP_PREFIX_UNDERSCORE(\proc) +KMP_PREFIX_UNDERSCORE(\proc): + .cfi_startproc +.endm +.macro KMP_CFI_DEF_OFFSET sz + .cfi_def_cfa_offset \sz +.endm +.macro KMP_CFI_OFFSET reg, sz + .cfi_offset \reg,\sz +.endm +.macro KMP_CFI_REGISTER reg + .cfi_def_cfa_register \reg +.endm +.macro KMP_CFI_DEF reg, sz + .cfi_def_cfa \reg,\sz +.endm +# endif // KMP_OS_DARWIN +#endif // KMP_ARCH_X86 || KMP_ARCH_x86_64 + +#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM) + +# if KMP_OS_DARWIN +# define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols +# define KMP_LABEL(x) L_##x // form the name of label + +.macro ALIGN + .align $0 +.endmacro + +.macro DEBUG_INFO +/* Not sure what .size does in icc, not sure if we need to do something + similar for OS X*. +*/ +.endmacro + +.macro PROC + ALIGN 4 + .globl KMP_PREFIX_UNDERSCORE($0) +KMP_PREFIX_UNDERSCORE($0): +.endmacro +# elif KMP_OS_WINDOWS +# define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Windows/ARM64 symbols +// Format labels so that they don't override function names in gdb's backtraces +# define KMP_LABEL(x) .L_##x // local label hidden from backtraces + +.macro ALIGN size + .align 1<<(\size) +.endm + +.macro DEBUG_INFO proc + ALIGN 2 +.endm + +.macro PROC proc + ALIGN 2 + .globl KMP_PREFIX_UNDERSCORE(\proc) +KMP_PREFIX_UNDERSCORE(\proc): +.endm +# else // KMP_OS_DARWIN || KMP_OS_WINDOWS +# define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Linux* OS symbols +// Format labels so that they don't override function names in gdb's backtraces +# define KMP_LABEL(x) .L_##x // local label hidden from backtraces + +.macro ALIGN size + .align 1<<(\size) +.endm + +.macro DEBUG_INFO proc + .cfi_endproc +// Not sure why we need .type and .size for the functions + ALIGN 2 +#if KMP_ARCH_ARM + .type \proc,%function +#else + .type \proc,@function +#endif + .size \proc,.-\proc +.endm + +.macro PROC proc + ALIGN 2 + .globl KMP_PREFIX_UNDERSCORE(\proc) +KMP_PREFIX_UNDERSCORE(\proc): + .cfi_startproc +.endm +# endif // KMP_OS_DARWIN + +#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM) + +.macro COMMON name, size, align_power +#if KMP_OS_DARWIN + .comm \name, \size +#elif KMP_OS_WINDOWS + .comm \name, \size, \align_power +#else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS + .comm \name, \size, (1<<(\align_power)) +#endif +.endm + +// ----------------------------------------------------------------------- +// data +// ----------------------------------------------------------------------- + +#ifdef KMP_GOMP_COMPAT + +// Support for unnamed common blocks. +// +// Because the symbol ".gomp_critical_user_" contains a ".", we have to +// put this stuff in assembly. 
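As a rough C++ sketch of what the per-architecture data below encodes (an illustration only; `gomp_critical_user_storage` is a hypothetical stand-in name, since the real symbol ".gomp_critical_user_" begins with a dot and cannot be spelled in C or C++):

    // 32 bytes of storage for GOMP's unnamed critical section, plus a data
    // object holding its address -- roughly what the assembly below emits.
    alignas(8) static char gomp_critical_user_storage[32];
    void *__kmp_unnamed_critical_addr = gomp_critical_user_storage;
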
+ +# if KMP_ARCH_X86 +# if KMP_OS_DARWIN + .data + .comm .gomp_critical_user_,32 + .data + .globl ___kmp_unnamed_critical_addr +___kmp_unnamed_critical_addr: + .long .gomp_critical_user_ +# else /* Linux* OS */ + .data + .comm .gomp_critical_user_,32,8 + .data + ALIGN 4 + .global __kmp_unnamed_critical_addr +__kmp_unnamed_critical_addr: + .4byte .gomp_critical_user_ + .type __kmp_unnamed_critical_addr,@object + .size __kmp_unnamed_critical_addr,4 +# endif /* KMP_OS_DARWIN */ +# endif /* KMP_ARCH_X86 */ + +# if KMP_ARCH_X86_64 +# if KMP_OS_DARWIN + .data + .comm .gomp_critical_user_,32 + .data + .globl ___kmp_unnamed_critical_addr +___kmp_unnamed_critical_addr: + .quad .gomp_critical_user_ +# else /* Linux* OS */ + .data +#ifdef __COSMOPOLITAN__ + .globl .gomp_critical_user_ +".gomp_critical_user_": + .align 8 + .byte 32 +#else + .comm .gomp_critical_user_,32,8 +#endif + .data + ALIGN 8 + .global __kmp_unnamed_critical_addr +__kmp_unnamed_critical_addr: + .8byte .gomp_critical_user_ + .type __kmp_unnamed_critical_addr,@object + .size __kmp_unnamed_critical_addr,8 +# endif /* KMP_OS_DARWIN */ +# endif /* KMP_ARCH_X86_64 */ + +#endif /* KMP_GOMP_COMPAT */ + + +#if KMP_ARCH_X86 && !KMP_ARCH_PPC64 + +// ----------------------------------------------------------------------- +// microtasking routines specifically written for IA-32 architecture +// running Linux* OS +// ----------------------------------------------------------------------- + + .ident "Intel Corporation" + .data + ALIGN 4 +// void +// __kmp_x86_pause( void ); + + .text + PROC __kmp_x86_pause + + pause_op + ret + + DEBUG_INFO __kmp_x86_pause + +# if !KMP_ASM_INTRINS + +//------------------------------------------------------------------------ +// kmp_int32 +// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d ); + + PROC __kmp_test_then_add32 + + movl 4(%esp), %ecx + movl 8(%esp), %eax + lock + xaddl %eax,(%ecx) + ret + + DEBUG_INFO __kmp_test_then_add32 + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_fixed8 +// +// kmp_int32 +// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d ); +// +// parameters: +// p: 4(%esp) +// d: 8(%esp) +// +// return: %al + PROC __kmp_xchg_fixed8 + + movl 4(%esp), %ecx // "p" + movb 8(%esp), %al // "d" + + lock + xchgb %al,(%ecx) + ret + + DEBUG_INFO __kmp_xchg_fixed8 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_fixed16 +// +// kmp_int16 +// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d ); +// +// parameters: +// p: 4(%esp) +// d: 8(%esp) +// return: %ax + PROC __kmp_xchg_fixed16 + + movl 4(%esp), %ecx // "p" + movw 8(%esp), %ax // "d" + + lock + xchgw %ax,(%ecx) + ret + + DEBUG_INFO __kmp_xchg_fixed16 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_fixed32 +// +// kmp_int32 +// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d ); +// +// parameters: +// p: 4(%esp) +// d: 8(%esp) +// +// return: %eax + PROC __kmp_xchg_fixed32 + + movl 4(%esp), %ecx // "p" + movl 8(%esp), %eax // "d" + + lock + xchgl %eax,(%ecx) + ret + + DEBUG_INFO __kmp_xchg_fixed32 + + +// kmp_int8 +// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); + PROC __kmp_compare_and_store8 + + movl 4(%esp), %ecx + movb 8(%esp), %al + movb 12(%esp), %dl + lock + cmpxchgb %dl,(%ecx) + sete %al // if %al == (%ecx) set %al = 1 else set %al = 0 + and $1, %eax // sign extend previous instruction + ret + + DEBUG_INFO 
__kmp_compare_and_store8 + +// kmp_int16 +// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv); + PROC __kmp_compare_and_store16 + + movl 4(%esp), %ecx + movw 8(%esp), %ax + movw 12(%esp), %dx + lock + cmpxchgw %dx,(%ecx) + sete %al // if %ax == (%ecx) set %al = 1 else set %al = 0 + and $1, %eax // sign extend previous instruction + ret + + DEBUG_INFO __kmp_compare_and_store16 + +// kmp_int32 +// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv); + PROC __kmp_compare_and_store32 + + movl 4(%esp), %ecx + movl 8(%esp), %eax + movl 12(%esp), %edx + lock + cmpxchgl %edx,(%ecx) + sete %al // if %eax == (%ecx) set %al = 1 else set %al = 0 + and $1, %eax // sign extend previous instruction + ret + + DEBUG_INFO __kmp_compare_and_store32 + +// kmp_int32 +// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s ); + PROC __kmp_compare_and_store64 + + pushl %ebp + movl %esp, %ebp + pushl %ebx + pushl %edi + movl 8(%ebp), %edi + movl 12(%ebp), %eax // "cv" low order word + movl 16(%ebp), %edx // "cv" high order word + movl 20(%ebp), %ebx // "sv" low order word + movl 24(%ebp), %ecx // "sv" high order word + lock + cmpxchg8b (%edi) + sete %al // if %edx:eax == (%edi) set %al = 1 else set %al = 0 + and $1, %eax // sign extend previous instruction + popl %edi + popl %ebx + movl %ebp, %esp + popl %ebp + ret + + DEBUG_INFO __kmp_compare_and_store64 + +// kmp_int8 +// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv); + PROC __kmp_compare_and_store_ret8 + + movl 4(%esp), %ecx + movb 8(%esp), %al + movb 12(%esp), %dl + lock + cmpxchgb %dl,(%ecx) + ret + + DEBUG_INFO __kmp_compare_and_store_ret8 + +// kmp_int16 +// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv, +// kmp_int16 sv); + PROC __kmp_compare_and_store_ret16 + + movl 4(%esp), %ecx + movw 8(%esp), %ax + movw 12(%esp), %dx + lock + cmpxchgw %dx,(%ecx) + ret + + DEBUG_INFO __kmp_compare_and_store_ret16 + +// kmp_int32 +// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv, +// kmp_int32 sv); + PROC __kmp_compare_and_store_ret32 + + movl 4(%esp), %ecx + movl 8(%esp), %eax + movl 12(%esp), %edx + lock + cmpxchgl %edx,(%ecx) + ret + + DEBUG_INFO __kmp_compare_and_store_ret32 + +// kmp_int64 +// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv, +// kmp_int64 sv); + PROC __kmp_compare_and_store_ret64 + + pushl %ebp + movl %esp, %ebp + pushl %ebx + pushl %edi + movl 8(%ebp), %edi + movl 12(%ebp), %eax // "cv" low order word + movl 16(%ebp), %edx // "cv" high order word + movl 20(%ebp), %ebx // "sv" low order word + movl 24(%ebp), %ecx // "sv" high order word + lock + cmpxchg8b (%edi) + popl %edi + popl %ebx + movl %ebp, %esp + popl %ebp + ret + + DEBUG_INFO __kmp_compare_and_store_ret64 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_real32 +// +// kmp_real32 +// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data ); +// +// parameters: +// addr: 4(%esp) +// data: 8(%esp) +// +// return: %eax + PROC __kmp_xchg_real32 + + pushl %ebp + movl %esp, %ebp + subl $4, %esp + pushl %esi + + movl 4(%ebp), %esi + flds (%esi) + // load + fsts -4(%ebp) + // store old value + + movl 8(%ebp), %eax + + lock + xchgl %eax, (%esi) + + flds -4(%ebp) + // return old value + + popl %esi + movl %ebp, %esp + popl %ebp + ret + + DEBUG_INFO __kmp_xchg_real32 + +# endif /* !KMP_ASM_INTRINS */ + +//------------------------------------------------------------------------ 
+// int +// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// return 1; +// } + +// -- Begin __kmp_invoke_microtask +// mark_begin; + PROC __kmp_invoke_microtask + + pushl %ebp + KMP_CFI_DEF_OFFSET 8 + KMP_CFI_OFFSET ebp,-8 + movl %esp,%ebp // establish the base pointer for this routine. + KMP_CFI_REGISTER ebp + subl $8,%esp // allocate space for two local variables. + // These varibales are: + // argv: -4(%ebp) + // temp: -8(%ebp) + // + pushl %ebx // save %ebx to use during this routine + // +#if OMPT_SUPPORT + movl 28(%ebp),%ebx // get exit_frame address + movl %ebp,(%ebx) // save exit_frame +#endif + + movl 20(%ebp),%ebx // Stack alignment - # args + addl $2,%ebx // #args +2 Always pass at least 2 args (gtid and tid) + shll $2,%ebx // Number of bytes used on stack: (#args+2)*4 + movl %esp,%eax // + subl %ebx,%eax // %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this + movl %eax,%ebx // Save to %ebx + andl $0xFFFFFF80,%eax // mask off 7 bits + subl %eax,%ebx // Amount to subtract from %esp + subl %ebx,%esp // Prepare the stack ptr -- + // now it will be aligned on 128-byte boundary at the call + + movl 24(%ebp),%eax // copy from p_argv[] + movl %eax,-4(%ebp) // into the local variable *argv. + + movl 20(%ebp),%ebx // argc is 20(%ebp) + shll $2,%ebx + +KMP_LABEL(invoke_2): + cmpl $0,%ebx + jg KMP_LABEL(invoke_4) + jmp KMP_LABEL(invoke_3) + ALIGN 2 +KMP_LABEL(invoke_4): + movl -4(%ebp),%eax + subl $4,%ebx // decrement argc. + addl %ebx,%eax // index into argv. + movl (%eax),%edx + pushl %edx + + jmp KMP_LABEL(invoke_2) + ALIGN 2 +KMP_LABEL(invoke_3): + leal 16(%ebp),%eax // push & tid + pushl %eax + + leal 12(%ebp),%eax // push & gtid + pushl %eax + + movl 8(%ebp),%ebx + call *%ebx // call (*pkfn)(); + + movl $1,%eax // return 1; + + movl -12(%ebp),%ebx // restore %ebx + leave + KMP_CFI_DEF esp,4 + ret + + DEBUG_INFO __kmp_invoke_microtask +// -- End __kmp_invoke_microtask + + +// kmp_uint64 +// __kmp_hardware_timestamp(void) + PROC __kmp_hardware_timestamp + rdtsc + ret + + DEBUG_INFO __kmp_hardware_timestamp +// -- End __kmp_hardware_timestamp + +#endif /* KMP_ARCH_X86 */ + + +#if KMP_ARCH_X86_64 + +// ----------------------------------------------------------------------- +// microtasking routines specifically written for IA-32 architecture and +// Intel(R) 64 running Linux* OS +// ----------------------------------------------------------------------- + +// -- Machine type P +// mark_description "Intel Corporation"; + .ident "Intel Corporation" +// -- .file "z_Linux_asm.S" + .data + ALIGN 4 + +// To prevent getting our code into .data section .text added to every routine +// definition for x86_64. 
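The x86_64 routines that follow are compiled only when KMP_ASM_INTRINS is off; they hand-code operations that compiler intrinsics would otherwise provide. A minimal C++ sketch of the semantics of the first two, based on my reading of the assembly (the sequentially consistent memory order is an assumption; the assembly itself just uses lock-prefixed instructions):

    #include <cstdint>
    // __kmp_test_then_add32: atomically add d to *p, return the old value (lock xaddl).
    static int32_t test_then_add32_sketch(volatile int32_t *p, int32_t d) {
      return __atomic_fetch_add(p, d, __ATOMIC_SEQ_CST);
    }
    // __kmp_xchg_fixed32: atomically store d into *p, return the old value (lock xchgl).
    static int32_t xchg_fixed32_sketch(volatile int32_t *p, int32_t d) {
      return __atomic_exchange_n(p, d, __ATOMIC_SEQ_CST);
    }
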
+//------------------------------------------------------------------------ +# if !KMP_ASM_INTRINS + +//------------------------------------------------------------------------ +// FUNCTION __kmp_test_then_add32 +// +// kmp_int32 +// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d ); +// +// parameters: +// p: %rdi +// d: %esi +// +// return: %eax + .text + PROC __kmp_test_then_add32 + + movl %esi, %eax // "d" + lock + xaddl %eax,(%rdi) + ret + + DEBUG_INFO __kmp_test_then_add32 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_test_then_add64 +// +// kmp_int64 +// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d ); +// +// parameters: +// p: %rdi +// d: %rsi +// return: %rax + .text + PROC __kmp_test_then_add64 + + movq %rsi, %rax // "d" + lock + xaddq %rax,(%rdi) + ret + + DEBUG_INFO __kmp_test_then_add64 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_fixed8 +// +// kmp_int32 +// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d ); +// +// parameters: +// p: %rdi +// d: %sil +// +// return: %al + .text + PROC __kmp_xchg_fixed8 + + movb %sil, %al // "d" + + lock + xchgb %al,(%rdi) + ret + + DEBUG_INFO __kmp_xchg_fixed8 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_fixed16 +// +// kmp_int16 +// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d ); +// +// parameters: +// p: %rdi +// d: %si +// return: %ax + .text + PROC __kmp_xchg_fixed16 + + movw %si, %ax // "d" + + lock + xchgw %ax,(%rdi) + ret + + DEBUG_INFO __kmp_xchg_fixed16 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_fixed32 +// +// kmp_int32 +// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d ); +// +// parameters: +// p: %rdi +// d: %esi +// +// return: %eax + .text + PROC __kmp_xchg_fixed32 + + movl %esi, %eax // "d" + + lock + xchgl %eax,(%rdi) + ret + + DEBUG_INFO __kmp_xchg_fixed32 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_fixed64 +// +// kmp_int64 +// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d ); +// +// parameters: +// p: %rdi +// d: %rsi +// return: %rax + .text + PROC __kmp_xchg_fixed64 + + movq %rsi, %rax // "d" + + lock + xchgq %rax,(%rdi) + ret + + DEBUG_INFO __kmp_xchg_fixed64 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store8 +// +// kmp_int8 +// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); +// +// parameters: +// p: %rdi +// cv: %esi +// sv: %edx +// +// return: %eax + .text + PROC __kmp_compare_and_store8 + + movb %sil, %al // "cv" + lock + cmpxchgb %dl,(%rdi) + sete %al // if %al == (%rdi) set %al = 1 else set %al = 0 + andq $1, %rax // sign extend previous instruction for return value + ret + + DEBUG_INFO __kmp_compare_and_store8 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store16 +// +// kmp_int16 +// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv ); +// +// parameters: +// p: %rdi +// cv: %si +// sv: %dx +// +// return: %eax + .text + PROC __kmp_compare_and_store16 + + movw %si, %ax // "cv" + lock + cmpxchgw %dx,(%rdi) + sete %al // if %ax == (%rdi) set %al = 1 else set %al = 0 + andq $1, %rax // sign extend previous instruction for return value + ret + + DEBUG_INFO __kmp_compare_and_store16 + + 
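The compare-and-store family comes in two flavours (8/16-bit variants above, 32/64-bit and "_ret" variants below): __kmp_compare_and_store* returns a boolean success flag, while __kmp_compare_and_store_ret* returns the value observed in memory. A hedged C++ sketch of those semantics (the memory order is my assumption, not taken from the assembly):

    #include <cstdint>
    // Boolean flavour: 1 if *p equalled cv and was replaced by sv, else 0.
    static int32_t cas32_bool_sketch(volatile int32_t *p, int32_t cv, int32_t sv) {
      return __atomic_compare_exchange_n(p, &cv, sv, /*weak=*/false,
                                         __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }
    // "_ret" flavour: the old value of *p, whether or not the swap happened.
    static int32_t cas32_ret_sketch(volatile int32_t *p, int32_t cv, int32_t sv) {
      __atomic_compare_exchange_n(p, &cv, sv, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
      return cv; // on failure cv holds the observed value; on success it already equals it
    }
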
+//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store32 +// +// kmp_int32 +// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv ); +// +// parameters: +// p: %rdi +// cv: %esi +// sv: %edx +// +// return: %eax + .text + PROC __kmp_compare_and_store32 + + movl %esi, %eax // "cv" + lock + cmpxchgl %edx,(%rdi) + sete %al // if %eax == (%rdi) set %al = 1 else set %al = 0 + andq $1, %rax // sign extend previous instruction for return value + ret + + DEBUG_INFO __kmp_compare_and_store32 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store64 +// +// kmp_int32 +// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv ); +// +// parameters: +// p: %rdi +// cv: %rsi +// sv: %rdx +// return: %eax + .text + PROC __kmp_compare_and_store64 + + movq %rsi, %rax // "cv" + lock + cmpxchgq %rdx,(%rdi) + sete %al // if %rax == (%rdi) set %al = 1 else set %al = 0 + andq $1, %rax // sign extend previous instruction for return value + ret + + DEBUG_INFO __kmp_compare_and_store64 + +//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store_ret8 +// +// kmp_int8 +// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); +// +// parameters: +// p: %rdi +// cv: %esi +// sv: %edx +// +// return: %eax + .text + PROC __kmp_compare_and_store_ret8 + + movb %sil, %al // "cv" + lock + cmpxchgb %dl,(%rdi) + ret + + DEBUG_INFO __kmp_compare_and_store_ret8 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store_ret16 +// +// kmp_int16 +// __kmp_compare_and_store16_ret( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv ); +// +// parameters: +// p: %rdi +// cv: %si +// sv: %dx +// +// return: %eax + .text + PROC __kmp_compare_and_store_ret16 + + movw %si, %ax // "cv" + lock + cmpxchgw %dx,(%rdi) + ret + + DEBUG_INFO __kmp_compare_and_store_ret16 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store_ret32 +// +// kmp_int32 +// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv ); +// +// parameters: +// p: %rdi +// cv: %esi +// sv: %edx +// +// return: %eax + .text + PROC __kmp_compare_and_store_ret32 + + movl %esi, %eax // "cv" + lock + cmpxchgl %edx,(%rdi) + ret + + DEBUG_INFO __kmp_compare_and_store_ret32 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_compare_and_store_ret64 +// +// kmp_int64 +// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv ); +// +// parameters: +// p: %rdi +// cv: %rsi +// sv: %rdx +// return: %eax + .text + PROC __kmp_compare_and_store_ret64 + + movq %rsi, %rax // "cv" + lock + cmpxchgq %rdx,(%rdi) + ret + + DEBUG_INFO __kmp_compare_and_store_ret64 + +# endif /* !KMP_ASM_INTRINS */ + + +# if !KMP_MIC + +# if !KMP_ASM_INTRINS + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_real32 +// +// kmp_real32 +// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data ); +// +// parameters: +// addr: %rdi +// data: %xmm0 (lower 4 bytes) +// +// return: %xmm0 (lower 4 bytes) + .text + PROC __kmp_xchg_real32 + + movd %xmm0, %eax // load "data" to eax + + lock + xchgl %eax, (%rdi) + + movd %eax, %xmm0 // load old value into return register + + ret + + DEBUG_INFO 
__kmp_xchg_real32 + + +//------------------------------------------------------------------------ +// FUNCTION __kmp_xchg_real64 +// +// kmp_real64 +// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data ); +// +// parameters: +// addr: %rdi +// data: %xmm0 (lower 8 bytes) +// return: %xmm0 (lower 8 bytes) + .text + PROC __kmp_xchg_real64 + + movd %xmm0, %rax // load "data" to rax + + lock + xchgq %rax, (%rdi) + + movd %rax, %xmm0 // load old value into return register + ret + + DEBUG_INFO __kmp_xchg_real64 + + +# endif /* !KMP_MIC */ + +# endif /* !KMP_ASM_INTRINS */ + +//------------------------------------------------------------------------ +// int +// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// return 1; +// } +// +// note: at call to pkfn must have %rsp 128-byte aligned for compiler +// +// parameters: +// %rdi: pkfn +// %esi: gtid +// %edx: tid +// %ecx: argc +// %r8: p_argv +// %r9: &exit_frame +// +// locals: +// __gtid: gtid parm pushed on stack so can pass >id to pkfn +// __tid: tid parm pushed on stack so can pass &tid to pkfn +// +// reg temps: +// %rax: used all over the place +// %rdx: used in stack pointer alignment calculation +// %r11: used to traverse p_argv array +// %rsi: used as temporary for stack parameters +// used as temporary for number of pkfn parms to push +// %rbx: used to hold pkfn address, and zero constant, callee-save +// +// return: %eax (always 1/TRUE) +__gtid = -16 +__tid = -24 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + PROC __kmp_invoke_microtask + + pushq %rbp // save base pointer + KMP_CFI_DEF_OFFSET 16 + KMP_CFI_OFFSET rbp,-16 + movq %rsp,%rbp // establish the base pointer for this routine. 
+ KMP_CFI_REGISTER rbp + +#if OMPT_SUPPORT + movq %rbp, (%r9) // save exit_frame +#endif + + pushq %rbx // %rbx is callee-saved register + pushq %rsi // Put gtid on stack so can pass &tgid to pkfn + pushq %rdx // Put tid on stack so can pass &tid to pkfn + + movq %rcx, %rax // Stack alignment calculation begins; argc -> %rax + movq $0, %rbx // constant for cmovs later + subq $4, %rax // subtract four args passed in registers to pkfn +#if KMP_MIC + js KMP_LABEL(kmp_0) // jump to movq + jmp KMP_LABEL(kmp_0_exit) // jump ahead +KMP_LABEL(kmp_0): + movq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) +KMP_LABEL(kmp_0_exit): +#else + cmovsq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) +#endif // KMP_MIC + + movq %rax, %rsi // save max(0, argc-4) -> %rsi for later + shlq $3, %rax // Number of bytes used on stack: max(0, argc-4)*8 + + movq %rsp, %rdx // + subq %rax, %rdx // %rsp-(max(0,argc-4)*8) -> %rdx -- + // without align, stack ptr would be this + movq %rdx, %rax // Save to %rax + + andq $0xFFFFFFFFFFFFFF80, %rax // mask off lower 7 bits (128 bytes align) + subq %rax, %rdx // Amount to subtract from %rsp + subq %rdx, %rsp // Prepare the stack ptr -- + // now %rsp will align to 128-byte boundary at call site + + // setup pkfn parameter reg and stack + movq %rcx, %rax // argc -> %rax + cmpq $0, %rsi + je KMP_LABEL(kmp_invoke_pass_parms) // jump ahead if no parms to push + shlq $3, %rcx // argc*8 -> %rcx + movq %r8, %rdx // p_argv -> %rdx + addq %rcx, %rdx // &p_argv[argc] -> %rdx + + movq %rsi, %rcx // max (0, argc-4) -> %rcx + +KMP_LABEL(kmp_invoke_push_parms): + // push nth - 7th parms to pkfn on stack + subq $8, %rdx // decrement p_argv pointer to previous parm + movq (%rdx), %rsi // p_argv[%rcx-1] -> %rsi + pushq %rsi // push p_argv[%rcx-1] onto stack (reverse order) + subl $1, %ecx + +// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e +// if the name of the label that is an operand of this jecxz starts with a dot ("."); +// Apple's linker does not support 1-byte length relocation; +// Resolution: replace all .labelX entries with L_labelX. + + jecxz KMP_LABEL(kmp_invoke_pass_parms) // stop when four p_argv[] parms left + jmp KMP_LABEL(kmp_invoke_push_parms) + ALIGN 3 +KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers. + // order here is important to avoid trashing + // registers used for both input and output parms! + movq %rdi, %rbx // pkfn -> %rbx + leaq __gtid(%rbp), %rdi // >id -> %rdi (store 1st parm to pkfn) + leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn) + + movq %r8, %r11 // p_argv -> %r11 + +#if KMP_MIC + cmpq $4, %rax // argc >= 4? + jns KMP_LABEL(kmp_4) // jump to movq + jmp KMP_LABEL(kmp_4_exit) // jump ahead +KMP_LABEL(kmp_4): + movq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) +KMP_LABEL(kmp_4_exit): + + cmpq $3, %rax // argc >= 3? + jns KMP_LABEL(kmp_3) // jump to movq + jmp KMP_LABEL(kmp_3_exit) // jump ahead +KMP_LABEL(kmp_3): + movq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) +KMP_LABEL(kmp_3_exit): + + cmpq $2, %rax // argc >= 2? + jns KMP_LABEL(kmp_2) // jump to movq + jmp KMP_LABEL(kmp_2_exit) // jump ahead +KMP_LABEL(kmp_2): + movq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) +KMP_LABEL(kmp_2_exit): + + cmpq $1, %rax // argc >= 1? 
+ jns KMP_LABEL(kmp_1) // jump to movq + jmp KMP_LABEL(kmp_1_exit) // jump ahead +KMP_LABEL(kmp_1): + movq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) +KMP_LABEL(kmp_1_exit): +#else + cmpq $4, %rax // argc >= 4? + cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) + + cmpq $3, %rax // argc >= 3? + cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) + + cmpq $2, %rax // argc >= 2? + cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) + + cmpq $1, %rax // argc >= 1? + cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) +#endif // KMP_MIC + + call *%rbx // call (*pkfn)(); + movq $1, %rax // move 1 into return register; + + movq -8(%rbp), %rbx // restore %rbx using %rbp since %rsp was modified + movq %rbp, %rsp // restore stack pointer + popq %rbp // restore frame pointer + KMP_CFI_DEF rsp,8 + ret + + DEBUG_INFO __kmp_invoke_microtask +// -- End __kmp_invoke_microtask + +// kmp_uint64 +// __kmp_hardware_timestamp(void) + .text + PROC __kmp_hardware_timestamp + rdtsc + shlq $32, %rdx + orq %rdx, %rax + ret + + DEBUG_INFO __kmp_hardware_timestamp +// -- End __kmp_hardware_timestamp + +//------------------------------------------------------------------------ +// FUNCTION __kmp_bsr32 +// +// int +// __kmp_bsr32( int ); + .text + PROC __kmp_bsr32 + + bsr %edi,%eax + ret + + DEBUG_INFO __kmp_bsr32 + +// ----------------------------------------------------------------------- +#endif /* KMP_ARCH_X86_64 */ + +// ' +#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 + +//------------------------------------------------------------------------ +// int +// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// +// // FIXME: This is done at call-site and can be removed here. +// #if OMPT_SUPPORT +// *exit_frame_ptr = 0; +// #endif +// +// return 1; +// } +// +// parameters: +// x0: pkfn +// w1: gtid +// w2: tid +// w3: argc +// x4: p_argv +// x5: &exit_frame +// +// locals: +// __gtid: gtid parm pushed on stack so can pass >id to pkfn +// __tid: tid parm pushed on stack so can pass &tid to pkfn +// +// reg temps: +// x8: used to hold pkfn address +// w9: used as temporary for number of pkfn parms +// x10: used to traverse p_argv array +// x11: used as temporary for stack placement calculation +// x12: used as temporary for stack parameters +// x19: used to preserve exit_frame_ptr, callee-save +// +// return: w0 (always 1/TRUE) +// + +__gtid = 4 +__tid = 8 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + PROC __kmp_invoke_microtask + + stp x29, x30, [sp, #-16]! +# if OMPT_SUPPORT + stp x19, x20, [sp, #-16]! +# endif + mov x29, sp + + orr w9, wzr, #1 + add w9, w9, w3, lsr #1 + sub sp, sp, w9, uxtw #4 + mov x11, sp + + mov x8, x0 + str w1, [x29, #-__gtid] + str w2, [x29, #-__tid] + mov w9, w3 + mov x10, x4 +# if OMPT_SUPPORT + mov x19, x5 + str x29, [x19] +# endif + + sub x0, x29, #__gtid + sub x1, x29, #__tid + + cbz w9, KMP_LABEL(kmp_1) + ldr x2, [x10] + + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x3, [x10, #8]! + + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x4, [x10, #8]! + + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x5, [x10, #8]! + + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x6, [x10, #8]! 
+ + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x7, [x10, #8]! + +KMP_LABEL(kmp_0): + sub w9, w9, #1 + cbz w9, KMP_LABEL(kmp_1) + ldr x12, [x10, #8]! + str x12, [x11], #8 + b KMP_LABEL(kmp_0) +KMP_LABEL(kmp_1): + blr x8 + orr w0, wzr, #1 + mov sp, x29 +# if OMPT_SUPPORT + str xzr, [x19] + ldp x19, x20, [sp], #16 +# endif + ldp x29, x30, [sp], #16 + ret + + DEBUG_INFO __kmp_invoke_microtask +// -- End __kmp_invoke_microtask + +#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */ + +#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM + +//------------------------------------------------------------------------ +// int +// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// +// // FIXME: This is done at call-site and can be removed here. +// #if OMPT_SUPPORT +// *exit_frame_ptr = 0; +// #endif +// +// return 1; +// } +// +// parameters: +// r0: pkfn +// r1: gtid +// r2: tid +// r3: argc +// r4(stack): p_argv +// r5(stack): &exit_frame +// +// locals: +// __gtid: gtid parm pushed on stack so can pass >id to pkfn +// __tid: tid parm pushed on stack so can pass &tid to pkfn +// +// reg temps: +// r4: used to hold pkfn address +// r5: used as temporary for number of pkfn parms +// r6: used to traverse p_argv array +// r7: frame pointer (in some configurations) +// r8: used as temporary for stack placement calculation +// and as pointer to base of callee saved area +// r9: used as temporary for stack parameters +// r10: used to preserve exit_frame_ptr, callee-save +// r11: frame pointer (in some configurations) +// +// return: r0 (always 1/TRUE) +// + +__gtid = 4 +__tid = 8 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + PROC __kmp_invoke_microtask + + // Pushing one extra register (r3) to keep the stack aligned + // for when we call pkfn below + push {r3-r11,lr} + // Load p_argv and &exit_frame + ldr r4, [sp, #10*4] +# if OMPT_SUPPORT + ldr r5, [sp, #11*4] +# endif + +# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS) +# define FP r7 +# define FPOFF 4*4 +#else +# define FP r11 +# define FPOFF 8*4 +#endif + add FP, sp, #FPOFF +# if OMPT_SUPPORT + mov r10, r5 + str FP, [r10] +# endif + mov r8, sp + + // Calculate how much stack to allocate, in increments of 8 bytes. + // We strictly need 4*(argc-2) bytes (2 arguments are passed in + // registers) but allocate 4*argc for simplicity (to avoid needing + // to handle the argc<2 cases). We align the number of bytes + // allocated to 8 bytes, to keep the stack aligned. (Since we + // already allocate more than enough, it's ok to round down + // instead of up for the alignment.) We allocate another extra + // 8 bytes for gtid and tid. + mov r5, #1 + add r5, r5, r3, lsr #1 + sub sp, sp, r5, lsl #3 + + str r1, [r8, #-__gtid] + str r2, [r8, #-__tid] + mov r5, r3 + mov r6, r4 + mov r4, r0 + + // Prepare the first 2 parameters to pkfn - pointers to gtid and tid + // in our stack frame. + sub r0, r8, #__gtid + sub r1, r8, #__tid + + mov r8, sp + + // Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2 + cmp r5, #0 + beq KMP_LABEL(kmp_1) + ldr r2, [r6] + + subs r5, r5, #1 + beq KMP_LABEL(kmp_1) + ldr r3, [r6, #4]! + + // Loop, loading the rest of p_argv and writing the elements on the + // stack. 
+KMP_LABEL(kmp_0): + subs r5, r5, #1 + beq KMP_LABEL(kmp_1) + ldr r12, [r6, #4]! + str r12, [r8], #4 + b KMP_LABEL(kmp_0) +KMP_LABEL(kmp_1): + blx r4 + mov r0, #1 + + sub r4, FP, #FPOFF + mov sp, r4 +# undef FP +# undef FPOFF + +# if OMPT_SUPPORT + mov r1, #0 + str r1, [r10] +# endif + pop {r3-r11,pc} + + DEBUG_INFO __kmp_invoke_microtask +// -- End __kmp_invoke_microtask + +#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */ + +#if KMP_ARCH_PPC64 + +//------------------------------------------------------------------------ +// int +// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// +// // FIXME: This is done at call-site and can be removed here. +// #if OMPT_SUPPORT +// *exit_frame_ptr = 0; +// #endif +// +// return 1; +// } +// +// parameters: +// r3: pkfn +// r4: gtid +// r5: tid +// r6: argc +// r7: p_argv +// r8: &exit_frame +// +// return: r3 (always 1/TRUE) +// + .text +# if KMP_ARCH_PPC64_ELFv2 + .abiversion 2 +# endif + .globl __kmp_invoke_microtask + +# if KMP_ARCH_PPC64_ELFv2 + .p2align 4 +# else + .p2align 2 +# endif + + .type __kmp_invoke_microtask,@function + +# if KMP_ARCH_PPC64_ELFv2 +__kmp_invoke_microtask: +.Lfunc_begin0: +.Lfunc_gep0: + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 2, 2, .TOC.-.Lfunc_gep0@l +.Lfunc_lep0: + .localentry __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0 +# else + .section .opd,"aw",@progbits +__kmp_invoke_microtask: + .p2align 3 + .quad .Lfunc_begin0 + .quad .TOC.@tocbase + .quad 0 + .text +.Lfunc_begin0: +# endif + +// -- Begin __kmp_invoke_microtask +// mark_begin; + +// We need to allocate a stack frame large enough to hold all of the parameters +// on the stack for the microtask plus what this function needs. That's 48 +// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the +// parameters to the microtask, plus 8 bytes to store the values of r4 and r5, +// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes +// to save r30 to hold a copy of r8. + + .cfi_startproc + mflr 0 + std 31, -8(1) + std 0, 16(1) + +// This is unusual because normally we'd set r31 equal to r1 after the stack +// frame is established. In this case, however, we need to dynamically compute +// the stack frame size, and so we keep a direct copy of r1 to access our +// register save areas and restore the r1 value before returning. + mr 31, 1 + .cfi_def_cfa_register r31 + .cfi_offset r31, -8 + .cfi_offset lr, 16 + +// Compute the size necessary for the local stack frame. +# if KMP_ARCH_PPC64_ELFv2 + li 12, 72 +# else + li 12, 88 +# endif + sldi 0, 6, 3 + add 12, 0, 12 + neg 12, 12 + +// We need to make sure that the stack frame stays aligned (to 16 bytes). + li 0, -16 + and 12, 0, 12 + +// Establish the local stack frame. + stdux 1, 1, 12 + +# if OMPT_SUPPORT + .cfi_offset r30, -16 + std 30, -16(31) + std 1, 0(8) + mr 30, 8 +# endif + +// Store gtid and tid to the stack because they're passed by reference to the microtask. 
+ stw 4, -20(31) + stw 5, -24(31) + + mr 12, 6 + mr 4, 7 + + cmpwi 0, 12, 1 + blt 0, .Lcall + + ld 5, 0(4) + + cmpwi 0, 12, 2 + blt 0, .Lcall + + ld 6, 8(4) + + cmpwi 0, 12, 3 + blt 0, .Lcall + + ld 7, 16(4) + + cmpwi 0, 12, 4 + blt 0, .Lcall + + ld 8, 24(4) + + cmpwi 0, 12, 5 + blt 0, .Lcall + + ld 9, 32(4) + + cmpwi 0, 12, 6 + blt 0, .Lcall + + ld 10, 40(4) + + cmpwi 0, 12, 7 + blt 0, .Lcall + +// There are more than 6 microtask parameters, so we need to store the +// remainder to the stack. + addi 12, 12, -6 + mtctr 12 + +// These are set to 8 bytes before the first desired store address (we're using +// pre-increment loads and stores in the loop below). The parameter save area +// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and +// 32 + 8*8 == 96 bytes above r1 for ELFv2. + addi 4, 4, 40 +# if KMP_ARCH_PPC64_ELFv2 + addi 12, 1, 88 +# else + addi 12, 1, 104 +# endif + +.Lnext: + ldu 0, 8(4) + stdu 0, 8(12) + bdnz .Lnext + +.Lcall: +# if KMP_ARCH_PPC64_ELFv2 + std 2, 24(1) + mr 12, 3 +#else + std 2, 40(1) +// For ELFv1, we need to load the actual function address from the function descriptor. + ld 12, 0(3) + ld 2, 8(3) + ld 11, 16(3) +#endif + + addi 3, 31, -20 + addi 4, 31, -24 + + mtctr 12 + bctrl +# if KMP_ARCH_PPC64_ELFv2 + ld 2, 24(1) +# else + ld 2, 40(1) +# endif + +# if OMPT_SUPPORT + li 3, 0 + std 3, 0(30) +# endif + + li 3, 1 + +# if OMPT_SUPPORT + ld 30, -16(31) +# endif + + mr 1, 31 + ld 0, 16(1) + ld 31, -8(1) + mtlr 0 + blr + + .long 0 + .quad 0 +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_PPC64 */ + +#if KMP_ARCH_RISCV64 + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(>id, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// a0: pkfn +// a1: gtid +// a2: tid +// a3: argc +// a4: p_argv +// a5: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass >id to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp. registers: +// +// t0: used to calculate the dynamic stack size / used to hold pkfn address +// t1: used as temporary for stack placement calculation +// t2: used as temporary for stack arguments +// t3: used as temporary for number of remaining pkfn parms +// t4: used to traverse p_argv array +// +// return: a0 (always 1/TRUE) +// + +__gtid = -20 +__tid = -24 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + .p2align 1 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + // First, save ra and fp + addi sp, sp, -16 + sd ra, 8(sp) + sd fp, 0(sp) + addi fp, sp, 16 + .cfi_def_cfa fp, 0 + .cfi_offset ra, -8 + .cfi_offset fp, -16 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by + // reference + // - We need 8 bytes for each argument that cannot be passed to the 'pkfn' + // function by register. 
Given that we have 8 of such registers (a[0-7]) + // and two + 'argc' arguments (consider >id and &tid), we need to + // reserve max(0, argc - 6)*8 extra bytes + // + // The total number of bytes is then max(0, argc - 6)*8 + 8 + + // Compute max(0, argc - 6) using the following bithack: + // max(0, x) = x - (x & (x >> 31)), where x := argc - 6 + // Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + addi t0, a3, -6 + srai t1, t0, 31 + and t1, t0, t1 + sub t0, t0, t1 + + addi t0, t0, 1 + + slli t0, t0, 3 + sub sp, sp, t0 + + // Align the stack to 16 bytes + andi sp, sp, -16 + + mv t0, a0 + mv t3, a3 + mv t4, a4 + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame + sd fp, 0(a5) +#endif + + // Prepare arguments for the pkfn function (first 8 using a0-a7 registers) + + sw a1, __gtid(fp) + sw a2, __tid(fp) + + addi a0, fp, __gtid + addi a1, fp, __tid + + beqz t3, .L_kmp_3 + ld a2, 0(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a3, 8(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a4, 16(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a5, 24(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a6, 32(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a7, 40(t4) + + // Prepare any additional argument passed through the stack + addi t4, t4, 48 + mv t1, sp + j .L_kmp_2 +.L_kmp_1: + ld t2, 0(t4) + sd t2, 0(t1) + addi t4, t4, 8 + addi t1, t1, 8 +.L_kmp_2: + addi t3, t3, -1 + bnez t3, .L_kmp_1 + +.L_kmp_3: + // Call pkfn function + jalr t0 + + // Restore stack and return + + addi a0, zero, 1 + + addi sp, fp, -16 + ld fp, 0(sp) + ld ra, 8(sp) + addi sp, sp, 16 + ret +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_RISCV64 */ + +#if KMP_ARCH_LOONGARCH64 + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(>id, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// a0: pkfn +// a1: gtid +// a2: tid +// a3: argc +// a4: p_argv +// a5: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass >id to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp registers: +// +// t0: used to calculate the dynamic stack size / used to hold pkfn address +// t1: used as temporary for stack placement calculation +// t2: used as temporary for stack arguments +// t3: used as temporary for number of remaining pkfn parms +// t4: used to traverse p_argv array +// +// return: a0 (always 1/TRUE) +// + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + .p2align 2 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + // First, save ra and fp + addi.d $sp, $sp, -16 + st.d $ra, $sp, 8 + st.d $fp, $sp, 0 + addi.d $fp, $sp, 16 + .cfi_def_cfa 22, 0 + .cfi_offset 1, -8 + .cfi_offset 22, -16 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by + // reference + // - We need 8 bytes for each argument that cannot be passed to the 'pkfn' + // function by register. 
Given that we have 8 of such registers (a[0-7]) + // and two + 'argc' arguments (consider >id and &tid), we need to + // reserve max(0, argc - 6)*8 extra bytes + // + // The total number of bytes is then max(0, argc - 6)*8 + 8 + + addi.d $t0, $a3, -6 + slt $t1, $t0, $zero + masknez $t0, $t0, $t1 + addi.d $t0, $t0, 1 + slli.d $t0, $t0, 3 + sub.d $sp, $sp, $t0 + + // Align the stack to 16 bytes + bstrins.d $sp, $zero, 3, 0 + + move $t0, $a0 + move $t3, $a3 + move $t4, $a4 + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame + st.d $fp, $a5, 0 +#endif + + // Prepare arguments for the pkfn function (first 8 using a0-a7 registers) + + st.w $a1, $fp, -20 + st.w $a2, $fp, -24 + + addi.d $a0, $fp, -20 + addi.d $a1, $fp, -24 + + beqz $t3, .L_kmp_3 + ld.d $a2, $t4, 0 + + addi.d $t3, $t3, -1 + beqz $t3, .L_kmp_3 + ld.d $a3, $t4, 8 + + addi.d $t3, $t3, -1 + beqz $t3, .L_kmp_3 + ld.d $a4, $t4, 16 + + addi.d $t3, $t3, -1 + beqz $t3, .L_kmp_3 + ld.d $a5, $t4, 24 + + addi.d $t3, $t3, -1 + beqz $t3, .L_kmp_3 + ld.d $a6, $t4, 32 + + addi.d $t3, $t3, -1 + beqz $t3, .L_kmp_3 + ld.d $a7, $t4, 40 + + // Prepare any additional argument passed through the stack + addi.d $t4, $t4, 48 + move $t1, $sp + b .L_kmp_2 +.L_kmp_1: + ld.d $t2, $t4, 0 + st.d $t2, $t1, 0 + addi.d $t4, $t4, 8 + addi.d $t1, $t1, 8 +.L_kmp_2: + addi.d $t3, $t3, -1 + bnez $t3, .L_kmp_1 + +.L_kmp_3: + // Call pkfn function + jirl $ra, $t0, 0 + + // Restore stack and return + + addi.d $a0, $zero, 1 + + addi.d $sp, $fp, -16 + ld.d $fp, $sp, 0 + ld.d $ra, $sp, 8 + addi.d $sp, $sp, 16 + jr $ra +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_LOONGARCH64 */ + +#if KMP_ARCH_VE + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(>id, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// s0: pkfn +// s1: gtid +// s2: tid +// s3: argc +// s4: p_argv +// s5: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass >id to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp. registers: +// +// s34: used to calculate the dynamic stack size +// s35: used as temporary for stack placement calculation +// s36: used as temporary for stack arguments +// s37: used as temporary for number of remaining pkfn parms +// s38: used to traverse p_argv array +// +// return: s0 (always 1/TRUE) +// + +__gtid = -4 +__tid = -8 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + // A function requires 8 bytes align. + .p2align 3 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + // First, save fp and lr. VE stores them at caller stack frame. + st %fp, 0(, %sp) + st %lr, 8(, %sp) + or %fp, 0, %sp + .cfi_def_cfa %fp, 0 + .cfi_offset %lr, 8 + .cfi_offset %fp, 0 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them + // by reference + // - We need 8 bytes for whole arguments. We have two + 'argc' + // arguments (condider >id and &tid). We need to reserve + // (argc + 2) * 8 bytes. 
+ // - We need 176 bytes for RSA and others + // + // The total number of bytes is then (argc + 2) * 8 + 8 + 176. + // + // |------------------------------| + // | return address of callee | 8(%fp) + // |------------------------------| + // | frame pointer of callee | 0(%fp) + // |------------------------------| <------------------ %fp + // | __tid / __gtid | -8(%fp) / -4(%fp) + // |------------------------------| + // | argc+2 for arguments | 176(%sp) + // |------------------------------| + // | RSA | + // |------------------------------| + // | return address | + // |------------------------------| + // | frame pointer | + // |------------------------------| <------------------ %sp + + adds.w.sx %s34, 2, %s3 + sll %s34, %s34, 3 + lea %s34, 184(, %s34) + subs.l %sp, %sp, %s34 + + // Align the stack to 16 bytes. + and %sp, -16, %sp + + // Save pkfn. + or %s12, 0, %s0 + + // Call host to allocate stack if it is necessary. + brge.l %sp, %sl, .L_kmp_pass + ld %s61, 24(, %tp) + lea %s63, 0x13b + shm.l %s63, 0(%s61) + shm.l %sl, 8(%s61) + shm.l %sp, 16(%s61) + monc + +.L_kmp_pass: + lea %s35, 176(, %sp) + adds.w.sx %s37, 0, %s3 + or %s38, 0, %s4 + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame. + st %fp, 0(%s5) +#endif + + // Prepare arguments for the pkfn function (first 8 using s0-s7 + // registers, but need to store stack also because of varargs). + + stl %s1, __gtid(%fp) + stl %s2, __tid(%fp) + + adds.l %s0, __gtid, %fp + st %s0, 0(, %s35) + adds.l %s1, __tid, %fp + st %s1, 8(, %s35) + + breq.l 0, %s37, .L_kmp_call + ld %s2, 0(, %s38) + st %s2, 16(, %s35) + + breq.l 1, %s37, .L_kmp_call + ld %s3, 8(, %s38) + st %s3, 24(, %s35) + + breq.l 2, %s37, .L_kmp_call + ld %s4, 16(, %s38) + st %s4, 32(, %s35) + + breq.l 3, %s37, .L_kmp_call + ld %s5, 24(, %s38) + st %s5, 40(, %s35) + + breq.l 4, %s37, .L_kmp_call + ld %s6, 32(, %s38) + st %s6, 48(, %s35) + + breq.l 5, %s37, .L_kmp_call + ld %s7, 40(, %s38) + st %s7, 56(, %s35) + + breq.l 6, %s37, .L_kmp_call + + // Prepare any additional argument passed through the stack. + adds.l %s37, -6, %s37 + lea %s38, 48(, %s38) + lea %s35, 64(, %s35) +.L_kmp_loop: + ld %s36, 0(, %s38) + st %s36, 0(, %s35) + adds.l %s37, -1, %s37 + adds.l %s38, 8, %s38 + adds.l %s35, 8, %s35 + brne.l 0, %s37, .L_kmp_loop + +.L_kmp_call: + // Call pkfn function. + bsic %lr, (, %s12) + + // Return value. + lea %s0, 1 + + // Restore stack and return. + or %sp, 0, %fp + ld %lr, 8(, %sp) + ld %fp, 0(, %sp) + b.l.t (, %lr) +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_VE */ + +#if KMP_ARCH_S390X + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(>id, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// r2: pkfn +// r3: gtid +// r4: tid +// r5: argc +// r6: p_argv +// SP+160: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass >id to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp. 
registers: +// +// r0: used to fetch argv slots +// r7: used as temporary for number of remaining pkfn parms +// r8: argv +// r9: pkfn +// r10: stack size +// r11: previous fp +// r12: stack parameter area +// r13: argv slot +// +// return: r2 (always 1/TRUE) +// + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + .p2align 1 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + stmg %r6,%r14,48(%r15) + .cfi_offset %r6, -112 + .cfi_offset %r7, -104 + .cfi_offset %r8, -96 + .cfi_offset %r9, -88 + .cfi_offset %r10, -80 + .cfi_offset %r11, -72 + .cfi_offset %r12, -64 + .cfi_offset %r13, -56 + .cfi_offset %r14, -48 + .cfi_offset %r15, -40 + lgr %r11,%r15 + .cfi_def_cfa %r11, 160 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by + // reference + // - We need 8 bytes for each argument that cannot be passed to the 'pkfn' + // function by register. Given that we have 5 of such registers (r[2-6]) + // and two + 'argc' arguments (consider >id and &tid), we need to + // reserve max(0, argc - 3)*8 extra bytes + // + // The total number of bytes is then max(0, argc - 3)*8 + 8 + + lgr %r10,%r5 + aghi %r10,-2 + jnm 0f + lghi %r10,0 +0: + sllg %r10,%r10,3 + lgr %r12,%r10 + aghi %r10,176 + sgr %r15,%r10 + agr %r12,%r15 + stg %r11,0(%r15) + + lgr %r9,%r2 // pkfn + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame + lg %r8,160(%r11) + stg %r11,0(%r8) +#endif + + // Prepare arguments for the pkfn function (first 5 using r2-r6 registers) + + stg %r3,160(%r12) + la %r2,164(%r12) // gid + stg %r4,168(%r12) + la %r3,172(%r12) // tid + lgr %r8,%r6 // argv + + // If argc > 0 + ltgr %r7,%r5 + jz 1f + + lg %r4,0(%r8) // argv[0] + aghi %r7,-1 + jz 1f + + // If argc > 1 + lg %r5,8(%r8) // argv[1] + aghi %r7,-1 + jz 1f + + // If argc > 2 + lg %r6,16(%r8) // argv[2] + aghi %r7,-1 + jz 1f + + lghi %r13,0 // Index [n] +2: + lg %r0,24(%r13,%r8) // argv[2+n] + stg %r0,160(%r13,%r15) // parm[2+n] + aghi %r13,8 // Next + aghi %r7,-1 + jnz 2b + +1: + basr %r14,%r9 // Call pkfn + + // Restore stack and return + + lgr %r15,%r11 + lmg %r6,%r14,48(%r15) + lghi %r2,1 + br %r14 +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_S390X */ + +#if KMP_ARCH_ARM || KMP_ARCH_MIPS + .data + COMMON .gomp_critical_user_, 32, 3 + .data + .align 4 + .global __kmp_unnamed_critical_addr +__kmp_unnamed_critical_addr: + .4byte .gomp_critical_user_ +#ifdef __ELF__ + .size __kmp_unnamed_critical_addr,4 +#endif +#endif /* KMP_ARCH_ARM */ + +#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \ + KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || \ + KMP_ARCH_S390X +#ifndef KMP_PREFIX_UNDERSCORE +# define KMP_PREFIX_UNDERSCORE(x) x +#endif + .data + COMMON .gomp_critical_user_, 32, 3 + .data + .align 8 + .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr) +KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr): + .8byte .gomp_critical_user_ +#ifdef __ELF__ + .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8 +#endif +#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || + KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || + KMP_ARCH_S390X */ + +#if KMP_OS_LINUX +# if KMP_ARCH_ARM || KMP_ARCH_AARCH64 +.section .note.GNU-stack,"",%progbits +# elif !KMP_ARCH_WASM +.section .note.GNU-stack,"",@progbits +# endif +#endif + +#if KMP_ARCH_WASM +.data +.global 
.gomp_critical_user_
+.global .gomp_critical_user_.var
+.global .gomp_critical_user_.reduction.var
+.global __kmp_unnamed_critical_addr
+.gomp_critical_user_:
+.zero 4
+.size .gomp_critical_user_, 4
+.gomp_critical_user_.var:
+.zero 4
+.size .gomp_critical_user_.var, 4
+.gomp_critical_user_.reduction.var:
+.zero 4
+.size .gomp_critical_user_.reduction.var, 4
+__kmp_unnamed_critical_addr:
+	.4byte .gomp_critical_user_
+	.size __kmp_unnamed_critical_addr, 4
+#endif
diff --git a/tool/emacs/cosmo-c-keywords.el b/tool/emacs/cosmo-c-keywords.el
index 6d95663b4..f3aa5b349 100644
--- a/tool/emacs/cosmo-c-keywords.el
+++ b/tool/emacs/cosmo-c-keywords.el
@@ -10,7 +10,11 @@
        "_Complex"))
     (cuda
-     '("__device__"
+     '("gridDim"
+       "blockIdx"
+       "blockDim"
+       "threadIdx"
+       "__device__"
       "__forceinline__"
       "__global__"
       "__shared__"
diff --git a/tool/emacs/cosmo-stuff.el b/tool/emacs/cosmo-stuff.el
index 278d3b67b..22cd41d0a 100644
--- a/tool/emacs/cosmo-stuff.el
+++ b/tool/emacs/cosmo-stuff.el
@@ -214,7 +214,7 @@
          (buddy (format "test/%s_test.c" name))
          (runs (format "o/$m/%s.com%s V=5 TESTARGS=-b" name runsuffix))
          (buns (format "o/$m/test/%s_test.com%s V=5 TESTARGS=-b" name runsuffix)))
-    (cond ((not (member ext '("c" "cc" "s" "S" "rl" "f")))
+    (cond ((not (member ext '("c" "cc" "cpp" "s" "S" "rl" "f")))
           (format "m=%s; make -j12 MODE=$m o/$m/%s"
                   mode
                   (directory-file-name
@@ -753,6 +753,7 @@
           (concat dots notest ".c")
           (concat dots notest ".cc")
           (concat dots notest ".rl")
+          (concat dots notest ".cpp")
           (concat dots notest ".greg.c")
           (concat dots notest ".ncabi.c")
           (concat dots notest ".hookabi.c")
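
For reference while reading the per-architecture stubs above: they all implement the same contract spelled out by the C pseudo-code in their header comments. What follows is only an illustrative C model of that contract, not part of the patch, and the helper name invoke_microtask_model is invented here. It shows how gtid and tid are passed by reference while p_argv[] supplies the trailing arguments, which the real assembly marshals into registers and stack slots for arbitrary argc.

typedef void (*microtask_t)(int *gtid, int *tid, ...);

/* Illustrative model, not the real implementation: the assembly above
   generalizes this to any argc by filling argument registers first and
   spilling the remainder into the stack parameter area. */
static int invoke_microtask_model(microtask_t pkfn, int gtid, int tid,
                                  int argc, void *p_argv[]) {
  /* gtid and tid are kept in this frame so their addresses can be passed. */
  if (argc == 0) {
    (*pkfn)(&gtid, &tid);
  } else if (argc == 1) {
    (*pkfn)(&gtid, &tid, p_argv[0]);
  } else {
    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]); /* ...and so on for larger argc */
  }
  return 1; /* the stubs always return 1/TRUE */
}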