Farid Zakaria 2023-07-07 10:48:38 -07:00 committed by GitHub
commit 5af5413d27
168 changed files with 50926 additions and 6 deletions


@@ -182,6 +182,7 @@ include third_party/double-conversion/test/test.mk
include third_party/lua/lua.mk
include third_party/zstd/zstd.mk
include third_party/tr/tr.mk
include third_party/tbb/tbb.mk
include third_party/sed/sed.mk
include third_party/awk/awk.mk
include third_party/hiredis/hiredis.mk


@@ -781,7 +781,7 @@ bool __cxx_atomic_compare_exchange_strong(
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
bool __cxx_atomic_compare_exchange_strong(
__cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order __success,
memory_order __failure) {
@@ -835,7 +835,7 @@ _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp, typename _Td>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta,
memory_order __order) {
return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value,
@@ -851,7 +851,7 @@ _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp, typename _Td>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta,
memory_order __order) {
return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value,
@@ -867,7 +867,7 @@ _Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a,
_Tp __pattern, memory_order __order) {
return __atomic_fetch_and(&__a->__a_value, __pattern,
@@ -875,7 +875,7 @@ _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_base_impl<_Tp>* __a,
_Tp __pattern, memory_order __order) {
return __atomic_fetch_or(&__a->__a_value, __pattern,
@@ -883,7 +883,7 @@ _Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern,
memory_order __order) {
return __atomic_fetch_or(&__a->__a_value, __pattern,


@@ -109,6 +109,7 @@ THIRD_PARTY_LIBCXX_A_HDRS = \
third_party/libcxx/refstring.hh \
third_party/libcxx/regex \
third_party/libcxx/scoped_allocator \
third_party/libcxx/span \
third_party/libcxx/set \
third_party/libcxx/sstream \
third_party/libcxx/stack \

590
third_party/libcxx/span vendored Normal file

@@ -0,0 +1,590 @@
// -*- C++ -*-
//===------------------------------ span ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
#ifndef _LIBCPP_SPAN
#define _LIBCPP_SPAN
/*
span synopsis
namespace std {
// constants
inline constexpr size_t dynamic_extent = numeric_limits<size_t>::max();
// [views.span], class template span
template <class ElementType, size_t Extent = dynamic_extent>
class span;
// [span.objectrep], views of object representation
template <class ElementType, size_t Extent>
span<const byte, ((Extent == dynamic_extent) ? dynamic_extent :
(sizeof(ElementType) * Extent))> as_bytes(span<ElementType, Extent> s) noexcept;
template <class ElementType, size_t Extent>
span< byte, ((Extent == dynamic_extent) ? dynamic_extent :
(sizeof(ElementType) * Extent))> as_writable_bytes(span<ElementType, Extent> s) noexcept;
namespace std {
template <class ElementType, size_t Extent = dynamic_extent>
class span {
public:
// constants and types
using element_type = ElementType;
using value_type = remove_cv_t<ElementType>;
using index_type = size_t;
using difference_type = ptrdiff_t;
using pointer = element_type*;
using const_pointer = const element_type*;
using reference = element_type&;
using const_reference = const element_type&;
using iterator = implementation-defined;
using const_iterator = implementation-defined;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static constexpr index_type extent = Extent;
// [span.cons], span constructors, copy, assignment, and destructor
constexpr span() noexcept;
constexpr span(pointer ptr, index_type count);
constexpr span(pointer firstElem, pointer lastElem);
template <size_t N>
constexpr span(element_type (&arr)[N]) noexcept;
template <size_t N>
constexpr span(array<value_type, N>& arr) noexcept;
template <size_t N>
constexpr span(const array<value_type, N>& arr) noexcept;
template <class Container>
constexpr span(Container& cont);
template <class Container>
constexpr span(const Container& cont);
constexpr span(const span& other) noexcept = default;
template <class OtherElementType, size_t OtherExtent>
constexpr span(const span<OtherElementType, OtherExtent>& s) noexcept;
~span() noexcept = default;
constexpr span& operator=(const span& other) noexcept = default;
// [span.sub], span subviews
template <size_t Count>
constexpr span<element_type, Count> first() const;
template <size_t Count>
constexpr span<element_type, Count> last() const;
template <size_t Offset, size_t Count = dynamic_extent>
constexpr span<element_type, see below> subspan() const;
constexpr span<element_type, dynamic_extent> first(index_type count) const;
constexpr span<element_type, dynamic_extent> last(index_type count) const;
constexpr span<element_type, dynamic_extent> subspan(index_type offset, index_type count = dynamic_extent) const;
// [span.obs], span observers
constexpr index_type size() const noexcept;
constexpr index_type size_bytes() const noexcept;
constexpr bool empty() const noexcept;
// [span.elem], span element access
constexpr reference operator[](index_type idx) const;
constexpr reference front() const;
constexpr reference back() const;
constexpr pointer data() const noexcept;
// [span.iterators], span iterator support
constexpr iterator begin() const noexcept;
constexpr iterator end() const noexcept;
constexpr const_iterator cbegin() const noexcept;
constexpr const_iterator cend() const noexcept;
constexpr reverse_iterator rbegin() const noexcept;
constexpr reverse_iterator rend() const noexcept;
constexpr const_reverse_iterator crbegin() const noexcept;
constexpr const_reverse_iterator crend() const noexcept;
private:
pointer data_; // exposition only
index_type size_; // exposition only
};
template<class T, size_t N>
span(T (&)[N]) -> span<T, N>;
template<class T, size_t N>
span(array<T, N>&) -> span<T, N>;
template<class T, size_t N>
span(const array<T, N>&) -> span<const T, N>;
template<class Container>
span(Container&) -> span<typename Container::value_type>;
template<class Container>
span(const Container&) -> span<const typename Container::value_type>;
} // namespace std
*/
#include "third_party/libcxx/__config"
#include "third_party/libcxx/cstddef" // for ptrdiff_t
#include "third_party/libcxx/iterator" // for iterators
#include "third_party/libcxx/array" // for array
#include "third_party/libcxx/type_traits" // for remove_cv, etc
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER > 17
inline constexpr size_t dynamic_extent = numeric_limits<size_t>::max();
template <typename _Tp, size_t _Extent = dynamic_extent> class span;
template <class _Tp>
struct __is_span_impl : public false_type {};
template <class _Tp, size_t _Extent>
struct __is_span_impl<span<_Tp, _Extent>> : public true_type {};
template <class _Tp>
struct __is_span : public __is_span_impl<remove_cv_t<_Tp>> {};
template <class _Tp>
struct __is_std_array_impl : public false_type {};
template <class _Tp, size_t _Sz>
struct __is_std_array_impl<array<_Tp, _Sz>> : public true_type {};
template <class _Tp>
struct __is_std_array : public __is_std_array_impl<remove_cv_t<_Tp>> {};
template <class _Tp, class _ElementType, class = void>
struct __is_span_compatible_container : public false_type {};
template <class _Tp, class _ElementType>
struct __is_span_compatible_container<_Tp, _ElementType,
void_t<
// is not a specialization of span
typename enable_if<!__is_span<_Tp>::value, nullptr_t>::type,
// is not a specialization of array
typename enable_if<!__is_std_array<_Tp>::value, nullptr_t>::type,
// is_array_v<Container> is false,
typename enable_if<!is_array_v<_Tp>, nullptr_t>::type,
// data(cont) and size(cont) are well formed
decltype(data(declval<_Tp>())),
decltype(size(declval<_Tp>())),
// remove_pointer_t<decltype(data(cont))>(*)[] is convertible to ElementType(*)[]
typename enable_if<
is_convertible_v<remove_pointer_t<decltype(data(declval<_Tp &>()))>(*)[],
_ElementType(*)[]>,
nullptr_t>::type
>>
: public true_type {};
template <typename _Tp, size_t _Extent>
class _LIBCPP_TEMPLATE_VIS span {
public:
// constants and types
using element_type = _Tp;
using value_type = remove_cv_t<_Tp>;
using index_type = size_t;
using difference_type = ptrdiff_t;
using pointer = _Tp *;
using const_pointer = const _Tp *;
using reference = _Tp &;
using const_reference = const _Tp &;
using iterator = __wrap_iter<pointer>;
using const_iterator = __wrap_iter<const_pointer>;
using reverse_iterator = _VSTD::reverse_iterator<iterator>;
using const_reverse_iterator = _VSTD::reverse_iterator<const_iterator>;
static constexpr index_type extent = _Extent;
// [span.cons], span constructors, copy, assignment, and destructor
_LIBCPP_INLINE_VISIBILITY constexpr span() noexcept : __data{nullptr}
{ static_assert(_Extent == 0, "Can't default construct a statically sized span with size > 0"); }
constexpr span (const span&) noexcept = default;
constexpr span& operator=(const span&) noexcept = default;
_LIBCPP_INLINE_VISIBILITY constexpr span(pointer __ptr, index_type __count) : __data{__ptr}
{ (void)__count; _LIBCPP_ASSERT(_Extent == __count, "size mismatch in span's constructor (ptr, len)"); }
_LIBCPP_INLINE_VISIBILITY constexpr span(pointer __f, pointer __l) : __data{__f}
{ (void)__l; _LIBCPP_ASSERT(_Extent == distance(__f, __l), "size mismatch in span's constructor (ptr, ptr)"); }
_LIBCPP_INLINE_VISIBILITY constexpr span(element_type (&__arr)[_Extent]) noexcept : __data{__arr} {}
_LIBCPP_INLINE_VISIBILITY constexpr span( array<value_type, _Extent>& __arr) noexcept : __data{__arr.data()} {}
_LIBCPP_INLINE_VISIBILITY constexpr span(const array<value_type, _Extent>& __arr) noexcept : __data{__arr.data()} {}
template <class _OtherElementType>
_LIBCPP_INLINE_VISIBILITY
constexpr span(const span<_OtherElementType, _Extent>& __other,
enable_if_t<
is_convertible_v<_OtherElementType(*)[], element_type (*)[]>,
nullptr_t> = nullptr)
: __data{__other.data()} {}
template <class _OtherElementType>
_LIBCPP_INLINE_VISIBILITY
constexpr span(const span<_OtherElementType, dynamic_extent>& __other,
enable_if_t<
is_convertible_v<_OtherElementType(*)[], element_type (*)[]>,
nullptr_t> = nullptr) noexcept
: __data{__other.data()} { _LIBCPP_ASSERT(_Extent == __other.size(), "size mismatch in span's constructor (other span)"); }
// ~span() noexcept = default;
template <size_t _Count>
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, _Count> first() const noexcept
{
static_assert(_Count <= _Extent, "Count out of range in span::first()");
return {data(), _Count};
}
template <size_t _Count>
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, _Count> last() const noexcept
{
static_assert(_Count <= _Extent, "Count out of range in span::last()");
return {data() + size() - _Count, _Count};
}
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, dynamic_extent> first(index_type __count) const noexcept
{
_LIBCPP_ASSERT(__count <= size(), "Count out of range in span::first(count)");
return {data(), __count};
}
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, dynamic_extent> last(index_type __count) const noexcept
{
_LIBCPP_ASSERT(__count <= size(), "Count out of range in span::last(count)");
return {data() + size() - __count, __count};
}
template <size_t _Offset, size_t _Count = dynamic_extent>
_LIBCPP_INLINE_VISIBILITY
constexpr auto subspan() const noexcept
-> span<element_type, _Count != dynamic_extent ? _Count : _Extent - _Offset>
{
static_assert(_Offset <= _Extent, "Offset out of range in span::subspan()");
return {data() + _Offset, _Count == dynamic_extent ? size() - _Offset : _Count};
}
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, dynamic_extent>
subspan(index_type __offset, index_type __count = dynamic_extent) const noexcept
{
_LIBCPP_ASSERT(__offset <= size(), "Offset out of range in span::subspan(offset, count)");
_LIBCPP_ASSERT(__count <= size() || __count == dynamic_extent, "Count out of range in span::subspan(offset, count)");
if (__count == dynamic_extent)
return {data() + __offset, size() - __offset};
_LIBCPP_ASSERT(__offset <= size() - __count, "count + offset out of range in span::subspan(offset, count)");
return {data() + __offset, __count};
}
_LIBCPP_INLINE_VISIBILITY constexpr index_type size() const noexcept { return _Extent; }
_LIBCPP_INLINE_VISIBILITY constexpr index_type size_bytes() const noexcept { return _Extent * sizeof(element_type); }
_LIBCPP_INLINE_VISIBILITY constexpr bool empty() const noexcept { return _Extent == 0; }
_LIBCPP_INLINE_VISIBILITY constexpr reference operator[](index_type __idx) const noexcept
{
_LIBCPP_ASSERT(__idx >= 0 && __idx < size(), "span<T,N>[] index out of bounds");
return __data[__idx];
}
_LIBCPP_INLINE_VISIBILITY constexpr reference front() const noexcept
{
static_assert(_Extent > 0, "span<T,N>[].front() on empty span");
return __data[0];
}
_LIBCPP_INLINE_VISIBILITY constexpr reference back() const noexcept
{
static_assert(_Extent > 0, "span<T,N>[].back() on empty span");
return __data[size()-1];
}
_LIBCPP_INLINE_VISIBILITY constexpr pointer data() const noexcept { return __data; }
// [span.iter], span iterator support
_LIBCPP_INLINE_VISIBILITY constexpr iterator begin() const noexcept { return iterator(data()); }
_LIBCPP_INLINE_VISIBILITY constexpr iterator end() const noexcept { return iterator(data() + size()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_iterator cbegin() const noexcept { return const_iterator(data()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_iterator cend() const noexcept { return const_iterator(data() + size()); }
_LIBCPP_INLINE_VISIBILITY constexpr reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); }
_LIBCPP_INLINE_VISIBILITY constexpr reverse_iterator rend() const noexcept { return reverse_iterator(begin()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }
_LIBCPP_INLINE_VISIBILITY constexpr void swap(span &__other) noexcept
{
pointer __p = __data;
__data = __other.__data;
__other.__data = __p;
}
_LIBCPP_INLINE_VISIBILITY span<const byte, _Extent * sizeof(element_type)> __as_bytes() const noexcept
{ return {reinterpret_cast<const byte *>(data()), size_bytes()}; }
_LIBCPP_INLINE_VISIBILITY span<byte, _Extent * sizeof(element_type)> __as_writable_bytes() const noexcept
{ return {reinterpret_cast<byte *>(data()), size_bytes()}; }
private:
pointer __data;
};
template <typename _Tp>
class _LIBCPP_TEMPLATE_VIS span<_Tp, dynamic_extent> {
private:
public:
// constants and types
using element_type = _Tp;
using value_type = remove_cv_t<_Tp>;
using index_type = size_t;
using difference_type = ptrdiff_t;
using pointer = _Tp *;
using const_pointer = const _Tp *;
using reference = _Tp &;
using const_reference = const _Tp &;
using iterator = __wrap_iter<pointer>;
using const_iterator = __wrap_iter<const_pointer>;
using reverse_iterator = _VSTD::reverse_iterator<iterator>;
using const_reverse_iterator = _VSTD::reverse_iterator<const_iterator>;
static constexpr index_type extent = dynamic_extent;
// [span.cons], span constructors, copy, assignment, and destructor
_LIBCPP_INLINE_VISIBILITY constexpr span() noexcept : __data{nullptr}, __size{0} {}
constexpr span (const span&) noexcept = default;
constexpr span& operator=(const span&) noexcept = default;
_LIBCPP_INLINE_VISIBILITY constexpr span(pointer __ptr, index_type __count) : __data{__ptr}, __size{__count} {}
_LIBCPP_INLINE_VISIBILITY constexpr span(pointer __f, pointer __l) : __data{__f}, __size{static_cast<size_t>(distance(__f, __l))} {}
template <size_t _Sz>
_LIBCPP_INLINE_VISIBILITY
constexpr span(element_type (&__arr)[_Sz]) noexcept : __data{__arr}, __size{_Sz} {}
template <size_t _Sz>
_LIBCPP_INLINE_VISIBILITY
constexpr span(array<value_type, _Sz>& __arr) noexcept : __data{__arr.data()}, __size{_Sz} {}
template <size_t _Sz>
_LIBCPP_INLINE_VISIBILITY
constexpr span(const array<value_type, _Sz>& __arr) noexcept : __data{__arr.data()}, __size{_Sz} {}
template <class _Container>
_LIBCPP_INLINE_VISIBILITY
constexpr span( _Container& __c,
enable_if_t<__is_span_compatible_container<_Container, _Tp>::value, nullptr_t> = nullptr)
: __data{_VSTD::data(__c)}, __size{(index_type) _VSTD::size(__c)} {}
template <class _Container>
_LIBCPP_INLINE_VISIBILITY
constexpr span(const _Container& __c,
enable_if_t<__is_span_compatible_container<const _Container, _Tp>::value, nullptr_t> = nullptr)
: __data{_VSTD::data(__c)}, __size{(index_type) _VSTD::size(__c)} {}
template <class _OtherElementType, size_t _OtherExtent>
_LIBCPP_INLINE_VISIBILITY
constexpr span(const span<_OtherElementType, _OtherExtent>& __other,
enable_if_t<
is_convertible_v<_OtherElementType(*)[], element_type (*)[]>,
nullptr_t> = nullptr) noexcept
: __data{__other.data()}, __size{__other.size()} {}
// ~span() noexcept = default;
template <size_t _Count>
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, _Count> first() const noexcept
{
_LIBCPP_ASSERT(_Count <= size(), "Count out of range in span::first()");
return {data(), _Count};
}
template <size_t _Count>
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, _Count> last() const noexcept
{
_LIBCPP_ASSERT(_Count <= size(), "Count out of range in span::last()");
return {data() + size() - _Count, _Count};
}
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, dynamic_extent> first(index_type __count) const noexcept
{
_LIBCPP_ASSERT(__count <= size(), "Count out of range in span::first(count)");
return {data(), __count};
}
_LIBCPP_INLINE_VISIBILITY
constexpr span<element_type, dynamic_extent> last (index_type __count) const noexcept
{
_LIBCPP_ASSERT(__count <= size(), "Count out of range in span::last(count)");
return {data() + size() - __count, __count};
}
template <size_t _Offset, size_t _Count = dynamic_extent>
_LIBCPP_INLINE_VISIBILITY
constexpr span<_Tp, dynamic_extent> subspan() const noexcept
{
_LIBCPP_ASSERT(_Offset <= size(), "Offset out of range in span::subspan()");
_LIBCPP_ASSERT(_Count == dynamic_extent || _Offset + _Count <= size(), "Count out of range in span::subspan()");
return {data() + _Offset, _Count == dynamic_extent ? size() - _Offset : _Count};
}
constexpr span<element_type, dynamic_extent>
_LIBCPP_INLINE_VISIBILITY
subspan(index_type __offset, index_type __count = dynamic_extent) const noexcept
{
_LIBCPP_ASSERT(__offset <= size(), "Offset out of range in span::subspan(offset, count)");
_LIBCPP_ASSERT(__count <= size() || __count == dynamic_extent, "count out of range in span::subspan(offset, count)");
if (__count == dynamic_extent)
return {data() + __offset, size() - __offset};
_LIBCPP_ASSERT(__offset <= size() - __count, "Offset + count out of range in span::subspan(offset, count)");
return {data() + __offset, __count};
}
_LIBCPP_INLINE_VISIBILITY constexpr index_type size() const noexcept { return __size; }
_LIBCPP_INLINE_VISIBILITY constexpr index_type size_bytes() const noexcept { return __size * sizeof(element_type); }
_LIBCPP_INLINE_VISIBILITY constexpr bool empty() const noexcept { return __size == 0; }
_LIBCPP_INLINE_VISIBILITY constexpr reference operator[](index_type __idx) const noexcept
{
_LIBCPP_ASSERT(__idx >= 0 && __idx < size(), "span<T>[] index out of bounds");
return __data[__idx];
}
_LIBCPP_INLINE_VISIBILITY constexpr reference front() const noexcept
{
_LIBCPP_ASSERT(!empty(), "span<T>[].front() on empty span");
return __data[0];
}
_LIBCPP_INLINE_VISIBILITY constexpr reference back() const noexcept
{
_LIBCPP_ASSERT(!empty(), "span<T>[].back() on empty span");
return __data[size()-1];
}
_LIBCPP_INLINE_VISIBILITY constexpr pointer data() const noexcept { return __data; }
// [span.iter], span iterator support
_LIBCPP_INLINE_VISIBILITY constexpr iterator begin() const noexcept { return iterator(data()); }
_LIBCPP_INLINE_VISIBILITY constexpr iterator end() const noexcept { return iterator(data() + size()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_iterator cbegin() const noexcept { return const_iterator(data()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_iterator cend() const noexcept { return const_iterator(data() + size()); }
_LIBCPP_INLINE_VISIBILITY constexpr reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); }
_LIBCPP_INLINE_VISIBILITY constexpr reverse_iterator rend() const noexcept { return reverse_iterator(begin()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
_LIBCPP_INLINE_VISIBILITY constexpr const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }
_LIBCPP_INLINE_VISIBILITY constexpr void swap(span &__other) noexcept
{
pointer __p = __data;
__data = __other.__data;
__other.__data = __p;
index_type __sz = __size;
__size = __other.__size;
__other.__size = __sz;
}
_LIBCPP_INLINE_VISIBILITY span<const byte, dynamic_extent> __as_bytes() const noexcept
{ return {reinterpret_cast<const byte *>(data()), size_bytes()}; }
_LIBCPP_INLINE_VISIBILITY span<byte, dynamic_extent> __as_writable_bytes() const noexcept
{ return {reinterpret_cast<byte *>(data()), size_bytes()}; }
private:
pointer __data;
index_type __size;
};
// tuple interface
template <class _Tp, size_t _Size>
struct _LIBCPP_TEMPLATE_VIS tuple_size<span<_Tp, _Size>>
: public integral_constant<size_t, _Size> {};
template <class _Tp>
struct _LIBCPP_TEMPLATE_VIS tuple_size<span<_Tp, dynamic_extent>>; // declared but not defined
template <size_t _Ip, class _Tp, size_t _Size>
struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, span<_Tp, _Size>>
{
static_assert( dynamic_extent != _Size, "std::tuple_element<> not supported for std::span<T, dynamic_extent>");
static_assert(_Ip < _Size, "Index out of bounds in std::tuple_element<> (std::span)");
typedef _Tp type;
};
template <size_t _Ip, class _Tp, size_t _Size>
_LIBCPP_INLINE_VISIBILITY constexpr
_Tp&
get(span<_Tp, _Size> __s) noexcept
{
static_assert( dynamic_extent != _Size, "std::get<> not supported for std::span<T, dynamic_extent>");
static_assert(_Ip < _Size, "Index out of bounds in std::get<> (std::span)");
return __s[_Ip];
}
// as_bytes & as_writable_bytes
template <class _Tp, size_t _Extent>
_LIBCPP_INLINE_VISIBILITY
auto as_bytes(span<_Tp, _Extent> __s) noexcept
-> decltype(__s.__as_bytes())
{ return __s.__as_bytes(); }
template <class _Tp, size_t _Extent>
_LIBCPP_INLINE_VISIBILITY
auto as_writable_bytes(span<_Tp, _Extent> __s) noexcept
-> enable_if_t<!is_const_v<_Tp>, decltype(__s.__as_writable_bytes())>
{ return __s.__as_writable_bytes(); }
template <class _Tp, size_t _Extent>
_LIBCPP_INLINE_VISIBILITY
constexpr void swap(span<_Tp, _Extent> &__lhs, span<_Tp, _Extent> &__rhs) noexcept
{ __lhs.swap(__rhs); }
// Deduction guides
template<class _Tp, size_t _Sz>
span(_Tp (&)[_Sz]) -> span<_Tp, _Sz>;
template<class _Tp, size_t _Sz>
span(array<_Tp, _Sz>&) -> span<_Tp, _Sz>;
template<class _Tp, size_t _Sz>
span(const array<_Tp, _Sz>&) -> span<const _Tp, _Sz>;
template<class _Container>
span(_Container&) -> span<typename _Container::value_type>;
template<class _Container>
span(const _Container&) -> span<const typename _Container::value_type>;
#endif // _LIBCPP_STD_VER > 17
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP_SPAN
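
For orientation, here is a minimal usage sketch of the span interface declared above. It is not part of the commit; the buffer contents and function name are illustrative, and outside this tree the include would simply be <span>.

#include <cstdio>
#include <vector>
#include <span>   // within this tree: "third_party/libcxx/span"

int span_demo() {
  int raw[6] = {1, 2, 3, 4, 5, 6};
  std::span<int, 6> fixed{raw};               // static extent, array constructor
  std::span<int> tail = fixed.subspan(2);     // dynamic-extent view over {3, 4, 5, 6}
  std::vector<int> v{7, 8, 9};
  std::span<int> from_container{v};           // container constructor (see __is_span_compatible_container above)
  auto bytes = std::as_bytes(fixed);          // read-only view of the object representation
  std::printf("%zu %zu %zu %zu\n", fixed.size(), tail.size(),
              from_container.size(), bytes.size_bytes());
  return static_cast<int>(tail.front());      // 3
}

Note that this vendored header predates the final C++20 wording (it still exposes index_type), but every call used in the sketch exists in both variants.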

17
third_party/tbb/README.cosmo vendored Normal file

@@ -0,0 +1,17 @@
// clang-format off
DESCRIPTION
oneAPI Threading Building Blocks (oneTBB)
oneTBB is a flexible C++ library that simplifies the work of adding parallelism to complex applications,
even if you are not a threading expert.
SOURCE
https://github.com/oneapi-src/oneTBB
commit e813596ba3a1bee0ffa06fb66b5e30b7ea801319
Author: Alexandra <alexandra.epanchinzeva@intel.com>
Date: Wed Jun 21 18:46:54 2023 +0200
Documentation for std::invoke (#1112)

107
third_party/tbb/address_waiter.cc vendored Normal file

@@ -0,0 +1,107 @@
// clang-format off
/*
Copyright (c) 2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/governor.hh"
#include "third_party/tbb/concurrent_monitor.hh"
#include "third_party/tbb/detail/_waitable_atomic.hh"
#include "third_party/libcxx/type_traits"
namespace tbb {
namespace detail {
namespace r1 {
struct address_context {
address_context() = default;
address_context(void* address, std::uintptr_t context) :
my_address(address), my_context(context)
{}
void* my_address{nullptr};
std::uintptr_t my_context{0};
};
class address_waiter : public concurrent_monitor_base<address_context> {
using base_type = concurrent_monitor_base<address_context>;
public:
using base_type::base_type;
/** per-thread descriptor for concurrent_monitor */
using thread_context = sleep_node<address_context>;
};
// 1024 is a rough estimate based on two assumptions:
// 1) there are no more than 1000 threads in the application;
// 2) the mutexes are optimized for short critical sections less than a couple of microseconds,
// which is less than 1/1000 of a time slice.
// In the worst case, we have a single mutex that is locked while its thread is preempted.
// Therefore, the probability of a collision when taking an unrelated mutex is about 1/size of the table.
static constexpr std::size_t num_address_waiters = 2 << 10;
static_assert(std::is_standard_layout<address_waiter>::value,
"address_waiter must be with standard layout");
static address_waiter address_waiter_table[num_address_waiters];
void clear_address_waiter_table() {
for (std::size_t i = 0; i < num_address_waiters; ++i) {
address_waiter_table[i].destroy();
}
}
static address_waiter& get_address_waiter(void* address) {
std::uintptr_t tag = std::uintptr_t(address);
return address_waiter_table[((tag >> 5) ^ tag) % num_address_waiters];
}
void wait_on_address(void* address, d1::delegate_base& predicate, std::uintptr_t context) {
address_waiter& waiter = get_address_waiter(address);
waiter.wait<address_waiter::thread_context>(predicate, address_context{address, context});
}
void notify_by_address(void* address, std::uintptr_t target_context) {
address_waiter& waiter = get_address_waiter(address);
auto predicate = [address, target_context] (address_context ctx) {
return ctx.my_address == address && ctx.my_context == target_context;
};
waiter.notify_relaxed(predicate);
}
void notify_by_address_one(void* address) {
address_waiter& waiter = get_address_waiter(address);
auto predicate = [address] (address_context ctx) {
return ctx.my_address == address;
};
waiter.notify_one_relaxed(predicate);
}
void notify_by_address_all(void* address) {
address_waiter& waiter = get_address_waiter(address);
auto predicate = [address] (address_context ctx) {
return ctx.my_address == address;
};
waiter.notify_relaxed(predicate);
}
} // namespace r1
} // namespace detail
} // namespace tbb
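
As a side note, the slot selection in get_address_waiter() above is just a modular hash of the address. The standalone sketch below (not part of the commit; the sample objects are arbitrary) reproduces it to show how distinct addresses spread across the fixed-size address_waiter_table.

#include <cstdint>
#include <cstdio>

// Same constant and hash expression as in address_waiter.cc above;
// xoring with (tag >> 5) folds higher address bits into the low bits
// that survive the modulus.
static constexpr std::size_t num_address_waiters = 2 << 10;

static std::size_t waiter_slot(const void* address) {
  std::uintptr_t tag = reinterpret_cast<std::uintptr_t>(address);
  return static_cast<std::size_t>(((tag >> 5) ^ tag) % num_address_waiters);
}

int main() {
  int a = 0, b = 0;        // two nearby automatic objects
  static int c = 0;        // a static object in a different region
  std::printf("&a -> slot %zu\n", waiter_slot(&a));
  std::printf("&b -> slot %zu\n", waiter_slot(&b));
  std::printf("&c -> slot %zu\n", waiter_slot(&c));
  return 0;
}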

314
third_party/tbb/allocator.cc vendored Normal file

@@ -0,0 +1,314 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/version.hh"
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/tbb_allocator.hh" // Is this OK?
#include "third_party/tbb/cache_aligned_allocator.hh"
#include "third_party/tbb/dynamic_link.hh"
#include "third_party/tbb/misc.hh"
#include "third_party/libcxx/cstdlib"
#ifdef _WIN32
#include "libc/nt/accounting.h"
#include "libc/nt/automation.h"
#include "libc/nt/console.h"
#include "libc/nt/debug.h"
#include "libc/nt/dll.h"
#include "libc/nt/enum/keyaccess.h"
#include "libc/nt/enum/regtype.h"
#include "libc/nt/errors.h"
#include "libc/nt/events.h"
#include "libc/nt/files.h"
#include "libc/nt/ipc.h"
#include "libc/nt/memory.h"
#include "libc/nt/paint.h"
#include "libc/nt/process.h"
#include "libc/nt/registry.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/windows.h"
#include "libc/nt/winsock.h"
#else
#include "libc/runtime/dlfcn.h"
#endif
#if (!defined(_WIN32) && !defined(_WIN64)) || defined(__CYGWIN__)
#include "libc/calls/calls.h"
#include "libc/calls/termios.h"
#include "libc/fmt/conv.h"
#include "libc/limits.h"
#include "libc/mem/alg.h"
#include "libc/mem/alloca.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/rand.h"
#include "libc/stdio/temp.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/exit.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/rand48.h" // posix_memalign, free
// With glibc, uClibc and musl on Linux and bionic on Android it is safe to use memalign(), as the allocated memory
// can be freed with free(). It is also better to use memalign() since posix_memalign() is just a wrapper on top of
// memalign() and it offers nothing but overhead due to inconvenient interface. This is likely the case with other
// standard libraries as well, and more libraries can be added to the preprocessor check below. Unfortunately, we
// can't detect musl, so we simply enable memalign() on Linux and Android in general.
#if defined(linux) || defined(__linux) || defined(__linux__) || defined(__ANDROID__)
#include "libc/mem/mem.h" // memalign
#define __TBB_USE_MEMALIGN
#else
#define __TBB_USE_POSIX_MEMALIGN
#endif
#elif defined(_MSC_VER) || defined(__MINGW32__)
#include "libc/mem/mem.h" // _aligned_malloc, _aligned_free
#define __TBB_USE_MSVC_ALIGNED_MALLOC
#endif
#if __TBB_WEAK_SYMBOLS_PRESENT
#pragma weak scalable_malloc
#pragma weak scalable_free
#pragma weak scalable_aligned_malloc
#pragma weak scalable_aligned_free
extern "C" {
void* scalable_malloc(std::size_t);
void scalable_free(void*);
void* scalable_aligned_malloc(std::size_t, std::size_t);
void scalable_aligned_free(void*);
}
#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
namespace tbb {
namespace detail {
namespace r1 {
//! Initialization routine used for first indirect call via allocate_handler.
static void* initialize_allocate_handler(std::size_t size);
//! Handler for memory allocation
using allocate_handler_type = void* (*)(std::size_t size);
static std::atomic<allocate_handler_type> allocate_handler{ &initialize_allocate_handler };
allocate_handler_type allocate_handler_unsafe = nullptr;
//! Handler for memory deallocation
static void (*deallocate_handler)(void* pointer) = nullptr;
//! Initialization routine used for first indirect call via cache_aligned_allocate_handler.
static void* initialize_cache_aligned_allocate_handler(std::size_t n, std::size_t alignment);
//! Allocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available.
static void* std_cache_aligned_allocate(std::size_t n, std::size_t alignment);
//! Deallocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available.
static void std_cache_aligned_deallocate(void* p);
//! Handler for padded memory allocation
using cache_aligned_allocate_handler_type = void* (*)(std::size_t n, std::size_t alignment);
static std::atomic<cache_aligned_allocate_handler_type> cache_aligned_allocate_handler{ &initialize_cache_aligned_allocate_handler };
cache_aligned_allocate_handler_type cache_aligned_allocate_handler_unsafe = nullptr;
//! Handler for padded memory deallocation
static void (*cache_aligned_deallocate_handler)(void* p) = nullptr;
//! Table describing how to link the handlers.
static const dynamic_link_descriptor MallocLinkTable[] = {
DLD(scalable_malloc, allocate_handler_unsafe),
DLD(scalable_free, deallocate_handler),
DLD(scalable_aligned_malloc, cache_aligned_allocate_handler_unsafe),
DLD(scalable_aligned_free, cache_aligned_deallocate_handler),
};
#if TBB_USE_DEBUG
#define DEBUG_SUFFIX "_debug"
#else
#define DEBUG_SUFFIX
#endif /* TBB_USE_DEBUG */
// MALLOCLIB_NAME is the name of the oneTBB memory allocator library.
#if _WIN32||_WIN64
#define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll"
#elif __APPLE__
#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".2.dylib"
#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__
#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so"
#elif __unix__ // Note that order of these #elif's is important!
#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so.2"
#else
#error Unknown OS
#endif
//! Initialize the allocation/free handler pointers.
/** Caller is responsible for ensuring this routine is called exactly once.
The routine attempts to dynamically link with the TBB memory allocator.
If that allocator is not found, it links to malloc and free. */
void initialize_handler_pointers() {
__TBB_ASSERT(allocate_handler == &initialize_allocate_handler, nullptr);
bool success = dynamic_link(MALLOCLIB_NAME, MallocLinkTable, 4);
if(!success) {
// If unsuccessful, set the handlers to the default routines.
// This must be done now, and not before FillDynamicLinks runs, because if other
// threads call the handlers, we want them to go through the DoOneTimeInitializations logic,
// which forces them to wait.
allocate_handler_unsafe = &std::malloc;
deallocate_handler = &std::free;
cache_aligned_allocate_handler_unsafe = &std_cache_aligned_allocate;
cache_aligned_deallocate_handler = &std_cache_aligned_deallocate;
}
allocate_handler.store(allocate_handler_unsafe, std::memory_order_release);
cache_aligned_allocate_handler.store(cache_aligned_allocate_handler_unsafe, std::memory_order_release);
PrintExtraVersionInfo( "ALLOCATOR", success?"scalable_malloc":"malloc" );
}
static std::once_flag initialization_state;
void initialize_cache_aligned_allocator() {
std::call_once(initialization_state, &initialize_handler_pointers);
}
//! Executed on very first call through allocate_handler
static void* initialize_allocate_handler(std::size_t size) {
initialize_cache_aligned_allocator();
__TBB_ASSERT(allocate_handler != &initialize_allocate_handler, nullptr);
return (*allocate_handler)(size);
}
//! Executed on very first call through cache_aligned_allocate_handler
static void* initialize_cache_aligned_allocate_handler(std::size_t bytes, std::size_t alignment) {
initialize_cache_aligned_allocator();
__TBB_ASSERT(cache_aligned_allocate_handler != &initialize_cache_aligned_allocate_handler, nullptr);
return (*cache_aligned_allocate_handler)(bytes, alignment);
}
// TODO: use CPUID to find actual line size, though consider backward compatibility
// nfs - no false sharing
static constexpr std::size_t nfs_size = 128;
std::size_t __TBB_EXPORTED_FUNC cache_line_size() {
return nfs_size;
}
void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size) {
const std::size_t cache_line_size = nfs_size;
__TBB_ASSERT(is_power_of_two(cache_line_size), "must be power of two");
// Check for overflow
if (size + cache_line_size < size) {
throw_exception(exception_id::bad_alloc);
}
// scalable_aligned_malloc considers zero size request an error, and returns nullptr
if (size == 0) size = 1;
void* result = cache_aligned_allocate_handler.load(std::memory_order_acquire)(size, cache_line_size);
if (!result) {
throw_exception(exception_id::bad_alloc);
}
__TBB_ASSERT(is_aligned(result, cache_line_size), "The returned address isn't aligned");
return result;
}
void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p) {
__TBB_ASSERT(cache_aligned_deallocate_handler, "Initialization has not been done yet.");
(*cache_aligned_deallocate_handler)(p);
}
static void* std_cache_aligned_allocate(std::size_t bytes, std::size_t alignment) {
#if defined(__TBB_USE_MEMALIGN)
return memalign(alignment, bytes);
#elif defined(__TBB_USE_POSIX_MEMALIGN)
void* p = nullptr;
int res = posix_memalign(&p, alignment, bytes);
if (res != 0)
p = nullptr;
return p;
#elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC)
return _aligned_malloc(bytes, alignment);
#else
// TODO: make it common with cache_aligned_resource
std::size_t space = alignment + bytes;
std::uintptr_t base = reinterpret_cast<std::uintptr_t>(std::malloc(space));
if (!base) {
return nullptr;
}
// Round up to the next cache line (align the base address)
std::uintptr_t result = (base + nfs_size) & ~(nfs_size - 1);
__TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Cannot store a base pointer to the header");
__TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage");
// Record where block actually starts.
(reinterpret_cast<std::uintptr_t*>(result))[-1] = base;
return reinterpret_cast<void*>(result);
#endif
}
static void std_cache_aligned_deallocate(void* p) {
#if defined(__TBB_USE_MEMALIGN) || defined(__TBB_USE_POSIX_MEMALIGN)
free(p);
#elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC)
_aligned_free(p);
#else
if (p) {
__TBB_ASSERT(reinterpret_cast<std::uintptr_t>(p) >= 0x4096, "attempt to free block not obtained from cache_aligned_allocator");
// Recover where block actually starts
std::uintptr_t base = (reinterpret_cast<std::uintptr_t*>(p))[-1];
__TBB_ASSERT(((base + nfs_size) & ~(nfs_size - 1)) == reinterpret_cast<std::uintptr_t>(p), "Incorrect alignment or not allocated by std_cache_aligned_deallocate?");
std::free(reinterpret_cast<void*>(base));
}
#endif
}
void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size) {
void* result = allocate_handler.load(std::memory_order_acquire)(size);
if (!result) {
throw_exception(exception_id::bad_alloc);
}
return result;
}
void __TBB_EXPORTED_FUNC deallocate_memory(void* p) {
if (p) {
__TBB_ASSERT(deallocate_handler, "Initialization has not been done yet.");
(*deallocate_handler)(p);
}
}
bool __TBB_EXPORTED_FUNC is_tbbmalloc_used() {
auto handler_snapshot = allocate_handler.load(std::memory_order_acquire);
if (handler_snapshot == &initialize_allocate_handler) {
initialize_cache_aligned_allocator();
}
handler_snapshot = allocate_handler.load(std::memory_order_relaxed);
__TBB_ASSERT(handler_snapshot != &initialize_allocate_handler && deallocate_handler != nullptr, nullptr);
// Cast to void avoids type mismatch errors on some compilers (e.g. __IBMCPP__)
__TBB_ASSERT((reinterpret_cast<void*>(handler_snapshot) == reinterpret_cast<void*>(&std::malloc)) == (reinterpret_cast<void*>(deallocate_handler) == reinterpret_cast<void*>(&std::free)),
"Both shim pointers must refer to routines from the same package (either TBB or CRT)");
return reinterpret_cast<void*>(handler_snapshot) == reinterpret_cast<void*>(&std::malloc);
}
} // namespace r1
} // namespace detail
} // namespace tbb
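
The fallback branch of std_cache_aligned_allocate() above uses a classic trick: over-allocate by nfs_size, round the returned address up to the next 128-byte boundary, and stash malloc's original pointer one word below the address handed back so the deallocator can find it. Below is a standalone sketch of just that scheme (the names and the hard-coded 128 are illustrative; the real code additionally prefers scalable_malloc when it can be linked and checks for size overflow).

#include <cassert>
#include <cstdint>
#include <cstdlib>

static constexpr std::size_t kAlign = 128;  // mirrors nfs_size above

void* cache_aligned_alloc_sketch(std::size_t bytes) {
  std::size_t space = kAlign + bytes;                      // padding + room for the header word
  std::uintptr_t base = reinterpret_cast<std::uintptr_t>(std::malloc(space));
  if (!base) return nullptr;
  std::uintptr_t user = (base + kAlign) & ~(kAlign - 1);   // next 128-byte boundary above base
  // malloc returns at least alignof(max_align_t)-aligned memory, so the gap
  // below `user` is always large enough to hold one uintptr_t.
  assert(user - base >= sizeof(std::uintptr_t));
  reinterpret_cast<std::uintptr_t*>(user)[-1] = base;      // record where the block really starts
  return reinterpret_cast<void*>(user);
}

void cache_aligned_free_sketch(void* p) {
  if (!p) return;
  std::uintptr_t base = reinterpret_cast<std::uintptr_t*>(p)[-1];
  std::free(reinterpret_cast<void*>(base));
}

The two assertions in std_cache_aligned_allocate() above check exactly these properties: the header word fits below the aligned address, and the payload still fits in the over-allocated block.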

858
third_party/tbb/arena.cc vendored Normal file

@@ -0,0 +1,858 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/task_dispatcher.hh"
#include "third_party/tbb/governor.hh"
#include "third_party/tbb/threading_control.hh"
#include "third_party/tbb/arena.hh"
#include "third_party/tbb/itt_notify.hh"
#include "third_party/tbb/semaphore.hh"
#include "third_party/tbb/waiters.hh"
#include "third_party/tbb/detail/_task.hh"
#include "third_party/tbb/info.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/functional"
namespace tbb {
namespace detail {
namespace r1 {
#if __TBB_ARENA_BINDING
class numa_binding_observer : public tbb::task_scheduler_observer {
binding_handler* my_binding_handler;
public:
numa_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core )
: task_scheduler_observer(*ta)
, my_binding_handler(construct_binding_handler(num_slots, numa_id, core_type, max_threads_per_core))
{}
void on_scheduler_entry( bool ) override {
apply_affinity_mask(my_binding_handler, this_task_arena::current_thread_index());
}
void on_scheduler_exit( bool ) override {
restore_affinity_mask(my_binding_handler, this_task_arena::current_thread_index());
}
~numa_binding_observer() override{
destroy_binding_handler(my_binding_handler);
}
};
numa_binding_observer* construct_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) {
numa_binding_observer* binding_observer = nullptr;
if ((core_type >= 0 && core_type_count() > 1) || (numa_id >= 0 && numa_node_count() > 1) || max_threads_per_core > 0) {
binding_observer = new(allocate_memory(sizeof(numa_binding_observer))) numa_binding_observer(ta, num_slots, numa_id, core_type, max_threads_per_core);
__TBB_ASSERT(binding_observer, "Failure during NUMA binding observer allocation and construction");
binding_observer->observe(true);
}
return binding_observer;
}
void destroy_binding_observer( numa_binding_observer* binding_observer ) {
__TBB_ASSERT(binding_observer, "Trying to deallocate nullptr pointer");
binding_observer->observe(false);
binding_observer->~numa_binding_observer();
deallocate_memory(binding_observer);
}
#endif /*!__TBB_ARENA_BINDING*/
void arena::on_thread_leaving(unsigned ref_param) {
//
// Implementation of arena destruction synchronization logic contained various
// bugs/flaws at the different stages of its evolution, so below is a detailed
// description of the issues taken into consideration in the framework of the
// current design.
//
// In case of using fire-and-forget tasks (scheduled via task::enqueue())
// external thread is allowed to leave its arena before all its work is executed,
// and market may temporarily revoke all workers from this arena. Since revoked
// workers never attempt to reset arena state to EMPTY and cancel its request
// to RML for threads, the arena object is destroyed only when both the last
// thread is leaving it and arena's state is EMPTY (that is its external thread
// left and it does not contain any work).
// Thus resetting arena to EMPTY state (as earlier TBB versions did) should not
// be done here (or anywhere else in the external thread to that matter); doing so
// can result either in arena's premature destruction (at least without
// additional costly checks in workers) or in unnecessary arena state changes
// (and ensuing workers migration).
//
// A worker that checks for work presence and transitions arena to the EMPTY
// state (in snapshot taking procedure arena::out_of_work()) updates
// arena::my_pool_state first and only then arena::my_num_workers_requested.
// So the check for work absence must be done against the latter field.
//
// In the time window between decrementing the active threads count and checking
// whether there is an outstanding request for workers, a new worker thread may
// arrive, finish the remaining work, set the arena state to empty, and leave,
// decrementing its refcount and destroying the arena. The current thread would
// then destroy the arena a second time. To preclude this, a local copy of the
// outstanding request value can be stored before decrementing the active threads count.
//
// But this technique may cause two other problems. When the stored request is
// zero, it is possible that arena still has threads and they can generate new
// tasks and thus re-establish non-zero requests. Then all the threads can be
// revoked (as described above) leaving this thread the last one, and causing
// it to destroy non-empty arena.
//
// The other problem takes place when the stored request is non-zero. Another
// thread may complete the work, set arena state to empty, and leave without
// arena destruction before this thread decrements the refcount. This thread
// cannot destroy the arena either. Thus the arena may be "orphaned".
//
// In both cases we cannot dereference arena pointer after the refcount is
// decremented, as our arena may already be destroyed.
//
// If this is the external thread, the market is protected by refcount to it.
// In case of workers market's liveness is ensured by the RML connection
// rundown protocol, according to which the client (i.e. the market) lives
// until RML server notifies it about connection termination, and this
// notification is fired only after all workers return into RML.
//
// Thus if we decremented refcount to zero we ask the market to check arena
// state (including the fact if it is alive) under the lock.
//
__TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter");
// When there are no workers, someone must free the arena, because
// without workers no one calls out_of_work().
if (ref_param == ref_external && !my_mandatory_concurrency.test()) {
out_of_work();
}
threading_control* tc = my_threading_control;
auto tc_client_snapshot = tc->prepare_client_destruction(my_tc_client);
// Release our reference to sync with destroy_client
unsigned remaining_ref = my_references.fetch_sub(ref_param, std::memory_order_release) - ref_param;
// do not access `this` it might be destroyed already
if (remaining_ref == 0) {
if (tc->try_destroy_client(tc_client_snapshot)) {
// We are requested to destroy ourself
free_arena();
}
}
}
std::size_t arena::occupy_free_slot_in_range( thread_data& tls, std::size_t lower, std::size_t upper ) {
if ( lower >= upper ) return out_of_arena;
// Start search for an empty slot from the one we occupied the last time
std::size_t index = tls.my_arena_index;
if ( index < lower || index >= upper ) index = tls.my_random.get() % (upper - lower) + lower;
__TBB_ASSERT( index >= lower && index < upper, nullptr);
// Find a free slot
for ( std::size_t i = index; i < upper; ++i )
if (my_slots[i].try_occupy()) return i;
for ( std::size_t i = lower; i < index; ++i )
if (my_slots[i].try_occupy()) return i;
return out_of_arena;
}
template <bool as_worker>
std::size_t arena::occupy_free_slot(thread_data& tls) {
// Firstly, external threads try to occupy reserved slots
std::size_t index = as_worker ? out_of_arena : occupy_free_slot_in_range( tls, 0, my_num_reserved_slots );
if ( index == out_of_arena ) {
// Secondly, all threads try to occupy all non-reserved slots
index = occupy_free_slot_in_range(tls, my_num_reserved_slots, my_num_slots );
// Likely this arena is already saturated
if ( index == out_of_arena )
return out_of_arena;
}
atomic_update( my_limit, (unsigned)(index + 1), std::less<unsigned>() );
return index;
}
std::uintptr_t arena::calculate_stealing_threshold() {
stack_anchor_type anchor;
return r1::calculate_stealing_threshold(reinterpret_cast<std::uintptr_t>(&anchor), my_threading_control->worker_stack_size());
}
void arena::process(thread_data& tls) {
governor::set_thread_data(tls); // TODO: consider moving to create_one_job.
__TBB_ASSERT( is_alive(my_guard), nullptr);
__TBB_ASSERT( my_num_slots >= 1, nullptr);
std::size_t index = occupy_free_slot</*as_worker*/true>(tls);
if (index == out_of_arena) {
on_thread_leaving(ref_worker);
return;
}
__TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" );
tls.attach_arena(*this, index);
// worker thread enters the dispatch loop to look for work
tls.my_inbox.set_is_idle(true);
if (tls.my_arena_slot->is_task_pool_published()) {
tls.my_inbox.set_is_idle(false);
}
task_dispatcher& task_disp = tls.my_arena_slot->default_task_dispatcher();
tls.enter_task_dispatcher(task_disp, calculate_stealing_threshold());
__TBB_ASSERT(task_disp.can_steal(), nullptr);
__TBB_ASSERT( !tls.my_last_observer, "There cannot be notified local observers when entering arena" );
my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker);
// Waiting on special object tied to this arena
outermost_worker_waiter waiter(*this);
d1::task* t = tls.my_task_dispatcher->local_wait_for_all(nullptr, waiter);
// For purposes of affinity support, the slot's mailbox is considered idle while no thread is
// attached to it.
tls.my_inbox.set_is_idle(true);
__TBB_ASSERT_EX(t == nullptr, "Outermost worker must not leave dispatch loop with a task");
__TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr);
__TBB_ASSERT(tls.my_task_dispatcher == &task_disp, nullptr);
my_observers.notify_exit_observers(tls.my_last_observer, tls.my_is_worker);
tls.my_last_observer = nullptr;
tls.leave_task_dispatcher();
// Arena slot detach (arena may be used in market::process)
// TODO: Consider moving several calls below into a new method(e.g.detach_arena).
tls.my_arena_slot->release();
tls.my_arena_slot = nullptr;
tls.my_inbox.detach();
__TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr);
__TBB_ASSERT(is_alive(my_guard), nullptr);
// In contrast to earlier versions of TBB (before 3.0 U5) now it is possible
// that arena may be temporarily left unpopulated by threads. See comments in
// arena::on_thread_leaving() for more details.
on_thread_leaving(ref_worker);
__TBB_ASSERT(tls.my_arena == this, "my_arena is used as a hint when searching the arena to join");
}
arena::arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level) {
__TBB_ASSERT( !my_guard, "improperly allocated arena?" );
__TBB_ASSERT( sizeof(my_slots[0]) % cache_line_size()==0, "arena::slot size not multiple of cache line size" );
__TBB_ASSERT( is_aligned(this, cache_line_size()), "arena misaligned" );
my_threading_control = control;
my_limit = 1;
// Two slots are mandatory: for the external thread, and for 1 worker (required to support starvation resistant tasks).
my_num_slots = num_arena_slots(num_slots, num_reserved_slots);
my_num_reserved_slots = num_reserved_slots;
my_max_num_workers = num_slots-num_reserved_slots;
my_priority_level = priority_level;
my_references = ref_external; // accounts for the external thread
my_observers.my_arena = this;
my_co_cache.init(4 * num_slots);
__TBB_ASSERT ( my_max_num_workers <= my_num_slots, nullptr);
// Initialize the default context. It should be allocated before task_dispatch construction.
my_default_ctx = new (cache_aligned_allocate(sizeof(d1::task_group_context)))
d1::task_group_context{ d1::task_group_context::isolated, d1::task_group_context::fp_settings };
// Construct slots. Mark internal synchronization elements for the tools.
task_dispatcher* base_td_pointer = reinterpret_cast<task_dispatcher*>(my_slots + my_num_slots);
for( unsigned i = 0; i < my_num_slots; ++i ) {
// __TBB_ASSERT( !my_slots[i].my_scheduler && !my_slots[i].task_pool, nullptr);
__TBB_ASSERT( !my_slots[i].task_pool_ptr, nullptr);
__TBB_ASSERT( !my_slots[i].my_task_pool_size, nullptr);
mailbox(i).construct();
my_slots[i].init_task_streams(i);
my_slots[i].my_default_task_dispatcher = new(base_td_pointer + i) task_dispatcher(this);
my_slots[i].my_is_occupied.store(false, std::memory_order_relaxed);
}
my_fifo_task_stream.initialize(my_num_slots);
my_resume_task_stream.initialize(my_num_slots);
#if __TBB_PREVIEW_CRITICAL_TASKS
my_critical_task_stream.initialize(my_num_slots);
#endif
my_mandatory_requests = 0;
}
arena& arena::allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots,
unsigned priority_level)
{
__TBB_ASSERT( sizeof(base_type) + sizeof(arena_slot) == sizeof(arena), "All arena data fields must go to arena_base" );
__TBB_ASSERT( sizeof(base_type) % cache_line_size() == 0, "arena slots area misaligned: wrong padding" );
__TBB_ASSERT( sizeof(mail_outbox) == max_nfs_size, "Mailbox padding is wrong" );
std::size_t n = allocation_size(num_arena_slots(num_slots, num_reserved_slots));
unsigned char* storage = (unsigned char*)cache_aligned_allocate(n);
// Zero all slots to indicate that they are empty
std::memset( storage, 0, n );
return *new( storage + num_arena_slots(num_slots, num_reserved_slots) * sizeof(mail_outbox) )
arena(control, num_slots, num_reserved_slots, priority_level);
}
void arena::free_arena () {
__TBB_ASSERT( is_alive(my_guard), nullptr);
__TBB_ASSERT( !my_references.load(std::memory_order_relaxed), "There are threads in the dying arena" );
__TBB_ASSERT( !my_total_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" );
__TBB_ASSERT( is_empty(), "Inconsistent state of a dying arena" );
#if __TBB_ARENA_BINDING
if (my_numa_binding_observer != nullptr) {
destroy_binding_observer(my_numa_binding_observer);
my_numa_binding_observer = nullptr;
}
#endif /*__TBB_ARENA_BINDING*/
poison_value( my_guard );
for ( unsigned i = 0; i < my_num_slots; ++i ) {
// __TBB_ASSERT( !my_slots[i].my_scheduler, "arena slot is not empty" );
// TODO: understand the assertion and modify
// __TBB_ASSERT( my_slots[i].task_pool == EmptyTaskPool, nullptr);
__TBB_ASSERT( my_slots[i].head == my_slots[i].tail, nullptr); // TODO: replace by is_quiescent_local_task_pool_empty
my_slots[i].free_task_pool();
mailbox(i).drain();
my_slots[i].my_default_task_dispatcher->~task_dispatcher();
}
__TBB_ASSERT(my_fifo_task_stream.empty(), "Not all enqueued tasks were executed");
__TBB_ASSERT(my_resume_task_stream.empty(), "Not all enqueued tasks were executed");
// Cleanup coroutines/schedulers cache
my_co_cache.cleanup();
my_default_ctx->~task_group_context();
cache_aligned_deallocate(my_default_ctx);
#if __TBB_PREVIEW_CRITICAL_TASKS
__TBB_ASSERT( my_critical_task_stream.empty(), "Not all critical tasks were executed");
#endif
// Clearing the observer list enforces synchronization with observe(false)
my_observers.clear();
void* storage = &mailbox(my_num_slots-1);
__TBB_ASSERT( my_references.load(std::memory_order_relaxed) == 0, nullptr);
this->~arena();
#if TBB_USE_ASSERT > 1
std::memset( storage, 0, allocation_size(my_num_slots) );
#endif /* TBB_USE_ASSERT */
cache_aligned_deallocate( storage );
}
bool arena::has_enqueued_tasks() {
return !my_fifo_task_stream.empty();
}
void arena::request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads) {
my_threading_control->adjust_demand(my_tc_client, mandatory_delta, workers_delta);
if (wakeup_threads) {
// Notify all sleeping threads that work has appeared in the arena.
get_waiting_threads_monitor().notify([&] (market_context context) {
return this == context.my_arena_addr;
});
}
}
bool arena::has_tasks() {
// TODO: rework it to return at least a hint about where a task was found; better if the task itself.
std::size_t n = my_limit.load(std::memory_order_acquire);
bool tasks_are_available = false;
for (std::size_t k = 0; k < n && !tasks_are_available; ++k) {
tasks_are_available = !my_slots[k].is_empty();
}
tasks_are_available = tasks_are_available || has_enqueued_tasks() || !my_resume_task_stream.empty();
#if __TBB_PREVIEW_CRITICAL_TASKS
tasks_are_available = tasks_are_available || !my_critical_task_stream.empty();
#endif
return tasks_are_available;
}
void arena::out_of_work() {
// We should try to unset my_pool_state first in order to keep the arena invariants consistent.
// Otherwise, we might end up with my_pool_state == false and my_mandatory_concurrency == true, which is a broken invariant.
bool disable_mandatory = my_mandatory_concurrency.try_clear_if([this] { return !has_enqueued_tasks(); });
bool release_workers = my_pool_state.try_clear_if([this] { return !has_tasks(); });
if (disable_mandatory || release_workers) {
int mandatory_delta = disable_mandatory ? -1 : 0;
int workers_delta = release_workers ? -(int)my_max_num_workers : 0;
if (disable_mandatory && is_arena_workerless()) {
// We set workers_delta to 1 when mandatory concurrency was enabled, so revert it now
workers_delta = -1;
}
request_workers(mandatory_delta, workers_delta);
}
}
void arena::set_top_priority(bool is_top_priority) {
my_is_top_priority.store(is_top_priority, std::memory_order_relaxed);
}
bool arena::is_top_priority() const {
return my_is_top_priority.load(std::memory_order_relaxed);
}
bool arena::try_join() {
if (num_workers_active() < my_num_workers_allotted.load(std::memory_order_relaxed)) {
my_references += arena::ref_worker;
return true;
}
return false;
}
void arena::set_allotment(unsigned allotment) {
if (my_num_workers_allotted.load(std::memory_order_relaxed) != allotment) {
my_num_workers_allotted.store(allotment, std::memory_order_relaxed);
}
}
std::pair<int, int> arena::update_request(int mandatory_delta, int workers_delta) {
__TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr);
int min_workers_request = 0;
int max_workers_request = 0;
// Calculate min request
my_mandatory_requests += mandatory_delta;
min_workers_request = my_mandatory_requests > 0 ? 1 : 0;
// Calculate max request
my_total_num_workers_requested += workers_delta;
// Clamp worker request into interval [0, my_max_num_workers]
max_workers_request = clamp(my_total_num_workers_requested, 0,
min_workers_request > 0 && is_arena_workerless() ? 1 : (int)my_max_num_workers);
return { min_workers_request, max_workers_request };
}
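// Worked example of the request arithmetic above, for an arena with my_max_num_workers == 4
// starting from a neutral state:
//   update_request(/*mandatory*/ 0, /*workers*/ +6) -> {0, 4}   // clamped to my_max_num_workers
//   update_request(/*mandatory*/ +1, /*workers*/ 0) -> {1, 4}
//   update_request(/*mandatory*/ -1, /*workers*/ -6) -> {0, 0}
// For a workerless arena (my_max_num_workers == 0) the mandatory path of advertise_new_work()
// also passes workers_delta == 1, so the request becomes {1, 1}: exactly one worker is
// borrowed to guarantee progress of enqueued work.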
thread_control_monitor& arena::get_waiting_threads_monitor() {
return my_threading_control->get_waiting_threads_monitor();
}
void arena::enqueue_task(d1::task& t, d1::task_group_context& ctx, thread_data& td) {
task_group_context_impl::bind_to(ctx, &td);
task_accessor::context(t) = &ctx;
task_accessor::isolation(t) = no_isolation;
my_fifo_task_stream.push( &t, random_lane_selector(td.my_random) );
advertise_new_work<work_enqueued>();
}
arena& arena::create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level)
{
__TBB_ASSERT(num_slots > 0, nullptr);
__TBB_ASSERT(num_reserved_slots <= num_slots, nullptr);
// Add public market reference for an external thread/task_arena (that adds an internal reference in exchange).
arena& a = arena::allocate_arena(control, num_slots, num_reserved_slots, arena_priority_level);
a.my_tc_client = control->create_client(a);
// We should not publish arena until all fields are initialized
control->publish_client(a.my_tc_client);
return a;
}
} // namespace r1
} // namespace detail
} // namespace tbb
// Enable task_arena.h
#include "third_party/tbb/task_arena.hh" // task_arena_base
namespace tbb {
namespace detail {
namespace r1 {
#if TBB_USE_ASSERT
void assert_arena_priority_valid( tbb::task_arena::priority a_priority ) {
bool is_arena_priority_correct =
a_priority == tbb::task_arena::priority::high ||
a_priority == tbb::task_arena::priority::normal ||
a_priority == tbb::task_arena::priority::low;
__TBB_ASSERT( is_arena_priority_correct,
"Task arena priority should be equal to one of the predefined values." );
}
#else
void assert_arena_priority_valid( tbb::task_arena::priority ) {}
#endif
unsigned arena_priority_level( tbb::task_arena::priority a_priority ) {
assert_arena_priority_valid( a_priority );
return d1::num_priority_levels - unsigned(int(a_priority) / d1::priority_stride);
}
tbb::task_arena::priority arena_priority( unsigned priority_level ) {
auto priority = tbb::task_arena::priority(
(d1::num_priority_levels - priority_level) * d1::priority_stride
);
assert_arena_priority_valid( priority );
return priority;
}
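// Illustration of the mapping above, assuming the usual oneTBB definitions (not restated in
// this file) where the three tbb::task_arena::priority enumerators are 1x, 2x and 3x
// d1::priority_stride and d1::num_priority_levels == 3:
//   priority::high   -> level 0 (highest)
//   priority::normal -> level 1
//   priority::low    -> level 2
// arena_priority() is the exact inverse, so arena_priority(arena_priority_level(p)) == p
// for each of the three predefined values.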
struct task_arena_impl {
static void initialize(d1::task_arena_base&);
static void terminate(d1::task_arena_base&);
static bool attach(d1::task_arena_base&);
static void execute(d1::task_arena_base&, d1::delegate_base&);
static void wait(d1::task_arena_base&);
static int max_concurrency(const d1::task_arena_base*);
static void enqueue(d1::task&, d1::task_group_context*, d1::task_arena_base*);
};
void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) {
task_arena_impl::initialize(ta);
}
void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base& ta) {
task_arena_impl::terminate(ta);
}
bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base& ta) {
return task_arena_impl::attach(ta);
}
void __TBB_EXPORTED_FUNC execute(d1::task_arena_base& ta, d1::delegate_base& d) {
task_arena_impl::execute(ta, d);
}
void __TBB_EXPORTED_FUNC wait(d1::task_arena_base& ta) {
task_arena_impl::wait(ta);
}
int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base* ta) {
return task_arena_impl::max_concurrency(ta);
}
void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_arena_base* ta) {
task_arena_impl::enqueue(t, nullptr, ta);
}
void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& ctx, d1::task_arena_base* ta) {
task_arena_impl::enqueue(t, &ctx, ta);
}
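// These exported entry points back the public tbb::task_arena interface; a minimal usage
// sketch of that interface (heavy_work and fire_and_forget are placeholder names):
//
//   tbb::task_arena a(/*max concurrency*/ 4, /*reserved slots*/ 1);  // -> initialize()
//   a.execute([] { heavy_work(); });         // -> execute(), runs the functor inside the arena
//   a.enqueue([] { fire_and_forget(); });    // -> enqueue(), detached FIFO task
//   int width = a.max_concurrency();         // -> max_concurrency()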
void task_arena_impl::initialize(d1::task_arena_base& ta) {
// Enforce global market initialization to properly initialize soft limit
(void)governor::get_thread_data();
if (ta.my_max_concurrency < 1) {
#if __TBB_ARENA_BINDING
d1::constraints arena_constraints = d1::constraints{}
.set_core_type(ta.core_type())
.set_max_threads_per_core(ta.max_threads_per_core())
.set_numa_id(ta.my_numa_id);
ta.my_max_concurrency = (int)default_concurrency(arena_constraints);
#else /*!__TBB_ARENA_BINDING*/
ta.my_max_concurrency = (int)governor::default_num_threads();
#endif /*!__TBB_ARENA_BINDING*/
}
__TBB_ASSERT(ta.my_arena.load(std::memory_order_relaxed) == nullptr, "Arena already initialized");
unsigned priority_level = arena_priority_level(ta.my_priority);
threading_control* thr_control = threading_control::register_public_reference();
arena& a = arena::create(thr_control, unsigned(ta.my_max_concurrency), ta.my_num_reserved_slots, priority_level);
ta.my_arena.store(&a, std::memory_order_release);
#if __TBB_ARENA_BINDING
a.my_numa_binding_observer = construct_binding_observer(
static_cast<d1::task_arena*>(&ta), a.my_num_slots, ta.my_numa_id, ta.core_type(), ta.max_threads_per_core());
#endif /*__TBB_ARENA_BINDING*/
}
void task_arena_impl::terminate(d1::task_arena_base& ta) {
arena* a = ta.my_arena.load(std::memory_order_relaxed);
assert_pointer_valid(a);
threading_control::unregister_public_reference(/*blocking_terminate=*/false);
a->on_thread_leaving(arena::ref_external);
ta.my_arena.store(nullptr, std::memory_order_relaxed);
}
bool task_arena_impl::attach(d1::task_arena_base& ta) {
__TBB_ASSERT(!ta.my_arena.load(std::memory_order_relaxed), nullptr);
thread_data* td = governor::get_thread_data_if_initialized();
if( td && td->my_arena ) {
arena* a = td->my_arena;
// There is an active arena to attach to.
// It is still referenced by this thread, so it won't be destroyed right away.
__TBB_ASSERT(a->my_references > 0, nullptr);
a->my_references += arena::ref_external;
ta.my_num_reserved_slots = a->my_num_reserved_slots;
ta.my_priority = arena_priority(a->my_priority_level);
ta.my_max_concurrency = ta.my_num_reserved_slots + a->my_max_num_workers;
__TBB_ASSERT(arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots) == a->my_num_slots, nullptr);
ta.my_arena.store(a, std::memory_order_release);
// increases threading_control's ref count for task_arena
threading_control::register_public_reference();
return true;
}
return false;
}
void task_arena_impl::enqueue(d1::task& t, d1::task_group_context* c, d1::task_arena_base* ta) {
thread_data* td = governor::get_thread_data(); // thread data is only needed for FastRandom instance
assert_pointer_valid(td, "thread_data pointer should not be null");
arena* a = ta ?
ta->my_arena.load(std::memory_order_relaxed)
: td->my_arena
;
assert_pointer_valid(a, "arena pointer should not be null");
auto* ctx = c ? c : a->my_default_ctx;
assert_pointer_valid(ctx, "context pointer should not be null");
// Is there a better place for checking the state of ctx?
__TBB_ASSERT(!a->my_default_ctx->is_group_execution_cancelled(),
"The task will not be executed because its task_group_context is cancelled.");
a->enqueue_task(t, *ctx, *td);
}
class nested_arena_context : no_copy {
public:
nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index)
: m_orig_execute_data_ext(td.my_task_dispatcher->m_execute_data_ext)
{
if (td.my_arena != &nested_arena) {
m_orig_arena = td.my_arena;
m_orig_slot_index = td.my_arena_index;
m_orig_last_observer = td.my_last_observer;
td.detach_task_dispatcher();
td.attach_arena(nested_arena, slot_index);
if (td.my_inbox.is_idle_state(true))
td.my_inbox.set_is_idle(false);
task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher();
td.enter_task_dispatcher(task_disp, m_orig_execute_data_ext.task_disp->m_stealing_threshold);
// If the calling thread occupies a slot outside the external thread reserve, we need to notify the
// market that this arena requires one worker less.
if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) {
td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ -1);
}
td.my_last_observer = nullptr;
// The task_arena::execute method considers each calling thread as an external thread.
td.my_arena->my_observers.notify_entry_observers(td.my_last_observer, /* worker*/false);
}
m_task_dispatcher = td.my_task_dispatcher;
m_orig_fifo_tasks_allowed = m_task_dispatcher->allow_fifo_task(true);
m_orig_critical_task_allowed = m_task_dispatcher->m_properties.critical_task_allowed;
m_task_dispatcher->m_properties.critical_task_allowed = true;
execution_data_ext& ed_ext = td.my_task_dispatcher->m_execute_data_ext;
ed_ext.context = td.my_arena->my_default_ctx;
ed_ext.original_slot = td.my_arena_index;
ed_ext.affinity_slot = d1::no_slot;
ed_ext.task_disp = td.my_task_dispatcher;
ed_ext.isolation = no_isolation;
__TBB_ASSERT(td.my_arena_slot, nullptr);
__TBB_ASSERT(td.my_arena_slot->is_occupied(), nullptr);
__TBB_ASSERT(td.my_task_dispatcher, nullptr);
}
~nested_arena_context() {
thread_data& td = *m_task_dispatcher->m_thread_data;
__TBB_ASSERT(governor::is_thread_data_set(&td), nullptr);
m_task_dispatcher->allow_fifo_task(m_orig_fifo_tasks_allowed);
m_task_dispatcher->m_properties.critical_task_allowed = m_orig_critical_task_allowed;
if (m_orig_arena) {
td.my_arena->my_observers.notify_exit_observers(td.my_last_observer, /*worker*/ false);
td.my_last_observer = m_orig_last_observer;
// Notify the market that this thread is releasing a slot
// that can be used by a worker thread.
if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) {
td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ 1);
}
td.leave_task_dispatcher();
td.my_arena_slot->release();
td.my_arena->my_exit_monitors.notify_one(); // do not relax!
td.attach_arena(*m_orig_arena, m_orig_slot_index);
td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp);
__TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr);
}
td.my_task_dispatcher->m_execute_data_ext = m_orig_execute_data_ext;
}
private:
execution_data_ext m_orig_execute_data_ext{};
arena* m_orig_arena{ nullptr };
observer_proxy* m_orig_last_observer{ nullptr };
task_dispatcher* m_task_dispatcher{ nullptr };
unsigned m_orig_slot_index{};
bool m_orig_fifo_tasks_allowed{};
bool m_orig_critical_task_allowed{};
};
class delegated_task : public d1::task {
d1::delegate_base& m_delegate;
concurrent_monitor& m_monitor;
d1::wait_context& m_wait_ctx;
std::atomic<bool> m_completed;
d1::task* execute(d1::execution_data& ed) override {
const execution_data_ext& ed_ext = static_cast<const execution_data_ext&>(ed);
execution_data_ext orig_execute_data_ext = ed_ext.task_disp->m_execute_data_ext;
__TBB_ASSERT(&ed_ext.task_disp->m_execute_data_ext == &ed,
"The execute data shall point to the current task dispatcher execute data");
__TBB_ASSERT(ed_ext.task_disp->m_execute_data_ext.isolation == no_isolation, nullptr);
ed_ext.task_disp->m_execute_data_ext.context = ed_ext.task_disp->get_thread_data().my_arena->my_default_ctx;
bool fifo_task_allowed = ed_ext.task_disp->allow_fifo_task(true);
try_call([&] {
m_delegate();
}).on_completion([&] {
ed_ext.task_disp->m_execute_data_ext = orig_execute_data_ext;
ed_ext.task_disp->allow_fifo_task(fifo_task_allowed);
});
finalize();
return nullptr;
}
d1::task* cancel(d1::execution_data&) override {
finalize();
return nullptr;
}
void finalize() {
m_wait_ctx.release(); // must precede the wakeup
m_monitor.notify([this] (std::uintptr_t ctx) {
return ctx == std::uintptr_t(&m_delegate);
}); // do not relax, it needs a fence!
m_completed.store(true, std::memory_order_release);
}
public:
delegated_task(d1::delegate_base& d, concurrent_monitor& s, d1::wait_context& wo)
: m_delegate(d), m_monitor(s), m_wait_ctx(wo), m_completed{ false }{}
~delegated_task() override {
// The destructor can be called before m_monitor is notified
// because the waiting thread can be released right after m_wait_ctx.release().
// To close that race we wait for the m_completed signal.
spin_wait_until_eq(m_completed, true);
}
};
void task_arena_impl::execute(d1::task_arena_base& ta, d1::delegate_base& d) {
arena* a = ta.my_arena.load(std::memory_order_relaxed);
__TBB_ASSERT(a != nullptr, nullptr);
thread_data* td = governor::get_thread_data();
bool same_arena = td->my_arena == a;
std::size_t index1 = td->my_arena_index;
if (!same_arena) {
index1 = a->occupy_free_slot</*as_worker */false>(*td);
if (index1 == arena::out_of_arena) {
concurrent_monitor::thread_context waiter((std::uintptr_t)&d);
d1::wait_context wo(1);
d1::task_group_context exec_context(d1::task_group_context::isolated);
task_group_context_impl::copy_fp_settings(exec_context, *a->my_default_ctx);
delegated_task dt(d, a->my_exit_monitors, wo);
a->enqueue_task( dt, exec_context, *td);
size_t index2 = arena::out_of_arena;
do {
a->my_exit_monitors.prepare_wait(waiter);
if (!wo.continue_execution()) {
a->my_exit_monitors.cancel_wait(waiter);
break;
}
index2 = a->occupy_free_slot</*as_worker*/false>(*td);
if (index2 != arena::out_of_arena) {
a->my_exit_monitors.cancel_wait(waiter);
nested_arena_context scope(*td, *a, index2 );
r1::wait(wo, exec_context);
__TBB_ASSERT(!exec_context.my_exception.load(std::memory_order_relaxed), nullptr); // exception can be thrown above, not deferred
break;
}
a->my_exit_monitors.commit_wait(waiter);
} while (wo.continue_execution());
if (index2 == arena::out_of_arena) {
// notify a waiting thread even if this thread did not enter arena,
// in case it was woken by a leaving thread but did not need to enter
a->my_exit_monitors.notify_one(); // do not relax!
}
// process possible exception
auto exception = exec_context.my_exception.load(std::memory_order_acquire);
if (exception) {
__TBB_ASSERT(exec_context.is_group_execution_cancelled(), "The task group context with an exception should be canceled.");
exception->throw_self();
}
__TBB_ASSERT(governor::is_thread_data_set(td), nullptr);
return;
} // if (index1 == arena::out_of_arena)
} // if (!same_arena)
context_guard_helper</*report_tasks=*/false> context_guard;
context_guard.set_ctx(a->my_default_ctx);
nested_arena_context scope(*td, *a, index1);
#if _WIN64
try {
#endif
d();
__TBB_ASSERT(same_arena || governor::is_thread_data_set(td), nullptr);
#if _WIN64
} catch (...) {
context_guard.restore_default();
throw;
}
#endif
}
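// Summary of the slow path above: when occupy_free_slot() fails, the functor is wrapped into a
// delegated_task, enqueued into the target arena, and the caller parks on my_exit_monitors.
// It wakes up either because a worker completed the delegate (wo.continue_execution() turns
// false) or because a slot was released, in which case it occupies the slot itself, enters a
// nested_arena_context and runs the regular wait loop. Any exception is captured in
// exec_context and rethrown after leaving the arena.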
void task_arena_impl::wait(d1::task_arena_base& ta) {
arena* a = ta.my_arena.load(std::memory_order_relaxed);
__TBB_ASSERT(a != nullptr, nullptr);
thread_data* td = governor::get_thread_data();
__TBB_ASSERT_EX(td, "Scheduler is not initialized");
__TBB_ASSERT(td->my_arena != a || td->my_arena_index == 0, "internal_wait is not supported within a worker context" );
if (a->my_max_num_workers != 0) {
while (a->num_workers_active() || !a->is_empty()) {
yield();
}
}
}
int task_arena_impl::max_concurrency(const d1::task_arena_base *ta) {
arena* a = nullptr;
if( ta ) // for special cases of ta->max_concurrency()
a = ta->my_arena.load(std::memory_order_relaxed);
else if( thread_data* td = governor::get_thread_data_if_initialized() )
a = td->my_arena; // the current arena if any
if( a ) { // Get parameters from the arena
__TBB_ASSERT( !ta || ta->my_max_concurrency==1, nullptr);
int mandatory_worker = 0;
if (a->is_arena_workerless() && a->my_num_reserved_slots == 1) {
mandatory_worker = a->my_mandatory_concurrency.test() ? 1 : 0;
}
return a->my_num_reserved_slots + a->my_max_num_workers + mandatory_worker;
}
if (ta && ta->my_max_concurrency == 1) {
return 1;
}
#if __TBB_ARENA_BINDING
if (ta) {
d1::constraints arena_constraints = d1::constraints{}
.set_numa_id(ta->my_numa_id)
.set_core_type(ta->core_type())
.set_max_threads_per_core(ta->max_threads_per_core());
return (int)default_concurrency(arena_constraints);
}
#endif /*!__TBB_ARENA_BINDING*/
__TBB_ASSERT(!ta || ta->my_max_concurrency==d1::task_arena_base::automatic, nullptr);
return int(governor::default_num_threads());
}
void isolate_within_arena(d1::delegate_base& d, std::intptr_t isolation) {
// TODO: Decide what to do if the scheduler is not initialized. Is there a use case for it?
thread_data* tls = governor::get_thread_data();
assert_pointers_valid(tls, tls->my_task_dispatcher);
task_dispatcher* dispatcher = tls->my_task_dispatcher;
isolation_type previous_isolation = dispatcher->m_execute_data_ext.isolation;
try_call([&] {
// We temporarily change the isolation tag of the currently running task. It will be restored in the destructor of the guard.
isolation_type current_isolation = isolation ? isolation : reinterpret_cast<isolation_type>(&d);
// Save the current isolation value and set new one
previous_isolation = dispatcher->set_isolation(current_isolation);
// Isolation within this callable
d();
}).on_completion([&] {
__TBB_ASSERT(governor::get_thread_data()->my_task_dispatcher == dispatcher, nullptr);
dispatcher->set_isolation(previous_isolation);
});
}
} // namespace r1
} // namespace detail
} // namespace tbb

511
third_party/tbb/arena.hh vendored Normal file

@ -0,0 +1,511 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_arena_H
#define _TBB_arena_H
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/cstring"
#include "third_party/tbb/detail/_task.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/spin_mutex.hh"
#include "third_party/tbb/scheduler_common.hh"
#include "third_party/tbb/intrusive_list.hh"
#include "third_party/tbb/task_stream.hh"
#include "third_party/tbb/arena_slot.hh"
#include "third_party/tbb/rml_tbb.hh"
#include "third_party/tbb/mailbox.hh"
#include "third_party/tbb/governor.hh"
#include "third_party/tbb/concurrent_monitor.hh"
#include "third_party/tbb/observer_proxy.hh"
#include "third_party/tbb/thread_control_monitor.hh"
#include "third_party/tbb/threading_control_client.hh"
namespace tbb {
namespace detail {
namespace r1 {
class task_dispatcher;
class task_group_context;
class threading_control;
class allocate_root_with_context_proxy;
#if __TBB_ARENA_BINDING
class numa_binding_observer;
#endif /*__TBB_ARENA_BINDING*/
//! Bounded coroutines cache LIFO ring buffer
class arena_co_cache {
//! Ring buffer storage
task_dispatcher** my_co_scheduler_cache;
//! Current cache index
unsigned my_head;
//! Cache capacity for arena
unsigned my_max_index;
//! Accessor lock for modification operations
tbb::spin_mutex my_co_cache_mutex;
unsigned next_index() {
return ( my_head == my_max_index ) ? 0 : my_head + 1;
}
unsigned prev_index() {
return ( my_head == 0 ) ? my_max_index : my_head - 1;
}
bool internal_empty() {
return my_co_scheduler_cache[prev_index()] == nullptr;
}
void internal_task_dispatcher_cleanup(task_dispatcher* to_cleanup) {
to_cleanup->~task_dispatcher();
cache_aligned_deallocate(to_cleanup);
}
public:
void init(unsigned cache_capacity) {
std::size_t alloc_size = cache_capacity * sizeof(task_dispatcher*);
my_co_scheduler_cache = (task_dispatcher**)cache_aligned_allocate(alloc_size);
std::memset( my_co_scheduler_cache, 0, alloc_size );
my_head = 0;
my_max_index = cache_capacity - 1;
}
void cleanup() {
while (task_dispatcher* to_cleanup = pop()) {
internal_task_dispatcher_cleanup(to_cleanup);
}
cache_aligned_deallocate(my_co_scheduler_cache);
}
//! Insert scheduler to the current available place.
//! Replace an old value, if necessary.
void push(task_dispatcher* s) {
task_dispatcher* to_cleanup = nullptr;
{
tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex);
// Check if we are replacing an existing buffer entry
if (my_co_scheduler_cache[my_head] != nullptr) {
to_cleanup = my_co_scheduler_cache[my_head];
}
// Store the cached value
my_co_scheduler_cache[my_head] = s;
// Move head index to the next slot
my_head = next_index();
}
// Cleanup replaced buffer if any
if (to_cleanup) {
internal_task_dispatcher_cleanup(to_cleanup);
}
}
//! Get a cached scheduler if any
task_dispatcher* pop() {
tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex);
// No cached coroutine
if (internal_empty()) {
return nullptr;
}
// Move head index to the currently available value
my_head = prev_index();
// Retrieve the value from the buffer
task_dispatcher* to_return = my_co_scheduler_cache[my_head];
// Clear the previous entry
my_co_scheduler_cache[my_head] = nullptr;
return to_return;
}
};
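// Usage sketch for the cache above (the capacity and the call sites are chosen by the arena
// and are not shown here):
//
//   arena_co_cache cache;
//   cache.init(/*cache_capacity=*/8);               // ring of 8 entries, all empty
//   cache.push(dispatcher);                         // may destroy the oldest cached entry
//   if (task_dispatcher* td = cache.pop()) { ... }  // LIFO reuse; nullptr when empty
//   cache.cleanup();                                // destroys whatever is still cached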
struct stack_anchor_type {
stack_anchor_type() = default;
stack_anchor_type(const stack_anchor_type&) = delete;
};
class atomic_flag {
static const std::uintptr_t SET = 1;
static const std::uintptr_t UNSET = 0;
std::atomic<std::uintptr_t> my_state{UNSET};
public:
bool test_and_set() {
std::uintptr_t state = my_state.load(std::memory_order_acquire);
switch (state) {
case SET:
return false;
default: /* busy */
if (my_state.compare_exchange_strong(state, SET)) {
// We interrupted a clear transaction
return false;
}
if (state != UNSET) {
// We lost our epoch
return false;
}
// We are too late but still in the same epoch
__TBB_fallthrough;
case UNSET:
return my_state.compare_exchange_strong(state, SET);
}
}
template <typename Pred>
bool try_clear_if(Pred&& pred) {
std::uintptr_t busy = std::uintptr_t(&busy);
std::uintptr_t state = my_state.load(std::memory_order_acquire);
if (state == SET && my_state.compare_exchange_strong(state, busy)) {
if (pred()) {
return my_state.compare_exchange_strong(busy, UNSET);
}
// The result of the next operation is discarded; false must always be returned in this case.
my_state.compare_exchange_strong(busy, SET);
}
return false;
}
void clear() {
my_state.store(UNSET, std::memory_order_release);
}
bool test(std::memory_order order = std::memory_order_acquire) {
return my_state.load(order) != UNSET;
}
};
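// This flag implements the set/clear protocol used by my_pool_state and
// my_mandatory_concurrency below: the producer requests workers only on a clean UNSET->SET
// transition, and the consumer releases them only if the predicate still holds while the flag
// is parked in the intermediate "busy" state. Sketch of the pattern (no_more_work() is a
// placeholder predicate):
//
//   if (flag.test_and_set()) {                              // advertise_new_work()
//       request_workers(+delta);
//   }
//   ...
//   if (flag.try_clear_if([] { return no_more_work(); })) { // out_of_work()
//       request_workers(-delta);
//   }
//
// try_clear_if() reports failure when it races with a concurrent test_and_set(), so a wakeup
// that arrives while the flag is being cleared is never lost.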
//! The structure of an arena, except the array of slots.
/** Separated in order to simplify padding.
Intrusive list node base class is used by market to form a list of arenas. **/
// TODO: Analyze arena_base cache lines placement
struct arena_base : padded<intrusive_list_node> {
//! The number of workers that have been marked out by the resource manager to service the arena.
std::atomic<unsigned> my_num_workers_allotted; // heavy use in stealing loop
//! Reference counter for the arena.
/** Worker and external thread references are counted separately: first several bits are for references
from external threads or explicit task_arenas (see arena::ref_external_bits below);
the rest counts the number of workers servicing the arena. */
std::atomic<unsigned> my_references; // heavy use in stealing loop
//! The maximal number of currently busy slots.
std::atomic<unsigned> my_limit; // heavy use in stealing loop
//! Task pool for the tasks scheduled via task::enqueue() method.
/** Such scheduling guarantees eventual execution even if
- new tasks are constantly coming (by extracting scheduled tasks in
relaxed FIFO order);
- the enqueuing thread does not call any of wait_for_all methods. **/
task_stream<front_accessor> my_fifo_task_stream; // heavy use in stealing loop
//! Task pool for the tasks scheduled via tbb::resume() function.
task_stream<front_accessor> my_resume_task_stream; // heavy use in stealing loop
#if __TBB_PREVIEW_CRITICAL_TASKS
//! Task pool for the tasks with critical property set.
/** Critical tasks are scheduled for execution ahead of other sources (including local task pool
and even bypassed tasks) unless the thread already executes a critical task in an outer
dispatch loop **/
// used on the hot path of the task dispatch loop
task_stream<back_nonnull_accessor> my_critical_task_stream;
#endif
//! The total number of workers that are requested from the resource manager.
int my_total_num_workers_requested;
//! The index in the array of per priority lists of arenas this object is in.
/*const*/ unsigned my_priority_level;
//! The max priority level of arena in permit manager.
std::atomic<bool> my_is_top_priority{false};
//! Current task pool state and estimate of available tasks amount.
atomic_flag my_pool_state;
//! The list of local observers attached to this arena.
observer_list my_observers;
#if __TBB_ARENA_BINDING
//! Pointer to internal observer that allows to bind threads in arena to certain NUMA node.
numa_binding_observer* my_numa_binding_observer;
#endif /*__TBB_ARENA_BINDING*/
// Below are rarely modified members
threading_control* my_threading_control;
//! Default task group context.
d1::task_group_context* my_default_ctx;
//! Waiting object for external threads that cannot join the arena.
concurrent_monitor my_exit_monitors;
//! Coroutines (task_dispatchers) cache buffer
arena_co_cache my_co_cache;
// arena needs an extra worker despite the arena limit
atomic_flag my_mandatory_concurrency;
// the number of local mandatory concurrency requests
int my_mandatory_requests;
//! The number of slots in the arena.
unsigned my_num_slots;
//! The number of reserved slots (can be occupied only by external threads).
unsigned my_num_reserved_slots;
//! The number of workers requested by the external thread owning the arena.
unsigned my_max_num_workers;
threading_control_client my_tc_client;
#if TBB_USE_ASSERT
//! Used to trap accesses to the object after its destruction.
std::uintptr_t my_guard;
#endif /* TBB_USE_ASSERT */
}; // struct arena_base
class arena: public padded<arena_base>
{
public:
using base_type = padded<arena_base>;
//! Types of work advertised by advertise_new_work()
enum new_work_type {
work_spawned,
wakeup,
work_enqueued
};
//! Constructor
arena(threading_control* control, unsigned max_num_workers, unsigned num_reserved_slots, unsigned priority_level);
//! Allocate an instance of arena.
static arena& allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots,
unsigned priority_level);
static arena& create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level);
static unsigned num_arena_slots ( unsigned num_slots, unsigned num_reserved_slots ) {
return num_reserved_slots == 0 ? num_slots : max(2u, num_slots);
}
static int allocation_size( unsigned num_slots ) {
return sizeof(base_type) + num_slots * (sizeof(mail_outbox) + sizeof(arena_slot) + sizeof(task_dispatcher));
}
//! Get reference to mailbox corresponding to given slot_id
mail_outbox& mailbox( d1::slot_id slot ) {
__TBB_ASSERT( slot != d1::no_slot, "affinity should be specified" );
return reinterpret_cast<mail_outbox*>(this)[-(int)(slot+1)]; // cast to 'int' is redundant but left for readability
}
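// Because of the negative indexing above, mailbox(0) is the mail_outbox that sits immediately
// below `this`, and mailbox(my_num_slots - 1) is the first object in the allocation, which is
// why free_arena() recovers the base address of the whole block with
// &mailbox(my_num_slots - 1) before deallocating it.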
//! Completes arena shutdown, destructs and deallocates it.
void free_arena();
//! The number of least significant bits for external references
static const unsigned ref_external_bits = 12; // up to 4095 external and 1M workers
//! Reference increment values for externals and workers
static const unsigned ref_external = 1;
static const unsigned ref_worker = 1 << ref_external_bits;
//! The number of workers active in the arena.
unsigned num_workers_active() const {
return my_references.load(std::memory_order_acquire) >> ref_external_bits;
}
//! Check if the recall is requested by the market.
bool is_recall_requested() const {
return num_workers_active() > my_num_workers_allotted.load(std::memory_order_relaxed);
}
void request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads = false);
//! If necessary, raise a flag that there is new job in arena.
template<arena::new_work_type work_type> void advertise_new_work();
//! Attempts to steal a task from a randomly chosen arena slot
d1::task* steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation);
//! Get a task from a global starvation resistant queue
template<task_stream_accessor_type accessor>
d1::task* get_stream_task(task_stream<accessor>& stream, unsigned& hint);
#if __TBB_PREVIEW_CRITICAL_TASKS
//! Tries to find a critical task in global critical task stream
d1::task* get_critical_task(unsigned& hint, isolation_type isolation);
#endif
//! Check if there is job anywhere in arena.
void out_of_work();
//! enqueue a task into starvation-resistance queue
void enqueue_task(d1::task&, d1::task_group_context&, thread_data&);
//! Registers the worker with the arena and enters TBB scheduler dispatch loop
void process(thread_data&);
//! Notification that the thread leaves its arena
void on_thread_leaving(unsigned ref_param);
//! Check for the presence of enqueued tasks
bool has_enqueued_tasks();
//! Check for the presence of any tasks
bool has_tasks();
bool is_empty() { return my_pool_state.test() == /* EMPTY */ false; }
thread_control_monitor& get_waiting_threads_monitor();
static const std::size_t out_of_arena = ~size_t(0);
//! Tries to occupy a slot in the arena. On success, returns the slot index; if no slot is available, returns out_of_arena.
template <bool as_worker>
std::size_t occupy_free_slot(thread_data&);
//! Tries to occupy a slot in the specified range.
std::size_t occupy_free_slot_in_range(thread_data& tls, std::size_t lower, std::size_t upper);
std::uintptr_t calculate_stealing_threshold();
unsigned priority_level() { return my_priority_level; }
bool has_request() { return my_total_num_workers_requested; }
unsigned references() const { return my_references.load(std::memory_order_acquire); }
bool is_arena_workerless() const { return my_max_num_workers == 0; }
void set_top_priority(bool);
bool is_top_priority() const;
bool try_join();
void set_allotment(unsigned allotment);
std::pair</*min workers = */ int, /*max workers = */ int> update_request(int mandatory_delta, int workers_delta);
/** Must be the last data field */
arena_slot my_slots[1];
}; // class arena
template <arena::new_work_type work_type>
void arena::advertise_new_work() {
bool is_mandatory_needed = false;
bool are_workers_needed = false;
if (work_type != work_spawned) {
// Local memory fence here and below is required to avoid missed wakeups; see the comment below.
// Starvation resistant tasks require concurrency, so missed wakeups are unacceptable.
atomic_fence_seq_cst();
}
if (work_type == work_enqueued && my_num_slots > my_num_reserved_slots) {
is_mandatory_needed = my_mandatory_concurrency.test_and_set();
}
// Double-check idiom that, in case of spawning, is deliberately sloppy about memory fences.
// Technically, to avoid missed wakeups, there should be a full memory fence between the point we
// released the task pool (i.e. spawned task) and read the arena's state. However, adding such a
// fence might hurt overall performance more than it helps, because the fence would be executed
// on every task pool release, even when stealing does not occur. Since TBB allows parallelism,
// but never promises parallelism, the missed wakeup is not a correctness problem.
are_workers_needed = my_pool_state.test_and_set();
if (is_mandatory_needed || are_workers_needed) {
int mandatory_delta = is_mandatory_needed ? 1 : 0;
int workers_delta = are_workers_needed ? my_max_num_workers : 0;
if (is_mandatory_needed && is_arena_workerless()) {
// Set workers_delta to 1 to keep arena invariants consistent
workers_delta = 1;
}
bool wakeup_workers = is_mandatory_needed || are_workers_needed;
request_workers(mandatory_delta, workers_delta, wakeup_workers);
}
}
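// In this snapshot the three work kinds map to the following call sites: work_enqueued is
// advertised from arena::enqueue_task() for starvation-resistant (FIFO) tasks, wakeup is
// advertised from arena_slot::get_task()/steal_task() when previously hidden tasks become
// visible again, and work_spawned corresponds to ordinary local spawns. Only the non-spawn
// kinds pay for the sequentially consistent fence above.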
inline d1::task* arena::steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation) {
auto slot_num_limit = my_limit.load(std::memory_order_relaxed);
if (slot_num_limit == 1) {
// No slots to steal from
return nullptr;
}
// Try to steal a task from a random victim.
std::size_t k = frnd.get() % (slot_num_limit - 1);
// The following condition excludes the external thread that might have
// already taken our previous place in the arena from the list
// of potential victims. But since such a situation can take
// place only in case of significant oversubscription, keeping
// the checks simple seems to be preferable to complicating the code.
if (k >= arena_index) {
++k; // Adjusts random distribution to exclude self
}
arena_slot* victim = &my_slots[k];
d1::task **pool = victim->task_pool.load(std::memory_order_relaxed);
d1::task *t = nullptr;
if (pool == EmptyTaskPool || !(t = victim->steal_task(*this, isolation, k))) {
return nullptr;
}
if (task_accessor::is_proxy_task(*t)) {
task_proxy &tp = *(task_proxy*)t;
d1::slot_id slot = tp.slot;
t = tp.extract_task<task_proxy::pool_bit>();
if (!t) {
// Proxy was empty, so it's our responsibility to free it
tp.allocator.delete_object(&tp, ed);
return nullptr;
}
// Note affinity is called for any stolen task (proxy or general)
ed.affinity_slot = slot;
} else {
// Note affinity is called for any stolen task (proxy or general)
ed.affinity_slot = d1::any_slot;
}
// Update task owner thread id to identify stealing
ed.original_slot = k;
return t;
}
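// The victim selection above draws k uniformly from the (slot_num_limit - 1) slots other than
// our own: k is first taken from [0, slot_num_limit - 2] and then shifted past arena_index.
// For example, with slot_num_limit == 4 and arena_index == 2 the raw values {0, 1, 2} map to
// victims {0, 1, 3}.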
template<task_stream_accessor_type accessor>
inline d1::task* arena::get_stream_task(task_stream<accessor>& stream, unsigned& hint) {
if (stream.empty())
return nullptr;
return stream.pop(subsequent_lane_selector(hint));
}
#if __TBB_PREVIEW_CRITICAL_TASKS
// Retrieves critical task respecting isolation level, if provided. The rule is:
// 1) If no outer critical task and no isolation => take any critical task
// 2) If working on an outer critical task and no isolation => cannot take any critical task
// 3) If no outer critical task but isolated => respect isolation
// 4) If working on an outer critical task and isolated => respect isolation
// Hint is used to keep some LIFO-ness, start search with the lane that was used during push operation.
inline d1::task* arena::get_critical_task(unsigned& hint, isolation_type isolation) {
if (my_critical_task_stream.empty())
return nullptr;
if ( isolation != no_isolation ) {
return my_critical_task_stream.pop_specific( hint, isolation );
} else {
return my_critical_task_stream.pop(preceding_lane_selector(hint));
}
}
#endif // __TBB_PREVIEW_CRITICAL_TASKS
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_arena_H */

219
third_party/tbb/arena_slot.cc vendored Normal file

@ -0,0 +1,219 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/arena_slot.hh"
#include "third_party/tbb/arena.hh"
#include "third_party/tbb/thread_data.hh"
namespace tbb {
namespace detail {
namespace r1 {
//------------------------------------------------------------------------
// Arena Slot
//------------------------------------------------------------------------
d1::task* arena_slot::get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation) {
__TBB_ASSERT(tail.load(std::memory_order_relaxed) <= T || is_local_task_pool_quiescent(),
"Is it safe to get a task at position T?");
d1::task* result = task_pool_ptr[T];
__TBB_ASSERT(!is_poisoned( result ), "The poisoned task is going to be processed");
if (!result) {
return nullptr;
}
bool omit = isolation != no_isolation && isolation != task_accessor::isolation(*result);
if (!omit && !task_accessor::is_proxy_task(*result)) {
return result;
} else if (omit) {
tasks_omitted = true;
return nullptr;
}
task_proxy& tp = static_cast<task_proxy&>(*result);
d1::slot_id aff_id = tp.slot;
if ( d1::task *t = tp.extract_task<task_proxy::pool_bit>() ) {
ed.affinity_slot = aff_id;
return t;
}
// Proxy was empty, so it's our responsibility to free it
tp.allocator.delete_object(&tp, ed);
if ( tasks_omitted ) {
task_pool_ptr[T] = nullptr;
}
return nullptr;
}
d1::task* arena_slot::get_task(execution_data_ext& ed, isolation_type isolation) {
__TBB_ASSERT(is_task_pool_published(), nullptr);
// The current task position in the task pool.
std::size_t T0 = tail.load(std::memory_order_relaxed);
// The bounds of available tasks in the task pool. H0 is only used when the head bound is reached.
std::size_t H0 = (std::size_t)-1, T = T0;
d1::task* result = nullptr;
bool task_pool_empty = false;
bool tasks_omitted = false;
do {
__TBB_ASSERT( !result, nullptr );
// The full fence is required to sync the store of `tail` with the load of `head` (write-read barrier)
T = --tail;
// The acquire load of head is required to guarantee consistency of our task pool
// when a thief rolls back the head.
if ( (std::intptr_t)( head.load(std::memory_order_acquire) ) > (std::intptr_t)T ) {
acquire_task_pool();
H0 = head.load(std::memory_order_relaxed);
if ( (std::intptr_t)H0 > (std::intptr_t)T ) {
// The thief has not backed off - nothing to grab.
__TBB_ASSERT( H0 == head.load(std::memory_order_relaxed)
&& T == tail.load(std::memory_order_relaxed)
&& H0 == T + 1, "victim/thief arbitration algorithm failure" );
reset_task_pool_and_leave();
// No tasks in the task pool.
task_pool_empty = true;
break;
} else if ( H0 == T ) {
// There is only one task in the task pool.
reset_task_pool_and_leave();
task_pool_empty = true;
} else {
// Release task pool if there are still some tasks.
// After the release, the tail will be less than T, thus a thief
// will not attempt to get a task at position T.
release_task_pool();
}
}
result = get_task_impl( T, ed, tasks_omitted, isolation );
if ( result ) {
poison_pointer( task_pool_ptr[T] );
break;
} else if ( !tasks_omitted ) {
poison_pointer( task_pool_ptr[T] );
__TBB_ASSERT( T0 == T+1, nullptr );
T0 = T;
}
} while ( !result && !task_pool_empty );
if ( tasks_omitted ) {
if ( task_pool_empty ) {
// All tasks have been checked. The task pool should be in reset state.
// We just restore the bounds for the available tasks.
// TODO: Does it have sense to move them to the beginning of the task pool?
__TBB_ASSERT( is_quiescent_local_task_pool_reset(), nullptr );
if ( result ) {
// If we have a task, it should be at H0 position.
__TBB_ASSERT( H0 == T, nullptr );
++H0;
}
__TBB_ASSERT( H0 <= T0, nullptr );
if ( H0 < T0 ) {
// Restore the task pool if there are some tasks.
head.store(H0, std::memory_order_relaxed);
tail.store(T0, std::memory_order_relaxed);
// The release fence is used in publish_task_pool.
publish_task_pool();
// Synchronize with snapshot as we published some tasks.
ed.task_disp->m_thread_data->my_arena->advertise_new_work<arena::wakeup>();
}
} else {
// A task has been obtained. We need to make a hole in position T.
__TBB_ASSERT( is_task_pool_published(), nullptr );
__TBB_ASSERT( result, nullptr );
task_pool_ptr[T] = nullptr;
tail.store(T0, std::memory_order_release);
// Synchronize with snapshot as we published some tasks.
// TODO: consider some approach not to call wakeup for each time. E.g. check if the tail reached the head.
ed.task_disp->m_thread_data->my_arena->advertise_new_work<arena::wakeup>();
}
}
__TBB_ASSERT( (std::intptr_t)tail.load(std::memory_order_relaxed) >= 0, nullptr );
__TBB_ASSERT( result || tasks_omitted || is_quiescent_local_task_pool_reset(), nullptr );
return result;
}
d1::task* arena_slot::steal_task(arena& a, isolation_type isolation, std::size_t slot_index) {
d1::task** victim_pool = lock_task_pool();
if (!victim_pool) {
return nullptr;
}
d1::task* result = nullptr;
std::size_t H = head.load(std::memory_order_relaxed); // mirror
std::size_t H0 = H;
bool tasks_omitted = false;
do {
// The full fence is required to sync the store of `head` with the load of `tail` (write-read barrier)
H = ++head;
// The acquire load of tail is required to guarantee consistency of victim_pool
// because the owner synchronizes task spawning via tail.
if ((std::intptr_t)H > (std::intptr_t)(tail.load(std::memory_order_acquire))) {
// Stealing attempt failed; the deque contents have not been changed by us
head.store( /*dead: H = */ H0, std::memory_order_relaxed );
__TBB_ASSERT( !result, nullptr );
goto unlock;
}
result = victim_pool[H-1];
__TBB_ASSERT( !is_poisoned( result ), nullptr );
if (result) {
if (isolation == no_isolation || isolation == task_accessor::isolation(*result)) {
if (!task_accessor::is_proxy_task(*result)) {
break;
}
task_proxy& tp = *static_cast<task_proxy*>(result);
// If mailed task is likely to be grabbed by its destination thread, skip it.
if (!task_proxy::is_shared(tp.task_and_tag) || !tp.outbox->recipient_is_idle() || a.mailbox(slot_index).recipient_is_idle()) {
break;
}
}
// The task cannot be executed either due to isolation or proxy constraints.
result = nullptr;
tasks_omitted = true;
} else if (!tasks_omitted) {
// Clean holes out of the task pool until the first task is omitted.
__TBB_ASSERT( H0 == H-1, nullptr );
poison_pointer( victim_pool[H0] );
H0 = H;
}
} while (!result);
__TBB_ASSERT( result, nullptr );
// emit "task was consumed" signal
poison_pointer( victim_pool[H-1] );
if (tasks_omitted) {
// Some proxies in the task pool have been omitted. Set the stolen task to nullptr.
victim_pool[H-1] = nullptr;
// The release store synchronizes the victim_pool update (the store of nullptr).
head.store( /*dead: H = */ H0, std::memory_order_release );
}
unlock:
unlock_task_pool(victim_pool);
#if __TBB_PREFETCHING
// This slot is the victim here; evict its hot fields from our cache.
__TBB_cl_evict(&head);
__TBB_cl_evict(&tail);
#endif
if (tasks_omitted) {
// Synchronize with snapshot as the head and tail can be bumped which can falsely trigger EMPTY state
a.advertise_new_work<arena::wakeup>();
}
return result;
}
} // namespace r1
} // namespace detail
} // namespace tbb

415
third_party/tbb/arena_slot.hh vendored Normal file

@ -0,0 +1,415 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_arena_slot_H
#define _TBB_arena_slot_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/tbb/detail/_task.hh"
#include "third_party/tbb/cache_aligned_allocator.hh"
#include "third_party/tbb/misc.hh"
#include "third_party/tbb/mailbox.hh"
#include "third_party/tbb/scheduler_common.hh"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace r1 {
class arena;
class task_group_context;
//--------------------------------------------------------------------------------------------------------
// Arena Slot
//--------------------------------------------------------------------------------------------------------
static d1::task** const EmptyTaskPool = nullptr;
static d1::task** const LockedTaskPool = reinterpret_cast<d1::task**>(~std::intptr_t(0));
struct alignas(max_nfs_size) arena_slot_shared_state {
//! Scheduler of the thread attached to the slot
/** Marks the slot as busy, and is used to iterate through the schedulers belonging to this arena **/
std::atomic<bool> my_is_occupied;
// Synchronization of access to Task pool
/** Also is used to specify if the slot is empty or locked:
0 - empty
-1 - locked **/
std::atomic<d1::task**> task_pool;
//! Index of the first ready task in the deque.
/** Modified by thieves, and by the owner during compaction/reallocation **/
std::atomic<std::size_t> head;
};
struct alignas(max_nfs_size) arena_slot_private_state {
//! Hint provided for operations with the container of starvation-resistant tasks.
/** Modified by the owner thread (during these operations). **/
unsigned hint_for_fifo_stream;
#if __TBB_PREVIEW_CRITICAL_TASKS
//! Similar to 'hint_for_fifo_stream' but for critical tasks.
unsigned hint_for_critical_stream;
#endif
//! Similar to 'hint_for_fifo_stream' but for the resume tasks.
unsigned hint_for_resume_stream;
//! Index of the element following the last ready task in the deque.
/** Modified by the owner thread. **/
std::atomic<std::size_t> tail;
//! Capacity of the primary task pool (number of elements - pointers to task).
std::size_t my_task_pool_size;
//! Task pool of the scheduler that owns this slot
// TODO: previously was task**__TBB_atomic, but seems like not accessed on other thread
d1::task** task_pool_ptr;
};
class arena_slot : private arena_slot_shared_state, private arena_slot_private_state {
friend class arena;
friend class outermost_worker_waiter;
friend class task_dispatcher;
friend class thread_data;
friend class nested_arena_context;
//! The original task dispatcher associated with this slot
task_dispatcher* my_default_task_dispatcher;
#if TBB_USE_ASSERT
void fill_with_canary_pattern ( std::size_t first, std::size_t last ) {
for ( std::size_t i = first; i < last; ++i )
poison_pointer(task_pool_ptr[i]);
}
#else
void fill_with_canary_pattern ( size_t, std::size_t ) {}
#endif /* TBB_USE_ASSERT */
static constexpr std::size_t min_task_pool_size = 64;
void allocate_task_pool( std::size_t n ) {
std::size_t byte_size = ((n * sizeof(d1::task*) + max_nfs_size - 1) / max_nfs_size) * max_nfs_size;
my_task_pool_size = byte_size / sizeof(d1::task*);
task_pool_ptr = (d1::task**)cache_aligned_allocate(byte_size);
// No need to clear the fresh deque since valid items are designated by the head and tail members.
// But fill it with a canary pattern in the high vigilance debug mode.
fill_with_canary_pattern( 0, my_task_pool_size );
}
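// Worked example of the rounding above, assuming the usual 128-byte max_nfs_size and 8-byte
// task pointers: a request for min_task_pool_size (64) slots needs 512 bytes, already a
// multiple of 128, so my_task_pool_size stays 64; a request for 65 slots needs 520 bytes,
// which rounds up to 640, so the pool actually holds 80 pointers.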
public:
//! Deallocate task pool that was allocated by means of allocate_task_pool.
void free_task_pool( ) {
// TODO: understand the assertion and modify
// __TBB_ASSERT( !task_pool /* TODO: == EmptyTaskPool */, nullptr);
if( task_pool_ptr ) {
__TBB_ASSERT( my_task_pool_size, nullptr);
cache_aligned_deallocate( task_pool_ptr );
task_pool_ptr = nullptr;
my_task_pool_size = 0;
}
}
//! Get a task from the local pool.
/** Called only by the pool owner.
Returns the pointer to the task or nullptr if a suitable task is not found.
Resets the pool if it is empty. **/
d1::task* get_task(execution_data_ext&, isolation_type);
//! Steal task from slot's ready pool
d1::task* steal_task(arena&, isolation_type, std::size_t);
//! Some thread is now the owner of this slot
void occupy() {
__TBB_ASSERT(!my_is_occupied.load(std::memory_order_relaxed), nullptr);
my_is_occupied.store(true, std::memory_order_release);
}
//! Try to occupy the slot
bool try_occupy() {
return !is_occupied() && my_is_occupied.exchange(true) == false;
}
//! The owning thread releases this slot
void release() {
__TBB_ASSERT(my_is_occupied.load(std::memory_order_relaxed), nullptr);
my_is_occupied.store(false, std::memory_order_release);
}
//! Spawn newly created tasks
void spawn(d1::task& t) {
std::size_t T = prepare_task_pool(1);
__TBB_ASSERT(is_poisoned(task_pool_ptr[T]), nullptr);
task_pool_ptr[T] = &t;
commit_spawned_tasks(T + 1);
if (!is_task_pool_published()) {
publish_task_pool();
}
}
bool is_task_pool_published() const {
return task_pool.load(std::memory_order_relaxed) != EmptyTaskPool;
}
bool is_empty() const {
return task_pool.load(std::memory_order_relaxed) == EmptyTaskPool ||
head.load(std::memory_order_relaxed) >= tail.load(std::memory_order_relaxed);
}
bool is_occupied() const {
return my_is_occupied.load(std::memory_order_relaxed);
}
task_dispatcher& default_task_dispatcher() {
__TBB_ASSERT(my_default_task_dispatcher != nullptr, nullptr);
return *my_default_task_dispatcher;
}
void init_task_streams(unsigned h) {
hint_for_fifo_stream = h;
#if __TBB_RESUMABLE_TASKS
hint_for_resume_stream = h;
#endif
#if __TBB_PREVIEW_CRITICAL_TASKS
hint_for_critical_stream = h;
#endif
}
#if __TBB_PREVIEW_CRITICAL_TASKS
unsigned& critical_hint() {
return hint_for_critical_stream;
}
#endif
private:
//! Get a task from the local pool at specified location T.
/** Returns the pointer to the task or nullptr if the task cannot be executed,
e.g. proxy has been deallocated or isolation constraint is not met.
tasks_omitted tells if some tasks have been omitted.
Called only by the pool owner. The caller should guarantee that the
position T is not available for a thief. **/
d1::task* get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation);
//! Makes sure that the task pool can accommodate at least n more elements
/** If necessary relocates existing task pointers or grows the ready task deque.
* Returns the (possibly updated) tail index (not accounting for n). **/
std::size_t prepare_task_pool(std::size_t num_tasks) {
std::size_t T = tail.load(std::memory_order_relaxed); // mirror
if ( T + num_tasks <= my_task_pool_size ) {
return T;
}
std::size_t new_size = num_tasks;
if ( !my_task_pool_size ) {
__TBB_ASSERT( !is_task_pool_published() && is_quiescent_local_task_pool_reset(), nullptr);
__TBB_ASSERT( !task_pool_ptr, nullptr);
if ( num_tasks < min_task_pool_size ) new_size = min_task_pool_size;
allocate_task_pool( new_size );
return 0;
}
acquire_task_pool();
std::size_t H = head.load(std::memory_order_relaxed); // mirror
d1::task** new_task_pool = task_pool_ptr;
__TBB_ASSERT( my_task_pool_size >= min_task_pool_size, nullptr);
// Count not skipped tasks. Consider using std::count_if.
for ( std::size_t i = H; i < T; ++i )
if ( new_task_pool[i] ) ++new_size;
// If the free space at the beginning of the task pool is too short, we
// are likely facing a pathological single-producer-multiple-consumers
// scenario, and thus it's better to expand the task pool
bool allocate = new_size > my_task_pool_size - min_task_pool_size/4;
if ( allocate ) {
// Grow task pool. As this operation is rare, and its cost is asymptotically
// amortizable, we can tolerate new task pool allocation done under the lock.
if ( new_size < 2 * my_task_pool_size )
new_size = 2 * my_task_pool_size;
allocate_task_pool( new_size ); // updates my_task_pool_size
}
// Filter out skipped tasks. Consider using std::copy_if.
std::size_t T1 = 0;
for ( std::size_t i = H; i < T; ++i ) {
if ( new_task_pool[i] ) {
task_pool_ptr[T1++] = new_task_pool[i];
}
}
// Deallocate the previous task pool if a new one has been allocated.
if ( allocate )
cache_aligned_deallocate( new_task_pool );
else
fill_with_canary_pattern( T1, tail );
// Publish the new state.
commit_relocated_tasks( T1 );
// assert_task_pool_valid();
return T1;
}
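// Worked example of the grow-or-compact decision above, for my_task_pool_size == 64
// (so the threshold is 64 - min_task_pool_size/4 == 48) and num_tasks == 1:
//   * 30 live entries between H and T -> new_size == 31 <= 48: compact in place; the live
//     tasks are packed to the front and T1 == 30 is returned;
//   * 50 live entries                 -> new_size == 51 >  48: a new pool of 2 * 64 == 128
//     pointers is allocated, the tasks are copied into it and the old pool is deallocated.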
//! Makes newly spawned tasks visible to thieves
void commit_spawned_tasks(std::size_t new_tail) {
__TBB_ASSERT (new_tail <= my_task_pool_size, "task deque end was overwritten");
// emit "task was released" signal
// Release fence is necessary to make sure that previously stored task pointers
// are visible to thieves.
tail.store(new_tail, std::memory_order_release);
}
//! Used by workers to enter the task pool
/** Does not lock the task pool if the arena slot has been successfully grabbed. **/
void publish_task_pool() {
__TBB_ASSERT ( task_pool == EmptyTaskPool, "someone else grabbed my arena slot?" );
__TBB_ASSERT ( head.load(std::memory_order_relaxed) < tail.load(std::memory_order_relaxed),
"entering arena without tasks to share" );
// Release signal on behalf of previously spawned tasks (when this thread was not in arena yet)
task_pool.store(task_pool_ptr, std::memory_order_release );
}
//! Locks the local task pool
/** Garbles task_pool for the duration of the lock. Requires correctly set task_pool_ptr.
ATTENTION: This method is mostly the same as generic_scheduler::lock_task_pool(), with
slightly different logic for the slot state checks (the slot is either locked or points
to our task pool). Thus if either of them is changed, consider changing the counterpart as well. **/
void acquire_task_pool() {
if (!is_task_pool_published()) {
return; // we are not in arena - nothing to lock
}
bool sync_prepare_done = false;
for( atomic_backoff b;;b.pause() ) {
#if TBB_USE_ASSERT
// Local copy of the arena slot task pool pointer is necessary for the next
// assertion to work correctly to exclude asynchronous state transition effect.
d1::task** tp = task_pool.load(std::memory_order_relaxed);
__TBB_ASSERT( tp == LockedTaskPool || tp == task_pool_ptr, "slot ownership corrupt?" );
#endif
d1::task** expected = task_pool_ptr;
if( task_pool.load(std::memory_order_relaxed) != LockedTaskPool &&
task_pool.compare_exchange_strong(expected, LockedTaskPool ) ) {
// We acquired our own slot
break;
} else if( !sync_prepare_done ) {
// Start waiting
sync_prepare_done = true;
}
// Someone else acquired a lock, so pause and do exponential backoff.
}
__TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "not really acquired task pool" );
}
//! Unlocks the local task pool
/** Restores task_pool munged by acquire_task_pool. Requires
correctly set task_pool_ptr. **/
void release_task_pool() {
if ( task_pool.load(std::memory_order_relaxed) == EmptyTaskPool )
return; // we are not in arena - nothing to unlock
__TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "arena slot is not locked" );
task_pool.store( task_pool_ptr, std::memory_order_release );
}
//! Locks victim's task pool, and returns pointer to it. The pointer can be nullptr.
/** Garbles victim_arena_slot->task_pool for the duration of the lock. **/
d1::task** lock_task_pool() {
d1::task** victim_task_pool;
for ( atomic_backoff backoff;; /*backoff pause embedded in the loop*/) {
victim_task_pool = task_pool.load(std::memory_order_relaxed);
// Microbenchmarks demonstrated that aborting a stealing attempt when the
// victim's task pool is locked degrades performance.
// NOTE: Do not use comparison of head and tail indices to check for
// the presence of work in the victim's task pool, as they may give
// incorrect indication because of task pool relocations and resizes.
if (victim_task_pool == EmptyTaskPool) {
break;
}
d1::task** expected = victim_task_pool;
if (victim_task_pool != LockedTaskPool && task_pool.compare_exchange_strong(expected, LockedTaskPool) ) {
// We've locked victim's task pool
break;
}
// Someone else acquired a lock, so pause and do exponential backoff.
backoff.pause();
}
__TBB_ASSERT(victim_task_pool == EmptyTaskPool ||
(task_pool.load(std::memory_order_relaxed) == LockedTaskPool &&
victim_task_pool != LockedTaskPool), "not really locked victim's task pool?");
return victim_task_pool;
}
//! Unlocks victim's task pool
/** Restores victim_arena_slot->task_pool munged by lock_task_pool. **/
void unlock_task_pool(d1::task** victim_task_pool) {
__TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "victim arena slot is not locked");
__TBB_ASSERT(victim_task_pool != LockedTaskPool, nullptr);
task_pool.store(victim_task_pool, std::memory_order_release);
}
#if TBB_USE_ASSERT
bool is_local_task_pool_quiescent() const {
d1::task** tp = task_pool.load(std::memory_order_relaxed);
return tp == EmptyTaskPool || tp == LockedTaskPool;
}
bool is_quiescent_local_task_pool_empty() const {
__TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent");
return head.load(std::memory_order_relaxed) == tail.load(std::memory_order_relaxed);
}
bool is_quiescent_local_task_pool_reset() const {
__TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent");
return head.load(std::memory_order_relaxed) == 0 && tail.load(std::memory_order_relaxed) == 0;
}
#endif // TBB_USE_ASSERT
//! Leave the task pool
/** Leaving task pool automatically releases the task pool if it is locked. **/
void leave_task_pool() {
__TBB_ASSERT(is_task_pool_published(), "Not in arena");
// Do not reset my_arena_index. It will be used to (attempt to) re-acquire the slot next time
__TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when leaving arena");
__TBB_ASSERT(is_quiescent_local_task_pool_empty(), "Cannot leave arena when the task pool is not empty");
// No release fence is necessary here as this assignment precludes external
// accesses to the local task pool when it becomes visible. Thus it is harmless
// if it gets hoisted above preceding local bookkeeping manipulations.
task_pool.store(EmptyTaskPool, std::memory_order_relaxed);
}
//! Resets head and tail indices to 0, and leaves task pool
/** The task pool must be locked by the owner (via acquire_task_pool).**/
void reset_task_pool_and_leave() {
__TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when resetting task pool");
tail.store(0, std::memory_order_relaxed);
head.store(0, std::memory_order_relaxed);
leave_task_pool();
}
//! Makes relocated tasks visible to thieves and releases the local task pool.
/** Obviously, the task pool must be locked when calling this method. **/
void commit_relocated_tasks(std::size_t new_tail) {
__TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool must be locked when calling commit_relocated_tasks()");
head.store(0, std::memory_order_relaxed);
// Tail is updated last to minimize the probability that a thread taking an arena
// snapshot is misled into thinking that this task pool is empty.
tail.store(new_tail, std::memory_order_release);
release_task_pool();
}
};
} // namespace r1
} // namespace detail
} // namespace tbb
#endif // __TBB_arena_slot_H
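
The methods above all follow one locking protocol: spin with exponential backoff until a compare-and-swap replaces the published task-pool pointer with the LockedTaskPool sentinel, then store the saved pointer back to unlock. Below is a minimal standalone sketch of that protocol with invented names and a plain std::atomic; it is not the real arena_slot interface.

// Hypothetical stand-in for the lock-via-sentinel protocol used by arena_slot.
#include <atomic>
#include <cstdint>
#include <thread>

namespace sketch {

int* const kLocked = reinterpret_cast<int*>(~std::uintptr_t(0));  // plays the role of LockedTaskPool

struct slot {
    std::atomic<int*> pool{nullptr};  // nullptr plays the role of EmptyTaskPool

    // Lock the pool the way lock_task_pool() does: never abort because the
    // victim is momentarily locked, just back off and retry.
    int* lock() {
        int* victim;
        for (int backoff = 1;;) {
            victim = pool.load(std::memory_order_relaxed);
            if (victim == nullptr) break;          // empty pool: nothing to lock
            int* expected = victim;
            if (victim != kLocked && pool.compare_exchange_strong(expected, kLocked))
                break;                             // we own the pool now
            for (int i = 0; i < backoff; ++i) std::this_thread::yield();
            if (backoff < 1024) backoff *= 2;      // exponential backoff
        }
        return victim;
    }

    // Unlock by publishing the saved pointer, as unlock_task_pool() does.
    void unlock(int* restored) { pool.store(restored, std::memory_order_release); }
};

} // namespace sketch
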

98
third_party/tbb/assert_impl.hh vendored Normal file
View file

@@ -0,0 +1,98 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_assert_impl_H
#define __TBB_assert_impl_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/cstdlib"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/cstdarg"
#if _MSC_VER && _DEBUG
// MISSING #include <crtdbg.h>
#endif
#include "third_party/libcxx/mutex"
#if __TBBMALLOC_BUILD
namespace rml { namespace internal {
#else
namespace tbb {
namespace detail {
namespace r1 {
#endif
// TODO: consider extension for formatted error description string
static void assertion_failure_impl(const char* location, int line, const char* expression, const char* comment) {
std::fprintf(stderr, "Assertion %s failed (located in the %s function, line in file: %d)\n",
expression, location, line);
if (comment) {
std::fprintf(stderr, "Detailed description: %s\n", comment);
}
#if _MSC_VER && _DEBUG
if (1 == _CrtDbgReport(_CRT_ASSERT, location, line, "tbb_debug.dll", "%s\r\n%s", expression, comment?comment:"")) {
_CrtDbgBreak();
} else
#endif
{
std::fflush(stderr);
std::abort();
}
}
// Do not move the definition into the assertion_failure function because it would require "magic statics".
// That would bring a dependency on the C++ runtime on some platforms, while assert_impl.h is reused in
// tbbmalloc, which should not depend on the C++ runtime.
static std::atomic<tbb::detail::do_once_state> assertion_state;
void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment) {
#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
// Workaround for erroneous "unreachable code" during assertion throwing using call_once
#pragma warning (push)
#pragma warning (disable: 4702)
#endif
// We cannot use std::call_once because it brings a dependency on the C++ runtime on some platforms,
// while assert_impl.h is reused in tbbmalloc, which should not depend on the C++ runtime.
atomic_do_once([&](){ assertion_failure_impl(location, line, expression, comment); }, assertion_state);
#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
#pragma warning (pop)
#endif
}
//! Report a runtime warning.
void runtime_warning( const char* format, ... ) {
char str[1024]; std::memset(str, 0, 1024);
va_list args; va_start(args, format);
vsnprintf( str, 1024-1, format, args);
va_end(args);
fprintf(stderr, "TBB Warning: %s\n", str);
}
#if __TBBMALLOC_BUILD
}} // namespaces rml::internal
#else
} // namespace r1
} // namespace detail
} // namespace tbb
#endif
#endif // __TBB_assert_impl_H
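
assertion_failure() above is the sink that TBB's assertion macros expand into. Here is a hedged sketch of such a caller-side macro; MY_ASSERT is hypothetical, and the real __TBB_ASSERT machinery in third_party/tbb/detail/_assert.hh may differ in detail.

#include "third_party/tbb/detail/_assert.hh"   // declares assertion_failure()

// Hypothetical caller-side macro: a failed predicate is routed into
// tbb::detail::r1::assertion_failure() with the function name and line.
#define MY_ASSERT(predicate, message) \
    ((predicate) ? (void)0            \
                 : tbb::detail::r1::assertion_failure(__func__, __LINE__, #predicate, (message)))

void check_pointer(const int* p) {
    MY_ASSERT(p != nullptr, "check_pointer() requires a non-null argument");
}
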

171
third_party/tbb/blocked_range.hh vendored Normal file
View file

@@ -0,0 +1,171 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_blocked_range_H
#define __TBB_blocked_range_H
#include "third_party/libcxx/cstddef"
#include "third_party/tbb/detail/_range_common.hh"
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/version.hh"
namespace tbb {
namespace detail {
namespace d1 {
/** \page range_req Requirements on range concept
Class \c R implementing the concept of range must define:
- \code R::R( const R& ); \endcode Copy constructor
- \code R::~R(); \endcode Destructor
- \code bool R::is_divisible() const; \endcode True if range can be partitioned into two subranges
- \code bool R::empty() const; \endcode True if range is empty
- \code R::R( R& r, split ); \endcode Split range \c r into two subranges.
**/
//! A range over which to iterate.
/** @ingroup algorithms */
template<typename Value>
__TBB_requires(blocked_range_value<Value>)
class blocked_range {
public:
//! Type of a value
/** Called a const_iterator for the sake of algorithms that need to treat a blocked_range
as an STL container. */
using const_iterator = Value;
//! Type for size of a range
using size_type = std::size_t;
//! Construct range over half-open interval [begin,end), with the given grainsize.
blocked_range( Value begin_, Value end_, size_type grainsize_=1 ) :
my_end(end_), my_begin(begin_), my_grainsize(grainsize_)
{
__TBB_ASSERT( my_grainsize>0, "grainsize must be positive" );
}
//! Beginning of range.
const_iterator begin() const { return my_begin; }
//! One past last value in range.
const_iterator end() const { return my_end; }
//! Size of the range
/** Unspecified if end()<begin(). */
size_type size() const {
__TBB_ASSERT( !(end()<begin()), "size() unspecified if end()<begin()" );
return size_type(my_end-my_begin);
}
//! The grain size for this range.
size_type grainsize() const { return my_grainsize; }
//------------------------------------------------------------------------
// Methods that implement Range concept
//------------------------------------------------------------------------
//! True if range is empty.
bool empty() const { return !(my_begin<my_end); }
//! True if range is divisible.
/** Unspecified if end()<begin(). */
bool is_divisible() const { return my_grainsize<size(); }
//! Split range.
/** The new Range *this has the second part, the old range r has the first part.
Unspecified if end()<begin() or !is_divisible(). */
blocked_range( blocked_range& r, split ) :
my_end(r.my_end),
my_begin(do_split(r, split())),
my_grainsize(r.my_grainsize)
{
// only comparison 'less than' is required from values of blocked_range objects
__TBB_ASSERT( !(my_begin < r.my_end) && !(r.my_end < my_begin), "blocked_range has been split incorrectly" );
}
//! Split range.
/** The new Range *this has the second part split according to specified proportion, the old range r has the first part.
Unspecified if end()<begin() or !is_divisible(). */
blocked_range( blocked_range& r, proportional_split& proportion ) :
my_end(r.my_end),
my_begin(do_split(r, proportion)),
my_grainsize(r.my_grainsize)
{
// only comparison 'less than' is required from values of blocked_range objects
__TBB_ASSERT( !(my_begin < r.my_end) && !(r.my_end < my_begin), "blocked_range has been split incorrectly" );
}
private:
/** NOTE: my_end MUST be declared before my_begin, otherwise the splitting constructor will break. */
Value my_end;
Value my_begin;
size_type my_grainsize;
//! Auxiliary function used by the splitting constructor.
static Value do_split( blocked_range& r, split )
{
__TBB_ASSERT( r.is_divisible(), "cannot split blocked_range that is not divisible" );
Value middle = r.my_begin + (r.my_end - r.my_begin) / 2u;
r.my_end = middle;
return middle;
}
static Value do_split( blocked_range& r, proportional_split& proportion )
{
__TBB_ASSERT( r.is_divisible(), "cannot split blocked_range that is not divisible" );
// 32-bit floating point arithmetic is not precise enough to handle ranges of
// more than 2^24 iterations accurately. However, even on ranges with 2^64
// iterations the computational error is approximately 0.000001%, which has
// little impact on the uniform distribution of such a range's iterations (assuming
// all iterations take equal time to complete). See 'test_partitioner_whitebox'
// for an implementation of an exact split algorithm.
size_type right_part = size_type(float(r.size()) * float(proportion.right())
/ float(proportion.left() + proportion.right()) + 0.5f);
return r.my_end = Value(r.my_end - right_part);
}
template<typename RowValue, typename ColValue>
__TBB_requires(blocked_range_value<RowValue> &&
blocked_range_value<ColValue>)
friend class blocked_range2d;
template<typename RowValue, typename ColValue, typename PageValue>
__TBB_requires(blocked_range_value<RowValue> &&
blocked_range_value<ColValue> &&
blocked_range_value<PageValue>)
friend class blocked_range3d;
template<typename DimValue, unsigned int N, typename>
__TBB_requires(blocked_range_value<DimValue>)
friend class blocked_rangeNd_impl;
};
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::blocked_range;
// Split types
using detail::split;
using detail::proportional_split;
} // namespace v1
} // namespace tbb
#endif /* __TBB_blocked_range_H */
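
A short usage sketch for blocked_range: the grainsize is the threshold below which a subrange is no longer split. The parallel_for include path is assumed from this port's layout (it is not part of this excerpt); standard library headers are spelled the usual way for brevity.

#include "third_party/tbb/blocked_range.hh"
#include "third_party/tbb/parallel_for.hh"   // assumed path for the vendored parallel_for
#include <cstddef>
#include <vector>

void scale(std::vector<float>& v, float factor) {
    // A subrange stops being split once its size is at most the grainsize (1024 here);
    // each final subrange is processed by one task.
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, v.size(), /*grainsize=*/1024),
                      [&](const tbb::blocked_range<std::size_t>& r) {
                          for (std::size_t i = r.begin(); i != r.end(); ++i)
                              v[i] *= factor;
                      });
}
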

112
third_party/tbb/blocked_range2d.hh vendored Normal file
View file

@@ -0,0 +1,112 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_blocked_range2d_H
#define __TBB_blocked_range2d_H
#include "third_party/libcxx/cstddef"
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_range_common.hh"
#include "third_party/tbb/blocked_range.hh"
namespace tbb {
namespace detail {
namespace d1 {
//! A 2-dimensional range that models the Range concept.
/** @ingroup algorithms */
template<typename RowValue, typename ColValue = RowValue>
__TBB_requires(blocked_range_value<RowValue> &&
blocked_range_value<ColValue>)
class blocked_range2d {
public:
//! Type for size of an iteration range
using row_range_type = blocked_range<RowValue>;
using col_range_type = blocked_range<ColValue>;
private:
row_range_type my_rows;
col_range_type my_cols;
public:
blocked_range2d( RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize,
ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) :
my_rows(row_begin,row_end,row_grainsize),
my_cols(col_begin,col_end,col_grainsize)
{}
blocked_range2d( RowValue row_begin, RowValue row_end,
ColValue col_begin, ColValue col_end ) :
my_rows(row_begin,row_end),
my_cols(col_begin,col_end)
{}
//! True if range is empty
bool empty() const {
// Range is empty if at least one dimension is empty.
return my_rows.empty() || my_cols.empty();
}
//! True if range is divisible into two pieces.
bool is_divisible() const {
return my_rows.is_divisible() || my_cols.is_divisible();
}
blocked_range2d( blocked_range2d& r, split ) :
my_rows(r.my_rows),
my_cols(r.my_cols)
{
split split_obj;
do_split(r, split_obj);
}
blocked_range2d( blocked_range2d& r, proportional_split& proportion ) :
my_rows(r.my_rows),
my_cols(r.my_cols)
{
do_split(r, proportion);
}
//! The rows of the iteration space
const row_range_type& rows() const { return my_rows; }
//! The columns of the iteration space
const col_range_type& cols() const { return my_cols; }
private:
template <typename Split>
void do_split( blocked_range2d& r, Split& split_obj ) {
if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) {
my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj);
} else {
my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj);
}
}
};
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::blocked_range2d;
} // namespace v1
} // namespace tbb
#endif /* __TBB_blocked_range2d_H */
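
Usage sketch: do_split() above always bisects the dimension whose size-to-grainsize ratio is larger, so a parallel_for over a blocked_range2d hands out tiles that stay balanced relative to the two grainsizes. The parallel_for include path is assumed from this port's layout.

#include "third_party/tbb/blocked_range2d.hh"
#include "third_party/tbb/parallel_for.hh"   // assumed path for the vendored parallel_for
#include <cstddef>

void clear_matrix(float* m, std::size_t nrows, std::size_t ncols) {
    tbb::parallel_for(
        tbb::blocked_range2d<std::size_t>(0, nrows, /*row grainsize*/ 16,
                                          0, ncols, /*col grainsize*/ 64),
        [=](const tbb::blocked_range2d<std::size_t>& r) {
            // r describes one tile: a row subrange times a column subrange.
            for (std::size_t i = r.rows().begin(); i != r.rows().end(); ++i)
                for (std::size_t j = r.cols().begin(); j != r.cols().end(); ++j)
                    m[i * ncols + j] = 0.0f;
        });
}
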

131
third_party/tbb/blocked_range3d.hh vendored Normal file
View file

@@ -0,0 +1,131 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_blocked_range3d_H
#define __TBB_blocked_range3d_H
#include "third_party/libcxx/cstddef"
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/blocked_range.hh"
namespace tbb {
namespace detail {
namespace d1 {
//! A 3-dimensional range that models the Range concept.
/** @ingroup algorithms */
template<typename PageValue, typename RowValue = PageValue, typename ColValue = RowValue>
__TBB_requires(blocked_range_value<PageValue> &&
blocked_range_value<RowValue> &&
blocked_range_value<ColValue>)
class blocked_range3d {
public:
//! Type for size of an iteration range
using page_range_type = blocked_range<PageValue>;
using row_range_type = blocked_range<RowValue>;
using col_range_type = blocked_range<ColValue>;
private:
page_range_type my_pages;
row_range_type my_rows;
col_range_type my_cols;
public:
blocked_range3d( PageValue page_begin, PageValue page_end,
RowValue row_begin, RowValue row_end,
ColValue col_begin, ColValue col_end ) :
my_pages(page_begin,page_end),
my_rows(row_begin,row_end),
my_cols(col_begin,col_end)
{}
blocked_range3d( PageValue page_begin, PageValue page_end, typename page_range_type::size_type page_grainsize,
RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize,
ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) :
my_pages(page_begin,page_end,page_grainsize),
my_rows(row_begin,row_end,row_grainsize),
my_cols(col_begin,col_end,col_grainsize)
{}
//! True if range is empty
bool empty() const {
// Range is empty if at least one dimension is empty.
return my_pages.empty() || my_rows.empty() || my_cols.empty();
}
//! True if range is divisible into two pieces.
bool is_divisible() const {
return my_pages.is_divisible() || my_rows.is_divisible() || my_cols.is_divisible();
}
blocked_range3d( blocked_range3d& r, split split_obj ) :
my_pages(r.my_pages),
my_rows(r.my_rows),
my_cols(r.my_cols)
{
do_split(r, split_obj);
}
blocked_range3d( blocked_range3d& r, proportional_split& proportion ) :
my_pages(r.my_pages),
my_rows(r.my_rows),
my_cols(r.my_cols)
{
do_split(r, proportion);
}
//! The pages of the iteration space
const page_range_type& pages() const { return my_pages; }
//! The rows of the iteration space
const row_range_type& rows() const { return my_rows; }
//! The columns of the iteration space
const col_range_type& cols() const { return my_cols; }
private:
template <typename Split>
void do_split( blocked_range3d& r, Split& split_obj) {
if ( my_pages.size()*double(my_rows.grainsize()) < my_rows.size()*double(my_pages.grainsize()) ) {
if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) {
my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj);
} else {
my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj);
}
} else {
if ( my_pages.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_pages.grainsize()) ) {
my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj);
} else {
my_pages.my_begin = page_range_type::do_split(r.my_pages, split_obj);
}
}
}
};
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::blocked_range3d;
} // namespace v1
} // namespace tbb
#endif /* __TBB_blocked_range3d_H */
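
The 3-D case works the same way: do_split() picks whichever of the page, row, or column dimensions has the largest size-to-grainsize ratio. A small construction sketch:

#include "third_party/tbb/blocked_range3d.hh"

tbb::blocked_range3d<int> make_box() {
    // A 128x64x32 box with per-dimension grainsizes; pages()/rows()/cols()
    // expose the three underlying blocked_range objects.
    return tbb::blocked_range3d<int>(0, 128, 8,   // pages
                                     0, 64, 8,    // rows
                                     0, 32, 8);   // cols
}
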

148
third_party/tbb/blocked_rangeNd.hh vendored Normal file
View file

@@ -0,0 +1,148 @@
// clang-format off
/*
Copyright (c) 2017-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_blocked_rangeNd_H
#define __TBB_blocked_rangeNd_H
#if !TBB_PREVIEW_BLOCKED_RANGE_ND
#error Set TBB_PREVIEW_BLOCKED_RANGE_ND to include blocked_rangeNd.h
#endif
#include "third_party/libcxx/algorithm" // std::any_of
#include "third_party/libcxx/array"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/type_traits" // std::is_same, std::enable_if
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_template_helpers.hh" // index_sequence, make_index_sequence
#include "third_party/tbb/detail/_range_common.hh"
#include "third_party/tbb/blocked_range.hh"
namespace tbb {
namespace detail {
namespace d1 {
/*
The blocked_rangeNd_impl uses make_index_sequence<N> to automatically generate a ctor with
exactly N arguments of the type tbb::blocked_range<Value>. Such ctor provides an opportunity
to use braced-init-list parameters to initialize each dimension.
A parameter whose argument is a braced-init-list, but whose type is not
std::initializer_list or a reference to one, is a non-deduced context
in template argument deduction.
NOTE: blocked_rangeNd must be exactly a templated alias to the blocked_rangeNd_impl
(and not e.g. a derived class), otherwise it would need to declare its own ctor
facing the same problem that the impl class solves.
*/
template<typename Value, unsigned int N, typename = detail::make_index_sequence<N>>
__TBB_requires(blocked_range_value<Value>)
class blocked_rangeNd_impl;
template<typename Value, unsigned int N, std::size_t... Is>
__TBB_requires(blocked_range_value<Value>)
class blocked_rangeNd_impl<Value, N, detail::index_sequence<Is...>> {
public:
//! Type of a value.
using value_type = Value;
private:
//! Helper type to construct range with N tbb::blocked_range<value_type> objects.
template<std::size_t>
using dim_type_helper = tbb::blocked_range<value_type>;
public:
blocked_rangeNd_impl() = delete;
//! Constructs N-dimensional range over N half-open intervals each represented as tbb::blocked_range<Value>.
blocked_rangeNd_impl(const dim_type_helper<Is>&... args) : my_dims{ {args...} } {}
//! Dimensionality of a range.
static constexpr unsigned int ndims() { return N; }
//! Range in certain dimension.
const tbb::blocked_range<value_type>& dim(unsigned int dimension) const {
__TBB_ASSERT(dimension < N, "out of bound");
return my_dims[dimension];
}
//------------------------------------------------------------------------
// Methods that implement Range concept
//------------------------------------------------------------------------
//! True if at least one dimension is empty.
bool empty() const {
return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range<value_type>& d) {
return d.empty();
});
}
//! True if at least one dimension is divisible.
bool is_divisible() const {
return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range<value_type>& d) {
return d.is_divisible();
});
}
blocked_rangeNd_impl(blocked_rangeNd_impl& r, proportional_split proportion) : my_dims(r.my_dims) {
do_split(r, proportion);
}
blocked_rangeNd_impl(blocked_rangeNd_impl& r, split proportion) : my_dims(r.my_dims) {
do_split(r, proportion);
}
private:
static_assert(N != 0, "zero dimensional blocked_rangeNd can't be constructed");
//! Ranges in each dimension.
std::array<tbb::blocked_range<value_type>, N> my_dims;
template<typename split_type>
void do_split(blocked_rangeNd_impl& r, split_type proportion) {
static_assert((std::is_same<split_type, split>::value || std::is_same<split_type, proportional_split>::value), "type of split object is incorrect");
__TBB_ASSERT(r.is_divisible(), "can't split not divisible range");
auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range<value_type>& first, const tbb::blocked_range<value_type>& second) {
return (first.size() * second.grainsize() < second.size() * first.grainsize());
});
auto r_it = r.my_dims.begin() + (my_it - my_dims.begin());
my_it->my_begin = tbb::blocked_range<value_type>::do_split(*r_it, proportion);
// (!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin)) equals to
// (my_it->my_begin == r_it->my_end), but we can't use operator== due to Value concept
__TBB_ASSERT(!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin),
"blocked_range has been split incorrectly");
}
};
template<typename Value, unsigned int N>
using blocked_rangeNd = blocked_rangeNd_impl<Value, N>;
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::blocked_rangeNd;
} // namespace v1
} // namespace tbb
#endif /* __TBB_blocked_rangeNd_H */
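
As the comment in the header explains, the index_sequence-generated constructor is what lets each dimension be written as a braced-init-list. A usage sketch (the preview macro must be defined before the include):

#define TBB_PREVIEW_BLOCKED_RANGE_ND 1
#include "third_party/tbb/blocked_rangeNd.hh"
#include <cstddef>

std::size_t box_volume() {
    // Each braced list is converted to a tbb::blocked_range<int>{begin, end, grainsize}.
    tbb::blocked_rangeNd<int, 3> box({0, 128, 8}, {0, 64, 8}, {0, 32, 8});
    return box.dim(0).size() * box.dim(1).size() * box.dim(2).size();
}
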

190
third_party/tbb/cache_aligned_allocator.hh vendored Normal file
View file

@@ -0,0 +1,190 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_cache_aligned_allocator_H
#define __TBB_cache_aligned_allocator_H
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/libcxx/cstdlib"
#include "third_party/libcxx/utility"
#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
// MISSING #include <memory_resource>
#endif
namespace tbb {
namespace detail {
namespace r1 {
TBB_EXPORT void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
TBB_EXPORT void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC cache_line_size();
}
namespace d1 {
template<typename T>
class cache_aligned_allocator {
public:
using value_type = T;
using propagate_on_container_move_assignment = std::true_type;
//! Always defined for TBB containers (supported since C++17 for std containers)
using is_always_equal = std::true_type;
cache_aligned_allocator() = default;
template<typename U> cache_aligned_allocator(const cache_aligned_allocator<U>&) noexcept {}
//! Allocate space for n objects, starting on a cache/sector line.
__TBB_nodiscard T* allocate(std::size_t n) {
return static_cast<T*>(r1::cache_aligned_allocate(n * sizeof(value_type)));
}
//! Free block of memory that starts on a cache line
void deallocate(T* p, std::size_t) {
r1::cache_aligned_deallocate(p);
}
//! Largest value for which method allocate might succeed.
std::size_t max_size() const noexcept {
return (~std::size_t(0) - r1::cache_line_size()) / sizeof(value_type);
}
#if TBB_ALLOCATOR_TRAITS_BROKEN
using pointer = value_type*;
using const_pointer = const value_type*;
using reference = value_type&;
using const_reference = const value_type&;
using difference_type = std::ptrdiff_t;
using size_type = std::size_t;
template<typename U> struct rebind {
using other = cache_aligned_allocator<U>;
};
template<typename U, typename... Args>
void construct(U *p, Args&&... args)
{ ::new (p) U(std::forward<Args>(args)...); }
void destroy(pointer p) { p->~value_type(); }
pointer address(reference x) const { return &x; }
const_pointer address(const_reference x) const { return &x; }
#endif // TBB_ALLOCATOR_TRAITS_BROKEN
};
#if TBB_ALLOCATOR_TRAITS_BROKEN
template<>
class cache_aligned_allocator<void> {
public:
using pointer = void*;
using const_pointer = const void*;
using value_type = void;
template<typename U> struct rebind {
using other = cache_aligned_allocator<U>;
};
};
#endif
template<typename T, typename U>
bool operator==(const cache_aligned_allocator<T>&, const cache_aligned_allocator<U>&) noexcept { return true; }
#if !__TBB_CPP20_COMPARISONS_PRESENT
template<typename T, typename U>
bool operator!=(const cache_aligned_allocator<T>&, const cache_aligned_allocator<U>&) noexcept { return false; }
#endif
#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
//! C++17 memory resource wrapper to ensure cache line size alignment
class cache_aligned_resource : public std::pmr::memory_resource {
public:
cache_aligned_resource() : cache_aligned_resource(std::pmr::get_default_resource()) {}
explicit cache_aligned_resource(std::pmr::memory_resource* upstream) : m_upstream(upstream) {}
std::pmr::memory_resource* upstream_resource() const {
return m_upstream;
}
private:
//! We don't know what memory resource is set. Use padding to guarantee alignment
void* do_allocate(std::size_t bytes, std::size_t alignment) override {
// TODO: make it common with tbb_allocator.cpp
std::size_t cache_line_alignment = correct_alignment(alignment);
std::size_t space = correct_size(bytes) + cache_line_alignment;
std::uintptr_t base = reinterpret_cast<std::uintptr_t>(m_upstream->allocate(space));
__TBB_ASSERT(base != 0, "Upstream resource returned nullptr.");
// Round up to the next cache line (align the base address)
std::uintptr_t result = (base + cache_line_alignment) & ~(cache_line_alignment - 1);
__TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Can't store a base pointer to the header");
__TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage");
// Record where block actually starts.
(reinterpret_cast<std::uintptr_t*>(result))[-1] = base;
return reinterpret_cast<void*>(result);
}
void do_deallocate(void* ptr, std::size_t bytes, std::size_t alignment) override {
if (ptr) {
// Recover where block actually starts
std::uintptr_t base = (reinterpret_cast<std::uintptr_t*>(ptr))[-1];
m_upstream->deallocate(reinterpret_cast<void*>(base), correct_size(bytes) + correct_alignment(alignment));
}
}
bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override {
if (this == &other) { return true; }
#if __TBB_USE_OPTIONAL_RTTI
const cache_aligned_resource* other_res = dynamic_cast<const cache_aligned_resource*>(&other);
return other_res && (upstream_resource() == other_res->upstream_resource());
#else
return false;
#endif
}
std::size_t correct_alignment(std::size_t alignment) {
__TBB_ASSERT(tbb::detail::is_power_of_two(alignment), "Alignment is not a power of 2");
#if __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT
std::size_t cache_line_size = std::hardware_destructive_interference_size;
#else
std::size_t cache_line_size = r1::cache_line_size();
#endif
return alignment < cache_line_size ? cache_line_size : alignment;
}
std::size_t correct_size(std::size_t bytes) {
// Handle the case when a small size is requested: there might not be
// enough space to store the original pointer.
return bytes < sizeof(std::uintptr_t) ? sizeof(std::uintptr_t) : bytes;
}
std::pmr::memory_resource* m_upstream;
};
#endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::cache_aligned_allocator;
#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT
using detail::d1::cache_aligned_resource;
#endif
} // namespace v1
} // namespace tbb
#endif /* __TBB_cache_aligned_allocator_H */
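
Typical use of cache_aligned_allocator: start a container's storage on a cache-line boundary so it does not share a line with whatever was allocated just before it. Standard header spellings are used for brevity.

#include "third_party/tbb/cache_aligned_allocator.hh"
#include <cstddef>
#include <vector>

// Per-thread counters placed in cache-line-aligned storage; the block returned
// by allocate() starts on a cache line, which avoids false sharing with
// unrelated neighboring allocations.
std::vector<long, tbb::cache_aligned_allocator<long>> make_counters(std::size_t n) {
    return std::vector<long, tbb::cache_aligned_allocator<long>>(n, 0L);
}
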

86
third_party/tbb/cancellation_disseminator.hh vendored Normal file
View file

@@ -0,0 +1,86 @@
// clang-format off
/*
Copyright (c) 2022-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_cancellation_disseminator_H
#define _TBB_cancellation_disseminator_H
#include "third_party/tbb/mutex.hh"
#include "third_party/tbb/task_group.hh"
#include "third_party/tbb/intrusive_list.hh"
#include "third_party/tbb/thread_data.hh"
namespace tbb {
namespace detail {
namespace r1 {
class cancellation_disseminator {
public:
//! Finds all contexts affected by the state change and propagates the new state to them.
/* The propagation is relayed to the cancellation_disseminator because tasks created by one
external thread can be passed to and executed by other external threads. This means
that context trees can span several arenas at once and thus state change
propagation cannot be generally localized to one arena only.
*/
bool propagate_task_group_state(std::atomic<uint32_t> d1::task_group_context::*mptr_state, d1::task_group_context& src, uint32_t new_state) {
if (src.my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
return true;
}
// The whole propagation algorithm is under the lock in order to ensure correctness
// in case of concurrent state changes at the different levels of the context tree.
threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex);
// TODO: consider using the double-check idiom
if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state) {
// Another thread has concurrently changed the state. Back down.
return false;
}
// Advance global state propagation epoch
++the_context_state_propagation_epoch;
// Propagate to all workers and external threads and sync up their local epochs with the global one
// The whole propagation sequence is locked, thus no contention is expected
for (auto& thr_data : my_threads_list) {
thr_data.propagate_task_group_state(mptr_state, src, new_state);
}
return true;
}
void register_thread(thread_data& td) {
threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex);
my_threads_list.push_front(td);
}
void unregister_thread(thread_data& td) {
threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex);
my_threads_list.remove(td);
}
private:
using thread_data_list_type = intrusive_list<thread_data>;
using threads_list_mutex_type = d1::mutex;
threads_list_mutex_type my_threads_list_mutex;
thread_data_list_type my_threads_list;
};
} // namespace r1
} // namespace detail
} // namespace tbb
#endif // _TBB_cancellation_disseminator_H
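
What the disseminator buys at the user level: a cancellation request on a task_group_context reaches every thread participating in work bound to that context, whichever arena it runs in. A hedged illustration through the public API (public header paths assumed from the port's layout):

#include "third_party/tbb/blocked_range.hh"
#include "third_party/tbb/parallel_for.hh"   // assumed vendored path
#include "third_party/tbb/task_group.hh"

bool cancelled_early() {
    tbb::task_group_context ctx;
    tbb::parallel_for(tbb::blocked_range<int>(0, 1000000),
                      [&](const tbb::blocked_range<int>& r) {
                          if (r.begin() == 0)
                              ctx.cancel_group_execution();   // request cancellation once
                          // chunks that have not started yet will be skipped
                      },
                      ctx);
    // The request was fanned out to all participating threads by
    // propagate_task_group_state() above.
    return ctx.is_group_execution_cancelled();
}
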

428
third_party/tbb/co_context.hh vendored Normal file
View file

@@ -0,0 +1,428 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_co_context_H
#define _TBB_co_context_H
#include "third_party/tbb/detail/_config.hh"
#if __TBB_RESUMABLE_TASKS
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/cstdint"
#if __TBB_RESUMABLE_TASKS_USE_THREADS
#if _WIN32 || _WIN64
#include "libc/nt/accounting.h"
#include "libc/nt/automation.h"
#include "libc/nt/console.h"
#include "libc/nt/debug.h"
#include "libc/nt/dll.h"
#include "libc/nt/enum/keyaccess.h"
#include "libc/nt/enum/regtype.h"
#include "libc/nt/errors.h"
#include "libc/nt/events.h"
#include "libc/nt/files.h"
#include "libc/nt/ipc.h"
#include "libc/nt/memory.h"
#include "libc/nt/paint.h"
#include "libc/nt/process.h"
#include "libc/nt/registry.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/windows.h"
#include "libc/nt/winsock.h"
#else
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/clock.h"
#include "libc/thread/thread.h"
#include "libc/thread/thread2.h"
#endif
#include "third_party/libcxx/condition_variable"
#include "third_party/tbb/governor.hh"
#elif _WIN32 || _WIN64
#include "libc/nt/accounting.h"
#include "libc/nt/automation.h"
#include "libc/nt/console.h"
#include "libc/nt/debug.h"
#include "libc/nt/dll.h"
#include "libc/nt/enum/keyaccess.h"
#include "libc/nt/enum/regtype.h"
#include "libc/nt/errors.h"
#include "libc/nt/events.h"
#include "libc/nt/files.h"
#include "libc/nt/ipc.h"
#include "libc/nt/memory.h"
#include "libc/nt/paint.h"
#include "libc/nt/process.h"
#include "libc/nt/registry.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/windows.h"
#include "libc/nt/winsock.h"
#else
// ucontext.h API is deprecated since macOS 10.6
#if __APPLE__
#if __INTEL_COMPILER
#pragma warning(push)
#pragma warning(disable:1478)
#elif __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
#endif // __APPLE__
#include "libc/calls/ucontext.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/mlock.h"
#include "libc/sysv/consts/msync.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/madv.h"
#include "libc/sysv/consts/mfd.h"
#include "libc/sysv/consts/mremap.h" // mprotect
#include "third_party/tbb/governor.hh" // default_page_size()
#ifndef MAP_STACK
// macOS* does not define MAP_STACK
#define MAP_STACK 0
#endif
#ifndef MAP_ANONYMOUS
// macOS* defines MAP_ANON, which is deprecated in Linux*.
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif // _WIN32 || _WIN64
namespace tbb {
namespace detail {
namespace r1 {
#if __TBB_RESUMABLE_TASKS_USE_THREADS
struct coroutine_type {
#if _WIN32 || _WIN64
using handle_type = HANDLE;
#else
using handle_type = pthread_t;
#endif
handle_type my_thread;
std::condition_variable my_condvar;
std::mutex my_mutex;
thread_data* my_thread_data{ nullptr };
bool my_is_active{ true };
};
#elif _WIN32 || _WIN64
typedef LPVOID coroutine_type;
#else
struct coroutine_type {
coroutine_type() : my_context(), my_stack(), my_stack_size() {}
ucontext_t my_context;
void* my_stack;
std::size_t my_stack_size;
};
#endif
// Forward declaration of the coroutine API.
void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg);
void current_coroutine(coroutine_type& c);
void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine);
void destroy_coroutine(coroutine_type& c);
class co_context {
enum co_state {
co_invalid,
co_suspended,
co_executing,
co_destroyed
};
coroutine_type my_coroutine;
co_state my_state;
public:
co_context(std::size_t stack_size, void* arg)
: my_state(stack_size ? co_suspended : co_executing)
{
if (stack_size) {
__TBB_ASSERT(arg != nullptr, nullptr);
create_coroutine(my_coroutine, stack_size, arg);
} else {
current_coroutine(my_coroutine);
}
}
~co_context() {
__TBB_ASSERT(1 << my_state & (1 << co_suspended | 1 << co_executing), nullptr);
if (my_state == co_suspended) {
#if __TBB_RESUMABLE_TASKS_USE_THREADS
my_state = co_executing;
#endif
destroy_coroutine(my_coroutine);
}
my_state = co_destroyed;
}
void resume(co_context& target) {
// Do not create non-trivial objects on the stack of this function. They might never be destroyed.
__TBB_ASSERT(my_state == co_executing, nullptr);
__TBB_ASSERT(target.my_state == co_suspended, nullptr);
my_state = co_suspended;
target.my_state = co_executing;
// 'target' can reference an invalid object after swap_coroutine. Do not access it.
swap_coroutine(my_coroutine, target.my_coroutine);
__TBB_ASSERT(my_state == co_executing, nullptr);
}
};
#if _WIN32 || _WIN64
/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* arg) noexcept;
#else
/* [[noreturn]] */ void co_local_wait_for_all(unsigned hi, unsigned lo) noexcept;
#endif
#if __TBB_RESUMABLE_TASKS_USE_THREADS
void handle_perror(int error_code, const char* what);
inline void check(int error_code, const char* routine) {
if (error_code) {
handle_perror(error_code, routine);
}
}
using thread_data_t = std::pair<coroutine_type&, void*&>;
#if _WIN32 || _WIN64
inline unsigned WINAPI coroutine_thread_func(void* d)
#else
inline void* coroutine_thread_func(void* d)
#endif
{
thread_data_t& data = *static_cast<thread_data_t*>(d);
coroutine_type& c = data.first;
void* arg = data.second;
{
std::unique_lock<std::mutex> lock(c.my_mutex);
__TBB_ASSERT(c.my_thread_data == nullptr, nullptr);
c.my_is_active = false;
// We have read the data; notify the waiting thread
data.second = nullptr;
c.my_condvar.notify_one();
c.my_condvar.wait(lock, [&c] { return c.my_is_active == true; });
}
__TBB_ASSERT(c.my_thread_data != nullptr, nullptr);
governor::set_thread_data(*c.my_thread_data);
#if _WIN32 || _WIN64
co_local_wait_for_all(arg);
return 0;
#else
std::uintptr_t addr = std::uintptr_t(arg);
unsigned lo = unsigned(addr);
unsigned hi = unsigned(std::uint64_t(addr) >> 32);
__TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr);
co_local_wait_for_all(hi, lo);
return nullptr;
#endif
};
inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
thread_data_t data{ c, arg };
#if _WIN32 || _WIN64
c.my_thread = (HANDLE)_beginthreadex(nullptr, unsigned(stack_size), coroutine_thread_func, &data, STACK_SIZE_PARAM_IS_A_RESERVATION, nullptr);
if (!c.my_thread) {
handle_perror(0, "create_coroutine: _beginthreadex failed\n");
}
#else
pthread_attr_t s;
check(pthread_attr_init(&s), "pthread_attr_init has failed");
if (stack_size > 0) {
check(pthread_attr_setstacksize(&s, stack_size), "pthread_attr_setstack_size has failed");
}
check(pthread_create(&c.my_thread, &s, coroutine_thread_func, &data), "pthread_create has failed");
check(pthread_attr_destroy(&s), "pthread_attr_destroy has failed");
#endif
// Wait for the just created thread to read the data
std::unique_lock<std::mutex> lock(c.my_mutex);
c.my_condvar.wait(lock, [&arg] { return arg == nullptr; });
}
inline void current_coroutine(coroutine_type& c) {
#if _WIN32 || _WIN64
c.my_thread = GetCurrentThread();
#else
c.my_thread = pthread_self();
#endif
}
inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
thread_data* td = governor::get_thread_data();
__TBB_ASSERT(prev_coroutine.my_is_active == true, "The current thread should be active");
// Detach our state before notifying the other thread
// (because we might be notified right after notifying it)
prev_coroutine.my_thread_data = nullptr;
prev_coroutine.my_is_active = false;
governor::clear_thread_data();
{
std::unique_lock<std::mutex> lock(new_coroutine.my_mutex);
__TBB_ASSERT(new_coroutine.my_is_active == false, "The sleeping thread should not be active");
__TBB_ASSERT(new_coroutine.my_thread_data == nullptr, "The sleeping thread should not be active");
new_coroutine.my_thread_data = td;
new_coroutine.my_is_active = true;
new_coroutine.my_condvar.notify_one();
}
std::unique_lock<std::mutex> lock(prev_coroutine.my_mutex);
prev_coroutine.my_condvar.wait(lock, [&prev_coroutine] { return prev_coroutine.my_is_active == true; });
__TBB_ASSERT(governor::get_thread_data() != nullptr, nullptr);
governor::set_thread_data(*prev_coroutine.my_thread_data);
}
inline void destroy_coroutine(coroutine_type& c) {
{
std::unique_lock<std::mutex> lock(c.my_mutex);
__TBB_ASSERT(c.my_thread_data == nullptr, "The sleeping thread should not be active");
__TBB_ASSERT(c.my_is_active == false, "The sleeping thread should not be active");
c.my_is_active = true;
c.my_condvar.notify_one();
}
#if _WIN32 || _WIN64
WaitForSingleObject(c.my_thread, INFINITE);
CloseHandle(c.my_thread);
#else
check(pthread_join(c.my_thread, nullptr), "pthread_join has failed");
#endif
}
#elif _WIN32 || _WIN64
inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
__TBB_ASSERT(arg, nullptr);
c = CreateFiber(stack_size, co_local_wait_for_all, arg);
__TBB_ASSERT(c, nullptr);
}
inline void current_coroutine(coroutine_type& c) {
c = IsThreadAFiber() ? GetCurrentFiber() :
ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH);
__TBB_ASSERT(c, nullptr);
}
inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
if (!IsThreadAFiber()) {
ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH);
}
__TBB_ASSERT(new_coroutine, nullptr);
prev_coroutine = GetCurrentFiber();
__TBB_ASSERT(prev_coroutine, nullptr);
SwitchToFiber(new_coroutine);
}
inline void destroy_coroutine(coroutine_type& c) {
__TBB_ASSERT(c, nullptr);
DeleteFiber(c);
}
#else // !(_WIN32 || _WIN64)
inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
const std::size_t REG_PAGE_SIZE = governor::default_page_size();
const std::size_t page_aligned_stack_size = (stack_size + (REG_PAGE_SIZE - 1)) & ~(REG_PAGE_SIZE - 1);
const std::size_t protected_stack_size = page_aligned_stack_size + 2 * REG_PAGE_SIZE;
// Allocate the stack with protection property
std::uintptr_t stack_ptr = (std::uintptr_t)mmap(nullptr, protected_stack_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
__TBB_ASSERT((void*)stack_ptr != MAP_FAILED, nullptr);
// Allow read write on our stack (guarded pages are still protected)
int err = mprotect((void*)(stack_ptr + REG_PAGE_SIZE), page_aligned_stack_size, PROT_READ | PROT_WRITE);
__TBB_ASSERT_EX(!err, nullptr);
// Remember the stack state
c.my_stack = (void*)(stack_ptr + REG_PAGE_SIZE);
c.my_stack_size = page_aligned_stack_size;
err = getcontext(&c.my_context);
__TBB_ASSERT_EX(!err, nullptr);
c.my_context.uc_link = nullptr;
// cast to char* to disable FreeBSD clang-3.4.1 'incompatible type' error
c.my_context.uc_stack.ss_sp = (char*)c.my_stack;
c.my_context.uc_stack.ss_size = c.my_stack_size;
c.my_context.uc_stack.ss_flags = 0;
typedef void(*coroutine_func_t)();
std::uintptr_t addr = std::uintptr_t(arg);
unsigned lo = unsigned(addr);
unsigned hi = unsigned(std::uint64_t(addr) >> 32);
__TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr);
makecontext(&c.my_context, (coroutine_func_t)co_local_wait_for_all, 2, hi, lo);
}
inline void current_coroutine(coroutine_type& c) {
int err = getcontext(&c.my_context);
__TBB_ASSERT_EX(!err, nullptr);
}
inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
int err = swapcontext(&prev_coroutine.my_context, &new_coroutine.my_context);
__TBB_ASSERT_EX(!err, nullptr);
}
inline void destroy_coroutine(coroutine_type& c) {
const std::size_t REG_PAGE_SIZE = governor::default_page_size();
// Free stack memory with guarded pages
munmap((void*)((std::uintptr_t)c.my_stack - REG_PAGE_SIZE), c.my_stack_size + 2 * REG_PAGE_SIZE);
// Clear the stack state afterwards
c.my_stack = nullptr;
c.my_stack_size = 0;
}
#if __APPLE__
#if __INTEL_COMPILER
#pragma warning(pop) // 1478 warning
#elif __clang__
#pragma clang diagnostic pop // "-Wdeprecated-declarations"
#endif
#endif
#endif // _WIN32 || _WIN64
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* __TBB_RESUMABLE_TASKS */
#endif /* _TBB_co_context_H */
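
For reference, the POSIX branch above is the classic getcontext/makecontext/swapcontext pattern, just with an mmap'ed guarded stack and the 64-bit argument pointer split into two unsigned halves for makecontext. A stripped-down standalone sketch of the same pattern (plain static stack, no guard pages, plain <ucontext.h>):

#include <ucontext.h>
#include <cstdio>

static ucontext_t main_ctx, coro_ctx;
static char coro_stack[64 * 1024];

static void coro_body() {
    std::puts("inside coroutine");
    swapcontext(&coro_ctx, &main_ctx);   // yield back to the caller
    std::puts("coroutine resumed");
}                                        // falling off the end jumps to uc_link

int main() {
    getcontext(&coro_ctx);
    coro_ctx.uc_stack.ss_sp = coro_stack;
    coro_ctx.uc_stack.ss_size = sizeof(coro_stack);
    coro_ctx.uc_link = &main_ctx;                 // where to go when coro_body returns
    makecontext(&coro_ctx, coro_body, 0);
    swapcontext(&main_ctx, &coro_ctx);            // first switch into the coroutine
    std::puts("back in main");
    swapcontext(&main_ctx, &coro_ctx);            // resume it once more
    std::puts("done");
    return 0;
}
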

236
third_party/tbb/collaborative_call_once.hh vendored Normal file
View file

@@ -0,0 +1,236 @@
// clang-format off
/*
Copyright (c) 2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_collaborative_call_once_H
#define __TBB_collaborative_call_once_H
#include "third_party/tbb/task_arena.hh"
#include "third_party/tbb/task_group.hh"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace d1 {
#if _MSC_VER && !defined(__INTEL_COMPILER)
// Suppress warning: structure was padded due to alignment specifier
#pragma warning (push)
#pragma warning (disable: 4324)
#endif
constexpr std::uintptr_t collaborative_once_max_references = max_nfs_size;
constexpr std::uintptr_t collaborative_once_references_mask = collaborative_once_max_references-1;
class alignas(max_nfs_size) collaborative_once_runner : no_copy {
struct storage_t {
task_arena m_arena{ task_arena::attach{} };
wait_context m_wait_context{1};
};
std::atomic<std::int64_t> m_ref_count{0};
std::atomic<bool> m_is_ready{false};
// Storage with task_arena and wait_context must be initialized only by winner thread
union {
storage_t m_storage;
};
template<typename Fn>
void isolated_execute(Fn f) {
auto func = [f] {
f();
// delegate_base requires a bool-returning functor, while isolate_within_arena ignores the result
return true;
};
delegated_function<decltype(func)> delegate(func);
r1::isolate_within_arena(delegate, reinterpret_cast<std::intptr_t>(this));
}
public:
class lifetime_guard : no_copy {
collaborative_once_runner& m_runner;
public:
lifetime_guard(collaborative_once_runner& r) : m_runner(r) {
m_runner.m_ref_count++;
}
~lifetime_guard() {
m_runner.m_ref_count--;
}
};
collaborative_once_runner() {}
~collaborative_once_runner() {
spin_wait_until_eq(m_ref_count, 0, std::memory_order_acquire);
if (m_is_ready.load(std::memory_order_relaxed)) {
m_storage.~storage_t();
}
}
std::uintptr_t to_bits() {
return reinterpret_cast<std::uintptr_t>(this);
}
static collaborative_once_runner* from_bits(std::uintptr_t bits) {
__TBB_ASSERT( (bits & collaborative_once_references_mask) == 0, "invalid pointer, last log2(max_nfs_size) bits must be zero" );
return reinterpret_cast<collaborative_once_runner*>(bits);
}
template <typename F>
void run_once(F&& f) {
__TBB_ASSERT(!m_is_ready.load(std::memory_order_relaxed), "storage with task_arena and wait_context is already initialized");
// Initialize internal state
new(&m_storage) storage_t();
m_storage.m_arena.execute([&] {
isolated_execute([&] {
task_group_context context{ task_group_context::bound,
task_group_context::default_traits | task_group_context::concurrent_wait };
function_stack_task<F> t{ std::forward<F>(f), m_storage.m_wait_context };
// Set the ready flag after entering the execute body to prevent
// moonlighting threads from occupying all slots inside the arena.
m_is_ready.store(true, std::memory_order_release);
execute_and_wait(t, context, m_storage.m_wait_context, context);
});
});
}
void assist() noexcept {
// Do not join the arena until the winner thread takes the slot
spin_wait_while_eq(m_is_ready, false);
m_storage.m_arena.execute([&] {
isolated_execute([&] {
// We do not want to get an exception from the user functor on moonlighting threads.
// The exception is handled by the winner thread.
task_group_context stub_context;
wait(m_storage.m_wait_context, stub_context);
});
});
}
};
class collaborative_once_flag : no_copy {
enum state : std::uintptr_t {
uninitialized,
done,
#if TBB_USE_ASSERT
dead
#endif
};
std::atomic<std::uintptr_t> m_state{ state::uninitialized };
template <typename Fn, typename... Args>
friend void collaborative_call_once(collaborative_once_flag& flag, Fn&& f, Args&&... args);
void set_completion_state(std::uintptr_t runner_bits, std::uintptr_t desired) {
std::uintptr_t expected = runner_bits;
do {
expected = runner_bits;
// Possible inefficiency: when we start waiting,
// some moonlighting threads might keep arriving, which prolongs our waiting.
// Fortunately, there is a limited number of threads on the system, so the wait time is bounded.
spin_wait_until_eq(m_state, expected);
} while (!m_state.compare_exchange_strong(expected, desired));
}
template <typename Fn>
void do_collaborative_call_once(Fn&& f) {
std::uintptr_t expected = m_state.load(std::memory_order_acquire);
collaborative_once_runner runner;
do {
if (expected == state::uninitialized && m_state.compare_exchange_strong(expected, runner.to_bits())) {
// Winner thread
runner.run_once([&] {
try_call([&] {
std::forward<Fn>(f)();
}).on_exception([&] {
// Reset the state to uninitialized to allow other threads to try initialization again
set_completion_state(runner.to_bits(), state::uninitialized);
});
// We successfully executed functor
set_completion_state(runner.to_bits(), state::done);
});
break;
} else {
// Moonlighting thread: we need to add a reference to the state to prolong the runner's lifetime.
// However, the maximum number of references is limited by the runner alignment.
// So, we use a CAS loop and spin_wait to guarantee that the references never exceed "max_value".
do {
auto max_value = expected | collaborative_once_references_mask;
expected = spin_wait_while_eq(m_state, max_value);
// "expected > state::done" prevents storing values, when state is uninitialized or done
} while (expected > state::done && !m_state.compare_exchange_strong(expected, expected + 1));
if (auto shared_runner = collaborative_once_runner::from_bits(expected & ~collaborative_once_references_mask)) {
collaborative_once_runner::lifetime_guard guard{*shared_runner};
m_state.fetch_sub(1);
// The moonlighting threads are not expected to handle exceptions from user functor.
// Therefore, no exception is expected from assist().
shared_runner->assist();
}
}
__TBB_ASSERT(m_state.load(std::memory_order_relaxed) != state::dead,
"collaborative_once_flag has been prematurely destroyed");
} while (expected != state::done);
}
#if TBB_USE_ASSERT
public:
~collaborative_once_flag() {
m_state.store(state::dead, std::memory_order_relaxed);
}
#endif
};
template <typename Fn, typename... Args>
void collaborative_call_once(collaborative_once_flag& flag, Fn&& fn, Args&&... args) {
__TBB_ASSERT(flag.m_state.load(std::memory_order_relaxed) != collaborative_once_flag::dead,
"collaborative_once_flag has been prematurely destroyed");
if (flag.m_state.load(std::memory_order_acquire) != collaborative_once_flag::done) {
#if __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN
// Use stored_pack to work around a GCC 4.8 bug
// with parameter pack expansion in lambdas
auto stored_pack = save_pack(std::forward<Args>(args)...);
auto func = [&] { call(std::forward<Fn>(fn), std::move(stored_pack)); };
#else
auto func = [&] { fn(std::forward<Args>(args)...); };
#endif
flag.do_collaborative_call_once(func);
}
}
#if _MSC_VER && !defined(__INTEL_COMPILER)
#pragma warning (pop) // 4324 warning
#endif
} // namespace d1
} // namespace detail
using detail::d1::collaborative_call_once;
using detail::d1::collaborative_once_flag;
} // namespace tbb
#endif // __TBB_collaborative_call_once_H
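
Usage sketch: like std::call_once, but threads arriving while the winner is still running join its arena and help instead of blocking idly.

#include "third_party/tbb/collaborative_call_once.hh"

static tbb::collaborative_once_flag init_flag;
static double cached_result = 0.0;

double get_result() {
    // The lambda runs exactly once; concurrent callers assist the winner's
    // arena until it finishes, then everyone observes the initialized value.
    tbb::collaborative_call_once(init_flag, [] {
        cached_result = 42.0;   // stands in for an expensive, internally parallel computation
    });
    return cached_result;
}
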

70
third_party/tbb/combinable.hh vendored Normal file
View file

@@ -0,0 +1,70 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_combinable_H
#define __TBB_combinable_H
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/enumerable_thread_specific.hh"
#include "third_party/tbb/cache_aligned_allocator.hh"
namespace tbb {
namespace detail {
namespace d1 {
/** \name combinable **/
//@{
//! Thread-local storage with optional reduction
/** @ingroup containers */
template <typename T>
class combinable {
using my_alloc = typename tbb::cache_aligned_allocator<T>;
using my_ets_type = typename tbb::enumerable_thread_specific<T, my_alloc, ets_no_key>;
my_ets_type my_ets;
public:
combinable() = default;
template <typename Finit>
explicit combinable(Finit _finit) : my_ets(_finit) { }
void clear() { my_ets.clear(); }
T& local() { return my_ets.local(); }
T& local(bool& exists) { return my_ets.local(exists); }
// combine_func_t has signature T(T,T) or T(const T&, const T&)
template <typename CombineFunc>
T combine(CombineFunc f_combine) { return my_ets.combine(f_combine); }
// combine_func_t has signature void(T) or void(const T&)
template <typename CombineFunc>
void combine_each(CombineFunc f_combine) { my_ets.combine_each(f_combine); }
};
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::combinable;
} // inline namespace v1
} // namespace tbb
#endif /* __TBB_combinable_H */
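
Usage sketch for combinable: each thread accumulates into its own lazily created copy, and the copies are folded together at the end. The parallel_for include path is assumed from the port's layout.

#include "third_party/tbb/blocked_range.hh"
#include "third_party/tbb/combinable.hh"
#include "third_party/tbb/parallel_for.hh"   // assumed vendored path
#include <cstddef>

long parallel_sum(const int* data, std::size_t n) {
    tbb::combinable<long> partial([] { return 0L; });   // per-thread accumulator, initialized to 0
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n),
                      [&](const tbb::blocked_range<std::size_t>& r) {
                          long& local = partial.local();  // this thread's copy
                          for (std::size_t i = r.begin(); i != r.end(); ++i)
                              local += data[i];
                      });
    return partial.combine([](long a, long b) { return a + b; });   // fold all copies
}
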

View file

@@ -0,0 +1,85 @@
// clang-format off
/*
Copyright (c) 2020-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/concurrent_queue.hh"
#include "third_party/tbb/cache_aligned_allocator.hh"
#include "third_party/tbb/concurrent_monitor.hh"
namespace tbb {
namespace detail {
namespace r1 {
static constexpr std::size_t monitors_number = 2;
std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size )
{
std::size_t monitors_mem_size = sizeof(concurrent_monitor) * monitors_number;
std::uint8_t* mem = static_cast<std::uint8_t*>(cache_aligned_allocate(queue_rep_size + monitors_mem_size));
concurrent_monitor* monitors = reinterpret_cast<concurrent_monitor*>(mem + queue_rep_size);
for (std::size_t i = 0; i < monitors_number; ++i) {
new (monitors + i) concurrent_monitor();
}
return mem;
}
void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size )
{
concurrent_monitor* monitors = reinterpret_cast<concurrent_monitor*>(mem + queue_rep_size);
for (std::size_t i = 0; i < monitors_number; ++i) {
monitors[i].~concurrent_monitor();
}
cache_aligned_deallocate(mem);
}
void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag,
std::ptrdiff_t target, d1::delegate_base& predicate )
{
__TBB_ASSERT(monitor_tag < monitors_number, nullptr);
concurrent_monitor& monitor = monitors[monitor_tag];
monitor.wait<concurrent_monitor::thread_context>([&] { return !predicate(); }, std::uintptr_t(target));
}
void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ) {
concurrent_monitor& items_avail = monitors[d2::cbq_items_avail_tag];
concurrent_monitor& slots_avail = monitors[d2::cbq_slots_avail_tag];
items_avail.abort_all();
slots_avail.abort_all();
}
struct predicate_leq {
std::size_t my_ticket;
predicate_leq( std::size_t ticket ) : my_ticket(ticket) {}
bool operator() ( std::uintptr_t ticket ) const { return static_cast<std::size_t>(ticket) <= my_ticket; }
};
void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors,
std::size_t monitor_tag, std::size_t ticket)
{
__TBB_ASSERT(monitor_tag < monitors_number, nullptr);
concurrent_monitor& monitor = monitors[monitor_tag];
monitor.notify(predicate_leq(ticket));
}
} // namespace r1
} // namespace detail
} // namespace tbb
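The two exported helpers above co-allocate the queue representation and its two concurrent_monitor objects in a single cache-aligned block, constructing and destroying the monitors by hand. A generic, standard-library-only sketch of that placement-new pattern follows; Monitor and the function names are placeholders, not TBB types, and the sketch assumes rep_size preserves Monitor's alignment (TBB relies on cache_aligned_allocate for this).

#include <cstddef>
#include <cstdlib>
#include <new>

struct Monitor { /* wait/notify state would live here */ };

unsigned char* allocate_rep(std::size_t rep_size, std::size_t n_monitors) {
  // One allocation holds [rep_size bytes of queue state][n_monitors Monitor objects].
  unsigned char* mem = static_cast<unsigned char*>(
      std::malloc(rep_size + n_monitors * sizeof(Monitor)));
  Monitor* monitors = reinterpret_cast<Monitor*>(mem + rep_size);
  for (std::size_t i = 0; i < n_monitors; ++i) new (monitors + i) Monitor();  // placement-new
  return mem;
}

void deallocate_rep(unsigned char* mem, std::size_t rep_size, std::size_t n_monitors) {
  Monitor* monitors = reinterpret_cast<Monitor*>(mem + rep_size);
  for (std::size_t i = 0; i < n_monitors; ++i) monitors[i].~Monitor();  // destroy in place
  std::free(mem);
}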

1665
third_party/tbb/concurrent_hash_map.hh vendored Normal file

File diff suppressed because it is too large

375
third_party/tbb/concurrent_lru_cache.hh vendored Normal file
View file

@@ -0,0 +1,375 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_lru_cache_H
#define __TBB_concurrent_lru_cache_H
#if ! TBB_PREVIEW_CONCURRENT_LRU_CACHE
#error Set TBB_PREVIEW_CONCURRENT_LRU_CACHE to include concurrent_lru_cache.h
#endif
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_aggregator.hh"
#include "third_party/libcxx/map" // for std::map
#include "third_party/libcxx/list" // for std::list
#include "third_party/libcxx/utility" // for std::make_pair
#include "third_party/libcxx/algorithm" // for std::find
#include "third_party/libcxx/atomic" // for std::atomic<bool>
namespace tbb {
namespace detail {
namespace d1 {
//-----------------------------------------------------------------------------
// Concurrent LRU cache
//-----------------------------------------------------------------------------
template<typename KeyT, typename ValT, typename KeyToValFunctorT = ValT (*) (KeyT)>
class concurrent_lru_cache : no_assign {
// encapsulated helper classes
private:
struct handle_object;
struct storage_map_value_type;
struct aggregator_operation;
struct retrieve_aggregator_operation;
struct signal_end_of_usage_aggregator_operation;
// typedefs
public:
using key_type = KeyT;
using value_type = ValT;
using pointer = ValT*;
using reference = ValT&;
using const_pointer = const ValT*;
using const_reference = const ValT&;
using value_function_type = KeyToValFunctorT;
using handle = handle_object;
private:
using lru_cache_type = concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>;
using storage_map_type = std::map<key_type, storage_map_value_type>;
using storage_map_iterator_type = typename storage_map_type::iterator;
using storage_map_pointer_type = typename storage_map_type::pointer;
using storage_map_reference_type = typename storage_map_type::reference;
using history_list_type = std::list<storage_map_iterator_type>;
using history_list_iterator_type = typename history_list_type::iterator;
using aggregator_operation_type = aggregator_operation;
using aggregator_function_type = aggregating_functor<lru_cache_type, aggregator_operation_type>;
using aggregator_type = aggregator<aggregator_function_type, aggregator_operation_type>;
friend class aggregating_functor<lru_cache_type,aggregator_operation_type>;
// fields
private:
value_function_type my_value_function;
aggregator_type my_aggregator;
storage_map_type my_storage_map; // storage map for used objects
history_list_type my_history_list; // history list for unused objects
const std::size_t my_history_list_capacity; // history list's allowed capacity
// interface
public:
concurrent_lru_cache(value_function_type value_function, std::size_t cache_capacity)
: my_value_function(value_function), my_history_list_capacity(cache_capacity) {
my_aggregator.initialize_handler(aggregator_function_type(this));
}
handle operator[](key_type key) {
retrieve_aggregator_operation op(key);
my_aggregator.execute(&op);
if (op.is_new_value_needed()) {
op.result().second.my_value = my_value_function(key);
op.result().second.my_is_ready.store(true, std::memory_order_release);
} else {
spin_wait_while_eq(op.result().second.my_is_ready, false);
}
return handle(*this, op.result());
}
private:
void handle_operations(aggregator_operation* op_list) {
while (op_list) {
op_list->cast_and_handle(*this);
aggregator_operation* prev_op = op_list;
op_list = op_list->next;
(prev_op->status).store(1, std::memory_order_release);
}
}
void signal_end_of_usage(storage_map_reference_type map_record_ref) {
signal_end_of_usage_aggregator_operation op(map_record_ref);
my_aggregator.execute(&op);
}
void signal_end_of_usage_serial(storage_map_reference_type map_record_ref) {
storage_map_iterator_type map_it = my_storage_map.find(map_record_ref.first);
__TBB_ASSERT(map_it != my_storage_map.end(),
"cache should not return past-end iterators to outer world");
__TBB_ASSERT(&(*map_it) == &map_record_ref,
"dangling reference has been returned to outside world: data race?");
__TBB_ASSERT(std::find(my_history_list.begin(), my_history_list.end(), map_it) == my_history_list.end(),
"object in use should not be in list of unused objects ");
// if it was the last reference, put it to the LRU history
if (! --(map_it->second.my_ref_counter)) {
// if the LRU history is full, evict the oldest items to get space
if (my_history_list.size() >= my_history_list_capacity) {
if (my_history_list_capacity == 0) {
// Since LRU history capacity is zero, there is no need to keep the element in history
my_storage_map.erase(map_it);
return;
}
std::size_t number_of_elements_to_evict = 1 + my_history_list.size() - my_history_list_capacity;
for (std::size_t i = 0; i < number_of_elements_to_evict; ++i) {
storage_map_iterator_type map_it_to_evict = my_history_list.back();
__TBB_ASSERT(map_it_to_evict->second.my_ref_counter == 0,
"item to be evicted should not have a live references");
// TODO: can we use forward_list instead of list? pop_front / insert_after last
my_history_list.pop_back();
my_storage_map.erase(map_it_to_evict);
}
}
// TODO: can we use forward_list instead of list? pop_front / insert_after last
my_history_list.push_front(map_it);
map_it->second.my_history_list_iterator = my_history_list.begin();
}
}
storage_map_reference_type retrieve_serial(key_type key, bool& is_new_value_needed) {
storage_map_iterator_type map_it = my_storage_map.find(key);
if (map_it == my_storage_map.end()) {
map_it = my_storage_map.emplace_hint(
map_it, std::piecewise_construct, std::make_tuple(key), std::make_tuple(value_type(), 0, my_history_list.end(), false));
is_new_value_needed = true;
} else {
history_list_iterator_type list_it = map_it->second.my_history_list_iterator;
if (list_it != my_history_list.end()) {
__TBB_ASSERT(map_it->second.my_ref_counter == 0,
"item to be evicted should not have a live references");
// Item is going to be used. Therefore it is not a subject for eviction,
// so we remove it from LRU history.
my_history_list.erase(list_it);
map_it->second.my_history_list_iterator = my_history_list.end();
}
}
++(map_it->second.my_ref_counter);
return *map_it;
}
};
//-----------------------------------------------------------------------------
// Value type for storage map in concurrent LRU cache
//-----------------------------------------------------------------------------
template<typename KeyT, typename ValT, typename KeyToValFunctorT>
struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::storage_map_value_type {
//typedefs
public:
using ref_counter_type = std::size_t;
// fields
public:
value_type my_value;
ref_counter_type my_ref_counter;
history_list_iterator_type my_history_list_iterator;
std::atomic<bool> my_is_ready;
// interface
public:
storage_map_value_type(
value_type const& value, ref_counter_type ref_counter,
history_list_iterator_type history_list_iterator, bool is_ready)
: my_value(value), my_ref_counter(ref_counter),
my_history_list_iterator(history_list_iterator), my_is_ready(is_ready) {}
};
//-----------------------------------------------------------------------------
// Handle object for operator[] in concurrent LRU cache
//-----------------------------------------------------------------------------
template<typename KeyT, typename ValT, typename KeyToValFunctorT>
struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::handle_object {
// fields
private:
lru_cache_type* my_lru_cache_ptr;
storage_map_pointer_type my_map_record_ptr;
// interface
public:
handle_object()
: my_lru_cache_ptr(nullptr), my_map_record_ptr(nullptr) {}
handle_object(lru_cache_type& lru_cache_ref, storage_map_reference_type map_record_ref)
: my_lru_cache_ptr(&lru_cache_ref), my_map_record_ptr(&map_record_ref) {}
handle_object(handle_object&) = delete;
void operator=(handle_object&) = delete;
handle_object(handle_object&& other)
: my_lru_cache_ptr(other.my_lru_cache_ptr), my_map_record_ptr(other.my_map_record_ptr) {
__TBB_ASSERT(
(other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) ||
(other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr),
"invalid state of moving object?");
other.my_lru_cache_ptr = nullptr;
other.my_map_record_ptr = nullptr;
}
handle_object& operator=(handle_object&& other) {
__TBB_ASSERT(
(other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) ||
(other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr),
"invalid state of moving object?");
if (my_lru_cache_ptr)
my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr);
my_lru_cache_ptr = other.my_lru_cache_ptr;
my_map_record_ptr = other.my_map_record_ptr;
other.my_lru_cache_ptr = nullptr;
other.my_map_record_ptr = nullptr;
return *this;
}
~handle_object() {
if (my_lru_cache_ptr)
my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr);
}
operator bool() const {
return (my_lru_cache_ptr && my_map_record_ptr);
}
value_type& value() {
__TBB_ASSERT(my_lru_cache_ptr, "get value from already moved object?");
__TBB_ASSERT(my_map_record_ptr, "get value from an invalid or already moved object?");
return my_map_record_ptr->second.my_value;
}
};
//-----------------------------------------------------------------------------
// Aggregator operation for aggregator type in concurrent LRU cache
//-----------------------------------------------------------------------------
template<typename KeyT, typename ValT, typename KeyToValFunctorT>
struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::aggregator_operation
: aggregated_operation<aggregator_operation> {
// encapsulated helper classes
public:
enum class op_type { retrieve, signal_end_of_usage };
// fields
private:
op_type my_op;
// interface
public:
aggregator_operation(op_type op) : my_op(op) {}
// TODO: aggregator_operation can be implemented
// - as a statically typed variant type or CRTP? (static, dependent on the use case)
// - or use pointer to function and apply_visitor (dynamic)
// - or use virtual functions (dynamic)
void cast_and_handle(lru_cache_type& lru_cache_ref) {
if (my_op == op_type::retrieve)
static_cast<retrieve_aggregator_operation*>(this)->handle(lru_cache_ref);
else
static_cast<signal_end_of_usage_aggregator_operation*>(this)->handle(lru_cache_ref);
}
};
template<typename KeyT, typename ValT, typename KeyToValFunctorT>
struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::retrieve_aggregator_operation
: aggregator_operation, private no_assign {
public:
key_type my_key;
storage_map_pointer_type my_map_record_ptr;
bool my_is_new_value_needed;
public:
retrieve_aggregator_operation(key_type key)
: aggregator_operation(aggregator_operation::op_type::retrieve),
my_key(key), my_map_record_ptr(nullptr), my_is_new_value_needed(false) {}
void handle(lru_cache_type& lru_cache_ref) {
my_map_record_ptr = &lru_cache_ref.retrieve_serial(my_key, my_is_new_value_needed);
}
storage_map_reference_type result() {
__TBB_ASSERT(my_map_record_ptr, "Attempt to call result() before calling handle()");
return *my_map_record_ptr;
}
bool is_new_value_needed() { return my_is_new_value_needed; }
};
template<typename KeyT, typename ValT, typename KeyToValFunctorT>
struct concurrent_lru_cache<KeyT, ValT, KeyToValFunctorT>::signal_end_of_usage_aggregator_operation
: aggregator_operation, private no_assign {
private:
storage_map_reference_type my_map_record_ref;
public:
signal_end_of_usage_aggregator_operation(storage_map_reference_type map_record_ref)
: aggregator_operation(aggregator_operation::op_type::signal_end_of_usage),
my_map_record_ref(map_record_ref) {}
void handle(lru_cache_type& lru_cache_ref) {
lru_cache_ref.signal_end_of_usage_serial(my_map_record_ref);
}
};
// TODO: if we had a guarantee that KeyToValFunctorT always has
// ValT as its return type and KeyT as its argument type,
// we could deduce the template parameters of concurrent_lru_cache
// by pattern matching on KeyToValFunctorT
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::concurrent_lru_cache;
} // inline namespace v1
} // namespace tbb
#endif // __TBB_concurrent_lru_cache_H
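Since the guard above refuses to compile unless TBB_PREVIEW_CONCURRENT_LRU_CACHE is set, a usage sketch has to define it first. The include path mirrors this commit's layout, and render_label is a hypothetical value function, computed once per key and cached afterwards.

#define TBB_PREVIEW_CONCURRENT_LRU_CACHE 1            // required by the guard above
#include "third_party/tbb/concurrent_lru_cache.hh"
#include <string>

static std::string render_label(int id) { return "item #" + std::to_string(id); }

void lru_example() {
  // Keep at most 128 unused values in the LRU history; live handles pin their entries.
  tbb::concurrent_lru_cache<int, std::string> cache(&render_label, 128);
  {
    auto h = cache[42];              // computes render_label(42) on first use
    std::string& label = h.value();
    (void)label;
  }                                  // handle destruction returns the entry to the LRU history
}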

351
third_party/tbb/concurrent_map.hh vendored Normal file
View file

@@ -0,0 +1,351 @@
// clang-format off
/*
Copyright (c) 2019-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_map_H
#define __TBB_concurrent_map_H
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_concurrent_skip_list.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/tuple"
#include "third_party/libcxx/utility"
namespace tbb {
namespace detail {
namespace d2 {
template<typename Key, typename Value, typename KeyCompare, typename RandomGenerator,
typename Allocator, bool AllowMultimapping>
struct map_traits {
static constexpr std::size_t max_level = RandomGenerator::max_level;
using random_level_generator_type = RandomGenerator;
using key_type = Key;
using mapped_type = Value;
using compare_type = KeyCompare;
using value_type = std::pair<const key_type, mapped_type>;
using reference = value_type&;
using const_reference = const value_type&;
using allocator_type = Allocator;
static constexpr bool allow_multimapping = AllowMultimapping;
class value_compare {
public:
bool operator()(const value_type& lhs, const value_type& rhs) const {
return comp(lhs.first, rhs.first);
}
protected:
value_compare(compare_type c) : comp(c) {}
friend struct map_traits;
compare_type comp;
};
static value_compare value_comp(compare_type comp) { return value_compare(comp); }
static const key_type& get_key(const_reference val) {
return val.first;
}
}; // struct map_traits
template <typename Key, typename Value, typename Compare, typename Allocator>
class concurrent_multimap;
template <typename Key, typename Value, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<std::pair<const Key, Value>>>
class concurrent_map : public concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, false>> {
using base_type = concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, false>>;
public:
using key_type = Key;
using mapped_type = Value;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using key_compare = Compare;
using value_compare = typename base_type::value_compare;
using allocator_type = Allocator;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base type
using base_type::base_type;
// Required for implicit deduction guides
concurrent_map() = default;
concurrent_map( const concurrent_map& ) = default;
concurrent_map( const concurrent_map& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_map( concurrent_map&& ) = default;
concurrent_map( concurrent_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_map& operator=( const concurrent_map& ) = default;
concurrent_map& operator=( concurrent_map&& ) = default;
concurrent_map& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
// Observers
mapped_type& at(const key_type& key) {
iterator it = this->find(key);
if (it == this->end()) {
throw_exception(exception_id::invalid_key);
}
return it->second;
}
const mapped_type& at(const key_type& key) const {
return const_cast<concurrent_map*>(this)->at(key);
}
mapped_type& operator[](const key_type& key) {
iterator it = this->find(key);
if (it == this->end()) {
it = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first;
}
return it->second;
}
mapped_type& operator[](key_type&& key) {
iterator it = this->find(key);
if (it == this->end()) {
it = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first;
}
return it->second;
}
using base_type::insert;
template <typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
std::pair<iterator, bool>>::type insert( P&& value )
{
return this->emplace(std::forward<P>(value));
}
template <typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
iterator>::type insert( const_iterator hint, P&& value )
{
return this->emplace_hint(hint, std::forward<P>(value));
}
template<typename OtherCompare>
void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
template<typename OtherCompare>
void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
}; // class concurrent_map
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Comp = std::less<iterator_key_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_map( It, It, Comp = Comp(), Alloc = Alloc() )
-> concurrent_map<iterator_key_t<It>, iterator_mapped_t<It>, Comp, Alloc>;
template <typename Key, typename T,
typename Comp = std::less<std::remove_const_t<Key>>,
typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_map( std::initializer_list<std::pair<Key, T>>, Comp = Comp(), Alloc = Alloc() )
-> concurrent_map<std::remove_const_t<Key>, T, Comp, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_map( It, It, Alloc )
-> concurrent_map<iterator_key_t<It>, iterator_mapped_t<It>,
std::less<iterator_key_t<It>>, Alloc>;
template <typename Key, typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_map( std::initializer_list<std::pair<Key, T>>, Alloc )
-> concurrent_map<std::remove_const_t<Key>, T, std::less<std::remove_const_t<Key>>, Alloc>;
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename Value, typename Compare, typename Allocator>
void swap( concurrent_map<Key, Value, Compare, Allocator>& lhs,
concurrent_map<Key, Value, Compare, Allocator>& rhs )
{
lhs.swap(rhs);
}
template <typename Key, typename Value, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<std::pair<const Key, Value>>>
class concurrent_multimap : public concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, true>> {
using base_type = concurrent_skip_list<map_traits<Key, Value, Compare, concurrent_geometric_level_generator<32>, Allocator, true>>;
public:
using key_type = Key;
using mapped_type = Value;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using key_compare = Compare;
using value_compare = typename base_type::value_compare;
using allocator_type = Allocator;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base_type
using base_type::base_type;
using base_type::insert;
// Required for implicit deduction guides
concurrent_multimap() = default;
concurrent_multimap( const concurrent_multimap& ) = default;
concurrent_multimap( const concurrent_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_multimap( concurrent_multimap&& ) = default;
concurrent_multimap( concurrent_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_multimap& operator=( const concurrent_multimap& ) = default;
concurrent_multimap& operator=( concurrent_multimap&& ) = default;
concurrent_multimap& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
template <typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
std::pair<iterator, bool>>::type insert( P&& value )
{
return this->emplace(std::forward<P>(value));
}
template <typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
iterator>::type insert( const_iterator hint, P&& value )
{
return this->emplace_hint(hint, std::forward<P>(value));
}
template<typename OtherCompare>
void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_multimap<key_type, mapped_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
template<typename OtherCompare>
void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_map<key_type, mapped_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
}; // class concurrent_multimap
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Comp = std::less<iterator_key_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_multimap( It, It, Comp = Comp(), Alloc = Alloc() )
-> concurrent_multimap<iterator_key_t<It>, iterator_mapped_t<It>, Comp, Alloc>;
template <typename Key, typename T,
typename Comp = std::less<std::remove_const_t<Key>>,
typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_multimap( std::initializer_list<std::pair<Key, T>>, Comp = Comp(), Alloc = Alloc() )
-> concurrent_multimap<std::remove_const_t<Key>, T, Comp, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_multimap( It, It, Alloc )
-> concurrent_multimap<iterator_key_t<It>, iterator_mapped_t<It>,
std::less<iterator_key_t<It>>, Alloc>;
template <typename Key, typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_multimap( std::initializer_list<std::pair<Key, T>>, Alloc )
-> concurrent_multimap<std::remove_const_t<Key>, T, std::less<std::remove_const_t<Key>>, Alloc>;
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename Value, typename Compare, typename Allocator>
void swap( concurrent_multimap<Key, Value, Compare, Allocator>& lhs,
concurrent_multimap<Key, Value, Compare, Allocator>& rhs )
{
lhs.swap(rhs);
}
} // namespace d2
} // namespace detail
inline namespace v1 {
using detail::d2::concurrent_map;
using detail::d2::concurrent_multimap;
using detail::split;
} // inline namespace v1
} // namespace tbb
#endif // __TBB_concurrent_map_H
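A short usage sketch of the map just added: insertions and lookups may run concurrently, and the include path simply follows this commit's vendored layout.

#include "third_party/tbb/concurrent_map.hh"
#include <string>

void map_example() {
  tbb::concurrent_map<int, std::string> m;
  m.emplace(1, "one");               // thread-safe insertion
  m[2] = "two";                      // operator[] default-constructs the value, then we assign
  auto it = m.find(1);
  if (it != m.end()) {
    std::string& v = it->second;     // lookups are safe alongside other insertions
    (void)v;
  }
}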

489
third_party/tbb/concurrent_monitor.hh vendored Normal file
View file

@@ -0,0 +1,489 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_monitor_H
#define __TBB_concurrent_monitor_H
#include "third_party/tbb/spin_mutex.hh"
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_aligned_space.hh"
#include "third_party/tbb/concurrent_monitor_mutex.hh"
#include "third_party/tbb/semaphore.hh"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace r1 {
//! Circular doubly-linked list with sentinel
/** head.next points to the front and head.prev points to the back */
class circular_doubly_linked_list_with_sentinel {
public:
struct base_node {
base_node* next;
base_node* prev;
constexpr base_node(base_node* n, base_node* p) : next(n), prev(p) {}
explicit base_node() : next((base_node*)(uintptr_t)0xcdcdcdcd), prev((base_node*)(uintptr_t)0xcdcdcdcd) {}
};
// ctor
constexpr circular_doubly_linked_list_with_sentinel() : count(0), head(&head, &head) {}
circular_doubly_linked_list_with_sentinel(const circular_doubly_linked_list_with_sentinel&) = delete;
circular_doubly_linked_list_with_sentinel& operator=(const circular_doubly_linked_list_with_sentinel&) = delete;
inline std::size_t size() const { return count.load(std::memory_order_relaxed); }
inline bool empty() const { return size() == 0; }
inline base_node* front() const { return head.next; }
inline base_node* last() const { return head.prev; }
inline const base_node* end() const { return &head; }
//! add to the back of the list
inline void add( base_node* n ) {
count.store(count.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
n->prev = head.prev;
n->next = &head;
head.prev->next = n;
head.prev = n;
}
//! remove node 'n'
inline void remove( base_node& n ) {
__TBB_ASSERT(count.load(std::memory_order_relaxed) > 0, "attempt to remove an item from an empty list");
count.store(count.load( std::memory_order_relaxed ) - 1, std::memory_order_relaxed);
n.prev->next = n.next;
n.next->prev = n.prev;
}
//! move all elements to 'lst' and initialize the 'this' list
inline void flush_to( circular_doubly_linked_list_with_sentinel& lst ) {
const std::size_t l_count = size();
if (l_count > 0) {
lst.count.store(l_count, std::memory_order_relaxed);
lst.head.next = head.next;
lst.head.prev = head.prev;
head.next->prev = &lst.head;
head.prev->next = &lst.head;
clear();
}
}
void clear() {
head.next = &head;
head.prev = &head;
count.store(0, std::memory_order_relaxed);
}
private:
std::atomic<std::size_t> count;
base_node head;
};
using base_list = circular_doubly_linked_list_with_sentinel;
using base_node = circular_doubly_linked_list_with_sentinel::base_node;
template <typename Context>
class concurrent_monitor_base;
template <typename Context>
class wait_node : public base_node {
public:
#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
wait_node(Context ctx) : my_context(ctx), my_is_in_list(false) {}
#else
wait_node(Context ctx) : my_context(ctx) {}
#endif
virtual ~wait_node() = default;
virtual void init() {
__TBB_ASSERT(!my_initialized, nullptr);
my_initialized = true;
}
virtual void wait() = 0;
virtual void reset() {
__TBB_ASSERT(my_skipped_wakeup, nullptr);
my_skipped_wakeup = false;
}
virtual void notify() = 0;
protected:
friend class concurrent_monitor_base<Context>;
friend class thread_data;
Context my_context{};
#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
std::atomic<bool> my_is_in_list;
#else
std::atomic<bool> my_is_in_list{false};
#endif
bool my_initialized{false};
bool my_skipped_wakeup{false};
bool my_aborted{false};
unsigned my_epoch{0};
};
template <typename Context>
class sleep_node : public wait_node<Context> {
using base_type = wait_node<Context>;
public:
using base_type::base_type;
~sleep_node() override {
if (this->my_initialized) {
if (this->my_skipped_wakeup) semaphore().P();
semaphore().~binary_semaphore();
}
}
binary_semaphore& semaphore() { return *sema.begin(); }
void init() override {
if (!this->my_initialized) {
new (sema.begin()) binary_semaphore;
base_type::init();
}
}
void wait() override {
__TBB_ASSERT(this->my_initialized,
"Use of commit_wait() without prior prepare_wait()");
semaphore().P();
__TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?");
if (this->my_aborted)
throw_exception(exception_id::user_abort);
}
void reset() override {
base_type::reset();
semaphore().P();
}
void notify() override {
semaphore().V();
}
private:
tbb::detail::aligned_space<binary_semaphore> sema;
};
//! concurrent_monitor
/** fine-grained concurrent_monitor implementation */
template <typename Context>
class concurrent_monitor_base {
public:
//! ctor
constexpr concurrent_monitor_base() {}
//! dtor
~concurrent_monitor_base() = default;
concurrent_monitor_base(const concurrent_monitor_base&) = delete;
concurrent_monitor_base& operator=(const concurrent_monitor_base&) = delete;
//! prepare wait by inserting 'thr' into the wait queue
void prepare_wait( wait_node<Context>& node) {
// TODO: consider making even more lazy instantiation of the semaphore, that is only when it is actually needed, e.g. move it in node::wait()
if (!node.my_initialized) {
node.init();
}
// this is a good place to pump a previously skipped wakeup
else if (node.my_skipped_wakeup) {
node.reset();
}
node.my_is_in_list.store(true, std::memory_order_relaxed);
{
concurrent_monitor_mutex::scoped_lock l(my_mutex);
node.my_epoch = my_epoch.load(std::memory_order_relaxed);
my_waitset.add(&node);
}
// prepare_wait() guarantees a Write-Read memory barrier.
// In C++ only a full fence provides this type of barrier.
atomic_fence_seq_cst();
}
//! Commit wait if event count has not changed; otherwise, cancel wait.
/** Returns true if committed, false if canceled. */
inline bool commit_wait( wait_node<Context>& node ) {
const bool do_it = node.my_epoch == my_epoch.load(std::memory_order_relaxed);
// this check is just an optimization
if (do_it) {
node.wait();
} else {
cancel_wait( node );
}
return do_it;
}
//! Cancel the wait. Removes the thread from the wait queue if not removed yet.
void cancel_wait( wait_node<Context>& node ) {
// possible skipped wakeup will be pumped in the following prepare_wait()
node.my_skipped_wakeup = true;
// try to remove node from waitset
// Cancel wait guarantees acquire memory barrier.
bool in_list = node.my_is_in_list.load(std::memory_order_acquire);
if (in_list) {
concurrent_monitor_mutex::scoped_lock l(my_mutex);
if (node.my_is_in_list.load(std::memory_order_relaxed)) {
my_waitset.remove(node);
// node is removed from waitset, so there will be no wakeup
node.my_is_in_list.store(false, std::memory_order_relaxed);
node.my_skipped_wakeup = false;
}
}
}
//! Wait for a condition to be satisfied with waiting-on my_context
template <typename NodeType, typename Pred>
bool wait(Pred&& pred, NodeType&& node) {
prepare_wait(node);
while (!guarded_call(std::forward<Pred>(pred), node)) {
if (commit_wait(node)) {
return true;
}
prepare_wait(node);
}
cancel_wait(node);
return false;
}
//! Notify one thread about the event
void notify_one() {
atomic_fence_seq_cst();
notify_one_relaxed();
}
//! Notify one thread about the event. Relaxed version.
void notify_one_relaxed() {
if (my_waitset.empty()) {
return;
}
base_node* n;
const base_node* end = my_waitset.end();
{
concurrent_monitor_mutex::scoped_lock l(my_mutex);
my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
n = my_waitset.front();
if (n != end) {
my_waitset.remove(*n);
to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
}
}
if (n != end) {
to_wait_node(n)->notify();
}
}
//! Notify all waiting threads of the event
void notify_all() {
atomic_fence_seq_cst();
notify_all_relaxed();
}
//! Notify all waiting threads of the event; Relaxed version
void notify_all_relaxed() {
if (my_waitset.empty()) {
return;
}
base_list temp;
const base_node* end;
{
concurrent_monitor_mutex::scoped_lock l(my_mutex);
my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
// TODO: Possible optimization, don't change node state under lock, just do flush
my_waitset.flush_to(temp);
end = temp.end();
for (base_node* n = temp.front(); n != end; n = n->next) {
to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
}
}
base_node* nxt;
for (base_node* n = temp.front(); n != end; n=nxt) {
nxt = n->next;
to_wait_node(n)->notify();
}
#if TBB_USE_ASSERT
temp.clear();
#endif
}
//! Notify waiting threads of the event that satisfies the given predicate
template <typename P>
void notify( const P& predicate ) {
atomic_fence_seq_cst();
notify_relaxed( predicate );
}
//! Notify waiting threads of the event that satisfies the given predicate;
//! the predicate is called under the lock. Relaxed version.
template<typename P>
void notify_relaxed( const P& predicate ) {
if (my_waitset.empty()) {
return;
}
base_list temp;
base_node* nxt;
const base_node* end = my_waitset.end();
{
concurrent_monitor_mutex::scoped_lock l(my_mutex);
my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed);
for (base_node* n = my_waitset.last(); n != end; n = nxt) {
nxt = n->prev;
auto* node = static_cast<wait_node<Context>*>(n);
if (predicate(node->my_context)) {
my_waitset.remove(*n);
node->my_is_in_list.store(false, std::memory_order_relaxed);
temp.add(n);
}
}
}
end = temp.end();
for (base_node* n=temp.front(); n != end; n = nxt) {
nxt = n->next;
to_wait_node(n)->notify();
}
#if TBB_USE_ASSERT
temp.clear();
#endif
}
//! Notify waiting threads of the event that satisfies the given predicate;
//! the predicate is called under the lock. Relaxed version.
template<typename P>
void notify_one_relaxed( const P& predicate ) {
if (my_waitset.empty()) {
return;
}
base_node* tmp = nullptr;
base_node* next{};
const base_node* end = my_waitset.end();
{
concurrent_monitor_mutex::scoped_lock l(my_mutex);
my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed);
for (base_node* n = my_waitset.last(); n != end; n = next) {
next = n->prev;
auto* node = static_cast<wait_node<Context>*>(n);
if (predicate(node->my_context)) {
my_waitset.remove(*n);
node->my_is_in_list.store(false, std::memory_order_relaxed);
tmp = n;
break;
}
}
}
if (tmp) {
to_wait_node(tmp)->notify();
}
}
//! Abort any sleeping threads at the time of the call
void abort_all() {
atomic_fence_seq_cst();
abort_all_relaxed();
}
//! Abort any sleeping threads at the time of the call; Relaxed version
void abort_all_relaxed() {
if (my_waitset.empty()) {
return;
}
base_list temp;
const base_node* end;
{
concurrent_monitor_mutex::scoped_lock l(my_mutex);
my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
my_waitset.flush_to(temp);
end = temp.end();
for (base_node* n = temp.front(); n != end; n = n->next) {
to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
}
}
base_node* nxt;
for (base_node* n = temp.front(); n != end; n = nxt) {
nxt = n->next;
to_wait_node(n)->my_aborted = true;
to_wait_node(n)->notify();
}
#if TBB_USE_ASSERT
temp.clear();
#endif
}
void destroy() {
this->abort_all();
my_mutex.destroy();
__TBB_ASSERT(this->my_waitset.empty(), "waitset not empty?");
}
private:
template <typename NodeType, typename Pred>
bool guarded_call(Pred&& predicate, NodeType& node) {
bool res = false;
tbb::detail::d0::try_call( [&] {
res = std::forward<Pred>(predicate)();
}).on_exception( [&] {
cancel_wait(node);
});
return res;
}
concurrent_monitor_mutex my_mutex{};
base_list my_waitset{};
std::atomic<unsigned> my_epoch{};
wait_node<Context>* to_wait_node( base_node* node ) { return static_cast<wait_node<Context>*>(node); }
};
class concurrent_monitor : public concurrent_monitor_base<std::uintptr_t> {
using base_type = concurrent_monitor_base<std::uintptr_t>;
public:
using base_type::base_type;
~concurrent_monitor() {
destroy();
}
/** per-thread descriptor for concurrent_monitor */
using thread_context = sleep_node<std::uintptr_t>;
};
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* __TBB_concurrent_monitor_H */
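This class is internal (detail::r1) plumbing rather than a public API, but the waiter-side protocol it implements is easier to see in a small sketch. Assuming the surrounding r1 headers are available; wait_until is a hypothetical helper, and the ctx value is whatever the notifier's predicate will later inspect.

#include "third_party/tbb/concurrent_monitor.hh"
#include <cstdint>

using tbb::detail::r1::concurrent_monitor;

bool wait_until(concurrent_monitor& mon, std::uintptr_t ctx, bool (*ready)()) {
  // wait() loops prepare_wait -> re-check predicate -> commit_wait; it parks the
  // thread on a binary semaphore only if the predicate is still false after the
  // wait node has been published, which closes the classic lost-wakeup window.
  concurrent_monitor::thread_context node(ctx);
  return mon.wait([&] { return ready(); }, node);
}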

View file

@@ -0,0 +1,114 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_monitor_mutex_H
#define __TBB_monitor_mutex_H
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/detail/_aligned_space.hh"
#include "third_party/tbb/semaphore.hh"
#include "third_party/libcxx/mutex"
namespace tbb {
namespace detail {
namespace r1 {
class concurrent_monitor_mutex {
public:
using scoped_lock = std::lock_guard<concurrent_monitor_mutex>;
constexpr concurrent_monitor_mutex() {}
~concurrent_monitor_mutex() = default;
void destroy() {
#if !__TBB_USE_FUTEX
if (my_init_flag.load(std::memory_order_relaxed)) {
get_semaphore().~semaphore();
}
#endif
}
void lock() {
auto wakeup_condition = [&] {
return my_flag.load(std::memory_order_relaxed) == 0;
};
while (my_flag.exchange(1)) {
if (!timed_spin_wait_until(wakeup_condition)) {
++my_waiters;
while (!wakeup_condition()) {
wait();
}
--my_waiters;
}
}
}
void unlock() {
my_flag.exchange(0); // full fence, so the next load is relaxed
if (my_waiters.load(std::memory_order_relaxed)) {
wakeup();
}
}
private:
void wait() {
#if __TBB_USE_FUTEX
futex_wait(&my_flag, 1);
#else
get_semaphore().P();
#endif
}
void wakeup() {
#if __TBB_USE_FUTEX
futex_wakeup_one(&my_flag);
#else
get_semaphore().V();
#endif
}
// The flag should be int for the futex operations
std::atomic<int> my_flag{0};
std::atomic<int> my_waiters{0};
#if !__TBB_USE_FUTEX
semaphore& get_semaphore() {
if (!my_init_flag.load(std::memory_order_acquire)) {
std::lock_guard<std::mutex> lock(my_init_mutex);
if (!my_init_flag.load(std::memory_order_relaxed)) {
new (my_semaphore.begin()) semaphore();
my_init_flag.store(true, std::memory_order_release);
}
}
return *my_semaphore.begin();
}
static std::mutex my_init_mutex;
std::atomic<bool> my_init_flag{false};
aligned_space<semaphore> my_semaphore{};
#endif
};
} // namespace r1
} // namespace detail
} // namespace tbb
#endif // __TBB_monitor_mutex_H
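The mutex above is a test-and-set spin lock that parks waiters on a futex (or a lazily constructed semaphore) and only issues a wakeup when my_waiters is non-zero. Below is a rough, portable analogy of that protocol with std::condition_variable standing in for the futex/semaphore; it illustrates the idea only and is not the class above.

#include <atomic>
#include <condition_variable>
#include <mutex>

class parking_spinlock {
  std::atomic<int> flag{0};
  std::atomic<int> waiters{0};
  std::mutex m;                      // guards the condition variable only
  std::condition_variable cv;
public:
  void lock() {
    while (flag.exchange(1, std::memory_order_acquire)) {   // test-and-set
      ++waiters;
      std::unique_lock<std::mutex> lk(m);
      cv.wait(lk, [&] { return flag.load(std::memory_order_relaxed) == 0; });
      --waiters;
    }
  }
  void unlock() {
    flag.store(0, std::memory_order_release);
    if (waiters.load(std::memory_order_relaxed)) {           // wake only if someone parked
      std::lock_guard<std::mutex> lk(m);
      cv.notify_one();
    }
  }
};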

View file

@@ -0,0 +1,491 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_priority_queue_H
#define __TBB_concurrent_priority_queue_H
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_aggregator.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/tbb/detail/_allocator_traits.hh"
#include "third_party/tbb/detail/_range_common.hh"
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/detail/_containers_helpers.hh"
#include "third_party/tbb/cache_aligned_allocator.hh"
#include "third_party/libcxx/vector"
#include "third_party/libcxx/iterator"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/utility"
#include "third_party/libcxx/initializer_list"
#include "third_party/libcxx/type_traits"
namespace tbb {
namespace detail {
namespace d1 {
template <typename T, typename Compare = std::less<T>, typename Allocator = cache_aligned_allocator<T>>
class concurrent_priority_queue {
public:
using value_type = T;
using reference = T&;
using const_reference = const T&;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
using allocator_type = Allocator;
concurrent_priority_queue() : concurrent_priority_queue(allocator_type{}) {}
explicit concurrent_priority_queue( const allocator_type& alloc )
: mark(0), my_size(0), my_compare(), data(alloc)
{
my_aggregator.initialize_handler(functor{this});
}
explicit concurrent_priority_queue( const Compare& compare, const allocator_type& alloc = allocator_type() )
: mark(0), my_size(0), my_compare(compare), data(alloc)
{
my_aggregator.initialize_handler(functor{this});
}
explicit concurrent_priority_queue( size_type init_capacity, const allocator_type& alloc = allocator_type() )
: mark(0), my_size(0), my_compare(), data(alloc)
{
data.reserve(init_capacity);
my_aggregator.initialize_handler(functor{this});
}
explicit concurrent_priority_queue( size_type init_capacity, const Compare& compare, const allocator_type& alloc = allocator_type() )
: mark(0), my_size(0), my_compare(compare), data(alloc)
{
data.reserve(init_capacity);
my_aggregator.initialize_handler(functor{this});
}
template <typename InputIterator>
concurrent_priority_queue( InputIterator begin, InputIterator end, const Compare& compare, const allocator_type& alloc = allocator_type() )
: mark(0), my_compare(compare), data(begin, end, alloc)
{
my_aggregator.initialize_handler(functor{this});
heapify();
my_size.store(data.size(), std::memory_order_relaxed);
}
template <typename InputIterator>
concurrent_priority_queue( InputIterator begin, InputIterator end, const allocator_type& alloc = allocator_type() )
: concurrent_priority_queue(begin, end, Compare(), alloc) {}
concurrent_priority_queue( std::initializer_list<value_type> init, const Compare& compare, const allocator_type& alloc = allocator_type() )
: concurrent_priority_queue(init.begin(), init.end(), compare, alloc) {}
concurrent_priority_queue( std::initializer_list<value_type> init, const allocator_type& alloc = allocator_type() )
: concurrent_priority_queue(init, Compare(), alloc) {}
concurrent_priority_queue( const concurrent_priority_queue& other )
: mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
data(other.data)
{
my_aggregator.initialize_handler(functor{this});
}
concurrent_priority_queue( const concurrent_priority_queue& other, const allocator_type& alloc )
: mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
data(other.data, alloc)
{
my_aggregator.initialize_handler(functor{this});
}
concurrent_priority_queue( concurrent_priority_queue&& other )
: mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
data(std::move(other.data))
{
my_aggregator.initialize_handler(functor{this});
}
concurrent_priority_queue( concurrent_priority_queue&& other, const allocator_type& alloc )
: mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare),
data(std::move(other.data), alloc)
{
my_aggregator.initialize_handler(functor{this});
}
concurrent_priority_queue& operator=( const concurrent_priority_queue& other ) {
if (this != &other) {
data = other.data;
mark = other.mark;
my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
}
return *this;
}
concurrent_priority_queue& operator=( concurrent_priority_queue&& other ) {
if (this != &other) {
// TODO: check if exceptions from std::vector::operator=(vector&&) should be handled separately
data = std::move(other.data);
mark = other.mark;
my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
}
return *this;
}
concurrent_priority_queue& operator=( std::initializer_list<value_type> init ) {
assign(init.begin(), init.end());
return *this;
}
template <typename InputIterator>
void assign( InputIterator begin, InputIterator end ) {
data.assign(begin, end);
mark = 0;
my_size.store(data.size(), std::memory_order_relaxed);
heapify();
}
void assign( std::initializer_list<value_type> init ) {
assign(init.begin(), init.end());
}
/* Returned value may not reflect results of pending operations.
This operation reads shared data and will trigger a race condition. */
__TBB_nodiscard bool empty() const { return size() == 0; }
// Returns the current number of elements contained in the queue
/* Returned value may not reflect results of pending operations.
This operation reads shared data and will trigger a race condition. */
size_type size() const { return my_size.load(std::memory_order_relaxed); }
/* This operation can be safely used concurrently with other push, try_pop or emplace operations. */
void push( const value_type& value ) {
cpq_operation op_data(value, PUSH_OP);
my_aggregator.execute(&op_data);
if (op_data.status == FAILED)
throw_exception(exception_id::bad_alloc);
}
/* This operation can be safely used concurrently with other push, try_pop or emplace operations. */
void push( value_type&& value ) {
cpq_operation op_data(value, PUSH_RVALUE_OP);
my_aggregator.execute(&op_data);
if (op_data.status == FAILED)
throw_exception(exception_id::bad_alloc);
}
/* This operation can be safely used concurrently with other push, try_pop or emplace operations. */
template <typename... Args>
void emplace( Args&&... args ) {
// TODO: support uses allocator construction in this place
push(value_type(std::forward<Args>(args)...));
}
// Gets a reference to and removes highest priority element
/* If a highest priority element was found, sets elem and returns true,
otherwise returns false.
This operation can be safely used concurrently with other push, try_pop or emplace operations. */
bool try_pop( value_type& value ) {
cpq_operation op_data(value, POP_OP);
my_aggregator.execute(&op_data);
return op_data.status == SUCCEEDED;
}
// This operation affects the whole container => it is not thread-safe
void clear() {
data.clear();
mark = 0;
my_size.store(0, std::memory_order_relaxed);
}
// This operation affects the whole container => it is not thread-safe
void swap( concurrent_priority_queue& other ) {
if (this != &other) {
using std::swap;
swap(data, other.data);
swap(mark, other.mark);
size_type sz = my_size.load(std::memory_order_relaxed);
my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
other.my_size.store(sz, std::memory_order_relaxed);
}
}
allocator_type get_allocator() const { return data.get_allocator(); }
private:
enum operation_type {INVALID_OP, PUSH_OP, POP_OP, PUSH_RVALUE_OP};
enum operation_status {WAIT = 0, SUCCEEDED, FAILED};
class cpq_operation : public aggregated_operation<cpq_operation> {
public:
operation_type type;
union {
value_type* elem;
size_type sz;
};
cpq_operation( const value_type& value, operation_type t )
: type(t), elem(const_cast<value_type*>(&value)) {}
}; // class cpq_operation
class functor {
concurrent_priority_queue* my_cpq;
public:
functor() : my_cpq(nullptr) {}
functor( concurrent_priority_queue* cpq ) : my_cpq(cpq) {}
void operator()(cpq_operation* op_list) {
__TBB_ASSERT(my_cpq != nullptr, "Invalid functor");
my_cpq->handle_operations(op_list);
}
}; // class functor
void handle_operations( cpq_operation* op_list ) {
call_itt_notify(acquired, this);
cpq_operation* tmp, *pop_list = nullptr;
__TBB_ASSERT(mark == data.size(), nullptr);
// First pass processes all pushes and pops that take constant (amortized, since reallocation may happen) time.
while(op_list) {
// ITT note: &(op_list->status) tag is used to cover accesses to op_list
// node. This thread is going to handle the operation, and so will acquire it
// and perform the associated operation w/o triggering a race condition; the
// thread that created the operation is waiting on the status field, so when
// this thread is done with the operation, it will perform a
// store_with_release to give control back to the waiting thread in
// aggregator::insert_operation.
// TODO: enable
call_itt_notify(acquired, &(op_list->status));
__TBB_ASSERT(op_list->type != INVALID_OP, nullptr);
tmp = op_list;
op_list = op_list->next.load(std::memory_order_relaxed);
if (tmp->type == POP_OP) {
if (mark < data.size() &&
my_compare(data[0], data.back()))
{
// there are newly pushed elems and the last one is higher than top
*(tmp->elem) = std::move(data.back());
my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
data.pop_back();
__TBB_ASSERT(mark <= data.size(), nullptr);
} else { // no convenient item to pop; postpone
tmp->next.store(pop_list, std::memory_order_relaxed);
pop_list = tmp;
}
} else { // PUSH_OP or PUSH_RVALUE_OP
__TBB_ASSERT(tmp->type == PUSH_OP || tmp->type == PUSH_RVALUE_OP, "Unknown operation");
#if TBB_USE_EXCEPTIONS
try
#endif
{
if (tmp->type == PUSH_OP) {
push_back_helper(*(tmp->elem));
} else {
data.push_back(std::move(*(tmp->elem)));
}
my_size.store(my_size.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
}
#if TBB_USE_EXCEPTIONS
catch(...) {
tmp->status.store(uintptr_t(FAILED), std::memory_order_release);
}
#endif
}
}
// Second pass processes pop operations
while(pop_list) {
tmp = pop_list;
pop_list = pop_list->next.load(std::memory_order_relaxed);
__TBB_ASSERT(tmp->type == POP_OP, nullptr);
if (data.empty()) {
tmp->status.store(uintptr_t(FAILED), std::memory_order_release);
} else {
__TBB_ASSERT(mark <= data.size(), nullptr);
if (mark < data.size() &&
my_compare(data[0], data.back()))
{
// there are newly pushed elems and the last one is higher than top
*(tmp->elem) = std::move(data.back());
my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
data.pop_back();
} else { // extract top and push last element down heap
*(tmp->elem) = std::move(data[0]);
my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed);
tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release);
reheap();
}
}
}
// heapify any leftover pushed elements before doing the next
// batch of operations
if (mark < data.size()) heapify();
__TBB_ASSERT(mark == data.size(), nullptr);
call_itt_notify(releasing, this);
}
// Merge unsorted elements into heap
void heapify() {
if (!mark && data.size() > 0) mark = 1;
for (; mark < data.size(); ++mark) {
// for each unheapified element under size
size_type cur_pos = mark;
value_type to_place = std::move(data[mark]);
do { // push to_place up the heap
size_type parent = (cur_pos - 1) >> 1;
if (!my_compare(data[parent], to_place))
break;
data[cur_pos] = std::move(data[parent]);
cur_pos = parent;
} while(cur_pos);
data[cur_pos] = std::move(to_place);
}
}
// Re-heapify after an extraction
// Re-heapify by pushing last element down the heap from the root.
void reheap() {
size_type cur_pos = 0, child = 1;
while(child < mark) {
size_type target = child;
if (child + 1 < mark && my_compare(data[child], data[child + 1]))
++target;
// target now has the higher priority child
if (my_compare(data[target], data.back()))
break;
data[cur_pos] = std::move(data[target]);
cur_pos = target;
child = (cur_pos << 1) + 1;
}
if (cur_pos != data.size() - 1)
data[cur_pos] = std::move(data.back());
data.pop_back();
if (mark > data.size()) mark = data.size();
}
void push_back_helper( const T& value ) {
push_back_helper_impl(value, std::is_copy_constructible<T>{});
}
void push_back_helper_impl( const T& value, /*is_copy_constructible = */std::true_type ) {
data.push_back(value);
}
void push_back_helper_impl( const T&, /*is_copy_constructible = */std::false_type ) {
__TBB_ASSERT(false, "error: calling tbb::concurrent_priority_queue.push(const value_type&) for move-only type");
}
using aggregator_type = aggregator<functor, cpq_operation>;
aggregator_type my_aggregator;
// Padding added to avoid false sharing
char padding1[max_nfs_size - sizeof(aggregator_type)];
// The point at which unsorted elements begin
size_type mark;
std::atomic<size_type> my_size;
Compare my_compare;
// Padding added to avoid false sharing
char padding2[max_nfs_size - (2*sizeof(size_type)) - sizeof(Compare)];
//! Storage for the heap of elements in queue, plus unheapified elements
/** data has the following structure:
     binary  unheapified
      heap    elements
    ____|_______|____
        |       |       |
        v       v       v
    [_|...|_|_|...|_| |...| ]
     0       ^       ^     ^
             |       |     |__capacity
             |       |__my_size
             |__mark
Thus, data stores the binary heap starting at position 0 through
mark-1 (it may be empty). Then there are 0 or more elements
that have not yet been inserted into the heap, in positions
mark through my_size-1. */
using vector_type = std::vector<value_type, allocator_type>;
vector_type data;
friend bool operator==( const concurrent_priority_queue& lhs,
const concurrent_priority_queue& rhs )
{
return lhs.data == rhs.data;
}
#if !__TBB_CPP20_COMPARISONS_PRESENT
friend bool operator!=( const concurrent_priority_queue& lhs,
const concurrent_priority_queue& rhs )
{
return !(lhs == rhs);
}
#endif
}; // class concurrent_priority_queue
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Comp = std::less<iterator_value_t<It>>,
typename Alloc = tbb::cache_aligned_allocator<iterator_value_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_priority_queue( It, It, Comp = Comp(), Alloc = Alloc() )
-> concurrent_priority_queue<iterator_value_t<It>, Comp, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_priority_queue( It, It, Alloc )
-> concurrent_priority_queue<iterator_value_t<It>, std::less<iterator_value_t<It>>, Alloc>;
template <typename T,
typename Comp = std::less<T>,
typename Alloc = tbb::cache_aligned_allocator<T>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_priority_queue( std::initializer_list<T>, Comp = Comp(), Alloc = Alloc() )
-> concurrent_priority_queue<T, Comp, Alloc>;
template <typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_priority_queue( std::initializer_list<T>, Alloc )
-> concurrent_priority_queue<T, std::less<T>, Alloc>;
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename T, typename Compare, typename Allocator>
void swap( concurrent_priority_queue<T, Compare, Allocator>& lhs,
concurrent_priority_queue<T, Compare, Allocator>& rhs )
{
lhs.swap(rhs);
}
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::concurrent_priority_queue;
} // inline namespace v1
} // namespace tbb
#endif // __TBB_concurrent_priority_queue_H

701
third_party/tbb/concurrent_queue.hh vendored Normal file
View file

@@ -0,0 +1,701 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_queue_H
#define __TBB_concurrent_queue_H
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_concurrent_queue_base.hh"
#include "third_party/tbb/detail/_allocator_traits.hh"
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_containers_helpers.hh"
#include "third_party/tbb/cache_aligned_allocator.hh"
namespace tbb {
namespace detail {
namespace d2 {
template <typename QueueRep, typename Allocator>
std::pair<bool, ticket_type> internal_try_pop_impl(void* dst, QueueRep& queue, Allocator& alloc ) {
ticket_type ticket{};
do {
// Basically, we need to read `head_counter` before `tail_counter`. To achieve this, we establish a happens-before relationship on `head_counter`
ticket = queue.head_counter.load(std::memory_order_acquire);
do {
if (static_cast<std::ptrdiff_t>(queue.tail_counter.load(std::memory_order_relaxed) - ticket) <= 0) { // queue is empty
// Queue is empty
return { false, ticket };
}
// Queue had item with ticket k when we looked. Attempt to get that item.
// Another thread snatched the item, retry.
} while (!queue.head_counter.compare_exchange_strong(ticket, ticket + 1));
} while (!queue.choose(ticket).pop(dst, ticket, queue, alloc));
return { true, ticket };
}
// A high-performance thread-safe non-blocking concurrent queue.
// Multiple threads may each push and pop concurrently.
// Assignment construction is not allowed.
template <typename T, typename Allocator = tbb::cache_aligned_allocator<T>>
class concurrent_queue {
using allocator_traits_type = tbb::detail::allocator_traits<Allocator>;
using queue_representation_type = concurrent_queue_rep<T, Allocator>;
using queue_allocator_type = typename allocator_traits_type::template rebind_alloc<queue_representation_type>;
using queue_allocator_traits = tbb::detail::allocator_traits<queue_allocator_type>;
public:
using size_type = std::size_t;
using value_type = T;
using reference = T&;
using const_reference = const T&;
using difference_type = std::ptrdiff_t;
using allocator_type = Allocator;
using pointer = typename allocator_traits_type::pointer;
using const_pointer = typename allocator_traits_type::const_pointer;
using iterator = concurrent_queue_iterator<concurrent_queue, T, Allocator>;
using const_iterator = concurrent_queue_iterator<concurrent_queue, const T, Allocator>;
concurrent_queue() : concurrent_queue(allocator_type()) {}
explicit concurrent_queue(const allocator_type& a) :
my_allocator(a), my_queue_representation(nullptr)
{
my_queue_representation = static_cast<queue_representation_type*>(r1::cache_aligned_allocate(sizeof(queue_representation_type)));
queue_allocator_traits::construct(my_allocator, my_queue_representation);
__TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" );
__TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" );
__TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" );
__TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" );
}
template <typename InputIterator>
concurrent_queue(InputIterator begin, InputIterator end, const allocator_type& a = allocator_type()) :
concurrent_queue(a)
{
for (; begin != end; ++begin)
push(*begin);
}
concurrent_queue( std::initializer_list<value_type> init, const allocator_type& alloc = allocator_type() ) :
concurrent_queue(init.begin(), init.end(), alloc)
{}
concurrent_queue(const concurrent_queue& src, const allocator_type& a) :
concurrent_queue(a)
{
my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item);
}
concurrent_queue(const concurrent_queue& src) :
concurrent_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator()))
{
my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item);
}
// Move constructors
concurrent_queue(concurrent_queue&& src) :
concurrent_queue(std::move(src.my_allocator))
{
internal_swap(src);
}
concurrent_queue(concurrent_queue&& src, const allocator_type& a) :
concurrent_queue(a)
{
// checking that memory allocated by one instance of allocator can be deallocated
// with another
if (my_allocator == src.my_allocator) {
internal_swap(src);
} else {
// allocators are different => performing per-element move
my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item);
src.clear();
}
}
// Destroy queue
~concurrent_queue() {
clear();
my_queue_representation->clear(my_allocator);
queue_allocator_traits::destroy(my_allocator, my_queue_representation);
r1::cache_aligned_deallocate(my_queue_representation);
}
concurrent_queue& operator=( const concurrent_queue& other ) {
//TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment
if (my_queue_representation != other.my_queue_representation) {
clear();
my_allocator = other.my_allocator;
my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item);
}
return *this;
}
concurrent_queue& operator=( concurrent_queue&& other ) {
//TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment
if (my_queue_representation != other.my_queue_representation) {
clear();
if (my_allocator == other.my_allocator) {
internal_swap(other);
} else {
my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item);
other.clear();
my_allocator = std::move(other.my_allocator);
}
}
return *this;
}
concurrent_queue& operator=( std::initializer_list<value_type> init ) {
assign(init);
return *this;
}
template <typename InputIterator>
void assign( InputIterator first, InputIterator last ) {
concurrent_queue src(first, last);
clear();
my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item);
}
void assign( std::initializer_list<value_type> init ) {
assign(init.begin(), init.end());
}
void swap ( concurrent_queue& other ) {
//TODO: implement support for std::allocator_traits::propagate_on_container_swap
__TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators");
internal_swap(other);
}
// Enqueue an item at tail of queue.
void push(const T& value) {
internal_push(value);
}
void push(T&& value) {
internal_push(std::move(value));
}
template <typename... Args>
void emplace( Args&&... args ) {
internal_push(std::forward<Args>(args)...);
}
// Attempt to dequeue an item from head of queue.
/** Does not wait for item to become available.
Returns true if successful; false otherwise. */
bool try_pop( T& result ) {
return internal_try_pop(&result);
}
// Return the number of items in the queue; thread unsafe
size_type unsafe_size() const {
std::ptrdiff_t size = my_queue_representation->size();
return size < 0 ? 0 : size_type(size);
}
// Equivalent to size()==0.
__TBB_nodiscard bool empty() const {
return my_queue_representation->empty();
}
// Clear the queue; not thread-safe.
void clear() {
my_queue_representation->clear(my_allocator);
}
// Return allocator object
allocator_type get_allocator() const { return my_allocator; }
//------------------------------------------------------------------------
// The iterators are intended only for debugging. They are slow and not thread safe.
//------------------------------------------------------------------------
iterator unsafe_begin() { return concurrent_queue_iterator_provider::get<iterator>(*this); }
iterator unsafe_end() { return iterator(); }
const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
const_iterator unsafe_end() const { return const_iterator(); }
const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
const_iterator unsafe_cend() const { return const_iterator(); }
private:
void internal_swap(concurrent_queue& src) {
using std::swap;
swap(my_queue_representation, src.my_queue_representation);
}
template <typename... Args>
void internal_push( Args&&... args ) {
ticket_type k = my_queue_representation->tail_counter++;
my_queue_representation->choose(k).push(k, *my_queue_representation, my_allocator, std::forward<Args>(args)...);
}
bool internal_try_pop( void* dst ) {
return internal_try_pop_impl(dst, *my_queue_representation, my_allocator).first;
}
template <typename Container, typename Value, typename A>
friend class concurrent_queue_iterator;
static void copy_construct_item(T* location, const void* src) {
// TODO: use allocator_traits for copy construction
new (location) value_type(*static_cast<const value_type*>(src));
// queue_allocator_traits::construct(my_allocator, location, *static_cast<const T*>(src));
}
static void move_construct_item(T* location, const void* src) {
// TODO: use allocator_traits for move construction
new (location) value_type(std::move(*static_cast<value_type*>(const_cast<void*>(src))));
}
queue_allocator_type my_allocator;
queue_representation_type* my_queue_representation;
friend void swap( concurrent_queue& lhs, concurrent_queue& rhs ) {
lhs.swap(rhs);
}
friend bool operator==( const concurrent_queue& lhs, const concurrent_queue& rhs ) {
return lhs.unsafe_size() == rhs.unsafe_size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin());
}
#if !__TBB_CPP20_COMPARISONS_PRESENT
friend bool operator!=( const concurrent_queue& lhs, const concurrent_queue& rhs ) {
return !(lhs == rhs);
}
#endif // __TBB_CPP20_COMPARISONS_PRESENT
}; // class concurrent_queue
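// A minimal usage sketch exercising only the members declared above:
//
//   tbb::concurrent_queue<int> q;
//   q.push(1);                 // thread-safe enqueue at the tail
//   q.emplace(2);              // in-place construction at the tail
//   int v;
//   while (q.try_pop(v)) {     // non-blocking FIFO dequeue; false when empty
//       // consume v
//   }
//   // unsafe_size() and the unsafe_* iterators are not thread-safe; treat
//   // them as debugging aids only.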
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
// Deduction guide for the constructor from two iterators
template <typename It, typename Alloc = tbb::cache_aligned_allocator<iterator_value_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_queue( It, It, Alloc = Alloc() )
-> concurrent_queue<iterator_value_t<It>, Alloc>;
#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */
class concurrent_monitor;
// The concurrent monitor tags for concurrent_bounded_queue.
static constexpr std::size_t cbq_slots_avail_tag = 0;
static constexpr std::size_t cbq_items_avail_tag = 1;
} // namespace d2
namespace r1 {
class concurrent_monitor;
TBB_EXPORT std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size );
TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size );
TBB_EXPORT void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors );
TBB_EXPORT void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag
, std::size_t ticket );
TBB_EXPORT void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag,
std::ptrdiff_t target, d1::delegate_base& predicate );
} // namespace r1
namespace d2 {
// A high-performance thread-safe blocking concurrent bounded queue.
// Supports boundedness and blocking semantics.
// Multiple threads may each push and pop concurrently.
// Assignment construction is not allowed.
template <typename T, typename Allocator = tbb::cache_aligned_allocator<T>>
class concurrent_bounded_queue {
using allocator_traits_type = tbb::detail::allocator_traits<Allocator>;
using queue_representation_type = concurrent_queue_rep<T, Allocator>;
using queue_allocator_type = typename allocator_traits_type::template rebind_alloc<queue_representation_type>;
using queue_allocator_traits = tbb::detail::allocator_traits<queue_allocator_type>;
template <typename FuncType>
void internal_wait(r1::concurrent_monitor* monitors, std::size_t monitor_tag, std::ptrdiff_t target, FuncType pred) {
d1::delegated_function<FuncType> func(pred);
r1::wait_bounded_queue_monitor(monitors, monitor_tag, target, func);
}
public:
using size_type = std::ptrdiff_t;
using value_type = T;
using reference = T&;
using const_reference = const T&;
using difference_type = std::ptrdiff_t;
using allocator_type = Allocator;
using pointer = typename allocator_traits_type::pointer;
using const_pointer = typename allocator_traits_type::const_pointer;
using iterator = concurrent_queue_iterator<concurrent_bounded_queue, T, Allocator>;
using const_iterator = concurrent_queue_iterator<concurrent_bounded_queue, const T, Allocator> ;
concurrent_bounded_queue() : concurrent_bounded_queue(allocator_type()) {}
explicit concurrent_bounded_queue( const allocator_type& a ) :
my_allocator(a), my_capacity(0), my_abort_counter(0), my_queue_representation(nullptr)
{
my_queue_representation = reinterpret_cast<queue_representation_type*>(
r1::allocate_bounded_queue_rep(sizeof(queue_representation_type)));
my_monitors = reinterpret_cast<r1::concurrent_monitor*>(my_queue_representation + 1);
queue_allocator_traits::construct(my_allocator, my_queue_representation);
my_capacity = std::size_t(-1) / (queue_representation_type::item_size > 1 ? queue_representation_type::item_size : 2);
__TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" );
__TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" );
__TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" );
__TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" );
}
template <typename InputIterator>
concurrent_bounded_queue( InputIterator begin, InputIterator end, const allocator_type& a = allocator_type() ) :
concurrent_bounded_queue(a)
{
for (; begin != end; ++begin)
push(*begin);
}
concurrent_bounded_queue( std::initializer_list<value_type> init, const allocator_type& alloc = allocator_type() ):
concurrent_bounded_queue(init.begin(), init.end(), alloc)
{}
concurrent_bounded_queue( const concurrent_bounded_queue& src, const allocator_type& a ) :
concurrent_bounded_queue(a)
{
my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item);
}
concurrent_bounded_queue( const concurrent_bounded_queue& src ) :
concurrent_bounded_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator()))
{
my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item);
}
// Move constructors
concurrent_bounded_queue( concurrent_bounded_queue&& src ) :
concurrent_bounded_queue(std::move(src.my_allocator))
{
internal_swap(src);
}
concurrent_bounded_queue( concurrent_bounded_queue&& src, const allocator_type& a ) :
concurrent_bounded_queue(a)
{
// checking that memory allocated by one instance of allocator can be deallocated
// with another
if (my_allocator == src.my_allocator) {
internal_swap(src);
} else {
// allocators are different => performing per-element move
my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item);
src.clear();
}
}
// Destroy queue
~concurrent_bounded_queue() {
clear();
my_queue_representation->clear(my_allocator);
queue_allocator_traits::destroy(my_allocator, my_queue_representation);
r1::deallocate_bounded_queue_rep(reinterpret_cast<std::uint8_t*>(my_queue_representation),
sizeof(queue_representation_type));
}
concurrent_bounded_queue& operator=( const concurrent_bounded_queue& other ) {
//TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment
if (my_queue_representation != other.my_queue_representation) {
clear();
my_allocator = other.my_allocator;
my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item);
}
return *this;
}
concurrent_bounded_queue& operator=( concurrent_bounded_queue&& other ) {
//TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment
if (my_queue_representation != other.my_queue_representation) {
clear();
if (my_allocator == other.my_allocator) {
internal_swap(other);
} else {
my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item);
other.clear();
my_allocator = std::move(other.my_allocator);
}
}
return *this;
}
concurrent_bounded_queue& operator=( std::initializer_list<value_type> init ) {
assign(init);
return *this;
}
template <typename InputIterator>
void assign( InputIterator first, InputIterator last ) {
concurrent_bounded_queue src(first, last);
clear();
my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item);
}
void assign( std::initializer_list<value_type> init ) {
assign(init.begin(), init.end());
}
void swap ( concurrent_bounded_queue& other ) {
//TODO: implement support for std::allocator_traits::propagate_on_container_swap
__TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators");
internal_swap(other);
}
// Enqueue an item at tail of queue.
void push( const T& value ) {
internal_push(value);
}
void push( T&& value ) {
internal_push(std::move(value));
}
// Enqueue an item at tail of queue if queue is not already full.
// Does not wait for queue to become not full.
// Returns true if item is pushed; false if queue was already full.
bool try_push( const T& value ) {
return internal_push_if_not_full(value);
}
bool try_push( T&& value ) {
return internal_push_if_not_full(std::move(value));
}
template <typename... Args>
void emplace( Args&&... args ) {
internal_push(std::forward<Args>(args)...);
}
template <typename... Args>
bool try_emplace( Args&&... args ) {
return internal_push_if_not_full(std::forward<Args>(args)...);
}
// Dequeue an item from the head of the queue; blocks until an item becomes available.
void pop( T& result ) {
internal_pop(&result);
}
/** Does not wait for item to become available.
Returns true if successful; false otherwise. */
bool try_pop( T& result ) {
return internal_pop_if_present(&result);
}
void abort() {
internal_abort();
}
// Return the number of items in the queue; thread unsafe
std::ptrdiff_t size() const {
return my_queue_representation->size();
}
void set_capacity( size_type new_capacity ) {
std::ptrdiff_t c = new_capacity < 0 ? infinite_capacity : new_capacity;
my_capacity = c;
}
size_type capacity() const {
return my_capacity;
}
// Equivalent to size()==0.
__TBB_nodiscard bool empty() const {
return my_queue_representation->empty();
}
// Clear the queue; not thread-safe.
void clear() {
my_queue_representation->clear(my_allocator);
}
// Return allocator object
allocator_type get_allocator() const { return my_allocator; }
//------------------------------------------------------------------------
// The iterators are intended only for debugging. They are slow and not thread safe.
//------------------------------------------------------------------------
iterator unsafe_begin() { return concurrent_queue_iterator_provider::get<iterator>(*this); }
iterator unsafe_end() { return iterator(); }
const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
const_iterator unsafe_end() const { return const_iterator(); }
const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get<const_iterator>(*this); }
const_iterator unsafe_cend() const { return const_iterator(); }
private:
void internal_swap( concurrent_bounded_queue& src ) {
std::swap(my_queue_representation, src.my_queue_representation);
std::swap(my_monitors, src.my_monitors);
}
static constexpr std::ptrdiff_t infinite_capacity = std::ptrdiff_t(~size_type(0) / 2);
template <typename... Args>
void internal_push( Args&&... args ) {
unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed);
ticket_type ticket = my_queue_representation->tail_counter++;
std::ptrdiff_t target = ticket - my_capacity;
if (static_cast<std::ptrdiff_t>(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target) { // queue is full
auto pred = [&] {
if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) {
throw_exception(exception_id::user_abort);
}
return static_cast<std::ptrdiff_t>(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target;
};
try_call( [&] {
internal_wait(my_monitors, cbq_slots_avail_tag, target, pred);
}).on_exception( [&] {
my_queue_representation->choose(ticket).abort_push(ticket, *my_queue_representation, my_allocator);
});
}
__TBB_ASSERT((static_cast<std::ptrdiff_t>(my_queue_representation->head_counter.load(std::memory_order_relaxed)) > target), nullptr);
my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward<Args>(args)...);
r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket);
}
template <typename... Args>
bool internal_push_if_not_full( Args&&... args ) {
ticket_type ticket = my_queue_representation->tail_counter.load(std::memory_order_relaxed);
do {
if (static_cast<std::ptrdiff_t>(ticket - my_queue_representation->head_counter.load(std::memory_order_relaxed)) >= my_capacity) {
// Queue is full
return false;
}
// Queue had empty slot with ticket k when we looked. Attempt to claim that slot.
// Another thread claimed the slot, so retry.
} while (!my_queue_representation->tail_counter.compare_exchange_strong(ticket, ticket + 1));
my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward<Args>(args)...);
r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket);
return true;
}
void internal_pop( void* dst ) {
std::ptrdiff_t target;
// This loop is a single pop operation; abort_counter should not be re-read inside
unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed);
do {
target = my_queue_representation->head_counter++;
if (static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target) {
auto pred = [&] {
if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) {
throw_exception(exception_id::user_abort);
}
return static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target;
};
try_call( [&] {
internal_wait(my_monitors, cbq_items_avail_tag, target, pred);
}).on_exception( [&] {
my_queue_representation->head_counter--;
});
}
__TBB_ASSERT(static_cast<std::ptrdiff_t>(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) > target, nullptr);
} while (!my_queue_representation->choose(target).pop(dst, target, *my_queue_representation, my_allocator));
r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, target);
}
bool internal_pop_if_present( void* dst ) {
bool present{};
ticket_type ticket{};
std::tie(present, ticket) = internal_try_pop_impl(dst, *my_queue_representation, my_allocator);
if (present) {
r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, ticket);
}
return present;
}
void internal_abort() {
++my_abort_counter;
r1::abort_bounded_queue_monitors(my_monitors);
}
static void copy_construct_item(T* location, const void* src) {
// TODO: use allocator_traits for copy construction
new (location) value_type(*static_cast<const value_type*>(src));
}
static void move_construct_item(T* location, const void* src) {
// TODO: use allocator_traits for move construction
new (location) value_type(std::move(*static_cast<value_type*>(const_cast<void*>(src))));
}
template <typename Container, typename Value, typename A>
friend class concurrent_queue_iterator;
queue_allocator_type my_allocator;
std::ptrdiff_t my_capacity;
std::atomic<unsigned> my_abort_counter;
queue_representation_type* my_queue_representation;
r1::concurrent_monitor* my_monitors;
friend void swap( concurrent_bounded_queue& lhs, concurrent_bounded_queue& rhs ) {
lhs.swap(rhs);
}
friend bool operator==( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) {
return lhs.size() == rhs.size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin());
}
#if !__TBB_CPP20_COMPARISONS_PRESENT
friend bool operator!=( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) {
return !(lhs == rhs);
}
#endif // __TBB_CPP20_COMPARISONS_PRESENT
}; // class concurrent_bounded_queue
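// A minimal usage sketch, using only the members declared above:
//
//   tbb::concurrent_bounded_queue<int> bq;
//   bq.set_capacity(2);        // a negative capacity selects "unbounded"
//   bq.push(1);
//   bq.push(2);                // push() blocks while the queue is full
//   if (!bq.try_push(3)) {     // returns false: the capacity is reached
//       // item 3 was not enqueued
//   }
//   int v;
//   bq.pop(v);                 // blocks until an item is available
//   // abort() from another thread wakes blocked push()/pop() callers,
//   // which then throw tbb::user_abort.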
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
// Deduction guide for the constructor from two iterators
template <typename It, typename Alloc = tbb::cache_aligned_allocator<iterator_value_t<It>>>
concurrent_bounded_queue( It, It, Alloc = Alloc() )
-> concurrent_bounded_queue<iterator_value_t<It>, Alloc>;
#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */
} //namespace d2
} // namespace detail
inline namespace v1 {
using detail::d2::concurrent_queue;
using detail::d2::concurrent_bounded_queue;
using detail::r1::user_abort;
using detail::r1::bad_last_alloc;
} // inline namespace v1
} // namespace tbb
#endif // __TBB_concurrent_queue_H

268
third_party/tbb/concurrent_set.hh vendored Normal file
View file

@@ -0,0 +1,268 @@
// clang-format off
/*
Copyright (c) 2019-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_set_H
#define __TBB_concurrent_set_H
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_concurrent_skip_list.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/utility"
namespace tbb {
namespace detail {
namespace d2 {
template<typename Key, typename KeyCompare, typename RandomGenerator, typename Allocator, bool AllowMultimapping>
struct set_traits {
static constexpr std::size_t max_level = RandomGenerator::max_level;
using random_level_generator_type = RandomGenerator;
using key_type = Key;
using value_type = key_type;
using compare_type = KeyCompare;
using value_compare = compare_type;
using reference = value_type&;
using const_reference = const value_type&;
using allocator_type = Allocator;
static constexpr bool allow_multimapping = AllowMultimapping;
static const key_type& get_key(const_reference val) {
return val;
}
static value_compare value_comp(compare_type comp) { return comp; }
}; // struct set_traits
template <typename Key, typename Compare, typename Allocator>
class concurrent_multiset;
template <typename Key, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<Key>>
class concurrent_set : public concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, false>> {
using base_type = concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, false>>;
public:
using key_type = Key;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using key_compare = Compare;
using value_compare = typename base_type::value_compare;
using allocator_type = Allocator;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base_type
using base_type::base_type;
// Required for implicit deduction guides
concurrent_set() = default;
concurrent_set( const concurrent_set& ) = default;
concurrent_set( const concurrent_set& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_set( concurrent_set&& ) = default;
concurrent_set( concurrent_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_set& operator=( const concurrent_set& ) = default;
concurrent_set& operator=( concurrent_set&& ) = default;
concurrent_set& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
template<typename OtherCompare>
void merge(concurrent_set<key_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_set<key_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
template<typename OtherCompare>
void merge(concurrent_multiset<key_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_multiset<key_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
}; // class concurrent_set
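// A minimal usage sketch; insert() is assumed to be inherited from
// concurrent_skip_list (declared in detail/_concurrent_skip_list.hh), while
// merge() is defined above:
//
//   tbb::concurrent_set<int> s{3, 1, 2};
//   s.insert(4);
//   tbb::concurrent_multiset<int> ms{4, 4, 5};
//   s.merge(ms);               // keys absent from s (here 5) are moved in;
//                              // duplicates of existing keys stay in ms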
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Comp = std::less<iterator_value_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_set( It, It, Comp = Comp(), Alloc = Alloc() )
-> concurrent_set<iterator_value_t<It>, Comp, Alloc>;
template <typename Key,
typename Comp = std::less<Key>,
typename Alloc = tbb::tbb_allocator<Key>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_set( std::initializer_list<Key>, Comp = Comp(), Alloc = Alloc() )
-> concurrent_set<Key, Comp, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_set( It, It, Alloc )
-> concurrent_set<iterator_value_t<It>,
std::less<iterator_value_t<It>>, Alloc>;
template <typename Key, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_set( std::initializer_list<Key>, Alloc )
-> concurrent_set<Key, std::less<Key>, Alloc>;
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename Compare, typename Allocator>
void swap( concurrent_set<Key, Compare, Allocator>& lhs,
concurrent_set<Key, Compare, Allocator>& rhs )
{
lhs.swap(rhs);
}
template <typename Key, typename Compare = std::less<Key>, typename Allocator = tbb::tbb_allocator<Key>>
class concurrent_multiset : public concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, true>> {
using base_type = concurrent_skip_list<set_traits<Key, Compare, concurrent_geometric_level_generator<32>, Allocator, true>>;
public:
using key_type = Key;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using key_compare = Compare;
using value_compare = typename base_type::value_compare;
using allocator_type = Allocator;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base_type;
using base_type::base_type;
// Required for implicit deduction guides
concurrent_multiset() = default;
concurrent_multiset( const concurrent_multiset& ) = default;
concurrent_multiset( const concurrent_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_multiset( concurrent_multiset&& ) = default;
concurrent_multiset( concurrent_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_multiset& operator=( const concurrent_multiset& ) = default;
concurrent_multiset& operator=( concurrent_multiset&& ) = default;
concurrent_multiset& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
template<typename OtherCompare>
void merge(concurrent_set<key_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_set<key_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
template<typename OtherCompare>
void merge(concurrent_multiset<key_type, OtherCompare, Allocator>& source) {
this->internal_merge(source);
}
template<typename OtherCompare>
void merge(concurrent_multiset<key_type, OtherCompare, Allocator>&& source) {
this->internal_merge(std::move(source));
}
}; // class concurrent_multiset
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Comp = std::less<iterator_value_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_multiset( It, It, Comp = Comp(), Alloc = Alloc() )
-> concurrent_multiset<iterator_value_t<It>, Comp, Alloc>;
template <typename Key,
typename Comp = std::less<Key>,
typename Alloc = tbb::tbb_allocator<Key>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Comp>>>
concurrent_multiset( std::initializer_list<Key>, Comp = Comp(), Alloc = Alloc() )
-> concurrent_multiset<Key, Comp, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_multiset( It, It, Alloc )
-> concurrent_multiset<iterator_value_t<It>, std::less<iterator_value_t<It>>, Alloc>;
template <typename Key, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_multiset( std::initializer_list<Key>, Alloc )
-> concurrent_multiset<Key, std::less<Key>, Alloc>;
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename Compare, typename Allocator>
void swap( concurrent_multiset<Key, Compare, Allocator>& lhs,
concurrent_multiset<Key, Compare, Allocator>& rhs )
{
lhs.swap(rhs);
}
} // namespace d2
} // namespace detail
inline namespace v1 {
using detail::d2::concurrent_set;
using detail::d2::concurrent_multiset;
using detail::split;
} // inline namespace v1
} // namespace tbb
#endif // __TBB_concurrent_set_H

415
third_party/tbb/concurrent_unordered_map.hh vendored Normal file

View file

@@ -0,0 +1,415 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_unordered_map_H
#define __TBB_concurrent_unordered_map_H
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_concurrent_unordered_base.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/libcxx/functional"
namespace tbb {
namespace detail {
namespace d1 {
template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator, bool AllowMultimapping>
struct concurrent_unordered_map_traits {
using value_type = std::pair<const Key, T>;
using key_type = Key;
using allocator_type = Allocator;
using hash_compare_type = hash_compare<Key, Hash, KeyEqual>;
static constexpr bool allow_multimapping = AllowMultimapping;
static constexpr const key_type& get_key( const value_type& value ) {
return value.first;
}
}; // struct concurrent_unordered_map_traits
template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator>
class concurrent_unordered_multimap;
template <typename Key, typename T, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
typename Allocator = tbb::tbb_allocator<std::pair<const Key, T>> >
class concurrent_unordered_map
: public concurrent_unordered_base<concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, false>>
{
using traits_type = concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, false>;
using base_type = concurrent_unordered_base<traits_type>;
public:
using key_type = typename base_type::key_type;
using mapped_type = T;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using hasher = typename base_type::hasher;
using key_equal = typename base_type::key_equal;
using allocator_type = typename base_type::allocator_type;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using local_iterator = typename base_type::local_iterator;
using const_local_iterator = typename base_type::const_local_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base type
using base_type::base_type;
// Required for implicit deduction guides
concurrent_unordered_map() = default;
concurrent_unordered_map( const concurrent_unordered_map& ) = default;
concurrent_unordered_map( const concurrent_unordered_map& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_unordered_map( concurrent_unordered_map&& ) = default;
concurrent_unordered_map( concurrent_unordered_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_unordered_map& operator=( const concurrent_unordered_map& ) = default;
concurrent_unordered_map& operator=( concurrent_unordered_map&& ) = default;
concurrent_unordered_map& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
// Observers
mapped_type& operator[]( const key_type& key ) {
iterator where = this->find(key);
if (where == this->end()) {
where = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first;
}
return where->second;
}
mapped_type& operator[]( key_type&& key ) {
iterator where = this->find(key);
if (where == this->end()) {
where = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first;
}
return where->second;
}
mapped_type& at( const key_type& key ) {
iterator where = this->find(key);
if (where == this->end()) {
throw_exception(exception_id::invalid_key);
}
return where->second;
}
const mapped_type& at( const key_type& key ) const {
const_iterator where = this->find(key);
if (where == this->end()) {
throw_exception(exception_id::out_of_range);
}
return where->second;
}
using base_type::insert;
template<typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
std::pair<iterator, bool>>::type insert( P&& value ) {
return this->emplace(std::forward<P>(value));
}
template<typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
iterator>::type insert( const_iterator hint, P&& value ) {
return this->emplace_hint(hint, std::forward<P>(value));
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
}; // class concurrent_unordered_map
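// A minimal usage sketch, limited to the interfaces declared in this header
// (concurrent_unordered_multimap is defined further below):
//
//   tbb::concurrent_unordered_map<std::string, int> m;
//   m["apples"] = 3;           // operator[] default-constructs, then assigns
//   m.insert(std::make_pair(std::string("pears"), 2));
//   int n = m.at("apples");    // at() throws when the key is absent
//   tbb::concurrent_unordered_multimap<std::string, int> mm;
//   mm.merge(m);               // transplants nodes from m into mm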
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Hash = std::hash<iterator_key_t<It>>,
typename KeyEq = std::equal_to<iterator_key_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_map( It, It, std::size_t = {},
Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_map<iterator_key_t<It>, iterator_mapped_t<It>, Hash, KeyEq, Alloc>;
template <typename Key, typename T,
typename Hash = std::hash<std::remove_const_t<Key>>,
typename KeyEq = std::equal_to<std::remove_const_t<Key>>,
typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, std::size_t = {},
Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_map<std::remove_const_t<Key>, T, Hash, KeyEq, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_map( It, It, std::size_t, Alloc )
-> concurrent_unordered_map<iterator_key_t<It>, iterator_mapped_t<It>,
std::hash<iterator_key_t<It>>,
std::equal_to<iterator_key_t<It>>, Alloc>;
// TODO: investigate if a deduction guide for concurrent_unordered_map(It, It, Alloc) is needed
template <typename It, typename Hash, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_map( It, It, std::size_t, Hash, Alloc )
-> concurrent_unordered_map<iterator_key_t<It>, iterator_mapped_t<It>,
Hash, std::equal_to<iterator_key_t<It>>, Alloc>;
template <typename Key, typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, std::size_t, Alloc )
-> concurrent_unordered_map<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
std::equal_to<std::remove_const_t<Key>>, Alloc>;
template <typename Key, typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, Alloc )
-> concurrent_unordered_map<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
std::equal_to<std::remove_const_t<Key>>, Alloc>;
template <typename Key, typename T, typename Hash, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_map( std::initializer_list<std::pair<Key, T>>, std::size_t, Hash, Alloc )
-> concurrent_unordered_map<std::remove_const_t<Key>, T, Hash,
std::equal_to<std::remove_const_t<Key>>, Alloc>;
#if __APPLE__ && __TBB_CLANG_VERSION == 100000
// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0
// due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances.
// Currently the issue occurs because the last template parameter for Traits is boolean; it should not affect the deduction guides.
// The issue reproduces only on this version of the compiler
template <typename Key, typename T, typename Hash, typename KeyEq, typename Alloc>
concurrent_unordered_map( concurrent_unordered_map<Key, T, Hash, KeyEq, Alloc>, Alloc )
-> concurrent_unordered_map<Key, T, Hash, KeyEq, Alloc>;
#endif
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator>
void swap( concurrent_unordered_map<Key, T, Hash, KeyEqual, Allocator>& lhs,
concurrent_unordered_map<Key, T, Hash, KeyEqual, Allocator>& rhs ) {
lhs.swap(rhs);
}
template <typename Key, typename T, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
typename Allocator = tbb::tbb_allocator<std::pair<const Key, T>> >
class concurrent_unordered_multimap
: public concurrent_unordered_base<concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, true>>
{
using traits_type = concurrent_unordered_map_traits<Key, T, Hash, KeyEqual, Allocator, true>;
using base_type = concurrent_unordered_base<traits_type>;
public:
using key_type = typename base_type::key_type;
using mapped_type = T;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using hasher = typename base_type::hasher;
using key_equal = typename base_type::key_equal;
using allocator_type = typename base_type::allocator_type;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using local_iterator = typename base_type::local_iterator;
using const_local_iterator = typename base_type::const_local_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base type
using base_type::base_type;
using base_type::insert;
// Required for implicit deduction guides
concurrent_unordered_multimap() = default;
concurrent_unordered_multimap( const concurrent_unordered_multimap& ) = default;
concurrent_unordered_multimap( const concurrent_unordered_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_unordered_multimap( concurrent_unordered_multimap&& ) = default;
concurrent_unordered_multimap( concurrent_unordered_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_unordered_multimap& operator=( const concurrent_unordered_multimap& ) = default;
concurrent_unordered_multimap& operator=( concurrent_unordered_multimap&& ) = default;
concurrent_unordered_multimap& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
template <typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
std::pair<iterator, bool>>::type insert( P&& value ) {
return this->emplace(std::forward<P>(value));
}
template<typename P>
typename std::enable_if<std::is_constructible<value_type, P&&>::value,
iterator>::type insert( const_iterator hint, P&& value ) {
return this->emplace_hint(hint, std::forward<P&&>(value));
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_map<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multimap<key_type, mapped_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
}; // class concurrent_unordered_multimap
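// A minimal sketch of the multimapping behaviour; count() is assumed to be
// provided by concurrent_unordered_base (detail/_concurrent_unordered_base.hh):
//
//   tbb::concurrent_unordered_multimap<int, char> mm;
//   mm.insert({1, 'a'});
//   mm.insert({1, 'b'});       // duplicate keys are retained
//   // mm.count(1) == 2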
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Hash = std::hash<iterator_key_t<It>>,
typename KeyEq = std::equal_to<iterator_key_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_alloc_pair_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multimap( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_multimap<iterator_key_t<It>, iterator_mapped_t<It>, Hash, KeyEq, Alloc>;
template <typename Key, typename T,
typename Hash = std::hash<std::remove_const_t<Key>>,
typename KeyEq = std::equal_to<std::remove_const_t<Key>>,
typename Alloc = tbb::tbb_allocator<std::pair<const Key, T>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, std::size_t = {},
Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, Hash, KeyEq, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_multimap( It, It, std::size_t, Alloc )
-> concurrent_unordered_multimap<iterator_key_t<It>, iterator_mapped_t<It>,
std::hash<iterator_key_t<It>>,
std::equal_to<iterator_key_t<It>>, Alloc>;
template <typename It, typename Hash, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multimap( It, It, std::size_t, Hash, Alloc )
-> concurrent_unordered_multimap<iterator_key_t<It>, iterator_mapped_t<It>, Hash,
std::equal_to<iterator_key_t<It>>, Alloc>;
template <typename Key, typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, std::size_t, Alloc )
-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
std::equal_to<std::remove_const_t<Key>>, Alloc>;
template <typename Key, typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, Alloc )
-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, std::hash<std::remove_const_t<Key>>,
std::equal_to<std::remove_const_t<Key>>, Alloc>;
template <typename Key, typename T, typename Hash, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multimap( std::initializer_list<std::pair<Key, T>>, std::size_t, Hash, Alloc )
-> concurrent_unordered_multimap<std::remove_const_t<Key>, T, Hash,
std::equal_to<std::remove_const_t<Key>>, Alloc>;
#if __APPLE__ && __TBB_CLANG_VERSION == 100000
// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0
// due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances.
// Currently the issue occurs because the last template parameter for Traits is boolean; it should not affect the deduction guides.
// The issue reproduces only on this version of the compiler
template <typename Key, typename T, typename Hash, typename KeyEq, typename Alloc>
concurrent_unordered_multimap( concurrent_unordered_multimap<Key, T, Hash, KeyEq, Alloc>, Alloc )
-> concurrent_unordered_multimap<Key, T, Hash, KeyEq, Alloc>;
#endif
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator>
void swap( concurrent_unordered_multimap<Key, T, Hash, KeyEqual, Allocator>& lhs,
concurrent_unordered_multimap<Key, T, Hash, KeyEqual, Allocator>& rhs ) {
lhs.swap(rhs);
}
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::concurrent_unordered_map;
using detail::d1::concurrent_unordered_multimap;
using detail::split;
} // inline namespace v1
} // namespace tbb
#endif // __TBB_concurrent_unordered_map_H

334
third_party/tbb/concurrent_unordered_set.hh vendored Normal file

View file

@@ -0,0 +1,334 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_concurrent_unordered_set_H
#define __TBB_concurrent_unordered_set_H
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_concurrent_unordered_base.hh"
#include "third_party/tbb/tbb_allocator.hh"
namespace tbb {
namespace detail {
namespace d1 {
template <typename Key, typename Hash, typename KeyEqual, typename Allocator, bool AllowMultimapping>
struct concurrent_unordered_set_traits {
using key_type = Key;
using value_type = key_type;
using allocator_type = Allocator;
using hash_compare_type = hash_compare<key_type, Hash, KeyEqual>;
static constexpr bool allow_multimapping = AllowMultimapping;
static constexpr const key_type& get_key( const value_type& value ) {
return value;
}
}; // class concurrent_unordered_set_traits
template <typename Key, typename Hash, typename KeyEqual, typename Allocator>
class concurrent_unordered_multiset;
template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
typename Allocator = tbb::tbb_allocator<Key>>
class concurrent_unordered_set
: public concurrent_unordered_base<concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, false>>
{
using traits_type = concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, false>;
using base_type = concurrent_unordered_base<traits_type>;
public:
using key_type = typename base_type::key_type;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using hasher = typename base_type::hasher;
using key_equal = typename base_type::key_equal;
using allocator_type = typename base_type::allocator_type;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using local_iterator = typename base_type::local_iterator;
using const_local_iterator = typename base_type::const_local_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base_type;
using base_type::base_type;
// Required for implicit deduction guides
concurrent_unordered_set() = default;
concurrent_unordered_set( const concurrent_unordered_set& ) = default;
concurrent_unordered_set( const concurrent_unordered_set& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_unordered_set( concurrent_unordered_set&& ) = default;
concurrent_unordered_set( concurrent_unordered_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_unordered_set& operator=( const concurrent_unordered_set& ) = default;
concurrent_unordered_set& operator=( concurrent_unordered_set&& ) = default;
concurrent_unordered_set& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
}; // class concurrent_unordered_set
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Hash = std::hash<iterator_value_t<It>>,
typename KeyEq = std::equal_to<iterator_value_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_set( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_set<iterator_value_t<It>, Hash, KeyEq, Alloc>;
template <typename T,
typename Hash = std::hash<T>,
typename KeyEq = std::equal_to<T>,
typename Alloc = tbb::tbb_allocator<T>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_set( std::initializer_list<T>, std::size_t = {},
Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_set<T, Hash, KeyEq, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_set( It, It, std::size_t, Alloc )
-> concurrent_unordered_set<iterator_value_t<It>, std::hash<iterator_value_t<It>>,
std::equal_to<iterator_value_t<It>>, Alloc>;
template <typename It, typename Hash, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_set( It, It, std::size_t, Hash, Alloc )
-> concurrent_unordered_set<iterator_value_t<It>, Hash, std::equal_to<iterator_value_t<It>>, Alloc>;
template <typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_set( std::initializer_list<T>, std::size_t, Alloc )
-> concurrent_unordered_set<T, std::hash<T>, std::equal_to<T>, Alloc>;
template <typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_set( std::initializer_list<T>, Alloc )
-> concurrent_unordered_set<T, std::hash<T>, std::equal_to<T>, Alloc>;
template <typename T, typename Hash, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_set( std::initializer_list<T>, std::size_t, Hash, Alloc )
-> concurrent_unordered_set<T, Hash, std::equal_to<T>, Alloc>;
#if __APPLE__ && __TBB_CLANG_VERSION == 100000
// An explicit deduction guide is required for the copy/move constructor with an allocator for APPLE LLVM 10.0.0
// due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances.
// Currently the issue arises because the last template parameter for Traits is a boolean; it should not affect the deduction guides.
// The issue reproduces only on this version of the compiler.
template <typename T, typename Hash, typename KeyEq, typename Alloc>
concurrent_unordered_set( concurrent_unordered_set<T, Hash, KeyEq, Alloc>, Alloc )
-> concurrent_unordered_set<T, Hash, KeyEq, Alloc>;
#endif
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename Hash, typename KeyEqual, typename Allocator>
void swap( concurrent_unordered_set<Key, Hash, KeyEqual, Allocator>& lhs,
concurrent_unordered_set<Key, Hash, KeyEqual, Allocator>& rhs ) {
lhs.swap(rhs);
}
template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>,
typename Allocator = tbb::tbb_allocator<Key>>
class concurrent_unordered_multiset
: public concurrent_unordered_base<concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, true>>
{
using traits_type = concurrent_unordered_set_traits<Key, Hash, KeyEqual, Allocator, true>;
using base_type = concurrent_unordered_base<traits_type>;
public:
using key_type = typename base_type::key_type;
using value_type = typename base_type::value_type;
using size_type = typename base_type::size_type;
using difference_type = typename base_type::difference_type;
using hasher = typename base_type::hasher;
using key_equal = typename base_type::key_equal;
using allocator_type = typename base_type::allocator_type;
using reference = typename base_type::reference;
using const_reference = typename base_type::const_reference;
using pointer = typename base_type::pointer;
using const_pointer = typename base_type::const_pointer;
using iterator = typename base_type::iterator;
using const_iterator = typename base_type::const_iterator;
using local_iterator = typename base_type::local_iterator;
using const_local_iterator = typename base_type::const_local_iterator;
using node_type = typename base_type::node_type;
// Include constructors of base_type;
using base_type::base_type;
// Required for implicit deduction guides
concurrent_unordered_multiset() = default;
concurrent_unordered_multiset( const concurrent_unordered_multiset& ) = default;
concurrent_unordered_multiset( const concurrent_unordered_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {}
concurrent_unordered_multiset( concurrent_unordered_multiset&& ) = default;
concurrent_unordered_multiset( concurrent_unordered_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {}
// Required to respect the rule of 5
concurrent_unordered_multiset& operator=( const concurrent_unordered_multiset& ) = default;
concurrent_unordered_multiset& operator=( concurrent_unordered_multiset&& ) = default;
concurrent_unordered_multiset& operator=( std::initializer_list<value_type> il ) {
base_type::operator= (il);
return *this;
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_set<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>& source ) {
this->internal_merge(source);
}
template <typename OtherHash, typename OtherKeyEqual>
void merge( concurrent_unordered_multiset<key_type, OtherHash, OtherKeyEqual, allocator_type>&& source ) {
this->internal_merge(std::move(source));
}
}; // class concurrent_unordered_multiset
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename It,
typename Hash = std::hash<iterator_value_t<It>>,
typename KeyEq = std::equal_to<iterator_value_t<It>>,
typename Alloc = tbb::tbb_allocator<iterator_value_t<It>>,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multiset( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_multiset<iterator_value_t<It>, Hash, KeyEq, Alloc>;
template <typename T,
typename Hash = std::hash<T>,
typename KeyEq = std::equal_to<T>,
typename Alloc = tbb::tbb_allocator<T>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!is_allocator_v<KeyEq>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multiset( std::initializer_list<T>, std::size_t = {},
Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() )
-> concurrent_unordered_multiset<T, Hash, KeyEq, Alloc>;
template <typename It, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_multiset( It, It, std::size_t, Alloc )
-> concurrent_unordered_multiset<iterator_value_t<It>, std::hash<iterator_value_t<It>>,
std::equal_to<iterator_value_t<It>>, Alloc>;
template <typename It, typename Hash, typename Alloc,
typename = std::enable_if_t<is_input_iterator_v<It>>,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multiset( It, It, std::size_t, Hash, Alloc )
-> concurrent_unordered_multiset<iterator_value_t<It>, Hash, std::equal_to<iterator_value_t<It>>, Alloc>;
template <typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_multiset( std::initializer_list<T>, std::size_t, Alloc )
-> concurrent_unordered_multiset<T, std::hash<T>, std::equal_to<T>, Alloc>;
template <typename T, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>>
concurrent_unordered_multiset( std::initializer_list<T>, Alloc )
-> concurrent_unordered_multiset<T, std::hash<T>, std::equal_to<T>, Alloc>;
template <typename T, typename Hash, typename Alloc,
typename = std::enable_if_t<is_allocator_v<Alloc>>,
typename = std::enable_if_t<!is_allocator_v<Hash>>,
typename = std::enable_if_t<!std::is_integral_v<Hash>>>
concurrent_unordered_multiset( std::initializer_list<T>, std::size_t, Hash, Alloc )
-> concurrent_unordered_multiset<T, Hash, std::equal_to<T>, Alloc>;
#if __APPLE__ && __TBB_CLANG_VERSION == 100000
// An explicit deduction guide is required for the copy/move constructor with an allocator for APPLE LLVM 10.0.0
// due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances.
// Currently the issue arises because the last template parameter for Traits is a boolean; it should not affect the deduction guides.
// The issue reproduces only on this version of the compiler.
template <typename T, typename Hash, typename KeyEq, typename Alloc>
concurrent_unordered_multiset( concurrent_unordered_multiset<T, Hash, KeyEq, Alloc>, Alloc )
-> concurrent_unordered_multiset<T, Hash, KeyEq, Alloc>;
#endif
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename Key, typename Hash, typename KeyEqual, typename Allocator>
void swap( concurrent_unordered_multiset<Key, Hash, KeyEqual, Allocator>& lhs,
concurrent_unordered_multiset<Key, Hash, KeyEqual, Allocator>& rhs ) {
lhs.swap(rhs);
}
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::concurrent_unordered_set;
using detail::d1::concurrent_unordered_multiset;
using detail::split;
} // inline namespace v1
} // namespace tbb
#endif // __TBB_concurrent_unordered_set_H
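A minimal usage sketch for the containers declared above (editorial, not vendored code): it assumes a C++17 toolchain where __TBB_CPP17_DEDUCTION_GUIDES_PRESENT holds and that the header is reachable at the assumed vendored path shown below; only std::thread is used alongside it.
// Usage sketch (editorial, not vendored code).
#include "third_party/tbb/concurrent_unordered_set.hh"   // assumed vendored path
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    tbb::concurrent_unordered_set<int> seen;             // explicit element type

    // insert() is safe to call from several threads at once.
    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t)
        workers.emplace_back([&seen, t] {
            for (int i = t; i < 1000; i += 4) seen.insert(i % 97);
        });
    for (auto& w : workers) w.join();

    // C++17 deduction guide declared above: element type deduced as int,
    // hash/equality/allocator fall back to the defaults.
    tbb::concurrent_unordered_set extra{101, 103, 107};

    // merge() splices nodes whose keys are not already present in *this.
    seen.merge(extra);
    std::printf("distinct keys: %zu\n", seen.size());
}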

1130
third_party/tbb/concurrent_vector.hh vendored Normal file

File diff suppressed because it is too large

177
third_party/tbb/detail/_aggregator.hh vendored Normal file
View file

@ -0,0 +1,177 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__aggregator_H
#define __TBB_detail__aggregator_H
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/libcxx/atomic"
#if !__TBBMALLOC_BUILD // TODO: check this macro with TBB Malloc
#include "third_party/tbb/profiling.hh"
#endif
namespace tbb {
namespace detail {
namespace d1 {
// Base class for aggregated operation
template <typename Derived>
class aggregated_operation {
public:
// A zero value means "wait" status; all other values are user-specified and
// are defined in the scope of the class that uses "status"
std::atomic<uintptr_t> status;
std::atomic<Derived*> next;
aggregated_operation() : status{}, next(nullptr) {}
}; // class aggregated_operation
// Aggregator base class
/* An aggregator for collecting operations coming from multiple sources and executing
them serially on a single thread. OperationType must be derived from
aggregated_operation. The parameter HandlerType is a functor that will be passed the
list of operations and is expected to handle each operation appropriately, setting the
status of each operation to non-zero. */
template <typename OperationType>
class aggregator_generic {
public:
aggregator_generic() : pending_operations(nullptr), handler_busy(false) {}
// Execute an operation
/* Places an operation into the waitlist (pending_operations), and either handles the list,
or waits for the operation to complete, or returns.
The long_life_time parameter specifies the life time of the given operation object.
Operations with long_life_time == true may be accessed after execution.
A "short" life time operation (long_life_time == false) can be destroyed
during execution, and so any access to it after it was put into the waitlist,
including status check, is invalid. As a consequence, waiting for completion
of such operation causes undefined behavior. */
template <typename HandlerType>
void execute( OperationType* op, HandlerType& handle_operations, bool long_life_time = true ) {
// op->status should be read before inserting the operation into the
// aggregator waitlist since it can become invalid after executing a
// handler (if the operation has 'short' life time.)
const uintptr_t status = op->status.load(std::memory_order_relaxed);
// ITT note: &(op->status) tag is used to cover accesses to this op node. This
// thread has created the operation, and now releases it so that the handler
// thread may handle the associated operation w/o triggering a race condition;
// thus this tag will be acquired just before the operation is handled in the
// handle_operations functor.
call_itt_notify(releasing, &(op->status));
// insert the operation in the queue.
OperationType* res = pending_operations.load(std::memory_order_relaxed);
do {
op->next.store(res, std::memory_order_relaxed);
} while (!pending_operations.compare_exchange_strong(res, op));
if (!res) { // first in the list; handle the operations
// ITT note: &pending_operations tag covers access to the handler_busy flag,
// which this waiting handler thread will try to set before entering
// handle_operations.
call_itt_notify(acquired, &pending_operations);
start_handle_operations(handle_operations);
// The operation with 'short' life time can already be destroyed
if (long_life_time)
__TBB_ASSERT(op->status.load(std::memory_order_relaxed), nullptr);
}
// Not first; wait for op to be ready
else if (!status) { // operation is blocking here.
__TBB_ASSERT(long_life_time, "Waiting for an operation object that might be destroyed during processing");
call_itt_notify(prepare, &(op->status));
spin_wait_while_eq(op->status, uintptr_t(0));
}
}
private:
// Trigger the handling of operations when the handler is free
template <typename HandlerType>
void start_handle_operations( HandlerType& handle_operations ) {
OperationType* op_list;
// ITT note: &handler_busy tag covers access to pending_operations as it is passed
// between active and waiting handlers. Below, the waiting handler waits until
// the active handler releases, and the waiting handler acquires &handler_busy as
// it becomes the active_handler. The release point is at the end of this
// function, when all operations in pending_operations have been handled by the
// owner of this aggregator.
call_itt_notify(prepare, &handler_busy);
// get the handler_busy:
// only one thread can possibly spin here at a time
spin_wait_until_eq(handler_busy, uintptr_t(0));
call_itt_notify(acquired, &handler_busy);
// acquire fence not necessary here due to causality rule and surrounding atomics
handler_busy.store(1, std::memory_order_relaxed);
// ITT note: &pending_operations tag covers access to the handler_busy flag
// itself. Capturing the state of the pending_operations signifies that
// handler_busy has been set and a new active handler will now process that list's
// operations.
call_itt_notify(releasing, &pending_operations);
// grab pending_operations
op_list = pending_operations.exchange(nullptr);
// handle all the operations
handle_operations(op_list);
// release the handler
handler_busy.store(0, std::memory_order_release);
}
// An atomically updated list (aka mailbox) of pending operations
std::atomic<OperationType*> pending_operations;
// Controls threads access to handle_operations
std::atomic<uintptr_t> handler_busy;
}; // class aggregator_generic
template <typename HandlerType, typename OperationType>
class aggregator : public aggregator_generic<OperationType> {
HandlerType handle_operations;
public:
aggregator() = default;
void initialize_handler( HandlerType h ) { handle_operations = h; }
void execute(OperationType* op) {
aggregator_generic<OperationType>::execute(op, handle_operations);
}
}; // class aggregator
// the most-compatible friend declaration (vs, gcc, icc) is
// template<class U, class V> friend class aggregating_functor;
template <typename AggregatingClass, typename OperationList>
class aggregating_functor {
AggregatingClass* my_object{nullptr};
public:
aggregating_functor() = default;
aggregating_functor( AggregatingClass* object ) : my_object(object) {
__TBB_ASSERT(my_object, nullptr);
}
void operator()( OperationList* op_list ) {
__TBB_ASSERT(my_object, nullptr);
my_object->handle_operations(op_list);
}
}; // class aggregating_functor
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__aggregator_H
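A sketch of how the pieces above compose (editorial, not vendored code): every increment below is funneled through one active handler at a time. The operation type, handler, and class names are invented for illustration, and the sketch assumes this internal header can be included directly.
// Aggregator usage sketch (editorial, not vendored code).
#include "third_party/tbb/detail/_aggregator.hh"   // assumed vendored path
#include <atomic>

namespace demo {
using tbb::detail::d1::aggregated_operation;
using tbb::detail::d1::aggregator_generic;

// One request; status == 0 means "not yet handled".
struct increment_op : aggregated_operation<increment_op> {
    long amount = 0;
};

class serial_counter {
    long value_ = 0;                        // touched only by the active handler
    struct handler {
        serial_counter* self;
        void operator()(increment_op* list) {
            while (list) {
                // Read next before publishing status: once status is non-zero
                // the submitting thread may return and reuse its operation.
                increment_op* next = list->next.load(std::memory_order_relaxed);
                self->value_ += list->amount;
                list->status.store(1, std::memory_order_release);
                list = next;
            }
        }
    };
    handler handle_{this};
    aggregator_generic<increment_op> agg_;

public:
    void add(long amount) {
        increment_op op;                    // lives on this thread's stack
        op.amount = amount;
        agg_.execute(&op, handle_);         // returns once op.status != 0
    }
    long get() const { return value_; }     // call after the adders have joined
};
} // namespace demo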

47
third_party/tbb/aligned_space.hh vendored Normal file
View file

@ -0,0 +1,47 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_aligned_space_H
#define __TBB_aligned_space_H
#include "third_party/libcxx/cstddef"
#include "third_party/tbb/detail/_template_helpers.hh"
namespace tbb {
namespace detail {
inline namespace d0 {
//! Block of space aligned sufficiently to construct an array T with N elements.
/** The elements are not constructed or destroyed by this class.
@ingroup memory_allocation */
template<typename T, std::size_t N = 1>
class aligned_space {
alignas(alignof(T)) std::uint8_t aligned_array[N * sizeof(T)];
public:
//! Pointer to beginning of array
T* begin() const { return punned_cast<T*>(&aligned_array); }
//! Pointer to one past last element in array.
T* end() const { return begin() + N; }
};
} // namespace d0
} // namespace detail
} // namespace tbb
#endif /* __TBB_aligned_space_H */
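A short sketch of the intended usage pattern (editorial, not vendored code): the class only reserves suitably aligned raw storage, so construction and destruction stay with the caller. The include path is an assumption; the name is used as the header above exports it.
// aligned_space usage sketch (editorial, not vendored code).
#include "third_party/tbb/aligned_space.hh"   // assumed vendored path
#include <memory>                              // std::destroy_at
#include <new>                                 // placement new
#include <string>

int main() {
    // Aligned raw storage for up to 4 std::string objects; none constructed yet.
    tbb::detail::aligned_space<std::string, 4> slots;

    // The caller constructs elements in place...
    std::string* s = new (slots.begin()) std::string("hello");

    // ...and must destroy them before the storage goes out of scope.
    std::destroy_at(s);
}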

108
third_party/tbb/detail/_allocator_traits.hh vendored Normal file
View file

@ -0,0 +1,108 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__allocator_traits_H
#define __TBB_detail__allocator_traits_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/libcxx/memory"
#include "third_party/libcxx/type_traits"
namespace tbb {
namespace detail {
inline namespace d0 {
#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT
// Struct is_always_equal_detector provides the member type "type" which is
// Allocator::is_always_equal if it is present, std::false_type otherwise
template <typename Allocator, typename = void>
struct is_always_equal_detector {
using type = std::false_type;
};
template <typename Allocator>
struct is_always_equal_detector<Allocator, tbb::detail::void_t<typename Allocator::is_always_equal>>
{
using type = typename Allocator::is_always_equal;
};
#endif // !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT
template <typename Allocator>
class allocator_traits : public std::allocator_traits<Allocator>
{
using base_type = std::allocator_traits<Allocator>;
public:
#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT
using is_always_equal = typename is_always_equal_detector<Allocator>::type;
#endif
template <typename T>
using rebind_traits = typename tbb::detail::allocator_traits<typename base_type::template rebind_alloc<T>>;
}; // struct allocator_traits
template <typename Allocator>
void copy_assign_allocators_impl( Allocator& lhs, const Allocator& rhs, /*pocca = */std::true_type ) {
lhs = rhs;
}
template <typename Allocator>
void copy_assign_allocators_impl( Allocator&, const Allocator&, /*pocca = */ std::false_type ) {}
// Copy assigns allocators only if propagate_on_container_copy_assignment is true
template <typename Allocator>
void copy_assign_allocators( Allocator& lhs, const Allocator& rhs ) {
using pocca_type = typename allocator_traits<Allocator>::propagate_on_container_copy_assignment;
copy_assign_allocators_impl(lhs, rhs, pocca_type());
}
template <typename Allocator>
void move_assign_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocma = */ std::true_type ) {
lhs = std::move(rhs);
}
template <typename Allocator>
void move_assign_allocators_impl( Allocator&, Allocator&, /*pocma = */ std::false_type ) {}
// Move assigns allocators only if propagate_on_container_move_assignment is true
template <typename Allocator>
void move_assign_allocators( Allocator& lhs, Allocator& rhs ) {
using pocma_type = typename allocator_traits<Allocator>::propagate_on_container_move_assignment;
move_assign_allocators_impl(lhs, rhs, pocma_type());
}
template <typename Allocator>
void swap_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocs = */ std::true_type ) {
using std::swap;
swap(lhs, rhs);
}
template <typename Allocator>
void swap_allocators_impl( Allocator&, Allocator&, /*pocs = */ std::false_type ) {}
// Swaps allocators only if propagate_on_container_swap is true
template <typename Allocator>
void swap_allocators( Allocator& lhs, Allocator& rhs ) {
using pocs_type = typename allocator_traits<Allocator>::propagate_on_container_swap;
swap_allocators_impl(lhs, rhs, pocs_type());
}
} // inline namespace d0
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__allocator_traits_H
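A sketch of what the propagation helpers above do (editorial, not vendored code): the helpers assign or swap the allocator only when the corresponding propagate_on_container_* trait says so. The toy allocator and its id field are invented for illustration, and the include path is assumed.
// Propagation-helper sketch (editorial, not vendored code).
#include "third_party/tbb/detail/_allocator_traits.hh"   // assumed vendored path
#include <cassert>
#include <memory>
#include <type_traits>

// Toy allocator: carries an id and opts out of propagation on copy assignment.
template <typename T>
struct tagged_allocator : std::allocator<T> {
    int id = 0;
    tagged_allocator() = default;
    explicit tagged_allocator(int i) : id(i) {}
    using propagate_on_container_copy_assignment = std::false_type;  // "pocca"
    using propagate_on_container_move_assignment = std::true_type;   // "pocma"
};

int main() {
    tagged_allocator<int> lhs{1}, rhs{2};

    tbb::detail::copy_assign_allocators(lhs, rhs);  // pocca is false: no-op
    assert(lhs.id == 1);

    tbb::detail::move_assign_allocators(lhs, rhs);  // pocma is true: lhs = std::move(rhs)
    assert(lhs.id == 2);
}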

65
third_party/tbb/detail/_assert.hh vendored Normal file
View file

@ -0,0 +1,65 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__assert_H
#define __TBB_detail__assert_H
#include "third_party/tbb/detail/_config.hh"
#if __TBBMALLOC_BUILD
namespace rml { namespace internal {
#else
namespace tbb {
namespace detail {
namespace r1 {
#endif
//! Process an assertion failure.
/** Normally called from __TBB_ASSERT macro.
If assertion handler is null, print message for assertion failure and abort.
Otherwise call the assertion handler. */
TBB_EXPORT void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment);
#if __TBBMALLOC_BUILD
}} // namespaces rml::internal
#else
} // namespace r1
} // namespace detail
} // namespace tbb
#endif
#if __TBBMALLOC_BUILD
//! Release version of assertions
#define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : rml::internal::assertion_failure(__func__,__LINE__,#predicate,message))
#else
#define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : tbb::detail::r1::assertion_failure(__func__,__LINE__,#predicate,message))
#endif
#if TBB_USE_ASSERT
//! Assert that predicate is true.
/** If predicate is false, print assertion failure message.
If the comment argument is not nullptr, it is printed as part of the failure message.
The comment argument has no other effect. */
#define __TBB_ASSERT(predicate,message) __TBB_ASSERT_RELEASE(predicate,message)
//! "Extended" version
#define __TBB_ASSERT_EX __TBB_ASSERT
#else
//! No-op version of __TBB_ASSERT.
#define __TBB_ASSERT(predicate,comment) ((void)0)
//! "Extended" version is useful to suppress warnings if a variable is only used with an assert
#define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate)))
#endif // TBB_USE_ASSERT
#endif // __TBB_detail__assert_H
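A small sketch of the difference between the two macros above (editorial, not vendored code): when TBB_USE_ASSERT is 0, __TBB_ASSERT vanishes entirely, while __TBB_ASSERT_EX still evaluates its predicate as (void)(1 && (predicate)), so a variable that exists only for checking stays "used". The include path is assumed; in debug builds the macro references assertion_failure from the TBB runtime.
// Assertion-macro sketch (editorial, not vendored code).
#include "third_party/tbb/detail/_assert.hh"   // assumed vendored path

inline int checked_divide(int numerator, int divisor) {
    __TBB_ASSERT(divisor != 0, "divisor must be non-zero");
    int quotient = numerator / divisor;
    int reconstructed = quotient * divisor + numerator % divisor;
    // reconstructed exists only for this check; __TBB_ASSERT_EX keeps it "used"
    // even in release builds, avoiding an unused-variable warning.
    __TBB_ASSERT_EX(reconstructed == numerator, nullptr);
    return quotient;
}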

33
third_party/tbb/detail/_attach.hh vendored Normal file
View file

@ -0,0 +1,33 @@
// clang-format off
/*
Copyright (c) 2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__attach_H
#define __TBB_detail__attach_H
#include "third_party/tbb/detail/_config.hh"
namespace tbb {
namespace detail {
namespace d1 {
struct attach {};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__attach_H

651
third_party/tbb/detail/_concurrent_queue_base.hh vendored Normal file
View file

@ -0,0 +1,651 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__concurrent_queue_base_H
#define __TBB_detail__concurrent_queue_base_H
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_machine.hh"
#include "third_party/tbb/detail/_allocator_traits.hh"
#include "third_party/tbb/profiling.hh"
#include "third_party/tbb/spin_mutex.hh"
#include "third_party/tbb/cache_aligned_allocator.hh"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace d2 {
using ticket_type = std::size_t;
template <typename Page>
inline bool is_valid_page(const Page p) {
return reinterpret_cast<std::uintptr_t>(p) > 1;
}
template <typename T, typename Allocator>
struct concurrent_queue_rep;
template <typename Container, typename T, typename Allocator>
class micro_queue_pop_finalizer;
#if _MSC_VER && !defined(__INTEL_COMPILER)
// unary minus operator applied to unsigned type, result still unsigned
#pragma warning( push )
#pragma warning( disable: 4146 )
#endif
// A queue using simple locking.
// For efficiency, the defaulted constructor does no work beyond the members'
// empty default initializers, which leave the queue zero-initialized.
template <typename T, typename Allocator>
class micro_queue {
private:
using queue_rep_type = concurrent_queue_rep<T, Allocator>;
using self_type = micro_queue<T, Allocator>;
public:
using size_type = std::size_t;
using value_type = T;
using reference = value_type&;
using const_reference = const value_type&;
using allocator_type = Allocator;
using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
using queue_allocator_type = typename allocator_traits_type::template rebind_alloc<queue_rep_type>;
static constexpr size_type item_size = sizeof(T);
static constexpr size_type items_per_page = item_size <= 8 ? 32 :
item_size <= 16 ? 16 :
item_size <= 32 ? 8 :
item_size <= 64 ? 4 :
item_size <= 128 ? 2 : 1;
struct padded_page {
padded_page() {}
~padded_page() {}
reference operator[] (std::size_t index) {
__TBB_ASSERT(index < items_per_page, "Index out of range");
return items[index];
}
const_reference operator[] (std::size_t index) const {
__TBB_ASSERT(index < items_per_page, "Index out of range");
return items[index];
}
padded_page* next{ nullptr };
std::atomic<std::uintptr_t> mask{};
union {
value_type items[items_per_page];
};
}; // struct padded_page
using page_allocator_type = typename allocator_traits_type::template rebind_alloc<padded_page>;
protected:
using page_allocator_traits = tbb::detail::allocator_traits<page_allocator_type>;
public:
using item_constructor_type = void (*)(value_type* location, const void* src);
micro_queue() = default;
micro_queue( const micro_queue& ) = delete;
micro_queue& operator=( const micro_queue& ) = delete;
size_type prepare_page( ticket_type k, queue_rep_type& base, page_allocator_type page_allocator,
padded_page*& p ) {
__TBB_ASSERT(p == nullptr, "Invalid page argument for prepare_page");
k &= -queue_rep_type::n_queue;
size_type index = modulo_power_of_two(k / queue_rep_type::n_queue, items_per_page);
if (!index) {
try_call( [&] {
p = page_allocator_traits::allocate(page_allocator, 1);
}).on_exception( [&] {
++base.n_invalid_entries;
invalidate_page( k );
});
page_allocator_traits::construct(page_allocator, p);
}
spin_wait_until_my_turn(tail_counter, k, base);
d1::call_itt_notify(d1::acquired, &tail_counter);
if (p) {
spin_mutex::scoped_lock lock( page_mutex );
padded_page* q = tail_page.load(std::memory_order_relaxed);
if (is_valid_page(q)) {
q->next = p;
} else {
head_page.store(p, std::memory_order_relaxed);
}
tail_page.store(p, std::memory_order_relaxed);
} else {
p = tail_page.load(std::memory_order_relaxed);
}
return index;
}
template<typename... Args>
void push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator, Args&&... args )
{
padded_page* p = nullptr;
page_allocator_type page_allocator(allocator);
size_type index = prepare_page(k, base, page_allocator, p);
__TBB_ASSERT(p != nullptr, "Page was not prepared");
// try_call API is not convenient here due to broken
// variadic capture on GCC 4.8.5
auto value_guard = make_raii_guard([&] {
++base.n_invalid_entries;
d1::call_itt_notify(d1::releasing, &tail_counter);
tail_counter.fetch_add(queue_rep_type::n_queue);
});
page_allocator_traits::construct(page_allocator, &(*p)[index], std::forward<Args>(args)...);
// If no exception was thrown, mark item as present.
p->mask.store(p->mask.load(std::memory_order_relaxed) | uintptr_t(1) << index, std::memory_order_relaxed);
d1::call_itt_notify(d1::releasing, &tail_counter);
value_guard.dismiss();
tail_counter.fetch_add(queue_rep_type::n_queue);
}
void abort_push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) {
padded_page* p = nullptr;
prepare_page(k, base, allocator, p);
++base.n_invalid_entries;
tail_counter.fetch_add(queue_rep_type::n_queue);
}
bool pop( void* dst, ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) {
k &= -queue_rep_type::n_queue;
spin_wait_until_eq(head_counter, k);
d1::call_itt_notify(d1::acquired, &head_counter);
spin_wait_while_eq(tail_counter, k);
d1::call_itt_notify(d1::acquired, &tail_counter);
padded_page *p = head_page.load(std::memory_order_relaxed);
__TBB_ASSERT( p, nullptr );
size_type index = modulo_power_of_two( k/queue_rep_type::n_queue, items_per_page );
bool success = false;
{
page_allocator_type page_allocator(allocator);
micro_queue_pop_finalizer<self_type, value_type, page_allocator_type> finalizer(*this, page_allocator,
k + queue_rep_type::n_queue, index == items_per_page - 1 ? p : nullptr );
if (p->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) {
success = true;
assign_and_destroy_item(dst, *p, index);
} else {
--base.n_invalid_entries;
}
}
return success;
}
micro_queue& assign( const micro_queue& src, queue_allocator_type& allocator,
item_constructor_type construct_item )
{
head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
const padded_page* srcp = src.head_page.load(std::memory_order_relaxed);
if( is_valid_page(srcp) ) {
ticket_type g_index = head_counter.load(std::memory_order_relaxed);
size_type n_items = (tail_counter.load(std::memory_order_relaxed) - head_counter.load(std::memory_order_relaxed))
/ queue_rep_type::n_queue;
size_type index = modulo_power_of_two(head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page);
size_type end_in_first_page = (index+n_items < items_per_page) ? (index + n_items) : items_per_page;
try_call( [&] {
head_page.store(make_copy(allocator, srcp, index, end_in_first_page, g_index, construct_item), std::memory_order_relaxed);
}).on_exception( [&] {
head_counter.store(0, std::memory_order_relaxed);
tail_counter.store(0, std::memory_order_relaxed);
});
padded_page* cur_page = head_page.load(std::memory_order_relaxed);
try_call( [&] {
if (srcp != src.tail_page.load(std::memory_order_relaxed)) {
for (srcp = srcp->next; srcp != src.tail_page.load(std::memory_order_relaxed); srcp=srcp->next ) {
cur_page->next = make_copy( allocator, srcp, 0, items_per_page, g_index, construct_item );
cur_page = cur_page->next;
}
__TBB_ASSERT(srcp == src.tail_page.load(std::memory_order_relaxed), nullptr );
size_type last_index = modulo_power_of_two(tail_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page);
if( last_index==0 ) last_index = items_per_page;
cur_page->next = make_copy( allocator, srcp, 0, last_index, g_index, construct_item );
cur_page = cur_page->next;
}
tail_page.store(cur_page, std::memory_order_relaxed);
}).on_exception( [&] {
padded_page* invalid_page = reinterpret_cast<padded_page*>(std::uintptr_t(1));
tail_page.store(invalid_page, std::memory_order_relaxed);
});
} else {
head_page.store(nullptr, std::memory_order_relaxed);
tail_page.store(nullptr, std::memory_order_relaxed);
}
return *this;
}
padded_page* make_copy( queue_allocator_type& allocator, const padded_page* src_page, size_type begin_in_page,
size_type end_in_page, ticket_type& g_index, item_constructor_type construct_item )
{
page_allocator_type page_allocator(allocator);
padded_page* new_page = page_allocator_traits::allocate(page_allocator, 1);
new_page->next = nullptr;
new_page->mask.store(src_page->mask.load(std::memory_order_relaxed), std::memory_order_relaxed);
for (; begin_in_page!=end_in_page; ++begin_in_page, ++g_index) {
if (new_page->mask.load(std::memory_order_relaxed) & uintptr_t(1) << begin_in_page) {
copy_item(*new_page, begin_in_page, *src_page, begin_in_page, construct_item);
}
}
return new_page;
}
void invalidate_page( ticket_type k ) {
// Append an invalid page at address 1 so that no more pushes are allowed.
padded_page* invalid_page = reinterpret_cast<padded_page*>(std::uintptr_t(1));
{
spin_mutex::scoped_lock lock( page_mutex );
tail_counter.store(k + queue_rep_type::n_queue + 1, std::memory_order_relaxed);
padded_page* q = tail_page.load(std::memory_order_relaxed);
if (is_valid_page(q)) {
q->next = invalid_page;
} else {
head_page.store(invalid_page, std::memory_order_relaxed);
}
tail_page.store(invalid_page, std::memory_order_relaxed);
}
}
padded_page* get_head_page() {
return head_page.load(std::memory_order_relaxed);
}
void clear(queue_allocator_type& allocator, padded_page* new_head = nullptr, padded_page* new_tail = nullptr) {
padded_page* curr_page = get_head_page();
size_type index = (head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue) % items_per_page;
page_allocator_type page_allocator(allocator);
while (curr_page && is_valid_page(curr_page)) {
while (index != items_per_page) {
if (curr_page->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) {
page_allocator_traits::destroy(page_allocator, &curr_page->operator[](index));
}
++index;
}
index = 0;
padded_page* next_page = curr_page->next;
page_allocator_traits::destroy(page_allocator, curr_page);
page_allocator_traits::deallocate(page_allocator, curr_page, 1);
curr_page = next_page;
}
head_counter.store(0, std::memory_order_relaxed);
tail_counter.store(0, std::memory_order_relaxed);
head_page.store(new_head, std::memory_order_relaxed);
tail_page.store(new_tail, std::memory_order_relaxed);
}
void clear_and_invalidate(queue_allocator_type& allocator) {
padded_page* invalid_page = reinterpret_cast<padded_page*>(std::uintptr_t(1));
clear(allocator, invalid_page, invalid_page);
}
private:
// template <typename U, typename A>
friend class micro_queue_pop_finalizer<self_type, value_type, page_allocator_type>;
// Class used to ensure exception-safety of method "pop"
class destroyer {
value_type& my_value;
public:
destroyer( reference value ) : my_value(value) {}
destroyer( const destroyer& ) = delete;
destroyer& operator=( const destroyer& ) = delete;
~destroyer() {my_value.~T();}
}; // class destroyer
void copy_item( padded_page& dst, size_type dindex, const padded_page& src, size_type sindex,
item_constructor_type construct_item )
{
auto& src_item = src[sindex];
construct_item( &dst[dindex], static_cast<const void*>(&src_item) );
}
void assign_and_destroy_item( void* dst, padded_page& src, size_type index ) {
auto& from = src[index];
destroyer d(from);
*static_cast<T*>(dst) = std::move(from);
}
void spin_wait_until_my_turn( std::atomic<ticket_type>& counter, ticket_type k, queue_rep_type& rb ) const {
for (atomic_backoff b{};; b.pause()) {
ticket_type c = counter.load(std::memory_order_acquire);
if (c == k) return;
else if (c & 1) {
++rb.n_invalid_entries;
throw_exception( exception_id::bad_last_alloc);
}
}
}
std::atomic<padded_page*> head_page{};
std::atomic<ticket_type> head_counter{};
std::atomic<padded_page*> tail_page{};
std::atomic<ticket_type> tail_counter{};
spin_mutex page_mutex{};
}; // class micro_queue
#if _MSC_VER && !defined(__INTEL_COMPILER)
#pragma warning( pop )
#endif // warning 4146 is back
template <typename Container, typename T, typename Allocator>
class micro_queue_pop_finalizer {
public:
using padded_page = typename Container::padded_page;
using allocator_type = Allocator;
using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
micro_queue_pop_finalizer( Container& queue, Allocator& alloc, ticket_type k, padded_page* p ) :
my_ticket_type(k), my_queue(queue), my_page(p), allocator(alloc)
{}
micro_queue_pop_finalizer( const micro_queue_pop_finalizer& ) = delete;
micro_queue_pop_finalizer& operator=( const micro_queue_pop_finalizer& ) = delete;
~micro_queue_pop_finalizer() {
padded_page* p = my_page;
if( is_valid_page(p) ) {
spin_mutex::scoped_lock lock( my_queue.page_mutex );
padded_page* q = p->next;
my_queue.head_page.store(q, std::memory_order_relaxed);
if( !is_valid_page(q) ) {
my_queue.tail_page.store(nullptr, std::memory_order_relaxed);
}
}
my_queue.head_counter.store(my_ticket_type, std::memory_order_release);
if ( is_valid_page(p) ) {
allocator_traits_type::destroy(allocator, static_cast<padded_page*>(p));
allocator_traits_type::deallocate(allocator, static_cast<padded_page*>(p), 1);
}
}
private:
ticket_type my_ticket_type;
Container& my_queue;
padded_page* my_page;
Allocator& allocator;
}; // class micro_queue_pop_finalizer
#if _MSC_VER && !defined(__INTEL_COMPILER)
// structure was padded due to alignment specifier
#pragma warning( push )
#pragma warning( disable: 4324 )
#endif
template <typename T, typename Allocator>
struct concurrent_queue_rep {
using self_type = concurrent_queue_rep<T, Allocator>;
using size_type = std::size_t;
using micro_queue_type = micro_queue<T, Allocator>;
using allocator_type = Allocator;
using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
using padded_page = typename micro_queue_type::padded_page;
using page_allocator_type = typename micro_queue_type::page_allocator_type;
using item_constructor_type = typename micro_queue_type::item_constructor_type;
private:
using page_allocator_traits = tbb::detail::allocator_traits<page_allocator_type>;
using queue_allocator_type = typename allocator_traits_type::template rebind_alloc<self_type>;
public:
// must be power of 2
static constexpr size_type n_queue = 8;
// Approximately n_queue/golden ratio
static constexpr size_type phi = 3;
static constexpr size_type item_size = micro_queue_type::item_size;
static constexpr size_type items_per_page = micro_queue_type::items_per_page;
concurrent_queue_rep() {}
concurrent_queue_rep( const concurrent_queue_rep& ) = delete;
concurrent_queue_rep& operator=( const concurrent_queue_rep& ) = delete;
void clear( queue_allocator_type& alloc ) {
for (size_type index = 0; index < n_queue; ++index) {
array[index].clear(alloc);
}
head_counter.store(0, std::memory_order_relaxed);
tail_counter.store(0, std::memory_order_relaxed);
n_invalid_entries.store(0, std::memory_order_relaxed);
}
void assign( const concurrent_queue_rep& src, queue_allocator_type& alloc, item_constructor_type construct_item ) {
head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed);
n_invalid_entries.store(src.n_invalid_entries.load(std::memory_order_relaxed), std::memory_order_relaxed);
// copy or move micro_queues
size_type queue_idx = 0;
try_call( [&] {
for (; queue_idx < n_queue; ++queue_idx) {
array[queue_idx].assign(src.array[queue_idx], alloc, construct_item);
}
}).on_exception( [&] {
for (size_type i = 0; i < queue_idx + 1; ++i) {
array[i].clear_and_invalidate(alloc);
}
head_counter.store(0, std::memory_order_relaxed);
tail_counter.store(0, std::memory_order_relaxed);
n_invalid_entries.store(0, std::memory_order_relaxed);
});
__TBB_ASSERT(head_counter.load(std::memory_order_relaxed) == src.head_counter.load(std::memory_order_relaxed) &&
tail_counter.load(std::memory_order_relaxed) == src.tail_counter.load(std::memory_order_relaxed),
"the source concurrent queue should not be concurrently modified." );
}
bool empty() const {
ticket_type tc = tail_counter.load(std::memory_order_acquire);
ticket_type hc = head_counter.load(std::memory_order_relaxed);
// If tc != tail_counter on the second read, the queue was not empty at some point between the two reads.
return tc == tail_counter.load(std::memory_order_relaxed) &&
std::ptrdiff_t(tc - hc - n_invalid_entries.load(std::memory_order_relaxed)) <= 0;
}
std::ptrdiff_t size() const {
__TBB_ASSERT(sizeof(std::ptrdiff_t) <= sizeof(size_type), nullptr);
std::ptrdiff_t hc = head_counter.load(std::memory_order_acquire);
std::ptrdiff_t tc = tail_counter.load(std::memory_order_relaxed);
std::ptrdiff_t nie = n_invalid_entries.load(std::memory_order_relaxed);
return tc - hc - nie;
}
friend class micro_queue<T, Allocator>;
// Map ticket_type to an array index
static size_type index( ticket_type k ) {
return k * phi % n_queue;
}
micro_queue_type& choose( ticket_type k ) {
// The formula here approximates LRU in a cache-oblivious way.
return array[index(k)];
}
alignas(max_nfs_size) micro_queue_type array[n_queue];
alignas(max_nfs_size) std::atomic<ticket_type> head_counter{};
alignas(max_nfs_size) std::atomic<ticket_type> tail_counter{};
alignas(max_nfs_size) std::atomic<size_type> n_invalid_entries{};
}; // class concurrent_queue_rep
#if _MSC_VER && !defined(__INTEL_COMPILER)
#pragma warning( pop )
#endif
template <typename Value, typename Allocator>
class concurrent_queue_iterator_base {
using queue_rep_type = concurrent_queue_rep<Value, Allocator>;
using padded_page = typename queue_rep_type::padded_page;
protected:
concurrent_queue_iterator_base() = default;
concurrent_queue_iterator_base( const concurrent_queue_iterator_base& other ) {
assign(other);
}
concurrent_queue_iterator_base( queue_rep_type* queue_rep )
: my_queue_rep(queue_rep),
my_head_counter(my_queue_rep->head_counter.load(std::memory_order_relaxed))
{
for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) {
my_array[i] = my_queue_rep->array[i].get_head_page();
}
if (!get_item(my_item, my_head_counter)) advance();
}
void assign( const concurrent_queue_iterator_base& other ) {
my_item = other.my_item;
my_queue_rep = other.my_queue_rep;
if (my_queue_rep != nullptr) {
my_head_counter = other.my_head_counter;
for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) {
my_array[i] = other.my_array[i];
}
}
}
void advance() {
__TBB_ASSERT(my_item, "Attempt to increment iterator past end of the queue");
std::size_t k = my_head_counter;
#if TBB_USE_ASSERT
Value* tmp;
get_item(tmp, k);
__TBB_ASSERT(my_item == tmp, nullptr);
#endif
std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page);
if (i == my_queue_rep->items_per_page - 1) {
padded_page*& root = my_array[queue_rep_type::index(k)];
root = root->next;
}
// Advance k
my_head_counter = ++k;
if (!get_item(my_item, k)) advance();
}
concurrent_queue_iterator_base& operator=( const concurrent_queue_iterator_base& other ) {
this->assign(other);
return *this;
}
bool get_item( Value*& item, std::size_t k ) {
if (k == my_queue_rep->tail_counter.load(std::memory_order_relaxed)) {
item = nullptr;
return true;
} else {
padded_page* p = my_array[queue_rep_type::index(k)];
__TBB_ASSERT(p, nullptr);
std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page);
item = &(*p)[i];
return (p->mask & uintptr_t(1) << i) != 0;
}
}
Value* my_item{ nullptr };
queue_rep_type* my_queue_rep{ nullptr };
ticket_type my_head_counter{};
padded_page* my_array[queue_rep_type::n_queue]{};
}; // class concurrent_queue_iterator_base
struct concurrent_queue_iterator_provider {
template <typename Iterator, typename Container>
static Iterator get( const Container& container ) {
return Iterator(container);
}
}; // struct concurrent_queue_iterator_provider
template <typename Container, typename Value, typename Allocator>
class concurrent_queue_iterator : public concurrent_queue_iterator_base<typename std::remove_cv<Value>::type, Allocator> {
using base_type = concurrent_queue_iterator_base<typename std::remove_cv<Value>::type, Allocator>;
public:
using value_type = Value;
using pointer = value_type*;
using reference = value_type&;
using difference_type = std::ptrdiff_t;
using iterator_category = std::forward_iterator_tag;
concurrent_queue_iterator() = default;
/** If Value==Container::value_type, then this routine is the copy constructor.
If Value==const Container::value_type, then this routine is a conversion constructor. */
concurrent_queue_iterator( const concurrent_queue_iterator<Container, typename Container::value_type, Allocator>& other )
: base_type(other) {}
private:
concurrent_queue_iterator( const Container& container )
: base_type(container.my_queue_representation) {}
public:
concurrent_queue_iterator& operator=( const concurrent_queue_iterator<Container, typename Container::value_type, Allocator>& other ) {
this->assign(other);
return *this;
}
reference operator*() const {
return *static_cast<pointer>(this->my_item);
}
pointer operator->() const { return &operator*(); }
concurrent_queue_iterator& operator++() {
this->advance();
return *this;
}
concurrent_queue_iterator operator++(int) {
concurrent_queue_iterator tmp = *this;
++*this;
return tmp;
}
friend bool operator==( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) {
return lhs.my_item == rhs.my_item;
}
friend bool operator!=( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) {
return lhs.my_item != rhs.my_item;
}
private:
friend struct concurrent_queue_iterator_provider;
}; // class concurrent_queue_iterator
} // namespace d2
} // namespace detail
} // tbb
#endif // __TBB_detail__concurrent_queue_base_H
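A standalone sketch of the ticket-to-sub-queue mapping used above (editorial, not vendored code): n_queue is 8 and phi is 3, which is coprime with 8, so eight consecutive tickets visit all eight micro_queues before the pattern repeats, spreading concurrent pushes and pops across the sub-queues (the header's own comment describes the formula as a cache-oblivious LRU approximation).
// Ticket-mapping sketch (editorial, not vendored code).
#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t n_queue = 8;  // mirrors concurrent_queue_rep::n_queue
    constexpr std::size_t phi     = 3;  // mirrors concurrent_queue_rep::phi

    for (std::size_t ticket = 0; ticket < n_queue; ++ticket)
        std::printf("ticket %zu -> micro_queue %zu\n", ticket, ticket * phi % n_queue);
    // Prints the permutation 0 3 6 1 4 7 2 5: every sub-queue is visited once
    // per n_queue consecutive tickets.
}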

File diff suppressed because it is too large

File diff suppressed because it is too large

530
third_party/tbb/detail/_config.hh vendored Normal file
View file

@ -0,0 +1,530 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__config_H
#define __TBB_detail__config_H
/** This header is supposed to contain macro definitions only.
The macros defined here are intended to control such aspects of TBB build as
- presence of compiler features
- compilation modes
- feature sets
- known compiler/platform issues
**/
/* Check which standard library we use. */
#include "third_party/libcxx/cstddef"
#ifdef __has_include
#if __has_include(<version>)
#include "third_party/libcxx/version"
#endif
#endif
#include "third_party/tbb/detail/_export.hh"
#if _MSC_VER
#define __TBB_EXPORTED_FUNC __cdecl
#define __TBB_EXPORTED_METHOD __thiscall
#else
#define __TBB_EXPORTED_FUNC
#define __TBB_EXPORTED_METHOD
#endif
#if defined(_MSVC_LANG)
#define __TBB_LANG _MSVC_LANG
#else
#define __TBB_LANG __cplusplus
#endif // _MSVC_LANG
#define __TBB_CPP14_PRESENT (__TBB_LANG >= 201402L)
#define __TBB_CPP17_PRESENT (__TBB_LANG >= 201703L)
#define __TBB_CPP20_PRESENT (__TBB_LANG >= 202002L)
#if __INTEL_COMPILER || _MSC_VER
#define __TBB_NOINLINE(decl) __declspec(noinline) decl
#elif __GNUC__
#define __TBB_NOINLINE(decl) decl __attribute__ ((noinline))
#else
#define __TBB_NOINLINE(decl) decl
#endif
#define __TBB_STRING_AUX(x) #x
#define __TBB_STRING(x) __TBB_STRING_AUX(x)
// Note that when ICC or Clang is in use, __TBB_GCC_VERSION might not fully match
// the actual GCC version on the system.
#define __TBB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
/* Check which standard library we use. */
// Prior to GCC 7, GNU libstdc++ did not have a convenient version macro.
// Therefore we use different ways to detect its version.
#ifdef TBB_USE_GLIBCXX_VERSION
// The version is explicitly specified in our public TBB_USE_GLIBCXX_VERSION macro.
// Its format should match the __TBB_GCC_VERSION above, e.g. 70301 for libstdc++ coming with GCC 7.3.1.
#define __TBB_GLIBCXX_VERSION TBB_USE_GLIBCXX_VERSION
#elif _GLIBCXX_RELEASE && _GLIBCXX_RELEASE != __GNUC__
// Reported versions of GCC and libstdc++ do not match; trust the latter
#define __TBB_GLIBCXX_VERSION (_GLIBCXX_RELEASE*10000)
#elif __GLIBCPP__ || __GLIBCXX__
// The version macro is not defined or matches the GCC version; use __TBB_GCC_VERSION
#define __TBB_GLIBCXX_VERSION __TBB_GCC_VERSION
#endif
#if __clang__
// according to clang documentation, version can be vendor specific
#define __TBB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
#endif
/** Macro helpers **/
#define __TBB_CONCAT_AUX(A,B) A##B
// The additional level of indirection is needed to expand macros A and B (not to get the AB macro).
// See [cpp.subst] and [cpp.concat] for more details.
#define __TBB_CONCAT(A,B) __TBB_CONCAT_AUX(A,B)
// The IGNORED argument and comma are needed to always have 2 arguments (even when A is empty).
#define __TBB_IS_MACRO_EMPTY(A,IGNORED) __TBB_CONCAT_AUX(__TBB_MACRO_EMPTY,A)
#define __TBB_MACRO_EMPTY 1
#if _M_X64 || _M_ARM64
#define __TBB_W(name) name##64
#else
#define __TBB_W(name) name
#endif
/** User controlled TBB features & modes **/
#ifndef TBB_USE_DEBUG
/*
There are four cases that are supported:
1. "_DEBUG is undefined" means "no debug";
2. "_DEBUG defined to something that is evaluated to 0" (including "garbage", as per [cpp.cond]) means "no debug";
3. "_DEBUG defined to something that is evaluated to a non-zero value" means "debug";
4. "_DEBUG defined to nothing (empty)" means "debug".
*/
#ifdef _DEBUG
// Check if _DEBUG is empty.
#define __TBB_IS__DEBUG_EMPTY (__TBB_IS_MACRO_EMPTY(_DEBUG,IGNORED)==__TBB_MACRO_EMPTY)
#if __TBB_IS__DEBUG_EMPTY
#define TBB_USE_DEBUG 1
#else
#define TBB_USE_DEBUG _DEBUG
#endif // __TBB_IS__DEBUG_EMPTY
#else
#define TBB_USE_DEBUG 0
#endif // _DEBUG
#endif // TBB_USE_DEBUG
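// Editorial sketch (not upstream code): how the emptiness check above resolves for
// the two interesting ways _DEBUG can appear on the command line.
//   -D_DEBUG    : _DEBUG expands to nothing, so __TBB_IS_MACRO_EMPTY(_DEBUG,IGNORED)
//                 becomes __TBB_CONCAT_AUX(__TBB_MACRO_EMPTY,), i.e. __TBB_MACRO_EMPTY,
//                 which is 1; the comparison with __TBB_MACRO_EMPTY holds and
//                 TBB_USE_DEBUG is set to 1 ("debug").
//   -D_DEBUG=0  : the paste yields the undefined identifier __TBB_MACRO_EMPTY0, which
//                 evaluates to 0 inside #if, so the comparison fails and TBB_USE_DEBUG
//                 takes the value of _DEBUG itself, i.e. 0 ("no debug").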
#ifndef TBB_USE_ASSERT
#define TBB_USE_ASSERT TBB_USE_DEBUG
#endif // TBB_USE_ASSERT
#ifndef TBB_USE_PROFILING_TOOLS
#if TBB_USE_DEBUG
#define TBB_USE_PROFILING_TOOLS 2
#else // TBB_USE_DEBUG
#define TBB_USE_PROFILING_TOOLS 0
#endif // TBB_USE_DEBUG
#endif // TBB_USE_PROFILING_TOOLS
// Exceptions support cases
#if !(__EXCEPTIONS || defined(_CPPUNWIND) || __SUNPRO_CC)
#if TBB_USE_EXCEPTIONS
#error Compilation settings do not support exception handling. Please do not set TBB_USE_EXCEPTIONS macro or set it to 0.
#elif !defined(TBB_USE_EXCEPTIONS)
#define TBB_USE_EXCEPTIONS 0
#endif
#elif !defined(TBB_USE_EXCEPTIONS)
#define TBB_USE_EXCEPTIONS 1
#endif
/** Preprocessor symbols to determine HW architecture **/
#if _WIN32 || _WIN64
#if defined(_M_X64) || defined(__x86_64__) // the latter for MinGW support
#define __TBB_x86_64 1
#elif defined(_M_IA64)
#define __TBB_ipf 1
#elif defined(_M_IX86) || defined(__i386__) // the latter for MinGW support
#define __TBB_x86_32 1
#else
#define __TBB_generic_arch 1
#endif
#else /* Assume generic Unix */
#if __x86_64__
#define __TBB_x86_64 1
#elif __ia64__
#define __TBB_ipf 1
#elif __i386__||__i386 // __i386 is for Sun OS
#define __TBB_x86_32 1
#else
#define __TBB_generic_arch 1
#endif
#endif
/** Windows API or POSIX API **/
#if _WIN32 || _WIN64
#define __TBB_USE_WINAPI 1
#else
#define __TBB_USE_POSIX 1
#endif
/** Internal TBB features & modes **/
/** __TBB_DYNAMIC_LOAD_ENABLED describes the system possibility to load shared libraries at run time **/
#ifndef __TBB_DYNAMIC_LOAD_ENABLED
#define __TBB_DYNAMIC_LOAD_ENABLED 1
#endif
/** __TBB_WIN8UI_SUPPORT enables support of Windows* Store Apps and limits the ability to load
    shared libraries at run time to the application container only **/
#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
#define __TBB_WIN8UI_SUPPORT 1
#else
#define __TBB_WIN8UI_SUPPORT 0
#endif
/** __TBB_WEAK_SYMBOLS_PRESENT denotes that the system supports the weak symbol mechanism **/
#ifndef __TBB_WEAK_SYMBOLS_PRESENT
#define __TBB_WEAK_SYMBOLS_PRESENT ( !_WIN32 && !__APPLE__ && !__sun && (__TBB_GCC_VERSION >= 40000 || __INTEL_COMPILER ) )
#endif
/** Presence of compiler features **/
#if __clang__ && !__INTEL_COMPILER
#define __TBB_USE_OPTIONAL_RTTI __has_feature(cxx_rtti)
#elif defined(_CPPRTTI)
#define __TBB_USE_OPTIONAL_RTTI 1
#else
#define __TBB_USE_OPTIONAL_RTTI (__GXX_RTTI || __RTTI || __INTEL_RTTI__)
#endif
/** Address sanitizer detection **/
#ifdef __SANITIZE_ADDRESS__
#define __TBB_USE_ADDRESS_SANITIZER 1
#elif defined(__has_feature)
#if __has_feature(address_sanitizer)
#define __TBB_USE_ADDRESS_SANITIZER 1
#endif
#endif
/** Library features presence macros **/
#define __TBB_CPP14_INTEGER_SEQUENCE_PRESENT (__TBB_LANG >= 201402L)
#define __TBB_CPP17_INVOKE_PRESENT (__TBB_LANG >= 201703L)
// TODO: Remove the condition (__INTEL_COMPILER > 2021) from the __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
// macro when this feature starts working correctly on this compiler.
#if __INTEL_COMPILER && (!_MSC_VER || __INTEL_CXX11_MOVE__)
#define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L)
#define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__INTEL_COMPILER > 2021 && __TBB_LANG >= 201703L)
#define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition
#elif __clang__
#define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__has_feature(cxx_variable_templates))
#define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition
#ifdef __cpp_deduction_guides
#define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201611L)
#else
#define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT 0
#endif
#elif __GNUC__
#define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L && __TBB_GCC_VERSION >= 50000)
#define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201606L)
#define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 201709L && __TBB_GCC_VERSION >= 100201)
#elif _MSC_VER
#define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (_MSC_FULL_VER >= 190023918 && (!__INTEL_COMPILER || __INTEL_COMPILER >= 1700))
#define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (_MSC_VER >= 1914 && __TBB_LANG >= 201703L && (!__INTEL_COMPILER || __INTEL_COMPILER > 2021))
#define __TBB_CPP20_CONCEPTS_PRESENT (_MSC_VER >= 1923 && __TBB_LANG >= 202002L) // TODO: INTEL_COMPILER?
#else
#define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L)
#define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__TBB_LANG >= 201703L)
#define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 202002L)
#endif
// GCC4.8 on RHEL7 does not support std::get_new_handler
#define __TBB_CPP11_GET_NEW_HANDLER_PRESENT (_MSC_VER >= 1900 || __TBB_GLIBCXX_VERSION >= 40900 && __GXX_EXPERIMENTAL_CXX0X__ || _LIBCPP_VERSION)
// GCC4.8 on RHEL7 does not support std::is_trivially_copyable
#define __TBB_CPP11_TYPE_PROPERTIES_PRESENT (_LIBCPP_VERSION || _MSC_VER >= 1700 || (__TBB_GLIBCXX_VERSION >= 50000 && __GXX_EXPERIMENTAL_CXX0X__))
#define __TBB_CPP17_MEMORY_RESOURCE_PRESENT (_MSC_VER >= 1913 && (__TBB_LANG > 201402L) || \
__TBB_GLIBCXX_VERSION >= 90000 && __TBB_LANG >= 201703L)
#define __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT (_MSC_VER >= 1911)
#define __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT (__TBB_LANG >= 201703L)
#define __TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT (__TBB_LANG >= 201703L)
#define __TBB_CPP17_IS_SWAPPABLE_PRESENT (__TBB_LANG >= 201703L)
#if defined(__cpp_impl_three_way_comparison) && defined(__cpp_lib_three_way_comparison)
#define __TBB_CPP20_COMPARISONS_PRESENT ((__cpp_impl_three_way_comparison >= 201907L) && (__cpp_lib_three_way_comparison >= 201907L))
#else
#define __TBB_CPP20_COMPARISONS_PRESENT 0
#endif
#define __TBB_RESUMABLE_TASKS (!__TBB_WIN8UI_SUPPORT && !__ANDROID__ && !__QNXNTO__ && (!__linux__ || __GLIBC__))
/* This macro marks incomplete code or comments describing ideas which are considered for the future.
 * See also plain comments with TODO and FIXME marks for small improvement opportunities.
*/
#define __TBB_TODO 0
/* Check which standard library we use. */
/* __TBB_SYMBOL is defined only while processing exported symbols list where C++ is not allowed. */
#if !defined(__TBB_SYMBOL) && !__TBB_CONFIG_PREPROC_ONLY
#include "third_party/libcxx/cstddef"
#endif
/** Target OS is either iOS* or iOS* simulator **/
#if __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__
#define __TBB_IOS 1
#endif
#if __APPLE__
#if __INTEL_COMPILER && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1099 \
&& __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101000
// ICC does not correctly set the macro if -mmacosx-version-min is not specified
#define __TBB_MACOS_TARGET_VERSION (100000 + 10*(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ - 1000))
#else
#define __TBB_MACOS_TARGET_VERSION __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__
#endif
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define __TBB_GCC_WARNING_IGNORED_ATTRIBUTES_PRESENT (__TBB_GCC_VERSION >= 60100)
#endif
#if __GNUC__ && !__INTEL_COMPILER && !__clang__
#define __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN (__TBB_GCC_VERSION <= 40805)
#endif
#define __TBB_CPP17_FALLTHROUGH_PRESENT (__TBB_LANG >= 201703L)
#define __TBB_CPP17_NODISCARD_PRESENT (__TBB_LANG >= 201703L)
#define __TBB_FALLTHROUGH_PRESENT (__TBB_GCC_VERSION >= 70000 && !__INTEL_COMPILER)
#if __TBB_CPP17_FALLTHROUGH_PRESENT
#define __TBB_fallthrough [[fallthrough]]
#elif __TBB_FALLTHROUGH_PRESENT
#define __TBB_fallthrough __attribute__ ((fallthrough))
#else
#define __TBB_fallthrough
#endif
#if __TBB_CPP17_NODISCARD_PRESENT
#define __TBB_nodiscard [[nodiscard]]
#elif __clang__ || __GNUC__
#define __TBB_nodiscard __attribute__((warn_unused_result))
#else
#define __TBB_nodiscard
#endif
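// Editor's illustrative sketch (hypothetical function, not part of TBB): how the two
// portability macros defined above are intended to be used together.
__TBB_nodiscard inline int __tbb_config_example_next_even(int v) {
    switch (v & 1) {
    case 1:
        ++v;               // odd input: bump to the next even number
        __TBB_fallthrough; // annotate the intentional fall-through into the next label
    default:
        return v;
    }
}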
#define __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT (_MSC_VER >= 1900 || __GLIBCXX__ && __cpp_lib_uncaught_exceptions \
|| _LIBCPP_VERSION >= 3700 && (!__TBB_MACOS_TARGET_VERSION || __TBB_MACOS_TARGET_VERSION >= 101200))
#define __TBB_TSX_INTRINSICS_PRESENT (__RTM__ || __INTEL_COMPILER || (_MSC_VER>=1700 && (__TBB_x86_64 || __TBB_x86_32)))
#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || __TBB_GCC_VERSION >= 110000 || __TBB_CLANG_VERSION >= 120000) \
&& (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) && !__ANDROID__)
/** Internal TBB features & modes **/
/** __TBB_SOURCE_DIRECTLY_INCLUDED is a mode used in whitebox testing when
it's necessary to test internal functions not exported from TBB DLLs
**/
#if (_WIN32||_WIN64) && (__TBB_SOURCE_DIRECTLY_INCLUDED || TBB_USE_PREVIEW_BINARY)
#define __TBB_NO_IMPLICIT_LINKAGE 1
#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1
#endif
#if (__TBB_BUILD || __TBBMALLOC_BUILD || __TBBMALLOCPROXY_BUILD || __TBBBIND_BUILD) && !defined(__TBB_NO_IMPLICIT_LINKAGE)
#define __TBB_NO_IMPLICIT_LINKAGE 1
#endif
#if _MSC_VER
#if !__TBB_NO_IMPLICIT_LINKAGE
#ifdef _DEBUG
#pragma comment(lib, "tbb12_debug.lib")
#else
#pragma comment(lib, "tbb12.lib")
#endif
#endif
#endif
#ifndef __TBB_SCHEDULER_OBSERVER
#define __TBB_SCHEDULER_OBSERVER 1
#endif /* __TBB_SCHEDULER_OBSERVER */
#ifndef __TBB_FP_CONTEXT
#define __TBB_FP_CONTEXT 1
#endif /* __TBB_FP_CONTEXT */
#define __TBB_RECYCLE_TO_ENQUEUE __TBB_BUILD // keep non-official
#ifndef __TBB_ARENA_OBSERVER
#define __TBB_ARENA_OBSERVER __TBB_SCHEDULER_OBSERVER
#endif /* __TBB_ARENA_OBSERVER */
#ifndef __TBB_ARENA_BINDING
#define __TBB_ARENA_BINDING 1
#endif
#ifndef __TBB_ENQUEUE_ENFORCED_CONCURRENCY
#define __TBB_ENQUEUE_ENFORCED_CONCURRENCY 1
#endif
#if !defined(__TBB_SURVIVE_THREAD_SWITCH) && \
(_WIN32 || _WIN64 || __APPLE__ || (defined(__unix__) && !__ANDROID__))
#define __TBB_SURVIVE_THREAD_SWITCH 1
#endif /* __TBB_SURVIVE_THREAD_SWITCH */
#ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES
#define TBB_PREVIEW_FLOW_GRAPH_FEATURES __TBB_CPF_BUILD
#endif
#ifndef __TBB_DEFAULT_PARTITIONER
#define __TBB_DEFAULT_PARTITIONER tbb::auto_partitioner
#endif
#ifndef __TBB_FLOW_TRACE_CODEPTR
#define __TBB_FLOW_TRACE_CODEPTR __TBB_CPF_BUILD
#endif
// Intel(R) C++ Compiler starts analyzing usages of the deprecated content at the template
// instantiation site, which is too late for suppression of the corresponding messages for internal
// stuff.
#if !defined(__INTEL_COMPILER) && (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0))
#if (__TBB_LANG >= 201402L && (!defined(_MSC_VER) || _MSC_VER >= 1920))
#define __TBB_DEPRECATED [[deprecated]]
#define __TBB_DEPRECATED_MSG(msg) [[deprecated(msg)]]
#elif _MSC_VER
#define __TBB_DEPRECATED __declspec(deprecated)
#define __TBB_DEPRECATED_MSG(msg) __declspec(deprecated(msg))
#elif (__GNUC__ && __TBB_GCC_VERSION >= 40805) || __clang__
#define __TBB_DEPRECATED __attribute__((deprecated))
#define __TBB_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
#endif
#endif // !defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)
#if !defined(__TBB_DEPRECATED)
#define __TBB_DEPRECATED
#define __TBB_DEPRECATED_MSG(msg)
#elif !defined(__TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES)
// Suppress deprecated messages from self
#define __TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES 1
#endif
#if defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) && (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)
#define __TBB_DEPRECATED_VERBOSE __TBB_DEPRECATED
#define __TBB_DEPRECATED_VERBOSE_MSG(msg) __TBB_DEPRECATED_MSG(msg)
#else
#define __TBB_DEPRECATED_VERBOSE
#define __TBB_DEPRECATED_VERBOSE_MSG(msg)
#endif // (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)
#if (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) && !(__TBB_LANG >= 201103L || _MSC_VER >= 1900)
#pragma message("TBB Warning: Support for C++98/03 is deprecated. Please use the compiler that supports C++11 features at least.")
#endif
#ifdef _VARIADIC_MAX
#define __TBB_VARIADIC_MAX _VARIADIC_MAX
#else
#if _MSC_VER == 1700
#define __TBB_VARIADIC_MAX 5 // VS11 setting, issue resolved in VS12
#elif _MSC_VER == 1600
#define __TBB_VARIADIC_MAX 10 // VS10 setting
#else
#define __TBB_VARIADIC_MAX 15
#endif
#endif
#if __SANITIZE_THREAD__
#define __TBB_USE_THREAD_SANITIZER 1
#elif defined(__has_feature)
#if __has_feature(thread_sanitizer)
#define __TBB_USE_THREAD_SANITIZER 1
#endif
#endif
#ifndef __TBB_USE_SANITIZERS
#define __TBB_USE_SANITIZERS (__TBB_USE_THREAD_SANITIZER || __TBB_USE_ADDRESS_SANITIZER)
#endif
#ifndef __TBB_RESUMABLE_TASKS_USE_THREADS
#define __TBB_RESUMABLE_TASKS_USE_THREADS __TBB_USE_SANITIZERS
#endif
#ifndef __TBB_USE_CONSTRAINTS
#define __TBB_USE_CONSTRAINTS 1
#endif
#ifndef __TBB_STRICT_CONSTRAINTS
#define __TBB_STRICT_CONSTRAINTS 1
#endif
#if __TBB_CPP20_CONCEPTS_PRESENT && __TBB_USE_CONSTRAINTS
#define __TBB_requires(...) requires __VA_ARGS__
#else // __TBB_CPP20_CONCEPTS_PRESENT
#define __TBB_requires(...)
#endif // __TBB_CPP20_CONCEPTS_PRESENT
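// Editor's illustration (hypothetical template, not part of TBB): __TBB_requires attaches
// a C++20 requires-clause only when concepts are available and constraints are enabled;
// otherwise the declaration below degrades to an unconstrained template. The flow graph
// headers later in this commit use it the same way, e.g.
// __TBB_requires(tbb::detail::hash_compare<KHash, K>).
template <typename T>
__TBB_requires((sizeof(T) >= 1))
struct __tbb_config_example_constrained {};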
/** Macros of the form __TBB_XXX_BROKEN denote known issues that are caused by
the bugs in compilers, standard or OS specific libraries. They should be
removed as soon as the corresponding bugs are fixed or the buggy OS/compiler
versions go out of the support list.
**/
// Some STL containers do not support allocator traits in old GCC versions
#if __GXX_EXPERIMENTAL_CXX0X__ && __TBB_GLIBCXX_VERSION <= 50301
#define TBB_ALLOCATOR_TRAITS_BROKEN 1
#endif
// GCC 4.8 C++ standard library implements std::this_thread::yield as no-op.
#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
#define __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN 1
#endif
/** End of __TBB_XXX_BROKEN macro section **/
#if defined(_MSC_VER) && _MSC_VER>=1500 && !defined(__INTEL_COMPILER)
// A macro to suppress erroneous or benign "unreachable code" MSVC warning (4702)
#define __TBB_MSVC_UNREACHABLE_CODE_IGNORED 1
#endif
// Many OS versions (Android 4.0.[0-3] for example) need a workaround for dlopen to avoid a non-recursive loader lock hang
// Setting the workaround for all compile targets ($APP_PLATFORM) below Android 4.4 (android-19)
#if __ANDROID__
// MISSING #include <android/api-level.h>
#endif
#define __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING (TBB_PREVIEW_FLOW_GRAPH_FEATURES)
#ifndef __TBB_PREVIEW_CRITICAL_TASKS
#define __TBB_PREVIEW_CRITICAL_TASKS 1
#endif
#ifndef __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
#define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES)
#endif
#if TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS
#define __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1
#endif
#if TBB_PREVIEW_TASK_GROUP_EXTENSIONS || __TBB_BUILD
#define __TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1
#endif
#endif // __TBB_detail__config_H

View file

@ -0,0 +1,68 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__containers_helpers_H
#define __TBB_detail__containers_helpers_H
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/tbb/detail/_allocator_traits.hh"
#include "third_party/libcxx/type_traits"
#include "third_party/libcxx/memory"
#include "third_party/libcxx/functional"
namespace tbb {
namespace detail {
inline namespace d0 {
template <typename Compare, typename = void>
struct comp_is_transparent : std::false_type {};
template <typename Compare>
struct comp_is_transparent<Compare, tbb::detail::void_t<typename Compare::is_transparent>> : std::true_type {};
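// Editor's illustration (compile-time checks only, not part of TBB; the comparator below is
// hypothetical): any comparator exposing an is_transparent member type is detected, mirroring
// the heterogeneous-lookup convention of the standard library.
struct example_transparent_less {
    using is_transparent = void;
    template <typename A, typename B>
    bool operator()(const A& a, const B& b) const { return a < b; }
};
static_assert(comp_is_transparent<example_transparent_less>::value,
              "comparators exposing is_transparent are detected as transparent");
static_assert(!comp_is_transparent<std::less<int>>::value,
              "std::less<int> has no is_transparent member");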
template <typename Key, typename Hasher, typename KeyEqual, typename = void >
struct has_transparent_key_equal : std::false_type { using type = KeyEqual; };
template <typename Key, typename Hasher, typename KeyEqual>
struct has_transparent_key_equal<Key, Hasher, KeyEqual, tbb::detail::void_t<typename Hasher::transparent_key_equal>> : std::true_type {
using type = typename Hasher::transparent_key_equal;
static_assert(comp_is_transparent<type>::value, "Hash::transparent_key_equal::is_transparent is not valid or does not denote a type.");
static_assert((std::is_same<KeyEqual, std::equal_to<Key>>::value ||
std::is_same<typename Hasher::transparent_key_equal, KeyEqual>::value), "KeyEqual is a different type than equal_to<Key> or Hash::transparent_key_equal.");
};
struct is_iterator_impl {
template <typename T>
using iter_traits_category = typename std::iterator_traits<T>::iterator_category;
template <typename T>
using input_iter_category = typename std::enable_if<std::is_base_of<std::input_iterator_tag, iter_traits_category<T>>::value>::type;
}; // struct is_iterator_impl
template <typename T>
using is_input_iterator = supports<T, is_iterator_impl::iter_traits_category, is_iterator_impl::input_iter_category>;
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename T>
inline constexpr bool is_input_iterator_v = is_input_iterator<T>::value;
#endif
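// Editor's note (compile-time check only, not part of TBB): anything whose iterator_traits
// category derives from std::input_iterator_tag qualifies, including raw pointers.
static_assert(is_input_iterator<int*>::value, "raw pointers satisfy is_input_iterator");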
} // inline namespace d0
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__containers_helpers_H

89
third_party/tbb/detail/_exception.hh vendored Normal file
View file

@ -0,0 +1,89 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__exception_H
#define __TBB__exception_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/libcxx/new" // std::bad_alloc
#include "third_party/libcxx/exception" // std::exception
#include "third_party/libcxx/stdexcept" // std::runtime_error
namespace tbb {
namespace detail {
inline namespace d0 {
enum class exception_id {
bad_alloc = 1,
bad_last_alloc,
user_abort,
nonpositive_step,
out_of_range,
reservation_length_error,
missing_wait,
invalid_load_factor,
invalid_key,
bad_tagged_msg_cast,
unsafe_wait,
last_entry
};
} // namespace d0
#if _MSC_VER
#pragma warning(disable: 4275)
#endif
namespace r1 {
//! Exception for concurrent containers
class TBB_EXPORT bad_last_alloc : public std::bad_alloc {
public:
const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override;
};
//! Exception for user-initiated abort
class TBB_EXPORT user_abort : public std::exception {
public:
const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override;
};
//! Exception for missing wait on structured_task_group
class TBB_EXPORT missing_wait : public std::exception {
public:
const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override;
};
//! Exception for impossible finalization of task_scheduler_handle
class TBB_EXPORT unsafe_wait : public std::runtime_error {
public:
unsafe_wait(const char* msg) : std::runtime_error(msg) {}
};
//! Gathers all throw operators in one place.
/** Its purpose is to minimize code bloat that can be caused by throw operators
scattered in multiple places, especially in templates. **/
TBB_EXPORT void __TBB_EXPORTED_FUNC throw_exception ( exception_id );
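// Editor's note: callers raise library errors through this single entry point, e.g. a
// bounds-checked accessor would call throw_exception(exception_id::out_of_range) rather
// than instantiating its own throw statement at every call site.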
} // namespace r1
inline namespace d0 {
using r1::throw_exception;
} // namespace d0
} // namespace detail
} // namespace tbb
#endif // __TBB__exception_H

47
third_party/tbb/detail/_export.hh vendored Normal file
View file

@ -0,0 +1,47 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__export_H
#define __TBB_detail__export_H
#if defined(__MINGW32__)
#define _EXPORT __declspec(dllexport)
#elif defined(_WIN32) || defined(__unix__) || defined(__APPLE__) // Use .def files for these
#define _EXPORT
#else
#error "Unknown platform/compiler"
#endif
#if __TBB_BUILD
#define TBB_EXPORT _EXPORT
#else
#define TBB_EXPORT
#endif
#if __TBBMALLOC_BUILD
#define TBBMALLOC_EXPORT _EXPORT
#else
#define TBBMALLOC_EXPORT
#endif
#if __TBBBIND_BUILD
#define TBBBIND_EXPORT _EXPORT
#else
#define TBBBIND_EXPORT
#endif
#endif

View file

@ -0,0 +1,386 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__flow_graph_body_impl_H
#define __TBB__flow_graph_body_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
// included in namespace tbb::detail::d1 (in flow_graph.h)
typedef std::uint64_t tag_value;
// TODO revamp: find out if there is already helper for has_policy.
template<typename ... Policies> struct Policy {};
template<typename ... Policies> struct has_policy;
template<typename ExpectedPolicy, typename FirstPolicy, typename ...Policies>
struct has_policy<ExpectedPolicy, FirstPolicy, Policies...> :
std::integral_constant<bool, has_policy<ExpectedPolicy, FirstPolicy>::value ||
has_policy<ExpectedPolicy, Policies...>::value> {};
template<typename ExpectedPolicy, typename SinglePolicy>
struct has_policy<ExpectedPolicy, SinglePolicy> :
std::integral_constant<bool, std::is_same<ExpectedPolicy, SinglePolicy>::value> {};
template<typename ExpectedPolicy, typename ...Policies>
struct has_policy<ExpectedPolicy, Policy<Policies...> > : has_policy<ExpectedPolicy, Policies...> {};
namespace graph_policy_namespace {
struct rejecting { };
struct reserving { };
struct queueing { };
struct lightweight { };
// K == type of field used for key-matching. Each tag-matching port will be provided
// a functor that, given an object accepted by the port, will return the
// field of type K being used for matching.
template<typename K, typename KHash=tbb_hash_compare<typename std::decay<K>::type > >
__TBB_requires(tbb::detail::hash_compare<KHash, K>)
struct key_matching {
typedef K key_type;
typedef typename std::decay<K>::type base_key_type;
typedef KHash hash_compare_type;
};
// old tag_matching join's new specifier
typedef key_matching<tag_value> tag_matching;
// Aliases for Policy combinations
typedef Policy<queueing, lightweight> queueing_lightweight;
typedef Policy<rejecting, lightweight> rejecting_lightweight;
} // namespace graph_policy_namespace
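// Editor's illustration (compile-time checks only, not part of TBB): has_policy sees
// through Policy<...> packs, so queries work on the combined aliases defined above.
static_assert(has_policy<graph_policy_namespace::lightweight,
                         graph_policy_namespace::queueing_lightweight>::value,
              "lightweight is part of queueing_lightweight");
static_assert(!has_policy<graph_policy_namespace::reserving,
                          graph_policy_namespace::queueing_lightweight>::value,
              "reserving is not part of queueing_lightweight");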
// -------------- function_body containers ----------------------
//! A functor that takes no input and generates a value of type Output
template< typename Output >
class input_body : no_assign {
public:
virtual ~input_body() {}
virtual Output operator()(flow_control& fc) = 0;
virtual input_body* clone() = 0;
};
//! The leaf for input_body
template< typename Output, typename Body>
class input_body_leaf : public input_body<Output> {
public:
input_body_leaf( const Body &_body ) : body(_body) { }
Output operator()(flow_control& fc) override { return body(fc); }
input_body_leaf* clone() override {
return new input_body_leaf< Output, Body >(body);
}
Body get_body() { return body; }
private:
Body body;
};
//! A functor that takes an Input and generates an Output
template< typename Input, typename Output >
class function_body : no_assign {
public:
virtual ~function_body() {}
virtual Output operator()(const Input &input) = 0;
virtual function_body* clone() = 0;
};
//! the leaf for function_body
template <typename Input, typename Output, typename B>
class function_body_leaf : public function_body< Input, Output > {
public:
function_body_leaf( const B &_body ) : body(_body) { }
Output operator()(const Input &i) override { return tbb::detail::invoke(body,i); }
B get_body() { return body; }
function_body_leaf* clone() override {
return new function_body_leaf< Input, Output, B >(body);
}
private:
B body;
};
//! the leaf for function_body specialized for Input and output of continue_msg
template <typename B>
class function_body_leaf< continue_msg, continue_msg, B> : public function_body< continue_msg, continue_msg > {
public:
function_body_leaf( const B &_body ) : body(_body) { }
continue_msg operator()( const continue_msg &i ) override {
body(i);
return i;
}
B get_body() { return body; }
function_body_leaf* clone() override {
return new function_body_leaf< continue_msg, continue_msg, B >(body);
}
private:
B body;
};
//! the leaf for function_body specialized for Output of continue_msg
template <typename Input, typename B>
class function_body_leaf< Input, continue_msg, B> : public function_body< Input, continue_msg > {
public:
function_body_leaf( const B &_body ) : body(_body) { }
continue_msg operator()(const Input &i) override {
body(i);
return continue_msg();
}
B get_body() { return body; }
function_body_leaf* clone() override {
return new function_body_leaf< Input, continue_msg, B >(body);
}
private:
B body;
};
//! the leaf for function_body specialized for Input of continue_msg
template <typename Output, typename B>
class function_body_leaf< continue_msg, Output, B > : public function_body< continue_msg, Output > {
public:
function_body_leaf( const B &_body ) : body(_body) { }
Output operator()(const continue_msg &i) override {
return body(i);
}
B get_body() { return body; }
function_body_leaf* clone() override {
return new function_body_leaf< continue_msg, Output, B >(body);
}
private:
B body;
};
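// Editor's illustration (hypothetical helper, not part of TBB): the leaf classes above
// type-erase an arbitrary callable behind the function_body interface, so a node can
// store, clone and invoke its body without knowing the callable's concrete type.
inline function_body<int, int>* example_make_square_body() {
    auto square = [](const int& x) { return x * x; };
    // The caller owns the returned pointer, matching the clone() convention above.
    return new function_body_leaf<int, int, decltype(square)>(square);
}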
//! function_body that takes an Input and a set of output ports
template<typename Input, typename OutputSet>
class multifunction_body : no_assign {
public:
virtual ~multifunction_body () {}
virtual void operator()(const Input &/* input*/, OutputSet &/*oset*/) = 0;
virtual multifunction_body* clone() = 0;
virtual void* get_body_ptr() = 0;
};
//! leaf for multifunction. OutputSet can be a std::tuple or a vector.
template<typename Input, typename OutputSet, typename B >
class multifunction_body_leaf : public multifunction_body<Input, OutputSet> {
public:
multifunction_body_leaf(const B &_body) : body(_body) { }
void operator()(const Input &input, OutputSet &oset) override {
tbb::detail::invoke(body, input, oset); // body may explicitly put() to one or more of oset.
}
void* get_body_ptr() override { return &body; }
multifunction_body_leaf* clone() override {
return new multifunction_body_leaf<Input, OutputSet,B>(body);
}
private:
B body;
};
// ------ function bodies for hash_buffers and key-matching joins.
template<typename Input, typename Output>
class type_to_key_function_body : no_assign {
public:
virtual ~type_to_key_function_body() {}
virtual Output operator()(const Input &input) = 0; // returns an Output
virtual type_to_key_function_body* clone() = 0;
};
// specialization for ref output
template<typename Input, typename Output>
class type_to_key_function_body<Input,Output&> : no_assign {
public:
virtual ~type_to_key_function_body() {}
virtual const Output & operator()(const Input &input) = 0; // returns a const Output&
virtual type_to_key_function_body* clone() = 0;
};
template <typename Input, typename Output, typename B>
class type_to_key_function_body_leaf : public type_to_key_function_body<Input, Output> {
public:
type_to_key_function_body_leaf( const B &_body ) : body(_body) { }
Output operator()(const Input &i) override { return tbb::detail::invoke(body, i); }
type_to_key_function_body_leaf* clone() override {
return new type_to_key_function_body_leaf< Input, Output, B>(body);
}
private:
B body;
};
template <typename Input, typename Output, typename B>
class type_to_key_function_body_leaf<Input,Output&,B> : public type_to_key_function_body< Input, Output&> {
public:
type_to_key_function_body_leaf( const B &_body ) : body(_body) { }
const Output& operator()(const Input &i) override {
return tbb::detail::invoke(body, i);
}
type_to_key_function_body_leaf* clone() override {
return new type_to_key_function_body_leaf< Input, Output&, B>(body);
}
private:
B body;
};
// --------------------------- end of function_body containers ------------------------
// --------------------------- node task bodies ---------------------------------------
//! A task that calls a node's forward_task function
template< typename NodeType >
class forward_task_bypass : public graph_task {
NodeType &my_node;
public:
forward_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n
, node_priority_t node_priority = no_priority
) : graph_task(g, allocator, node_priority),
my_node(n) {}
task* execute(execution_data& ed) override {
graph_task* next_task = my_node.forward_task();
if (SUCCESSFULLY_ENQUEUED == next_task)
next_task = nullptr;
else if (next_task)
next_task = prioritize_task(my_node.graph_reference(), *next_task);
finalize<forward_task_bypass>(ed);
return next_task;
}
task* cancel(execution_data& ed) override {
finalize<forward_task_bypass>(ed);
return nullptr;
}
};
//! A task that calls a node's apply_body_bypass function, passing in an input of type Input
// return the task* unless it is SUCCESSFULLY_ENQUEUED, in which case return nullptr
template< typename NodeType, typename Input >
class apply_body_task_bypass : public graph_task {
NodeType &my_node;
Input my_input;
public:
apply_body_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n, const Input &i
, node_priority_t node_priority = no_priority
) : graph_task(g, allocator, node_priority),
my_node(n), my_input(i) {}
task* execute(execution_data& ed) override {
graph_task* next_task = my_node.apply_body_bypass( my_input );
if (SUCCESSFULLY_ENQUEUED == next_task)
next_task = nullptr;
else if (next_task)
next_task = prioritize_task(my_node.graph_reference(), *next_task);
finalize<apply_body_task_bypass>(ed);
return next_task;
}
task* cancel(execution_data& ed) override {
finalize<apply_body_task_bypass>(ed);
return nullptr;
}
};
//! A task that calls a node's apply_body_bypass function with no input
template< typename NodeType >
class input_node_task_bypass : public graph_task {
NodeType &my_node;
public:
input_node_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n )
: graph_task(g, allocator), my_node(n) {}
task* execute(execution_data& ed) override {
graph_task* next_task = my_node.apply_body_bypass( );
if (SUCCESSFULLY_ENQUEUED == next_task)
next_task = nullptr;
else if (next_task)
next_task = prioritize_task(my_node.graph_reference(), *next_task);
finalize<input_node_task_bypass>(ed);
return next_task;
}
task* cancel(execution_data& ed) override {
finalize<input_node_task_bypass>(ed);
return nullptr;
}
};
// ------------------------ end of node task bodies -----------------------------------
template<typename T, typename DecrementType, typename DummyType = void>
class threshold_regulator;
template<typename T, typename DecrementType>
class threshold_regulator<T, DecrementType,
typename std::enable_if<std::is_integral<DecrementType>::value>::type>
: public receiver<DecrementType>, no_copy
{
T* my_node;
protected:
graph_task* try_put_task( const DecrementType& value ) override {
graph_task* result = my_node->decrement_counter( value );
if( !result )
result = SUCCESSFULLY_ENQUEUED;
return result;
}
graph& graph_reference() const override {
return my_node->my_graph;
}
template<typename U, typename V> friend class limiter_node;
void reset_receiver( reset_flags ) {}
public:
threshold_regulator(T* owner) : my_node(owner) {
// Do not work with the passed pointer here as it may not be fully initialized yet
}
};
template<typename T>
class threshold_regulator<T, continue_msg, void> : public continue_receiver, no_copy {
T *my_node;
graph_task* execute() override {
return my_node->decrement_counter( 1 );
}
protected:
graph& graph_reference() const override {
return my_node->my_graph;
}
public:
typedef continue_msg input_type;
typedef continue_msg output_type;
threshold_regulator(T* owner)
: continue_receiver( /*number_of_predecessors=*/0, no_priority ), my_node(owner)
{
// Do not work with the passed pointer here as it may not be fully initialized yet
}
};
#endif // __TBB__flow_graph_body_impl_H

View file

@ -0,0 +1,435 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__flow_graph_cache_impl_H
#define __TBB__flow_graph_cache_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
// included in namespace tbb::detail::d1 (in flow_graph.h)
//! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock.
template< typename T, typename M=spin_mutex >
class node_cache {
public:
typedef size_t size_type;
bool empty() {
typename mutex_type::scoped_lock lock( my_mutex );
return internal_empty();
}
void add( T &n ) {
typename mutex_type::scoped_lock lock( my_mutex );
internal_push(n);
}
void remove( T &n ) {
typename mutex_type::scoped_lock lock( my_mutex );
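// Rotate through the queue at most size() times: every popped element except the
// first one equal to &n is pushed back, so at most one matching entry is dropped.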
for ( size_t i = internal_size(); i != 0; --i ) {
T &s = internal_pop();
if ( &s == &n )
break; // only remove one predecessor per request
internal_push(s);
}
}
void clear() {
while( !my_q.empty()) (void)my_q.pop();
}
protected:
typedef M mutex_type;
mutex_type my_mutex;
std::queue< T * > my_q;
// Assumes lock is held
inline bool internal_empty( ) {
return my_q.empty();
}
// Assumes lock is held
inline size_type internal_size( ) {
return my_q.size();
}
// Assumes lock is held
inline void internal_push( T &n ) {
my_q.push(&n);
}
// Assumes lock is held
inline T &internal_pop() {
T *v = my_q.front();
my_q.pop();
return *v;
}
};
//! A cache of predecessors that only supports try_get
template< typename T, typename M=spin_mutex >
class predecessor_cache : public node_cache< sender<T>, M > {
public:
typedef M mutex_type;
typedef T output_type;
typedef sender<output_type> predecessor_type;
typedef receiver<output_type> successor_type;
predecessor_cache( successor_type* owner ) : my_owner( owner ) {
__TBB_ASSERT( my_owner, "predecessor_cache should have an owner." );
// Do not work with the passed pointer here as it may not be fully initialized yet
}
bool get_item( output_type& v ) {
bool msg = false;
do {
predecessor_type *src;
{
typename mutex_type::scoped_lock lock(this->my_mutex);
if ( this->internal_empty() ) {
break;
}
src = &this->internal_pop();
}
// Try to get from this sender
msg = src->try_get( v );
if (msg == false) {
// Relinquish ownership of the edge
register_successor(*src, *my_owner);
} else {
// Retain ownership of the edge
this->add(*src);
}
} while ( msg == false );
return msg;
}
// If we are removing arcs (rf_clear_edges), call clear() rather than reset().
void reset() {
for(;;) {
predecessor_type *src;
{
if (this->internal_empty()) break;
src = &this->internal_pop();
}
register_successor(*src, *my_owner);
}
}
protected:
successor_type* my_owner;
};
//! A cache of predecessors that supports requests and reservations
template< typename T, typename M=spin_mutex >
class reservable_predecessor_cache : public predecessor_cache< T, M > {
public:
typedef M mutex_type;
typedef T output_type;
typedef sender<T> predecessor_type;
typedef receiver<T> successor_type;
reservable_predecessor_cache( successor_type* owner )
: predecessor_cache<T,M>(owner), reserved_src(nullptr)
{
// Do not work with the passed pointer here as it may not be fully initialized yet
}
bool try_reserve( output_type &v ) {
bool msg = false;
do {
predecessor_type* pred = nullptr;
{
typename mutex_type::scoped_lock lock(this->my_mutex);
if ( reserved_src.load(std::memory_order_relaxed) || this->internal_empty() )
return false;
pred = &this->internal_pop();
reserved_src.store(pred, std::memory_order_relaxed);
}
// Try to get from this sender
msg = pred->try_reserve( v );
if (msg == false) {
typename mutex_type::scoped_lock lock(this->my_mutex);
// Relinquish ownership of the edge
register_successor( *pred, *this->my_owner );
reserved_src.store(nullptr, std::memory_order_relaxed);
} else {
// Retain ownership of the edge
this->add( *pred);
}
} while ( msg == false );
return msg;
}
bool try_release() {
reserved_src.load(std::memory_order_relaxed)->try_release();
reserved_src.store(nullptr, std::memory_order_relaxed);
return true;
}
bool try_consume() {
reserved_src.load(std::memory_order_relaxed)->try_consume();
reserved_src.store(nullptr, std::memory_order_relaxed);
return true;
}
void reset() {
reserved_src.store(nullptr, std::memory_order_relaxed);
predecessor_cache<T, M>::reset();
}
void clear() {
reserved_src.store(nullptr, std::memory_order_relaxed);
predecessor_cache<T, M>::clear();
}
private:
std::atomic<predecessor_type*> reserved_src;
};
//! An abstract cache of successors
template<typename T, typename M=spin_rw_mutex >
class successor_cache : no_copy {
protected:
typedef M mutex_type;
mutex_type my_mutex;
typedef receiver<T> successor_type;
typedef receiver<T>* pointer_type;
typedef sender<T> owner_type;
// TODO revamp: introduce heapified collection of successors for strict priorities
typedef std::list< pointer_type > successors_type;
successors_type my_successors;
owner_type* my_owner;
public:
successor_cache( owner_type* owner ) : my_owner(owner) {
// Do not work with the passed pointer here as it may not be fully initialized yet
}
virtual ~successor_cache() {}
void register_successor( successor_type& r ) {
typename mutex_type::scoped_lock l(my_mutex, true);
if( r.priority() != no_priority )
my_successors.push_front( &r );
else
my_successors.push_back( &r );
}
void remove_successor( successor_type& r ) {
typename mutex_type::scoped_lock l(my_mutex, true);
for ( typename successors_type::iterator i = my_successors.begin();
i != my_successors.end(); ++i ) {
if ( *i == & r ) {
my_successors.erase(i);
break;
}
}
}
bool empty() {
typename mutex_type::scoped_lock l(my_mutex, false);
return my_successors.empty();
}
void clear() {
my_successors.clear();
}
virtual graph_task* try_put_task( const T& t ) = 0;
}; // successor_cache<T>
//! An abstract cache of successors, specialized to continue_msg
template<typename M>
class successor_cache< continue_msg, M > : no_copy {
protected:
typedef M mutex_type;
mutex_type my_mutex;
typedef receiver<continue_msg> successor_type;
typedef receiver<continue_msg>* pointer_type;
typedef sender<continue_msg> owner_type;
typedef std::list< pointer_type > successors_type;
successors_type my_successors;
owner_type* my_owner;
public:
successor_cache( sender<continue_msg>* owner ) : my_owner(owner) {
// Do not work with the passed pointer here as it may not be fully initialized yet
}
virtual ~successor_cache() {}
void register_successor( successor_type& r ) {
typename mutex_type::scoped_lock l(my_mutex, true);
if( r.priority() != no_priority )
my_successors.push_front( &r );
else
my_successors.push_back( &r );
__TBB_ASSERT( my_owner, "Cache of successors must have an owner." );
if ( r.is_continue_receiver() ) {
r.register_predecessor( *my_owner );
}
}
void remove_successor( successor_type& r ) {
typename mutex_type::scoped_lock l(my_mutex, true);
for ( successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) {
if ( *i == &r ) {
__TBB_ASSERT(my_owner, "Cache of successors must have an owner.");
// TODO: check if we need to test for continue_receiver before removing from r.
r.remove_predecessor( *my_owner );
my_successors.erase(i);
break;
}
}
}
bool empty() {
typename mutex_type::scoped_lock l(my_mutex, false);
return my_successors.empty();
}
void clear() {
my_successors.clear();
}
virtual graph_task* try_put_task( const continue_msg& t ) = 0;
}; // successor_cache< continue_msg >
//! A cache of successors that are broadcast to
template<typename T, typename M=spin_rw_mutex>
class broadcast_cache : public successor_cache<T, M> {
typedef successor_cache<T, M> base_type;
typedef M mutex_type;
typedef typename successor_cache<T,M>::successors_type successors_type;
public:
broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) {
// Do not work with the passed pointer here as it may not be fully initialized yet
}
// as above, but call try_put_task instead, and return the last task we received (if any)
graph_task* try_put_task( const T &t ) override {
graph_task * last_task = nullptr;
typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true);
typename successors_type::iterator i = this->my_successors.begin();
while ( i != this->my_successors.end() ) {
graph_task *new_task = (*i)->try_put_task(t);
// workaround for icc bug
graph& graph_ref = (*i)->graph_reference();
last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary
if(new_task) {
++i;
}
else { // failed
if ( (*i)->register_predecessor(*this->my_owner) ) {
i = this->my_successors.erase(i);
} else {
++i;
}
}
}
return last_task;
}
// call try_put_task on each successor, collecting the returned tasks in `tasks`; returns whether at least one put succeeded
bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) {
bool is_at_least_one_put_successful = false;
typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true);
typename successors_type::iterator i = this->my_successors.begin();
while ( i != this->my_successors.end() ) {
graph_task * new_task = (*i)->try_put_task(t);
if(new_task) {
++i;
if(new_task != SUCCESSFULLY_ENQUEUED) {
tasks.push_back(*new_task);
}
is_at_least_one_put_successful = true;
}
else { // failed
if ( (*i)->register_predecessor(*this->my_owner) ) {
i = this->my_successors.erase(i);
} else {
++i;
}
}
}
return is_at_least_one_put_successful;
}
};
//! A cache of successors that are put in a round-robin fashion
template<typename T, typename M=spin_rw_mutex >
class round_robin_cache : public successor_cache<T, M> {
typedef successor_cache<T, M> base_type;
typedef size_t size_type;
typedef M mutex_type;
typedef typename successor_cache<T,M>::successors_type successors_type;
public:
round_robin_cache( typename base_type::owner_type* owner ): base_type(owner) {
// Do not work with the passed pointer here as it may not be fully initialized yet
}
size_type size() {
typename mutex_type::scoped_lock l(this->my_mutex, false);
return this->my_successors.size();
}
graph_task* try_put_task( const T &t ) override {
typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true);
typename successors_type::iterator i = this->my_successors.begin();
while ( i != this->my_successors.end() ) {
graph_task* new_task = (*i)->try_put_task(t);
if ( new_task ) {
return new_task;
} else {
if ( (*i)->register_predecessor(*this->my_owner) ) {
i = this->my_successors.erase(i);
}
else {
++i;
}
}
}
return nullptr;
}
};
#endif // __TBB__flow_graph_cache_impl_H

View file

@ -0,0 +1,477 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_flow_graph_impl_H
#define __TBB_flow_graph_impl_H
// // MISSING #include "../config.h"
#include "third_party/tbb/detail/_task.hh"
#include "third_party/tbb/task_group.hh"
#include "third_party/tbb/task_arena.hh"
#include "third_party/tbb/flow_graph_abstractions.hh"
#include "third_party/tbb/concurrent_priority_queue.hh"
#include "third_party/libcxx/list"
namespace tbb {
namespace detail {
namespace d1 {
class graph_task;
static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1;
typedef unsigned int node_priority_t;
static const node_priority_t no_priority = node_priority_t(0);
class graph;
class graph_node;
template <typename GraphContainerType, typename GraphNodeType>
class graph_iterator {
friend class graph;
friend class graph_node;
public:
typedef size_t size_type;
typedef GraphNodeType value_type;
typedef GraphNodeType* pointer;
typedef GraphNodeType& reference;
typedef const GraphNodeType& const_reference;
typedef std::forward_iterator_tag iterator_category;
//! Copy constructor
graph_iterator(const graph_iterator& other) :
my_graph(other.my_graph), current_node(other.current_node)
{}
//! Assignment
graph_iterator& operator=(const graph_iterator& other) {
if (this != &other) {
my_graph = other.my_graph;
current_node = other.current_node;
}
return *this;
}
//! Dereference
reference operator*() const;
//! Dereference
pointer operator->() const;
//! Equality
bool operator==(const graph_iterator& other) const {
return ((my_graph == other.my_graph) && (current_node == other.current_node));
}
#if !__TBB_CPP20_COMPARISONS_PRESENT
//! Inequality
bool operator!=(const graph_iterator& other) const { return !(operator==(other)); }
#endif
//! Pre-increment
graph_iterator& operator++() {
internal_forward();
return *this;
}
//! Post-increment
graph_iterator operator++(int) {
graph_iterator result = *this;
operator++();
return result;
}
private:
// the graph over which we are iterating
GraphContainerType *my_graph;
// pointer into my_graph's my_nodes list
pointer current_node;
//! Private initializing constructor for begin() and end() iterators
graph_iterator(GraphContainerType *g, bool begin);
void internal_forward();
}; // class graph_iterator
// flags to modify the behavior of the graph reset(). Can be combined.
enum reset_flags {
rf_reset_protocol = 0,
rf_reset_bodies = 1 << 0, // delete the current node body, reset to a copy of the initial node body.
rf_clear_edges = 1 << 1 // delete edges
};
void activate_graph(graph& g);
void deactivate_graph(graph& g);
bool is_graph_active(graph& g);
graph_task* prioritize_task(graph& g, graph_task& arena_task);
void spawn_in_graph_arena(graph& g, graph_task& arena_task);
void enqueue_in_graph_arena(graph &g, graph_task& arena_task);
class graph;
//! Base class for tasks generated by graph nodes.
class graph_task : public task {
public:
graph_task(graph& g, small_object_allocator& allocator
, node_priority_t node_priority = no_priority
)
: my_graph(g)
, priority(node_priority)
, my_allocator(allocator)
{}
graph& my_graph; // graph instance the task belongs to
// TODO revamp: rename to my_priority
node_priority_t priority;
template <typename DerivedType>
void destruct_and_deallocate(const execution_data& ed);
protected:
template <typename DerivedType>
void finalize(const execution_data& ed);
private:
// To organize task_list
graph_task* my_next{ nullptr };
small_object_allocator my_allocator;
// TODO revamp: elaborate internal interfaces to avoid friends declarations
friend class graph_task_list;
friend graph_task* prioritize_task(graph& g, graph_task& gt);
};
struct graph_task_comparator {
bool operator()(const graph_task* left, const graph_task* right) {
return left->priority < right->priority;
}
};
typedef tbb::concurrent_priority_queue<graph_task*, graph_task_comparator> graph_task_priority_queue_t;
class priority_task_selector : public task {
public:
priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator)
: my_priority_queue(priority_queue), my_allocator(allocator), my_task() {}
task* execute(execution_data& ed) override {
next_task();
__TBB_ASSERT(my_task, nullptr);
task* t_next = my_task->execute(ed);
my_allocator.delete_object(this, ed);
return t_next;
}
task* cancel(execution_data& ed) override {
if (!my_task) {
next_task();
}
__TBB_ASSERT(my_task, nullptr);
task* t_next = my_task->cancel(ed);
my_allocator.delete_object(this, ed);
return t_next;
}
private:
void next_task() {
// TODO revamp: hold functors in priority queue instead of real tasks
bool result = my_priority_queue.try_pop(my_task);
__TBB_ASSERT_EX(result, "Number of critical tasks for scheduler and tasks"
" in graph's priority queue mismatched");
__TBB_ASSERT(my_task && my_task != SUCCESSFULLY_ENQUEUED,
"Incorrect task submitted to graph priority queue");
__TBB_ASSERT(my_task->priority != no_priority,
"Tasks from graph's priority queue must have priority");
}
graph_task_priority_queue_t& my_priority_queue;
small_object_allocator my_allocator;
graph_task* my_task;
};
template <typename Receiver, typename Body> class run_and_put_task;
template <typename Body> class run_task;
//********************************************************************************
// graph tasks helpers
//********************************************************************************
//! The list of graph tasks
class graph_task_list : no_copy {
private:
graph_task* my_first;
graph_task** my_next_ptr;
public:
//! Construct empty list
graph_task_list() : my_first(nullptr), my_next_ptr(&my_first) {}
//! True if list is empty; false otherwise.
bool empty() const { return !my_first; }
//! Push task onto back of list.
void push_back(graph_task& task) {
task.my_next = nullptr;
*my_next_ptr = &task;
my_next_ptr = &task.my_next;
}
//! Pop the front task from the list.
graph_task& pop_front() {
__TBB_ASSERT(!empty(), "attempt to pop item from empty task_list");
graph_task* result = my_first;
my_first = result->my_next;
if (!my_first) {
my_next_ptr = &my_first;
}
return *result;
}
};
//! The graph class
/** This class serves as a handle to the graph */
class graph : no_copy, public graph_proxy {
friend class graph_node;
void prepare_task_arena(bool reinit = false) {
if (reinit) {
__TBB_ASSERT(my_task_arena, "task arena is nullptr");
my_task_arena->terminate();
my_task_arena->initialize(task_arena::attach());
}
else {
__TBB_ASSERT(my_task_arena == nullptr, "task arena is not nullptr");
my_task_arena = new task_arena(task_arena::attach());
}
if (!my_task_arena->is_active()) // failed to attach
my_task_arena->initialize(); // create a new, default-initialized arena
__TBB_ASSERT(my_task_arena->is_active(), "task arena is not active");
}
public:
//! Constructs a graph with isolated task_group_context
graph();
//! Constructs a graph with use_this_context as context
explicit graph(task_group_context& use_this_context);
//! Destroys the graph.
/** Calls wait_for_all, then destroys the root task and context. */
~graph();
//! Used to register that an external entity may still interact with the graph.
/** The graph will not return from wait_for_all until a matching number of release_wait calls is
made. */
void reserve_wait() override;
//! Deregisters an external entity that may have interacted with the graph.
/** The graph will not return from wait_for_all until all the number of reserve_wait calls
matches the number of release_wait calls. */
void release_wait() override;
//! Wait until graph is idle and the number of release_wait calls equals to the number of
//! reserve_wait calls.
/** The waiting thread will go off and steal work while it is blocked in the wait_for_all. */
void wait_for_all() {
cancelled = false;
caught_exception = false;
try_call([this] {
my_task_arena->execute([this] {
wait(my_wait_context, *my_context);
});
cancelled = my_context->is_group_execution_cancelled();
}).on_exception([this] {
my_context->reset();
caught_exception = true;
cancelled = true;
});
// TODO: the "if" condition below is just a work-around to support the concurrent wait
// mode. The cancellation and exception mechanisms are still broken in this mode.
// Consider using task group not to re-implement the same functionality.
if (!(my_context->traits() & task_group_context::concurrent_wait)) {
my_context->reset(); // consistent with behavior in catch()
}
}
// TODO revamp: consider adding getter for task_group_context.
// ITERATORS
template<typename C, typename N>
friend class graph_iterator;
// Graph iterator typedefs
typedef graph_iterator<graph, graph_node> iterator;
typedef graph_iterator<const graph, const graph_node> const_iterator;
// Graph iterator constructors
//! start iterator
iterator begin();
//! end iterator
iterator end();
//! start const iterator
const_iterator begin() const;
//! end const iterator
const_iterator end() const;
//! start const iterator
const_iterator cbegin() const;
//! end const iterator
const_iterator cend() const;
// thread-unsafe state reset.
void reset(reset_flags f = rf_reset_protocol);
//! cancels execution of the associated task_group_context
void cancel();
//! return status of graph execution
bool is_cancelled() { return cancelled; }
bool exception_thrown() { return caught_exception; }
private:
wait_context my_wait_context;
task_group_context *my_context;
bool own_context;
bool cancelled;
bool caught_exception;
bool my_is_active;
graph_node *my_nodes, *my_nodes_last;
tbb::spin_mutex nodelist_mutex;
void register_node(graph_node *n);
void remove_node(graph_node *n);
task_arena* my_task_arena;
graph_task_priority_queue_t my_priority_queue;
friend void activate_graph(graph& g);
friend void deactivate_graph(graph& g);
friend bool is_graph_active(graph& g);
friend graph_task* prioritize_task(graph& g, graph_task& arena_task);
friend void spawn_in_graph_arena(graph& g, graph_task& arena_task);
friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task);
friend class task_arena_base;
}; // class graph
template<typename DerivedType>
inline void graph_task::destruct_and_deallocate(const execution_data& ed) {
auto allocator = my_allocator;
// TODO: investigate if direct call of derived destructor gives any benefits.
this->~graph_task();
allocator.deallocate(static_cast<DerivedType*>(this), ed);
}
template<typename DerivedType>
inline void graph_task::finalize(const execution_data& ed) {
graph& g = my_graph;
destruct_and_deallocate<DerivedType>(ed);
g.release_wait();
}
//********************************************************************************
// end of graph tasks helpers
//********************************************************************************
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
class get_graph_helper;
#endif
//! The base of all graph nodes.
class graph_node : no_copy {
friend class graph;
template<typename C, typename N>
friend class graph_iterator;
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
friend class get_graph_helper;
#endif
protected:
graph& my_graph;
graph& graph_reference() const {
// TODO revamp: propagate graph_reference() method to all the reference places.
return my_graph;
}
graph_node* next = nullptr;
graph_node* prev = nullptr;
public:
explicit graph_node(graph& g);
virtual ~graph_node();
protected:
// performs the reset on an individual node.
virtual void reset_node(reset_flags f = rf_reset_protocol) = 0;
}; // class graph_node
inline void activate_graph(graph& g) {
g.my_is_active = true;
}
inline void deactivate_graph(graph& g) {
g.my_is_active = false;
}
inline bool is_graph_active(graph& g) {
return g.my_is_active;
}
inline graph_task* prioritize_task(graph& g, graph_task& gt) {
if( no_priority == gt.priority )
return &gt;
//! Non-preemptive priority pattern. The original task is submitted as a work item to the
//! priority queue, and a new critical task is created to take and execute a work item with
//! the highest known priority. The reference counting responsibility is transferred (via
//! allocate_continuation) to the new task.
task* critical_task = gt.my_allocator.new_object<priority_task_selector>(g.my_priority_queue, gt.my_allocator);
__TBB_ASSERT( critical_task, "bad_alloc?" );
g.my_priority_queue.push(&gt);
using tbb::detail::d1::submit;
submit( *critical_task, *g.my_task_arena, *g.my_context, /*as_critical=*/true );
return nullptr;
}
//! Spawns a task inside graph arena
inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) {
if (is_graph_active(g)) {
task* gt = prioritize_task(g, arena_task);
if( !gt )
return;
__TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr);
submit( *gt, *g.my_task_arena, *g.my_context
#if __TBB_PREVIEW_CRITICAL_TASKS
, /*as_critical=*/false
#endif
);
}
}
// TODO revamp: unify *_in_graph_arena functions
//! Enqueues a task inside graph arena
inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) {
if (is_graph_active(g)) {
__TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" );
// TODO revamp: decide on the approach that does not postpone critical task
if( task* gt = prioritize_task(g, arena_task) )
submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false);
}
}
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_flow_graph_impl_H
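For orientation before the node implementations that follow, a hedged sketch of the user-level code these helpers ultimately serve; the vendored public header is assumed to live at third_party/tbb/flow_graph.hh in this tree. Each try_put ends up spawning graph tasks in the graph's arena (spawn_in_graph_arena above), and wait_for_all() returns once every reserve_wait() has been balanced by a release_wait().
#include <iostream>
#include "third_party/tbb/flow_graph.hh"     // assumed vendored location of the public header
using namespace tbb::flow;
int main() {
    graph g;                                               // owns the task arena and wait context
    function_node<int, int> doubler(g, unlimited, [](int v) { return 2 * v; });
    function_node<int, int> printer(g, serial, [](int v) {
        std::cout << v << '\n';
        return v;
    });
    make_edge(doubler, printer);
    for (int i = 0; i < 5; ++i)
        doubler.try_put(i);                                // work is executed by graph tasks
    g.wait_for_all();                                      // block until the graph drains
    return 0;
}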

View file

@@ -0,0 +1,352 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__flow_graph_indexer_impl_H
#define __TBB__flow_graph_indexer_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
// included in namespace tbb::detail::d1
#include "third_party/tbb/detail/_flow_graph_types_impl.hh"
// Output of the indexer_node is a tbb::flow::tagged_msg, and will be of
// the form tagged_msg<tag, result>
// where the value of tag will indicate which result was put to the
// successor.
template<typename IndexerNodeBaseType, typename T, size_t K>
graph_task* do_try_put(const T &v, void *p) {
typename IndexerNodeBaseType::output_type o(K, v);
return reinterpret_cast<IndexerNodeBaseType *>(p)->try_put_task(&o);
}
template<typename TupleTypes,int N>
struct indexer_helper {
template<typename IndexerNodeBaseType, typename PortTuple>
static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) {
typedef typename std::tuple_element<N-1, TupleTypes>::type T;
graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put<IndexerNodeBaseType, T, N-1>;
std::get<N-1>(my_input).set_up(p, indexer_node_put_task, g);
indexer_helper<TupleTypes,N-1>::template set_indexer_node_pointer<IndexerNodeBaseType,PortTuple>(my_input, p, g);
}
};
template<typename TupleTypes>
struct indexer_helper<TupleTypes,1> {
template<typename IndexerNodeBaseType, typename PortTuple>
static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) {
typedef typename std::tuple_element<0, TupleTypes>::type T;
graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put<IndexerNodeBaseType, T, 0>;
std::get<0>(my_input).set_up(p, indexer_node_put_task, g);
}
};
template<typename T>
class indexer_input_port : public receiver<T> {
private:
void* my_indexer_ptr;
typedef graph_task* (* forward_function_ptr)(T const &, void* );
forward_function_ptr my_try_put_task;
graph* my_graph;
public:
void set_up(void* p, forward_function_ptr f, graph& g) {
my_indexer_ptr = p;
my_try_put_task = f;
my_graph = &g;
}
protected:
template< typename R, typename B > friend class run_and_put_task;
template<typename X, typename Y> friend class broadcast_cache;
template<typename X, typename Y> friend class round_robin_cache;
graph_task* try_put_task(const T &v) override {
return my_try_put_task(v, my_indexer_ptr);
}
graph& graph_reference() const override {
return *my_graph;
}
};
template<typename InputTuple, typename OutputType, typename StructTypes>
class indexer_node_FE {
public:
static const int N = std::tuple_size<InputTuple>::value;
typedef OutputType output_type;
typedef InputTuple input_type;
// Some versions of Intel(R) C++ Compiler fail to generate an implicit constructor for the class which has std::tuple as a member.
indexer_node_FE() : my_inputs() {}
input_type &input_ports() { return my_inputs; }
protected:
input_type my_inputs;
};
//! indexer_node_base
template<typename InputTuple, typename OutputType, typename StructTypes>
class indexer_node_base : public graph_node, public indexer_node_FE<InputTuple, OutputType,StructTypes>,
public sender<OutputType> {
protected:
using graph_node::my_graph;
public:
static const size_t N = std::tuple_size<InputTuple>::value;
typedef OutputType output_type;
typedef StructTypes tuple_types;
typedef typename sender<output_type>::successor_type successor_type;
typedef indexer_node_FE<InputTuple, output_type,StructTypes> input_ports_type;
private:
// ----------- Aggregator ------------
enum op_type { reg_succ, rem_succ, try__put_task };
typedef indexer_node_base<InputTuple,output_type,StructTypes> class_type;
class indexer_node_base_operation : public aggregated_operation<indexer_node_base_operation> {
public:
char type;
union {
output_type const *my_arg;
successor_type *my_succ;
graph_task* bypass_t;
};
indexer_node_base_operation(const output_type* e, op_type t) :
type(char(t)), my_arg(e) {}
indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)),
my_succ(const_cast<successor_type *>(&s)) {}
};
typedef aggregating_functor<class_type, indexer_node_base_operation> handler_type;
friend class aggregating_functor<class_type, indexer_node_base_operation>;
aggregator<handler_type, indexer_node_base_operation> my_aggregator;
void handle_operations(indexer_node_base_operation* op_list) {
indexer_node_base_operation *current;
while(op_list) {
current = op_list;
op_list = op_list->next;
switch(current->type) {
case reg_succ:
my_successors.register_successor(*(current->my_succ));
current->status.store( SUCCEEDED, std::memory_order_release);
break;
case rem_succ:
my_successors.remove_successor(*(current->my_succ));
current->status.store( SUCCEEDED, std::memory_order_release);
break;
case try__put_task: {
current->bypass_t = my_successors.try_put_task(*(current->my_arg));
current->status.store( SUCCEEDED, std::memory_order_release); // the actual return value of try_put_task is passed back via bypass_t
}
break;
}
}
}
// ---------- end aggregator -----------
public:
indexer_node_base(graph& g) : graph_node(g), input_ports_type(), my_successors(this) {
indexer_helper<StructTypes,N>::set_indexer_node_pointer(this->my_inputs, this, g);
my_aggregator.initialize_handler(handler_type(this));
}
indexer_node_base(const indexer_node_base& other)
: graph_node(other.my_graph), input_ports_type(), sender<output_type>(), my_successors(this)
{
indexer_helper<StructTypes,N>::set_indexer_node_pointer(this->my_inputs, this, other.my_graph);
my_aggregator.initialize_handler(handler_type(this));
}
bool register_successor(successor_type &r) override {
indexer_node_base_operation op_data(r, reg_succ);
my_aggregator.execute(&op_data);
return op_data.status == SUCCEEDED;
}
bool remove_successor( successor_type &r) override {
indexer_node_base_operation op_data(r, rem_succ);
my_aggregator.execute(&op_data);
return op_data.status == SUCCEEDED;
}
graph_task* try_put_task(output_type const *v) { // not a virtual method in this class
indexer_node_base_operation op_data(v, try__put_task);
my_aggregator.execute(&op_data);
return op_data.bypass_t;
}
protected:
void reset_node(reset_flags f) override {
if(f & rf_clear_edges) {
my_successors.clear();
}
}
private:
broadcast_cache<output_type, null_rw_mutex> my_successors;
}; //indexer_node_base
template<int N, typename InputTuple> struct input_types;
template<typename InputTuple>
struct input_types<1, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef tagged_msg<size_t, first_type > type;
};
template<typename InputTuple>
struct input_types<2, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef tagged_msg<size_t, first_type, second_type> type;
};
template<typename InputTuple>
struct input_types<3, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef tagged_msg<size_t, first_type, second_type, third_type> type;
};
template<typename InputTuple>
struct input_types<4, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
typedef tagged_msg<size_t, first_type, second_type, third_type,
fourth_type> type;
};
template<typename InputTuple>
struct input_types<5, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
typedef tagged_msg<size_t, first_type, second_type, third_type,
fourth_type, fifth_type> type;
};
template<typename InputTuple>
struct input_types<6, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
typedef tagged_msg<size_t, first_type, second_type, third_type,
fourth_type, fifth_type, sixth_type> type;
};
template<typename InputTuple>
struct input_types<7, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
typedef tagged_msg<size_t, first_type, second_type, third_type,
fourth_type, fifth_type, sixth_type,
seventh_type> type;
};
template<typename InputTuple>
struct input_types<8, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
typedef typename std::tuple_element<7, InputTuple>::type eighth_type;
typedef tagged_msg<size_t, first_type, second_type, third_type,
fourth_type, fifth_type, sixth_type,
seventh_type, eighth_type> type;
};
template<typename InputTuple>
struct input_types<9, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
typedef typename std::tuple_element<7, InputTuple>::type eighth_type;
typedef typename std::tuple_element<8, InputTuple>::type nineth_type;
typedef tagged_msg<size_t, first_type, second_type, third_type,
fourth_type, fifth_type, sixth_type,
seventh_type, eighth_type, nineth_type> type;
};
template<typename InputTuple>
struct input_types<10, InputTuple> {
typedef typename std::tuple_element<0, InputTuple>::type first_type;
typedef typename std::tuple_element<1, InputTuple>::type second_type;
typedef typename std::tuple_element<2, InputTuple>::type third_type;
typedef typename std::tuple_element<3, InputTuple>::type fourth_type;
typedef typename std::tuple_element<4, InputTuple>::type fifth_type;
typedef typename std::tuple_element<5, InputTuple>::type sixth_type;
typedef typename std::tuple_element<6, InputTuple>::type seventh_type;
typedef typename std::tuple_element<7, InputTuple>::type eighth_type;
typedef typename std::tuple_element<8, InputTuple>::type nineth_type;
typedef typename std::tuple_element<9, InputTuple>::type tenth_type;
typedef tagged_msg<size_t, first_type, second_type, third_type,
fourth_type, fifth_type, sixth_type,
seventh_type, eighth_type, nineth_type,
tenth_type> type;
};
// type generators
template<typename OutputTuple>
struct indexer_types : public input_types<std::tuple_size<OutputTuple>::value, OutputTuple> {
static const int N = std::tuple_size<OutputTuple>::value;
typedef typename input_types<N, OutputTuple>::type output_type;
typedef typename wrap_tuple_elements<N,indexer_input_port,OutputTuple>::type input_ports_type;
typedef indexer_node_FE<input_ports_type,output_type,OutputTuple> indexer_FE_type;
typedef indexer_node_base<input_ports_type, output_type, OutputTuple> indexer_base_type;
};
template<class OutputTuple>
class unfolded_indexer_node : public indexer_types<OutputTuple>::indexer_base_type {
public:
typedef typename indexer_types<OutputTuple>::input_ports_type input_ports_type;
typedef OutputTuple tuple_types;
typedef typename indexer_types<OutputTuple>::output_type output_type;
private:
typedef typename indexer_types<OutputTuple>::indexer_base_type base_type;
public:
unfolded_indexer_node(graph& g) : base_type(g) {}
unfolded_indexer_node(const unfolded_indexer_node &other) : base_type(other) {}
};
#endif /* __TBB__flow_graph_indexer_impl_H */
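A hedged usage sketch of what the indexer machinery above looks like through the public indexer_node API (the include path is again an assumption about this tree): each input port K wraps its message in a tagged_msg whose tag() is K, so a single successor can demultiplex heterogeneous inputs.
#include <iostream>
#include "third_party/tbb/flow_graph.hh"     // assumed vendored location of the public header
using namespace tbb::flow;
int main() {
    graph g;
    typedef indexer_node<int, float> indexer_t;            // port 0 carries int, port 1 carries float
    indexer_t idx(g);
    function_node<indexer_t::output_type> sink(g, serial,
        [](const indexer_t::output_type& msg) {            // msg is a tagged_msg<size_t, int, float>
            if (msg.tag() == 0)
                std::cout << "int: " << cast_to<int>(msg) << '\n';
            else
                std::cout << "float: " << cast_to<float>(msg) << '\n';
        });
    make_edge(idx, sink);
    input_port<0>(idx).try_put(7);                          // delivered with tag 0
    input_port<1>(idx).try_put(2.5f);                       // delivered with tag 1
    g.wait_for_all();
    return 0;
}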

View file

@@ -0,0 +1,280 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__flow_graph_item_buffer_impl_H
#define __TBB__flow_graph_item_buffer_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
#include "third_party/tbb/detail/_aligned_space.hh"
// in namespace tbb::flow::interfaceX (included in _flow_graph_node_impl.h)
//! Expandable buffer of items. The possible operations are push, pop,
//* tests for empty and so forth. No mutual exclusion is built in.
//* objects are constructed into the buffer in place and explicitly destroyed. get_my_item gives
// a read-only reference to the item in the buffer. set_my_item may be called
// with either an empty or occupied slot.
template <typename T, typename A=cache_aligned_allocator<T> >
class item_buffer {
public:
typedef T item_type;
enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 };
protected:
typedef size_t size_type;
typedef std::pair<item_type, buffer_item_state> aligned_space_item;
typedef aligned_space<aligned_space_item> buffer_item_type;
typedef typename allocator_traits<A>::template rebind_alloc<buffer_item_type> allocator_type;
buffer_item_type *my_array;
size_type my_array_size;
static const size_type initial_buffer_size = 4;
size_type my_head;
size_type my_tail;
bool buffer_empty() const { return my_head == my_tail; }
aligned_space_item &item(size_type i) {
__TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of<buffer_item_state>::value), nullptr);
__TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of<item_type>::value), nullptr);
return *my_array[i & (my_array_size - 1) ].begin();
}
const aligned_space_item &item(size_type i) const {
__TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of<buffer_item_state>::value), nullptr);
__TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of<item_type>::value), nullptr);
return *my_array[i & (my_array_size-1)].begin();
}
bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); }
#if TBB_USE_ASSERT
bool my_item_reserved(size_type i) const { return item(i).second == reserved_item; }
#endif
// object management in buffer
const item_type &get_my_item(size_t i) const {
__TBB_ASSERT(my_item_valid(i),"attempt to get invalid item");
item_type* itm = const_cast<item_type*>(reinterpret_cast<const item_type*>(&item(i).first));
return *itm;
}
// may be called with an empty slot or a slot that has already been constructed into.
void set_my_item(size_t i, const item_type &o) {
if(item(i).second != no_item) {
destroy_item(i);
}
new(&(item(i).first)) item_type(o);
item(i).second = has_item;
}
// destructively-fetch an object from the buffer
void fetch_item(size_t i, item_type &o) {
__TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot");
o = get_my_item(i); // could have std::move assign semantics
destroy_item(i);
}
// move an existing item from one slot to another. The moved-to slot must be unoccupied,
// the moved-from slot must exist and not be reserved. Afterwards, from will be empty and
// to will be occupied but not reserved.
void move_item(size_t to, size_t from) {
__TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot");
__TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot");
set_my_item(to, get_my_item(from)); // could have std::move semantics
destroy_item(from);
}
// put an item in an empty slot. Return true if successful, else false
bool place_item(size_t here, const item_type &me) {
#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES
if(my_item_valid(here)) return false;
#endif
set_my_item(here, me);
return true;
}
// could be implemented with std::move semantics
void swap_items(size_t i, size_t j) {
__TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)");
item_type temp = get_my_item(i);
set_my_item(i, get_my_item(j));
set_my_item(j, temp);
}
void destroy_item(size_type i) {
__TBB_ASSERT(my_item_valid(i), "destruction of invalid item");
item(i).first.~item_type();
item(i).second = no_item;
}
// returns the front element
const item_type& front() const
{
__TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item");
return get_my_item(my_head);
}
// returns the back element
const item_type& back() const
{
__TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch head non-item");
return get_my_item(my_tail - 1);
}
// following methods are for reservation of the front of a buffer.
void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); item(i).second = reserved_item; }
void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); item(i).second = has_item; }
void destroy_front() { destroy_item(my_head); ++my_head; }
void destroy_back() { destroy_item(my_tail-1); --my_tail; }
// we have to be able to test against a new tail value without changing my_tail
// grow_array doesn't work if we change my_tail when the old array is too small
size_type size(size_t new_tail = 0) { return (new_tail ? new_tail : my_tail) - my_head; }
size_type capacity() { return my_array_size; }
// sequencer_node does not use this method, so we don't
// need a version that passes in the new_tail value.
bool buffer_full() { return size() >= capacity(); }
//! Grows the internal array.
void grow_my_array( size_t minimum_size ) {
// test that we haven't made the structure inconsistent.
__TBB_ASSERT(capacity() >= my_tail - my_head, "total items exceed capacity");
size_type new_size = my_array_size ? 2*my_array_size : initial_buffer_size;
while( new_size<minimum_size )
new_size*=2;
buffer_item_type* new_array = allocator_type().allocate(new_size);
// initialize validity to "no"
for( size_type i=0; i<new_size; ++i ) { new_array[i].begin()->second = no_item; }
for( size_type i=my_head; i<my_tail; ++i) {
if(my_item_valid(i)) { // sequencer_node may have empty slots
// placement-new copy-construct; could be std::move
char *new_space = (char *)&(new_array[i&(new_size-1)].begin()->first);
(void)new(new_space) item_type(get_my_item(i));
new_array[i&(new_size-1)].begin()->second = item(i).second;
}
}
clean_up_buffer(/*reset_pointers*/false);
my_array = new_array;
my_array_size = new_size;
}
bool push_back(item_type &v) {
if(buffer_full()) {
grow_my_array(size() + 1);
}
set_my_item(my_tail, v);
++my_tail;
return true;
}
bool pop_back(item_type &v) {
if (!my_item_valid(my_tail-1)) {
return false;
}
v = this->back();
destroy_back();
return true;
}
bool pop_front(item_type &v) {
if(!my_item_valid(my_head)) {
return false;
}
v = this->front();
destroy_front();
return true;
}
// This is used both for reset and for grow_my_array. In the case of grow_my_array
// we want to retain the values of the head and tail.
void clean_up_buffer(bool reset_pointers) {
if (my_array) {
for( size_type i=my_head; i<my_tail; ++i ) {
if(my_item_valid(i))
destroy_item(i);
}
allocator_type().deallocate(my_array,my_array_size);
}
my_array = nullptr;
if(reset_pointers) {
my_head = my_tail = my_array_size = 0;
}
}
public:
//! Constructor
item_buffer( ) : my_array(nullptr), my_array_size(0),
my_head(0), my_tail(0) {
grow_my_array(initial_buffer_size);
}
~item_buffer() {
clean_up_buffer(/*reset_pointers*/true);
}
void reset() { clean_up_buffer(/*reset_pointers*/true); grow_my_array(initial_buffer_size); }
};
//! item_buffer with reservable front-end. NOTE: if reserving, do not
//* complete operation with pop_front(); use consume_front().
//* No synchronization built-in.
template<typename T, typename A=cache_aligned_allocator<T> >
class reservable_item_buffer : public item_buffer<T, A> {
protected:
using item_buffer<T, A>::my_item_valid;
using item_buffer<T, A>::my_head;
public:
reservable_item_buffer() : item_buffer<T, A>(), my_reserved(false) {}
void reset() {my_reserved = false; item_buffer<T,A>::reset(); }
protected:
bool reserve_front(T &v) {
if(my_reserved || !my_item_valid(this->my_head)) return false;
my_reserved = true;
// reserving the head
v = this->front();
this->reserve_item(this->my_head);
return true;
}
void consume_front() {
__TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item");
this->destroy_front();
my_reserved = false;
}
void release_front() {
__TBB_ASSERT(my_reserved, "Attempt to release a non-reserved item");
this->release_item(this->my_head);
my_reserved = false;
}
bool my_reserved;
};
#endif // __TBB__flow_graph_item_buffer_impl_H
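The slot addressing above only works because the capacity is always a power of two: item(i) masks with my_array_size - 1, and grow_my_array() doubles the capacity and re-places live items under the new mask. A standalone sketch of that idea (invented names, not TBB code), reduced to ints:
#include <cassert>
#include <cstddef>
#include <vector>
// Unbounded FIFO over a power-of-two array: index i lives in slot i & (capacity - 1),
// so head and tail are monotonically increasing counters that never wrap explicitly.
class ring_buffer {
    std::vector<int> slots_ = std::vector<int>(4);          // capacity stays a power of two
    std::size_t head_ = 0, tail_ = 0;
    int& slot(std::size_t i) { return slots_[i & (slots_.size() - 1)]; }
    void grow() {
        std::vector<int> bigger(slots_.size() * 2);
        for (std::size_t i = head_; i < tail_; ++i)
            bigger[i & (bigger.size() - 1)] = slot(i);      // re-place live items under the new mask
        slots_.swap(bigger);
    }
public:
    void push_back(int v) {
        if (tail_ - head_ == slots_.size()) grow();         // full: double, like grow_my_array()
        slot(tail_++) = v;
    }
    int pop_front() {
        assert(head_ != tail_ && "pop from empty buffer");
        return slot(head_++);
    }
};
int main() {
    ring_buffer b;
    for (int i = 0; i < 10; ++i) b.push_back(i);            // forces growth from 4 to 8, then to 16
    assert(b.pop_front() == 0);
    assert(b.pop_front() == 1);
    return 0;
}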

File diff suppressed because it is too large

View file

@@ -0,0 +1,775 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__flow_graph_node_impl_H
#define __TBB__flow_graph_node_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
#include "third_party/tbb/detail/_flow_graph_item_buffer_impl.hh"
template< typename T, typename A >
class function_input_queue : public item_buffer<T,A> {
public:
bool empty() const {
return this->buffer_empty();
}
const T& front() const {
return this->item_buffer<T, A>::front();
}
void pop() {
this->destroy_front();
}
bool push( T& t ) {
return this->push_back( t );
}
};
//! Input and scheduling for a function node that takes a type Input as input
// The only up-ref is apply_body_impl, which should implement the function
// call and any handling of the result.
template< typename Input, typename Policy, typename A, typename ImplType >
class function_input_base : public receiver<Input>, no_assign {
enum op_type {reg_pred, rem_pred, try_fwd, tryput_bypass, app_body_bypass, occupy_concurrency };
typedef function_input_base<Input, Policy, A, ImplType> class_type;
public:
//! The input type of this receiver
typedef Input input_type;
typedef typename receiver<input_type>::predecessor_type predecessor_type;
typedef predecessor_cache<input_type, null_mutex > predecessor_cache_type;
typedef function_input_queue<input_type, A> input_queue_type;
typedef typename allocator_traits<A>::template rebind_alloc<input_queue_type> allocator_type;
static_assert(!has_policy<queueing, Policy>::value || !has_policy<rejecting, Policy>::value, "");
//! Constructor for function_input_base
function_input_base( graph &g, size_t max_concurrency, node_priority_t a_priority, bool is_no_throw )
: my_graph_ref(g), my_max_concurrency(max_concurrency)
, my_concurrency(0), my_priority(a_priority), my_is_no_throw(is_no_throw)
, my_queue(!has_policy<rejecting, Policy>::value ? new input_queue_type() : nullptr)
, my_predecessors(this)
, forwarder_busy(false)
{
my_aggregator.initialize_handler(handler_type(this));
}
//! Copy constructor
function_input_base( const function_input_base& src )
: function_input_base(src.my_graph_ref, src.my_max_concurrency, src.my_priority, src.my_is_no_throw) {}
//! Destructor
// The queue is allocated by the constructor for {multi}function_node.
// TODO: pass the graph_buffer_policy to the base so it can allocate the queue instead.
// This would be an interface-breaking change.
virtual ~function_input_base() {
delete my_queue;
my_queue = nullptr;
}
graph_task* try_put_task( const input_type& t) override {
if ( my_is_no_throw )
return try_put_task_impl(t, has_policy<lightweight, Policy>());
else
return try_put_task_impl(t, std::false_type());
}
//! Adds src to the list of cached predecessors.
bool register_predecessor( predecessor_type &src ) override {
operation_type op_data(reg_pred);
op_data.r = &src;
my_aggregator.execute(&op_data);
return true;
}
//! Removes src from the list of cached predecessors.
bool remove_predecessor( predecessor_type &src ) override {
operation_type op_data(rem_pred);
op_data.r = &src;
my_aggregator.execute(&op_data);
return true;
}
protected:
void reset_function_input_base( reset_flags f) {
my_concurrency = 0;
if(my_queue) {
my_queue->reset();
}
reset_receiver(f);
forwarder_busy = false;
}
graph& my_graph_ref;
const size_t my_max_concurrency;
size_t my_concurrency;
node_priority_t my_priority;
const bool my_is_no_throw;
input_queue_type *my_queue;
predecessor_cache<input_type, null_mutex > my_predecessors;
void reset_receiver( reset_flags f) {
if( f & rf_clear_edges) my_predecessors.clear();
else
my_predecessors.reset();
__TBB_ASSERT(!(f & rf_clear_edges) || my_predecessors.empty(), "function_input_base reset failed");
}
graph& graph_reference() const override {
return my_graph_ref;
}
graph_task* try_get_postponed_task(const input_type& i) {
operation_type op_data(i, app_body_bypass); // tries to pop an item or get_item
my_aggregator.execute(&op_data);
return op_data.bypass_t;
}
private:
friend class apply_body_task_bypass< class_type, input_type >;
friend class forward_task_bypass< class_type >;
class operation_type : public aggregated_operation< operation_type > {
public:
char type;
union {
input_type *elem;
predecessor_type *r;
};
graph_task* bypass_t;
operation_type(const input_type& e, op_type t) :
type(char(t)), elem(const_cast<input_type*>(&e)), bypass_t(nullptr) {}
operation_type(op_type t) : type(char(t)), r(nullptr), bypass_t(nullptr) {}
};
bool forwarder_busy;
typedef aggregating_functor<class_type, operation_type> handler_type;
friend class aggregating_functor<class_type, operation_type>;
aggregator< handler_type, operation_type > my_aggregator;
graph_task* perform_queued_requests() {
graph_task* new_task = nullptr;
if(my_queue) {
if(!my_queue->empty()) {
++my_concurrency;
new_task = create_body_task(my_queue->front());
my_queue->pop();
}
}
else {
input_type i;
if(my_predecessors.get_item(i)) {
++my_concurrency;
new_task = create_body_task(i);
}
}
return new_task;
}
void handle_operations(operation_type *op_list) {
operation_type* tmp;
while (op_list) {
tmp = op_list;
op_list = op_list->next;
switch (tmp->type) {
case reg_pred:
my_predecessors.add(*(tmp->r));
tmp->status.store(SUCCEEDED, std::memory_order_release);
if (!forwarder_busy) {
forwarder_busy = true;
spawn_forward_task();
}
break;
case rem_pred:
my_predecessors.remove(*(tmp->r));
tmp->status.store(SUCCEEDED, std::memory_order_release);
break;
case app_body_bypass: {
tmp->bypass_t = nullptr;
__TBB_ASSERT(my_max_concurrency != 0, nullptr);
--my_concurrency;
if(my_concurrency<my_max_concurrency)
tmp->bypass_t = perform_queued_requests();
tmp->status.store(SUCCEEDED, std::memory_order_release);
}
break;
case tryput_bypass: internal_try_put_task(tmp); break;
case try_fwd: internal_forward(tmp); break;
case occupy_concurrency:
if (my_concurrency < my_max_concurrency) {
++my_concurrency;
tmp->status.store(SUCCEEDED, std::memory_order_release);
} else {
tmp->status.store(FAILED, std::memory_order_release);
}
break;
}
}
}
//! Put to the node, but return the task instead of enqueueing it
void internal_try_put_task(operation_type *op) {
__TBB_ASSERT(my_max_concurrency != 0, nullptr);
if (my_concurrency < my_max_concurrency) {
++my_concurrency;
graph_task * new_task = create_body_task(*(op->elem));
op->bypass_t = new_task;
op->status.store(SUCCEEDED, std::memory_order_release);
} else if ( my_queue && my_queue->push(*(op->elem)) ) {
op->bypass_t = SUCCESSFULLY_ENQUEUED;
op->status.store(SUCCEEDED, std::memory_order_release);
} else {
op->bypass_t = nullptr;
op->status.store(FAILED, std::memory_order_release);
}
}
//! Creates tasks for postponed messages if available and if concurrency allows
void internal_forward(operation_type *op) {
op->bypass_t = nullptr;
if (my_concurrency < my_max_concurrency)
op->bypass_t = perform_queued_requests();
if(op->bypass_t)
op->status.store(SUCCEEDED, std::memory_order_release);
else {
forwarder_busy = false;
op->status.store(FAILED, std::memory_order_release);
}
}
graph_task* internal_try_put_bypass( const input_type& t ) {
operation_type op_data(t, tryput_bypass);
my_aggregator.execute(&op_data);
if( op_data.status == SUCCEEDED ) {
return op_data.bypass_t;
}
return nullptr;
}
graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type ) {
if( my_max_concurrency == 0 ) {
return apply_body_bypass(t);
} else {
operation_type check_op(t, occupy_concurrency);
my_aggregator.execute(&check_op);
if( check_op.status == SUCCEEDED ) {
return apply_body_bypass(t);
}
return internal_try_put_bypass(t);
}
}
graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type ) {
if( my_max_concurrency == 0 ) {
return create_body_task(t);
} else {
return internal_try_put_bypass(t);
}
}
//! Applies the body to the provided input
// then decides if more work is available
graph_task* apply_body_bypass( const input_type &i ) {
return static_cast<ImplType *>(this)->apply_body_impl_bypass(i);
}
//! allocates a task to apply a body
graph_task* create_body_task( const input_type &input ) {
if (!is_graph_active(my_graph_ref)) {
return nullptr;
}
// TODO revamp: extract helper for common graph task allocation part
small_object_allocator allocator{};
typedef apply_body_task_bypass<class_type, input_type> task_type;
graph_task* t = allocator.new_object<task_type>( my_graph_ref, allocator, *this, input, my_priority );
graph_reference().reserve_wait();
return t;
}
//! This is executed by an enqueued task, the "forwarder"
graph_task* forward_task() {
operation_type op_data(try_fwd);
graph_task* rval = nullptr;
do {
op_data.status = WAIT;
my_aggregator.execute(&op_data);
if(op_data.status == SUCCEEDED) {
graph_task* ttask = op_data.bypass_t;
__TBB_ASSERT( ttask && ttask != SUCCESSFULLY_ENQUEUED, nullptr);
rval = combine_tasks(my_graph_ref, rval, ttask);
}
} while (op_data.status == SUCCEEDED);
return rval;
}
inline graph_task* create_forward_task() {
if (!is_graph_active(my_graph_ref)) {
return nullptr;
}
small_object_allocator allocator{};
typedef forward_task_bypass<class_type> task_type;
graph_task* t = allocator.new_object<task_type>( graph_reference(), allocator, *this, my_priority );
graph_reference().reserve_wait();
return t;
}
//! Spawns a task that calls forward()
inline void spawn_forward_task() {
graph_task* tp = create_forward_task();
if(tp) {
spawn_in_graph_arena(graph_reference(), *tp);
}
}
node_priority_t priority() const override { return my_priority; }
}; // function_input_base
//! Implements methods for a function node that takes a type Input as input and sends
// a type Output to its successors.
template< typename Input, typename Output, typename Policy, typename A>
class function_input : public function_input_base<Input, Policy, A, function_input<Input,Output,Policy,A> > {
public:
typedef Input input_type;
typedef Output output_type;
typedef function_body<input_type, output_type> function_body_type;
typedef function_input<Input, Output, Policy,A> my_class;
typedef function_input_base<Input, Policy, A, my_class> base_type;
typedef function_input_queue<input_type, A> input_queue_type;
// constructor
template<typename Body>
function_input(
graph &g, size_t max_concurrency, Body& body, node_priority_t a_priority )
: base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type())))
, my_body( new function_body_leaf< input_type, output_type, Body>(body) )
, my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) {
}
//! Copy constructor
function_input( const function_input& src ) :
base_type(src),
my_body( src.my_init_body->clone() ),
my_init_body(src.my_init_body->clone() ) {
}
#if __INTEL_COMPILER <= 2021
// Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited
// class while the parent class has the virtual keyword for the destructor.
virtual
#endif
~function_input() {
delete my_body;
delete my_init_body;
}
template< typename Body >
Body copy_function_object() {
function_body_type &body_ref = *this->my_body;
return dynamic_cast< function_body_leaf<input_type, output_type, Body> & >(body_ref).get_body();
}
output_type apply_body_impl( const input_type& i) {
// There is an extra copy needed to capture the
// body execution without the try_put
fgt_begin_body( my_body );
output_type v = tbb::detail::invoke(*my_body, i);
fgt_end_body( my_body );
return v;
}
//TODO: consider moving into the base class
graph_task* apply_body_impl_bypass( const input_type &i) {
output_type v = apply_body_impl(i);
graph_task* postponed_task = nullptr;
if( base_type::my_max_concurrency != 0 ) {
postponed_task = base_type::try_get_postponed_task(i);
__TBB_ASSERT( !postponed_task || postponed_task != SUCCESSFULLY_ENQUEUED, nullptr);
}
if( postponed_task ) {
// make the task available for other workers since we do not know successors'
// execution policy
spawn_in_graph_arena(base_type::graph_reference(), *postponed_task);
}
graph_task* successor_task = successors().try_put_task(v);
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning (push)
#pragma warning (disable: 4127) /* suppress conditional expression is constant */
#endif
if(has_policy<lightweight, Policy>::value) {
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning (pop)
#endif
if(!successor_task) {
// Return confirmative status since current
// node's body has been executed anyway
successor_task = SUCCESSFULLY_ENQUEUED;
}
}
return successor_task;
}
protected:
void reset_function_input(reset_flags f) {
base_type::reset_function_input_base(f);
if(f & rf_reset_bodies) {
function_body_type *tmp = my_init_body->clone();
delete my_body;
my_body = tmp;
}
}
function_body_type *my_body;
function_body_type *my_init_body;
virtual broadcast_cache<output_type > &successors() = 0;
}; // function_input
// helper templates to clear the successor edges of the output ports of a multifunction_node
template<int N> struct clear_element {
template<typename P> static void clear_this(P &p) {
(void)std::get<N-1>(p).successors().clear();
clear_element<N-1>::clear_this(p);
}
#if TBB_USE_ASSERT
template<typename P> static bool this_empty(P &p) {
if(std::get<N-1>(p).successors().empty())
return clear_element<N-1>::this_empty(p);
return false;
}
#endif
};
template<> struct clear_element<1> {
template<typename P> static void clear_this(P &p) {
(void)std::get<0>(p).successors().clear();
}
#if TBB_USE_ASSERT
template<typename P> static bool this_empty(P &p) {
return std::get<0>(p).successors().empty();
}
#endif
};
template <typename OutputTuple>
struct init_output_ports {
template <typename... Args>
static OutputTuple call(graph& g, const std::tuple<Args...>&) {
return OutputTuple(Args(g)...);
}
}; // struct init_output_ports
//! Implements methods for a function node that takes a type Input as input
// and has a tuple of output ports specified.
template< typename Input, typename OutputPortSet, typename Policy, typename A>
class multifunction_input : public function_input_base<Input, Policy, A, multifunction_input<Input,OutputPortSet,Policy,A> > {
public:
static const int N = std::tuple_size<OutputPortSet>::value;
typedef Input input_type;
typedef OutputPortSet output_ports_type;
typedef multifunction_body<input_type, output_ports_type> multifunction_body_type;
typedef multifunction_input<Input, OutputPortSet, Policy, A> my_class;
typedef function_input_base<Input, Policy, A, my_class> base_type;
typedef function_input_queue<input_type, A> input_queue_type;
// constructor
template<typename Body>
multifunction_input(graph &g, size_t max_concurrency,Body& body, node_priority_t a_priority )
: base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type(), my_output_ports)))
, my_body( new multifunction_body_leaf<input_type, output_ports_type, Body>(body) )
, my_init_body( new multifunction_body_leaf<input_type, output_ports_type, Body>(body) )
, my_output_ports(init_output_ports<output_ports_type>::call(g, my_output_ports)){
}
//! Copy constructor
multifunction_input( const multifunction_input& src ) :
base_type(src),
my_body( src.my_init_body->clone() ),
my_init_body(src.my_init_body->clone() ),
my_output_ports( init_output_ports<output_ports_type>::call(src.my_graph_ref, my_output_ports) ) {
}
~multifunction_input() {
delete my_body;
delete my_init_body;
}
template< typename Body >
Body copy_function_object() {
multifunction_body_type &body_ref = *this->my_body;
return *static_cast<Body*>(dynamic_cast< multifunction_body_leaf<input_type, output_ports_type, Body> & >(body_ref).get_body_ptr());
}
// for multifunction nodes we do not have a single successor as such. So we just tell
// the task we were successful.
//TODO: consider moving common parts with implementation in function_input into separate function
graph_task* apply_body_impl_bypass( const input_type &i ) {
fgt_begin_body( my_body );
(*my_body)(i, my_output_ports);
fgt_end_body( my_body );
graph_task* ttask = nullptr;
if(base_type::my_max_concurrency != 0) {
ttask = base_type::try_get_postponed_task(i);
}
return ttask ? ttask : SUCCESSFULLY_ENQUEUED;
}
output_ports_type &output_ports(){ return my_output_ports; }
protected:
void reset(reset_flags f) {
base_type::reset_function_input_base(f);
if(f & rf_clear_edges)clear_element<N>::clear_this(my_output_ports);
if(f & rf_reset_bodies) {
multifunction_body_type* tmp = my_init_body->clone();
delete my_body;
my_body = tmp;
}
__TBB_ASSERT(!(f & rf_clear_edges) || clear_element<N>::this_empty(my_output_ports), "multifunction_node reset failed");
}
multifunction_body_type *my_body;
multifunction_body_type *my_init_body;
output_ports_type my_output_ports;
}; // multifunction_input
// template to refer to an output port of a multifunction_node
template<size_t N, typename MOP>
typename std::tuple_element<N, typename MOP::output_ports_type>::type &output_port(MOP &op) {
return std::get<N>(op.output_ports());
}
inline void check_task_and_spawn(graph& g, graph_task* t) {
if (t && t != SUCCESSFULLY_ENQUEUED) {
spawn_in_graph_arena(g, *t);
}
}
// helper structs for split_node
template<int N>
struct emit_element {
template<typename T, typename P>
static graph_task* emit_this(graph& g, const T &t, P &p) {
// TODO: consider to collect all the tasks in task_list and spawn them all at once
graph_task* last_task = std::get<N-1>(p).try_put_task(std::get<N-1>(t));
check_task_and_spawn(g, last_task);
return emit_element<N-1>::emit_this(g,t,p);
}
};
template<>
struct emit_element<1> {
template<typename T, typename P>
static graph_task* emit_this(graph& g, const T &t, P &p) {
graph_task* last_task = std::get<0>(p).try_put_task(std::get<0>(t));
check_task_and_spawn(g, last_task);
return SUCCESSFULLY_ENQUEUED;
}
};
//! Implements methods for an executable node that takes continue_msg as input
template< typename Output, typename Policy>
class continue_input : public continue_receiver {
public:
//! The input type of this receiver
typedef continue_msg input_type;
//! The output type of this receiver
typedef Output output_type;
typedef function_body<input_type, output_type> function_body_type;
typedef continue_input<output_type, Policy> class_type;
template< typename Body >
continue_input( graph &g, Body& body, node_priority_t a_priority )
: continue_receiver(/*number_of_predecessors=*/0, a_priority)
, my_graph_ref(g)
, my_body( new function_body_leaf< input_type, output_type, Body>(body) )
, my_init_body( new function_body_leaf< input_type, output_type, Body>(body) )
{ }
template< typename Body >
continue_input( graph &g, int number_of_predecessors,
Body& body, node_priority_t a_priority )
: continue_receiver( number_of_predecessors, a_priority )
, my_graph_ref(g)
, my_body( new function_body_leaf< input_type, output_type, Body>(body) )
, my_init_body( new function_body_leaf< input_type, output_type, Body>(body) )
{ }
continue_input( const continue_input& src ) : continue_receiver(src),
my_graph_ref(src.my_graph_ref),
my_body( src.my_init_body->clone() ),
my_init_body( src.my_init_body->clone() ) {}
~continue_input() {
delete my_body;
delete my_init_body;
}
template< typename Body >
Body copy_function_object() {
function_body_type &body_ref = *my_body;
return dynamic_cast< function_body_leaf<input_type, output_type, Body> & >(body_ref).get_body();
}
void reset_receiver( reset_flags f) override {
continue_receiver::reset_receiver(f);
if(f & rf_reset_bodies) {
function_body_type *tmp = my_init_body->clone();
delete my_body;
my_body = tmp;
}
}
protected:
graph& my_graph_ref;
function_body_type *my_body;
function_body_type *my_init_body;
virtual broadcast_cache<output_type > &successors() = 0;
friend class apply_body_task_bypass< class_type, continue_msg >;
//! Applies the body to the provided input
graph_task* apply_body_bypass( input_type ) {
// There is an extra copy needed to capture the
// body execution without the try_put
fgt_begin_body( my_body );
output_type v = (*my_body)( continue_msg() );
fgt_end_body( my_body );
return successors().try_put_task( v );
}
graph_task* execute() override {
if(!is_graph_active(my_graph_ref)) {
return nullptr;
}
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning (push)
#pragma warning (disable: 4127) /* suppress conditional expression is constant */
#endif
if(has_policy<lightweight, Policy>::value) {
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning (pop)
#endif
return apply_body_bypass( continue_msg() );
}
else {
small_object_allocator allocator{};
typedef apply_body_task_bypass<class_type, continue_msg> task_type;
graph_task* t = allocator.new_object<task_type>( graph_reference(), allocator, *this, continue_msg(), my_priority );
graph_reference().reserve_wait();
return t;
}
}
graph& graph_reference() const override {
return my_graph_ref;
}
}; // continue_input
//! Implements methods for both executable and function nodes that puts Output to its successors
template< typename Output >
class function_output : public sender<Output> {
public:
template<int N> friend struct clear_element;
typedef Output output_type;
typedef typename sender<output_type>::successor_type successor_type;
typedef broadcast_cache<output_type> broadcast_cache_type;
function_output(graph& g) : my_successors(this), my_graph_ref(g) {}
function_output(const function_output& other) = delete;
//! Adds a new successor to this node
bool register_successor( successor_type &r ) override {
successors().register_successor( r );
return true;
}
//! Removes a successor from this node
bool remove_successor( successor_type &r ) override {
successors().remove_successor( r );
return true;
}
broadcast_cache_type &successors() { return my_successors; }
graph& graph_reference() const { return my_graph_ref; }
protected:
broadcast_cache_type my_successors;
graph& my_graph_ref;
}; // function_output
template< typename Output >
class multifunction_output : public function_output<Output> {
public:
typedef Output output_type;
typedef function_output<output_type> base_type;
using base_type::my_successors;
multifunction_output(graph& g) : base_type(g) {}
multifunction_output(const multifunction_output& other) : base_type(other.my_graph_ref) {}
bool try_put(const output_type &i) {
graph_task *res = try_put_task(i);
if( !res ) return false;
if( res != SUCCESSFULLY_ENQUEUED ) {
// wrapping in task_arena::execute() is not needed since the method is called from
// inside task::execute()
spawn_in_graph_arena(graph_reference(), *res);
}
return true;
}
using base_type::graph_reference;
protected:
graph_task* try_put_task(const output_type &i) {
return my_successors.try_put_task(i);
}
template <int N> friend struct emit_element;
}; // multifunction_output
//composite_node
template<typename CompositeType>
void add_nodes_impl(CompositeType*, bool) {}
template< typename CompositeType, typename NodeType1, typename... NodeTypes >
void add_nodes_impl(CompositeType *c_node, bool visible, const NodeType1& n1, const NodeTypes&... n) {
void *addr = const_cast<NodeType1 *>(&n1);
fgt_alias_port(c_node, addr, visible);
add_nodes_impl(c_node, visible, n...);
}
#endif // __TBB__flow_graph_node_impl_H
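A hedged sketch of the user-facing shape of multifunction_input/multifunction_output above (include path assumed as before): a multifunction_node body receives the whole output_ports_type tuple and pushes to whichever ports it chooses, and output_port<N>() from this file selects a port for make_edge.
#include <iostream>
#include <tuple>
#include "third_party/tbb/flow_graph.hh"     // assumed vendored location of the public header
using namespace tbb::flow;
int main() {
    graph g;
    // Input type int, two output ports: evens go to port 0, odds to port 1.
    typedef multifunction_node<int, std::tuple<int, int>> splitter_t;
    splitter_t splitter(g, unlimited,
        [](const int& v, splitter_t::output_ports_type& ports) {
            if (v % 2 == 0) std::get<0>(ports).try_put(v);
            else            std::get<1>(ports).try_put(v);
        });
    function_node<int, int> evens(g, serial, [](int v) { std::cout << "even " << v << '\n'; return v; });
    function_node<int, int> odds (g, serial, [](int v) { std::cout << "odd "  << v << '\n'; return v; });
    make_edge(output_port<0>(splitter), evens);
    make_edge(output_port<1>(splitter), odds);
    for (int i = 0; i < 6; ++i) splitter.try_put(i);
    g.wait_for_all();
    return 0;
}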

View file

@@ -0,0 +1,266 @@
// clang-format off
/*
Copyright (c) 2020-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_flow_graph_node_set_impl_H
#define __TBB_flow_graph_node_set_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
// Included in namespace tbb::detail::d1 (in flow_graph.h)
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
// Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get
// Seems like the well-formed expression in trailing decltype is treated as ill-formed
// TODO: investigate problems with decltype in trailing return types or find the cross-platform solution
#define __TBB_MSVC_DISABLE_TRAILING_DECLTYPE (_MSC_VER >= 1900)
namespace order {
struct undefined {};
struct following {};
struct preceding {};
}
class get_graph_helper {
public:
// TODO: consider making graph_reference() public and consistent interface to get a reference to the graph
// and remove get_graph_helper
template <typename T>
static graph& get(const T& object) {
return get_impl(object, std::is_base_of<graph_node, T>());
}
private:
// Get graph from the object of type derived from graph_node
template <typename T>
static graph& get_impl(const T& object, std::true_type) {
return static_cast<const graph_node*>(&object)->my_graph;
}
template <typename T>
static graph& get_impl(const T& object, std::false_type) {
return object.graph_reference();
}
};
template<typename Order, typename... Nodes>
struct node_set {
typedef Order order_type;
std::tuple<Nodes&...> nodes;
node_set(Nodes&... ns) : nodes(ns...) {}
template <typename... Nodes2>
node_set(const node_set<order::undefined, Nodes2...>& set) : nodes(set.nodes) {}
graph& graph_reference() const {
return get_graph_helper::get(std::get<0>(nodes));
}
};
namespace alias_helpers {
template <typename T> using output_type = typename T::output_type;
template <typename T> using output_ports_type = typename T::output_ports_type;
template <typename T> using input_type = typename T::input_type;
template <typename T> using input_ports_type = typename T::input_ports_type;
} // namespace alias_helpers
template <typename T>
using has_output_type = supports<T, alias_helpers::output_type>;
template <typename T>
using has_input_type = supports<T, alias_helpers::input_type>;
template <typename T>
using has_input_ports_type = supports<T, alias_helpers::input_ports_type>;
template <typename T>
using has_output_ports_type = supports<T, alias_helpers::output_ports_type>;
template<typename T>
struct is_sender : std::is_base_of<sender<typename T::output_type>, T> {};
template<typename T>
struct is_receiver : std::is_base_of<receiver<typename T::input_type>, T> {};
template <typename Node>
struct is_async_node : std::false_type {};
template <typename... Args>
struct is_async_node<async_node<Args...>> : std::true_type {};
template<typename FirstPredecessor, typename... Predecessors>
node_set<order::following, FirstPredecessor, Predecessors...>
follows(FirstPredecessor& first_predecessor, Predecessors&... predecessors) {
static_assert((conjunction<has_output_type<FirstPredecessor>,
has_output_type<Predecessors>...>::value),
"Not all node's predecessors has output_type typedef");
static_assert((conjunction<is_sender<FirstPredecessor>, is_sender<Predecessors>...>::value),
"Not all node's predecessors are senders");
return node_set<order::following, FirstPredecessor, Predecessors...>(first_predecessor, predecessors...);
}
template<typename... Predecessors>
node_set<order::following, Predecessors...>
follows(node_set<order::undefined, Predecessors...>& predecessors_set) {
static_assert((conjunction<has_output_type<Predecessors>...>::value),
"Not all nodes in the set has output_type typedef");
static_assert((conjunction<is_sender<Predecessors>...>::value),
"Not all nodes in the set are senders");
return node_set<order::following, Predecessors...>(predecessors_set);
}
template<typename FirstSuccessor, typename... Successors>
node_set<order::preceding, FirstSuccessor, Successors...>
precedes(FirstSuccessor& first_successor, Successors&... successors) {
static_assert((conjunction<has_input_type<FirstSuccessor>,
has_input_type<Successors>...>::value),
"Not all node's successors has input_type typedef");
static_assert((conjunction<is_receiver<FirstSuccessor>, is_receiver<Successors>...>::value),
"Not all node's successors are receivers");
return node_set<order::preceding, FirstSuccessor, Successors...>(first_successor, successors...);
}
template<typename... Successors>
node_set<order::preceding, Successors...>
precedes(node_set<order::undefined, Successors...>& successors_set) {
static_assert((conjunction<has_input_type<Successors>...>::value),
"Not all nodes in the set has input_type typedef");
static_assert((conjunction<is_receiver<Successors>...>::value),
"Not all nodes in the set are receivers");
return node_set<order::preceding, Successors...>(successors_set);
}
template <typename Node, typename... Nodes>
node_set<order::undefined, Node, Nodes...>
make_node_set(Node& first_node, Nodes&... nodes) {
return node_set<order::undefined, Node, Nodes...>(first_node, nodes...);
}
template<size_t I>
class successor_selector {
template <typename NodeType>
static auto get_impl(NodeType& node, std::true_type) -> decltype(input_port<I>(node)) {
return input_port<I>(node);
}
template <typename NodeType>
static NodeType& get_impl(NodeType& node, std::false_type) { return node; }
public:
template <typename NodeType>
#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE
static auto& get(NodeType& node)
#else
static auto get(NodeType& node) -> decltype(get_impl(node, has_input_ports_type<NodeType>()))
#endif
{
return get_impl(node, has_input_ports_type<NodeType>());
}
};
template<size_t I>
class predecessor_selector {
template <typename NodeType>
static auto internal_get(NodeType& node, std::true_type) -> decltype(output_port<I>(node)) {
return output_port<I>(node);
}
template <typename NodeType>
static NodeType& internal_get(NodeType& node, std::false_type) { return node;}
template <typename NodeType>
#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE
static auto& get_impl(NodeType& node, std::false_type)
#else
static auto get_impl(NodeType& node, std::false_type) -> decltype(internal_get(node, has_output_ports_type<NodeType>()))
#endif
{
return internal_get(node, has_output_ports_type<NodeType>());
}
template <typename AsyncNode>
static AsyncNode& get_impl(AsyncNode& node, std::true_type) { return node; }
public:
template <typename NodeType>
#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE
static auto& get(NodeType& node)
#else
static auto get(NodeType& node) -> decltype(get_impl(node, is_async_node<NodeType>()))
#endif
{
return get_impl(node, is_async_node<NodeType>());
}
};
template<size_t I>
class make_edges_helper {
public:
template<typename PredecessorsTuple, typename NodeType>
static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) {
make_edge(std::get<I>(predecessors), successor_selector<I>::get(node));
make_edges_helper<I - 1>::connect_predecessors(predecessors, node);
}
template<typename SuccessorsTuple, typename NodeType>
static void connect_successors(NodeType& node, SuccessorsTuple& successors) {
make_edge(predecessor_selector<I>::get(node), std::get<I>(successors));
make_edges_helper<I - 1>::connect_successors(node, successors);
}
};
template<>
struct make_edges_helper<0> {
template<typename PredecessorsTuple, typename NodeType>
static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) {
make_edge(std::get<0>(predecessors), successor_selector<0>::get(node));
}
template<typename SuccessorsTuple, typename NodeType>
static void connect_successors(NodeType& node, SuccessorsTuple& successors) {
make_edge(predecessor_selector<0>::get(node), std::get<0>(successors));
}
};
// TODO: consider adding an overload for making edges between node sets
template<typename NodeType, typename OrderFlagType, typename... Args>
void make_edges(const node_set<OrderFlagType, Args...>& s, NodeType& node) {
const std::size_t SetSize = std::tuple_size<decltype(s.nodes)>::value;
make_edges_helper<SetSize - 1>::connect_predecessors(s.nodes, node);
}
template <typename NodeType, typename OrderFlagType, typename... Args>
void make_edges(NodeType& node, const node_set<OrderFlagType, Args...>& s) {
const std::size_t SetSize = std::tuple_size<decltype(s.nodes)>::value;
make_edges_helper<SetSize - 1>::connect_successors(node, s.nodes);
}
template <typename NodeType, typename... Nodes>
void make_edges_in_order(const node_set<order::following, Nodes...>& ns, NodeType& node) {
make_edges(ns, node);
}
template <typename NodeType, typename... Nodes>
void make_edges_in_order(const node_set<order::preceding, Nodes...>& ns, NodeType& node) {
make_edges(node, ns);
}
#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
#endif // __TBB_flow_graph_node_set_impl_H
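A hedged sketch of what follows()/precedes() buy the user once the preview is enabled; it assumes the usual TBB_PREVIEW_FLOW_GRAPH_FEATURES switch turns on __TBB_PREVIEW_FLOW_GRAPH_NODE_SET in this tree, and the include path is an assumption as before. Passing a node_set in place of the graph argument wires the edges through make_edges_in_order() instead of explicit make_edge() calls.
#define TBB_PREVIEW_FLOW_GRAPH_FEATURES 1
#include <iostream>
#include "third_party/tbb/flow_graph.hh"     // assumed vendored location of the public header
using namespace tbb::flow;
int main() {
    graph g;
    broadcast_node<int> b1(g), b2(g);
    // Constructed from follows(b1, b2): edges b1 -> f and b2 -> f are made automatically,
    // and the graph is recovered from the set via get_graph_helper.
    function_node<int, int> f(follows(b1, b2), unlimited, [](int v) { return v + 1; });
    function_node<int, int> sink(g, serial, [](int v) { std::cout << v << '\n'; return v; });
    make_edge(f, sink);
    b1.try_put(10);                                        // prints 11
    b2.try_put(20);                                        // prints 21
    g.wait_for_all();
    return 0;
}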

View file

@@ -0,0 +1,278 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_flow_graph_nodes_deduction_H
#define __TBB_flow_graph_nodes_deduction_H
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
namespace tbb {
namespace detail {
namespace d1 {
template <typename Input, typename Output>
struct declare_body_types {
using input_type = Input;
using output_type = Output;
};
struct NoInputBody {};
template <typename Output>
struct declare_body_types<NoInputBody, Output> {
using output_type = Output;
};
template <typename T> struct body_types;
template <typename T, typename Input, typename Output>
struct body_types<Output (T::*)(const Input&) const> : declare_body_types<Input, Output> {};
template <typename T, typename Input, typename Output>
struct body_types<Output (T::*)(const Input&)> : declare_body_types<Input, Output> {};
template <typename T, typename Input, typename Output>
struct body_types<Output (T::*)(Input&) const> : declare_body_types<Input, Output> {};
template <typename T, typename Input, typename Output>
struct body_types<Output (T::*)(Input&)> : declare_body_types<Input, Output> {};
template <typename T, typename Output>
struct body_types<Output (T::*)(flow_control&) const> : declare_body_types<NoInputBody, Output> {};
template <typename T, typename Output>
struct body_types<Output (T::*)(flow_control&)> : declare_body_types<NoInputBody, Output> {};
template <typename Input, typename Output>
struct body_types<Output (*)(Input&)> : declare_body_types<Input, Output> {};
template <typename Input, typename Output>
struct body_types<Output (*)(const Input&)> : declare_body_types<Input, Output> {};
template <typename Output>
struct body_types<Output (*)(flow_control&)> : declare_body_types<NoInputBody, Output> {};
template <typename Body>
using input_t = typename body_types<Body>::input_type;
template <typename Body>
using output_t = typename body_types<Body>::output_type;
template <typename T, typename Input, typename Output>
auto decide_on_operator_overload(Output (T::*name)(const Input&) const)->decltype(name);
template <typename T, typename Input, typename Output>
auto decide_on_operator_overload(Output (T::*name)(const Input&))->decltype(name);
template <typename T, typename Input, typename Output>
auto decide_on_operator_overload(Output (T::*name)(Input&) const)->decltype(name);
template <typename T, typename Input, typename Output>
auto decide_on_operator_overload(Output (T::*name)(Input&))->decltype(name);
template <typename Input, typename Output>
auto decide_on_operator_overload(Output (*name)(const Input&))->decltype(name);
template <typename Input, typename Output>
auto decide_on_operator_overload(Output (*name)(Input&))->decltype(name);
template <typename Body>
decltype(decide_on_operator_overload(&Body::operator())) decide_on_callable_type(int);
template <typename Body>
decltype(decide_on_operator_overload(std::declval<Body>())) decide_on_callable_type(...);
// Deduction guides for Flow Graph nodes
template <typename GraphOrSet, typename Body>
input_node(GraphOrSet&&, Body)
->input_node<output_t<decltype(decide_on_callable_type<Body>(0))>>;
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename NodeSet>
struct decide_on_set;
template <typename Node, typename... Nodes>
struct decide_on_set<node_set<order::following, Node, Nodes...>> {
using type = typename Node::output_type;
};
template <typename Node, typename... Nodes>
struct decide_on_set<node_set<order::preceding, Node, Nodes...>> {
using type = typename Node::input_type;
};
template <typename NodeSet>
using decide_on_set_t = typename decide_on_set<std::decay_t<NodeSet>>::type;
template <typename NodeSet>
broadcast_node(const NodeSet&)
->broadcast_node<decide_on_set_t<NodeSet>>;
template <typename NodeSet>
buffer_node(const NodeSet&)
->buffer_node<decide_on_set_t<NodeSet>>;
template <typename NodeSet>
queue_node(const NodeSet&)
->queue_node<decide_on_set_t<NodeSet>>;
#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename GraphOrProxy, typename Sequencer>
sequencer_node(GraphOrProxy&&, Sequencer)
->sequencer_node<input_t<decltype(decide_on_callable_type<Sequencer>(0))>>;
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename NodeSet, typename Compare>
priority_queue_node(const NodeSet&, const Compare&)
->priority_queue_node<decide_on_set_t<NodeSet>, Compare>;
template <typename NodeSet>
priority_queue_node(const NodeSet&)
->priority_queue_node<decide_on_set_t<NodeSet>, std::less<decide_on_set_t<NodeSet>>>;
#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename Key>
struct join_key {
using type = Key;
};
template <typename T>
struct join_key<const T&> {
using type = T&;
};
template <typename Key>
using join_key_t = typename join_key<Key>::type;
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename Policy, typename... Predecessors>
join_node(const node_set<order::following, Predecessors...>&, Policy)
->join_node<std::tuple<typename Predecessors::output_type...>,
Policy>;
template <typename Policy, typename Successor, typename... Successors>
join_node(const node_set<order::preceding, Successor, Successors...>&, Policy)
->join_node<typename Successor::input_type, Policy>;
template <typename... Predecessors>
join_node(const node_set<order::following, Predecessors...>)
->join_node<std::tuple<typename Predecessors::output_type...>,
queueing>;
template <typename Successor, typename... Successors>
join_node(const node_set<order::preceding, Successor, Successors...>)
->join_node<typename Successor::input_type, queueing>;
#endif
template <typename GraphOrProxy, typename Body, typename... Bodies>
join_node(GraphOrProxy&&, Body, Bodies...)
->join_node<std::tuple<input_t<decltype(decide_on_callable_type<Body>(0))>,
input_t<decltype(decide_on_callable_type<Bodies>(0))>...>,
key_matching<join_key_t<output_t<decltype(decide_on_callable_type<Body>(0))>>>>;
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename... Predecessors>
indexer_node(const node_set<order::following, Predecessors...>&)
->indexer_node<typename Predecessors::output_type...>;
#endif
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename NodeSet>
limiter_node(const NodeSet&, size_t)
->limiter_node<decide_on_set_t<NodeSet>>;
template <typename Predecessor, typename... Predecessors>
split_node(const node_set<order::following, Predecessor, Predecessors...>&)
->split_node<typename Predecessor::output_type>;
template <typename... Successors>
split_node(const node_set<order::preceding, Successors...>&)
->split_node<std::tuple<typename Successors::input_type...>>;
#endif
template <typename GraphOrSet, typename Body, typename Policy>
function_node(GraphOrSet&&,
size_t, Body,
Policy, node_priority_t = no_priority)
->function_node<input_t<decltype(decide_on_callable_type<Body>(0))>,
output_t<decltype(decide_on_callable_type<Body>(0))>,
Policy>;
template <typename GraphOrSet, typename Body>
function_node(GraphOrSet&&, size_t,
Body, node_priority_t = no_priority)
->function_node<input_t<decltype(decide_on_callable_type<Body>(0))>,
output_t<decltype(decide_on_callable_type<Body>(0))>,
queueing>;
template <typename Output>
struct continue_output {
using type = Output;
};
template <>
struct continue_output<void> {
using type = continue_msg;
};
template <typename T>
using continue_output_t = typename continue_output<T>::type;
template <typename GraphOrSet, typename Body, typename Policy>
continue_node(GraphOrSet&&, Body,
Policy, node_priority_t = no_priority)
->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>,
Policy>;
template <typename GraphOrSet, typename Body, typename Policy>
continue_node(GraphOrSet&&,
int, Body,
Policy, node_priority_t = no_priority)
->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>,
Policy>;
template <typename GraphOrSet, typename Body>
continue_node(GraphOrSet&&,
Body, node_priority_t = no_priority)
->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>, Policy<void>>;
template <typename GraphOrSet, typename Body>
continue_node(GraphOrSet&&, int,
Body, node_priority_t = no_priority)
->continue_node<continue_output_t<std::invoke_result_t<Body, continue_msg>>,
Policy<void>>;
#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
template <typename NodeSet>
overwrite_node(const NodeSet&)
->overwrite_node<decide_on_set_t<NodeSet>>;
template <typename NodeSet>
write_once_node(const NodeSet&)
->write_once_node<decide_on_set_t<NodeSet>>;
#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
#endif // __TBB_flow_graph_nodes_deduction_H
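The practical effect of the guides above is that, in a C++17 build where __TBB_CPP17_DEDUCTION_GUIDES_PRESENT holds, class template argument deduction can infer a node's Input/Output parameters from its body's call operator. A minimal sketch under that assumption (upstream public header name; variable names are illustrative):

#include "oneapi/tbb/flow_graph.h"   // upstream name; adjust the include path for this tree

int main() {
    tbb::flow::graph g;

    // body_types/decide_on_callable_type inspect the lambda's operator(), so the
    // guide deduces function_node<int, double, queueing> here.
    tbb::flow::function_node fn(g, tbb::flow::unlimited,
                                [](const int& v) -> double { return v * 0.5; });

    // A body taking flow_control& maps to NoInputBody, so only the Output
    // parameter needs deducing: this is an input_node<int>.
    tbb::flow::input_node src(g, [](tbb::flow_control& fc) -> int {
        static int i = 0;
        if (i < 3) return i++;
        fc.stop();
        return 0;
    });

    tbb::flow::make_edge(src, fn);
    src.activate();
    g.wait_for_all();
    return 0;
}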

View file

@@ -0,0 +1,258 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// a hash table buffer that can expand, and can support as many deletions as
// additions, list-based, with elements of list held in array (for destruction
// management), multiplicative hashing (like ets). No synchronization built-in.
//
#ifndef __TBB__flow_graph_hash_buffer_impl_H
#define __TBB__flow_graph_hash_buffer_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
// included in namespace tbb::flow::interfaceX::internal
// elements in the table are a simple list; we need pointer to next element to
// traverse the chain
template<typename ValueType>
struct buffer_element_type {
// the second parameter below is void * because we can't forward-declare the type
// itself, so we just reinterpret_cast below.
typedef typename aligned_pair<ValueType, void *>::type type;
};
template
<
typename Key, // type of key within ValueType
typename ValueType,
typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType
typename HashCompare, // has hash and equal
typename Allocator=tbb::cache_aligned_allocator< typename aligned_pair<ValueType, void *>::type >
>
class hash_buffer : public HashCompare {
public:
static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table
typedef ValueType value_type;
typedef typename buffer_element_type< value_type >::type element_type;
typedef value_type *pointer_type;
typedef element_type *list_array_type; // array we manage manually
typedef list_array_type *pointer_array_type;
typedef typename std::allocator_traits<Allocator>::template rebind_alloc<list_array_type> pointer_array_allocator_type;
typedef typename std::allocator_traits<Allocator>::template rebind_alloc<element_type> elements_array_allocator;
typedef typename std::decay<Key>::type Knoref;
private:
ValueToKey *my_key;
size_t my_size;
size_t nelements;
pointer_array_type pointer_array; // pointer_array[my_size]
list_array_type elements_array; // elements_array[my_size / 2]
element_type* free_list;
size_t mask() { return my_size - 1; }
void set_up_free_list( element_type **p_free_list, list_array_type la, size_t sz) {
for(size_t i=0; i < sz - 1; ++i ) { // construct free list
la[i].second = &(la[i+1]);
}
la[sz-1].second = nullptr;
*p_free_list = (element_type *)&(la[0]);
}
// cleanup for exceptions
struct DoCleanup {
pointer_array_type *my_pa;
list_array_type *my_elements;
size_t my_size;
DoCleanup(pointer_array_type &pa, list_array_type &my_els, size_t sz) :
my_pa(&pa), my_elements(&my_els), my_size(sz) { }
~DoCleanup() {
if(my_pa) {
size_t dont_care = 0;
internal_free_buffer(*my_pa, *my_elements, my_size, dont_care);
}
}
};
// exception-safety requires we do all the potentially-throwing operations first
void grow_array() {
size_t new_size = my_size*2;
size_t new_nelements = nelements; // internal_free_buffer zeroes this
list_array_type new_elements_array = nullptr;
pointer_array_type new_pointer_array = nullptr;
list_array_type new_free_list = nullptr;
{
DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size);
new_elements_array = elements_array_allocator().allocate(my_size);
new_pointer_array = pointer_array_allocator_type().allocate(new_size);
for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = nullptr;
set_up_free_list(&new_free_list, new_elements_array, my_size );
for(size_t i=0; i < my_size; ++i) {
for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->second)) {
value_type *ov = reinterpret_cast<value_type *>(&(op->first));
// could have std::move semantics
internal_insert_with_key(new_pointer_array, new_size, new_free_list, *ov);
}
}
my_cleanup.my_pa = nullptr;
my_cleanup.my_elements = nullptr;
}
internal_free_buffer(pointer_array, elements_array, my_size, nelements);
free_list = new_free_list;
pointer_array = new_pointer_array;
elements_array = new_elements_array;
my_size = new_size;
nelements = new_nelements;
}
// v should have perfect forwarding if std::move implemented.
// we use this method to move elements in grow_array, so can't use class fields
void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list,
const value_type &v) {
size_t l_mask = p_sz-1;
__TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
size_t h = this->hash(tbb::detail::invoke(*my_key, v)) & l_mask;
__TBB_ASSERT(p_free_list, "Error: free list not set up.");
element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second);
(void) new(&(my_elem->first)) value_type(v);
my_elem->second = p_pointer_array[h];
p_pointer_array[h] = my_elem;
}
void internal_initialize_buffer() {
pointer_array = pointer_array_allocator_type().allocate(my_size);
for(size_t i = 0; i < my_size; ++i) pointer_array[i] = nullptr;
elements_array = elements_array_allocator().allocate(my_size / 2);
set_up_free_list(&free_list, elements_array, my_size / 2);
}
// made static so an enclosed class can use it to properly dispose of the internals
static void internal_free_buffer( pointer_array_type &pa, list_array_type &el, size_t &sz, size_t &ne ) {
if(pa) {
for(size_t i = 0; i < sz; ++i ) {
element_type *p_next;
for( element_type *p = pa[i]; p; p = p_next) {
p_next = (element_type *)p->second;
// TODO revamp: make sure type casting is correct.
void* ptr = (void*)(p->first);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
suppress_unused_warning(ptr);
#endif
((value_type*)ptr)->~value_type();
}
}
pointer_array_allocator_type().deallocate(pa, sz);
pa = nullptr;
}
// Separate test (if allocation of pa throws, el may be allocated,
// but no elements will be constructed.)
if(el) {
elements_array_allocator().deallocate(el, sz / 2);
el = nullptr;
}
sz = INITIAL_SIZE;
ne = 0;
}
public:
hash_buffer() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) {
internal_initialize_buffer();
}
~hash_buffer() {
internal_free_buffer(pointer_array, elements_array, my_size, nelements);
delete my_key;
my_key = nullptr;
}
hash_buffer(const hash_buffer&) = delete;
hash_buffer& operator=(const hash_buffer&) = delete;
void reset() {
internal_free_buffer(pointer_array, elements_array, my_size, nelements);
internal_initialize_buffer();
}
// Take ownership of func object allocated with new.
// This method is only used internally, so can't be misused by user.
void set_key_func(ValueToKey *vtk) { my_key = vtk; }
// pointer is used to clone()
ValueToKey* get_key_func() { return my_key; }
bool insert_with_key(const value_type &v) {
pointer_type p = nullptr;
__TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
if(find_ref_with_key(tbb::detail::invoke(*my_key, v), p)) {
p->~value_type();
(void) new(p) value_type(v); // copy-construct into the space
return false;
}
++nelements;
if(nelements*2 > my_size) grow_array();
internal_insert_with_key(pointer_array, my_size, free_list, v);
return true;
}
// returns true and sets v to array element if found, else returns false.
bool find_ref_with_key(const Knoref& k, pointer_type &v) {
size_t i = this->hash(k) & mask();
for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) {
pointer_type pv = reinterpret_cast<pointer_type>(&(p->first));
__TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
if(this->equal(tbb::detail::invoke(*my_key, *pv), k)) {
v = pv;
return true;
}
}
return false;
}
bool find_with_key( const Knoref& k, value_type &v) {
value_type *p;
if(find_ref_with_key(k, p)) {
v = *p;
return true;
}
else
return false;
}
void delete_with_key(const Knoref& k) {
size_t h = this->hash(k) & mask();
element_type* prev = nullptr;
for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) {
value_type *vp = reinterpret_cast<value_type *>(&(p->first));
__TBB_ASSERT(my_key, "Error: value-to-key functor not provided");
if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) {
vp->~value_type();
if(prev) prev->second = p->second;
else pointer_array[h] = (element_type *)(p->second);
p->second = free_list;
free_list = p;
--nelements;
return;
}
}
__TBB_ASSERT(false, "key not found for delete");
}
};
#endif // __TBB__flow_graph_hash_buffer_impl_H
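hash_buffer is an internal helper (it backs the key_matching join ports), so the sketch below only illustrates the calling convention visible above; the IntHashCompare and KeyOfValue types are hypothetical, and the internal namespace qualification of hash_buffer is omitted.

#include <cstddef>
#include <string>
#include <utility>

struct IntHashCompare {               // provides the hash()/equal() the buffer inherits
    std::size_t hash(int k) const { return static_cast<std::size_t>(k) * 2654435761u; }
    bool equal(int a, int b) const { return a == b; }
};
struct KeyOfValue {                   // the ValueToKey functor handed over via set_key_func()
    int operator()(const std::pair<int, std::string>& v) const { return v.first; }
};

void hash_buffer_sketch() {
    using value_t = std::pair<int, std::string>;
    hash_buffer<int, value_t, KeyOfValue, IntHashCompare> buf;
    buf.set_key_func(new KeyOfValue);            // buffer takes ownership; deleted in ~hash_buffer()
    buf.insert_with_key(value_t(42, "answer"));  // true: first occupant of key 42
    value_t out;
    if (buf.find_with_key(42, out)) {
        // out.second == "answer"
    }
    buf.delete_with_key(42);                     // asserts if the key is absent
}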

View file

@@ -0,0 +1,365 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _FGT_GRAPH_TRACE_IMPL_H
#define _FGT_GRAPH_TRACE_IMPL_H
#include "third_party/tbb/profiling.hh"
#if (_MSC_VER >= 1900)
// MISSING #include <intrin.h>
#endif
namespace tbb {
namespace detail {
namespace d1 {
template< typename T > class sender;
template< typename T > class receiver;
#if TBB_USE_PROFILING_TOOLS
#if __TBB_FLOW_TRACE_CODEPTR
#if (_MSC_VER >= 1900)
#define CODEPTR() (_ReturnAddress())
#elif __TBB_GCC_VERSION >= 40800
#define CODEPTR() ( __builtin_return_address(0))
#else
#define CODEPTR() nullptr
#endif
#else
#define CODEPTR() nullptr
#endif /* __TBB_FLOW_TRACE_CODEPTR */
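// In other words, CODEPTR() evaluates to the return address of the node constructor
// that expands it (_ReturnAddress on MSVC 19.0+, __builtin_return_address on GCC 4.8+),
// letting profiling tools attribute a flow graph node to the source location that
// created it; on other compilers, or with code-pointer tracing disabled, it degrades
// to nullptr and the register_node_addr() calls below are skipped.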
static inline void fgt_alias_port(void *node, void *p, bool visible) {
if(visible)
itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE );
else
itt_relation_add( ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE );
}
static inline void fgt_composite ( void* codeptr, void *node, void *graph ) {
itt_make_task_group( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE );
suppress_unused_warning( codeptr );
#if __TBB_FLOW_TRACE_CODEPTR
if (codeptr != nullptr) {
register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr);
}
#endif
}
static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) {
itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index );
itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT );
}
static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) {
itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index );
itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT );
}
template<typename InputType>
void alias_input_port(void *node, receiver<InputType>* port, string_resource_index name_index) {
// TODO: Make fgt_internal_alias_input_port a function template?
fgt_internal_alias_input_port( node, port, name_index);
}
template < typename PortsTuple, int N >
struct fgt_internal_input_alias_helper {
static void alias_port( void *node, PortsTuple &ports ) {
alias_input_port( node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_INPUT_PORT_0 + N - 1) );
fgt_internal_input_alias_helper<PortsTuple, N-1>::alias_port( node, ports );
}
};
template < typename PortsTuple >
struct fgt_internal_input_alias_helper<PortsTuple, 0> {
static void alias_port( void * /* node */, PortsTuple & /* ports */ ) { }
};
template<typename OutputType>
void alias_output_port(void *node, sender<OutputType>* port, string_resource_index name_index) {
// TODO: Make fgt_internal_alias_output_port a function template?
fgt_internal_alias_output_port( node, static_cast<void *>(port), name_index);
}
template < typename PortsTuple, int N >
struct fgt_internal_output_alias_helper {
static void alias_port( void *node, PortsTuple &ports ) {
alias_output_port( node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_OUTPUT_PORT_0 + N - 1) );
fgt_internal_output_alias_helper<PortsTuple, N-1>::alias_port( node, ports );
}
};
template < typename PortsTuple >
struct fgt_internal_output_alias_helper<PortsTuple, 0> {
static void alias_port( void * /*node*/, PortsTuple &/*ports*/ ) {
}
};
static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) {
itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index );
}
static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) {
itt_make_task_group(ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index);
suppress_unused_warning( codeptr );
#if __TBB_FLOW_TRACE_CODEPTR
if (codeptr != nullptr) {
register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr);
}
#endif
}
template<typename InputType>
void register_input_port(void *node, receiver<InputType>* port, string_resource_index name_index) {
// TODO: Make fgt_internal_create_input_port a function template?
fgt_internal_create_input_port(node, static_cast<void*>(port), name_index);
}
template < typename PortsTuple, int N >
struct fgt_internal_input_helper {
static void register_port( void *node, PortsTuple &ports ) {
register_input_port( node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_INPUT_PORT_0 + N - 1) );
fgt_internal_input_helper<PortsTuple, N-1>::register_port( node, ports );
}
};
template < typename PortsTuple >
struct fgt_internal_input_helper<PortsTuple, 1> {
static void register_port( void *node, PortsTuple &ports ) {
register_input_port( node, &(std::get<0>(ports)), FLOW_INPUT_PORT_0 );
}
};
template<typename OutputType>
void register_output_port(void* codeptr, void *node, sender<OutputType>* port, string_resource_index name_index) {
// TODO: Make fgt_internal_create_output_port a function template?
fgt_internal_create_output_port( codeptr, node, static_cast<void *>(port), name_index);
}
template < typename PortsTuple, int N >
struct fgt_internal_output_helper {
static void register_port( void* codeptr, void *node, PortsTuple &ports ) {
register_output_port( codeptr, node, &(std::get<N-1>(ports)), static_cast<string_resource_index>(FLOW_OUTPUT_PORT_0 + N - 1) );
fgt_internal_output_helper<PortsTuple, N-1>::register_port( codeptr, node, ports );
}
};
template < typename PortsTuple >
struct fgt_internal_output_helper<PortsTuple,1> {
static void register_port( void* codeptr, void *node, PortsTuple &ports ) {
register_output_port( codeptr, node, &(std::get<0>(ports)), FLOW_OUTPUT_PORT_0 );
}
};
template< typename NodeType >
void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) {
void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) );
itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc );
}
template< typename NodeType >
void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) {
void *addr = const_cast<NodeType *>(node);
itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc );
}
template< typename NodeType >
static inline void fgt_node_desc( const NodeType *node, const char *desc ) {
void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) );
itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc );
}
static inline void fgt_graph_desc( const void *g, const char *desc ) {
void *addr = const_cast< void *>(g);
itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc );
}
static inline void fgt_body( void *node, void *body ) {
itt_relation_add( ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE );
}
template< int N, typename PortsTuple >
static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) {
itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t );
fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 );
fgt_internal_output_helper<PortsTuple, N>::register_port(codeptr, input_port, ports );
}
template< int N, typename PortsTuple >
static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) {
itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t );
fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 );
fgt_internal_output_helper<PortsTuple, N>::register_port( codeptr, input_port, ports );
fgt_body( input_port, body );
}
template< int N, typename PortsTuple >
static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) {
itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t );
fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 );
fgt_internal_input_helper<PortsTuple, N>::register_port( output_port, ports );
}
static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) {
itt_make_task_group( ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t );
suppress_unused_warning( codeptr );
#if __TBB_FLOW_TRACE_CODEPTR
if (codeptr != nullptr) {
register_node_addr(ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr);
}
#endif
}
static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) {
itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t );
fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 );
}
static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) {
itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t );
fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 );
fgt_body( output_port, body );
}
static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port ) {
fgt_node( codeptr, t, g, output_port );
fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 );
}
static inline void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port, void *body ) {
fgt_node_with_body( codeptr, t, g, output_port, body );
fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 );
}
static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *decrement_port, void *output_port ) {
fgt_node( codeptr, t, g, input_port, output_port );
fgt_internal_create_input_port( output_port, decrement_port, FLOW_INPUT_PORT_1 );
}
static inline void fgt_make_edge( void *output_port, void *input_port ) {
itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT);
}
static inline void fgt_remove_edge( void *output_port, void *input_port ) {
itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT);
}
static inline void fgt_graph( void *g ) {
itt_make_task_group( ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH );
}
static inline void fgt_begin_body( void *body ) {
itt_task_begin( ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY );
}
static inline void fgt_end_body( void * ) {
itt_task_end( ITT_DOMAIN_FLOW );
}
static inline void fgt_async_try_put_begin( void *node, void *port ) {
itt_task_begin( ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT );
}
static inline void fgt_async_try_put_end( void *, void * ) {
itt_task_end( ITT_DOMAIN_FLOW );
}
static inline void fgt_async_reserve( void *node, void *graph ) {
itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL );
}
static inline void fgt_async_commit( void *node, void * /*graph*/) {
itt_region_end( ITT_DOMAIN_FLOW, node, FLOW_NODE );
}
static inline void fgt_reserve_wait( void *graph ) {
itt_region_begin( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL );
}
static inline void fgt_release_wait( void *graph ) {
itt_region_end( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH );
}
#else // TBB_USE_PROFILING_TOOLS
#define CODEPTR() nullptr
static inline void fgt_alias_port(void * /*node*/, void * /*p*/, bool /*visible*/ ) { }
static inline void fgt_composite ( void* /*codeptr*/, void * /*node*/, void * /*graph*/ ) { }
static inline void fgt_graph( void * /*g*/ ) { }
template< typename NodeType >
static inline void fgt_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { }
template< typename NodeType >
static inline void fgt_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { }
static inline void fgt_graph_desc( const void * /*g*/, const char * /*desc*/ ) { }
template< int N, typename PortsTuple >
static inline void fgt_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/ ) { }
template< int N, typename PortsTuple >
static inline void fgt_multioutput_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/, void * /*body*/ ) { }
template< int N, typename PortsTuple >
static inline void fgt_multiinput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, PortsTuple & /*ports*/, void * /*output_port*/ ) { }
static inline void fgt_multiinput_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*node*/, void * /*graph*/ ) { }
static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/ ) { }
static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*decrement_port*/, void * /*output_port*/ ) { }
static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*output_port*/, void * /*body*/ ) { }
static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/, void * /*body*/ ) { }
static inline void fgt_make_edge( void * /*output_port*/, void * /*input_port*/ ) { }
static inline void fgt_remove_edge( void * /*output_port*/, void * /*input_port*/ ) { }
static inline void fgt_begin_body( void * /*body*/ ) { }
static inline void fgt_end_body( void * /*body*/) { }
static inline void fgt_async_try_put_begin( void * /*node*/, void * /*port*/ ) { }
static inline void fgt_async_try_put_end( void * /*node*/ , void * /*port*/ ) { }
static inline void fgt_async_reserve( void * /*node*/, void * /*graph*/ ) { }
static inline void fgt_async_commit( void * /*node*/, void * /*graph*/ ) { }
static inline void fgt_reserve_wait( void * /*graph*/ ) { }
static inline void fgt_release_wait( void * /*graph*/ ) { }
template< typename NodeType >
void fgt_multiinput_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { }
template < typename PortsTuple, int N >
struct fgt_internal_input_alias_helper {
static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { }
};
template < typename PortsTuple, int N >
struct fgt_internal_output_alias_helper {
static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { }
};
#endif // TBB_USE_PROFILING_TOOLS
} // d1
} // namespace detail
} // namespace tbb
#endif // _FGT_GRAPH_TRACE_IMPL_H

View file

@@ -0,0 +1,408 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__flow_graph_types_impl_H
#define __TBB__flow_graph_types_impl_H
#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif
// included in namespace tbb::detail::d1
// the change to key_matching (adding a K and KHash template parameter, making it a class)
// means we have to pass this data to the key_matching_port. All the ports have only one
// template parameter, so we have to wrap the following types in a trait:
//
// . K == key_type
// . KHash == hash and compare for Key
// . TtoK == function_body that given an object of T, returns its K
// . T == type accepted by port, and stored in the hash table
//
// The port will have an additional parameter on node construction, which is a function_body
// that accepts a const T& and returns a K which is the field in T which is its K.
template<typename Kp, typename KHashp, typename Tp>
struct KeyTrait {
typedef Kp K;
typedef Tp T;
typedef type_to_key_function_body<T,K> TtoK;
typedef KHashp KHash;
};
// wrap each element of a tuple in a template, and make a tuple of the result.
template<int N, template<class> class PT, typename TypeTuple>
struct wrap_tuple_elements;
// A wrapper that generates the traits needed for each port of a key-matching join,
// and the type of the tuple of input ports.
template<int N, template<class> class PT, typename KeyTraits, typename TypeTuple>
struct wrap_key_tuple_elements;
template<int N, template<class> class PT, typename... Args>
struct wrap_tuple_elements<N, PT, std::tuple<Args...> >{
typedef typename std::tuple<PT<Args>... > type;
};
template<int N, template<class> class PT, typename KeyTraits, typename... Args>
struct wrap_key_tuple_elements<N, PT, KeyTraits, std::tuple<Args...> > {
typedef typename KeyTraits::key_type K;
typedef typename KeyTraits::hash_compare_type KHash;
typedef typename std::tuple<PT<KeyTrait<K, KHash, Args> >... > type;
};
template< int... S > class sequence {};
template< int N, int... S >
struct make_sequence : make_sequence < N - 1, N - 1, S... > {};
template< int... S >
struct make_sequence < 0, S... > {
typedef sequence<S...> type;
};
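// For example, make_sequence<3> inherits make_sequence<2, 2>, then
// make_sequence<1, 1, 2>, then make_sequence<0, 0, 1, 2>, so its nested ::type is
// sequence<0, 1, 2> -- a pre-C++14 stand-in for std::index_sequence.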
//! type mimicking std::pair but with trailing fill to ensure each element of an array
//* will have the correct alignment
template<typename T1, typename T2, size_t REM>
struct type_plus_align {
char first[sizeof(T1)];
T2 second;
char fill1[REM];
};
template<typename T1, typename T2>
struct type_plus_align<T1,T2,0> {
char first[sizeof(T1)];
T2 second;
};
template<class U> struct alignment_of {
typedef struct { char t; U padded; } test_alignment;
static const size_t value = sizeof(test_alignment) - sizeof(U);
};
// T1, T2 are actual types stored. The space defined for T1 in the type returned
// is a char array of the correct size. Type T2 should be trivially-constructible,
// T1 must be explicitly managed.
template<typename T1, typename T2>
struct aligned_pair {
static const size_t t1_align = alignment_of<T1>::value;
static const size_t t2_align = alignment_of<T2>::value;
typedef type_plus_align<T1, T2, 0 > just_pair;
static const size_t max_align = t1_align < t2_align ? t2_align : t1_align;
static const size_t extra_bytes = sizeof(just_pair) % max_align;
static const size_t remainder = extra_bytes ? max_align - extra_bytes : 0;
public:
typedef type_plus_align<T1,T2,remainder> type;
}; // aligned_pair
// support for variant type
// type we use when we're not storing a value
struct default_constructed { };
// type which contains another type, tests for what type is contained, and references to it.
// Wrapper<T>
// void CopyTo( void *newSpace) : builds a Wrapper<T> copy of itself in newSpace
// struct to allow us to copy and test the type of objects
struct WrapperBase {
virtual ~WrapperBase() {}
virtual void CopyTo(void* /*newSpace*/) const = 0;
};
// Wrapper<T> contains a T, with the ability to test what T is. The Wrapper<T> can be
// constructed from a T, can be copy-constructed from another Wrapper<T>, and can be
// examined via value(), but not modified.
template<typename T>
struct Wrapper: public WrapperBase {
typedef T value_type;
typedef T* pointer_type;
private:
T value_space;
public:
const value_type &value() const { return value_space; }
private:
Wrapper();
// on exception will ensure the Wrapper will contain only a trivially-constructed object
struct _unwind_space {
pointer_type space;
_unwind_space(pointer_type p) : space(p) {}
~_unwind_space() {
if(space) (void) new (space) Wrapper<default_constructed>(default_constructed());
}
};
public:
explicit Wrapper( const T& other ) : value_space(other) { }
explicit Wrapper(const Wrapper& other) = delete;
void CopyTo(void* newSpace) const override {
_unwind_space guard((pointer_type)newSpace);
(void) new(newSpace) Wrapper(value_space);
guard.space = nullptr;
}
~Wrapper() { }
};
// specialization for array objects
template<typename T, size_t N>
struct Wrapper<T[N]> : public WrapperBase {
typedef T value_type;
typedef T* pointer_type;
// space must be untyped.
typedef T ArrayType[N];
private:
// The space is not of type T[N] because when copy-constructing, it would be
// default-initialized and then copied to in some fashion, resulting in two
// constructions and one destruction per element. If the type is char[ ], we
// placement new into each element, resulting in one construction per element.
static const size_t space_size = sizeof(ArrayType);
char value_space[space_size];
// on exception will ensure the already-built objects will be destructed
// (the value_space is a char array, so it is already trivially-destructible.)
struct _unwind_class {
pointer_type space;
int already_built;
_unwind_class(pointer_type p) : space(p), already_built(0) {}
~_unwind_class() {
if(space) {
for(size_t i = already_built; i > 0 ; --i ) space[i-1].~value_type();
(void) new(space) Wrapper<default_constructed>(default_constructed());
}
}
};
public:
const ArrayType &value() const {
char *vp = const_cast<char *>(value_space);
return reinterpret_cast<ArrayType &>(*vp);
}
private:
Wrapper();
public:
// have to explicitly construct because other decays to a const value_type*
explicit Wrapper(const ArrayType& other) {
_unwind_class guard((pointer_type)value_space);
pointer_type vp = reinterpret_cast<pointer_type>(&value_space);
for(size_t i = 0; i < N; ++i ) {
(void) new(vp++) value_type(other[i]);
++(guard.already_built);
}
guard.space = nullptr;
}
explicit Wrapper(const Wrapper& other) : WrapperBase() {
// we have to do the heavy lifting to copy contents
_unwind_class guard((pointer_type)value_space);
pointer_type dp = reinterpret_cast<pointer_type>(value_space);
pointer_type sp = reinterpret_cast<pointer_type>(const_cast<char *>(other.value_space));
for(size_t i = 0; i < N; ++i, ++dp, ++sp) {
(void) new(dp) value_type(*sp);
++(guard.already_built);
}
guard.space = nullptr;
}
void CopyTo(void* newSpace) const override {
(void) new(newSpace) Wrapper(*this); // exceptions handled in copy constructor
}
~Wrapper() {
// have to destroy explicitly in reverse order
pointer_type vp = reinterpret_cast<pointer_type>(&value_space);
for(size_t i = N; i > 0 ; --i ) vp[i-1].~value_type();
}
};
// given a tuple, return the type of the element that has the maximum alignment requirement.
// Given a tuple and that type, return the number of elements of the object with the max
// alignment requirement that is at least as big as the largest object in the tuple.
template<bool, class T1, class T2> struct pick_one;
template<class T1, class T2> struct pick_one<true , T1, T2> { typedef T1 type; };
template<class T1, class T2> struct pick_one<false, T1, T2> { typedef T2 type; };
template< template<class> class Selector, typename T1, typename T2 >
struct pick_max {
typedef typename pick_one< (Selector<T1>::value > Selector<T2>::value), T1, T2 >::type type;
};
template<typename T> struct size_of { static const int value = sizeof(T); };
template< size_t N, class Tuple, template<class> class Selector > struct pick_tuple_max {
typedef typename pick_tuple_max<N-1, Tuple, Selector>::type LeftMaxType;
typedef typename std::tuple_element<N-1, Tuple>::type ThisType;
typedef typename pick_max<Selector, LeftMaxType, ThisType>::type type;
};
template< class Tuple, template<class> class Selector > struct pick_tuple_max<0, Tuple, Selector> {
typedef typename std::tuple_element<0, Tuple>::type type;
};
// is the specified type included in a tuple?
template<class Q, size_t N, class Tuple>
struct is_element_of {
typedef typename std::tuple_element<N-1, Tuple>::type T_i;
static const bool value = std::is_same<Q,T_i>::value || is_element_of<Q,N-1,Tuple>::value;
};
template<class Q, class Tuple>
struct is_element_of<Q,0,Tuple> {
typedef typename std::tuple_element<0, Tuple>::type T_i;
static const bool value = std::is_same<Q,T_i>::value;
};
// allow the construction of types that are listed in the tuple. If a disallowed type
// construction is written, a method involving this type is created. The
// type has no definition, so a syntax error is generated.
template<typename T> struct ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple;
template<typename T, bool BUILD_IT> struct do_if;
template<typename T>
struct do_if<T, true> {
static void construct(void *mySpace, const T& x) {
(void) new(mySpace) Wrapper<T>(x);
}
};
template<typename T>
struct do_if<T, false> {
static void construct(void * /*mySpace*/, const T& x) {
// This method is instantiated when the type T does not match any of the
// element types in the Tuple in variant<Tuple>.
ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple<T>::bad_type(x);
}
};
// Tuple tells us the allowed types that variant can hold. It determines the alignment of the space in
// Wrapper, and how big Wrapper is.
//
// the object can only be tested for type, and a read-only reference can be fetched by cast_to<T>().
using tbb::detail::punned_cast;
struct tagged_null_type {};
template<typename TagType, typename T0, typename T1=tagged_null_type, typename T2=tagged_null_type, typename T3=tagged_null_type,
typename T4=tagged_null_type, typename T5=tagged_null_type, typename T6=tagged_null_type,
typename T7=tagged_null_type, typename T8=tagged_null_type, typename T9=tagged_null_type>
class tagged_msg {
typedef std::tuple<T0, T1, T2, T3, T4
//TODO: Should we reject lists longer than a tuple can hold?
#if __TBB_VARIADIC_MAX >= 6
, T5
#endif
#if __TBB_VARIADIC_MAX >= 7
, T6
#endif
#if __TBB_VARIADIC_MAX >= 8
, T7
#endif
#if __TBB_VARIADIC_MAX >= 9
, T8
#endif
#if __TBB_VARIADIC_MAX >= 10
, T9
#endif
> Tuple;
private:
class variant {
static const size_t N = std::tuple_size<Tuple>::value;
typedef typename pick_tuple_max<N, Tuple, alignment_of>::type AlignType;
typedef typename pick_tuple_max<N, Tuple, size_of>::type MaxSizeType;
static const size_t MaxNBytes = (sizeof(Wrapper<MaxSizeType>)+sizeof(AlignType)-1);
static const size_t MaxNElements = MaxNBytes/sizeof(AlignType);
typedef aligned_space<AlignType, MaxNElements> SpaceType;
SpaceType my_space;
static const size_t MaxSize = sizeof(SpaceType);
public:
variant() { (void) new(&my_space) Wrapper<default_constructed>(default_constructed()); }
template<typename T>
variant( const T& x ) {
do_if<T, is_element_of<T, N, Tuple>::value>::construct(&my_space,x);
}
variant(const variant& other) {
const WrapperBase * h = punned_cast<const WrapperBase *>(&(other.my_space));
h->CopyTo(&my_space);
}
// assignment must destroy and re-create the Wrapper type, as there is no way
// to create a Wrapper-to-Wrapper assign even if we find they agree in type.
void operator=( const variant& rhs ) {
if(&rhs != this) {
WrapperBase *h = punned_cast<WrapperBase *>(&my_space);
h->~WrapperBase();
const WrapperBase *ch = punned_cast<const WrapperBase *>(&(rhs.my_space));
ch->CopyTo(&my_space);
}
}
template<typename U>
const U& variant_cast_to() const {
const Wrapper<U> *h = dynamic_cast<const Wrapper<U>*>(punned_cast<const WrapperBase *>(&my_space));
if(!h) {
throw_exception(exception_id::bad_tagged_msg_cast);
}
return h->value();
}
template<typename U>
bool variant_is_a() const { return dynamic_cast<const Wrapper<U>*>(punned_cast<const WrapperBase *>(&my_space)) != nullptr; }
bool variant_is_default_constructed() const {return variant_is_a<default_constructed>();}
~variant() {
WrapperBase *h = punned_cast<WrapperBase *>(&my_space);
h->~WrapperBase();
}
}; //class variant
TagType my_tag;
variant my_msg;
public:
tagged_msg(): my_tag(TagType(~0)), my_msg(){}
template<typename T, typename R>
tagged_msg(T const &index, R const &value) : my_tag(index), my_msg(value) {}
template<typename T, typename R, size_t N>
tagged_msg(T const &index, R (&value)[N]) : my_tag(index), my_msg(value) {}
void set_tag(TagType const &index) {my_tag = index;}
TagType tag() const {return my_tag;}
template<typename V>
const V& cast_to() const {return my_msg.template variant_cast_to<V>();}
template<typename V>
bool is_a() const {return my_msg.template variant_is_a<V>();}
bool is_default_constructed() const {return my_msg.variant_is_default_constructed();}
}; //class tagged_msg
// template to simplify cast and test for tagged_msg in template contexts
template<typename V, typename T>
const V& cast_to(T const &t) { return t.template cast_to<V>(); }
template<typename V, typename T>
bool is_a(T const &t) { return t.template is_a<V>(); }
enum op_stat { WAIT = 0, SUCCEEDED, FAILED };
#endif /* __TBB__flow_graph_types_impl_H */
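tagged_msg is the value type that multi-output constructs such as indexer_node emit: a tag plus a variant restricted to the listed types. A minimal sketch of the accessors defined above (the internal namespace qualification is omitted; names are illustrative):

#include <cstddef>

void tagged_msg_sketch() {
    using msg_t = tagged_msg<std::size_t, int, double>;

    msg_t m(std::size_t(0), 42);     // tag 0, variant holds an int
    if (m.is_a<int>()) {
        int v = m.cast_to<int>();    // 42 (cast_to<double>() would throw bad_tagged_msg_cast)
        (void)v;
    }
    // A default-constructed message carries tag TagType(~0) and no value.
    bool empty = msg_t().is_default_constructed();
    (void)empty;
}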

148
third_party/tbb/detail/_hash_compare.hh vendored Normal file
View file

@@ -0,0 +1,148 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__hash_compare_H
#define __TBB_detail__hash_compare_H
#include "third_party/libcxx/functional"
#include "third_party/tbb/detail/_containers_helpers.hh"
namespace tbb {
namespace detail {
namespace d1 {
template <typename Key, typename Hash, typename KeyEqual>
class hash_compare {
using is_transparent_hash = has_transparent_key_equal<Key, Hash, KeyEqual>;
public:
using hasher = Hash;
using key_equal = typename is_transparent_hash::type;
hash_compare() = default;
hash_compare( hasher hash, key_equal equal ) : my_hasher(hash), my_equal(equal) {}
std::size_t operator()( const Key& key ) const {
return std::size_t(my_hasher(key));
}
bool operator()( const Key& key1, const Key& key2 ) const {
return my_equal(key1, key2);
}
template <typename K, typename = typename std::enable_if<is_transparent_hash::value, K>::type>
std::size_t operator()( const K& key ) const {
return std::size_t(my_hasher(key));
}
template <typename K1, typename K2, typename = typename std::enable_if<is_transparent_hash::value, K1>::type>
bool operator()( const K1& key1, const K2& key2 ) const {
return my_equal(key1, key2);
}
hasher hash_function() const {
return my_hasher;
}
key_equal key_eq() const {
return my_equal;
}
private:
hasher my_hasher;
key_equal my_equal;
}; // class hash_compare
//! hash_compare that is default argument for concurrent_hash_map
template <typename Key>
class tbb_hash_compare {
public:
std::size_t hash( const Key& a ) const { return my_hash_func(a); }
#if defined(_MSC_VER) && _MSC_VER <= 1900
#pragma warning (push)
// MSVC 2015 throws a strange warning: 'std::size_t': forcing value to bool 'true' or 'false'
#pragma warning (disable: 4800)
#endif
bool equal( const Key& a, const Key& b ) const { return my_key_equal(a, b); }
#if defined(_MSC_VER) && _MSC_VER <= 1900
#pragma warning (pop)
#endif
private:
std::hash<Key> my_hash_func;
std::equal_to<Key> my_key_equal;
};
} // namespace d1
#if __TBB_CPP20_CONCEPTS_PRESENT
inline namespace d0 {
template <typename HashCompare, typename Key>
concept hash_compare = std::copy_constructible<HashCompare> &&
requires( const std::remove_reference_t<HashCompare>& hc, const Key& key1, const Key& key2 ) {
{ hc.hash(key1) } -> std::same_as<std::size_t>;
{ hc.equal(key1, key2) } -> std::convertible_to<bool>;
};
} // namespace d0
#endif // __TBB_CPP20_CONCEPTS_PRESENT
} // namespace detail
} // namespace tbb
#if TBB_DEFINE_STD_HASH_SPECIALIZATIONS
namespace std {
template <typename T, typename U>
struct hash<std::pair<T, U>> {
public:
std::size_t operator()( const std::pair<T, U>& p ) const {
return first_hash(p.first) ^ second_hash(p.second);
}
private:
std::hash<T> first_hash;
std::hash<U> second_hash;
}; // struct hash<std::pair>
// Apple clang and MSVC define their own specializations for std::hash<std::basic_string<T, Traits, Alloc>>
#if !(_LIBCPP_VERSION) && !(_CPPLIB_VER)
template <typename CharT, typename Traits, typename Allocator>
struct hash<std::basic_string<CharT, Traits, Allocator>> {
public:
std::size_t operator()( const std::basic_string<CharT, Traits, Allocator>& s ) const {
std::size_t h = 0;
for ( const CharT* c = s.c_str(); *c; ++c ) {
h = h * hash_multiplier ^ char_hash(*c);
}
return h;
}
private:
static constexpr std::size_t hash_multiplier = tbb::detail::select_size_t_constant<2654435769U, 11400714819323198485ULL>::value;
std::hash<CharT> char_hash;
}; // struct hash<std::basic_string>
#endif // !(_LIBCPP_VERSION || _CPPLIB_VER)
} // namespace std
#endif // TBB_DEFINE_STD_HASH_SPECIALIZATIONS
#endif // __TBB_detail__hash_compare_H
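A minimal sketch of the two adaptors above, assuming this header is included as third_party/tbb/detail/_hash_compare.hh; tbb::detail::d1 is an internal namespace, so production code would normally reach these types only through the public containers:

#include <cstddef>
#include <functional>
#include <string>

void hash_compare_sketch() {
    // hash_compare packs a hasher and an equality predicate behind two operator()
    // overloads, as consumed by the unordered containers.
    tbb::detail::d1::hash_compare<std::string, std::hash<std::string>,
                                  std::equal_to<std::string>> hc;
    std::size_t h = hc(std::string("key"));            // hashing overload
    bool eq = hc(std::string("a"), std::string("a"));  // equality overload -> true

    // tbb_hash_compare is the default HashCompare of concurrent_hash_map and
    // exposes named hash()/equal() members instead.
    tbb::detail::d1::tbb_hash_compare<int> thc;
    bool same = thc.equal(1, 1) && thc.hash(7) == std::hash<int>{}(7);

    (void)h; (void)eq; (void)same;
}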

View file

@@ -0,0 +1,42 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_detail__intrusive_list_node_H
#define _TBB_detail__intrusive_list_node_H
namespace tbb {
namespace detail {
namespace d1 {
//! Data structure to be inherited by the types that can form intrusive lists.
/** Intrusive list is formed by means of the member_intrusive_list<T> template class.
Note that type T must derive from intrusive_list_node either publicly or
declare instantiation member_intrusive_list<T> as a friend.
This class implements a limited subset of std::list interface. **/
struct intrusive_list_node {
intrusive_list_node* my_prev_node{};
intrusive_list_node* my_next_node{};
#if TBB_USE_ASSERT
intrusive_list_node() { my_prev_node = my_next_node = this; }
#endif /* TBB_USE_ASSERT */
};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // _TBB_detail__intrusive_list_node_H
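A minimal sketch of the intended use; member_intrusive_list<T> itself lives in a separate internal header, so only the node side is shown and the type name is made up:

// The links are inherited, so enqueuing an element never allocates: the payload
// lives entirely in the derived type.  Under TBB_USE_ASSERT the base constructor
// self-links the node, which lets the list code assert against double insertion.
struct my_list_item : tbb::detail::d1::intrusive_list_node {
    int payload = 0;
};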

397
third_party/tbb/detail/_machine.hh vendored Normal file
View file

@@ -0,0 +1,397 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__machine_H
#define __TBB_detail__machine_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/cstddef"
#ifdef _WIN32
// MISSING #include <intrin.h>
#ifdef __TBBMALLOC_BUILD
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include "libc/nt/accounting.h"
#include "libc/nt/automation.h"
#include "libc/nt/console.h"
#include "libc/nt/debug.h"
#include "libc/nt/dll.h"
#include "libc/nt/enum/keyaccess.h"
#include "libc/nt/enum/regtype.h"
#include "libc/nt/errors.h"
#include "libc/nt/events.h"
#include "libc/nt/files.h"
#include "libc/nt/ipc.h"
#include "libc/nt/memory.h"
#include "libc/nt/paint.h"
#include "libc/nt/process.h"
#include "libc/nt/registry.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/windows.h"
#include "libc/nt/winsock.h" // SwitchToThread()
#endif
#ifdef _MSC_VER
#if __TBB_x86_64 || __TBB_x86_32
#pragma intrinsic(__rdtsc)
#endif
#endif
#endif
#if __TBB_x86_64 || __TBB_x86_32
#include "third_party/intel/immintrin.internal.h" // _mm_pause
#endif
#if (_WIN32)
#include "libc/math.h"
#include "libc/runtime/fenv.h" // _control87
#endif
#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
#include "libc/calls/calls.h"
#include "libc/calls/struct/cpuset.h"
#include "libc/calls/struct/sched_param.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/sched.h" // sched_yield
#else
#include "third_party/libcxx/thread" // std::this_thread::yield()
#endif
namespace tbb {
namespace detail {
inline namespace d0 {
//--------------------------------------------------------------------------------------------------
// Yield implementation
//--------------------------------------------------------------------------------------------------
#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
static inline void yield() {
int err = sched_yield();
__TBB_ASSERT_EX(err == 0, "sched_yield has failed");
}
#elif __TBBMALLOC_BUILD && _WIN32
// Use Windows API for yield in tbbmalloc to avoid dependency on C++ runtime with some implementations.
static inline void yield() {
SwitchToThread();
}
#else
using std::this_thread::yield;
#endif
//--------------------------------------------------------------------------------------------------
// atomic_fence_seq_cst implementation
//--------------------------------------------------------------------------------------------------
static inline void atomic_fence_seq_cst() {
#if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11
unsigned char dummy = 0u;
__asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory");
#else
std::atomic_thread_fence(std::memory_order_seq_cst);
#endif
}
//--------------------------------------------------------------------------------------------------
// Pause implementation
//--------------------------------------------------------------------------------------------------
static inline void machine_pause(int32_t delay) {
#if __TBB_x86_64 || __TBB_x86_32
while (delay-- > 0) { _mm_pause(); }
#elif __ARM_ARCH_7A__ || __aarch64__
while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); }
#else /* Generic */
(void)delay; // suppress without including _template_helpers.h
yield();
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// tbb::detail::log2() implementation
////////////////////////////////////////////////////////////////////////////////////////////////////
// TODO: Use log2p1() function that will be available in C++20 standard
#if defined(__GNUC__) || defined(__clang__)
namespace gnu_builtins {
inline uintptr_t clz(unsigned int x) { return static_cast<uintptr_t>(__builtin_clz(x)); }
inline uintptr_t clz(unsigned long int x) { return static_cast<uintptr_t>(__builtin_clzl(x)); }
inline uintptr_t clz(unsigned long long int x) { return static_cast<uintptr_t>(__builtin_clzll(x)); }
}
#elif defined(_MSC_VER)
#pragma intrinsic(__TBB_W(_BitScanReverse))
namespace msvc_intrinsics {
static inline uintptr_t bit_scan_reverse(uintptr_t i) {
unsigned long j;
__TBB_W(_BitScanReverse)( &j, i );
return j;
}
}
#endif
template <typename T>
constexpr std::uintptr_t number_of_bits() {
return sizeof(T) * CHAR_BIT;
}
// logarithm is the index of the most significant non-zero bit
static inline uintptr_t machine_log2(uintptr_t x) {
#if defined(__GNUC__) || defined(__clang__)
// If P is a power of 2 and x<P, then (P-1)-x == (P-1) XOR x
return (number_of_bits<decltype(x)>() - 1) ^ gnu_builtins::clz(x);
#elif defined(_MSC_VER)
return msvc_intrinsics::bit_scan_reverse(x);
#elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__
uintptr_t j, i = x;
__asm__("bsr %1,%0" : "=r"(j) : "r"(i));
return j;
#elif __powerpc__ || __POWERPC__
#if __TBB_WORDSIZE==8
__asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
return 63 - static_cast<intptr_t>(x);
#else
__asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
return 31 - static_cast<intptr_t>(x);
#endif /*__TBB_WORDSIZE*/
#elif __sparc
uint64_t count;
// one hot encode
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
// count 1's
__asm__ ("popc %1, %0" : "=r"(count) : "r"(x) );
return count - 1;
#else
intptr_t result = 0;
if( sizeof(x) > 4 && (uintptr_t tmp = x >> 32) ) { x = tmp; result += 32; }
if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; }
if( uintptr_t tmp = x >> 8 ) { x = tmp; result += 8; }
if( uintptr_t tmp = x >> 4 ) { x = tmp; result += 4; }
if( uintptr_t tmp = x >> 2 ) { x = tmp; result += 2; }
return (x & 2) ? result + 1 : result;
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// tbb::detail::reverse_bits() implementation
////////////////////////////////////////////////////////////////////////////////////////////////////
#if TBB_USE_CLANG_BITREVERSE_BUILTINS
namespace llvm_builtins {
inline uint8_t builtin_bitreverse(uint8_t x) { return __builtin_bitreverse8 (x); }
inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); }
inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); }
inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); }
}
#else // generic
template<typename T>
struct reverse {
static const T byte_table[256];
};
template<typename T>
const T reverse<T>::byte_table[256] = {
0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};
inline unsigned char reverse_byte(unsigned char src) {
return reverse<unsigned char>::byte_table[src];
}
#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
template<typename T>
T machine_reverse_bits(T src) {
#if TBB_USE_CLANG_BITREVERSE_BUILTINS
return builtin_bitreverse(fixed_width_cast(src));
#else /* Generic */
T dst;
unsigned char *original = reinterpret_cast<unsigned char *>(&src);
unsigned char *reversed = reinterpret_cast<unsigned char *>(&dst);
for ( int i = sizeof(T) - 1; i >= 0; i-- ) {
reversed[i] = reverse_byte( original[sizeof(T) - i - 1] );
}
return dst;
#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
}
} // inline namespace d0
namespace d1 {
#if (_WIN32)
// API to retrieve/update FPU control setting
#define __TBB_CPU_CTL_ENV_PRESENT 1
struct cpu_ctl_env {
unsigned int x87cw{};
#if (__TBB_x86_64)
// Changing the infinity mode or the floating-point precision is not supported on x64.
// The attempt causes an assertion. See
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2
static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC;
#else
static constexpr unsigned int X87CW_CONTROL_MASK = ~0U;
#endif
#if (__TBB_x86_32 || __TBB_x86_64)
unsigned int mxcsr{};
static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */
#endif
bool operator!=( const cpu_ctl_env& ctl ) const {
return
#if (__TBB_x86_32 || __TBB_x86_64)
mxcsr != ctl.mxcsr ||
#endif
x87cw != ctl.x87cw;
}
void get_env() {
x87cw = _control87(0, 0);
#if (__TBB_x86_32 || __TBB_x86_64)
mxcsr = _mm_getcsr();
#endif
}
void set_env() const {
_control87(x87cw, X87CW_CONTROL_MASK);
#if (__TBB_x86_32 || __TBB_x86_64)
_mm_setcsr(mxcsr & MXCSR_CONTROL_MASK);
#endif
}
};
#elif (__TBB_x86_32 || __TBB_x86_64)
// API to retrieve/update FPU control setting
#define __TBB_CPU_CTL_ENV_PRESENT 1
struct cpu_ctl_env {
int mxcsr{};
short x87cw{};
static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */
bool operator!=(const cpu_ctl_env& ctl) const {
return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw;
}
void get_env() {
__asm__ __volatile__(
"stmxcsr %0\n\t"
"fstcw %1"
: "=m"(mxcsr), "=m"(x87cw)
);
mxcsr &= MXCSR_CONTROL_MASK;
}
void set_env() const {
__asm__ __volatile__(
"ldmxcsr %0\n\t"
"fldcw %1"
: : "m"(mxcsr), "m"(x87cw)
);
}
};
#endif
} // namespace d1
} // namespace detail
} // namespace tbb
#if !__TBB_CPU_CTL_ENV_PRESENT
#include "libc/runtime/fenv.h"
#include "third_party/libcxx/cstring"
namespace tbb {
namespace detail {
namespace r1 {
void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
} // namespace r1
namespace d1 {
class cpu_ctl_env {
fenv_t *my_fenv_ptr;
public:
cpu_ctl_env() : my_fenv_ptr(nullptr) {}
~cpu_ctl_env() {
if ( my_fenv_ptr )
r1::cache_aligned_deallocate( (void*)my_fenv_ptr );
}
// It is possible not to copy memory but just to copy pointers but the following issues should be addressed:
// 1. The arena lifetime and the context lifetime are independent;
// 2. The user is allowed to recapture different FPU settings to context so 'current FPU settings' inside
// dispatch loop may become invalid.
// But do we really want to improve the fenv implementation? It seems to be better to replace the fenv implementation
// with a platform specific implementation.
cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(nullptr) {
*this = src;
}
cpu_ctl_env& operator=( const cpu_ctl_env &src ) {
__TBB_ASSERT( src.my_fenv_ptr, nullptr);
if ( !my_fenv_ptr )
my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
*my_fenv_ptr = *src.my_fenv_ptr;
return *this;
}
bool operator!=( const cpu_ctl_env &ctl ) const {
__TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
__TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." );
return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) );
}
void get_env () {
if ( !my_fenv_ptr )
my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
fegetenv( my_fenv_ptr );
}
const cpu_ctl_env& set_env () const {
__TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
fesetenv( my_fenv_ptr );
return *this;
}
};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif /* !__TBB_CPU_CTL_ENV_PRESENT */
#endif // __TBB_detail__machine_H
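The helpers above are easiest to read from their contracts: machine_log2 returns the zero-based index of the most significant set bit, and machine_reverse_bits mirrors the bit order of its argument. A minimal sketch (not part of the vendored header), assuming a translation unit that includes it:

#include <cassert>
#include <cstdint>

// Illustrative smoke test for the bit utilities defined above.
static void machine_bits_smoke_test() {
    using namespace tbb::detail;
    assert(machine_log2(1u) == 0);        // bit 0 is the highest set bit
    assert(machine_log2(8u) == 3);        // 0b1000 -> index 3
    assert(machine_log2(0x8000u) == 15);
    // Reversing a single byte maps bit 0 to bit 7.
    assert(machine_reverse_bits<std::uint8_t>(0x01) == 0x80);
}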

62
third_party/tbb/detail/_mutex_common.hh vendored Normal file

@@ -0,0 +1,62 @@
// clang-format off
/*
Copyright (c) 2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__mutex_common_H
#define __TBB_detail__mutex_common_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_utils.hh"
#if __TBB_CPP20_CONCEPTS_PRESENT
// MISSING #include <concepts>
namespace tbb {
namespace detail {
inline namespace d0 {
template <typename Lock, typename Mutex>
concept mutex_scoped_lock = std::default_initializable<Lock> &&
std::constructible_from<Lock, Mutex&> &&
requires( Lock& lock, Mutex& mutex ) {
lock.acquire(mutex);
{ lock.try_acquire(mutex) } -> adaptive_same_as<bool>;
lock.release();
};
template <typename Lock, typename Mutex>
concept rw_mutex_scoped_lock = mutex_scoped_lock<Lock, Mutex> &&
std::constructible_from<Lock, Mutex&, bool> &&
requires( Lock& lock, Mutex& mutex ) {
lock.acquire(mutex, false);
{ lock.try_acquire(mutex, false) } -> adaptive_same_as<bool>;
{ lock.upgrade_to_writer() } -> adaptive_same_as<bool>;
{ lock.downgrade_to_reader() } -> adaptive_same_as<bool>;
};
template <typename Mutex>
concept scoped_lockable = mutex_scoped_lock<typename Mutex::scoped_lock, Mutex>;
template <typename Mutex>
concept rw_scoped_lockable = scoped_lockable<Mutex> &&
rw_mutex_scoped_lock<typename Mutex::scoped_lock, Mutex>;
} // namespace d0
} // namespace detail
} // namespace tbb
#endif // __TBB_CPP20_CONCEPTS_PRESENT
#endif // __TBB_detail__mutex_common_H
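Under __TBB_CPP20_CONCEPTS_PRESENT, any mutex exposing the conventional scoped_lock shape models scoped_lockable. A hedged sketch, not part of the upstream header; toy_mutex is hypothetical and only demonstrates the required members, it does not actually synchronize anything:

// Hypothetical mutex shape accepted by tbb::detail::scoped_lockable.
class toy_mutex {
public:
    class scoped_lock {
        toy_mutex* held = nullptr;
    public:
        scoped_lock() = default;
        explicit scoped_lock(toy_mutex& m) { acquire(m); }
        void acquire(toy_mutex& m) { held = &m; }
        bool try_acquire(toy_mutex& m) { held = &m; return true; }
        void release() { held = nullptr; }
    };
};
#if __TBB_CPP20_CONCEPTS_PRESENT
static_assert(tbb::detail::scoped_lockable<toy_mutex>);
#endif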


@@ -0,0 +1,25 @@
// clang-format off
/*
Copyright (c) 2020-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// All public entities of the OneAPI Spec are available under oneapi namespace
// Define tbb namespace first as it might not be known yet
namespace tbb {}
namespace oneapi {
namespace tbb = ::tbb;
}
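The alias makes oneapi::tbb and ::tbb name the same namespace, so every entity is reachable under both spellings. A small illustration, with a hypothetical type added only for the check:

#include <type_traits>
namespace tbb { struct example_tag {}; }   // hypothetical, for demonstration only
static_assert(std::is_same_v<tbb::example_tag, oneapi::tbb::example_tag>,
              "same entity under both names");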

163
third_party/tbb/detail/_node_handle.hh vendored Normal file

@@ -0,0 +1,163 @@
// clang-format off
/*
Copyright (c) 2019-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__node_handle_H
#define __TBB_detail__node_handle_H
#include "third_party/tbb/detail/_allocator_traits.hh"
#include "third_party/tbb/detail/_assert.hh"
namespace tbb {
namespace detail {
namespace d1 {
// A structure to access private node handle methods in internal TBB classes
// Regular friend declaration is not convenient because classes which use node handle
// can be placed in the different versioning namespaces.
struct node_handle_accessor {
template <typename NodeHandleType>
static typename NodeHandleType::node* get_node_ptr( NodeHandleType& nh ) {
return nh.get_node_ptr();
}
template <typename NodeHandleType>
static NodeHandleType construct( typename NodeHandleType::node* node_ptr ) {
return NodeHandleType{node_ptr};
}
template <typename NodeHandleType>
static void deactivate( NodeHandleType& nh ) {
nh.deactivate();
}
}; // struct node_handle_accessor
template<typename Value, typename Node, typename Allocator>
class node_handle_base {
public:
using allocator_type = Allocator;
protected:
using node = Node;
using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
public:
node_handle_base() : my_node(nullptr), my_allocator() {}
node_handle_base(node_handle_base&& nh) : my_node(nh.my_node),
my_allocator(std::move(nh.my_allocator)) {
nh.my_node = nullptr;
}
__TBB_nodiscard bool empty() const { return my_node == nullptr; }
explicit operator bool() const { return my_node != nullptr; }
~node_handle_base() { internal_destroy(); }
node_handle_base& operator=( node_handle_base&& nh ) {
internal_destroy();
my_node = nh.my_node;
move_assign_allocators(my_allocator, nh.my_allocator);
nh.deactivate();
return *this;
}
void swap( node_handle_base& nh ) {
using std::swap;
swap(my_node, nh.my_node);
swap_allocators(my_allocator, nh.my_allocator);
}
allocator_type get_allocator() const {
return my_allocator;
}
protected:
node_handle_base( node* n ) : my_node(n) {}
void internal_destroy() {
if(my_node != nullptr) {
allocator_traits_type::destroy(my_allocator, my_node->storage());
typename allocator_traits_type::template rebind_alloc<node> node_allocator(my_allocator);
node_allocator.deallocate(my_node, 1);
}
}
node* get_node_ptr() { return my_node; }
void deactivate() { my_node = nullptr; }
node* my_node;
allocator_type my_allocator;
};
// node handle for maps
template<typename Key, typename Value, typename Node, typename Allocator>
class node_handle : public node_handle_base<Value, Node, Allocator> {
using base_type = node_handle_base<Value, Node, Allocator>;
public:
using key_type = Key;
using mapped_type = typename Value::second_type;
using allocator_type = typename base_type::allocator_type;
node_handle() = default;
key_type& key() const {
__TBB_ASSERT(!this->empty(), "Cannot get key from the empty node_type object");
return *const_cast<key_type*>(&(this->my_node->value().first));
}
mapped_type& mapped() const {
__TBB_ASSERT(!this->empty(), "Cannot get mapped value from the empty node_type object");
return this->my_node->value().second;
}
private:
friend struct node_handle_accessor;
node_handle( typename base_type::node* n ) : base_type(n) {}
}; // class node_handle
// node handle for sets
template<typename Key, typename Node, typename Allocator>
class node_handle<Key, Key, Node, Allocator> : public node_handle_base<Key, Node, Allocator> {
using base_type = node_handle_base<Key, Node, Allocator>;
public:
using value_type = Key;
using allocator_type = typename base_type::allocator_type;
node_handle() = default;
value_type& value() const {
__TBB_ASSERT(!this->empty(), "Cannot get value from the empty node_type object");
return *const_cast<value_type*>(&(this->my_node->value()));
}
private:
friend struct node_handle_accessor;
node_handle( typename base_type::node* n ) : base_type(n) {}
}; // class node_handle
template <typename Key, typename Value, typename Node, typename Allocator>
void swap( node_handle<Key, Value, Node, Allocator>& lhs,
node_handle<Key, Value, Node, Allocator>& rhs ) {
return lhs.swap(rhs);
}
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__node_handle_H
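These handles surface in the containers as node_type, letting an element move between containers without being copied. A hedged sketch of that pattern; the container name, header path, and unsafe_extract member reflect the public oneTBB interface and are assumptions here:

#include "third_party/tbb/concurrent_unordered_map.hh"  // path assumed from this import
#include <string>
#include <utility>

// Move one entry between maps via its node handle (no element copy).
static void move_entry(tbb::concurrent_unordered_map<int, std::string>& from,
                       tbb::concurrent_unordered_map<int, std::string>& to,
                       int key) {
    auto nh = from.unsafe_extract(key);    // map node handle: key()/mapped()
    if (!nh.empty()) {
        nh.mapped() += " (moved)";
        to.insert(std::move(nh));
    }
}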


@@ -0,0 +1,456 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_parallel_filters_H
#define __TBB_parallel_filters_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_task.hh"
#include "third_party/tbb/detail/_pipeline_filters_deduction.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/cstdint"
namespace tbb {
namespace detail {
namespace d1 {
class base_filter;
}
namespace r1 {
TBB_EXPORT void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&);
class pipeline;
class stage_task;
class input_buffer;
}
namespace d1 {
class filter_node;
//! A stage in a pipeline.
/** @ingroup algorithms */
class base_filter{
private:
//! Value used to mark "not in pipeline"
static base_filter* not_in_pipeline() { return reinterpret_cast<base_filter*>(std::intptr_t(-1)); }
public:
//! The lowest bit 0 is for parallel vs serial
static constexpr unsigned int filter_is_serial = 0x1;
//! 2nd bit distinguishes ordered vs unordered filters.
static constexpr unsigned int filter_is_out_of_order = 0x1<<1;
//! 3rd bit marks input filters emitting small objects
static constexpr unsigned int filter_may_emit_null = 0x1<<2;
base_filter(const base_filter&) = delete;
base_filter& operator=(const base_filter&) = delete;
protected:
explicit base_filter( unsigned int m ) :
next_filter_in_pipeline(not_in_pipeline()),
my_input_buffer(nullptr),
my_filter_mode(m),
my_pipeline(nullptr)
{}
// signal end-of-input for concrete_filters
void set_end_of_input() {
r1::set_end_of_input(*this);
}
public:
//! True if filter is serial.
bool is_serial() const {
return bool( my_filter_mode & filter_is_serial );
}
//! True if filter must receive stream in order.
bool is_ordered() const {
return (my_filter_mode & filter_is_serial) && !(my_filter_mode & filter_is_out_of_order);
}
//! true if an input filter can emit null
bool object_may_be_null() {
return ( my_filter_mode & filter_may_emit_null ) == filter_may_emit_null;
}
//! Operate on an item from the input stream, and return item for output stream.
/** Returns nullptr if filter is a sink. */
virtual void* operator()( void* item ) = 0;
//! Destroy filter.
virtual ~base_filter() {};
//! Destroys item if pipeline was cancelled.
/** Required to prevent memory leaks.
Note it can be called concurrently even for serial filters.*/
virtual void finalize( void* /*item*/ ) {}
private:
//! Pointer to next filter in the pipeline.
base_filter* next_filter_in_pipeline;
//! Buffer for incoming tokens, or nullptr if not required.
/** The buffer is required if the filter is serial. */
r1::input_buffer* my_input_buffer;
friend class r1::stage_task;
friend class r1::pipeline;
friend void r1::set_end_of_input(d1::base_filter&);
//! Storage for filter mode and dynamically checked implementation version.
const unsigned int my_filter_mode;
//! Pointer to the pipeline.
r1::pipeline* my_pipeline;
};
template<typename Body, typename InputType, typename OutputType >
class concrete_filter;
//! input_filter control to signal end-of-input for parallel_pipeline
class flow_control {
bool is_pipeline_stopped = false;
flow_control() = default;
template<typename Body, typename InputType, typename OutputType > friend class concrete_filter;
template<typename Output>
__TBB_requires(std::copyable<Output>)
friend class input_node;
public:
void stop() { is_pipeline_stopped = true; }
};
// Emulate std::is_trivially_copyable (false positives not allowed, false negatives suboptimal but safe).
#if __TBB_CPP11_TYPE_PROPERTIES_PRESENT
template<typename T> using tbb_trivially_copyable = std::is_trivially_copyable<T>;
#else
template<typename T> struct tbb_trivially_copyable { enum { value = false }; };
template<typename T> struct tbb_trivially_copyable < T* > { enum { value = true }; };
template<> struct tbb_trivially_copyable < bool > { enum { value = true }; };
template<> struct tbb_trivially_copyable < char > { enum { value = true }; };
template<> struct tbb_trivially_copyable < signed char > { enum { value = true }; };
template<> struct tbb_trivially_copyable <unsigned char > { enum { value = true }; };
template<> struct tbb_trivially_copyable < short > { enum { value = true }; };
template<> struct tbb_trivially_copyable <unsigned short > { enum { value = true }; };
template<> struct tbb_trivially_copyable < int > { enum { value = true }; };
template<> struct tbb_trivially_copyable <unsigned int > { enum { value = true }; };
template<> struct tbb_trivially_copyable < long > { enum { value = true }; };
template<> struct tbb_trivially_copyable <unsigned long > { enum { value = true }; };
template<> struct tbb_trivially_copyable < long long> { enum { value = true }; };
template<> struct tbb_trivially_copyable <unsigned long long> { enum { value = true }; };
template<> struct tbb_trivially_copyable < float > { enum { value = true }; };
template<> struct tbb_trivially_copyable < double > { enum { value = true }; };
template<> struct tbb_trivially_copyable < long double > { enum { value = true }; };
#endif // __TBB_CPP11_TYPE_PROPERTIES_PRESENT
template<typename T>
struct use_allocator {
static constexpr bool value = sizeof(T) > sizeof(void *) || !tbb_trivially_copyable<T>::value;
};
// A helper class to customize how a type is passed between filters.
// Usage: token_helper<T, use_allocator<T>::value>
template<typename T, bool Allocate> struct token_helper;
// using tbb_allocator
template<typename T>
struct token_helper<T, true> {
using pointer = T*;
using value_type = T;
static pointer create_token(value_type && source) {
return new (r1::allocate_memory(sizeof(T))) T(std::move(source));
}
static value_type & token(pointer & t) { return *t; }
static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast<void *>(ref); }
static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast<pointer>(ref); }
static void destroy_token(pointer token) {
token->~value_type();
r1::deallocate_memory(token);
}
};
// pointer specialization
template<typename T>
struct token_helper<T*, false> {
using pointer = T*;
using value_type = T*;
static pointer create_token(const value_type & source) { return source; }
static value_type & token(pointer & t) { return t; }
static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast<void *>(ref); }
static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast<pointer>(ref); }
static void destroy_token( pointer /*token*/) {}
};
// converting type to and from void*, passing objects directly
template<typename T>
struct token_helper<T, false> {
typedef union {
T actual_value;
void * void_overlay;
} type_to_void_ptr_map;
using pointer = T; // not really a pointer in this case.
using value_type = T;
static pointer create_token(const value_type & source) { return source; }
static value_type & token(pointer & t) { return t; }
static void * cast_to_void_ptr(pointer ref) {
type_to_void_ptr_map mymap;
mymap.void_overlay = nullptr;
mymap.actual_value = ref;
return mymap.void_overlay;
}
static pointer cast_from_void_ptr(void * ref) {
type_to_void_ptr_map mymap;
mymap.void_overlay = ref;
return mymap.actual_value;
}
static void destroy_token( pointer /*token*/) {}
};
// intermediate
template<typename InputType, typename OutputType, typename Body>
class concrete_filter: public base_filter {
const Body& my_body;
using input_helper = token_helper<InputType, use_allocator<InputType >::value>;
using input_pointer = typename input_helper::pointer;
using output_helper = token_helper<OutputType, use_allocator<OutputType>::value>;
using output_pointer = typename output_helper::pointer;
void* operator()(void* input) override {
input_pointer temp_input = input_helper::cast_from_void_ptr(input);
output_pointer temp_output = output_helper::create_token(tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input))));
input_helper::destroy_token(temp_input);
return output_helper::cast_to_void_ptr(temp_output);
}
void finalize(void * input) override {
input_pointer temp_input = input_helper::cast_from_void_ptr(input);
input_helper::destroy_token(temp_input);
}
public:
concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {}
};
// input
template<typename OutputType, typename Body>
class concrete_filter<void, OutputType, Body>: public base_filter {
const Body& my_body;
using output_helper = token_helper<OutputType, use_allocator<OutputType>::value>;
using output_pointer = typename output_helper::pointer;
void* operator()(void*) override {
flow_control control;
output_pointer temp_output = output_helper::create_token(my_body(control));
if(control.is_pipeline_stopped) {
output_helper::destroy_token(temp_output);
set_end_of_input();
return nullptr;
}
return output_helper::cast_to_void_ptr(temp_output);
}
public:
concrete_filter(unsigned int m, const Body& body) :
base_filter(m | filter_may_emit_null),
my_body(body)
{}
};
// output
template<typename InputType, typename Body>
class concrete_filter<InputType, void, Body>: public base_filter {
const Body& my_body;
using input_helper = token_helper<InputType, use_allocator<InputType >::value>;
using input_pointer = typename input_helper::pointer;
void* operator()(void* input) override {
input_pointer temp_input = input_helper::cast_from_void_ptr(input);
tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input)));
input_helper::destroy_token(temp_input);
return nullptr;
}
void finalize(void* input) override {
input_pointer temp_input = input_helper::cast_from_void_ptr(input);
input_helper::destroy_token(temp_input);
}
public:
concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {}
};
template<typename Body>
class concrete_filter<void, void, Body>: public base_filter {
const Body& my_body;
void* operator()(void*) override {
flow_control control;
my_body(control);
void* output = control.is_pipeline_stopped ? nullptr : (void*)(std::intptr_t)-1;
return output;
}
public:
concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {}
};
class filter_node_ptr {
filter_node * my_node;
public:
filter_node_ptr() : my_node(nullptr) {}
filter_node_ptr(filter_node *);
~filter_node_ptr();
filter_node_ptr(const filter_node_ptr &);
filter_node_ptr(filter_node_ptr &&);
void operator=(filter_node *);
void operator=(const filter_node_ptr &);
void operator=(filter_node_ptr &&);
filter_node& operator*() const;
operator bool() const;
};
//! Abstract base class that represents a node in a parse tree underlying a filter class.
/** These nodes are always heap-allocated and can be shared by filter objects. */
class filter_node {
/** Count must be atomic because it is hidden state for user, but might be shared by threads. */
std::atomic<std::intptr_t> ref_count;
public:
filter_node_ptr left;
filter_node_ptr right;
protected:
filter_node() : ref_count(0), left(nullptr), right(nullptr) {
#ifdef __TBB_TEST_FILTER_NODE_COUNT
++(__TBB_TEST_FILTER_NODE_COUNT);
#endif
}
public:
filter_node(const filter_node_ptr& x, const filter_node_ptr& y) : filter_node(){
left = x;
right = y;
}
filter_node(const filter_node&) = delete;
filter_node& operator=(const filter_node&) = delete;
//! Add concrete_filter to pipeline
virtual base_filter* create_filter() const {
__TBB_ASSERT(false, "method of non-leaf was called");
return nullptr;
}
//! Increment reference count
void add_ref() { ref_count.fetch_add(1, std::memory_order_relaxed); }
//! Decrement reference count and delete if it becomes zero.
void remove_ref() {
__TBB_ASSERT(ref_count>0,"ref_count underflow");
if( ref_count.fetch_sub(1, std::memory_order_relaxed) == 1 ) {
this->~filter_node();
r1::deallocate_memory(this);
}
}
virtual ~filter_node() {
#ifdef __TBB_TEST_FILTER_NODE_COUNT
--(__TBB_TEST_FILTER_NODE_COUNT);
#endif
}
};
inline filter_node_ptr::filter_node_ptr(filter_node * nd) : my_node(nd) {
if (my_node) {
my_node->add_ref();
}
}
inline filter_node_ptr::~filter_node_ptr() {
if (my_node) {
my_node->remove_ref();
}
}
inline filter_node_ptr::filter_node_ptr(const filter_node_ptr & rhs) : my_node(rhs.my_node) {
if (my_node) {
my_node->add_ref();
}
}
inline filter_node_ptr::filter_node_ptr(filter_node_ptr && rhs) : my_node(rhs.my_node) {
rhs.my_node = nullptr;
}
inline void filter_node_ptr::operator=(filter_node * rhs) {
// Order of operations below carefully chosen so that reference counts remain correct
// in unlikely event that remove_ref throws exception.
filter_node* old = my_node;
my_node = rhs;
if (my_node) {
my_node->add_ref();
}
if (old) {
old->remove_ref();
}
}
inline void filter_node_ptr::operator=(const filter_node_ptr & rhs) {
*this = rhs.my_node;
}
inline void filter_node_ptr::operator=(filter_node_ptr && rhs) {
filter_node* old = my_node;
my_node = rhs.my_node;
rhs.my_node = nullptr;
if (old) {
old->remove_ref();
}
}
inline filter_node& filter_node_ptr::operator*() const{
__TBB_ASSERT(my_node,"nullptr node is used");
return *my_node;
}
inline filter_node_ptr::operator bool() const {
return my_node != nullptr;
}
//! Node in parse tree representing result of make_filter.
template<typename InputType, typename OutputType, typename Body>
class filter_node_leaf: public filter_node {
const unsigned int my_mode;
const Body my_body;
base_filter* create_filter() const override {
return new(r1::allocate_memory(sizeof(concrete_filter<InputType, OutputType, Body>))) concrete_filter<InputType, OutputType, Body>(my_mode,my_body);
}
public:
filter_node_leaf( unsigned int m, const Body& b ) : my_mode(m), my_body(b) {}
};
template <typename Body, typename Input = typename filter_body_types<decltype(&Body::operator())>::input_type>
using filter_input = typename std::conditional<std::is_same<Input, flow_control>::value, void, Input>::type;
template <typename Body>
using filter_output = typename filter_body_types<decltype(&Body::operator())>::output_type;
} // namespace d1
} // namespace detail
} // namespace tbb
#endif /* __TBB_parallel_filters_H */
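These classes back the public tbb::make_filter / tbb::parallel_pipeline interface: the input stage receives a flow_control to signal end of input, and operator& chains stages. A minimal sketch, assuming the parallel_pipeline.hh header from this same import:

#include "third_party/tbb/parallel_pipeline.hh"  // header name assumed

// Sum of squares of 1..10 computed through a three-stage pipeline.
static long sum_of_squares() {
    long total = 0;
    int next = 1;
    tbb::parallel_pipeline(/*max_number_of_live_tokens=*/8,
        tbb::make_filter<void, int>(tbb::filter_mode::serial_in_order,
            [&](tbb::flow_control& fc) -> int {
                if (next > 10) { fc.stop(); return 0; }   // signal end of input
                return next++;
            }) &
        tbb::make_filter<int, long>(tbb::filter_mode::parallel,
            [](int x) -> long { return long(x) * x; }) &  // stateless, runs in parallel
        tbb::make_filter<long, void>(tbb::filter_mode::serial_in_order,
            [&](long v) { total += v; }));                // serial sink
    return total;  // 385
}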


@@ -0,0 +1,47 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__pipeline_filters_deduction_H
#define __TBB__pipeline_filters_deduction_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/libcxx/utility"
#include "third_party/libcxx/type_traits"
namespace tbb {
namespace detail {
namespace d1 {
template <typename Input, typename Output>
struct declare_filter_types {
using input_type = typename std::remove_const<typename std::remove_reference<Input>::type>::type;
using output_type = typename std::remove_const<typename std::remove_reference<Output>::type>::type;
};
template <typename T> struct filter_body_types;
template <typename T, typename Input, typename Output>
struct filter_body_types<Output(T::*)(Input) const> : declare_filter_types<Input, Output> {};
template <typename T, typename Input, typename Output>
struct filter_body_types<Output(T::*)(Input)> : declare_filter_types<Input, Output> {};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB__pipeline_filters_deduction_H
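In effect, filter_body_types strips references and const from the call operator's argument and result; filter_input (in the previous header) additionally maps a flow_control& argument to void. A short check with a hypothetical Squarer body:

#include <string>
#include <type_traits>

// Hypothetical filter body with a single const call operator.
struct Squarer {
    long operator()(const std::string& s) const { return long(s.size()) * long(s.size()); }
};

using deduced = tbb::detail::d1::filter_body_types<decltype(&Squarer::operator())>;
static_assert(std::is_same<deduced::input_type, std::string>::value, "const& stripped");
static_assert(std::is_same<deduced::output_type, long>::value, "result type kept");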

131
third_party/tbb/detail/_range_common.hh vendored Normal file

@@ -0,0 +1,131 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__range_common_H
#define __TBB_detail__range_common_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_utils.hh"
#if __TBB_CPP20_CONCEPTS_PRESENT
// MISSING #include <concepts>
#endif
#include "third_party/libcxx/iterator"
namespace tbb {
namespace detail {
inline namespace d0 {
//! Dummy type that distinguishes splitting constructor from copy constructor.
/**
* See description of parallel_for and parallel_reduce for example usages.
* @ingroup algorithms
*/
class split {};
//! Type enables transmission of splitting proportion from partitioners to range objects
/**
* In order to make use of such facility Range objects must implement
* splitting constructor with this type passed.
*/
class proportional_split : no_assign {
public:
proportional_split(size_t _left = 1, size_t _right = 1) : my_left(_left), my_right(_right) { }
size_t left() const { return my_left; }
size_t right() const { return my_right; }
// used when range does not support proportional split
explicit operator split() const { return split(); }
private:
size_t my_left, my_right;
};
template <typename Range, typename = void>
struct range_split_object_provider {
template <typename PartitionerSplitType>
static split get( PartitionerSplitType& ) { return split(); }
};
template <typename Range>
struct range_split_object_provider<Range,
typename std::enable_if<std::is_constructible<Range, Range&, proportional_split&>::value>::type> {
template <typename PartitionerSplitType>
static PartitionerSplitType& get( PartitionerSplitType& split_obj ) { return split_obj; }
};
template <typename Range, typename PartitionerSplitType>
auto get_range_split_object( PartitionerSplitType& split_obj )
-> decltype(range_split_object_provider<Range>::get(split_obj)) {
return range_split_object_provider<Range>::get(split_obj);
}
template <typename Range>
using range_iterator_type = decltype(std::begin(std::declval<Range&>()));
#if __TBB_CPP20_CONCEPTS_PRESENT
template <typename Iterator>
using iterator_reference_type = typename std::iterator_traits<Iterator>::reference;
template <typename Range>
using range_reference_type = iterator_reference_type<range_iterator_type<Range>>;
template <typename Value>
concept blocked_range_value = std::copyable<Value> &&
requires( const std::remove_reference_t<Value>& lhs, const std::remove_reference_t<Value>& rhs ) {
{ lhs < rhs } -> relaxed_convertible_to<bool>;
{ lhs - rhs } -> std::convertible_to<std::size_t>;
{ lhs + (rhs - lhs) } -> std::convertible_to<Value>;
};
template <typename T>
concept splittable = std::constructible_from<T, T&, tbb::detail::split>;
template <typename Range>
concept tbb_range = std::copy_constructible<Range> &&
splittable<Range> &&
requires( const std::remove_reference_t<Range>& range ) {
{ range.empty() } -> relaxed_convertible_to<bool>;
{ range.is_divisible() } -> relaxed_convertible_to<bool>;
};
template <typename Iterator>
constexpr bool iterator_concept_helper( std::input_iterator_tag ) {
return std::input_iterator<Iterator>;
}
template <typename Iterator>
constexpr bool iterator_concept_helper( std::random_access_iterator_tag ) {
return std::random_access_iterator<Iterator>;
}
template <typename Iterator, typename IteratorTag>
concept iterator_satisfies = requires (IteratorTag tag) {
requires iterator_concept_helper<Iterator>(tag);
};
template <typename Sequence, typename IteratorTag>
concept container_based_sequence = requires( Sequence& seq ) {
{ std::begin(seq) } -> iterator_satisfies<IteratorTag>;
{ std::end(seq) } -> iterator_satisfies<IteratorTag>;
};
#endif // __TBB_CPP20_CONCEPTS_PRESENT
} // namespace d0
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__range_common_H
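A range type plugs into these conventions by providing a splitting constructor that takes the split tag plus empty() and is_divisible(). A hedged sketch, not part of the header; the detail-qualified tbb::detail::split is used because that is what this file declares (upstream exposes it publicly as tbb::split):

#include <cstddef>

// Hypothetical half-open index range honoring the splitting convention above.
class index_range {
    std::size_t my_begin, my_end;
public:
    index_range(std::size_t b, std::size_t e) : my_begin(b), my_end(e) {}
    // Splitting constructor: takes the upper half away from r.
    index_range(index_range& r, tbb::detail::split)
        : my_begin((r.my_begin + r.my_end) / 2), my_end(r.my_end) {
        r.my_end = my_begin;
    }
    bool empty() const { return my_begin == my_end; }
    bool is_divisible() const { return my_end - my_begin > 1; }
    std::size_t begin() const { return my_begin; }
    std::size_t end() const { return my_end; }
};
#if __TBB_CPP20_CONCEPTS_PRESENT
static_assert(tbb::detail::tbb_range<index_range>);
#endif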

163
third_party/tbb/detail/_rtm_mutex.hh vendored Normal file

@@ -0,0 +1,163 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__rtm_mutex_impl_H
#define __TBB__rtm_mutex_impl_H
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/spin_mutex.hh"
#include "third_party/tbb/profiling.hh"
namespace tbb {
namespace detail {
namespace r1 {
struct rtm_mutex_impl;
}
namespace d1 {
#if _MSC_VER && !defined(__INTEL_COMPILER)
// Suppress warning: structure was padded due to alignment specifier
#pragma warning (push)
#pragma warning (disable: 4324)
#endif
/** A rtm_mutex is an speculation-enabled spin mutex.
It should be used for locking short critical sections where the lock is
contended but the data it protects are not. If zero-initialized, the
mutex is considered unheld.
@ingroup synchronization */
class alignas(max_nfs_size) rtm_mutex : private spin_mutex {
private:
enum class rtm_state {
rtm_none,
rtm_transacting,
rtm_real
};
public:
//! Constructors
rtm_mutex() noexcept {
create_itt_sync(this, "tbb::speculative_spin_mutex", "");
}
//! Destructor
~rtm_mutex() = default;
//! Represents acquisition of a mutex.
class scoped_lock {
public:
friend class rtm_mutex;
//! Construct lock that has not acquired a mutex.
/** Equivalent to zero-initialization of *this. */
constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) {}
//! Acquire lock on given mutex.
scoped_lock(rtm_mutex& m) : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) {
acquire(m);
}
//! Release lock (if lock is held).
~scoped_lock() {
if(m_transaction_state != rtm_state::rtm_none) {
release();
}
}
//! No Copy
scoped_lock(const scoped_lock&) = delete;
scoped_lock& operator=(const scoped_lock&) = delete;
//! Acquire lock on given mutex.
void acquire(rtm_mutex& m);
//! Try acquire lock on given mutex.
bool try_acquire(rtm_mutex& m);
//! Release lock
void release();
private:
rtm_mutex* m_mutex;
rtm_state m_transaction_state;
friend r1::rtm_mutex_impl;
};
//! Mutex traits
static constexpr bool is_rw_mutex = false;
static constexpr bool is_recursive_mutex = false;
static constexpr bool is_fair_mutex = false;
private:
friend r1::rtm_mutex_impl;
}; // end of rtm_mutex
} // namespace d1
namespace r1 {
//! Internal acquire lock.
// only_speculate == true if we're doing a try_lock, else false.
TBB_EXPORT void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&, bool only_speculate = false);
//! Internal try_acquire lock.
TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&);
//! Internal release lock.
TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock&);
} // namespace r1
namespace d1 {
//! Acquire lock on given mutex.
inline void rtm_mutex::scoped_lock::acquire(rtm_mutex& m) {
__TBB_ASSERT(!m_mutex, "lock is already acquired");
r1::acquire(m, *this);
}
//! Try acquire lock on given mutex.
inline bool rtm_mutex::scoped_lock::try_acquire(rtm_mutex& m) {
__TBB_ASSERT(!m_mutex, "lock is already acquired");
return r1::try_acquire(m, *this);
}
//! Release lock
inline void rtm_mutex::scoped_lock::release() {
__TBB_ASSERT(m_mutex, "lock is not acquired");
__TBB_ASSERT(m_transaction_state != rtm_state::rtm_none, "lock is not acquired");
return r1::release(*this);
}
#if _MSC_VER && !defined(__INTEL_COMPILER)
#pragma warning (pop) // 4324 warning
#endif
#if TBB_USE_PROFILING_TOOLS
inline void set_name(rtm_mutex& obj, const char* name) {
itt_set_sync_name(&obj, name);
}
#if (_WIN32||_WIN64)
inline void set_name(rtm_mutex& obj, const wchar_t* name) {
itt_set_sync_name(&obj, name);
}
#endif // WIN
#else
inline void set_name(rtm_mutex&, const char*) {}
#if (_WIN32||_WIN64)
inline void set_name(rtm_mutex&, const wchar_t*) {}
#endif // WIN
#endif
} // namespace d1
} // namespace detail
} // namespace tbb
#endif /* __TBB__rtm_mutex_impl_H */
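Client code uses the speculative mutex through its scoped_lock, exactly like spin_mutex; the transaction either commits on release or falls back to a real lock. An illustrative sketch against the detail-level name (the public alias suggested by the ITT string above, tbb::speculative_spin_mutex, is not declared in this header):

// Illustrative critical section guarded by the speculative mutex above.
static tbb::detail::d1::rtm_mutex g_lock;
static int g_counter = 0;

static void bump_counter() {
    // Elides the lock via RTM when the hardware allows it, otherwise spins for real.
    tbb::detail::d1::rtm_mutex::scoped_lock guard(g_lock);
    ++g_counter;
}   // releasing the guard commits the transaction or unlocks the mutex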

216
third_party/tbb/detail/_rtm_rw_mutex.hh vendored Normal file

@@ -0,0 +1,216 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__rtm_rw_mutex_H
#define __TBB_detail__rtm_rw_mutex_H
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/spin_rw_mutex.hh"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace r1 {
struct rtm_rw_mutex_impl;
}
namespace d1 {
constexpr std::size_t speculation_granularity = 64;
#if _MSC_VER && !defined(__INTEL_COMPILER)
// Suppress warning: structure was padded due to alignment specifier
#pragma warning (push)
#pragma warning (disable: 4324)
#endif
//! Fast, unfair, spinning speculation-enabled reader-writer lock with backoff and writer-preference
/** @ingroup synchronization */
class alignas(max_nfs_size) rtm_rw_mutex : private spin_rw_mutex {
friend struct r1::rtm_rw_mutex_impl;
private:
enum class rtm_type {
rtm_not_in_mutex,
rtm_transacting_reader,
rtm_transacting_writer,
rtm_real_reader,
rtm_real_writer
};
public:
//! Constructors
rtm_rw_mutex() noexcept : write_flag(false) {
create_itt_sync(this, "tbb::speculative_spin_rw_mutex", "");
}
//! Destructor
~rtm_rw_mutex() = default;
//! Represents acquisition of a mutex.
class scoped_lock {
friend struct r1::rtm_rw_mutex_impl;
public:
//! Construct lock that has not acquired a mutex.
/** Equivalent to zero-initialization of *this. */
constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) {}
//! Acquire lock on given mutex.
scoped_lock(rtm_rw_mutex& m, bool write = true) : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) {
acquire(m, write);
}
//! Release lock (if lock is held).
~scoped_lock() {
if(m_transaction_state != rtm_type::rtm_not_in_mutex) {
release();
}
}
//! No Copy
scoped_lock(const scoped_lock&) = delete;
scoped_lock& operator=(const scoped_lock&) = delete;
//! Acquire lock on given mutex.
inline void acquire(rtm_rw_mutex& m, bool write = true);
//! Try acquire lock on given mutex.
inline bool try_acquire(rtm_rw_mutex& m, bool write = true);
//! Release lock
inline void release();
//! Upgrade reader to become a writer.
/** Returns whether the upgrade happened without releasing and re-acquiring the lock */
inline bool upgrade_to_writer();
//! Downgrade writer to become a reader.
inline bool downgrade_to_reader();
inline bool is_writer() const;
private:
rtm_rw_mutex* m_mutex;
rtm_type m_transaction_state;
};
//! Mutex traits
static constexpr bool is_rw_mutex = true;
static constexpr bool is_recursive_mutex = false;
static constexpr bool is_fair_mutex = false;
private:
alignas(speculation_granularity) std::atomic<bool> write_flag;
};
#if _MSC_VER && !defined(__INTEL_COMPILER)
#pragma warning (pop) // 4324 warning
#endif
} // namespace d1
namespace r1 {
//! Internal acquire write lock.
// only_speculate == true if we're doing a try_lock, else false.
TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false);
//! Internal acquire read lock.
// only_speculate == true if we're doing a try_lock, else false.
TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false);
//! Internal upgrade reader to become a writer.
TBB_EXPORT bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock&);
//! Internal downgrade writer to become a reader.
TBB_EXPORT bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock&);
//! Internal try_acquire write lock.
TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&);
//! Internal try_acquire read lock.
TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&);
//! Internal release lock.
TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock&);
}
namespace d1 {
//! Acquire lock on given mutex.
void rtm_rw_mutex::scoped_lock::acquire(rtm_rw_mutex& m, bool write) {
__TBB_ASSERT(!m_mutex, "lock is already acquired");
if (write) {
r1::acquire_writer(m, *this);
} else {
r1::acquire_reader(m, *this);
}
}
//! Try acquire lock on given mutex.
bool rtm_rw_mutex::scoped_lock::try_acquire(rtm_rw_mutex& m, bool write) {
__TBB_ASSERT(!m_mutex, "lock is already acquired");
if (write) {
return r1::try_acquire_writer(m, *this);
} else {
return r1::try_acquire_reader(m, *this);
}
}
//! Release lock
void rtm_rw_mutex::scoped_lock::release() {
__TBB_ASSERT(m_mutex, "lock is not acquired");
__TBB_ASSERT(m_transaction_state != rtm_type::rtm_not_in_mutex, "lock is not acquired");
return r1::release(*this);
}
//! Upgrade reader to become a writer.
/** Returns whether the upgrade happened without releasing and re-acquiring the lock */
bool rtm_rw_mutex::scoped_lock::upgrade_to_writer() {
__TBB_ASSERT(m_mutex, "lock is not acquired");
if (m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer) {
return true; // Already a writer
}
return r1::upgrade(*this);
}
//! Downgrade writer to become a reader.
bool rtm_rw_mutex::scoped_lock::downgrade_to_reader() {
__TBB_ASSERT(m_mutex, "lock is not acquired");
if (m_transaction_state == rtm_type::rtm_transacting_reader || m_transaction_state == rtm_type::rtm_real_reader) {
return true; // Already a reader
}
return r1::downgrade(*this);
}
bool rtm_rw_mutex::scoped_lock::is_writer() const {
__TBB_ASSERT(m_mutex, "lock is not acquired");
return m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer;
}
#if TBB_USE_PROFILING_TOOLS
inline void set_name(rtm_rw_mutex& obj, const char* name) {
itt_set_sync_name(&obj, name);
}
#if (_WIN32||_WIN64)
inline void set_name(rtm_rw_mutex& obj, const wchar_t* name) {
itt_set_sync_name(&obj, name);
}
#endif // WIN
#else
inline void set_name(rtm_rw_mutex&, const char*) {}
#if (_WIN32||_WIN64)
inline void set_name(rtm_rw_mutex&, const wchar_t*) {}
#endif // WIN
#endif
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__rtm_rw_mutex_H
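For the reader-writer variant the common pattern is to read first and upgrade only when a write turns out to be necessary; upgrade_to_writer() reports whether the lock stayed held across the upgrade, so the reader must re-validate when it returns false. A hedged sketch with hypothetical shared data:

#include <map>
#include <string>

static tbb::detail::d1::rtm_rw_mutex g_table_lock;
static std::map<int, std::string> g_table;   // hypothetical data protected by the lock

static std::string lookup_or_insert(int key) {
    tbb::detail::d1::rtm_rw_mutex::scoped_lock guard(g_table_lock, /*write=*/false);
    auto it = g_table.find(key);
    if (it != g_table.end()) return it->second;
    // Upgrade to writer; if the lock was dropped and re-acquired, re-check first.
    if (!guard.upgrade_to_writer()) {
        it = g_table.find(key);
        if (it != g_table.end()) return it->second;
    }
    return g_table.emplace(key, "default").first->second;
}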

175
third_party/tbb/detail/_scoped_lock.hh vendored Normal file

@@ -0,0 +1,175 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail_scoped_lock_H
#define __TBB_detail_scoped_lock_H
namespace tbb {
namespace detail {
namespace d1 {
// unique_scoped_lock supposes that Mutex operations never throw
template <typename Mutex>
class unique_scoped_lock {
//! Points to currently held Mutex, or nullptr if no lock is held.
Mutex* m_mutex{};
public:
//! Construct without acquiring a Mutex.
constexpr unique_scoped_lock() noexcept : m_mutex(nullptr) {}
//! Construct and acquire lock on a Mutex.
unique_scoped_lock(Mutex& m) {
acquire(m);
}
//! No Copy
unique_scoped_lock(const unique_scoped_lock&) = delete;
unique_scoped_lock& operator=(const unique_scoped_lock&) = delete;
//! Acquire lock.
void acquire(Mutex& m) {
__TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired");
m_mutex = &m;
m.lock();
}
//! Try acquiring lock (non-blocking)
/** Return true if lock acquired; false otherwise. */
bool try_acquire(Mutex& m) {
__TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired");
bool succeed = m.try_lock();
if (succeed) {
m_mutex = &m;
}
return succeed;
}
//! Release lock
void release() {
__TBB_ASSERT(m_mutex, "release on Mutex::unique_scoped_lock that is not holding a lock");
m_mutex->unlock();
m_mutex = nullptr;
}
//! Destroy lock. If holding a lock, releases the lock first.
~unique_scoped_lock() {
if (m_mutex) {
release();
}
}
};
// rw_scoped_lock supposes that Mutex operations never throw
template <typename Mutex>
class rw_scoped_lock {
public:
//! Construct lock that has not acquired a mutex.
/** Equivalent to zero-initialization of *this. */
constexpr rw_scoped_lock() noexcept {}
//! Acquire lock on given mutex.
rw_scoped_lock(Mutex& m, bool write = true) {
acquire(m, write);
}
//! Release lock (if lock is held).
~rw_scoped_lock() {
if (m_mutex) {
release();
}
}
//! No Copy
rw_scoped_lock(const rw_scoped_lock&) = delete;
rw_scoped_lock& operator=(const rw_scoped_lock&) = delete;
//! Acquire lock on given mutex.
void acquire(Mutex& m, bool write = true) {
__TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired");
m_is_writer = write;
m_mutex = &m;
if (write) {
m_mutex->lock();
} else {
m_mutex->lock_shared();
}
}
//! Try acquire lock on given mutex.
bool try_acquire(Mutex& m, bool write = true) {
bool succeed = write ? m.try_lock() : m.try_lock_shared();
if (succeed) {
m_mutex = &m;
m_is_writer = write;
}
return succeed;
}
//! Release lock.
void release() {
__TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired");
Mutex* m = m_mutex;
m_mutex = nullptr;
if (m_is_writer) {
m->unlock();
} else {
m->unlock_shared();
}
}
//! Upgrade reader to become a writer.
/** Returns whether the upgrade happened without releasing and re-acquiring the lock */
bool upgrade_to_writer() {
__TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired");
if (m_is_writer) {
return true; // Already a writer
}
m_is_writer = true;
return m_mutex->upgrade();
}
//! Downgrade writer to become a reader.
bool downgrade_to_reader() {
__TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired");
if (m_is_writer) {
m_mutex->downgrade();
m_is_writer = false;
}
return true;
}
bool is_writer() const {
__TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired");
return m_is_writer;
}
protected:
//! The pointer to the current mutex that is held, or nullptr if no mutex is held.
Mutex* m_mutex {nullptr};
//! If mutex != nullptr, then is_writer is true if holding a writer lock, false if holding a reader lock.
/** Not defined if not holding a lock. */
bool m_is_writer {false};
};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_detail_scoped_lock_H
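These wrappers work with any mutex exposing the members they call: unique_scoped_lock needs lock/try_lock/unlock, while rw_scoped_lock additionally expects lock_shared, unlock_shared, upgrade and downgrade. A minimal sketch of the first one over std::mutex, which satisfies that interface:

#include <mutex>
#include <vector>

static std::mutex g_items_lock;     // any type with lock/try_lock/unlock works
static std::vector<int> g_items;

static void push_item(int v) {
    // Releases in the destructor like std::lock_guard, but also offers
    // acquire/try_acquire/release as explicit operations.
    tbb::detail::d1::unique_scoped_lock<std::mutex> guard(g_items_lock);
    g_items.push_back(v);
}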

567
third_party/tbb/detail/_segment_table.hh vendored Normal file

@@ -0,0 +1,567 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__segment_table_H
#define __TBB_detail__segment_table_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_allocator_traits.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/type_traits"
#include "third_party/libcxx/memory"
#include "third_party/libcxx/cstring"
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#pragma warning(push)
#pragma warning(disable: 4127) // warning C4127: conditional expression is constant
#endif
namespace tbb {
namespace detail {
namespace d1 {
template <typename T, typename Allocator, typename DerivedType, std::size_t PointersPerEmbeddedTable>
class segment_table {
public:
using value_type = T;
using segment_type = T*;
using atomic_segment = std::atomic<segment_type>;
using segment_table_type = atomic_segment*;
using size_type = std::size_t;
using segment_index_type = std::size_t;
using allocator_type = Allocator;
using allocator_traits_type = tbb::detail::allocator_traits<allocator_type>;
using segment_table_allocator_type = typename allocator_traits_type::template rebind_alloc<atomic_segment>;
protected:
using segment_table_allocator_traits = tbb::detail::allocator_traits<segment_table_allocator_type>;
using derived_type = DerivedType;
static constexpr size_type pointers_per_embedded_table = PointersPerEmbeddedTable;
static constexpr size_type pointers_per_long_table = sizeof(size_type) * 8;
public:
segment_table( const allocator_type& alloc = allocator_type() )
: my_segment_table_allocator(alloc), my_segment_table(nullptr)
, my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
{
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
zero_table(my_embedded_table, pointers_per_embedded_table);
}
segment_table( const segment_table& other )
: my_segment_table_allocator(segment_table_allocator_traits::
select_on_container_copy_construction(other.my_segment_table_allocator))
, my_segment_table(nullptr), my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
{
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
zero_table(my_embedded_table, pointers_per_embedded_table);
try_call( [&] {
internal_transfer(other, copy_segment_body_type{*this});
} ).on_exception( [&] {
clear();
});
}
segment_table( const segment_table& other, const allocator_type& alloc )
: my_segment_table_allocator(alloc), my_segment_table(nullptr)
, my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
{
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
zero_table(my_embedded_table, pointers_per_embedded_table);
try_call( [&] {
internal_transfer(other, copy_segment_body_type{*this});
} ).on_exception( [&] {
clear();
});
}
segment_table( segment_table&& other )
: my_segment_table_allocator(std::move(other.my_segment_table_allocator)), my_segment_table(nullptr)
, my_first_block{}, my_size{}, my_segment_table_allocation_failed{}
{
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
zero_table(my_embedded_table, pointers_per_embedded_table);
internal_move(std::move(other));
}
segment_table( segment_table&& other, const allocator_type& alloc )
: my_segment_table_allocator(alloc), my_segment_table(nullptr), my_first_block{}
, my_size{}, my_segment_table_allocation_failed{}
{
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
zero_table(my_embedded_table, pointers_per_embedded_table);
using is_equal_type = typename segment_table_allocator_traits::is_always_equal;
internal_move_construct_with_allocator(std::move(other), alloc, is_equal_type());
}
~segment_table() {
clear();
}
segment_table& operator=( const segment_table& other ) {
if (this != &other) {
copy_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator);
internal_transfer(other, copy_segment_body_type{*this});
}
return *this;
}
segment_table& operator=( segment_table&& other )
noexcept(derived_type::is_noexcept_assignment)
{
using pocma_type = typename segment_table_allocator_traits::propagate_on_container_move_assignment;
using is_equal_type = typename segment_table_allocator_traits::is_always_equal;
if (this != &other) {
move_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator);
internal_move_assign(std::move(other), tbb::detail::disjunction<is_equal_type, pocma_type>());
}
return *this;
}
void swap( segment_table& other )
noexcept(derived_type::is_noexcept_swap)
{
using is_equal_type = typename segment_table_allocator_traits::is_always_equal;
using pocs_type = typename segment_table_allocator_traits::propagate_on_container_swap;
if (this != &other) {
swap_allocators(my_segment_table_allocator, other.my_segment_table_allocator);
internal_swap(other, tbb::detail::disjunction<is_equal_type, pocs_type>());
}
}
segment_type get_segment( segment_index_type index ) const {
return get_table()[index] + segment_base(index);
}
value_type& operator[]( size_type index ) {
return internal_subscript<true>(index);
}
const value_type& operator[]( size_type index ) const {
return const_cast<segment_table*>(this)->internal_subscript<true>(index);
}
const segment_table_allocator_type& get_allocator() const {
return my_segment_table_allocator;
}
segment_table_allocator_type& get_allocator() {
return my_segment_table_allocator;
}
void enable_segment( segment_type& segment, segment_table_type table, segment_index_type seg_index, size_type index ) {
// Allocate new segment
segment_type new_segment = self()->create_segment(table, seg_index, index);
if (new_segment != nullptr) {
// Store (new_segment - segment_base) into the segment table to allow access to the table by index via
// my_segment_table[segment_index_of(index)][index]
segment_type disabled_segment = nullptr;
if (!table[seg_index].compare_exchange_strong(disabled_segment, new_segment - segment_base(seg_index))) {
// compare_exchange failed => some other thread has already enabled this segment
// Deallocate the memory
self()->deallocate_segment(new_segment, seg_index);
}
}
segment = table[seg_index].load(std::memory_order_acquire);
__TBB_ASSERT(segment != nullptr, "If create_segment returned nullptr, the element should be stored in the table");
}
void delete_segment( segment_index_type seg_index ) {
segment_type segment_to_delete = self()->nullify_segment(get_table(), seg_index);
if (segment_to_delete == segment_allocation_failure_tag) {
return;
}
segment_to_delete += segment_base(seg_index);
// Deallocate the segment
self()->destroy_segment(segment_to_delete, seg_index);
}
size_type number_of_segments( segment_table_type table ) const {
// If the given table is the embedded table, return the number of embedded segments;
// otherwise, return the maximum number of segments
return table == my_embedded_table ? pointers_per_embedded_table : pointers_per_long_table;
}
size_type capacity() const noexcept {
segment_table_type table = get_table();
size_type num_segments = number_of_segments(table);
for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) {
// Check if the pointer is valid (allocated)
if (table[seg_index].load(std::memory_order_relaxed) <= segment_allocation_failure_tag) {
return segment_base(seg_index);
}
}
return segment_base(num_segments);
}
size_type find_last_allocated_segment( segment_table_type table ) const noexcept {
size_type end = 0;
size_type num_segments = number_of_segments(table);
for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) {
// Check if the pointer is valid (allocated)
if (table[seg_index].load(std::memory_order_relaxed) > segment_allocation_failure_tag) {
end = seg_index + 1;
}
}
return end;
}
void reserve( size_type n ) {
if (n > allocator_traits_type::max_size(my_segment_table_allocator)) {
throw_exception(exception_id::reservation_length_error);
}
size_type size = my_size.load(std::memory_order_relaxed);
segment_index_type start_seg_idx = size == 0 ? 0 : segment_index_of(size - 1) + 1;
for (segment_index_type seg_idx = start_seg_idx; segment_base(seg_idx) < n; ++seg_idx) {
size_type first_index = segment_base(seg_idx);
internal_subscript<true>(first_index);
}
}
void clear() {
clear_segments();
clear_table();
my_size.store(0, std::memory_order_relaxed);
my_first_block.store(0, std::memory_order_relaxed);
}
void clear_segments() {
segment_table_type current_segment_table = get_table();
for (size_type i = number_of_segments(current_segment_table); i != 0; --i) {
if (current_segment_table[i - 1].load(std::memory_order_relaxed) != nullptr) {
// If the segment was enabled - disable and deallocate it
delete_segment(i - 1);
}
}
}
void clear_table() {
segment_table_type current_segment_table = get_table();
if (current_segment_table != my_embedded_table) {
// If the active table is not the embedded one - deallocate the active table
for (size_type i = 0; i != pointers_per_long_table; ++i) {
segment_table_allocator_traits::destroy(my_segment_table_allocator, &current_segment_table[i]);
}
segment_table_allocator_traits::deallocate(my_segment_table_allocator, current_segment_table, pointers_per_long_table);
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
zero_table(my_embedded_table, pointers_per_embedded_table);
}
}
void extend_table_if_necessary(segment_table_type& table, size_type start_index, size_type end_index) {
// extend_segment_table if an active table is an embedded table
// and the requested index is not in the embedded table
if (table == my_embedded_table && end_index > embedded_table_size) {
if (start_index <= embedded_table_size) {
try_call([&] {
table = self()->allocate_long_table(my_embedded_table, start_index);
// It is possible that the table was extended by the thread that allocated first_block.
// In this case it is necessary to re-read the current table.
if (table) {
my_segment_table.store(table, std::memory_order_release);
} else {
table = my_segment_table.load(std::memory_order_acquire);
}
}).on_exception([&] {
my_segment_table_allocation_failed.store(true, std::memory_order_relaxed);
});
} else {
atomic_backoff backoff;
do {
if (my_segment_table_allocation_failed.load(std::memory_order_relaxed)) {
throw_exception(exception_id::bad_alloc);
}
backoff.pause();
table = my_segment_table.load(std::memory_order_acquire);
} while (table == my_embedded_table);
}
}
}
// Return the segment where index is stored
static constexpr segment_index_type segment_index_of( size_type index ) {
return size_type(tbb::detail::log2(uintptr_t(index|1)));
}
// Needed to calculate the offset in segment
static constexpr size_type segment_base( size_type index ) {
return size_type(1) << index & ~size_type(1);
}
// Return size of the segment
static constexpr size_type segment_size( size_type index ) {
return index == 0 ? 2 : size_type(1) << index;
}
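// Worked example (editorial note, not in the original sources): segments have
// sizes 2, 2, 4, 8, 16, ..., so segment s covers the half-open index range
// [segment_base(s), segment_base(s) + segment_size(s)). For index 5:
// segment_index_of(5) = log2(5|1) = 2, segment_base(2) = 4, segment_size(2) = 4,
// so element 5 lives at offset 1 inside segment 2. Because enable_segment()
// stores (segment - segment_base) in the table, the lookup is simply
// get_table()[segment_index_of(index)][index].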
private:
derived_type* self() {
return static_cast<derived_type*>(this);
}
struct copy_segment_body_type {
void operator()( segment_index_type index, segment_type from, segment_type to ) const {
my_instance.self()->copy_segment(index, from, to);
}
segment_table& my_instance;
};
struct move_segment_body_type {
void operator()( segment_index_type index, segment_type from, segment_type to ) const {
my_instance.self()->move_segment(index, from, to);
}
segment_table& my_instance;
};
// Transfers all segments from the other table
template <typename TransferBody>
void internal_transfer( const segment_table& other, TransferBody transfer_segment ) {
static_cast<derived_type*>(this)->destroy_elements();
assign_first_block_if_necessary(other.my_first_block.load(std::memory_order_relaxed));
my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
segment_table_type other_table = other.get_table();
size_type end_segment_size = segment_size(other.find_last_allocated_segment(other_table));
// If an exception occurred in other, then the size may be greater than the size of the end segment.
size_type other_size = end_segment_size < other.my_size.load(std::memory_order_relaxed) ?
other.my_size.load(std::memory_order_relaxed) : end_segment_size;
other_size = my_segment_table_allocation_failed ? embedded_table_size : other_size;
for (segment_index_type i = 0; segment_base(i) < other_size; ++i) {
// If the segment in other table is enabled - transfer it
if (other_table[i].load(std::memory_order_relaxed) == segment_allocation_failure_tag)
{
my_size = segment_base(i);
break;
} else if (other_table[i].load(std::memory_order_relaxed) != nullptr) {
internal_subscript<true>(segment_base(i));
transfer_segment(i, other.get_table()[i].load(std::memory_order_relaxed) + segment_base(i),
get_table()[i].load(std::memory_order_relaxed) + segment_base(i));
}
}
}
// Moves the other segment table
// Only equal allocators are allowed
void internal_move( segment_table&& other ) {
// NOTE: allocators should be equal
clear();
my_first_block.store(other.my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed);
my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
// If an active table in other is embedded - restore all of the embedded segments
if (other.get_table() == other.my_embedded_table) {
for ( size_type i = 0; i != pointers_per_embedded_table; ++i ) {
segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed);
my_embedded_table[i].store(other_segment, std::memory_order_relaxed);
other.my_embedded_table[i].store(nullptr, std::memory_order_relaxed);
}
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
} else {
my_segment_table.store(other.my_segment_table, std::memory_order_relaxed);
other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed);
zero_table(other.my_embedded_table, pointers_per_embedded_table);
}
other.my_size.store(0, std::memory_order_relaxed);
}
// Move construct the segment table with the allocator object
// if any instances of allocator_type are always equal
void internal_move_construct_with_allocator( segment_table&& other, const allocator_type&,
/*is_always_equal = */ std::true_type ) {
internal_move(std::move(other));
}
// Move construct the segment table with the allocator object
// if instances of allocator_type are not always equal
void internal_move_construct_with_allocator( segment_table&& other, const allocator_type& alloc,
/*is_always_equal = */ std::false_type ) {
if (other.my_segment_table_allocator == alloc) {
// If allocators are equal - restore pointers
internal_move(std::move(other));
} else {
// If allocators are not equal - perform per element move with reallocation
try_call( [&] {
internal_transfer(other, move_segment_body_type{*this});
} ).on_exception( [&] {
clear();
});
}
}
// Move-assigns from the other segment table if any instances of allocator_type are always equal
// or propagate_on_container_move_assignment is true
void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::true_type ) {
internal_move(std::move(other));
}
// Move-assigns from the other segment table if any instances of allocator_type are not always equal
// and propagate_on_container_move_assignment is false
void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::false_type ) {
if (my_segment_table_allocator == other.my_segment_table_allocator) {
// If allocators are equal - restore pointers
internal_move(std::move(other));
} else {
// If allocators are not equal - perform per element move with reallocation
internal_transfer(other, move_segment_body_type{*this});
}
}
// Swaps two segment tables if any instances of allocator_type are always equal
// or propagate_on_container_swap is true
void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::true_type ) {
internal_swap_fields(other);
}
// Swaps two segment tables if any instances of allocator_type are not always equal
// and propagate_on_container_swap is false
// According to the C++ standard, swapping of two containers with unequal allocators
// is an undefined behavior scenario
void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::false_type ) {
__TBB_ASSERT(my_segment_table_allocator == other.my_segment_table_allocator,
"Swapping with unequal allocators is not allowed");
internal_swap_fields(other);
}
void internal_swap_fields( segment_table& other ) {
// If an active table in either *this segment table or other is an embedded one - swaps the embedded tables
if (get_table() == my_embedded_table ||
other.get_table() == other.my_embedded_table) {
for (size_type i = 0; i != pointers_per_embedded_table; ++i) {
segment_type current_segment = my_embedded_table[i].load(std::memory_order_relaxed);
segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed);
my_embedded_table[i].store(other_segment, std::memory_order_relaxed);
other.my_embedded_table[i].store(current_segment, std::memory_order_relaxed);
}
}
segment_table_type current_segment_table = get_table();
segment_table_type other_segment_table = other.get_table();
// If this table's active table was the embedded one,
// point other's active table at other's own embedded table (its contents were swapped above)
if (current_segment_table == my_embedded_table) {
other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed);
} else {
// Otherwise - store it to the active segment table
other.my_segment_table.store(current_segment_table, std::memory_order_relaxed);
}
// If other's active table was its embedded one,
// point this table's active table at this table's own embedded table
if (other_segment_table == other.my_embedded_table) {
my_segment_table.store(my_embedded_table, std::memory_order_relaxed);
} else {
// Otherwise - store it to the active segment table in other
my_segment_table.store(other_segment_table, std::memory_order_relaxed);
}
auto first_block = other.my_first_block.load(std::memory_order_relaxed);
other.my_first_block.store(my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed);
my_first_block.store(first_block, std::memory_order_relaxed);
auto size = other.my_size.load(std::memory_order_relaxed);
other.my_size.store(my_size.load(std::memory_order_relaxed), std::memory_order_relaxed);
my_size.store(size, std::memory_order_relaxed);
}
protected:
// A tag value indicating that an exception was thrown during segment allocation
const segment_type segment_allocation_failure_tag = reinterpret_cast<segment_type>(1);
static constexpr size_type embedded_table_size = segment_size(pointers_per_embedded_table);
template <bool allow_out_of_range_access>
value_type& internal_subscript( size_type index ) {
segment_index_type seg_index = segment_index_of(index);
segment_table_type table = my_segment_table.load(std::memory_order_acquire);
segment_type segment = nullptr;
if (allow_out_of_range_access) {
if (derived_type::allow_table_extending) {
extend_table_if_necessary(table, index, index + 1);
}
segment = table[seg_index].load(std::memory_order_acquire);
// If the required segment is disabled - enable it
if (segment == nullptr) {
enable_segment(segment, table, seg_index, index);
}
// Check if an exception was thrown during segment allocation
if (segment == segment_allocation_failure_tag) {
throw_exception(exception_id::bad_alloc);
}
} else {
segment = table[seg_index].load(std::memory_order_acquire);
}
__TBB_ASSERT(segment != nullptr, nullptr);
return segment[index];
}
void assign_first_block_if_necessary(segment_index_type index) {
size_type zero = 0;
if (this->my_first_block.load(std::memory_order_relaxed) == zero) {
this->my_first_block.compare_exchange_strong(zero, index);
}
}
void zero_table( segment_table_type table, size_type count ) {
for (size_type i = 0; i != count; ++i) {
table[i].store(nullptr, std::memory_order_relaxed);
}
}
segment_table_type get_table() const {
return my_segment_table.load(std::memory_order_acquire);
}
segment_table_allocator_type my_segment_table_allocator;
std::atomic<segment_table_type> my_segment_table;
atomic_segment my_embedded_table[pointers_per_embedded_table];
// Number of segments in first block
std::atomic<size_type> my_first_block;
// Number of elements in table
std::atomic<size_type> my_size;
// Flag indicating that extending the segment table failed
std::atomic<bool> my_segment_table_allocation_failed;
}; // class segment_table
} // namespace d1
} // namespace detail
} // namespace tbb
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#pragma warning(pop) // warning 4127 is back
#endif
#endif // __TBB_detail__segment_table_H
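The publication step in enable_segment() above is a classic allocate-then-compare-exchange race: every contending thread allocates a candidate segment, exactly one CAS wins, and the losers free their candidates. A minimal self-contained sketch of that protocol, with plain new/delete standing in for create_segment()/deallocate_segment() and all toy names being illustrative:
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>
// Toy publication slot: the real code stores (segment - segment_base) per table entry.
static std::atomic<int*> slot{nullptr};
static void enable_toy_segment() {
    int* candidate = new int[4]();       // stands in for create_segment()
    int* expected = nullptr;
    if (!slot.compare_exchange_strong(expected, candidate)) {
        delete[] candidate;              // lost the race: stands in for deallocate_segment()
    }
}
int main() {
    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) workers.emplace_back(enable_toy_segment);
    for (auto& w : workers) w.join();
    std::printf("published segment: %p\n", static_cast<void*>(slot.load()));
    delete[] slot.load();
    return 0;
}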

View file

@@ -0,0 +1,109 @@
// clang-format off
/*
Copyright (c) 2020-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__small_object_pool_H
#define __TBB__small_object_pool_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/profiling.hh"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace d1 {
class small_object_pool {
protected:
small_object_pool() = default;
};
struct execution_data;
}
namespace r1 {
TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes,
const d1::execution_data& ed);
TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes);
TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes,
const d1::execution_data& ed);
TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes);
}
namespace d1 {
class small_object_allocator {
public:
template <typename Type, typename... Args>
Type* new_object(execution_data& ed, Args&&... args) {
void* allocated_object = r1::allocate(m_pool, sizeof(Type), ed);
auto constructed_object = new(allocated_object) Type(std::forward<Args>(args)...);
return constructed_object;
}
template <typename Type, typename... Args>
Type* new_object(Args&&... args) {
void* allocated_object = r1::allocate(m_pool, sizeof(Type));
auto constructed_object = new(allocated_object) Type(std::forward<Args>(args)...);
return constructed_object;
}
template <typename Type>
void delete_object(Type* object, const execution_data& ed) {
// Copy this since it can be a member of the passed object and
// unintentionally destroyed when Type destructor is called below
small_object_allocator alloc = *this;
object->~Type();
alloc.deallocate(object, ed);
}
template <typename Type>
void delete_object(Type* object) {
// Copy this since it can be a member of the passed object and
// unintentionally destroyed when Type destructor is called below
small_object_allocator alloc = *this;
object->~Type();
alloc.deallocate(object);
}
template <typename Type>
void deallocate(Type* ptr, const execution_data& ed) {
call_itt_task_notify(destroy, ptr);
__TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call");
r1::deallocate(*m_pool, ptr, sizeof(Type), ed);
}
template <typename Type>
void deallocate(Type* ptr) {
call_itt_task_notify(destroy, ptr);
__TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call");
r1::deallocate(*m_pool, ptr, sizeof(Type));
}
private:
small_object_pool* m_pool{};
};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif /* __TBB__small_object_pool_H */
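The delete_object() methods above copy the allocator before invoking the destructor because the allocator may itself be a member of the object being destroyed. A minimal sketch of that idiom under simplified assumptions: malloc/free replace the r1 pool entry points, and the toy_* names are illustrative:
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>
struct toy_pool_allocator {
    void* allocate(std::size_t n) { return std::malloc(n); }
    void deallocate(void* p) { std::free(p); }
    template <typename T>
    void delete_object(T* obj) {
        toy_pool_allocator copy = *this;  // copy first: *this may live inside *obj
        obj->~T();
        copy.deallocate(obj);
    }
};
struct node {
    toy_pool_allocator alloc;  // the allocator is a member of the object it manages
    int payload;
};
int main() {
    toy_pool_allocator a;
    void* raw = a.allocate(sizeof(node));
    node* n = new (raw) node{a, 42};
    std::printf("payload = %d\n", n->payload);
    n->alloc.delete_object(n);  // safe because delete_object copies the allocator
    return 0;
}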

View file

@@ -0,0 +1,79 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
TBB_STRING_RESOURCE(ALGORITHM, "tbb_algorithm")
TBB_STRING_RESOURCE(PARALLEL_FOR, "tbb_parallel_for")
TBB_STRING_RESOURCE(PARALLEL_FOR_EACH, "tbb_parallel_for_each")
TBB_STRING_RESOURCE(PARALLEL_INVOKE, "tbb_parallel_invoke")
TBB_STRING_RESOURCE(PARALLEL_REDUCE, "tbb_parallel_reduce")
TBB_STRING_RESOURCE(PARALLEL_SCAN, "tbb_parallel_scan")
TBB_STRING_RESOURCE(PARALLEL_SORT, "tbb_parallel_sort")
TBB_STRING_RESOURCE(PARALLEL_PIPELINE, "tbb_parallel_pipeline")
TBB_STRING_RESOURCE(CUSTOM_CTX, "tbb_custom")
TBB_STRING_RESOURCE(FLOW_NULL, "null")
TBB_STRING_RESOURCE(FLOW_BROADCAST_NODE, "broadcast_node")
TBB_STRING_RESOURCE(FLOW_BUFFER_NODE, "buffer_node")
TBB_STRING_RESOURCE(FLOW_CONTINUE_NODE, "continue_node")
TBB_STRING_RESOURCE(FLOW_FUNCTION_NODE, "function_node")
TBB_STRING_RESOURCE(FLOW_JOIN_NODE_QUEUEING, "join_node (queueing)")
TBB_STRING_RESOURCE(FLOW_JOIN_NODE_RESERVING, "join_node (reserving)")
TBB_STRING_RESOURCE(FLOW_JOIN_NODE_TAG_MATCHING, "join_node (tag_matching)")
TBB_STRING_RESOURCE(FLOW_LIMITER_NODE, "limiter_node")
TBB_STRING_RESOURCE(FLOW_MULTIFUNCTION_NODE, "multifunction_node")
TBB_STRING_RESOURCE(FLOW_OVERWRITE_NODE, "overwrite_node")
TBB_STRING_RESOURCE(FLOW_PRIORITY_QUEUE_NODE, "priority_queue_node")
TBB_STRING_RESOURCE(FLOW_QUEUE_NODE, "queue_node")
TBB_STRING_RESOURCE(FLOW_SEQUENCER_NODE, "sequencer_node")
TBB_STRING_RESOURCE(FLOW_INPUT_NODE, "input_node")
TBB_STRING_RESOURCE(FLOW_SPLIT_NODE, "split_node")
TBB_STRING_RESOURCE(FLOW_WRITE_ONCE_NODE, "write_once_node")
TBB_STRING_RESOURCE(FLOW_INDEXER_NODE, "indexer_node")
TBB_STRING_RESOURCE(FLOW_COMPOSITE_NODE, "composite_node")
TBB_STRING_RESOURCE(FLOW_ASYNC_NODE, "async_node")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT, "input_port")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_0, "input_port_0")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_1, "input_port_1")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_2, "input_port_2")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_3, "input_port_3")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_4, "input_port_4")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_5, "input_port_5")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_6, "input_port_6")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_7, "input_port_7")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_8, "input_port_8")
TBB_STRING_RESOURCE(FLOW_INPUT_PORT_9, "input_port_9")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT, "output_port")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_0, "output_port_0")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_1, "output_port_1")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_2, "output_port_2")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_3, "output_port_3")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_4, "output_port_4")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_5, "output_port_5")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_6, "output_port_6")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_7, "output_port_7")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_8, "output_port_8")
TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_9, "output_port_9")
TBB_STRING_RESOURCE(FLOW_OBJECT_NAME, "object_name")
TBB_STRING_RESOURCE(FLOW_BODY, "body")
TBB_STRING_RESOURCE(FLOW_GRAPH, "graph")
TBB_STRING_RESOURCE(FLOW_NODE, "node")
TBB_STRING_RESOURCE(FLOW_TASKS, "tbb_flow_graph")
TBB_STRING_RESOURCE(USER_EVENT, "user_event")
#if __TBB_FLOW_TRACE_CODEPTR
TBB_STRING_RESOURCE(CODE_ADDRESS, "code_address")
#endif
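The list above is an X-macro table: a consumer defines TBB_STRING_RESOURCE before including the file and gets one expansion per entry, for example to build an array of resource names. A minimal sketch with two of the entries repeated inline (the vendored include path is omitted here, so nothing is assumed about it):
#include <cstdio>
#define TBB_STRING_RESOURCE(id, str) str,
static const char* const resource_names[] = {
    TBB_STRING_RESOURCE(ALGORITHM, "tbb_algorithm")
    TBB_STRING_RESOURCE(PARALLEL_FOR, "tbb_parallel_for")
};
#undef TBB_STRING_RESOURCE
int main() {
    std::printf("%s\n", resource_names[1]);  // prints "tbb_parallel_for"
    return 0;
}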

233
third_party/tbb/detail/_task.hh vendored Normal file
View file

@@ -0,0 +1,233 @@
// clang-format off
/*
Copyright (c) 2020-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB__task_H
#define __TBB__task_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/tbb/detail/_small_object_pool.hh"
#include "third_party/tbb/profiling.hh"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/utility"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/mutex"
namespace tbb {
namespace detail {
namespace d1 {
using slot_id = unsigned short;
constexpr slot_id no_slot = slot_id(~0);
constexpr slot_id any_slot = slot_id(~1);
class task;
class wait_context;
class task_group_context;
struct execution_data;
}
namespace r1 {
//! Task spawn/wait entry points
TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx);
TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id);
TBB_EXPORT void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx);
TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx);
TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*);
TBB_EXPORT d1::task_group_context* __TBB_EXPORTED_FUNC current_context();
// Do not place under __TBB_RESUMABLE_TASKS. It is a stub for unsupported platforms.
struct suspend_point_type;
using suspend_callback_type = void(*)(void*, suspend_point_type*);
//! The resumable tasks entry points
TBB_EXPORT void __TBB_EXPORTED_FUNC suspend(suspend_callback_type suspend_callback, void* user_callback);
TBB_EXPORT void __TBB_EXPORTED_FUNC resume(suspend_point_type* tag);
TBB_EXPORT suspend_point_type* __TBB_EXPORTED_FUNC current_suspend_point();
TBB_EXPORT void __TBB_EXPORTED_FUNC notify_waiters(std::uintptr_t wait_ctx_addr);
class thread_data;
class task_dispatcher;
class external_waiter;
struct task_accessor;
struct task_arena_impl;
} // namespace r1
namespace d1 {
class task_arena;
using suspend_point = r1::suspend_point_type*;
#if __TBB_RESUMABLE_TASKS
template <typename F>
static void suspend_callback(void* user_callback, suspend_point sp) {
// Copy user function to a new stack after the context switch to avoid a race when the previous
// suspend point is resumed while the user_callback is being called.
F user_callback_copy = *static_cast<F*>(user_callback);
user_callback_copy(sp);
}
template <typename F>
void suspend(F f) {
r1::suspend(&suspend_callback<F>, &f);
}
inline void resume(suspend_point tag) {
r1::resume(tag);
}
#endif /* __TBB_RESUMABLE_TASKS */
// TODO: align wait_context on a cache line
class wait_context {
static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1);
std::uint64_t m_version_and_traits{1};
std::atomic<std::uint64_t> m_ref_count{};
void add_reference(std::int64_t delta) {
call_itt_task_notify(releasing, this);
std::uint64_t r = m_ref_count.fetch_add(static_cast<std::uint64_t>(delta)) + static_cast<std::uint64_t>(delta);
__TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected");
if (!r) {
// Some external or coroutine waiters may be sleeping in the wait list.
// Notify them that the work is done.
std::uintptr_t wait_ctx_addr = std::uintptr_t(this);
r1::notify_waiters(wait_ctx_addr);
}
}
bool continue_execution() const {
std::uint64_t r = m_ref_count.load(std::memory_order_acquire);
__TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected");
return r > 0;
}
friend class r1::thread_data;
friend class r1::task_dispatcher;
friend class r1::external_waiter;
friend class task_group;
friend class task_group_base;
friend struct r1::task_arena_impl;
friend struct r1::suspend_point_type;
public:
// Although the internal reference count is uint64_t, we limit the user interface to uint32_t
// to preserve a part of the internal reference count for special needs.
wait_context(std::uint32_t ref_count) : m_ref_count{ref_count} { suppress_unused_warning(m_version_and_traits); }
wait_context(const wait_context&) = delete;
~wait_context() {
__TBB_ASSERT(!continue_execution(), nullptr);
}
void reserve(std::uint32_t delta = 1) {
add_reference(delta);
}
void release(std::uint32_t delta = 1) {
add_reference(-std::int64_t(delta));
}
};
struct execution_data {
task_group_context* context{};
slot_id original_slot{};
slot_id affinity_slot{};
};
inline task_group_context* context(const execution_data& ed) {
return ed.context;
}
inline slot_id original_slot(const execution_data& ed) {
return ed.original_slot;
}
inline slot_id affinity_slot(const execution_data& ed) {
return ed.affinity_slot;
}
inline slot_id execution_slot(const execution_data& ed) {
return r1::execution_slot(&ed);
}
inline bool is_same_affinity(const execution_data& ed) {
return affinity_slot(ed) == no_slot || affinity_slot(ed) == execution_slot(ed);
}
inline bool is_stolen(const execution_data& ed) {
return original_slot(ed) != execution_slot(ed);
}
inline void spawn(task& t, task_group_context& ctx) {
call_itt_task_notify(releasing, &t);
r1::spawn(t, ctx);
}
inline void spawn(task& t, task_group_context& ctx, slot_id id) {
call_itt_task_notify(releasing, &t);
r1::spawn(t, ctx, id);
}
inline void execute_and_wait(task& t, task_group_context& t_ctx, wait_context& wait_ctx, task_group_context& w_ctx) {
r1::execute_and_wait(t, t_ctx, wait_ctx, w_ctx);
call_itt_task_notify(acquired, &wait_ctx);
call_itt_task_notify(destroy, &wait_ctx);
}
inline void wait(wait_context& wait_ctx, task_group_context& ctx) {
r1::wait(wait_ctx, ctx);
call_itt_task_notify(acquired, &wait_ctx);
call_itt_task_notify(destroy, &wait_ctx);
}
using r1::current_context;
class task_traits {
std::uint64_t m_version_and_traits{};
friend struct r1::task_accessor;
};
//! Alignment for a task object
static constexpr std::size_t task_alignment = 64;
//! Base class for user-defined tasks.
/** @ingroup task_scheduling */
class alignas(task_alignment) task : public task_traits {
protected:
virtual ~task() = default;
public:
virtual task* execute(execution_data&) = 0;
virtual task* cancel(execution_data&) = 0;
private:
std::uint64_t m_reserved[6]{};
friend struct r1::task_accessor;
};
static_assert(sizeof(task) == task_alignment, "task size is broken");
} // namespace d1
} // namespace detail
} // namespace tbb
#endif /* __TBB__task_H */
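wait_context above is essentially a reference count that waiters observe: work is registered with reserve(), finished with release(), and execution may continue past the wait once the count returns to zero (the real implementation additionally wakes sleeping waiters through the r1 runtime). A simplified stand-in, not the TBB class, with illustrative toy_* names:
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
struct toy_wait_context {
    std::atomic<std::uint64_t> ref_count;
    explicit toy_wait_context(std::uint32_t n) : ref_count(n) {}
    void reserve(std::uint32_t d = 1) { ref_count.fetch_add(d, std::memory_order_relaxed); }
    void release(std::uint32_t d = 1) { ref_count.fetch_sub(d, std::memory_order_release); }
    bool continue_execution() const { return ref_count.load(std::memory_order_acquire) > 0; }
};
int main() {
    toy_wait_context wc(1);  // one outstanding piece of work, like wait_context{1}
    std::thread worker([&] { std::puts("work done"); wc.release(); });
    while (wc.continue_execution()) { /* busy-wait; the real scheduler blocks or steals work */ }
    worker.join();
    return 0;
}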

123
third_party/tbb/detail/_task_handle.hh vendored Normal file
View file

@@ -0,0 +1,123 @@
// clang-format off
/*
Copyright (c) 2020-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_task_handle_H
#define __TBB_task_handle_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_task.hh"
#include "third_party/tbb/detail/_small_object_pool.hh"
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/libcxx/memory"
namespace tbb {
namespace detail {
namespace d1 { class task_group_context; class wait_context; struct execution_data; }
namespace d2 {
class task_handle;
class task_handle_task : public d1::task {
std::uint64_t m_version_and_traits{};
d1::wait_context& m_wait_ctx;
d1::task_group_context& m_ctx;
d1::small_object_allocator m_allocator;
public:
void finalize(const d1::execution_data* ed = nullptr) {
if (ed) {
m_allocator.delete_object(this, *ed);
} else {
m_allocator.delete_object(this);
}
}
task_handle_task(d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc)
: m_wait_ctx(wo)
, m_ctx(ctx)
, m_allocator(alloc) {
suppress_unused_warning(m_version_and_traits);
}
~task_handle_task() override {
m_wait_ctx.release();
}
d1::task_group_context& ctx() const { return m_ctx; }
};
class task_handle {
struct task_handle_task_finalizer_t{
void operator()(task_handle_task* p){ p->finalize(); }
};
using handle_impl_t = std::unique_ptr<task_handle_task, task_handle_task_finalizer_t>;
handle_impl_t m_handle = {nullptr};
public:
task_handle() = default;
task_handle(task_handle&&) = default;
task_handle& operator=(task_handle&&) = default;
explicit operator bool() const noexcept { return static_cast<bool>(m_handle); }
friend bool operator==(task_handle const& th, std::nullptr_t) noexcept;
friend bool operator==(std::nullptr_t, task_handle const& th) noexcept;
friend bool operator!=(task_handle const& th, std::nullptr_t) noexcept;
friend bool operator!=(std::nullptr_t, task_handle const& th) noexcept;
private:
friend struct task_handle_accessor;
task_handle(task_handle_task* t) : m_handle {t}{};
d1::task* release() {
return m_handle.release();
}
};
struct task_handle_accessor {
static task_handle construct(task_handle_task* t) { return {t}; }
static d1::task* release(task_handle& th) { return th.release(); }
static d1::task_group_context& ctx_of(task_handle& th) {
__TBB_ASSERT(th.m_handle, "ctx_of does not expect empty task_handle.");
return th.m_handle->ctx();
}
};
inline bool operator==(task_handle const& th, std::nullptr_t) noexcept {
return th.m_handle == nullptr;
}
inline bool operator==(std::nullptr_t, task_handle const& th) noexcept {
return th.m_handle == nullptr;
}
inline bool operator!=(task_handle const& th, std::nullptr_t) noexcept {
return th.m_handle != nullptr;
}
inline bool operator!=(std::nullptr_t, task_handle const& th) noexcept {
return th.m_handle != nullptr;
}
} // namespace d2
} // namespace detail
} // namespace tbb
#endif /* __TBB_task_handle_H */
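task_handle above owns its task through a std::unique_ptr whose deleter routes destruction to finalize() rather than a plain delete. A generic sketch of the same pattern, not TBB-specific; here finalize() is followed by delete because there is no small-object pool to return the task to:
#include <cstdio>
#include <memory>
struct toy_task {
    void finalize() { std::puts("finalized"); }  // the real finalize() returns the task to its pool
};
struct toy_finalizer {
    void operator()(toy_task* p) const { p->finalize(); delete p; }
};
using toy_handle = std::unique_ptr<toy_task, toy_finalizer>;
int main() {
    toy_handle h(new toy_task);
    return 0;  // h's destructor runs toy_finalizer instead of a bare delete
}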

View file

@@ -0,0 +1,404 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__template_helpers_H
#define __TBB_detail__template_helpers_H
#include "third_party/tbb/detail/_utils.hh"
#include "third_party/tbb/detail/_config.hh"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/utility"
#include "third_party/libcxx/type_traits"
#include "third_party/libcxx/memory"
#include "third_party/libcxx/iterator"
namespace tbb {
namespace detail {
inline namespace d0 {
// An internal implementation of void_t, which can be used in SFINAE contexts
template <typename...>
struct void_impl {
using type = void;
}; // struct void_impl
template <typename... Args>
using void_t = typename void_impl<Args...>::type;
// Generic SFINAE helper for expression checks, based on the idea demonstrated in ISO C++ paper n4502
template <typename T, typename, template <typename> class... Checks>
struct supports_impl {
using type = std::false_type;
};
template <typename T, template <typename> class... Checks>
struct supports_impl<T, void_t<Checks<T>...>, Checks...> {
using type = std::true_type;
};
template <typename T, template <typename> class... Checks>
using supports = typename supports_impl<T, void, Checks...>::type;
//! A template to select either a 32-bit or a 64-bit constant at compile time, depending on the machine word size.
template <unsigned u, unsigned long long ull >
struct select_size_t_constant {
// Explicit cast is needed to avoid compiler warnings about possible truncation.
// The value of the right size, which is selected by ?:, is anyway not truncated or promoted.
static const std::size_t value = static_cast<std::size_t>((sizeof(std::size_t)==sizeof(u)) ? u : ull);
};
// TODO: do we really need it?
//! Cast between unrelated pointer types.
/** This method should be used sparingly as a last resort for dealing with
situations that inherently break strict ISO C++ aliasing rules. */
// T is a pointer type because it will be explicitly provided by the programmer as a template argument;
// U is a referent type to enable the compiler to check that "ptr" is a pointer, deducing U in the process.
template<typename T, typename U>
inline T punned_cast( U* ptr ) {
std::uintptr_t x = reinterpret_cast<std::uintptr_t>(ptr);
return reinterpret_cast<T>(x);
}
template<class T, size_t S, size_t R>
struct padded_base : T {
char pad[S - R];
};
template<class T, size_t S> struct padded_base<T, S, 0> : T {};
//! Pads type T to fill out to a multiple of cache line size.
template<class T, size_t S = max_nfs_size>
struct padded : padded_base<T, S, sizeof(T) % S> {};
#if __TBB_CPP14_INTEGER_SEQUENCE_PRESENT
using std::index_sequence;
using std::make_index_sequence;
#else
template<std::size_t... S> class index_sequence {};
template<std::size_t N, std::size_t... S>
struct make_index_sequence_impl : make_index_sequence_impl < N - 1, N - 1, S... > {};
template<std::size_t... S>
struct make_index_sequence_impl <0, S...> {
using type = index_sequence<S...>;
};
template<std::size_t N>
using make_index_sequence = typename make_index_sequence_impl<N>::type;
#endif /* __TBB_CPP14_INTEGER_SEQUENCE_PRESENT */
#if __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT
using std::conjunction;
using std::disjunction;
#else // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT
template <typename...>
struct conjunction : std::true_type {};
template <typename First, typename... Args>
struct conjunction<First, Args...>
: std::conditional<bool(First::value), conjunction<Args...>, First>::type {};
template <typename T>
struct conjunction<T> : T {};
template <typename...>
struct disjunction : std::false_type {};
template <typename First, typename... Args>
struct disjunction<First, Args...>
: std::conditional<bool(First::value), First, disjunction<Args...>>::type {};
template <typename T>
struct disjunction<T> : T {};
#endif // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT
template <typename Iterator>
using iterator_value_t = typename std::iterator_traits<Iterator>::value_type;
template <typename Iterator>
using iterator_key_t = typename std::remove_const<typename iterator_value_t<Iterator>::first_type>::type;
template <typename Iterator>
using iterator_mapped_t = typename iterator_value_t<Iterator>::second_type;
template <typename Iterator>
using iterator_alloc_pair_t = std::pair<typename std::add_const<iterator_key_t<Iterator>>::type,
iterator_mapped_t<Iterator>>;
template <typename A> using alloc_value_type = typename A::value_type;
template <typename A> using alloc_ptr_t = typename std::allocator_traits<A>::pointer;
template <typename A> using has_allocate = decltype(std::declval<alloc_ptr_t<A>&>() = std::declval<A>().allocate(0));
template <typename A> using has_deallocate = decltype(std::declval<A>().deallocate(std::declval<alloc_ptr_t<A>>(), 0));
// alloc_value_type should be checked first, because it can be used in other checks
template <typename T>
using is_allocator = supports<T, alloc_value_type, has_allocate, has_deallocate>;
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template <typename T>
inline constexpr bool is_allocator_v = is_allocator<T>::value;
#endif
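The void_t/supports machinery above is a detection idiom: a check alias is well-formed only when the expression it names compiles, and the partial specialization of supports_impl is chosen only when every check is well-formed. A self-contained restatement applied to a hypothetical has_size check instead of the allocator checks:
#include <type_traits>
#include <utility>
#include <vector>
template <typename...> struct void_impl { using type = void; };
template <typename... Args> using void_t = typename void_impl<Args...>::type;
template <typename T, typename, template <typename> class... Checks>
struct supports_impl { using type = std::false_type; };
template <typename T, template <typename> class... Checks>
struct supports_impl<T, void_t<Checks<T>...>, Checks...> { using type = std::true_type; };
template <typename T, template <typename> class... Checks>
using supports = typename supports_impl<T, void, Checks...>::type;
template <typename T> using has_size = decltype(std::declval<T&>().size());
static_assert(supports<std::vector<int>, has_size>::value, "vector has size()");
static_assert(!supports<int, has_size>::value, "int has no size()");
int main() { return 0; }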
// Template class whose nested "type" is the type of element number N in the pack Args
template <std::size_t N, typename... Args>
struct pack_element {
using type = void;
};
template <std::size_t N, typename T, typename... Args>
struct pack_element<N, T, Args...> {
using type = typename pack_element<N-1, Args...>::type;
};
template <typename T, typename... Args>
struct pack_element<0, T, Args...> {
using type = T;
};
template <std::size_t N, typename... Args>
using pack_element_t = typename pack_element<N, Args...>::type;
template <typename Func>
class raii_guard {
public:
static_assert(
std::is_nothrow_copy_constructible<Func>::value &&
std::is_nothrow_move_constructible<Func>::value,
"Throwing an exception during the Func copy or move construction cause an unexpected behavior."
);
raii_guard( Func f ) noexcept : my_func(f), is_active(true) {}
raii_guard( raii_guard&& g ) noexcept : my_func(std::move(g.my_func)), is_active(g.is_active) {
g.is_active = false;
}
~raii_guard() {
if (is_active) {
my_func();
}
}
void dismiss() {
is_active = false;
}
private:
Func my_func;
bool is_active;
}; // class raii_guard
template <typename Func>
raii_guard<Func> make_raii_guard( Func f ) {
return raii_guard<Func>(f);
}
template <typename Body>
struct try_call_proxy {
try_call_proxy( Body b ) : body(b) {}
template <typename OnExceptionBody>
void on_exception( OnExceptionBody on_exception_body ) {
auto guard = make_raii_guard(on_exception_body);
body();
guard.dismiss();
}
template <typename OnCompletionBody>
void on_completion(OnCompletionBody on_completion_body) {
auto guard = make_raii_guard(on_completion_body);
body();
}
Body body;
}; // struct try_call_proxy
// Template helper function for API
// try_call(lambda1).on_exception(lambda2)
// Executes lambda1 and, if it throws an exception, executes lambda2 (the exception then propagates)
template <typename Body>
try_call_proxy<Body> try_call( Body b ) {
return try_call_proxy<Body>(b);
}
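// Usage sketch (editorial note; acquire_resources/roll_back are hypothetical names):
//   try_call( [&] {
//       acquire_resources();   // may throw
//   } ).on_exception( [&] {
//       roll_back();           // runs only if the body throws; the exception then propagates
//   } );
// The segment_table copy constructor earlier in this commit uses exactly this pattern.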
#if __TBB_CPP17_IS_SWAPPABLE_PRESENT
using std::is_nothrow_swappable;
using std::is_swappable;
#else // __TBB_CPP17_IS_SWAPPABLE_PRESENT
namespace is_swappable_detail {
using std::swap;
template <typename T>
using has_swap = decltype(swap(std::declval<T&>(), std::declval<T&>()));
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
// Workaround for VS2015: it fails to instantiate noexcept(...) inside std::integral_constant.
template <typename T>
struct noexcept_wrapper {
static const bool value = noexcept(swap(std::declval<T&>(), std::declval<T&>()));
};
template <typename T>
struct is_nothrow_swappable_impl : std::integral_constant<bool, noexcept_wrapper<T>::value> {};
#else
template <typename T>
struct is_nothrow_swappable_impl : std::integral_constant<bool, noexcept(swap(std::declval<T&>(), std::declval<T&>()))> {};
#endif
}
template <typename T>
struct is_swappable : supports<T, is_swappable_detail::has_swap> {};
template <typename T>
struct is_nothrow_swappable
: conjunction<is_swappable<T>, is_swappable_detail::is_nothrow_swappable_impl<T>> {};
#endif // __TBB_CPP17_IS_SWAPPABLE_PRESENT
//! Allows storing a function parameter pack in a variable and later passing it to another function
template< typename... Types >
struct stored_pack;
template<>
struct stored_pack<>
{
using pack_type = stored_pack<>;
stored_pack() {}
// Friend front-end functions
template< typename F, typename Pack > friend void call(F&& f, Pack&& p);
template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p);
protected:
// Ideally, ref-qualified non-static methods would be used,
// but that would greatly reduce the set of compilers where it works.
template< typename Ret, typename F, typename... Preceding >
static Ret call(F&& f, const pack_type& /*pack*/, Preceding&&... params) {
return std::forward<F>(f)(std::forward<Preceding>(params)...);
}
template< typename Ret, typename F, typename... Preceding >
static Ret call(F&& f, pack_type&& /*pack*/, Preceding&&... params) {
return std::forward<F>(f)(std::forward<Preceding>(params)...);
}
};
template< typename T, typename... Types >
struct stored_pack<T, Types...> : stored_pack<Types...>
{
using pack_type = stored_pack<T, Types...>;
using pack_remainder = stored_pack<Types...>;
// Since lifetime of original values is out of control, copies should be made.
// Thus references should be stripped away from the deduced type.
typename std::decay<T>::type leftmost_value;
// Here rvalue references act in the same way as forwarding references,
// as long as class template parameters were deduced via forwarding references.
stored_pack(T&& t, Types&&... types)
: pack_remainder(std::forward<Types>(types)...), leftmost_value(std::forward<T>(t)) {}
// Friend front-end functions
template< typename F, typename Pack > friend void call(F&& f, Pack&& p);
template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p);
protected:
template< typename Ret, typename F, typename... Preceding >
static Ret call(F&& f, pack_type& pack, Preceding&&... params) {
return pack_remainder::template call<Ret>(
std::forward<F>(f), static_cast<pack_remainder&>(pack),
std::forward<Preceding>(params)... , pack.leftmost_value
);
}
template< typename Ret, typename F, typename... Preceding >
static Ret call(F&& f, pack_type&& pack, Preceding&&... params) {
return pack_remainder::template call<Ret>(
std::forward<F>(f), static_cast<pack_remainder&&>(pack),
std::forward<Preceding>(params)... , std::move(pack.leftmost_value)
);
}
};
//! Calls the given function with arguments taken from a stored_pack
template< typename F, typename Pack >
void call(F&& f, Pack&& p) {
std::decay<Pack>::type::template call<void>(std::forward<F>(f), std::forward<Pack>(p));
}
template< typename Ret, typename F, typename Pack >
Ret call_and_return(F&& f, Pack&& p) {
return std::decay<Pack>::type::template call<Ret>(std::forward<F>(f), std::forward<Pack>(p));
}
template< typename... Types >
stored_pack<Types...> save_pack(Types&&... types) {
return stored_pack<Types...>(std::forward<Types>(types)...);
}
// A structure whose value equals Trait::value
// but can be used in the immediate context thanks to the parameter T
template <typename Trait, typename T>
struct dependent_bool : std::integral_constant<bool, bool(Trait::value)> {};
template <typename Callable>
struct body_arg_detector;
template <typename Callable, typename ReturnType, typename Arg>
struct body_arg_detector<ReturnType(Callable::*)(Arg)> {
using arg_type = Arg;
};
template <typename Callable, typename ReturnType, typename Arg>
struct body_arg_detector<ReturnType(Callable::*)(Arg) const> {
using arg_type = Arg;
};
template <typename Callable>
struct argument_detector;
template <typename Callable>
struct argument_detector {
using type = typename body_arg_detector<decltype(&Callable::operator())>::arg_type;
};
template <typename ReturnType, typename Arg>
struct argument_detector<ReturnType(*)(Arg)> {
using type = Arg;
};
// Detects the argument type of a callable; works for callables with one argument.
template <typename Callable>
using argument_type_of = typename argument_detector<typename std::decay<Callable>::type>::type;
template <typename T>
struct type_identity {
using type = T;
};
template <typename T>
using type_identity_t = typename type_identity<T>::type;
} // inline namespace d0
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__template_helpers_H

394
third_party/tbb/detail/_utils.hh vendored Normal file
View file

@@ -0,0 +1,394 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__utils_H
#define __TBB_detail__utils_H
#include "third_party/libcxx/type_traits"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/functional"
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_machine.hh"
namespace tbb {
namespace detail {
inline namespace d0 {
//! Utility template function to prevent "unused" warnings by various compilers.
template<typename... T> void suppress_unused_warning(T&&...) {}
//! Compile-time constant that is upper bound on cache line/sector size.
/** It should be used only in situations where having a compile-time upper
bound is more useful than a run-time exact answer.
@ingroup memory_allocation */
constexpr size_t max_nfs_size = 128;
constexpr std::size_t max_nfs_size_exp = 7;
static_assert(1 << max_nfs_size_exp == max_nfs_size, "max_nfs_size_exp must be a log2(max_nfs_size)");
//! Class that implements exponential backoff.
class atomic_backoff {
//! Time delay, in units of "pause" instructions.
/** Should be equal to approximately the number of "pause" instructions
that take the same time as a context switch. Must be a power of two. */
static constexpr std::int32_t LOOPS_BEFORE_YIELD = 16;
std::int32_t count;
public:
// In many cases, an object of this type is initialized eagerly on hot path,
// as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ }
// For this reason, the construction cost must be very small!
atomic_backoff() : count(1) {}
// This constructor pauses immediately; do not use on hot paths!
atomic_backoff(bool) : count(1) { pause(); }
//! No Copy
atomic_backoff(const atomic_backoff&) = delete;
atomic_backoff& operator=(const atomic_backoff&) = delete;
//! Pause for a while.
void pause() {
if (count <= LOOPS_BEFORE_YIELD) {
machine_pause(count);
// Pause twice as long the next time.
count *= 2;
} else {
// Pause is so long that we might as well yield CPU to scheduler.
yield();
}
}
//! Pause for a few times and return false if saturated.
bool bounded_pause() {
machine_pause(count);
if (count < LOOPS_BEFORE_YIELD) {
// Pause twice as long the next time.
count *= 2;
return true;
} else {
return false;
}
}
void reset() {
count = 1;
}
};
//! Spin WHILE the condition is true.
/** C should be a predicate invocable with a value of type T. */
template <typename T, typename C>
T spin_wait_while(const std::atomic<T>& location, C comp, std::memory_order order) {
atomic_backoff backoff;
T snapshot = location.load(order);
while (comp(snapshot)) {
backoff.pause();
snapshot = location.load(order);
}
return snapshot;
}
//! Spin WHILE the value of the variable is equal to a given value
/** T and U should be comparable types. */
template <typename T, typename U>
T spin_wait_while_eq(const std::atomic<T>& location, const U value, std::memory_order order = std::memory_order_acquire) {
return spin_wait_while(location, [&value](T t) { return t == value; }, order);
}
//! Spin UNTIL the value of the variable is equal to a given value
/** T and U should be comparable types. */
template<typename T, typename U>
T spin_wait_until_eq(const std::atomic<T>& location, const U value, std::memory_order order = std::memory_order_acquire) {
return spin_wait_while(location, [&value](T t) { return t != value; }, order);
}
//! Spin UNTIL the condition returns true or spinning time is up.
/** Returns what the passed functor returned last time it was invoked. */
template <typename Condition>
bool timed_spin_wait_until(Condition condition) {
// 32 pauses + 32 yields were measured to be a balanced spin time before sleeping.
bool finish = condition();
for (int i = 1; !finish && i < 32; finish = condition(), i *= 2) {
machine_pause(i);
}
for (int i = 32; !finish && i < 64; finish = condition(), ++i) {
yield();
}
return finish;
}
template <typename T>
T clamp(T value, T lower_bound, T upper_bound) {
__TBB_ASSERT(lower_bound <= upper_bound, "Incorrect bounds");
return value > lower_bound ? (value > upper_bound ? upper_bound : value) : lower_bound;
}
template <typename T>
std::uintptr_t log2(T in) {
__TBB_ASSERT(in > 0, "The logarithm of a non-positive value is undefined.");
return machine_log2(in);
}
template<typename T>
T reverse_bits(T src) {
return machine_reverse_bits(src);
}
template<typename T>
T reverse_n_bits(T src, std::size_t n) {
__TBB_ASSERT(n != 0, "Reverse for 0 bits is undefined behavior.");
return reverse_bits(src) >> (number_of_bits<T>() - n);
}
// A function to check if passed integer is a power of two
template <typename IntegerType>
constexpr bool is_power_of_two( IntegerType arg ) {
static_assert(std::is_integral<IntegerType>::value,
"An argument for is_power_of_two should be integral type");
return arg && (0 == (arg & (arg - 1)));
}
// A function to determine if passed integer is a power of two
// at least as big as another power of two, i.e. for strictly positive i and j,
// with j being a power of two, determines whether i==j<<k for some nonnegative k
template <typename ArgIntegerType, typename DivisorIntegerType>
constexpr bool is_power_of_two_at_least(ArgIntegerType arg, DivisorIntegerType divisor) {
// Divisor should be a power of two
static_assert(std::is_integral<ArgIntegerType>::value,
"An argument for is_power_of_two_at_least should be integral type");
return 0 == (arg & (arg - divisor));
}
// A function to compute arg modulo divisor where divisor is a power of 2.
template<typename ArgIntegerType, typename DivisorIntegerType>
inline ArgIntegerType modulo_power_of_two(ArgIntegerType arg, DivisorIntegerType divisor) {
__TBB_ASSERT( is_power_of_two(divisor), "Divisor should be a power of two" );
return arg & (divisor - 1);
}
//! A function to check if the passed-in pointer is aligned on a specific boundary
template<typename T>
constexpr bool is_aligned(T* pointer, std::uintptr_t alignment) {
return 0 == (reinterpret_cast<std::uintptr_t>(pointer) & (alignment - 1));
}
#if TBB_USE_ASSERT
static void* const poisoned_ptr = reinterpret_cast<void*>(-1);
//! Set p to invalid pointer value.
template<typename T>
inline void poison_pointer( T* &p ) { p = reinterpret_cast<T*>(poisoned_ptr); }
template<typename T>
inline void poison_pointer(std::atomic<T*>& p) { p.store(reinterpret_cast<T*>(poisoned_ptr), std::memory_order_relaxed); }
/** Expected to be used in assertions only, thus no empty form is defined. **/
template<typename T>
inline bool is_poisoned( T* p ) { return p == reinterpret_cast<T*>(poisoned_ptr); }
template<typename T>
inline bool is_poisoned(const std::atomic<T*>& p) { return is_poisoned(p.load(std::memory_order_relaxed)); }
#else
template<typename T>
inline void poison_pointer(T&) {/*do nothing*/}
#endif /* !TBB_USE_ASSERT */
template <std::size_t alignment = 0, typename T>
bool assert_pointer_valid(T* p, const char* comment = nullptr) {
suppress_unused_warning(p, comment);
__TBB_ASSERT(p != nullptr, comment);
__TBB_ASSERT(!is_poisoned(p), comment);
#if !(_MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER)
__TBB_ASSERT(is_aligned(p, alignment == 0 ? alignof(T) : alignment), comment);
#endif
// Returns something to simplify assert_pointers_valid implementation.
return true;
}
template <typename... Args>
void assert_pointers_valid(Args*... p) {
// suppress_unused_warning is used as an evaluation context for the variadic pack.
suppress_unused_warning(assert_pointer_valid(p)...);
}
//! Base class for types that should not be assigned.
class no_assign {
public:
void operator=(const no_assign&) = delete;
no_assign(const no_assign&) = default;
no_assign() = default;
};
//! Base class for types that should not be copied or assigned.
class no_copy: no_assign {
public:
no_copy(const no_copy&) = delete;
no_copy() = default;
};
template <typename T>
void swap_atomics_relaxed(std::atomic<T>& lhs, std::atomic<T>& rhs){
T tmp = lhs.load(std::memory_order_relaxed);
lhs.store(rhs.load(std::memory_order_relaxed), std::memory_order_relaxed);
rhs.store(tmp, std::memory_order_relaxed);
}
//! One-time initialization states
enum class do_once_state {
uninitialized = 0, ///< No execution attempts have been undertaken yet
pending, ///< A thread is executing associated do-once routine
executed, ///< Do-once routine has been executed
initialized = executed ///< Convenience alias
};
//! One-time initialization function
/** \param initializer Pointer to a function without arguments.
The variant that returns bool is used for cases when initialization can fail
and it is OK to continue execution, but the state should be reset so that
the initialization attempt can be repeated the next time.
\param state Shared state associated with initializer that specifies its
initialization state. Must be initially set to the #uninitialized value
(e.g. by means of default static zero initialization). **/
template <typename F>
void atomic_do_once( const F& initializer, std::atomic<do_once_state>& state ) {
// The loop in the implementation is necessary to avoid race when thread T2
// that arrived in the middle of initialization attempt by another thread T1
// has just made initialization possible.
// In such a case T2 has to rely on T1 to initialize, but T1 may already be past
// the point where it can recognize the changed conditions.
do_once_state expected_state;
while ( state.load( std::memory_order_acquire ) != do_once_state::executed ) {
if( state.load( std::memory_order_relaxed ) == do_once_state::uninitialized ) {
expected_state = do_once_state::uninitialized;
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
using enum_type = typename std::underlying_type<do_once_state>::type;
if( ((std::atomic<enum_type>&)state).compare_exchange_strong( (enum_type&)expected_state, (enum_type)do_once_state::pending ) ) {
#else
if( state.compare_exchange_strong( expected_state, do_once_state::pending ) ) {
#endif
run_initializer( initializer, state );
break;
}
}
spin_wait_while_eq( state, do_once_state::pending );
}
}
// Run the initializer, which cannot fail
template<typename Functor>
void run_initializer(const Functor& f, std::atomic<do_once_state>& state ) {
f();
state.store(do_once_state::executed, std::memory_order_release);
}
#if __TBB_CPP20_CONCEPTS_PRESENT
template <typename T>
concept boolean_testable_impl = std::convertible_to<T, bool>;
template <typename T>
concept boolean_testable = boolean_testable_impl<T> && requires( T&& t ) {
{ !std::forward<T>(t) } -> boolean_testable_impl;
};
#if __TBB_CPP20_COMPARISONS_PRESENT
struct synthesized_three_way_comparator {
template <typename T1, typename T2>
auto operator()( const T1& lhs, const T2& rhs ) const
requires requires {
{ lhs < rhs } -> boolean_testable;
{ rhs < lhs } -> boolean_testable;
}
{
if constexpr (std::three_way_comparable_with<T1, T2>) {
return lhs <=> rhs;
} else {
if (lhs < rhs) {
return std::weak_ordering::less;
}
if (rhs < lhs) {
return std::weak_ordering::greater;
}
return std::weak_ordering::equivalent;
}
}
}; // struct synthesized_three_way_comparator
template <typename T1, typename T2 = T1>
using synthesized_three_way_result = decltype(synthesized_three_way_comparator{}(std::declval<T1&>(),
std::declval<T2&>()));
#endif // __TBB_CPP20_COMPARISONS_PRESENT
// Check if the type T is implicitly OR explicitly convertible to U
template <typename T, typename U>
concept relaxed_convertible_to = std::constructible_from<U, T>;
template <typename T, typename U>
concept adaptive_same_as =
#if __TBB_STRICT_CONSTRAINTS
std::same_as<T, U>;
#else
std::convertible_to<T, U>;
#endif
#endif // __TBB_CPP20_CONCEPTS_PRESENT
template <typename F, typename... Args>
auto invoke(F&& f, Args&&... args)
#if __TBB_CPP17_INVOKE_PRESENT
noexcept(std::is_nothrow_invocable_v<F, Args...>)
-> std::invoke_result_t<F, Args...>
{
return std::invoke(std::forward<F>(f), std::forward<Args>(args)...);
}
#else // __TBB_CPP17_INVOKE_PRESENT
noexcept(noexcept(std::forward<F>(f)(std::forward<Args>(args)...)))
-> decltype(std::forward<F>(f)(std::forward<Args>(args)...))
{
return std::forward<F>(f)(std::forward<Args>(args)...);
}
#endif // __TBB_CPP17_INVOKE_PRESENT
} // namespace d0
namespace d1 {
class delegate_base {
public:
virtual bool operator()() const = 0;
virtual ~delegate_base() {}
};
template <typename FuncType>
class delegated_function : public delegate_base {
public:
delegated_function(FuncType& f) : my_func(f) {}
bool operator()() const override {
return my_func();
}
private:
FuncType &my_func;
};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__utils_H
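A minimal usage sketch for the atomic_do_once helper above (not part of the vendored file; trace_ready and init_tracing are hypothetical names, and standard bracket includes stand in for this repo's libcxx paths): a zero-initialized state plus any callable gives race-free one-time initialization.
#include <atomic>
#include "third_party/tbb/detail/_utils.hh"
namespace {
// Zero-initialization maps to do_once_state::uninitialized.
std::atomic<tbb::detail::d0::do_once_state> trace_ready{};
void init_tracing() { /* open log sinks, etc. */ }
}
void ensure_tracing_initialized() {
    // Exactly one caller executes init_tracing; concurrent callers spin-wait
    // until the state becomes do_once_state::executed.
    tbb::detail::d0::atomic_do_once([] { init_tracing(); }, trace_ready);
}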

View file

@ -0,0 +1,105 @@
// clang-format off
/*
Copyright (c) 2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_detail__address_waiters_H
#define __TBB_detail__address_waiters_H
#include "third_party/tbb/detail/_utils.hh"
namespace tbb {
namespace detail {
namespace r1 {
TBB_EXPORT void __TBB_EXPORTED_FUNC wait_on_address(void* address, d1::delegate_base& wakeup_condition, std::uintptr_t context);
TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address(void* address, std::uintptr_t context);
TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_one(void* address);
TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_all(void* address);
} // namespace r1
namespace d1 {
template <typename Predicate>
void adaptive_wait_on_address(void* address, Predicate wakeup_condition, std::uintptr_t context) {
if (!timed_spin_wait_until(wakeup_condition)) {
d1::delegated_function<Predicate> pred(wakeup_condition);
r1::wait_on_address(address, pred, context);
}
}
template <typename T>
class waitable_atomic {
public:
waitable_atomic() = default;
explicit waitable_atomic(T value) : my_atomic(value) {}
waitable_atomic(const waitable_atomic&) = delete;
waitable_atomic& operator=(const waitable_atomic&) = delete;
T load(std::memory_order order) const noexcept {
return my_atomic.load(order);
}
T exchange(T desired) noexcept {
return my_atomic.exchange(desired);
}
void wait(T old, std::uintptr_t context, std::memory_order order) {
auto wakeup_condition = [&] { return my_atomic.load(order) != old; };
if (!timed_spin_wait_until(wakeup_condition)) {
// We need to use a loop here, because notify_all() will wake up all threads,
// but the predicate may still be false for some of them.
d1::delegated_function<decltype(wakeup_condition)> pred(wakeup_condition);
do {
r1::wait_on_address(this, pred, context);
} while (!wakeup_condition());
}
}
void wait_until(T expected, std::uintptr_t context, std::memory_order order) {
auto wakeup_condition = [&] { return my_atomic.load(order) == expected; };
if (!timed_spin_wait_until(wakeup_condition)) {
// We need to use a loop here, because notify_all() will wake up all threads,
// but the predicate may still be false for some of them.
d1::delegated_function<decltype(wakeup_condition)> pred(wakeup_condition);
do {
r1::wait_on_address(this, pred, context);
} while (!wakeup_condition());
}
}
void notify_relaxed(std::uintptr_t context) {
r1::notify_by_address(this, context);
}
void notify_one_relaxed() {
r1::notify_by_address_one(this);
}
// TODO: consider adding following interfaces:
// store(desired, memory_order)
// notify_all_relaxed()
private:
std::atomic<T> my_atomic{};
};
} // namespace d1
} // namespace detail
} // namespace tbb
#endif // __TBB_detail__address_waiters_H
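A minimal sketch of a one-shot gate built on waitable_atomic above (not part of the vendored file; gate and gate_context are hypothetical, and it is assumed, as in TBB's internal callers, that the waiter and the notifier agree on the same context tag).
#include <atomic>
#include <cstdint>
#include "third_party/tbb/detail/_address_waiters.hh"
static tbb::detail::d1::waitable_atomic<int> gate{0};
static constexpr std::uintptr_t gate_context = 1; // opaque tag shared by waiter and notifier
void wait_for_gate() {
    // Spins adaptively first, then blocks while the value is still 0.
    gate.wait(/*old*/ 0, gate_context, std::memory_order_acquire);
}
void open_gate() {
    gate.exchange(1);                  // the class has no store(); exchange publishes the value
    gate.notify_relaxed(gate_context); // wake threads blocked in wait_on_address
}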

516
third_party/tbb/dynamic_link.cc vendored Normal file
View file

@ -0,0 +1,516 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/dynamic_link.hh"
#include "third_party/tbb/environment.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/tbb/detail/_utils.hh"
/*
This file is used by both TBB and OpenMP RTL. Do not use __TBB_ASSERT() macro
and runtime_warning() function because they are not available in OpenMP. Use
__TBB_ASSERT_EX and DYNAMIC_LINK_WARNING instead.
*/
#include "third_party/libcxx/cstdarg" // va_list etc.
#include "third_party/libcxx/cstring" // strrchr
#if _WIN32
#include "libc/mem/mem.h"
// Unify system calls
#define dlopen( name, flags ) LoadLibrary( name )
#define dlsym( handle, name ) GetProcAddress( handle, name )
#define dlclose( handle ) ( ! FreeLibrary( handle ) )
#define dlerror() GetLastError()
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#else /* _WIN32 */
#include "libc/runtime/dlfcn.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/cstdlib"
#endif /* _WIN32 */
#if __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED
//TODO: use function attribute for weak symbols instead of the pragma.
#pragma weak dlopen
#pragma weak dlsym
#pragma weak dlclose
#endif /* __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED */
#define __USE_STATIC_DL_INIT ( !__ANDROID__ )
/*
dynamic_link is a common interface for searching for required symbols in an
executable and dynamic libraries.
dynamic_link provides certain guarantees:
1. Either all or none of the requested symbols are resolved. Moreover, if
symbols are not resolved, the dynamic_link_descriptor table is not modified;
2. All returned symbols have secured lifetime: this means that none of them
can be invalidated until dynamic_unlink is called;
3. Any loaded library is loaded only via the full path. The full path is that
from which the runtime itself was loaded. (This is done to avoid security
issues caused by loading libraries from insecure paths).
dynamic_link searches for the requested symbols in three stages, stopping as
soon as all of the symbols have been resolved.
1. Search the global scope:
a. On Windows: dynamic_link tries to obtain the handle of the requested
library and if it succeeds it resolves the symbols via that handle.
b. On Linux: dynamic_link tries to search for the symbols in the global
scope via the main program handle. If the symbols are present in the global
scope their lifetime is not guaranteed (since dynamic_link does not know
anything about the library from which they are exported). Therefore it
tries to "pin" the symbols by obtaining the library name and reopening it.
dlopen may fail to reopen the library in two cases:
i. The symbols are exported from the executable. Currently dynamic_link
cannot handle this situation, so it will not find these symbols in this
step.
ii. The necessary library has been unloaded and cannot be reloaded. It
seems there is nothing that can be done in this case. No symbols are
returned.
2. Dynamic load: an attempt is made to load the requested library via the
full path.
The full path used is that from which the runtime itself was loaded. If the
library can be loaded, then an attempt is made to resolve the requested
symbols in the newly loaded library.
If the symbols are not found the library is unloaded.
3. Weak symbols: if weak symbols are available they are returned.
*/
namespace tbb {
namespace detail {
namespace r1 {
#if __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED
#if !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED
// Report runtime errors and continue.
#define DYNAMIC_LINK_WARNING dynamic_link_warning
static void dynamic_link_warning( dynamic_link_error_t code, ... ) {
suppress_unused_warning(code);
} // dynamic_link_warning
#endif /* !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED */
static bool resolve_symbols( dynamic_link_handle module, const dynamic_link_descriptor descriptors[], std::size_t required )
{
if ( !module )
return false;
#if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
if ( !dlsym ) return false;
#endif /* !__TBB_DYNAMIC_LOAD_ENABLED */
const std::size_t n_desc=20; // Usually we don't have more than 20 descriptors per library
__TBB_ASSERT_EX( required <= n_desc, "Too many descriptors are required" );
if ( required > n_desc ) return false;
pointer_to_handler h[n_desc];
for ( std::size_t k = 0; k < required; ++k ) {
dynamic_link_descriptor const & desc = descriptors[k];
pointer_to_handler addr = (pointer_to_handler)dlsym( module, desc.name );
if ( !addr ) {
return false;
}
h[k] = addr;
}
// Commit the entry points.
// Cannot use memset here, because the writes must be atomic.
for( std::size_t k = 0; k < required; ++k )
*descriptors[k].handler = h[k];
return true;
}
#if __TBB_WIN8UI_SUPPORT
bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle*, int flags ) {
dynamic_link_handle tmp_handle = nullptr;
TCHAR wlibrary[256];
if ( MultiByteToWideChar(CP_UTF8, 0, library, -1, wlibrary, 255) == 0 ) return false;
if ( flags & DYNAMIC_LINK_LOAD )
tmp_handle = LoadPackagedLibrary( wlibrary, 0 );
if (tmp_handle != nullptr){
return resolve_symbols(tmp_handle, descriptors, required);
}else{
return false;
}
}
void dynamic_unlink( dynamic_link_handle ) {}
void dynamic_unlink_all() {}
#else
#if __TBB_DYNAMIC_LOAD_ENABLED
/*
There is a security issue on Windows: LoadLibrary() may load and execute malicious code.
See http://www.microsoft.com/technet/security/advisory/2269637.mspx for details.
To avoid the issue, we have to pass the full path (not just the library name) to LoadLibrary. This
function constructs the full path to the specified library (it is assumed the library is located
side-by-side with tbb.dll).
The function constructs an absolute path for a given relative path. Important: the base directory is
not the current one, it is the directory tbb.dll was loaded from.
Example:
Let us assume "tbb.dll" is located in "c:\program files\common\intel\" directory, e.g.
absolute path of the library is "c:\program files\common\intel\tbb.dll". Absolute path for
"tbbmalloc.dll" would be "c:\program files\common\intel\tbbmalloc.dll". Absolute path for
"malloc\tbbmalloc.dll" would be "c:\program files\common\intel\malloc\tbbmalloc.dll".
*/
// The handles_t struct below is used by the dynamic_link routine to store handles of
// all loaded or pinned dynamic libraries. When TBB is shut down, it calls
// dynamic_unlink_all(), which unloads the modules referenced by handles_t.
// This struct must not have any constructors, since it may be used before
// its constructor would have been called (i.e. during static initialization).
#define MAX_LOADED_MODULES 8 // The number of maximum possible modules which can be loaded
using atomic_incrementer = std::atomic<std::size_t>;
static struct handles_t {
atomic_incrementer my_size;
dynamic_link_handle my_handles[MAX_LOADED_MODULES];
void add(const dynamic_link_handle &handle) {
const std::size_t ind = my_size++;
__TBB_ASSERT_EX( ind < MAX_LOADED_MODULES, "Too many modules are loaded" );
my_handles[ind] = handle;
}
void free() {
const std::size_t size = my_size;
for (std::size_t i=0; i<size; ++i)
dynamic_unlink( my_handles[i] );
}
} handles;
static std::once_flag init_dl_data_state;
static struct ap_data_t {
char _path[PATH_MAX+1];
std::size_t _len;
} ap_data;
static void init_ap_data() {
#if _WIN32
// Get handle of our DLL first.
HMODULE handle;
BOOL brc = GetModuleHandleEx(
GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
(LPCSTR)( & dynamic_link ), // any function inside the library can be used for the address
& handle
);
if ( !brc ) { // Error occurred.
int err = GetLastError();
DYNAMIC_LINK_WARNING( dl_sys_fail, "GetModuleHandleEx", err );
return;
}
// Now get path to our DLL.
DWORD drc = GetModuleFileName( handle, ap_data._path, static_cast< DWORD >( PATH_MAX ) );
if ( drc == 0 ) { // Error occurred.
int err = GetLastError();
DYNAMIC_LINK_WARNING( dl_sys_fail, "GetModuleFileName", err );
return;
}
if ( drc >= PATH_MAX ) { // Buffer too short.
DYNAMIC_LINK_WARNING( dl_buff_too_small );
return;
}
// Find the position of the last backslash.
char *backslash = std::strrchr( ap_data._path, '\\' );
if ( !backslash ) { // Backslash not found.
__TBB_ASSERT_EX( backslash != nullptr, "Unbelievable.");
return;
}
__TBB_ASSERT_EX( backslash >= ap_data._path, "Unbelievable.");
ap_data._len = (std::size_t)(backslash - ap_data._path) + 1;
*(backslash+1) = 0;
#else
// Get the library path
Dl_info dlinfo;
int res = dladdr( (void*)&dynamic_link, &dlinfo ); // any function inside the library can be used for the address
if ( !res ) {
char const * err = dlerror();
DYNAMIC_LINK_WARNING( dl_sys_fail, "dladdr", err );
return;
} else {
__TBB_ASSERT_EX( dlinfo.dli_fname!=nullptr, "Unbelievable." );
}
char const *slash = std::strrchr( dlinfo.dli_fname, '/' );
std::size_t fname_len=0;
if ( slash ) {
__TBB_ASSERT_EX( slash >= dlinfo.dli_fname, "Unbelievable.");
fname_len = (std::size_t)(slash - dlinfo.dli_fname) + 1;
}
std::size_t rc;
if ( dlinfo.dli_fname[0]=='/' ) {
// The library path is absolute
rc = 0;
ap_data._len = 0;
} else {
// The library path is relative so get the current working directory
if ( !getcwd( ap_data._path, sizeof(ap_data._path)/sizeof(ap_data._path[0]) ) ) {
DYNAMIC_LINK_WARNING( dl_buff_too_small );
return;
}
ap_data._len = std::strlen( ap_data._path );
ap_data._path[ap_data._len++]='/';
rc = ap_data._len;
}
if ( fname_len>0 ) {
ap_data._len += fname_len;
if ( ap_data._len>PATH_MAX ) {
DYNAMIC_LINK_WARNING( dl_buff_too_small );
ap_data._len=0;
return;
}
std::strncpy( ap_data._path+rc, dlinfo.dli_fname, fname_len );
ap_data._path[ap_data._len]=0;
}
#endif /* _WIN32 */
}
static void init_dl_data() {
init_ap_data();
}
/*
The function constructs absolute path for given relative path. Important: Base directory is not
current one, it is the directory libtbb.so loaded from.
Arguments:
in name -- Name of a file (it may include a relative path, but must not be an absolute one).
out path -- Buffer to save result (absolute path) to.
in len -- Size of buffer.
ret -- 0 -- Error occurred.
> len -- Buffer too short, required size returned.
otherwise -- Ok, number of characters (incl. terminating null) written to buffer.
*/
static std::size_t abs_path( char const * name, char * path, std::size_t len ) {
if ( ap_data._len == 0 )
return 0;
std::size_t name_len = std::strlen( name );
std::size_t full_len = name_len+ap_data._len;
if ( full_len < len ) {
__TBB_ASSERT( ap_data._path[ap_data._len] == 0, nullptr);
__TBB_ASSERT( std::strlen(ap_data._path) == ap_data._len, nullptr);
std::strncpy( path, ap_data._path, ap_data._len + 1 );
__TBB_ASSERT( path[ap_data._len] == 0, nullptr);
std::strncat( path, name, len - ap_data._len );
__TBB_ASSERT( std::strlen(path) == full_len, nullptr);
}
return full_len+1; // +1 for null character
}
#endif // __TBB_DYNAMIC_LOAD_ENABLED
void init_dynamic_link_data() {
#if __TBB_DYNAMIC_LOAD_ENABLED
std::call_once( init_dl_data_state, init_dl_data );
#endif
}
#if __USE_STATIC_DL_INIT
// The ap_data structure is initialized with the current directory on Linux.
// So it should be initialized as soon as possible, since the current directory may be changed.
// The static_init_dl_data object below provides this initialization during library loading.
static struct static_init_dl_data_t {
static_init_dl_data_t() {
init_dynamic_link_data();
}
} static_init_dl_data;
#endif
#if __TBB_WEAK_SYMBOLS_PRESENT
static bool weak_symbol_link( const dynamic_link_descriptor descriptors[], std::size_t required )
{
// Check if the required entries are present in what was loaded into our process.
for ( std::size_t k = 0; k < required; ++k )
if ( !descriptors[k].ptr )
return false;
// Commit the entry points.
for ( std::size_t k = 0; k < required; ++k )
*descriptors[k].handler = (pointer_to_handler) descriptors[k].ptr;
return true;
}
#else
static bool weak_symbol_link( const dynamic_link_descriptor[], std::size_t ) {
return false;
}
#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
void dynamic_unlink( dynamic_link_handle handle ) {
#if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
if ( !dlclose ) return;
#endif
if ( handle ) {
dlclose( handle );
}
}
void dynamic_unlink_all() {
#if __TBB_DYNAMIC_LOAD_ENABLED
handles.free();
#endif
}
static dynamic_link_handle global_symbols_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required ) {
dynamic_link_handle library_handle{};
#if _WIN32
auto res = GetModuleHandleEx(0, library, &library_handle);
__TBB_ASSERT_EX((res && library_handle) || (!res && !library_handle), nullptr);
#else /* _WIN32 */
#if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
if ( !dlopen ) return 0;
#endif /* !__TBB_DYNAMIC_LOAD_ENABLED */
// RTLD_GLOBAL - to guarantee that old TBB will find the loaded library
// RTLD_NOLOAD - to avoid actually loading the library here, since we do not have its full path
library_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL | RTLD_NOLOAD);
#endif /* _WIN32 */
if (library_handle) {
if (!resolve_symbols(library_handle, descriptors, required)) {
dynamic_unlink(library_handle);
library_handle = nullptr;
}
}
return library_handle;
}
static void save_library_handle( dynamic_link_handle src, dynamic_link_handle *dst ) {
__TBB_ASSERT_EX( src, "The library handle to store must be non-zero" );
if ( dst )
*dst = src;
#if __TBB_DYNAMIC_LOAD_ENABLED
else
handles.add( src );
#endif /* __TBB_DYNAMIC_LOAD_ENABLED */
}
#if !_WIN32
int loading_flags(bool local_binding) {
int flags = RTLD_NOW;
if (local_binding) {
flags = flags | RTLD_LOCAL;
#if (__linux__ && __GLIBC__) && !__TBB_USE_SANITIZERS
if( !GetBoolEnvironmentVariable("TBB_ENABLE_SANITIZERS") ) {
flags = flags | RTLD_DEEPBIND;
}
#endif
} else {
flags = flags | RTLD_GLOBAL;
}
return flags;
}
#endif
dynamic_link_handle dynamic_load( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, bool local_binding ) {
::tbb::detail::suppress_unused_warning( library, descriptors, required, local_binding );
#if __TBB_DYNAMIC_LOAD_ENABLED
std::size_t const len = PATH_MAX + 1;
char path[ len ];
std::size_t rc = abs_path( library, path, len );
if ( 0 < rc && rc <= len ) {
#if _WIN32
// Prevent Windows from displaying silly message boxes if it fails to load library
// (e.g. because of MS runtime problems - one of those crazy manifest related ones)
UINT prev_mode = SetErrorMode (SEM_FAILCRITICALERRORS);
#endif /* _WIN32 */
// The second argument (loading_flags) is ignored on Windows
dynamic_link_handle library_handle = dlopen( path, loading_flags(local_binding) );
#if _WIN32
SetErrorMode (prev_mode);
#endif /* _WIN32 */
if( library_handle ) {
if( !resolve_symbols( library_handle, descriptors, required ) ) {
// The loaded library does not contain all the expected entry points
dynamic_unlink( library_handle );
library_handle = nullptr;
}
} else
DYNAMIC_LINK_WARNING( dl_lib_not_found, path, dlerror() );
return library_handle;
} else if ( rc>len )
DYNAMIC_LINK_WARNING( dl_buff_too_small );
// rc == 0 means init_ap_data failed, so the warning has already been issued.
#endif /* __TBB_DYNAMIC_LOAD_ENABLED */
return nullptr;
}
bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle *handle, int flags ) {
init_dynamic_link_data();
// TODO: May global_symbols_link find weak symbols?
dynamic_link_handle library_handle = ( flags & DYNAMIC_LINK_GLOBAL ) ? global_symbols_link( library, descriptors, required ) : nullptr;
#if defined(_MSC_VER) && _MSC_VER <= 1900
#pragma warning (push)
// MSVC 2015 warning: 'int': forcing value to bool 'true' or 'false'
#pragma warning (disable: 4800)
#endif
if ( !library_handle && ( flags & DYNAMIC_LINK_LOAD ) )
library_handle = dynamic_load( library, descriptors, required, flags & DYNAMIC_LINK_LOCAL );
#if defined(_MSC_VER) && _MSC_VER <= 1900
#pragma warning (pop)
#endif
if ( !library_handle && ( flags & DYNAMIC_LINK_WEAK ) )
return weak_symbol_link( descriptors, required );
if ( library_handle ) {
save_library_handle( library_handle, handle );
return true;
}
return false;
}
#endif /*__TBB_WIN8UI_SUPPORT*/
#else /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */
bool dynamic_link( const char*, const dynamic_link_descriptor*, std::size_t, dynamic_link_handle *handle, int ) {
if ( handle )
*handle=0;
return false;
}
void dynamic_unlink( dynamic_link_handle ) {}
void dynamic_unlink_all() {}
#endif /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */
} // namespace r1
} // namespace detail
} // namespace tbb
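A minimal sketch of driving the staged search described above with explicit flags (not part of the vendored file; libbar.so.2 and bar_init are hypothetical). Passing only DYNAMIC_LINK_GLOBAL restricts resolution to stage 1, so the symbols are resolved and pinned only if the library is already present in the process.
#include "third_party/tbb/dynamic_link.hh"
static void (*bar_init_ptr)() = nullptr;
static const tbb::detail::r1::dynamic_link_descriptor bar_symbols[] = {
    DLD_NOWEAK(bar_init, bar_init_ptr)
};
bool pin_bar_if_already_loaded() {
    tbb::detail::r1::dynamic_link_handle module = nullptr;
    // Stage 1 only: look in the global scope and pin the library; do not load it.
    return tbb::detail::r1::dynamic_link("libbar.so.2", bar_symbols, 1, &module,
                                         tbb::detail::r1::DYNAMIC_LINK_GLOBAL);
}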

137
third_party/tbb/dynamic_link.hh vendored Normal file
View file

@ -0,0 +1,137 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_dynamic_link
#define __TBB_dynamic_link
// Support for dynamic loading entry points from other shared libraries.
#include "third_party/tbb/detail/_config.hh"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/mutex"
/** By default, symbols declared and defined here go into namespace tbb::internal.
To put them in other namespace, define macros OPEN_INTERNAL_NAMESPACE
and CLOSE_INTERNAL_NAMESPACE to override the following default definitions. **/
#include "third_party/libcxx/cstddef"
#ifdef _WIN32
#include "libc/nt/accounting.h"
#include "libc/nt/automation.h"
#include "libc/nt/console.h"
#include "libc/nt/debug.h"
#include "libc/nt/dll.h"
#include "libc/nt/enum/keyaccess.h"
#include "libc/nt/enum/regtype.h"
#include "libc/nt/errors.h"
#include "libc/nt/events.h"
#include "libc/nt/files.h"
#include "libc/nt/ipc.h"
#include "libc/nt/memory.h"
#include "libc/nt/paint.h"
#include "libc/nt/process.h"
#include "libc/nt/registry.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/windows.h"
#include "libc/nt/winsock.h"
#endif /* _WIN32 */
namespace tbb {
namespace detail {
namespace r1 {
//! Type definition for a pointer to a void somefunc(void)
typedef void (*pointer_to_handler)();
//! The helper to construct dynamic_link_descriptor structure
// Double cast through the void* in DLD macro is necessary to
// prevent warnings from some compilers (g++ 4.1)
#if __TBB_WEAK_SYMBOLS_PRESENT
#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h), (pointer_to_handler)&s}
#define DLD_NOWEAK(s,h) {#s, (pointer_to_handler*)(void*)(&h), nullptr}
#else
#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h)}
#define DLD_NOWEAK(s,h) DLD(s,h)
#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
//! Association between a handler name and location of pointer to it.
struct dynamic_link_descriptor {
//! Name of the handler
const char* name;
//! Pointer to the handler
pointer_to_handler* handler;
#if __TBB_WEAK_SYMBOLS_PRESENT
//! Weak symbol
pointer_to_handler ptr;
#endif
};
#if _WIN32
using dynamic_link_handle = HMODULE;
#else
using dynamic_link_handle = void*;
#endif /* _WIN32 */
const int DYNAMIC_LINK_GLOBAL = 0x01;
const int DYNAMIC_LINK_LOAD = 0x02;
const int DYNAMIC_LINK_WEAK = 0x04;
const int DYNAMIC_LINK_LOCAL = 0x08;
const int DYNAMIC_LINK_LOCAL_BINDING = DYNAMIC_LINK_LOCAL | DYNAMIC_LINK_LOAD;
const int DYNAMIC_LINK_DEFAULT = DYNAMIC_LINK_GLOBAL | DYNAMIC_LINK_LOAD | DYNAMIC_LINK_WEAK;
//! Fill in dynamically linked handlers.
/** 'library' is the name of the requested library. It should not contain a full
path since dynamic_link adds the full path (from which the runtime itself
was loaded) to the library name.
'required' is the number of the initial entries in the array descriptors[]
that have to be found in order for the call to succeed. If the library and
all the required handlers are found, then the corresponding handler
pointers are set, and the return value is true. Otherwise the original
array of descriptors is left untouched and the return value is false.
'required' is limited to 20 (exceeding this value will result in failure
to load the symbols, and the return value will be false).
'handle' is the handle of the library if it is loaded. Otherwise it is left
untouched.
'flags' is the set of DYNAMIC_LINK_* flags. Each of the DYNAMIC_LINK_* flags
allows its corresponding linking stage.
**/
bool dynamic_link( const char* library,
const dynamic_link_descriptor descriptors[],
std::size_t required,
dynamic_link_handle* handle = nullptr,
int flags = DYNAMIC_LINK_DEFAULT );
void dynamic_unlink( dynamic_link_handle handle );
void dynamic_unlink_all();
enum dynamic_link_error_t {
dl_success = 0,
dl_lib_not_found, // char const * lib, dlerr_t err
dl_sym_not_found, // char const * sym, dlerr_t err
// Note: dlerr_t depends on OS: it is char const * on Linux* and macOS*, int on Windows*.
dl_sys_fail, // char const * func, int err
dl_buff_too_small // none
}; // dynamic_link_error_t
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* __TBB_dynamic_link */
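A minimal sketch of the descriptor-table pattern this header documents (not part of the vendored file; libfoo.so.1 and foo_version are hypothetical, and DLD_NOWEAK is used so the sketch does not rely on weak-symbol support).
#include <cstddef>
#include "third_party/tbb/dynamic_link.hh"
// Target signature we expect to resolve from the hypothetical library.
static int (*foo_version_ptr)() = nullptr;
static const tbb::detail::r1::dynamic_link_descriptor foo_symbols[] = {
    DLD_NOWEAK(foo_version, foo_version_ptr)
};
bool link_foo() {
    // Either every listed handler pointer is filled in and true is returned,
    // or none is touched and false is returned.
    return tbb::detail::r1::dynamic_link("libfoo.so.1", foo_symbols,
                                         sizeof(foo_symbols) / sizeof(foo_symbols[0]));
}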

File diff suppressed because it is too large

82
third_party/tbb/environment.hh vendored Normal file
View file

@ -0,0 +1,82 @@
// clang-format off
/*
Copyright (c) 2018-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_tbb_environment_H
#define __TBB_tbb_environment_H
#include "third_party/libcxx/cstdlib"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/cerrno"
#include "third_party/libcxx/cctype"
namespace tbb {
namespace detail {
namespace r1 {
#if __TBB_WIN8UI_SUPPORT
static inline bool GetBoolEnvironmentVariable( const char * ) {
return false;
}
static inline long GetIntegralEnvironmentVariable( const char * ) {
return -1;
}
#else /* __TBB_WIN8UI_SUPPORT */
static inline bool GetBoolEnvironmentVariable( const char * name ) {
if ( const char* s = std::getenv(name) ) {
// The result is defined as true only if the environment variable contains
// no characters except one '1' character and an arbitrary number of spaces
// (including the absence of spaces).
size_t index = std::strspn(s, " ");
if (s[index] != '1') return false;
index++;
// Accessing the character after the increment is safe, since getenv() returns a
// null-terminated string: even if the '1' found above is the last character of
// the string, the incremented index refers to the terminating '\0'.
index += std::strspn(&s[index], " ");
return !s[index];
}
return false;
}
static inline long GetIntegralEnvironmentVariable( const char * name ) {
if ( const char* s = std::getenv(name) ) {
char* end = nullptr;
errno = 0;
long value = std::strtol(s, &end, 10);
// The value is out of range, negative, or the string is not convertible
if ( errno == ERANGE || value < 0 || end==s ) {
return -1;
}
for ( ; *end != '\0'; end++ ) {
if ( !std::isspace(*end) ) {
return -1;
}
}
return value;
}
return -1;
}
#endif /* __TBB_WIN8UI_SUPPORT */
} // namespace r1
} // namespace detail
} // namespace tbb
#endif // __TBB_tbb_environment_H
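A minimal sketch of the parsing rules implemented above (not part of the vendored file; the TBB_DEMO_* variable names are hypothetical and POSIX setenv() is assumed).
#include <cassert>
#include <cstdlib>
#include "third_party/tbb/environment.hh"
void environment_parsing_examples() {
    using tbb::detail::r1::GetBoolEnvironmentVariable;
    using tbb::detail::r1::GetIntegralEnvironmentVariable;
    setenv("TBB_DEMO_FLAG", " 1 ", 1);      // a single '1' surrounded by spaces -> true
    assert(GetBoolEnvironmentVariable("TBB_DEMO_FLAG"));
    setenv("TBB_DEMO_FLAG", "10", 1);       // anything but exactly one '1' -> false
    assert(!GetBoolEnvironmentVariable("TBB_DEMO_FLAG"));
    setenv("TBB_DEMO_COUNT", "10", 1);      // non-negative decimal -> its value
    assert(GetIntegralEnvironmentVariable("TBB_DEMO_COUNT") == 10);
    setenv("TBB_DEMO_COUNT", "10abc", 1);   // trailing garbage -> -1
    assert(GetIntegralEnvironmentVariable("TBB_DEMO_COUNT") == -1);
}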

167
third_party/tbb/exception.cc vendored Normal file
View file

@ -0,0 +1,167 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/stdexcept" // std::runtime_error
#include "third_party/libcxx/new"
#include "third_party/libcxx/stdexcept"
#define __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN \
(__GLIBCXX__ && __TBB_GLIBCXX_VERSION>=40700 && __TBB_GLIBCXX_VERSION<60000 && TBB_USE_EXCEPTIONS)
#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN
// GCC ABI declarations necessary for a workaround
// MISSING #include <cxxabi.h>
#endif
namespace tbb {
namespace detail {
namespace r1 {
const char* bad_last_alloc::what() const noexcept(true) { return "bad allocation in previous or concurrent attempt"; }
const char* user_abort::what() const noexcept(true) { return "User-initiated abort has terminated this operation"; }
const char* missing_wait::what() const noexcept(true) { return "wait() was not called on the structured_task_group"; }
#if TBB_USE_EXCEPTIONS
template <typename F>
/*[[noreturn]]*/ void do_throw_noexcept(F throw_func) noexcept {
throw_func();
}
/*[[noreturn]]*/ void do_throw_noexcept(void (*throw_func)()) noexcept {
throw_func();
#if __GNUC__ == 7
// In release, GCC 7 loses noexcept attribute during tail call optimization.
// The following statement prevents tail call optimization.
volatile bool reach_this_point = true;
suppress_unused_warning(reach_this_point);
#endif
}
bool terminate_on_exception(); // defined in global_control.cpp and ipc_server.cpp
template <typename F>
/*[[noreturn]]*/ void do_throw(F throw_func) {
if (terminate_on_exception()) {
do_throw_noexcept(throw_func);
}
throw_func();
}
#define DO_THROW(exc, init_args) do_throw( []{ throw exc init_args; } );
#else /* !TBB_USE_EXCEPTIONS */
#define PRINT_ERROR_AND_ABORT(exc_name, msg) \
std::fprintf (stderr, "Exception %s with message %s would have been thrown, " \
"if exception handling had not been disabled. Aborting.\n", exc_name, msg); \
std::fflush(stderr); \
std::abort();
#define DO_THROW(exc, init_args) PRINT_ERROR_AND_ABORT(#exc, #init_args)
#endif /* !TBB_USE_EXCEPTIONS */
void throw_exception ( exception_id eid ) {
switch ( eid ) {
case exception_id::bad_alloc: DO_THROW(std::bad_alloc, ()); break;
case exception_id::bad_last_alloc: DO_THROW(bad_last_alloc, ()); break;
case exception_id::user_abort: DO_THROW( user_abort, () ); break;
case exception_id::nonpositive_step: DO_THROW(std::invalid_argument, ("Step must be positive") ); break;
case exception_id::out_of_range: DO_THROW(std::out_of_range, ("Index out of requested size range")); break;
case exception_id::reservation_length_error: DO_THROW(std::length_error, ("Attempt to exceed implementation defined length limits")); break;
case exception_id::missing_wait: DO_THROW(missing_wait, ()); break;
case exception_id::invalid_load_factor: DO_THROW(std::out_of_range, ("Invalid hash load factor")); break;
case exception_id::invalid_key: DO_THROW(std::out_of_range, ("invalid key")); break;
case exception_id::bad_tagged_msg_cast: DO_THROW(std::runtime_error, ("Illegal tagged_msg cast")); break;
case exception_id::unsafe_wait: DO_THROW(unsafe_wait, ("Unsafe to wait further")); break;
default: __TBB_ASSERT ( false, "Unknown exception ID" );
}
__TBB_ASSERT(false, "Unreachable code");
}
/* The "what" should be fairly short, not more than about 128 characters.
Because we control all the call sites to handle_perror, it is pointless
to bullet-proof it for very long strings.
Design note: ADR put this routine off to the side in tbb_misc.cpp instead of
Task.cpp because the throw generates a pathetic lot of code, and ADR wanted
this large chunk of code to be placed on a cold page. */
void handle_perror( int error_code, const char* what ) {
const int BUF_SIZE = 255;
char buf[BUF_SIZE + 1] = { 0 };
std::strncat(buf, what, BUF_SIZE);
std::size_t buf_len = std::strlen(buf);
if (error_code) {
std::strncat(buf, ": ", BUF_SIZE - buf_len);
buf_len = std::strlen(buf);
std::strncat(buf, std::strerror(error_code), BUF_SIZE - buf_len);
buf_len = std::strlen(buf);
}
__TBB_ASSERT(buf_len <= BUF_SIZE && buf[buf_len] == 0, nullptr);
#if TBB_USE_EXCEPTIONS
do_throw([&buf] { throw std::runtime_error(buf); });
#else
PRINT_ERROR_AND_ABORT( "runtime_error", buf);
#endif /* !TBB_USE_EXCEPTIONS */
}
#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN
// Runtime detection and workaround for the GCC bug 62258.
// The problem is that std::rethrow_exception() does not increment a counter
// of active exceptions, causing std::uncaught_exception() to return a wrong value.
// The code is created after, and roughly reflects, the workaround
// at https://gcc.gnu.org/bugzilla/attachment.cgi?id=34683
void fix_broken_rethrow() {
struct gcc_eh_data {
void * caughtExceptions;
unsigned int uncaughtExceptions;
};
gcc_eh_data* eh_data = punned_cast<gcc_eh_data*>( abi::__cxa_get_globals() );
++eh_data->uncaughtExceptions;
}
bool gcc_rethrow_exception_broken() {
bool is_broken;
__TBB_ASSERT( !std::uncaught_exception(),
"gcc_rethrow_exception_broken() must not be called when an exception is active" );
try {
// Throw, catch, and rethrow an exception
try {
throw __TBB_GLIBCXX_VERSION;
} catch(...) {
std::rethrow_exception( std::current_exception() );
}
} catch(...) {
// Check the bug presence
is_broken = std::uncaught_exception();
}
if( is_broken ) fix_broken_rethrow();
__TBB_ASSERT( !std::uncaught_exception(), nullptr);
return is_broken;
}
#else
void fix_broken_rethrow() {}
bool gcc_rethrow_exception_broken() { return false; }
#endif /* __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN */
} // namespace r1
} // namespace detail
} // namespace tbb
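A minimal sketch of how a caller uses handle_perror above (not part of the vendored file; report_tls_failure and the "pthread_key_create" context string are hypothetical, and it assumes linking against the TBB runtime). With TBB_USE_EXCEPTIONS it throws std::runtime_error; with exceptions disabled it prints the message and aborts instead.
#include <cstdio>
#include <stdexcept>
namespace tbb { namespace detail { namespace r1 {
void handle_perror(int error_code, const char* what); // defined above; forward-declared for the sketch
}}}
void report_tls_failure(int status) {
    try {
        // Produces a message of the form "pthread_key_create: <strerror text>".
        tbb::detail::r1::handle_perror(status, "pthread_key_create");
    } catch (const std::runtime_error& e) {
        std::fprintf(stderr, "%s\n", e.what());
    }
}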

3377
third_party/tbb/flow_graph.hh vendored Normal file

File diff suppressed because it is too large

View file

@ -0,0 +1,52 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_flow_graph_abstractions_H
#define __TBB_flow_graph_abstractions_H
namespace tbb {
namespace detail {
namespace d1 {
//! Pure virtual template classes that define interfaces for async communication
class graph_proxy {
public:
//! Inform a graph that messages may come from outside, to prevent premature graph completion
virtual void reserve_wait() = 0;
//! Inform a graph that a previous call to reserve_wait is no longer in effect
virtual void release_wait() = 0;
virtual ~graph_proxy() {}
};
template <typename Input>
class receiver_gateway : public graph_proxy {
public:
//! Type of the input data for the flow graph.
typedef Input input_type;
//! Submit a signal from an asynchronous activity to the flow graph.
virtual bool try_put(const input_type&) = 0;
};
} // d1
} // detail
} // tbb
#endif
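A minimal sketch of implementing the gateway interface above (not part of the vendored file; the include path is inferred from the header guard and stdout_gateway is hypothetical). An asynchronous producer would call reserve_wait() before starting, try_put() per result, and release_wait() when done.
#include <cstdio>
#include "third_party/tbb/flow_graph_abstractions.hh" // assumed path for this header
class stdout_gateway : public tbb::detail::d1::receiver_gateway<int> {
public:
    void reserve_wait() override { ++my_pending; }  // async work announced
    void release_wait() override { --my_pending; }  // async work finished
    bool try_put(const int& value) override {
        std::printf("received %d\n", value);
        return true;
    }
private:
    int my_pending = 0;
};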

281
third_party/tbb/global_control.cc vendored Normal file
View file

@ -0,0 +1,281 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/tbb/global_control.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/tbb/spin_mutex.hh"
#include "third_party/tbb/governor.hh"
#include "third_party/tbb/threading_control.hh"
#include "third_party/tbb/market.hh"
#include "third_party/tbb/misc.hh"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/set"
namespace tbb {
namespace detail {
namespace r1 {
//! Comparator for a set of global_control objects
struct control_storage_comparator {
bool operator()(const d1::global_control* lhs, const d1::global_control* rhs) const;
};
class control_storage {
friend struct global_control_impl;
friend std::size_t global_control_active_value(int);
friend void global_control_lock();
friend void global_control_unlock();
friend std::size_t global_control_active_value_unsafe(d1::global_control::parameter);
protected:
std::size_t my_active_value{0};
std::set<d1::global_control*, control_storage_comparator, tbb_allocator<d1::global_control*>> my_list{};
spin_mutex my_list_mutex{};
public:
virtual std::size_t default_value() const = 0;
virtual void apply_active(std::size_t new_active) {
my_active_value = new_active;
}
virtual bool is_first_arg_preferred(std::size_t a, std::size_t b) const {
return a>b; // prefer max by default
}
virtual std::size_t active_value() {
spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call
return !my_list.empty() ? my_active_value : default_value();
}
std::size_t active_value_unsafe() {
return !my_list.empty() ? my_active_value : default_value();
}
};
class alignas(max_nfs_size) allowed_parallelism_control : public control_storage {
std::size_t default_value() const override {
return max(1U, governor::default_num_threads());
}
bool is_first_arg_preferred(std::size_t a, std::size_t b) const override {
return a<b; // prefer min allowed parallelism
}
void apply_active(std::size_t new_active) override {
control_storage::apply_active(new_active);
__TBB_ASSERT(my_active_value >= 1, nullptr);
// -1 to take external thread into account
threading_control::set_active_num_workers(my_active_value - 1);
}
std::size_t active_value() override {
spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call
if (my_list.empty()) {
return default_value();
}
// non-zero, if market is active
const std::size_t workers = threading_control::max_num_workers();
// We can't exceed market's maximal number of workers.
// +1 to take external thread into account
return workers ? min(workers + 1, my_active_value) : my_active_value;
}
public:
std::size_t active_value_if_present() const {
return !my_list.empty() ? my_active_value : 0;
}
};
class alignas(max_nfs_size) stack_size_control : public control_storage {
std::size_t default_value() const override {
#if _WIN32_WINNT >= 0x0602 /* _WIN32_WINNT_WIN8 */
static auto ThreadStackSizeDefault = [] {
ULONG_PTR hi, lo;
GetCurrentThreadStackLimits(&lo, &hi);
return hi - lo;
}();
return ThreadStackSizeDefault;
#else
return ThreadStackSize;
#endif
}
void apply_active(std::size_t new_active) override {
control_storage::apply_active(new_active);
#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
__TBB_ASSERT( false, "For Windows 8 Store* apps we must not set stack size" );
#endif
}
};
class alignas(max_nfs_size) terminate_on_exception_control : public control_storage {
std::size_t default_value() const override {
return 0;
}
};
class alignas(max_nfs_size) lifetime_control : public control_storage {
bool is_first_arg_preferred(std::size_t, std::size_t) const override {
return false; // not interested
}
std::size_t default_value() const override {
return 0;
}
void apply_active(std::size_t new_active) override {
if (new_active == 1) {
// reserve the market reference
threading_control::register_lifetime_control();
} else if (new_active == 0) { // new_active == 0
threading_control::unregister_lifetime_control(/*blocking_terminate*/ false);
}
control_storage::apply_active(new_active);
}
public:
bool is_empty() {
spin_mutex::scoped_lock lock(my_list_mutex);
return my_list.empty();
}
};
static allowed_parallelism_control allowed_parallelism_ctl;
static stack_size_control stack_size_ctl;
static terminate_on_exception_control terminate_on_exception_ctl;
static lifetime_control lifetime_ctl;
static control_storage *controls[] = {&allowed_parallelism_ctl, &stack_size_ctl, &terminate_on_exception_ctl, &lifetime_ctl};
void global_control_lock() {
for (auto& ctl : controls) {
ctl->my_list_mutex.lock();
}
}
void global_control_unlock() {
int N = std::distance(std::begin(controls), std::end(controls));
for (int i = N - 1; i >= 0; --i) {
controls[i]->my_list_mutex.unlock();
}
}
std::size_t global_control_active_value_unsafe(d1::global_control::parameter param) {
__TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr);
return controls[param]->active_value_unsafe();
}
//! Comparator for a set of global_control objects
inline bool control_storage_comparator::operator()(const d1::global_control* lhs, const d1::global_control* rhs) const {
__TBB_ASSERT_RELEASE(lhs->my_param < d1::global_control::parameter_max , nullptr);
return lhs->my_value < rhs->my_value || (lhs->my_value == rhs->my_value && lhs < rhs);
}
bool terminate_on_exception() {
return d1::global_control::active_value(d1::global_control::terminate_on_exception) == 1;
}
struct global_control_impl {
private:
static bool erase_if_present(control_storage* const c, d1::global_control& gc) {
auto it = c->my_list.find(&gc);
if (it != c->my_list.end()) {
c->my_list.erase(it);
return true;
}
return false;
}
public:
static void create(d1::global_control& gc) {
__TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr);
control_storage* const c = controls[gc.my_param];
spin_mutex::scoped_lock lock(c->my_list_mutex);
if (c->my_list.empty() || c->is_first_arg_preferred(gc.my_value, c->my_active_value)) {
// to guarantee that apply_active() is called with current active value,
// calls it here and in internal_destroy() under my_list_mutex
c->apply_active(gc.my_value);
}
c->my_list.insert(&gc);
}
static void destroy(d1::global_control& gc) {
__TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr);
control_storage* const c = controls[gc.my_param];
// Concurrent reading and changing global parameter is possible.
spin_mutex::scoped_lock lock(c->my_list_mutex);
__TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle || !c->my_list.empty(), nullptr);
std::size_t new_active = (std::size_t)(-1), old_active = c->my_active_value;
if (!erase_if_present(c, gc)) {
__TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle , nullptr);
return;
}
if (c->my_list.empty()) {
__TBB_ASSERT(new_active == (std::size_t) - 1, nullptr);
new_active = c->default_value();
} else {
new_active = (*c->my_list.begin())->my_value;
}
if (new_active != old_active) {
c->apply_active(new_active);
}
}
static bool remove_and_check_if_empty(d1::global_control& gc) {
__TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr);
control_storage* const c = controls[gc.my_param];
spin_mutex::scoped_lock lock(c->my_list_mutex);
__TBB_ASSERT(!c->my_list.empty(), nullptr);
erase_if_present(c, gc);
return c->my_list.empty();
}
#if TBB_USE_ASSERT
static bool is_present(d1::global_control& gc) {
__TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr);
control_storage* const c = controls[gc.my_param];
spin_mutex::scoped_lock lock(c->my_list_mutex);
auto it = c->my_list.find(&gc);
if (it != c->my_list.end()) {
return true;
}
return false;
}
#endif // TBB_USE_ASSERT
};
void __TBB_EXPORTED_FUNC create(d1::global_control& gc) {
global_control_impl::create(gc);
}
void __TBB_EXPORTED_FUNC destroy(d1::global_control& gc) {
global_control_impl::destroy(gc);
}
bool remove_and_check_if_empty(d1::global_control& gc) {
return global_control_impl::remove_and_check_if_empty(gc);
}
#if TBB_USE_ASSERT
bool is_present(d1::global_control& gc) {
return global_control_impl::is_present(gc);
}
#endif // TBB_USE_ASSERT
std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int param) {
__TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr);
return controls[param]->active_value();
}
} // namespace r1
} // namespace detail
} // namespace tbb
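A minimal sketch, through the public wrapper, of the resolution rule implemented above for max_allowed_parallelism, where the smallest requested limit wins (not part of the vendored file; it assumes the TBB runtime is linked).
#include <cassert>
#include "third_party/tbb/global_control.hh"
void nested_parallelism_limits() {
    tbb::global_control outer(tbb::global_control::max_allowed_parallelism, 8);
    {
        tbb::global_control inner(tbb::global_control::max_allowed_parallelism, 2);
        // allowed_parallelism_control::is_first_arg_preferred() returns a < b,
        // so the stricter (smaller) limit is active while both controls exist.
        assert(tbb::global_control::active_value(
                   tbb::global_control::max_allowed_parallelism) == 2);
    }
    // Once the inner control is destroyed, the remaining limit (8) takes effect,
    // possibly capped by the number of workers the market can actually supply.
}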

201
third_party/tbb/global_control.hh vendored Normal file
View file

@ -0,0 +1,201 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_global_control_H
#define __TBB_global_control_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_attach.hh"
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_namespace_injection.hh"
#include "third_party/tbb/detail/_template_helpers.hh"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/new" // std::nothrow_t
namespace tbb {
namespace detail {
namespace d1 {
class global_control;
class task_scheduler_handle;
}
namespace r1 {
TBB_EXPORT void __TBB_EXPORTED_FUNC create(d1::global_control&);
TBB_EXPORT void __TBB_EXPORTED_FUNC destroy(d1::global_control&);
TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int);
struct global_control_impl;
struct control_storage_comparator;
void release_impl(d1::task_scheduler_handle& handle);
bool finalize_impl(d1::task_scheduler_handle& handle);
TBB_EXPORT void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle&);
TBB_EXPORT bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle&, std::intptr_t mode);
}
namespace d1 {
class global_control {
public:
enum parameter {
max_allowed_parallelism,
thread_stack_size,
terminate_on_exception,
scheduler_handle, // not a public parameter
parameter_max // insert new parameters above this point
};
global_control(parameter p, std::size_t value) :
my_value(value), my_reserved(), my_param(p) {
suppress_unused_warning(my_reserved);
__TBB_ASSERT(my_param < parameter_max, "Invalid parameter");
#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
// For Windows 8 Store* apps it's impossible to set stack size
if (p==thread_stack_size)
return;
#elif __TBB_x86_64 && (_WIN32 || _WIN64)
if (p==thread_stack_size)
__TBB_ASSERT_RELEASE((unsigned)value == value, "Stack size is limited to unsigned int range");
#endif
if (my_param==max_allowed_parallelism)
__TBB_ASSERT_RELEASE(my_value>0, "max_allowed_parallelism cannot be 0.");
r1::create(*this);
}
~global_control() {
__TBB_ASSERT(my_param < parameter_max, "Invalid parameter");
#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
// For Windows 8 Store* apps it's impossible to set stack size
if (my_param==thread_stack_size)
return;
#endif
r1::destroy(*this);
}
static std::size_t active_value(parameter p) {
__TBB_ASSERT(p < parameter_max, "Invalid parameter");
return r1::global_control_active_value((int)p);
}
private:
std::size_t my_value;
std::intptr_t my_reserved; // TODO: substitution of global_control* not to break backward compatibility
parameter my_param;
friend struct r1::global_control_impl;
friend struct r1::control_storage_comparator;
};
//! Finalization options.
//! Outside of the class to avoid extensive friendship.
static constexpr std::intptr_t release_nothrowing = 0;
static constexpr std::intptr_t finalize_nothrowing = 1;
static constexpr std::intptr_t finalize_throwing = 2;
//! User side wrapper for a task scheduler lifetime control object
class task_scheduler_handle {
public:
//! Creates an empty task_scheduler_handle
task_scheduler_handle() = default;
//! Creates an attached instance of task_scheduler_handle
task_scheduler_handle(attach) {
r1::get(*this);
}
//! Release a reference if any
~task_scheduler_handle() {
release();
}
//! No copy
task_scheduler_handle(const task_scheduler_handle& other) = delete;
task_scheduler_handle& operator=(const task_scheduler_handle& other) = delete;
//! Move only
task_scheduler_handle(task_scheduler_handle&& other) noexcept {
std::swap(m_ctl, other.m_ctl);
}
task_scheduler_handle& operator=(task_scheduler_handle&& other) noexcept {
std::swap(m_ctl, other.m_ctl);
return *this;
};
//! Checks if the task_scheduler_handle is empty
explicit operator bool() const noexcept {
return m_ctl != nullptr;
}
//! Release the reference and deactivate handle
void release() {
if (m_ctl != nullptr) {
r1::finalize(*this, release_nothrowing);
m_ctl = nullptr;
}
}
private:
friend void r1::release_impl(task_scheduler_handle& handle);
friend bool r1::finalize_impl(task_scheduler_handle& handle);
friend void __TBB_EXPORTED_FUNC r1::get(task_scheduler_handle&);
friend void finalize(task_scheduler_handle&);
friend bool finalize(task_scheduler_handle&, const std::nothrow_t&) noexcept;
global_control* m_ctl{nullptr};
};
#if TBB_USE_EXCEPTIONS
//! Waits for worker threads termination. Throws exception on error.
inline void finalize(task_scheduler_handle& handle) {
try_call([&] {
if (handle.m_ctl != nullptr) {
bool finalized = r1::finalize(handle, finalize_throwing);
__TBB_ASSERT_EX(finalized, "r1::finalize did not respect finalize_throwing ?");
}
}).on_completion([&] {
__TBB_ASSERT(!handle, "The handle should be empty after finalize");
});
}
#endif
//! Waits for worker threads termination. Returns false on error.
inline bool finalize(task_scheduler_handle& handle, const std::nothrow_t&) noexcept {
bool finalized = true;
if (handle.m_ctl != nullptr) {
finalized = r1::finalize(handle, finalize_nothrowing);
}
__TBB_ASSERT(!handle, "The handle should be empty after finalize");
return finalized;
}
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::global_control;
using detail::d1::attach;
using detail::d1::finalize;
using detail::d1::task_scheduler_handle;
using detail::r1::unsafe_wait;
} // namespace v1
} // namespace tbb
#endif // __TBB_global_control_H
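A minimal sketch of the lifetime-control API declared above (not part of the vendored file; run_and_wait_for_workers is hypothetical and it assumes the TBB runtime is linked).
#include <cstdio>
#include <new>
#include "third_party/tbb/global_control.hh"
int run_and_wait_for_workers() {
    // Attach a handle so that worker-thread shutdown can be awaited explicitly.
    tbb::task_scheduler_handle handle(tbb::attach{});
    // ... submit parallel work here ...
    // Non-throwing blocking finalization; false means the workers could not be joined.
    if (!tbb::finalize(handle, std::nothrow)) {
        std::fputs("TBB worker threads were not joined\n", stderr);
        return 1;
    }
    return 0;
}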

580
third_party/tbb/governor.cc vendored Normal file
View file

@ -0,0 +1,580 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/governor.hh"
#include "third_party/tbb/threading_control.hh"
#include "third_party/tbb/main.hh"
#include "third_party/tbb/thread_data.hh"
#include "third_party/tbb/market.hh"
#include "third_party/tbb/arena.hh"
#include "third_party/tbb/dynamic_link.hh"
#include "third_party/tbb/concurrent_monitor.hh"
#include "third_party/tbb/thread_dispatcher.hh"
#include "third_party/tbb/task_group.hh"
#include "third_party/tbb/global_control.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/tbb/info.hh"
#include "third_party/tbb/task_dispatcher.hh"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/cstdlib"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/algorithm"
namespace tbb {
namespace detail {
namespace r1 {
void clear_address_waiter_table();
//! global_control.cpp contains definition
bool remove_and_check_if_empty(d1::global_control& gc);
bool is_present(d1::global_control& gc);
namespace rml {
tbb_server* make_private_server( tbb_client& client );
} // namespace rml
namespace system_topology {
void destroy();
}
//------------------------------------------------------------------------
// governor
//------------------------------------------------------------------------
void governor::acquire_resources () {
#if __TBB_USE_POSIX
int status = theTLS.create(auto_terminate);
#else
int status = theTLS.create();
#endif
if( status )
handle_perror(status, "TBB failed to initialize task scheduler TLS\n");
detect_cpu_features(cpu_features);
is_rethrow_broken = gcc_rethrow_exception_broken();
}
void governor::release_resources () {
theRMLServerFactory.close();
destroy_process_mask();
__TBB_ASSERT(!(__TBB_InitOnce::initialization_done() && theTLS.get()), "TBB is unloaded while thread data still alive?");
int status = theTLS.destroy();
if( status )
runtime_warning("failed to destroy task scheduler TLS: %s", std::strerror(status));
clear_address_waiter_table();
system_topology::destroy();
dynamic_unlink_all();
}
rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) {
rml::tbb_server* server = nullptr;
if( !UsePrivateRML ) {
::rml::factory::status_type status = theRMLServerFactory.make_server( server, client );
if( status != ::rml::factory::st_success ) {
UsePrivateRML = true;
runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status );
}
}
if ( !server ) {
__TBB_ASSERT( UsePrivateRML, nullptr);
server = rml::make_private_server( client );
}
__TBB_ASSERT( server, "Failed to create RML server" );
return server;
}
void governor::one_time_init() {
if ( !__TBB_InitOnce::initialization_done() ) {
DoOneTimeInitialization();
}
}
bool governor::does_client_join_workers(const rml::tbb_client &client) {
return ((const thread_dispatcher&)client).must_join_workers();
}
/*
There is no portable way to get stack base address in Posix, however the modern
Linux versions provide pthread_attr_np API that can be used to obtain thread's
stack size and base address. Unfortunately even this function does not provide
enough information for the main thread on IA-64 architecture (RSE spill area
and memory stack are allocated as two separate discontinuous chunks of memory),
and there is no portable way to discern the main and the secondary threads.
Thus for macOS* and IA-64 architecture for Linux* OS we use the TBB worker stack size for
all threads and use the current stack top as the stack base. This simplified
approach is based on the following assumptions:
1) If the default stack size is insufficient for the user app needs, the
required amount will be explicitly specified by the user at the point of the
TBB scheduler initialization (as an argument to tbb::task_scheduler_init
constructor).
2) When an external thread initializes the scheduler, it has enough space on its
stack. Here "enough" means "at least as much as worker threads have".
3) If the user app strives to conserve memory by cutting the stack size, it
should do this for the TBB workers too (as in #1).
*/
static std::uintptr_t get_stack_base(std::size_t stack_size) {
// Stacks are growing top-down. Highest address is called "stack base",
// and the lowest is "stack limit".
#if __TBB_USE_WINAPI
suppress_unused_warning(stack_size);
NT_TIB* pteb = (NT_TIB*)NtCurrentTeb();
__TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB");
return reinterpret_cast<std::uintptr_t>(pteb->StackBase);
#else
// There is no portable way to get stack base address in Posix, so we use
// non-portable method (on all modern Linux) or the simplified approach
// based on the common sense assumptions. The most important assumption
// is that the main thread's stack size is not less than that of other threads.
// Points to the lowest addressable byte of a stack.
void* stack_limit = nullptr;
#if __linux__ && !__bg__
size_t np_stack_size = 0;
pthread_attr_t np_attr_stack;
if (0 == pthread_getattr_np(pthread_self(), &np_attr_stack)) {
if (0 == pthread_attr_getstack(&np_attr_stack, &stack_limit, &np_stack_size)) {
__TBB_ASSERT( &stack_limit > stack_limit, "stack size must be positive" );
}
pthread_attr_destroy(&np_attr_stack);
}
#endif /* __linux__ */
std::uintptr_t stack_base{};
if (stack_limit) {
stack_base = reinterpret_cast<std::uintptr_t>(stack_limit) + stack_size;
} else {
// Use an anchor as a base stack address.
int anchor{};
stack_base = reinterpret_cast<std::uintptr_t>(&anchor);
}
return stack_base;
#endif /* __TBB_USE_WINAPI */
}
#if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
static void register_external_thread_destructor() {
struct thread_destructor {
~thread_destructor() {
governor::terminate_external_thread();
}
};
// ~thread_destructor() will be called during the calling thread's termination
static thread_local thread_destructor thr_destructor;
}
#endif // (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
void governor::init_external_thread() {
one_time_init();
// Create new scheduler instance with arena
int num_slots = default_num_threads();
// TODO_REVAMP: support an external thread without an implicit arena
int num_reserved_slots = 1;
unsigned arena_priority_level = 1; // corresponds to tbb::task_arena::priority::normal
std::size_t stack_size = 0;
threading_control* thr_control = threading_control::register_public_reference();
arena& a = arena::create(thr_control, num_slots, num_reserved_slots, arena_priority_level);
// External thread always occupies the first slot
thread_data& td = *new(cache_aligned_allocate(sizeof(thread_data))) thread_data(0, false);
td.attach_arena(a, /*slot index*/ 0);
__TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr);
stack_size = a.my_threading_control->worker_stack_size();
std::uintptr_t stack_base = get_stack_base(stack_size);
task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher();
td.enter_task_dispatcher(task_disp, calculate_stealing_threshold(stack_base, stack_size));
td.my_arena_slot->occupy();
thr_control->register_thread(td);
set_thread_data(td);
#if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
// The external thread destructor is called from dllMain but it is not available with a static build.
// Therefore, we need to register the current thread to call the destructor during thread termination.
register_external_thread_destructor();
#endif
}
void governor::auto_terminate(void* tls) {
__TBB_ASSERT(get_thread_data_if_initialized() == nullptr ||
get_thread_data_if_initialized() == tls, nullptr);
if (tls) {
thread_data* td = static_cast<thread_data*>(tls);
auto clear_tls = [td] {
td->~thread_data();
cache_aligned_deallocate(td);
clear_thread_data();
};
// Only external thread can be inside an arena during termination.
if (td->my_arena_slot) {
arena* a = td->my_arena;
threading_control* thr_control = a->my_threading_control;
// If the TLS slot is already cleared by OS or underlying concurrency
// runtime, restore its value to properly clean up arena
if (!is_thread_data_set(td)) {
set_thread_data(*td);
}
a->my_observers.notify_exit_observers(td->my_last_observer, td->my_is_worker);
td->leave_task_dispatcher();
td->my_arena_slot->release();
// Release an arena
a->on_thread_leaving(arena::ref_external);
thr_control->unregister_thread(*td);
// The tls should be cleared before market::release because
// market can destroy the tls key if we keep the last reference
clear_tls();
// If there was an associated arena, it added a public market reference
thr_control->unregister_public_reference(/* blocking terminate =*/ false);
} else {
clear_tls();
}
}
__TBB_ASSERT(get_thread_data_if_initialized() == nullptr, nullptr);
}
void governor::initialize_rml_factory () {
::rml::factory::status_type res = theRMLServerFactory.open();
UsePrivateRML = res != ::rml::factory::st_success;
}
void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle& handle) {
handle.m_ctl = new(allocate_memory(sizeof(global_control))) global_control(global_control::scheduler_handle, 1);
}
void release_impl(d1::task_scheduler_handle& handle) {
if (handle.m_ctl != nullptr) {
handle.m_ctl->~global_control();
deallocate_memory(handle.m_ctl);
handle.m_ctl = nullptr;
}
}
bool finalize_impl(d1::task_scheduler_handle& handle) {
__TBB_ASSERT_RELEASE(handle, "trying to finalize with null handle");
__TBB_ASSERT(is_present(*handle.m_ctl), "finalize or release was already called on this object");
bool ok = true; // ok if threading_control does not exist yet
if (threading_control::is_present()) {
thread_data* td = governor::get_thread_data_if_initialized();
if (td) {
task_dispatcher* task_disp = td->my_task_dispatcher;
__TBB_ASSERT(task_disp, nullptr);
if (task_disp->m_properties.outermost && !td->my_is_worker) { // is not inside a parallel region
governor::auto_terminate(td);
}
}
if (remove_and_check_if_empty(*handle.m_ctl)) {
ok = threading_control::unregister_lifetime_control(/*blocking_terminate*/ true);
} else {
ok = false;
}
}
return ok;
}
bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle& handle, std::intptr_t mode) {
if (mode == d1::release_nothrowing) {
release_impl(handle);
return true;
} else {
bool ok = finalize_impl(handle);
// TODO: it is unsafe when finalize is called concurrently and further library unload
release_impl(handle);
if (mode == d1::finalize_throwing && !ok) {
throw_exception(exception_id::unsafe_wait);
}
return ok;
}
}
#if __TBB_ARENA_BINDING
#if __TBB_WEAK_SYMBOLS_PRESENT
#pragma weak __TBB_internal_initialize_system_topology
#pragma weak __TBB_internal_destroy_system_topology
#pragma weak __TBB_internal_allocate_binding_handler
#pragma weak __TBB_internal_deallocate_binding_handler
#pragma weak __TBB_internal_apply_affinity
#pragma weak __TBB_internal_restore_affinity
#pragma weak __TBB_internal_get_default_concurrency
extern "C" {
void __TBB_internal_initialize_system_topology(
size_t groups_num,
int& numa_nodes_count, int*& numa_indexes_list,
int& core_types_count, int*& core_types_indexes_list
);
void __TBB_internal_destroy_system_topology( );
//TODO: consider renaming to `create_binding_handler` and `destroy_binding_handler`
binding_handler* __TBB_internal_allocate_binding_handler( int slot_num, int numa_id, int core_type_id, int max_threads_per_core );
void __TBB_internal_deallocate_binding_handler( binding_handler* handler_ptr );
void __TBB_internal_apply_affinity( binding_handler* handler_ptr, int slot_num );
void __TBB_internal_restore_affinity( binding_handler* handler_ptr, int slot_num );
int __TBB_internal_get_default_concurrency( int numa_id, int core_type_id, int max_threads_per_core );
}
#endif /* __TBB_WEAK_SYMBOLS_PRESENT */
// Stubs that will be used if TBBbind library is unavailable.
static void dummy_destroy_system_topology ( ) { }
static binding_handler* dummy_allocate_binding_handler ( int, int, int, int ) { return nullptr; }
static void dummy_deallocate_binding_handler ( binding_handler* ) { }
static void dummy_apply_affinity ( binding_handler*, int ) { }
static void dummy_restore_affinity ( binding_handler*, int ) { }
static int dummy_get_default_concurrency( int, int, int ) { return governor::default_num_threads(); }
// Handlers for communication with TBBbind
static void (*initialize_system_topology_ptr)(
size_t groups_num,
int& numa_nodes_count, int*& numa_indexes_list,
int& core_types_count, int*& core_types_indexes_list
) = nullptr;
static void (*destroy_system_topology_ptr)( ) = dummy_destroy_system_topology;
static binding_handler* (*allocate_binding_handler_ptr)( int slot_num, int numa_id, int core_type_id, int max_threads_per_core )
= dummy_allocate_binding_handler;
static void (*deallocate_binding_handler_ptr)( binding_handler* handler_ptr )
= dummy_deallocate_binding_handler;
static void (*apply_affinity_ptr)( binding_handler* handler_ptr, int slot_num )
= dummy_apply_affinity;
static void (*restore_affinity_ptr)( binding_handler* handler_ptr, int slot_num )
= dummy_restore_affinity;
int (*get_default_concurrency_ptr)( int numa_id, int core_type_id, int max_threads_per_core )
= dummy_get_default_concurrency;
#if _WIN32 || _WIN64 || __unix__
// Table describing how to link the handlers.
static const dynamic_link_descriptor TbbBindLinkTable[] = {
DLD(__TBB_internal_initialize_system_topology, initialize_system_topology_ptr),
DLD(__TBB_internal_destroy_system_topology, destroy_system_topology_ptr),
DLD(__TBB_internal_allocate_binding_handler, allocate_binding_handler_ptr),
DLD(__TBB_internal_deallocate_binding_handler, deallocate_binding_handler_ptr),
DLD(__TBB_internal_apply_affinity, apply_affinity_ptr),
DLD(__TBB_internal_restore_affinity, restore_affinity_ptr),
DLD(__TBB_internal_get_default_concurrency, get_default_concurrency_ptr)
};
static const unsigned LinkTableSize = sizeof(TbbBindLinkTable) / sizeof(dynamic_link_descriptor);
#if TBB_USE_DEBUG
#define DEBUG_SUFFIX "_debug"
#else
#define DEBUG_SUFFIX
#endif /* TBB_USE_DEBUG */
#if _WIN32 || _WIN64
#define LIBRARY_EXTENSION ".dll"
#define LIBRARY_PREFIX
#elif __unix__
#define LIBRARY_EXTENSION __TBB_STRING(.so.3)
#define LIBRARY_PREFIX "lib"
#endif /* __unix__ */
#define TBBBIND_NAME LIBRARY_PREFIX "tbbbind" DEBUG_SUFFIX LIBRARY_EXTENSION
#define TBBBIND_2_0_NAME LIBRARY_PREFIX "tbbbind_2_0" DEBUG_SUFFIX LIBRARY_EXTENSION
#define TBBBIND_2_5_NAME LIBRARY_PREFIX "tbbbind_2_5" DEBUG_SUFFIX LIBRARY_EXTENSION
#endif /* _WIN32 || _WIN64 || __unix__ */
// Representation of system hardware topology information on the TBB side.
// System topology may be initialized by third-party component (e.g. hwloc)
// or just filled in with default stubs.
namespace system_topology {
constexpr int automatic = -1;
static std::atomic<do_once_state> initialization_state;
namespace {
int numa_nodes_count = 0;
int* numa_nodes_indexes = nullptr;
int core_types_count = 0;
int* core_types_indexes = nullptr;
const char* load_tbbbind_shared_object() {
#if _WIN32 || _WIN64 || __unix__
#if _WIN32 && !_WIN64
// For 32-bit Windows applications, process affinity masks can only support up to 32 logical CPUs.
SYSTEM_INFO si;
GetNativeSystemInfo(&si);
if (si.dwNumberOfProcessors > 32) return nullptr;
#endif /* _WIN32 && !_WIN64 */
for (const auto& tbbbind_version : {TBBBIND_2_5_NAME, TBBBIND_2_0_NAME, TBBBIND_NAME}) {
if (dynamic_link(tbbbind_version, TbbBindLinkTable, LinkTableSize, nullptr, DYNAMIC_LINK_LOCAL_BINDING)) {
return tbbbind_version;
}
}
#endif /* _WIN32 || _WIN64 || __unix__ */
return nullptr;
}
int processor_groups_num() {
#if _WIN32
return NumberOfProcessorGroups();
#else
// Stub to improve code readability by reducing the number of compile-time conditions
return 1;
#endif
}
} // internal namespace
// Tries to load the TBBbind library API; on success, gets NUMA topology information from it,
// otherwise fills the NUMA topology with stubs.
void initialization_impl() {
governor::one_time_init();
if (const char* tbbbind_name = load_tbbbind_shared_object()) {
initialize_system_topology_ptr(
processor_groups_num(),
numa_nodes_count, numa_nodes_indexes,
core_types_count, core_types_indexes
);
PrintExtraVersionInfo("TBBBIND", tbbbind_name);
return;
}
static int dummy_index = automatic;
numa_nodes_count = 1;
numa_nodes_indexes = &dummy_index;
core_types_count = 1;
core_types_indexes = &dummy_index;
PrintExtraVersionInfo("TBBBIND", "UNAVAILABLE");
}
void initialize() {
atomic_do_once(initialization_impl, initialization_state);
}
void destroy() {
destroy_system_topology_ptr();
}
} // namespace system_topology
binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core) {
system_topology::initialize();
return allocate_binding_handler_ptr(slot_num, numa_id, core_type_id, max_threads_per_core);
}
void destroy_binding_handler(binding_handler* handler_ptr) {
__TBB_ASSERT(deallocate_binding_handler_ptr, "tbbbind loading was not performed");
deallocate_binding_handler_ptr(handler_ptr);
}
void apply_affinity_mask(binding_handler* handler_ptr, int slot_index) {
__TBB_ASSERT(slot_index >= 0, "Negative thread index");
__TBB_ASSERT(apply_affinity_ptr, "tbbbind loading was not performed");
apply_affinity_ptr(handler_ptr, slot_index);
}
void restore_affinity_mask(binding_handler* handler_ptr, int slot_index) {
__TBB_ASSERT(slot_index >= 0, "Negative thread index");
__TBB_ASSERT(restore_affinity_ptr, "tbbbind loading was not performed");
restore_affinity_ptr(handler_ptr, slot_index);
}
unsigned __TBB_EXPORTED_FUNC numa_node_count() {
system_topology::initialize();
return system_topology::numa_nodes_count;
}
void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array) {
system_topology::initialize();
std::memcpy(index_array, system_topology::numa_nodes_indexes, system_topology::numa_nodes_count * sizeof(int));
}
int __TBB_EXPORTED_FUNC numa_default_concurrency(int node_id) {
if (node_id >= 0) {
system_topology::initialize();
int result = get_default_concurrency_ptr(
node_id,
/*core_type*/system_topology::automatic,
/*threads_per_core*/system_topology::automatic
);
if (result > 0) return result;
}
return governor::default_num_threads();
}
unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t /*reserved*/) {
system_topology::initialize();
return system_topology::core_types_count;
}
void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t /*reserved*/) {
system_topology::initialize();
std::memcpy(index_array, system_topology::core_types_indexes, system_topology::core_types_count * sizeof(int));
}
void constraints_assertion(d1::constraints c) {
bool is_topology_initialized = system_topology::initialization_state == do_once_state::initialized;
__TBB_ASSERT_RELEASE(c.max_threads_per_core == system_topology::automatic || c.max_threads_per_core > 0,
"Wrong max_threads_per_core constraints field value.");
auto numa_nodes_begin = system_topology::numa_nodes_indexes;
auto numa_nodes_end = system_topology::numa_nodes_indexes + system_topology::numa_nodes_count;
__TBB_ASSERT_RELEASE(
c.numa_id == system_topology::automatic ||
(is_topology_initialized && std::find(numa_nodes_begin, numa_nodes_end, c.numa_id) != numa_nodes_end),
"The constraints::numa_id value is not known to the library. Use tbb::info::numa_nodes() to get the list of possible values.");
int* core_types_begin = system_topology::core_types_indexes;
int* core_types_end = system_topology::core_types_indexes + system_topology::core_types_count;
__TBB_ASSERT_RELEASE(c.core_type == system_topology::automatic ||
(is_topology_initialized && std::find(core_types_begin, core_types_end, c.core_type) != core_types_end),
"The constraints::core_type value is not known to the library. Use tbb::info::core_types() to get the list of possible values.");
}
int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t /*reserved*/) {
constraints_assertion(c);
if (c.numa_id >= 0 || c.core_type >= 0 || c.max_threads_per_core > 0) {
system_topology::initialize();
return get_default_concurrency_ptr(c.numa_id, c.core_type, c.max_threads_per_core);
}
return governor::default_num_threads();
}
int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints&, intptr_t /*reserved*/) {
return system_topology::automatic;
}
#endif /* __TBB_ARENA_BINDING */
} // namespace r1
} // namespace detail
} // namespace tbb
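The long comment in get_stack_base() above describes the non-portable pthread_getattr_np query; a standalone sketch of that query on Linux/glibc (illustration only, not part of the vendored sources):
#include <pthread.h>
#include <cstdio>
int main() {
    pthread_attr_t attr;
    void*  stack_limit = nullptr;   // lowest addressable byte of this thread's stack
    size_t stack_size  = 0;
    if (pthread_getattr_np(pthread_self(), &attr) == 0) {
        if (pthread_attr_getstack(&attr, &stack_limit, &stack_size) == 0) {
            // Stacks grow downward, so the base is the limit plus the size.
            std::printf("stack base %p, size %zu\n",
                        static_cast<void*>(static_cast<char*>(stack_limit) + stack_size),
                        stack_size);
        }
        pthread_attr_destroy(&attr);
    }
    return 0;
}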

157
third_party/tbb/governor.hh vendored Normal file
View file

@@ -0,0 +1,157 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_governor_H
#define _TBB_governor_H
#include "third_party/tbb/rml_tbb.hh"
#include "third_party/tbb/misc.hh" // for AvailableHwConcurrency
#include "third_party/tbb/tls.hh"
namespace tbb {
namespace detail {
namespace r1 {
class market;
class thread_data;
class __TBB_InitOnce;
#if __TBB_USE_ITT_NOTIFY
//! Defined in profiling.cpp
extern bool ITT_Present;
#endif
typedef std::size_t stack_size_type;
//------------------------------------------------------------------------
// Class governor
//------------------------------------------------------------------------
//! The class handles access to the single instance of market, and to TLS to keep scheduler instances.
/** It also supports automatic on-demand initialization of the TBB scheduler.
The class contains only static data members and methods.*/
class governor {
private:
friend class __TBB_InitOnce;
friend class thread_dispatcher;
friend class threading_control_impl;
// TODO: consider using thread_local (measure performance and side effects)
//! TLS for scheduler instances associated with individual threads
static basic_tls<thread_data*> theTLS;
// TODO (TBB_REVAMP_TODO): reconsider constant names
static rml::tbb_factory theRMLServerFactory;
static bool UsePrivateRML;
// Flags for runtime-specific conditions
static cpu_features_type cpu_features;
static bool is_rethrow_broken;
//! Create key for thread-local storage and initialize RML.
static void acquire_resources ();
//! Destroy the thread-local storage key and deinitialize RML.
static void release_resources ();
static rml::tbb_server* create_rml_server ( rml::tbb_client& );
public:
static unsigned default_num_threads () {
// Caches the maximal level of parallelism supported by the hardware
static unsigned num_threads = AvailableHwConcurrency();
return num_threads;
}
static std::size_t default_page_size () {
// Caches the size of OS regular memory page
static std::size_t page_size = DefaultSystemPageSize();
return page_size;
}
static void one_time_init();
//! Processes scheduler initialization request (possibly nested) in an external thread
/** If necessary, creates a new arena instance and/or local scheduler. **/
static void init_external_thread();
//! The routine to undo automatic initialization.
/** The signature is written with void* so that the routine
can be the destructor argument to pthread_key_create. */
static void auto_terminate(void* tls);
//! Obtain the thread-local instance of the thread data.
/** If the scheduler has not been initialized yet, initialization is done automatically.
Note that auto-initialized scheduler instance is destroyed only when its thread terminates. **/
static thread_data* get_thread_data() {
thread_data* td = theTLS.get();
if (td) {
return td;
}
init_external_thread();
td = theTLS.get();
__TBB_ASSERT(td, nullptr);
return td;
}
static void set_thread_data(thread_data& td) {
theTLS.set(&td);
}
static void clear_thread_data() {
theTLS.set(nullptr);
}
static thread_data* get_thread_data_if_initialized () {
return theTLS.get();
}
static bool is_thread_data_set(thread_data* td) {
return theTLS.get() == td;
}
//! Undo automatic initialization if necessary; call when a thread exits.
static void terminate_external_thread() {
auto_terminate(get_thread_data_if_initialized());
}
static void initialize_rml_factory ();
static bool does_client_join_workers (const rml::tbb_client &client);
static bool speculation_enabled() { return cpu_features.rtm_enabled; }
#if __TBB_WAITPKG_INTRINSICS_PRESENT
static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; }
#endif
static bool rethrow_exception_broken() { return is_rethrow_broken; }
static bool is_itt_present() {
#if __TBB_USE_ITT_NOTIFY
return ITT_Present;
#else
return false;
#endif
}
}; // class governor
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_governor_H */
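governor::get_thread_data() above follows a check, initialize on demand, re-read pattern over the TLS slot; a reduced sketch of the same pattern using plain C++11 thread_local and a hypothetical thread_data_stub type (illustration only):
#include <cstdio>
struct thread_data_stub { int slot; };                 // stand-in for r1::thread_data
static thread_local thread_data_stub* tls = nullptr;   // stand-in for governor::theTLS
static void init_external_thread_stub() {
    static thread_local thread_data_stub storage{0};   // lazily created per thread
    tls = &storage;
}
static thread_data_stub* get_thread_data_stub() {
    if (thread_data_stub* td = tls) return td;          // fast path: already attached
    init_external_thread_stub();                        // slow path: on-demand initialization
    return tls;
}
int main() { std::printf("slot %d\n", get_thread_data_stub()->slot); return 0; }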

126
third_party/tbb/info.hh vendored Normal file
View file

@@ -0,0 +1,126 @@
// clang-format off
/*
Copyright (c) 2019-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_info_H
#define __TBB_info_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_namespace_injection.hh"
#if __TBB_ARENA_BINDING
#include "third_party/libcxx/vector"
#include "third_party/libcxx/cstdint"
namespace tbb {
namespace detail {
namespace d1{
using numa_node_id = int;
using core_type_id = int;
// TODO: consider version approach to resolve backward compatibility potential issues.
struct constraints {
#if !__TBB_CPP20_PRESENT
constraints(numa_node_id id = -1, int maximal_concurrency = -1)
: numa_id(id)
, max_concurrency(maximal_concurrency)
{}
#endif /*!__TBB_CPP20_PRESENT*/
constraints& set_numa_id(numa_node_id id) {
numa_id = id;
return *this;
}
constraints& set_max_concurrency(int maximal_concurrency) {
max_concurrency = maximal_concurrency;
return *this;
}
constraints& set_core_type(core_type_id id) {
core_type = id;
return *this;
}
constraints& set_max_threads_per_core(int threads_number) {
max_threads_per_core = threads_number;
return *this;
}
numa_node_id numa_id = -1;
int max_concurrency = -1;
core_type_id core_type = -1;
int max_threads_per_core = -1;
};
} // namespace d1
namespace r1 {
TBB_EXPORT unsigned __TBB_EXPORTED_FUNC numa_node_count();
TBB_EXPORT void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array);
TBB_EXPORT int __TBB_EXPORTED_FUNC numa_default_concurrency(int numa_id);
// Reserved fields are required to save binary backward compatibility in case of future changes.
// They must be defined to 0 at this moment.
TBB_EXPORT unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t reserved = 0);
TBB_EXPORT void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t reserved = 0);
TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t reserved = 0);
TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints& c, intptr_t reserved = 0);
} // namespace r1
namespace d1 {
inline std::vector<numa_node_id> numa_nodes() {
std::vector<numa_node_id> node_indices(r1::numa_node_count());
r1::fill_numa_indices(node_indices.data());
return node_indices;
}
inline int default_concurrency(numa_node_id id = -1) {
return r1::numa_default_concurrency(id);
}
inline std::vector<core_type_id> core_types() {
std::vector<int> core_type_indexes(r1::core_type_count());
r1::fill_core_type_indices(core_type_indexes.data());
return core_type_indexes;
}
inline int default_concurrency(constraints c) {
if (c.max_concurrency > 0) { return c.max_concurrency; }
return r1::constraints_default_concurrency(c);
}
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::numa_node_id;
using detail::d1::core_type_id;
namespace info {
using detail::d1::numa_nodes;
using detail::d1::core_types;
using detail::d1::default_concurrency;
} // namespace info
} // namespace v1
} // namespace tbb
#endif /*__TBB_ARENA_BINDING*/
#endif /*__TBB_info_H*/
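A small usage sketch of the query wrappers declared above; it assumes a build with __TBB_ARENA_BINDING enabled and the r1 runtime entry points linked in:
#include "third_party/tbb/info.hh"
#include <cstdio>
int main() {
    for (tbb::numa_node_id id : tbb::info::numa_nodes())
        std::printf("numa node %d -> default concurrency %d\n",
                    id, tbb::info::default_concurrency(id));
    for (tbb::core_type_id ct : tbb::info::core_types())
        std::printf("core type id %d\n", ct);
    return 0;
}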

234
third_party/tbb/intrusive_list.hh vendored Normal file
View file

@@ -0,0 +1,234 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_intrusive_list_H
#define _TBB_intrusive_list_H
#include "third_party/tbb/detail/_intrusive_list_node.hh"
namespace tbb {
namespace detail {
namespace r1 {
using d1::intrusive_list_node;
//! List of elements of type T, where T is derived from intrusive_list_node
/** The class is not thread safe. **/
template <class List, class T>
class intrusive_list_base {
//! Pointer to the head node
intrusive_list_node my_head;
//! Number of list elements
std::size_t my_size;
static intrusive_list_node& node ( T& item ) { return List::node(item); }
static T& item ( intrusive_list_node* node ) { return List::item(node); }
static const T& item( const intrusive_list_node* node ) { return List::item(node); }
template <typename DereferenceType>
class iterator_impl {
static_assert(std::is_same<DereferenceType, T>::value ||
std::is_same<DereferenceType, const T>::value,
"Incorrect DereferenceType in iterator_impl");
using pointer_type = typename std::conditional<std::is_same<DereferenceType, T>::value,
intrusive_list_node*,
const intrusive_list_node*>::type;
public:
iterator_impl() : my_pos(nullptr) {}
iterator_impl( pointer_type pos ) : my_pos(pos) {}
iterator_impl& operator++() {
my_pos = my_pos->my_next_node;
return *this;
}
iterator_impl operator++( int ) {
iterator_impl it(*this);
++*this;
return it;
}
iterator_impl& operator--() {
my_pos = my_pos->my_prev_node;
return *this;
}
iterator_impl operator--( int ) {
iterator_impl it(*this);
--*this;
return it;
}
bool operator==( const iterator_impl& rhs ) const {
return my_pos == rhs.my_pos;
}
bool operator!=( const iterator_impl& rhs ) const {
return my_pos != rhs.my_pos;
}
DereferenceType& operator*() const {
return intrusive_list_base::item(my_pos);
}
DereferenceType* operator->() const {
return &intrusive_list_base::item(my_pos);
}
private:
// Node the iterator points to at the moment
pointer_type my_pos;
}; // class iterator_impl
void assert_ok () const {
__TBB_ASSERT( (my_head.my_prev_node == &my_head && !my_size) ||
(my_head.my_next_node != &my_head && my_size >0), "intrusive_list_base corrupted" );
#if TBB_USE_ASSERT >= 2
std::size_t i = 0;
for ( intrusive_list_node *n = my_head.my_next_node; n != &my_head; n = n->my_next_node )
++i;
__TBB_ASSERT( my_size == i, "Wrong size" );
#endif /* TBB_USE_ASSERT >= 2 */
}
public:
using iterator = iterator_impl<T>;
using const_iterator = iterator_impl<const T>;
intrusive_list_base () : my_size(0) {
my_head.my_prev_node = &my_head;
my_head.my_next_node = &my_head;
}
bool empty () const { return my_head.my_next_node == &my_head; }
std::size_t size () const { return my_size; }
iterator begin () { return iterator(my_head.my_next_node); }
iterator end () { return iterator(&my_head); }
const_iterator begin () const { return const_iterator(my_head.my_next_node); }
const_iterator end () const { return const_iterator(&my_head); }
void push_front ( T& val ) {
__TBB_ASSERT( node(val).my_prev_node == &node(val) && node(val).my_next_node == &node(val),
"Object with intrusive list node can be part of only one intrusive list simultaneously" );
// An object can be part of only one intrusive list at the given moment via the given node member
node(val).my_prev_node = &my_head;
node(val).my_next_node = my_head.my_next_node;
my_head.my_next_node->my_prev_node = &node(val);
my_head.my_next_node = &node(val);
++my_size;
assert_ok();
}
void remove( T& val ) {
__TBB_ASSERT( node(val).my_prev_node != &node(val) && node(val).my_next_node != &node(val), "Element to remove is not in the list" );
__TBB_ASSERT( node(val).my_prev_node->my_next_node == &node(val) && node(val).my_next_node->my_prev_node == &node(val), "Element to remove is not in the list" );
--my_size;
node(val).my_next_node->my_prev_node = node(val).my_prev_node;
node(val).my_prev_node->my_next_node = node(val).my_next_node;
#if TBB_USE_ASSERT
node(val).my_prev_node = node(val).my_next_node = &node(val);
#endif
assert_ok();
}
iterator erase ( iterator it ) {
T& val = *it;
++it;
remove( val );
return it;
}
}; // intrusive_list_base
#if __TBB_TODO
// With standard compliant compilers memptr_intrusive_list could be named simply intrusive_list,
// and inheritance based intrusive_list version would become its partial specialization.
// Here are the corresponding declarations:
struct dummy_intrusive_list_item { intrusive_list_node my_node; };
template <class T, class U = dummy_intrusive_list_item, intrusive_list_node U::*NodePtr = &dummy_intrusive_list_item::my_node>
class intrusive_list : public intrusive_list_base<intrusive_list<T, U, NodePtr>, T>;
template <class T>
class intrusive_list<T, dummy_intrusive_list_item, &dummy_intrusive_list_item::my_node>
: public intrusive_list_base<intrusive_list<T>, T>;
#endif /* __TBB_TODO */
//! Double linked list of items of type T containing a member of type intrusive_list_node.
/** NodePtr is a member pointer to the node data field. Class U is either T or
a base class of T containing the node member. Default values exist for the sake
of a partial specialization working with the inheritance case.
The list does not have ownership of its items. Its purpose is to avoid dynamic
memory allocation when forming lists of existing objects.
The class is not thread safe. **/
template <class T, class U, intrusive_list_node U::*NodePtr>
class memptr_intrusive_list : public intrusive_list_base<memptr_intrusive_list<T, U, NodePtr>, T>
{
friend class intrusive_list_base<memptr_intrusive_list<T, U, NodePtr>, T>;
static intrusive_list_node& node ( T& val ) { return val.*NodePtr; }
static T& item ( intrusive_list_node* node ) {
// Cannot use __TBB_offsetof (and consequently __TBB_get_object_ref) macro
// with *NodePtr argument because gcc refuses to interpret pasted "->" and "*"
// as member pointer dereferencing operator, and explicit usage of ## in
// __TBB_offsetof implementation breaks operations with normal member names.
return *reinterpret_cast<T*>((char*)node - ((ptrdiff_t)&(reinterpret_cast<T*>(0x1000)->*NodePtr) - 0x1000));
}
static const T& item( const intrusive_list_node* node ) {
return item(const_cast<intrusive_list_node*>(node));
}
}; // intrusive_list<T, U, NodePtr>
//! Double linked list of items of type T that is derived from intrusive_list_node class.
/** The list does not have ownership of its items. Its purpose is to avoid dynamic
memory allocation when forming lists of existing objects.
The class is not thread safe. **/
template <class T>
class intrusive_list : public intrusive_list_base<intrusive_list<T>, T>
{
friend class intrusive_list_base<intrusive_list<T>, T>;
static intrusive_list_node& node ( T& val ) { return val; }
static T& item ( intrusive_list_node* node ) { return *static_cast<T*>(node); }
static const T& item( const intrusive_list_node* node ) { return *static_cast<const T*>(node); }
}; // intrusive_list<T>
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_intrusive_list_H */
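A usage sketch for the inheritance-based intrusive_list above (internal r1 API, shown only to illustrate the no-allocation design; items are owned by the caller and must be unlinked before they are destroyed):
#include "third_party/tbb/intrusive_list.hh"
#include <cstdio>
struct item : tbb::detail::r1::intrusive_list_node {
    int value;
    explicit item(int v) : value(v) {}
};
int main() {
    item a{1}, b{2};                              // storage lives on the caller's stack
    tbb::detail::r1::intrusive_list<item> list;   // the list itself never allocates
    list.push_front(a);
    list.push_front(b);
    for (item& it : list) std::printf("%d\n", it.value);
    list.remove(b);
    list.remove(a);
    return 0;
}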

70
third_party/tbb/itt_notify.cc vendored Normal file
View file

@@ -0,0 +1,70 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#if __TBB_USE_ITT_NOTIFY
#if _WIN32||_WIN64
#ifndef UNICODE
#define UNICODE
#endif
#else
#pragma weak dlopen
#pragma weak dlsym
#pragma weak dlerror
#endif /* WIN */
#if __TBB_BUILD
extern "C" void ITT_DoOneTimeInitialization();
#define __itt_init_ittlib_name(x,y) (ITT_DoOneTimeInitialization(), true)
#elif __TBBMALLOC_BUILD
extern "C" void MallocInitializeITT();
#define __itt_init_ittlib_name(x,y) (MallocInitializeITT(), true)
#else
#error This file is expected to be used for either TBB or TBB allocator build.
#endif // __TBB_BUILD
// MISSING #include "tools_api/ittnotify_static.c"
namespace tbb {
namespace detail {
namespace r1 {
/** This extra proxy method is necessary since __itt_init_lib is declared as static **/
int __TBB_load_ittnotify() {
#if !(_WIN32||_WIN64)
// tool_api crashes without dlopen, check that it's present. Common case
// for lack of dlopen is static binaries, i.e. ones build with -static.
if (dlopen == nullptr)
return 0;
#endif
return __itt_init_ittlib(nullptr, // groups for:
(__itt_group_id)(__itt_group_sync // prepare/cancel/acquired/releasing
| __itt_group_thread // name threads
| __itt_group_stitch // stack stitching
| __itt_group_structure
));
}
} //namespace r1
} //namespace detail
} // namespace tbb
#endif /* __TBB_USE_ITT_NOTIFY */

118
third_party/tbb/itt_notify.hh vendored Normal file
View file

@@ -0,0 +1,118 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_ITT_NOTIFY
#define _TBB_ITT_NOTIFY
#include "third_party/tbb/detail/_config.hh"
#if __TBB_USE_ITT_NOTIFY
#if _WIN32||_WIN64
#ifndef UNICODE
#define UNICODE
#endif
#endif /* WIN */
#ifndef INTEL_ITTNOTIFY_API_PRIVATE
#define INTEL_ITTNOTIFY_API_PRIVATE
#endif
// MISSING #include "tools_api/ittnotify.h"
// MISSING #include "tools_api/legacy/ittnotify.h"
extern "C" void __itt_fini_ittlib(void);
extern "C" void __itt_release_resources(void);
#if _WIN32||_WIN64
#undef _T
#endif /* WIN */
#endif /* __TBB_USE_ITT_NOTIFY */
#if !ITT_CALLER_NULL
#define ITT_CALLER_NULL ((__itt_caller)0)
#endif
namespace tbb {
namespace detail {
namespace r1 {
//! Unicode support
#if (_WIN32||_WIN64)
//! Unicode character type. Always wchar_t on Windows.
/** We do not use typedefs from the Windows TCHAR family to keep consistency with the TBB coding style. **/
using tchar = wchar_t;
//! Standard Windows macro to markup the string literals.
#define _T(string_literal) L ## string_literal
#else /* !WIN */
using tchar = char;
//! Standard Windows style macro to markup the string literals.
#define _T(string_literal) string_literal
#endif /* !WIN */
//! Display names of internal synchronization types
extern const tchar
*SyncType_Scheduler;
//! Display names of internal synchronization components/scenarios
extern const tchar
*SyncObj_ContextsList
;
#if __TBB_USE_ITT_NOTIFY
// const_cast<void*>() is necessary to cast off volatility
#define ITT_NOTIFY(name,obj) __itt_##name(const_cast<void*>(static_cast<volatile void*>(obj)))
#define ITT_THREAD_SET_NAME(name) __itt_thread_set_name(name)
#define ITT_FINI_ITTLIB() __itt_fini_ittlib()
#define ITT_RELEASE_RESOURCES() __itt_release_resources()
#define ITT_SYNC_CREATE(obj, type, name) __itt_sync_create((void*)(obj), type, name, 2)
#define ITT_STACK_CREATE(obj) obj = __itt_stack_caller_create()
#define ITT_STACK_DESTROY(obj) (obj!=nullptr) ? __itt_stack_caller_destroy(static_cast<__itt_caller>(obj)) : ((void)0)
#define ITT_CALLEE_ENTER(cond, t, obj) if(cond) {\
__itt_stack_callee_enter(static_cast<__itt_caller>(obj));\
__itt_sync_acquired(t);\
}
#define ITT_CALLEE_LEAVE(cond, obj) (cond) ? __itt_stack_callee_leave(static_cast<__itt_caller>(obj)) : ((void)0)
#define ITT_TASK_GROUP(obj,name,parent) r1::itt_make_task_group(d1::ITT_DOMAIN_MAIN,(void*)(obj),ALGORITHM,(void*)(parent),(parent!=nullptr) ? ALGORITHM : FLOW_NULL,name)
#define ITT_TASK_BEGIN(obj,name,id) r1::itt_task_begin(d1::ITT_DOMAIN_MAIN,(void*)(id),ALGORITHM,(void*)(obj),ALGORITHM,name)
#define ITT_TASK_END r1::itt_task_end(d1::ITT_DOMAIN_MAIN)
#else /* !__TBB_USE_ITT_NOTIFY */
#define ITT_NOTIFY(name,obj) ((void)0)
#define ITT_THREAD_SET_NAME(name) ((void)0)
#define ITT_FINI_ITTLIB() ((void)0)
#define ITT_RELEASE_RESOURCES() ((void)0)
#define ITT_SYNC_CREATE(obj, type, name) ((void)0)
#define ITT_STACK_CREATE(obj) ((void)0)
#define ITT_STACK_DESTROY(obj) ((void)0)
#define ITT_CALLEE_ENTER(cond, t, obj) ((void)0)
#define ITT_CALLEE_LEAVE(cond, obj) ((void)0)
#define ITT_TASK_GROUP(type,name,parent) ((void)0)
#define ITT_TASK_BEGIN(type,name,id) ((void)0)
#define ITT_TASK_END ((void)0)
#endif /* !__TBB_USE_ITT_NOTIFY */
int __TBB_load_ittnotify();
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_ITT_NOTIFY */

247
third_party/tbb/mailbox.hh vendored Normal file
View file

@@ -0,0 +1,247 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_mailbox_H
#define _TBB_mailbox_H
#include "third_party/tbb/cache_aligned_allocator.hh"
#include "third_party/tbb/detail/_small_object_pool.hh"
#include "third_party/tbb/scheduler_common.hh"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace r1 {
struct task_proxy : public d1::task {
static const intptr_t pool_bit = 1<<0;
static const intptr_t mailbox_bit = 1<<1;
static const intptr_t location_mask = pool_bit | mailbox_bit;
/* All but two low-order bits represent a (task*).
Two low-order bits mean:
1 = proxy is/was/will be in task pool
2 = proxy is/was/will be in mailbox */
std::atomic<intptr_t> task_and_tag;
//! Pointer to next task_proxy in a mailbox
std::atomic<task_proxy*> next_in_mailbox;
//! Mailbox to which this was mailed.
mail_outbox* outbox;
//! Task affinity id which is referenced
d1::slot_id slot;
d1::small_object_allocator allocator;
//! True if the proxy is stored both in its sender's pool and in the destination mailbox.
static bool is_shared ( intptr_t tat ) {
return (tat & location_mask) == location_mask;
}
//! Returns a pointer to the encapsulated task or nullptr.
static task* task_ptr ( intptr_t tat ) {
return (task*)(tat & ~location_mask);
}
//! Returns a pointer to the encapsulated task or nullptr, and frees proxy if necessary.
template<intptr_t from_bit>
inline task* extract_task () {
// __TBB_ASSERT( prefix().extra_state == es_task_proxy, "Normal task misinterpreted as a proxy?" );
intptr_t tat = task_and_tag.load(std::memory_order_acquire);
__TBB_ASSERT( tat == from_bit || (is_shared(tat) && task_ptr(tat)),
"Proxy's tag cannot specify both locations if the proxy "
"was retrieved from one of its original locations" );
if ( tat != from_bit ) {
const intptr_t cleaner_bit = location_mask & ~from_bit;
// Attempt to transition the proxy to the "empty" state with
// cleaner_bit specifying entity responsible for its eventual freeing.
// Explicit cast to void* is to work around a seeming ICC 11.1 bug.
if ( task_and_tag.compare_exchange_strong(tat, cleaner_bit) ) {
// Successfully grabbed the task, and left new owner with the job of freeing the proxy
return task_ptr(tat);
}
}
// Proxied task has already been claimed from another proxy location.
__TBB_ASSERT( task_and_tag.load(std::memory_order_relaxed) == from_bit, "Empty proxy cannot contain non-zero task pointer" );
return nullptr;
}
task* execute(d1::execution_data&) override {
__TBB_ASSERT_RELEASE(false, nullptr);
return nullptr;
}
task* cancel(d1::execution_data&) override {
__TBB_ASSERT_RELEASE(false, nullptr);
return nullptr;
}
}; // struct task_proxy
//! Internal representation of mail_outbox, without padding.
class unpadded_mail_outbox {
protected:
typedef std::atomic<task_proxy*> atomic_proxy_ptr;
//! Pointer to first task_proxy in mailbox, or nullptr if box is empty.
atomic_proxy_ptr my_first;
//! Pointer to pointer that will point to next item in the queue. Never nullptr.
std::atomic<atomic_proxy_ptr*> my_last;
//! Owner of mailbox is not executing a task, and has drained its own task pool.
std::atomic<bool> my_is_idle;
};
// TODO: - consider moving to arena slot
//! Class representing where mail is put.
/** Padded to occupy a cache line. */
class mail_outbox : padded<unpadded_mail_outbox> {
task_proxy* internal_pop( isolation_type isolation ) {
task_proxy* curr = my_first.load(std::memory_order_acquire);
if ( !curr )
return nullptr;
atomic_proxy_ptr* prev_ptr = &my_first;
if ( isolation != no_isolation ) {
while ( task_accessor::isolation(*curr) != isolation ) {
prev_ptr = &curr->next_in_mailbox;
// The next_in_mailbox should be read with acquire to guarantee (*curr) consistency.
curr = curr->next_in_mailbox.load(std::memory_order_acquire);
if ( !curr )
return nullptr;
}
}
// There is a first item in the mailbox. See if there is a second.
// The next_in_mailbox should be read with acquire to guarantee (*second) consistency.
if ( task_proxy* second = curr->next_in_mailbox.load(std::memory_order_acquire) ) {
// There are at least two items, so first item can be popped easily.
prev_ptr->store(second, std::memory_order_relaxed);
} else {
// There is only one item. Some care is required to pop it.
prev_ptr->store(nullptr, std::memory_order_relaxed);
atomic_proxy_ptr* expected = &curr->next_in_mailbox;
if ( my_last.compare_exchange_strong( expected, prev_ptr ) ) {
// Successfully transitioned mailbox from having one item to having none.
__TBB_ASSERT( !curr->next_in_mailbox.load(std::memory_order_relaxed), nullptr);
} else {
// Some other thread updated my_last but has not filled in first->next_in_mailbox
// Wait until first item points to second item.
atomic_backoff backoff;
// The next_in_mailbox should be read with acquire to guarantee (*second) consistency.
while ( !(second = curr->next_in_mailbox.load(std::memory_order_acquire)) ) backoff.pause();
prev_ptr->store( second, std::memory_order_relaxed);
}
}
assert_pointer_valid(curr);
return curr;
}
public:
friend class mail_inbox;
//! Push task_proxy onto the mailbox queue of another thread.
/** Implementation is wait-free. */
void push( task_proxy* t ) {
assert_pointer_valid(t);
t->next_in_mailbox.store(nullptr, std::memory_order_relaxed);
atomic_proxy_ptr* const link = my_last.exchange(&t->next_in_mailbox);
// Logically, the release fence is not required because the exchange above provides the
// release-acquire semantic that guarantees that (*t) will be consistent when another thread
// loads the link atomic. However, the C++11 memory model guarantees consistency of (*t) only
// when the same atomic is used for synchronization.
link->store(t, std::memory_order_release);
}
//! Return true if mailbox is empty
bool empty() {
return my_first.load(std::memory_order_relaxed) == nullptr;
}
//! Construct *this as a mailbox from zeroed memory.
/** Raise assertion if *this is not previously zeroed, or sizeof(*this) is wrong.
This method is provided instead of a full constructor since we know the object
will be constructed in zeroed memory. */
void construct() {
__TBB_ASSERT( sizeof(*this)==max_nfs_size, nullptr );
__TBB_ASSERT( !my_first.load(std::memory_order_relaxed), nullptr );
__TBB_ASSERT( !my_last.load(std::memory_order_relaxed), nullptr );
__TBB_ASSERT( !my_is_idle.load(std::memory_order_relaxed), nullptr );
my_last = &my_first;
suppress_unused_warning(pad);
}
//! Drain the mailbox
void drain() {
// No fences here because other threads have already quit.
for( ; task_proxy* t = my_first; ) {
my_first.store(t->next_in_mailbox, std::memory_order_relaxed);
t->allocator.delete_object(t);
}
}
//! True if thread that owns this mailbox is looking for work.
bool recipient_is_idle() {
return my_is_idle.load(std::memory_order_relaxed);
}
}; // class mail_outbox
//! Class representing source of mail.
class mail_inbox {
//! Corresponding sink where mail that we receive will be put.
mail_outbox* my_putter;
public:
//! Construct unattached inbox
mail_inbox() : my_putter(nullptr) {}
//! Attach inbox to a corresponding outbox.
void attach( mail_outbox& putter ) {
my_putter = &putter;
}
//! Detach inbox from its outbox
void detach() {
__TBB_ASSERT(my_putter,"not attached");
my_putter = nullptr;
}
//! Get next piece of mail, or nullptr if mailbox is empty.
task_proxy* pop( isolation_type isolation ) {
return my_putter->internal_pop( isolation );
}
//! Return true if mailbox is empty
bool empty() {
return my_putter->empty();
}
//! Indicate whether thread that reads this mailbox is idle.
/** Raises assertion failure if mailbox is redundantly marked as not idle. */
void set_is_idle( bool value ) {
if( my_putter ) {
__TBB_ASSERT( my_putter->my_is_idle.load(std::memory_order_relaxed) || value, "attempt to redundantly mark mailbox as not idle" );
my_putter->my_is_idle.store(value, std::memory_order_relaxed);
}
}
//! Indicate whether thread that reads this mailbox is idle.
bool is_idle_state ( bool value ) const {
return !my_putter || my_putter->my_is_idle.load(std::memory_order_relaxed) == value;
}
}; // class mail_inbox
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_mailbox_H */
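A standalone sketch of the tagged-pointer encoding used by task_proxy::task_and_tag above, where the two low-order bits record whether the proxy sits in the task pool, the mailbox, or both (illustration only):
#include <cstdint>
#include <cstdio>
int main() {
    alignas(4) int task_storage = 0;    // stands in for a d1::task; 4-byte alignment frees two low bits
    const std::intptr_t pool_bit = 1 << 0, mailbox_bit = 1 << 1;
    const std::intptr_t location_mask = pool_bit | mailbox_bit;
    std::intptr_t tat = reinterpret_cast<std::intptr_t>(&task_storage) | location_mask;
    bool shared = (tat & location_mask) == location_mask;           // in pool AND mailbox
    void* task_ptr = reinterpret_cast<void*>(tat & ~location_mask); // strip the tag bits
    std::printf("shared=%d task=%p\n", int(shared), task_ptr);
    return 0;
}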

172
third_party/tbb/main.cc vendored Normal file
View file

@@ -0,0 +1,172 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/main.hh"
#include "third_party/tbb/governor.hh"
#include "third_party/tbb/threading_control.hh"
#include "third_party/tbb/environment.hh"
#include "third_party/tbb/market.hh"
#include "third_party/tbb/misc.hh"
#include "third_party/tbb/itt_notify.hh"
namespace tbb {
namespace detail {
namespace r1 {
//------------------------------------------------------------------------
// Begin shared data layout.
// The following global data items are mostly read-only after initialization.
//------------------------------------------------------------------------
//------------------------------------------------------------------------
// governor data
basic_tls<thread_data*> governor::theTLS;
rml::tbb_factory governor::theRMLServerFactory;
bool governor::UsePrivateRML;
bool governor::is_rethrow_broken;
//------------------------------------------------------------------------
// threading_control data
threading_control* threading_control::g_threading_control;
threading_control::global_mutex_type threading_control::g_threading_control_mutex;
//------------------------------------------------------------------------
// context propagation data
context_state_propagation_mutex_type the_context_state_propagation_mutex;
std::atomic<uintptr_t> the_context_state_propagation_epoch{};
//------------------------------------------------------------------------
// One time initialization data
//! Counter of references to global shared resources such as TLS.
std::atomic<int> __TBB_InitOnce::count{};
std::atomic_flag __TBB_InitOnce::InitializationLock = ATOMIC_FLAG_INIT;
//! Flag that is set to true after one-time initializations are done.
std::atomic<bool> __TBB_InitOnce::InitializationDone{};
#if __TBB_USE_ITT_NOTIFY
//! Defined in profiling.cpp
extern bool ITT_Present;
void ITT_DoUnsafeOneTimeInitialization();
#endif
#if !(_WIN32||_WIN64) || __TBB_SOURCE_DIRECTLY_INCLUDED
static __TBB_InitOnce __TBB_InitOnceHiddenInstance;
#endif
#if TBB_USE_ASSERT
std::atomic<int> the_observer_proxy_count;
struct check_observer_proxy_count {
~check_observer_proxy_count() {
if (the_observer_proxy_count != 0) {
runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count));
}
}
};
// The proxy count checker shall be defined after __TBB_InitOnceHiddenInstance to check the count
// after auto termination.
static check_observer_proxy_count the_check_observer_proxy_count;
#endif /* TBB_USE_ASSERT */
//------------------------------------------------------------------------
// __TBB_InitOnce
//------------------------------------------------------------------------
void __TBB_InitOnce::add_ref() {
if( ++count==1 )
governor::acquire_resources();
}
void __TBB_InitOnce::remove_ref() {
int k = --count;
__TBB_ASSERT(k>=0,"removed __TBB_InitOnce ref that was not added?");
if( k==0 ) {
governor::release_resources();
ITT_FINI_ITTLIB();
ITT_RELEASE_RESOURCES();
}
}
//------------------------------------------------------------------------
// One-time Initializations
//------------------------------------------------------------------------
//! Defined in cache_aligned_allocator.cpp
void initialize_cache_aligned_allocator();
//! Performs thread-safe lazy one-time general TBB initialization.
void DoOneTimeInitialization() {
__TBB_InitOnce::lock();
// No fence required for load of InitializationDone, because we are inside a critical section.
if( !__TBB_InitOnce::InitializationDone ) {
__TBB_InitOnce::add_ref();
if( GetBoolEnvironmentVariable("TBB_VERSION") )
PrintVersion();
bool itt_present = false;
#if __TBB_USE_ITT_NOTIFY
ITT_DoUnsafeOneTimeInitialization();
itt_present = ITT_Present;
#endif /* __TBB_USE_ITT_NOTIFY */
initialize_cache_aligned_allocator();
governor::initialize_rml_factory();
// Force processor groups support detection
governor::default_num_threads();
// Force OS regular page size detection
governor::default_page_size();
PrintExtraVersionInfo( "TOOLS SUPPORT", itt_present ? "enabled" : "disabled" );
__TBB_InitOnce::InitializationDone = true;
}
__TBB_InitOnce::unlock();
}
#if (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED
//! Windows "DllMain" that handles startup and shutdown of dynamic library.
extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID lpvReserved ) {
switch( reason ) {
case DLL_PROCESS_ATTACH:
__TBB_InitOnce::add_ref();
break;
case DLL_PROCESS_DETACH:
// Since THREAD_DETACH is not called for the main thread, call auto-termination
// here as well - but not during process shutdown (due to risk of a deadlock).
if ( lpvReserved == nullptr ) { // library unload
governor::terminate_external_thread();
}
__TBB_InitOnce::remove_ref();
// It is assumed that InitializationDone is not set after DLL_PROCESS_DETACH,
// and thus no race on InitializationDone is possible.
if ( __TBB_InitOnce::initialization_done() ) {
// Remove reference that we added in DoOneTimeInitialization.
__TBB_InitOnce::remove_ref();
}
break;
case DLL_THREAD_DETACH:
governor::terminate_external_thread();
break;
}
return true;
}
#endif /* (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED */
} // namespace r1
} // namespace detail
} // namespace tbb
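DoOneTimeInitialization() above pairs a spin lock with a done flag so that initialization runs exactly once; a reduced standalone sketch of that pattern:
#include <atomic>
#include <cstdio>
static std::atomic_flag init_lock = ATOMIC_FLAG_INIT;
static std::atomic<bool> init_done{false};
void do_once() {
    while (init_lock.test_and_set(std::memory_order_acquire)) { /* spin */ }
    if (!init_done.load(std::memory_order_relaxed)) {   // relaxed is safe: the lock is held
        std::puts("one-time init");
        init_done.store(true, std::memory_order_release);
    }
    init_lock.clear(std::memory_order_release);
}
int main() { do_once(); do_once(); return 0; }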

100
third_party/tbb/main.hh vendored Normal file
View file

@@ -0,0 +1,100 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_main_H
#define _TBB_main_H
#include "third_party/tbb/governor.hh"
#include "third_party/libcxx/atomic"
namespace tbb {
namespace detail {
namespace r1 {
void DoOneTimeInitialization();
//------------------------------------------------------------------------
// __TBB_InitOnce
//------------------------------------------------------------------------
// TODO (TBB_REVAMP_TODO): consider better names
//! Class that supports TBB initialization.
/** It handles acquisition and release of global resources (e.g. TLS) during startup and shutdown,
as well as synchronization for DoOneTimeInitialization. */
class __TBB_InitOnce {
friend void DoOneTimeInitialization();
friend void ITT_DoUnsafeOneTimeInitialization();
static std::atomic<int> count;
//! Platform specific code to acquire resources.
static void acquire_resources();
//! Platform specific code to release resources.
static void release_resources();
//! Specifies whether the one-time initialization has been done.
static std::atomic<bool> InitializationDone;
//! Global initialization lock
/** Scenarios are possible when tools interop has to be initialized before the
TBB itself. This imposes a requirement that the global initialization lock
has to support valid static initialization, and does not issue any tool
notifications in any build mode. **/
static std::atomic_flag InitializationLock;
public:
static void lock() {
tbb::detail::atomic_backoff backoff;
while( InitializationLock.test_and_set() ) backoff.pause();
}
static void unlock() { InitializationLock.clear(std::memory_order_release); }
static bool initialization_done() { return InitializationDone.load(std::memory_order_acquire); }
//! Add initial reference to resources.
/** We assume that dynamic loading of the library prevents any other threads
from entering the library until this constructor has finished running. **/
__TBB_InitOnce() { add_ref(); }
//! Remove the initial reference to resources.
/** This is not necessarily the last reference if other threads are still running. **/
~__TBB_InitOnce() {
governor::terminate_external_thread(); // TLS dtor not called for the main thread
remove_ref();
// We assume that InitializationDone is not set after file-scope destructors
// start running, and thus no race on InitializationDone is possible.
if ( initialization_done() ) {
// Remove an extra reference that was added in DoOneTimeInitialization.
remove_ref();
}
}
//! Add reference to resources. If first reference added, acquire the resources.
static void add_ref();
//! Remove reference to resources. If last reference removed, release the resources.
static void remove_ref();
}; // class __TBB_InitOnce
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_main_H */
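The lock()/unlock() pair above is a statically initializable spin lock built from std::atomic_flag plus backoff. Below is a minimal standalone sketch of the same idiom; the names and the yield-based backoff are illustrative stand-ins, not TBB's own code.

#include <atomic>
#include <thread>

// Statically initializable: no constructor has to run before first use.
static std::atomic_flag init_lock = ATOMIC_FLAG_INIT;

void lock_init() {
    int pauses = 1;
    while (init_lock.test_and_set(std::memory_order_acquire)) {
        // Stand-in for tbb::detail::atomic_backoff: yield a growing number of times.
        for (int i = 0; i < pauses; ++i) std::this_thread::yield();
        if (pauses < 16) pauses *= 2;
    }
}

void unlock_init() { init_lock.clear(std::memory_order_release); }

int main() {
    lock_init();
    // ... one-time initialization would go here ...
    unlock_init();
    return 0;
}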

140
third_party/tbb/market.cc vendored Normal file

@@ -0,0 +1,140 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "third_party/tbb/arena.hh"
#include "third_party/tbb/market.hh"
#include "third_party/libcxx/algorithm" // std::find
namespace tbb {
namespace detail {
namespace r1 {
class tbb_permit_manager_client : public pm_client {
public:
tbb_permit_manager_client(arena& a) : pm_client(a) {}
void set_allotment(unsigned allotment) {
my_arena.set_allotment(allotment);
}
};
//------------------------------------------------------------------------
// market
//------------------------------------------------------------------------
market::market(unsigned workers_soft_limit)
: my_num_workers_soft_limit(workers_soft_limit)
{}
pm_client* market::create_client(arena& a) {
return new (cache_aligned_allocate(sizeof(tbb_permit_manager_client))) tbb_permit_manager_client(a);
}
void market::register_client(pm_client* c) {
mutex_type::scoped_lock lock(my_mutex);
my_clients[c->priority_level()].push_back(c);
}
void market::unregister_and_destroy_client(pm_client& c) {
{
mutex_type::scoped_lock lock(my_mutex);
auto& clients = my_clients[c.priority_level()];
auto it = std::find(clients.begin(), clients.end(), &c);
__TBB_ASSERT(it != clients.end(), "Destroying an unregistered client");
clients.erase(it);
}
auto client = static_cast<tbb_permit_manager_client*>(&c);
client->~tbb_permit_manager_client();
cache_aligned_deallocate(client);
}
void market::update_allotment() {
int effective_soft_limit = my_mandatory_num_requested > 0 && my_num_workers_soft_limit == 0 ? 1 : my_num_workers_soft_limit;
int max_workers = min(my_total_demand, effective_soft_limit);
__TBB_ASSERT(max_workers >= 0, nullptr);
int unassigned_workers = max_workers;
int assigned = 0;
int carry = 0;
unsigned max_priority_level = num_priority_levels;
for (unsigned list_idx = 0; list_idx < num_priority_levels; ++list_idx ) {
int assigned_per_priority = min(my_priority_level_demand[list_idx], unassigned_workers);
unassigned_workers -= assigned_per_priority;
// We iterate in reverse here so that the most recently added clients are served first
for (auto it = my_clients[list_idx].rbegin(); it != my_clients[list_idx].rend(); ++it) {
tbb_permit_manager_client& client = static_cast<tbb_permit_manager_client&>(**it);
if (client.max_workers() == 0) {
client.set_allotment(0);
continue;
}
if (max_priority_level == num_priority_levels) {
max_priority_level = list_idx;
}
int allotted = 0;
if (my_num_workers_soft_limit == 0) {
__TBB_ASSERT(max_workers == 0 || max_workers == 1, nullptr);
allotted = client.min_workers() > 0 && assigned < max_workers ? 1 : 0;
} else {
int tmp = client.max_workers() * assigned_per_priority + carry;
allotted = tmp / my_priority_level_demand[list_idx];
carry = tmp % my_priority_level_demand[list_idx];
__TBB_ASSERT(allotted <= client.max_workers(), nullptr);
}
client.set_allotment(allotted);
client.set_top_priority(list_idx == max_priority_level);
assigned += allotted;
}
}
__TBB_ASSERT(assigned == max_workers, nullptr);
}
void market::set_active_num_workers(int soft_limit) {
mutex_type::scoped_lock lock(my_mutex);
if (my_num_workers_soft_limit != soft_limit) {
my_num_workers_soft_limit = soft_limit;
update_allotment();
}
}
void market::adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) {
__TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr);
int delta{};
{
mutex_type::scoped_lock lock(my_mutex);
// Update client's state
delta = c.update_request(mandatory_delta, workers_delta);
// Update market's state
my_total_demand += delta;
my_priority_level_demand[c.priority_level()] += delta;
my_mandatory_num_requested += mandatory_delta;
update_allotment();
}
notify_thread_request(delta);
}
} // namespace r1
} // namespace detail
} // namespace tbb
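update_allotment above splits the workers granted to a priority level proportionally to each client's demand, carrying the division remainder from one client to the next so the integer shares add up exactly. A standalone arithmetic sketch of that carry-based split, using plain ints and hypothetical demands (not TBB code):

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> demand = {5, 3, 2};  // hypothetical per-client max_workers()
    int level_demand = 5 + 3 + 2;         // my_priority_level_demand for this level
    int assigned_per_priority = 6;        // workers granted to this priority level
    int carry = 0;
    for (int d : demand) {
        int tmp = d * assigned_per_priority + carry;
        int allotted = tmp / level_demand;  // proportional integer share
        carry = tmp % level_demand;         // remainder flows to the next client
        std::printf("demand %d -> allotted %d\n", d, allotted);
    }
    // Prints 3, 1, 2: the shares sum exactly to the 6 granted workers.
    return 0;
}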

79
third_party/tbb/market.hh vendored Normal file

@@ -0,0 +1,79 @@
// clang-format off
/*
Copyright (c) 2005-2023 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_market_H
#define _TBB_market_H
#include "third_party/tbb/rw_mutex.hh"
#include "third_party/tbb/tbb_allocator.hh"
#include "third_party/tbb/task_arena.hh"
#include "third_party/tbb/permit_manager.hh"
#include "third_party/tbb/pm_client.hh"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/vector"
namespace tbb {
namespace detail {
namespace r1 {
class market : public permit_manager {
public:
market(unsigned soft_limit);
pm_client* create_client(arena& a) override;
void register_client(pm_client* client) override;
void unregister_and_destroy_client(pm_client& c) override;
//! Request an adjustment of the arena's demand for workers.
void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) override;
//! Set number of active workers
void set_active_num_workers(int soft_limit) override;
private:
//! Recalculates the number of workers assigned to each arena in the list.
void update_allotment();
//! Keys for the arena map array. The lower the value, the higher the priority of the arena list.
static constexpr unsigned num_priority_levels = d1::num_priority_levels;
using mutex_type = d1::rw_mutex;
mutex_type my_mutex;
//! Current application-imposed limit on the number of workers
int my_num_workers_soft_limit;
//! Number of workers that were requested by all arenas on all priority levels
int my_total_demand{0};
//! Number of workers that were requested by arenas per single priority list item
int my_priority_level_demand[num_priority_levels] = {0};
//! How many times mandatory concurrency was requested from the market
int my_mandatory_num_requested{0};
//! Per-priority lists of registered clients
using clients_container_type = std::vector<pm_client*, tbb::tbb_allocator<pm_client*>>;
clients_container_type my_clients[num_priority_levels];
}; // class market
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_market_H */

273
third_party/tbb/memory_pool.hh vendored Normal file

@@ -0,0 +1,273 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef __TBB_memory_pool_H
#define __TBB_memory_pool_H
#if !TBB_PREVIEW_MEMORY_POOL
#error Set TBB_PREVIEW_MEMORY_POOL to include memory_pool.h
#endif
/** @file */
#include "third_party/tbb/scalable_allocator.hh"
#include "third_party/libcxx/new" // std::bad_alloc
#include "third_party/libcxx/stdexcept" // std::runtime_error, std::invalid_argument
#include "third_party/libcxx/utility" // std::forward
#if __TBB_EXTRA_DEBUG
#define __TBBMALLOC_ASSERT ASSERT
#else
#define __TBBMALLOC_ASSERT(a,b) ((void)0)
#endif
namespace tbb {
namespace detail {
namespace d1 {
//! Base of thread-safe pool allocator for variable-size requests
class pool_base : no_copy {
// The pool interface is separate from the standard allocator classes because it has
// to maintain internal state; copy and assignment are disallowed, while move and swap are possible.
public:
//! Reset pool to reuse its memory (free all objects at once)
void recycle() { rml::pool_reset(my_pool); }
//! The "malloc" analogue to allocate block of memory of size bytes
void *malloc(size_t size) { return rml::pool_malloc(my_pool, size); }
//! The "free" analogue to discard a previously allocated piece of memory.
void free(void* ptr) { rml::pool_free(my_pool, ptr); }
//! The "realloc" analogue complementing pool_malloc.
// Enables some low-level optimization possibilities
void *realloc(void* ptr, size_t size) {
return rml::pool_realloc(my_pool, ptr, size);
}
protected:
//! destroy pool - must be called in a child class
void destroy() { rml::pool_destroy(my_pool); }
rml::MemoryPool *my_pool;
};
#if _MSC_VER && !defined(__INTEL_COMPILER)
// Workaround for erroneous "unreferenced parameter" warning in method destroy.
#pragma warning (push)
#pragma warning (disable: 4100)
#endif
//! Meets "allocator" requirements of ISO C++ Standard, Section 20.1.5
/** @ingroup memory_allocation */
template<typename T, typename P = pool_base>
class memory_pool_allocator {
protected:
typedef P pool_type;
pool_type *my_pool;
template<typename U, typename R>
friend class memory_pool_allocator;
template<typename V, typename U, typename R>
friend bool operator==( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
template<typename V, typename U, typename R>
friend bool operator!=( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
public:
typedef T value_type;
typedef value_type* pointer;
typedef const value_type* const_pointer;
typedef value_type& reference;
typedef const value_type& const_reference;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
template<typename U> struct rebind {
typedef memory_pool_allocator<U, P> other;
};
explicit memory_pool_allocator(pool_type &pool) throw() : my_pool(&pool) {}
memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {}
template<typename U>
memory_pool_allocator(const memory_pool_allocator<U,P>& src) throw() : my_pool(src.my_pool) {}
pointer address(reference x) const { return &x; }
const_pointer address(const_reference x) const { return &x; }
//! Allocate space for n objects.
pointer allocate( size_type n, const void* /*hint*/ = nullptr) {
pointer p = static_cast<pointer>( my_pool->malloc( n*sizeof(value_type) ) );
if (!p)
throw_exception(std::bad_alloc());
return p;
}
//! Free previously allocated block of memory.
void deallocate( pointer p, size_type ) {
my_pool->free(p);
}
//! Largest value for which method allocate might succeed.
size_type max_size() const throw() {
size_type max = static_cast<size_type>(-1) / sizeof (value_type);
return (max > 0 ? max : 1);
}
//! Copy-construct value at location pointed to by p.
template<typename U, typename... Args>
void construct(U *p, Args&&... args)
{ ::new((void *)p) U(std::forward<Args>(args)...); }
//! Destroy value at location pointed to by p.
void destroy( pointer p ) { p->~value_type(); }
};
#if _MSC_VER && !defined(__INTEL_COMPILER)
#pragma warning (pop)
#endif // warning 4100 is back
//! Analogous to std::allocator<void>, as defined in ISO C++ Standard, Section 20.4.1
/** @ingroup memory_allocation */
template<typename P>
class memory_pool_allocator<void, P> {
public:
typedef P pool_type;
typedef void* pointer;
typedef const void* const_pointer;
typedef void value_type;
template<typename U> struct rebind {
typedef memory_pool_allocator<U, P> other;
};
explicit memory_pool_allocator( pool_type &pool) throw() : my_pool(&pool) {}
memory_pool_allocator( const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {}
template<typename U>
memory_pool_allocator(const memory_pool_allocator<U,P>& src) throw() : my_pool(src.my_pool) {}
protected:
pool_type *my_pool;
template<typename U, typename R>
friend class memory_pool_allocator;
template<typename V, typename U, typename R>
friend bool operator==( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
template<typename V, typename U, typename R>
friend bool operator!=( const memory_pool_allocator<V,R>& a, const memory_pool_allocator<U,R>& b);
};
template<typename T, typename U, typename P>
inline bool operator==( const memory_pool_allocator<T,P>& a, const memory_pool_allocator<U,P>& b) {return a.my_pool==b.my_pool;}
template<typename T, typename U, typename P>
inline bool operator!=( const memory_pool_allocator<T,P>& a, const memory_pool_allocator<U,P>& b) {return a.my_pool!=b.my_pool;}
//! Thread-safe growable pool allocator for variable-size requests
template <typename Alloc>
class memory_pool : public pool_base {
Alloc my_alloc; // TODO: base-class optimization
static void *allocate_request(intptr_t pool_id, size_t & bytes);
static int deallocate_request(intptr_t pool_id, void*, size_t raw_bytes);
public:
//! construct pool with underlying allocator
explicit memory_pool(const Alloc &src = Alloc());
//! destroy pool
~memory_pool() { destroy(); } // call the callbacks first and destroy my_alloc later
};
class fixed_pool : public pool_base {
void *my_buffer;
size_t my_size;
inline static void *allocate_request(intptr_t pool_id, size_t & bytes);
public:
//! construct pool with underlying allocator
inline fixed_pool(void *buf, size_t size);
//! destroy pool
~fixed_pool() { destroy(); }
};
//////////////// Implementation ///////////////
template <typename Alloc>
memory_pool<Alloc>::memory_pool(const Alloc &src) : my_alloc(src) {
rml::MemPoolPolicy args(allocate_request, deallocate_request,
sizeof(typename Alloc::value_type));
rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool);
if (res!=rml::POOL_OK)
throw_exception(std::runtime_error("Can't create pool"));
}
template <typename Alloc>
void *memory_pool<Alloc>::allocate_request(intptr_t pool_id, size_t & bytes) {
memory_pool<Alloc> &self = *reinterpret_cast<memory_pool<Alloc>*>(pool_id);
const size_t unit_size = sizeof(typename Alloc::value_type);
__TBBMALLOC_ASSERT( 0 == bytes%unit_size, nullptr);
void *ptr;
#if TBB_USE_EXCEPTIONS
try {
#endif
ptr = self.my_alloc.allocate( bytes/unit_size );
#if TBB_USE_EXCEPTIONS
} catch(...) {
return nullptr;
}
#endif
return ptr;
}
#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
// Workaround for erroneous "unreachable code" warning in the template below.
// Specific for VC++ 17-18 compiler
#pragma warning (push)
#pragma warning (disable: 4702)
#endif
template <typename Alloc>
int memory_pool<Alloc>::deallocate_request(intptr_t pool_id, void* raw_ptr, size_t raw_bytes) {
memory_pool<Alloc> &self = *reinterpret_cast<memory_pool<Alloc>*>(pool_id);
const size_t unit_size = sizeof(typename Alloc::value_type);
__TBBMALLOC_ASSERT( 0 == raw_bytes%unit_size, nullptr);
self.my_alloc.deallocate( static_cast<typename Alloc::value_type*>(raw_ptr), raw_bytes/unit_size );
return 0;
}
#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
#pragma warning (pop)
#endif
inline fixed_pool::fixed_pool(void *buf, size_t size) : my_buffer(buf), my_size(size) {
if (!buf || !size)
// TODO: improve support for mode with exceptions disabled
throw_exception(std::invalid_argument("Zero in parameter is invalid"));
rml::MemPoolPolicy args(allocate_request, nullptr, size, /*fixedPool=*/true);
rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool);
if (res!=rml::POOL_OK)
throw_exception(std::runtime_error("Can't create pool"));
}
inline void *fixed_pool::allocate_request(intptr_t pool_id, size_t & bytes) {
fixed_pool &self = *reinterpret_cast<fixed_pool*>(pool_id);
__TBBMALLOC_ASSERT(0 != self.my_size, "The buffer must not be used twice.");
bytes = self.my_size;
self.my_size = 0; // remember that buffer has been used
return self.my_buffer;
}
} // namespace d1
} // namespace detail
inline namespace v1 {
using detail::d1::memory_pool_allocator;
using detail::d1::memory_pool;
using detail::d1::fixed_pool;
} // inline namespace v1
} // namespace tbb
#undef __TBBMALLOC_ASSERT
#endif // __TBB_memory_pool_H
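For orientation, here is a hedged usage sketch of the allocator defined above, in the style of the stock oneTBB memory-pool example: a standard container routes its allocations through a growable pool. The vendored include path and the assumption that the preview macro behaves as upstream are mine, not part of this diff.

#define TBB_PREVIEW_MEMORY_POOL 1
#include "third_party/tbb/memory_pool.hh"  // vendored path (assumption)
#include <cstdio>
#include <list>
#include <memory>

int main() {
    // Growable pool whose backing memory comes from std::allocator<char>.
    tbb::memory_pool<std::allocator<char>> pool;
    // Bind an allocator to the pool and hand it to a standard container.
    tbb::memory_pool_allocator<int> alloc(pool);
    std::list<int, tbb::memory_pool_allocator<int>> l(alloc);
    for (int i = 0; i < 8; ++i) l.push_back(i);
    std::printf("%zu nodes allocated from the pool\n", l.size());
    return 0;
}   // the list is destroyed first; the pool then releases all of its memory at once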

176
third_party/tbb/misc.cc vendored Normal file

@@ -0,0 +1,176 @@
// clang-format off
/*
Copyright (c) 2005-2021 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Source file for miscellaneous entities that are infrequently referenced by
// an executing program.
#include "third_party/tbb/detail/_exception.hh"
#include "third_party/tbb/detail/_machine.hh"
#include "third_party/tbb/version.hh"
#include "third_party/tbb/misc.hh"
#include "third_party/tbb/governor.hh"
#include "third_party/tbb/assert_impl.hh" // Out-of-line TBB assertion handling routines are instantiated here.
#include "third_party/tbb/concurrent_monitor_mutex.hh"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/cstdlib"
#include "third_party/libcxx/stdexcept"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/cstdarg"
#if _WIN32||_WIN64
#include "libc/nt/accounting.h"
#include "libc/nt/automation.h"
#include "libc/nt/console.h"
#include "libc/nt/debug.h"
#include "libc/nt/dll.h"
#include "libc/nt/enum/keyaccess.h"
#include "libc/nt/enum/regtype.h"
#include "libc/nt/errors.h"
#include "libc/nt/events.h"
#include "libc/nt/files.h"
#include "libc/nt/ipc.h"
#include "libc/nt/memory.h"
#include "libc/nt/paint.h"
#include "libc/nt/process.h"
#include "libc/nt/registry.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/windows.h"
#include "libc/nt/winsock.h"
#endif
#if !_WIN32
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h" // sysconf(_SC_PAGESIZE)
#endif
namespace tbb {
namespace detail {
namespace r1 {
//------------------------------------------------------------------------
// governor data
//------------------------------------------------------------------------
cpu_features_type governor::cpu_features;
//------------------------------------------------------------------------
// concurrent_monitor_mutex data
//------------------------------------------------------------------------
#if !__TBB_USE_FUTEX
std::mutex concurrent_monitor_mutex::my_init_mutex;
#endif
size_t DefaultSystemPageSize() {
#if _WIN32
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
return sysconf(_SC_PAGESIZE);
#endif
}
/** The leading "\0" is here so that applying "strings" to the binary delivers a clean result. */
static const char VersionString[] = "\0" TBB_VERSION_STRINGS;
static bool PrintVersionFlag = false;
void PrintVersion() {
PrintVersionFlag = true;
std::fputs(VersionString+1,stderr);
}
void PrintExtraVersionInfo( const char* category, const char* format, ... ) {
if( PrintVersionFlag ) {
char str[1024]; std::memset(str, 0, 1024);
va_list args; va_start(args, format);
// Note: correct vsnprintf definition obtained from tbb_assert_impl.h
std::vsnprintf( str, 1024-1, format, args);
va_end(args);
std::fprintf(stderr, "oneTBB: %s\t%s\n", category, str );
}
}
//! check for transaction support.
#if _MSC_VER
// MISSING #include <intrin.h> // for __cpuid
#endif
#if __TBB_x86_32 || __TBB_x86_64
void check_cpuid(int leaf, int sub_leaf, int registers[4]) {
#if _MSC_VER
__cpuidex(registers, leaf, sub_leaf);
#else
int reg_eax = 0;
int reg_ebx = 0;
int reg_ecx = 0;
int reg_edx = 0;
#if __TBB_x86_32 && __PIC__
// With position-independent code on 32-bit systems, GCC reserves EBX for the GOT pointer,
// so we save it in ESI around the cpuid instruction and restore it afterwards.
__asm__("mov %%ebx, %%esi\n\t"
"cpuid\n\t"
"xchg %%ebx, %%esi"
: "=a"(reg_eax), "=S"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx)
: "0"(leaf), "2"(sub_leaf) // read value from eax and ecx
);
#else
__asm__("cpuid"
: "=a"(reg_eax), "=b"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx)
: "0"(leaf), "2"(sub_leaf) // read value from eax and ecx
);
#endif
registers[0] = reg_eax;
registers[1] = reg_ebx;
registers[2] = reg_ecx;
registers[3] = reg_edx;
#endif
}
#endif
void detect_cpu_features(cpu_features_type& cpu_features) {
suppress_unused_warning(cpu_features);
#if __TBB_x86_32 || __TBB_x86_64
const int rtm_ebx_mask = 1 << 11;
const int waitpkg_ecx_mask = 1 << 5;
int registers[4] = {0};
// Check RTM and WAITPKG
check_cpuid(7, 0, registers);
cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0;
cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0;
#endif /* (__TBB_x86_32 || __TBB_x86_64) */
}
} // namespace r1
} // namespace detail
} // namespace tbb
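detect_cpu_features probes CPUID leaf 7, sub-leaf 0: RTM is bit 11 of EBX and WAITPKG is bit 5 of ECX. A standalone x86-only sketch of the same probe, using GCC/Clang's <cpuid.h> helper instead of the hand-written assembly above (the toolchain header is an assumption, not part of this diff):

#include <cpuid.h>
#include <cstdio>

int main() {
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    // __get_cpuid_count returns 0 if leaf 7 is not supported by the CPU.
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        std::printf("rtm=%u waitpkg=%u\n", (ebx >> 11) & 1u, (ecx >> 5) & 1u);
    }
    return 0;
}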

298
third_party/tbb/misc.hh vendored Normal file

@@ -0,0 +1,298 @@
// clang-format off
/*
Copyright (c) 2005-2022 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef _TBB_tbb_misc_H
#define _TBB_tbb_misc_H
#include "third_party/tbb/detail/_config.hh"
#include "third_party/tbb/detail/_assert.hh"
#include "third_party/tbb/detail/_utils.hh"
#if __TBB_ARENA_BINDING
#include "third_party/tbb/info.hh"
#endif /*__TBB_ARENA_BINDING*/
#if __unix__
#include "libc/intrin/newbie.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/rlimit.h"
#include "libc/calls/struct/rusage.h"
#include "libc/calls/sysparam.h"
#include "libc/calls/weirdtypes.h"
#include "libc/limits.h"
#include "libc/sysv/consts/endian.h"
#include "libc/sysv/consts/prio.h"
#include "libc/sysv/consts/rlim.h"
#include "libc/sysv/consts/rlimit.h"
#include "libc/sysv/consts/rusage.h" // __FreeBSD_version
#if __FreeBSD_version >= 701000
// MISSING #include <sys/cpuset.h>
#endif
#endif
#include "third_party/libcxx/atomic"
// Does the operating system have a system call to pin a thread to a set of OS processors?
#define __TBB_OS_AFFINITY_SYSCALL_PRESENT ((__linux__ && !__ANDROID__) || (__FreeBSD_version >= 701000))
// On IBM* Blue Gene* CNK nodes, the affinity API has restrictions that prevent its usability for TBB,
// and also sysconf(_SC_NPROCESSORS_ONLN) already takes process affinity into account.
#define __TBB_USE_OS_AFFINITY_SYSCALL (__TBB_OS_AFFINITY_SYSCALL_PRESENT && !__bg__)
namespace tbb {
namespace detail {
namespace r1 {
void runtime_warning(const char* format, ... );
#if __TBB_ARENA_BINDING
class task_arena;
class task_scheduler_observer;
#endif /*__TBB_ARENA_BINDING*/
const std::size_t MByte = 1024*1024;
#if __TBB_USE_WINAPI
// The Microsoft Documentation about Thread Stack Size states that
// "The default stack reservation size used by the linker is 1 MB"
const std::size_t ThreadStackSize = 1*MByte;
#else
const std::size_t ThreadStackSize = (sizeof(uintptr_t) <= 4 ? 2 : 4 )*MByte;
#endif
#ifndef __TBB_HardwareConcurrency
//! Returns maximal parallelism level supported by the current OS configuration.
int AvailableHwConcurrency();
#else
inline int AvailableHwConcurrency() {
int n = __TBB_HardwareConcurrency();
return n > 0 ? n : 1; // Fail safety strap
}
#endif /* __TBB_HardwareConcurrency */
//! Returns OS regular memory page size
size_t DefaultSystemPageSize();
//! Returns number of processor groups in the current OS configuration.
/** AvailableHwConcurrency must be called at least once before calling this method. **/
int NumberOfProcessorGroups();
#if _WIN32||_WIN64
//! Retrieves index of processor group containing processor with the given index
int FindProcessorGroupIndex ( int processorIndex );
//! Affinitizes the thread to the specified processor group
void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex );
#endif /* _WIN32||_WIN64 */
//! Prints TBB version information on stderr
void PrintVersion();
//! Prints arbitrary extra TBB version information on stderr
void PrintExtraVersionInfo( const char* category, const char* format, ... );
//! A callback routine to print RML version information on stderr
void PrintRMLVersionInfo( void* arg, const char* server_info );
// For TBB compilation only; not to be used in public headers
#if defined(min) || defined(max)
#undef min
#undef max
#endif
//! Utility template function returning lesser of the two values.
/** Provided here to avoid including the not strictly safe <algorithm> header.\n
In case the operands cause signed/unsigned or size mismatch warnings, it is the caller's
responsibility to do the appropriate cast before calling the function. **/
template<typename T>
T min ( const T& val1, const T& val2 ) {
return val1 < val2 ? val1 : val2;
}
//! Utility template function returning greater of the two values.
/** Provided here to avoid including the not strictly safe <algorithm> header.\n
In case the operands cause signed/unsigned or size mismatch warnings, it is the caller's
responsibility to do the appropriate cast before calling the function. **/
template<typename T>
T max ( const T& val1, const T& val2 ) {
return val1 < val2 ? val2 : val1;
}
//! Utility helper structure to ease overload resolution
template<int > struct int_to_type {};
//------------------------------------------------------------------------
// FastRandom
//------------------------------------------------------------------------
//! A fast random number generator.
/** Uses linear congruential method. */
class FastRandom {
private:
unsigned x, c;
static const unsigned a = 0x9e3779b1; // a big prime number
public:
//! Get a random number.
unsigned short get() {
return get(x);
}
//! Get a random number for the given seed; update the seed for next use.
unsigned short get( unsigned& seed ) {
unsigned short r = (unsigned short)(seed>>16);
__TBB_ASSERT(c&1, "c must be odd for big rng period");
seed = seed*a+c;
return r;
}
//! Construct a random number generator.
FastRandom( void* unique_ptr ) { init(uintptr_t(unique_ptr)); }
template <typename T>
void init( T seed ) {
init(seed,int_to_type<sizeof(seed)>());
}
void init( uint64_t seed , int_to_type<8> ) {
init(uint32_t((seed>>32)+seed), int_to_type<4>());
}
void init( uint32_t seed, int_to_type<4> ) {
// threads use different seeds for unique sequences
c = (seed|1)*0xba5703f5; // c must be odd, shuffle by a prime number
x = c^(seed>>1); // also shuffle x for the first get() invocation
}
};
//------------------------------------------------------------------------
// Atomic extensions
//------------------------------------------------------------------------
//! Atomically replaces the value of dst with newValue if the pair satisfies the compare predicate
/** Return value semantics are the same as for CAS. **/
template<typename T1, class Pred>
T1 atomic_update(std::atomic<T1>& dst, T1 newValue, Pred compare) {
T1 oldValue = dst.load(std::memory_order_acquire);
while ( compare(oldValue, newValue) ) {
if ( dst.compare_exchange_strong(oldValue, newValue) )
break;
}
return oldValue;
}
#if __TBB_USE_OS_AFFINITY_SYSCALL
#if __linux__
typedef cpu_set_t basic_mask_t;
#elif __FreeBSD_version >= 701000
typedef cpuset_t basic_mask_t;
#else
#error affinity_helper is not implemented in this OS
#endif
class affinity_helper : no_copy {
basic_mask_t* threadMask;
int is_changed;
public:
affinity_helper() : threadMask(nullptr), is_changed(0) {}
~affinity_helper();
void protect_affinity_mask( bool restore_process_mask );
void dismiss();
};
void destroy_process_mask();
#else
class affinity_helper : no_copy {
public:
void protect_affinity_mask( bool ) {}
};
inline void destroy_process_mask(){}
#endif /* __TBB_USE_OS_AFFINITY_SYSCALL */
struct cpu_features_type {
bool rtm_enabled{false};
bool waitpkg_enabled{false};
};
void detect_cpu_features(cpu_features_type& cpu_features);
#if __TBB_ARENA_BINDING
class binding_handler;
binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core);
void destroy_binding_handler(binding_handler* handler_ptr);
void apply_affinity_mask(binding_handler* handler_ptr, int slot_num);
void restore_affinity_mask(binding_handler* handler_ptr, int slot_num);
#endif /*__TBB_ARENA_BINDING*/
// RTM specific section
// abort code for mutexes that detect a conflict with another thread.
enum {
speculation_not_supported = 0x00,
speculation_transaction_aborted = 0x01,
speculation_can_retry = 0x02,
speculation_memadd_conflict = 0x04,
speculation_buffer_overflow = 0x08,
speculation_breakpoint_hit = 0x10,
speculation_nested_abort = 0x20,
speculation_xabort_mask = 0xFF000000,
speculation_xabort_shift = 24,
speculation_xabort_not_free = 0xFF, // The value (0xFF) below comes from the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual 12.4.5 lock not free
speculation_successful_begin = 0xFFFFFFFF,
speculation_retry = speculation_transaction_aborted
| speculation_can_retry
| speculation_memadd_conflict
};
// We suppose that successful transactions are sequentially ordered and
// do not require additional memory fences around them.
// Technically this can be achieved only if xbegin has implicit
// acquire memory semantics and xend/xabort have release memory semantics at both the compiler and hardware level.
// See the article: https://arxiv.org/pdf/1710.04839.pdf
static inline unsigned int begin_transaction() {
#if __TBB_TSX_INTRINSICS_PRESENT
return _xbegin();
#else
return speculation_not_supported; // return unsuccessful code
#endif
}
static inline void end_transaction() {
#if __TBB_TSX_INTRINSICS_PRESENT
_xend();
#endif
}
static inline void abort_transaction() {
#if __TBB_TSX_INTRINSICS_PRESENT
_xabort(speculation_xabort_not_free);
#endif
}
#if TBB_USE_ASSERT
static inline unsigned char is_in_transaction() {
#if __TBB_TSX_INTRINSICS_PRESENT
return _xtest();
#else
return 0;
#endif
}
#endif // TBB_USE_ASSERT
} // namespace r1
} // namespace detail
} // namespace tbb
#endif /* _TBB_tbb_misc_H */
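The atomic_update helper declared in this header is a compare-and-swap loop that stores the new value only while the predicate holds. A self-contained sketch follows; the helper is copied locally so the example compiles on its own, and the high-water-mark use case is illustrative, not taken from TBB.

#include <atomic>
#include <cstdio>

// Local copy of the helper above, for a standalone example.
template <typename T, class Pred>
T atomic_update(std::atomic<T>& dst, T newValue, Pred compare) {
    T oldValue = dst.load(std::memory_order_acquire);
    while (compare(oldValue, newValue)) {
        // On failure, compare_exchange_strong refreshes oldValue and the predicate is re-checked.
        if (dst.compare_exchange_strong(oldValue, newValue)) break;
    }
    return oldValue;
}

int main() {
    std::atomic<int> high_water{10};
    // Raise the shared high-water mark only if the candidate is larger.
    atomic_update(high_water, 42, [](int current, int candidate) { return current < candidate; });
    std::printf("%d\n", high_water.load());  // prints 42
    return 0;
}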

Some files were not shown because too many files have changed in this diff.