From 7debaaf365240f17d63a7cf889c6747d0b7d1552 Mon Sep 17 00:00:00 2001 From: Farid Zakaria Date: Sat, 24 Jun 2023 23:17:56 +0000 Subject: [PATCH] introduce tbb library --- Makefile | 1 + third_party/tbb/README.cosmo | 17 + third_party/tbb/address_waiter.cpp | 107 + third_party/tbb/allocator.cpp | 314 ++ third_party/tbb/arena.cpp | 858 +++++ third_party/tbb/arena.h | 511 +++ third_party/tbb/arena_slot.cpp | 219 ++ third_party/tbb/arena_slot.h | 415 ++ third_party/tbb/assert_impl.h | 98 + third_party/tbb/blocked_range.h | 171 + third_party/tbb/blocked_range2d.h | 112 + third_party/tbb/blocked_range3d.h | 131 + third_party/tbb/blocked_rangeNd.h | 148 + third_party/tbb/cache_aligned_allocator.h | 190 + third_party/tbb/cancellation_disseminator.h | 86 + third_party/tbb/co_context.h | 428 +++ third_party/tbb/collaborative_call_once.h | 236 ++ third_party/tbb/combinable.h | 70 + third_party/tbb/concurrent_bounded_queue.cpp | 85 + third_party/tbb/concurrent_hash_map.h | 1665 ++++++++ third_party/tbb/concurrent_lru_cache.h | 375 ++ third_party/tbb/concurrent_map.h | 351 ++ third_party/tbb/concurrent_monitor.h | 489 +++ third_party/tbb/concurrent_monitor_mutex.h | 114 + third_party/tbb/concurrent_priority_queue.h | 491 +++ third_party/tbb/concurrent_queue.h | 701 ++++ third_party/tbb/concurrent_set.h | 268 ++ third_party/tbb/concurrent_unordered_map.h | 415 ++ third_party/tbb/concurrent_unordered_set.h | 334 ++ third_party/tbb/concurrent_vector.h | 1130 ++++++ third_party/tbb/detail/_aggregator.h | 177 + third_party/tbb/detail/_aligned_space.h | 47 + third_party/tbb/detail/_allocator_traits.h | 108 + third_party/tbb/detail/_assert.h | 65 + third_party/tbb/detail/_attach.h | 33 + .../tbb/detail/_concurrent_queue_base.h | 651 ++++ .../tbb/detail/_concurrent_skip_list.h | 1291 +++++++ .../tbb/detail/_concurrent_unordered_base.h | 1515 ++++++++ third_party/tbb/detail/_config.h | 530 +++ third_party/tbb/detail/_containers_helpers.h | 68 + third_party/tbb/detail/_exception.h | 89 + third_party/tbb/detail/_export.h | 47 + .../tbb/detail/_flow_graph_body_impl.h | 386 ++ .../tbb/detail/_flow_graph_cache_impl.h | 435 +++ third_party/tbb/detail/_flow_graph_impl.h | 477 +++ .../tbb/detail/_flow_graph_indexer_impl.h | 352 ++ .../tbb/detail/_flow_graph_item_buffer_impl.h | 280 ++ .../tbb/detail/_flow_graph_join_impl.h | 1709 +++++++++ .../tbb/detail/_flow_graph_node_impl.h | 775 ++++ .../tbb/detail/_flow_graph_node_set_impl.h | 266 ++ .../tbb/detail/_flow_graph_nodes_deduction.h | 278 ++ .../detail/_flow_graph_tagged_buffer_impl.h | 258 ++ .../tbb/detail/_flow_graph_trace_impl.h | 365 ++ .../tbb/detail/_flow_graph_types_impl.h | 408 ++ third_party/tbb/detail/_hash_compare.h | 148 + third_party/tbb/detail/_intrusive_list_node.h | 42 + third_party/tbb/detail/_machine.h | 397 ++ third_party/tbb/detail/_mutex_common.h | 62 + third_party/tbb/detail/_namespace_injection.h | 25 + third_party/tbb/detail/_node_handle.h | 163 + third_party/tbb/detail/_pipeline_filters.h | 456 +++ .../tbb/detail/_pipeline_filters_deduction.h | 47 + third_party/tbb/detail/_range_common.h | 131 + third_party/tbb/detail/_rtm_mutex.h | 163 + third_party/tbb/detail/_rtm_rw_mutex.h | 216 ++ third_party/tbb/detail/_scoped_lock.h | 175 + third_party/tbb/detail/_segment_table.h | 567 +++ third_party/tbb/detail/_small_object_pool.h | 109 + third_party/tbb/detail/_string_resource.h | 79 + third_party/tbb/detail/_task.h | 233 ++ third_party/tbb/detail/_task_handle.h | 123 + third_party/tbb/detail/_template_helpers.h | 404 ++ 
third_party/tbb/detail/_utils.h | 394 ++ third_party/tbb/detail/_waitable_atomic.h | 105 + third_party/tbb/dynamic_link.cpp | 516 +++ third_party/tbb/dynamic_link.h | 137 + third_party/tbb/enumerable_thread_specific.h | 1135 ++++++ third_party/tbb/environment.h | 82 + third_party/tbb/exception.cpp | 167 + third_party/tbb/flow_graph.h | 3377 +++++++++++++++++ third_party/tbb/flow_graph_abstractions.h | 52 + third_party/tbb/global_control.cpp | 281 ++ third_party/tbb/global_control.h | 201 + third_party/tbb/governor.cpp | 580 +++ third_party/tbb/governor.h | 157 + third_party/tbb/info.h | 126 + third_party/tbb/intrusive_list.h | 234 ++ third_party/tbb/itt_notify.cpp | 70 + third_party/tbb/itt_notify.h | 118 + third_party/tbb/mailbox.h | 247 ++ third_party/tbb/main.cpp | 172 + third_party/tbb/main.h | 100 + third_party/tbb/market.cpp | 140 + third_party/tbb/market.h | 79 + third_party/tbb/memory_pool.h | 273 ++ third_party/tbb/misc.cpp | 176 + third_party/tbb/misc.h | 298 ++ third_party/tbb/misc_ex.cpp | 457 +++ third_party/tbb/mutex.h | 94 + third_party/tbb/null_mutex.h | 81 + third_party/tbb/null_rw_mutex.h | 88 + third_party/tbb/observer_proxy.cpp | 320 ++ third_party/tbb/observer_proxy.h | 153 + third_party/tbb/parallel_for.h | 470 +++ third_party/tbb/parallel_for_each.h | 682 ++++ third_party/tbb/parallel_invoke.h | 228 ++ third_party/tbb/parallel_pipeline.cpp | 472 +++ third_party/tbb/parallel_pipeline.h | 154 + third_party/tbb/parallel_reduce.h | 772 ++++ third_party/tbb/parallel_scan.h | 631 +++ third_party/tbb/parallel_sort.h | 289 ++ third_party/tbb/partitioner.h | 682 ++++ third_party/tbb/permit_manager.h | 61 + third_party/tbb/pm_client.h | 71 + third_party/tbb/private_server.cpp | 437 +++ third_party/tbb/profiling.cpp | 268 ++ third_party/tbb/profiling.h | 259 ++ third_party/tbb/queuing_mutex.h | 193 + third_party/tbb/queuing_rw_mutex.cpp | 618 +++ third_party/tbb/queuing_rw_mutex.h | 208 + third_party/tbb/rml_base.h | 182 + third_party/tbb/rml_tbb.cpp | 113 + third_party/tbb/rml_tbb.h | 95 + third_party/tbb/rml_thread_monitor.h | 277 ++ third_party/tbb/rtm_mutex.cpp | 122 + third_party/tbb/rtm_rw_mutex.cpp | 272 ++ third_party/tbb/rw_mutex.h | 217 ++ third_party/tbb/scalable_allocator.h | 338 ++ third_party/tbb/scheduler_common.h | 599 +++ third_party/tbb/semaphore.cpp | 93 + third_party/tbb/semaphore.h | 331 ++ third_party/tbb/small_object_pool.cpp | 155 + third_party/tbb/small_object_pool_impl.h | 60 + third_party/tbb/spin_mutex.h | 135 + third_party/tbb/spin_rw_mutex.h | 230 ++ third_party/tbb/task.cpp | 228 ++ third_party/tbb/task.h | 38 + third_party/tbb/task_arena.h | 500 +++ third_party/tbb/task_dispatcher.cpp | 245 ++ third_party/tbb/task_dispatcher.h | 469 +++ third_party/tbb/task_group.h | 747 ++++ third_party/tbb/task_group_context.cpp | 359 ++ third_party/tbb/task_scheduler_observer.h | 117 + third_party/tbb/task_stream.h | 287 ++ third_party/tbb/tbb.h | 75 + third_party/tbb/tbb.mk | 43 + third_party/tbb/tbb.rc | 75 + third_party/tbb/tbb_allocator.h | 127 + third_party/tbb/tbbmalloc_proxy.h | 66 + third_party/tbb/thread_control_monitor.h | 117 + third_party/tbb/thread_data.h | 260 ++ third_party/tbb/thread_dispatcher.cpp | 225 ++ third_party/tbb/thread_dispatcher.h | 107 + third_party/tbb/thread_dispatcher_client.h | 65 + third_party/tbb/thread_request_serializer.cpp | 139 + third_party/tbb/thread_request_serializer.h | 83 + third_party/tbb/threading_control.cpp | 392 ++ third_party/tbb/threading_control.h | 153 + third_party/tbb/threading_control_client.h | 59 + 
third_party/tbb/tick_count.h | 100 + third_party/tbb/tls.h | 103 + third_party/tbb/version.cpp | 27 + third_party/tbb/version.h | 115 + third_party/tbb/waiters.h | 202 + third_party/third_party.mk | 1 + 165 files changed, 50328 insertions(+) create mode 100644 third_party/tbb/README.cosmo create mode 100644 third_party/tbb/address_waiter.cpp create mode 100644 third_party/tbb/allocator.cpp create mode 100644 third_party/tbb/arena.cpp create mode 100644 third_party/tbb/arena.h create mode 100644 third_party/tbb/arena_slot.cpp create mode 100644 third_party/tbb/arena_slot.h create mode 100644 third_party/tbb/assert_impl.h create mode 100644 third_party/tbb/blocked_range.h create mode 100644 third_party/tbb/blocked_range2d.h create mode 100644 third_party/tbb/blocked_range3d.h create mode 100644 third_party/tbb/blocked_rangeNd.h create mode 100644 third_party/tbb/cache_aligned_allocator.h create mode 100644 third_party/tbb/cancellation_disseminator.h create mode 100644 third_party/tbb/co_context.h create mode 100644 third_party/tbb/collaborative_call_once.h create mode 100644 third_party/tbb/combinable.h create mode 100644 third_party/tbb/concurrent_bounded_queue.cpp create mode 100644 third_party/tbb/concurrent_hash_map.h create mode 100644 third_party/tbb/concurrent_lru_cache.h create mode 100644 third_party/tbb/concurrent_map.h create mode 100644 third_party/tbb/concurrent_monitor.h create mode 100644 third_party/tbb/concurrent_monitor_mutex.h create mode 100644 third_party/tbb/concurrent_priority_queue.h create mode 100644 third_party/tbb/concurrent_queue.h create mode 100644 third_party/tbb/concurrent_set.h create mode 100644 third_party/tbb/concurrent_unordered_map.h create mode 100644 third_party/tbb/concurrent_unordered_set.h create mode 100644 third_party/tbb/concurrent_vector.h create mode 100644 third_party/tbb/detail/_aggregator.h create mode 100644 third_party/tbb/detail/_aligned_space.h create mode 100644 third_party/tbb/detail/_allocator_traits.h create mode 100644 third_party/tbb/detail/_assert.h create mode 100644 third_party/tbb/detail/_attach.h create mode 100644 third_party/tbb/detail/_concurrent_queue_base.h create mode 100644 third_party/tbb/detail/_concurrent_skip_list.h create mode 100644 third_party/tbb/detail/_concurrent_unordered_base.h create mode 100644 third_party/tbb/detail/_config.h create mode 100644 third_party/tbb/detail/_containers_helpers.h create mode 100644 third_party/tbb/detail/_exception.h create mode 100644 third_party/tbb/detail/_export.h create mode 100644 third_party/tbb/detail/_flow_graph_body_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_cache_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_indexer_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_item_buffer_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_join_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_node_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_node_set_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_nodes_deduction.h create mode 100644 third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_trace_impl.h create mode 100644 third_party/tbb/detail/_flow_graph_types_impl.h create mode 100644 third_party/tbb/detail/_hash_compare.h create mode 100644 third_party/tbb/detail/_intrusive_list_node.h create mode 100644 third_party/tbb/detail/_machine.h create mode 100644 
third_party/tbb/detail/_mutex_common.h create mode 100644 third_party/tbb/detail/_namespace_injection.h create mode 100644 third_party/tbb/detail/_node_handle.h create mode 100644 third_party/tbb/detail/_pipeline_filters.h create mode 100644 third_party/tbb/detail/_pipeline_filters_deduction.h create mode 100644 third_party/tbb/detail/_range_common.h create mode 100644 third_party/tbb/detail/_rtm_mutex.h create mode 100644 third_party/tbb/detail/_rtm_rw_mutex.h create mode 100644 third_party/tbb/detail/_scoped_lock.h create mode 100644 third_party/tbb/detail/_segment_table.h create mode 100644 third_party/tbb/detail/_small_object_pool.h create mode 100644 third_party/tbb/detail/_string_resource.h create mode 100644 third_party/tbb/detail/_task.h create mode 100644 third_party/tbb/detail/_task_handle.h create mode 100644 third_party/tbb/detail/_template_helpers.h create mode 100644 third_party/tbb/detail/_utils.h create mode 100644 third_party/tbb/detail/_waitable_atomic.h create mode 100644 third_party/tbb/dynamic_link.cpp create mode 100644 third_party/tbb/dynamic_link.h create mode 100644 third_party/tbb/enumerable_thread_specific.h create mode 100644 third_party/tbb/environment.h create mode 100644 third_party/tbb/exception.cpp create mode 100644 third_party/tbb/flow_graph.h create mode 100644 third_party/tbb/flow_graph_abstractions.h create mode 100644 third_party/tbb/global_control.cpp create mode 100644 third_party/tbb/global_control.h create mode 100644 third_party/tbb/governor.cpp create mode 100644 third_party/tbb/governor.h create mode 100644 third_party/tbb/info.h create mode 100644 third_party/tbb/intrusive_list.h create mode 100644 third_party/tbb/itt_notify.cpp create mode 100644 third_party/tbb/itt_notify.h create mode 100644 third_party/tbb/mailbox.h create mode 100644 third_party/tbb/main.cpp create mode 100644 third_party/tbb/main.h create mode 100644 third_party/tbb/market.cpp create mode 100644 third_party/tbb/market.h create mode 100644 third_party/tbb/memory_pool.h create mode 100644 third_party/tbb/misc.cpp create mode 100644 third_party/tbb/misc.h create mode 100644 third_party/tbb/misc_ex.cpp create mode 100644 third_party/tbb/mutex.h create mode 100644 third_party/tbb/null_mutex.h create mode 100644 third_party/tbb/null_rw_mutex.h create mode 100644 third_party/tbb/observer_proxy.cpp create mode 100644 third_party/tbb/observer_proxy.h create mode 100644 third_party/tbb/parallel_for.h create mode 100644 third_party/tbb/parallel_for_each.h create mode 100644 third_party/tbb/parallel_invoke.h create mode 100644 third_party/tbb/parallel_pipeline.cpp create mode 100644 third_party/tbb/parallel_pipeline.h create mode 100644 third_party/tbb/parallel_reduce.h create mode 100644 third_party/tbb/parallel_scan.h create mode 100644 third_party/tbb/parallel_sort.h create mode 100644 third_party/tbb/partitioner.h create mode 100644 third_party/tbb/permit_manager.h create mode 100644 third_party/tbb/pm_client.h create mode 100644 third_party/tbb/private_server.cpp create mode 100644 third_party/tbb/profiling.cpp create mode 100644 third_party/tbb/profiling.h create mode 100644 third_party/tbb/queuing_mutex.h create mode 100644 third_party/tbb/queuing_rw_mutex.cpp create mode 100644 third_party/tbb/queuing_rw_mutex.h create mode 100644 third_party/tbb/rml_base.h create mode 100644 third_party/tbb/rml_tbb.cpp create mode 100644 third_party/tbb/rml_tbb.h create mode 100644 third_party/tbb/rml_thread_monitor.h create mode 100644 third_party/tbb/rtm_mutex.cpp create mode 100644 
third_party/tbb/rtm_rw_mutex.cpp create mode 100644 third_party/tbb/rw_mutex.h create mode 100644 third_party/tbb/scalable_allocator.h create mode 100644 third_party/tbb/scheduler_common.h create mode 100644 third_party/tbb/semaphore.cpp create mode 100644 third_party/tbb/semaphore.h create mode 100644 third_party/tbb/small_object_pool.cpp create mode 100644 third_party/tbb/small_object_pool_impl.h create mode 100644 third_party/tbb/spin_mutex.h create mode 100644 third_party/tbb/spin_rw_mutex.h create mode 100644 third_party/tbb/task.cpp create mode 100644 third_party/tbb/task.h create mode 100644 third_party/tbb/task_arena.h create mode 100644 third_party/tbb/task_dispatcher.cpp create mode 100644 third_party/tbb/task_dispatcher.h create mode 100644 third_party/tbb/task_group.h create mode 100644 third_party/tbb/task_group_context.cpp create mode 100644 third_party/tbb/task_scheduler_observer.h create mode 100644 third_party/tbb/task_stream.h create mode 100644 third_party/tbb/tbb.h create mode 100644 third_party/tbb/tbb.mk create mode 100644 third_party/tbb/tbb.rc create mode 100644 third_party/tbb/tbb_allocator.h create mode 100644 third_party/tbb/tbbmalloc_proxy.h create mode 100644 third_party/tbb/thread_control_monitor.h create mode 100644 third_party/tbb/thread_data.h create mode 100644 third_party/tbb/thread_dispatcher.cpp create mode 100644 third_party/tbb/thread_dispatcher.h create mode 100644 third_party/tbb/thread_dispatcher_client.h create mode 100644 third_party/tbb/thread_request_serializer.cpp create mode 100644 third_party/tbb/thread_request_serializer.h create mode 100644 third_party/tbb/threading_control.cpp create mode 100644 third_party/tbb/threading_control.h create mode 100644 third_party/tbb/threading_control_client.h create mode 100644 third_party/tbb/tick_count.h create mode 100644 third_party/tbb/tls.h create mode 100644 third_party/tbb/version.cpp create mode 100644 third_party/tbb/version.h create mode 100644 third_party/tbb/waiters.h diff --git a/Makefile b/Makefile index d5eba45b3..dda487114 100644 --- a/Makefile +++ b/Makefile @@ -182,6 +182,7 @@ include net/finger/finger.mk include third_party/double-conversion/test/test.mk include third_party/lua/lua.mk include third_party/tr/tr.mk +include third_party/tbb/tbb.mk include third_party/sed/sed.mk include third_party/awk/awk.mk include third_party/hiredis/hiredis.mk diff --git a/third_party/tbb/README.cosmo b/third_party/tbb/README.cosmo new file mode 100644 index 000000000..05e9b3008 --- /dev/null +++ b/third_party/tbb/README.cosmo @@ -0,0 +1,17 @@ +// clang-format off +DESCRIPTION + + oneAPI Threading Building Blocks (oneTBB) + + oneTBB is a flexible C++ library that simplifies the work of adding parallelism to complex applications, + even if you are not a threading expert. + +SOURCE + + https://github.com/oneapi-src/oneTBB + + commit e813596ba3a1bee0ffa06fb66b5e30b7ea801319 + Author: Alexandra + Date: Wed Jun 21 18:46:54 2023 +0200 + + Documentation for std::invoke (#1112) diff --git a/third_party/tbb/address_waiter.cpp b/third_party/tbb/address_waiter.cpp new file mode 100644 index 000000000..0508f06f7 --- /dev/null +++ b/third_party/tbb/address_waiter.cpp @@ -0,0 +1,107 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/detail/_waitable_atomic.h" + +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace r1 { + +struct address_context { + address_context() = default; + + address_context(void* address, std::uintptr_t context) : + my_address(address), my_context(context) + {} + + void* my_address{nullptr}; + std::uintptr_t my_context{0}; +}; + +class address_waiter : public concurrent_monitor_base { + using base_type = concurrent_monitor_base; +public: + using base_type::base_type; + /** per-thread descriptor for concurrent_monitor */ + using thread_context = sleep_node; +}; + +// 1024 is a rough estimate based on two assumptions: +// 1) there are no more than 1000 threads in the application; +// 2) the mutexes are optimized for short critical sections less than a couple of microseconds, +// which is less than 1/1000 of a time slice. +// In the worst case, we have single mutex that is locked and its thread is preempted. +// Therefore, the probability of a collision while taking unrelated mutex is about 1/size of a table. +static constexpr std::size_t num_address_waiters = 2 << 10; +static_assert(std::is_standard_layout::value, + "address_waiter must be with standard layout"); +static address_waiter address_waiter_table[num_address_waiters]; + +void clear_address_waiter_table() { + for (std::size_t i = 0; i < num_address_waiters; ++i) { + address_waiter_table[i].destroy(); + } +} + +static address_waiter& get_address_waiter(void* address) { + std::uintptr_t tag = std::uintptr_t(address); + return address_waiter_table[((tag >> 5) ^ tag) % num_address_waiters]; +} + +void wait_on_address(void* address, d1::delegate_base& predicate, std::uintptr_t context) { + address_waiter& waiter = get_address_waiter(address); + waiter.wait(predicate, address_context{address, context}); +} + +void notify_by_address(void* address, std::uintptr_t target_context) { + address_waiter& waiter = get_address_waiter(address); + + auto predicate = [address, target_context] (address_context ctx) { + return ctx.my_address == address && ctx.my_context == target_context; + }; + + waiter.notify_relaxed(predicate); +} + +void notify_by_address_one(void* address) { + address_waiter& waiter = get_address_waiter(address); + + auto predicate = [address] (address_context ctx) { + return ctx.my_address == address; + }; + + waiter.notify_one_relaxed(predicate); +} + +void notify_by_address_all(void* address) { + address_waiter& waiter = get_address_waiter(address); + + auto predicate = [address] (address_context ctx) { + return ctx.my_address == address; + }; + + waiter.notify_relaxed(predicate); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/allocator.cpp b/third_party/tbb/allocator.cpp new file mode 100644 index 000000000..aec21f80e --- /dev/null +++ b/third_party/tbb/allocator.cpp @@ -0,0 +1,314 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache 
License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/version.h" + +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/tbb_allocator.h" // Is this OK? +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/tbb/dynamic_link.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/cstdlib" + +#ifdef _WIN32 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +#include "libc/runtime/dlfcn.h" +#endif + +#if (!defined(_WIN32) && !defined(_WIN64)) || defined(__CYGWIN__) +#include "libc/calls/calls.h" +#include "libc/calls/termios.h" +#include "libc/fmt/conv.h" +#include "libc/limits.h" +#include "libc/mem/alg.h" +#include "libc/mem/alloca.h" +#include "libc/mem/mem.h" +#include "libc/runtime/runtime.h" +#include "libc/stdio/dprintf.h" +#include "libc/stdio/rand.h" +#include "libc/stdio/temp.h" +#include "libc/str/str.h" +#include "libc/sysv/consts/exit.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/rand48.h" // posix_memalign, free +// With glibc, uClibc and musl on Linux and bionic on Android it is safe to use memalign(), as the allocated memory +// can be freed with free(). It is also better to use memalign() since posix_memalign() is just a wrapper on top of +// memalign() and it offers nothing but overhead due to inconvenient interface. This is likely the case with other +// standard libraries as well, and more libraries can be added to the preprocessor check below. Unfortunately, we +// can't detect musl, so we simply enable memalign() on Linux and Android in general. +#if defined(linux) || defined(__linux) || defined(__linux__) || defined(__ANDROID__) +#include "libc/mem/mem.h" // memalign +#define __TBB_USE_MEMALIGN +#else +#define __TBB_USE_POSIX_MEMALIGN +#endif +#elif defined(_MSC_VER) || defined(__MINGW32__) +#include "libc/mem/mem.h" // _aligned_malloc, _aligned_free +#define __TBB_USE_MSVC_ALIGNED_MALLOC +#endif + +#if __TBB_WEAK_SYMBOLS_PRESENT + +#pragma weak scalable_malloc +#pragma weak scalable_free +#pragma weak scalable_aligned_malloc +#pragma weak scalable_aligned_free + +extern "C" { + void* scalable_malloc(std::size_t); + void scalable_free(void*); + void* scalable_aligned_malloc(std::size_t, std::size_t); + void scalable_aligned_free(void*); +} + +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +namespace tbb { +namespace detail { +namespace r1 { + +//! 
Initialization routine used for first indirect call via allocate_handler. +static void* initialize_allocate_handler(std::size_t size); + +//! Handler for memory allocation +using allocate_handler_type = void* (*)(std::size_t size); +static std::atomic allocate_handler{ &initialize_allocate_handler }; +allocate_handler_type allocate_handler_unsafe = nullptr; + +//! Handler for memory deallocation +static void (*deallocate_handler)(void* pointer) = nullptr; + +//! Initialization routine used for first indirect call via cache_aligned_allocate_handler. +static void* initialize_cache_aligned_allocate_handler(std::size_t n, std::size_t alignment); + +//! Allocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available. +static void* std_cache_aligned_allocate(std::size_t n, std::size_t alignment); + +//! Deallocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available. +static void std_cache_aligned_deallocate(void* p); + +//! Handler for padded memory allocation +using cache_aligned_allocate_handler_type = void* (*)(std::size_t n, std::size_t alignment); +static std::atomic cache_aligned_allocate_handler{ &initialize_cache_aligned_allocate_handler }; +cache_aligned_allocate_handler_type cache_aligned_allocate_handler_unsafe = nullptr; + +//! Handler for padded memory deallocation +static void (*cache_aligned_deallocate_handler)(void* p) = nullptr; + +//! Table describing how to link the handlers. +static const dynamic_link_descriptor MallocLinkTable[] = { + DLD(scalable_malloc, allocate_handler_unsafe), + DLD(scalable_free, deallocate_handler), + DLD(scalable_aligned_malloc, cache_aligned_allocate_handler_unsafe), + DLD(scalable_aligned_free, cache_aligned_deallocate_handler), +}; + + +#if TBB_USE_DEBUG +#define DEBUG_SUFFIX "_debug" +#else +#define DEBUG_SUFFIX +#endif /* TBB_USE_DEBUG */ + +// MALLOCLIB_NAME is the name of the oneTBB memory allocator library. +#if _WIN32||_WIN64 +#define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll" +#elif __APPLE__ +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".2.dylib" +#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so" +#elif __unix__ // Note that order of these #elif's is important! +#define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so.2" +#else +#error Unknown OS +#endif + +//! Initialize the allocation/free handler pointers. +/** Caller is responsible for ensuring this routine is called exactly once. + The routine attempts to dynamically link with the TBB memory allocator. + If that allocator is not found, it links to malloc and free. */ +void initialize_handler_pointers() { + __TBB_ASSERT(allocate_handler == &initialize_allocate_handler, nullptr); + bool success = dynamic_link(MALLOCLIB_NAME, MallocLinkTable, 4); + if(!success) { + // If unsuccessful, set the handlers to the default routines. + // This must be done now, and not before FillDynamicLinks runs, because if other + // threads call the handlers, we want them to go through the DoOneTimeInitializations logic, + // which forces them to wait. 
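+        // Fall back to the C runtime: std::malloc/std::free serve plain requests, and the
+        // std_cache_aligned_* routines defined below serve the over-aligned ones.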
+ allocate_handler_unsafe = &std::malloc; + deallocate_handler = &std::free; + cache_aligned_allocate_handler_unsafe = &std_cache_aligned_allocate; + cache_aligned_deallocate_handler = &std_cache_aligned_deallocate; + } + + allocate_handler.store(allocate_handler_unsafe, std::memory_order_release); + cache_aligned_allocate_handler.store(cache_aligned_allocate_handler_unsafe, std::memory_order_release); + + PrintExtraVersionInfo( "ALLOCATOR", success?"scalable_malloc":"malloc" ); +} + +static std::once_flag initialization_state; +void initialize_cache_aligned_allocator() { + std::call_once(initialization_state, &initialize_handler_pointers); +} + +//! Executed on very first call through allocate_handler +static void* initialize_allocate_handler(std::size_t size) { + initialize_cache_aligned_allocator(); + __TBB_ASSERT(allocate_handler != &initialize_allocate_handler, nullptr); + return (*allocate_handler)(size); +} + +//! Executed on very first call through cache_aligned_allocate_handler +static void* initialize_cache_aligned_allocate_handler(std::size_t bytes, std::size_t alignment) { + initialize_cache_aligned_allocator(); + __TBB_ASSERT(cache_aligned_allocate_handler != &initialize_cache_aligned_allocate_handler, nullptr); + return (*cache_aligned_allocate_handler)(bytes, alignment); +} + +// TODO: use CPUID to find actual line size, though consider backward compatibility +// nfs - no false sharing +static constexpr std::size_t nfs_size = 128; + +std::size_t __TBB_EXPORTED_FUNC cache_line_size() { + return nfs_size; +} + +void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size) { + const std::size_t cache_line_size = nfs_size; + __TBB_ASSERT(is_power_of_two(cache_line_size), "must be power of two"); + + // Check for overflow + if (size + cache_line_size < size) { + throw_exception(exception_id::bad_alloc); + } + // scalable_aligned_malloc considers zero size request an error, and returns nullptr + if (size == 0) size = 1; + + void* result = cache_aligned_allocate_handler.load(std::memory_order_acquire)(size, cache_line_size); + if (!result) { + throw_exception(exception_id::bad_alloc); + } + __TBB_ASSERT(is_aligned(result, cache_line_size), "The returned address isn't aligned"); + return result; +} + +void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p) { + __TBB_ASSERT(cache_aligned_deallocate_handler, "Initialization has not been yet."); + (*cache_aligned_deallocate_handler)(p); +} + +static void* std_cache_aligned_allocate(std::size_t bytes, std::size_t alignment) { +#if defined(__TBB_USE_MEMALIGN) + return memalign(alignment, bytes); +#elif defined(__TBB_USE_POSIX_MEMALIGN) + void* p = nullptr; + int res = posix_memalign(&p, alignment, bytes); + if (res != 0) + p = nullptr; + return p; +#elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC) + return _aligned_malloc(bytes, alignment); +#else + // TODO: make it common with cache_aligned_resource + std::size_t space = alignment + bytes; + std::uintptr_t base = reinterpret_cast(std::malloc(space)); + if (!base) { + return nullptr; + } + std::uintptr_t result = (base + nfs_size) & ~(nfs_size - 1); + // Round up to the next cache line (align the base address) + __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Cannot store a base pointer to the header"); + __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage"); + + // Record where block actually starts. 
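+    // The word just below the aligned address keeps the original base so that
+    // std_cache_aligned_deallocate() can recover and free it. Illustrative values:
+    // with nfs_size == 128 and base == 0x1008, result == 0x1080 and the base is
+    // stored at 0x1078.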
+ (reinterpret_cast(result))[-1] = base; + return reinterpret_cast(result); +#endif +} + +static void std_cache_aligned_deallocate(void* p) { +#if defined(__TBB_USE_MEMALIGN) || defined(__TBB_USE_POSIX_MEMALIGN) + free(p); +#elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC) + _aligned_free(p); +#else + if (p) { + __TBB_ASSERT(reinterpret_cast(p) >= 0x4096, "attempt to free block not obtained from cache_aligned_allocator"); + // Recover where block actually starts + std::uintptr_t base = (reinterpret_cast(p))[-1]; + __TBB_ASSERT(((base + nfs_size) & ~(nfs_size - 1)) == reinterpret_cast(p), "Incorrect alignment or not allocated by std_cache_aligned_deallocate?"); + std::free(reinterpret_cast(base)); + } +#endif +} + +void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size) { + void* result = allocate_handler.load(std::memory_order_acquire)(size); + if (!result) { + throw_exception(exception_id::bad_alloc); + } + return result; +} + +void __TBB_EXPORTED_FUNC deallocate_memory(void* p) { + if (p) { + __TBB_ASSERT(deallocate_handler, "Initialization has not been yet."); + (*deallocate_handler)(p); + } +} + +bool __TBB_EXPORTED_FUNC is_tbbmalloc_used() { + auto handler_snapshot = allocate_handler.load(std::memory_order_acquire); + if (handler_snapshot == &initialize_allocate_handler) { + initialize_cache_aligned_allocator(); + } + handler_snapshot = allocate_handler.load(std::memory_order_relaxed); + __TBB_ASSERT(handler_snapshot != &initialize_allocate_handler && deallocate_handler != nullptr, nullptr); + // Cast to void avoids type mismatch errors on some compilers (e.g. __IBMCPP__) + __TBB_ASSERT((reinterpret_cast(handler_snapshot) == reinterpret_cast(&std::malloc)) == (reinterpret_cast(deallocate_handler) == reinterpret_cast(&std::free)), + "Both shim pointers must refer to routines from the same package (either TBB or CRT)"); + return reinterpret_cast(handler_snapshot) == reinterpret_cast(&std::malloc); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/arena.cpp b/third_party/tbb/arena.cpp new file mode 100644 index 000000000..6c290b898 --- /dev/null +++ b/third_party/tbb/arena.cpp @@ -0,0 +1,858 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/task_dispatcher.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/semaphore.h" +#include "third_party/tbb/waiters.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/info.h" +#include "third_party/tbb/tbb_allocator.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/functional" + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_ARENA_BINDING +class numa_binding_observer : public tbb::task_scheduler_observer { + binding_handler* my_binding_handler; +public: + numa_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) + : task_scheduler_observer(*ta) + , my_binding_handler(construct_binding_handler(num_slots, numa_id, core_type, max_threads_per_core)) + {} + + void on_scheduler_entry( bool ) override { + apply_affinity_mask(my_binding_handler, this_task_arena::current_thread_index()); + } + + void on_scheduler_exit( bool ) override { + restore_affinity_mask(my_binding_handler, this_task_arena::current_thread_index()); + } + + ~numa_binding_observer() override{ + destroy_binding_handler(my_binding_handler); + } +}; + +numa_binding_observer* construct_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) { + numa_binding_observer* binding_observer = nullptr; + if ((core_type >= 0 && core_type_count() > 1) || (numa_id >= 0 && numa_node_count() > 1) || max_threads_per_core > 0) { + binding_observer = new(allocate_memory(sizeof(numa_binding_observer))) numa_binding_observer(ta, num_slots, numa_id, core_type, max_threads_per_core); + __TBB_ASSERT(binding_observer, "Failure during NUMA binding observer allocation and construction"); + binding_observer->observe(true); + } + return binding_observer; +} + +void destroy_binding_observer( numa_binding_observer* binding_observer ) { + __TBB_ASSERT(binding_observer, "Trying to deallocate nullptr pointer"); + binding_observer->observe(false); + binding_observer->~numa_binding_observer(); + deallocate_memory(binding_observer); +} +#endif /*!__TBB_ARENA_BINDING*/ + +void arena::on_thread_leaving(unsigned ref_param) { + // + // Implementation of arena destruction synchronization logic contained various + // bugs/flaws at the different stages of its evolution, so below is a detailed + // description of the issues taken into consideration in the framework of the + // current design. + // + // In case of using fire-and-forget tasks (scheduled via task::enqueue()) + // external thread is allowed to leave its arena before all its work is executed, + // and market may temporarily revoke all workers from this arena. Since revoked + // workers never attempt to reset arena state to EMPTY and cancel its request + // to RML for threads, the arena object is destroyed only when both the last + // thread is leaving it and arena's state is EMPTY (that is its external thread + // left and it does not contain any work). + // Thus resetting arena to EMPTY state (as earlier TBB versions did) should not + // be done here (or anywhere else in the external thread to that matter); doing so + // can result either in arena's premature destruction (at least without + // additional costly checks in workers) or in unnecessary arena state changes + // (and ensuing workers migration). 
+    //
+    // A worker that checks for work presence and transitions the arena to the EMPTY
+    // state (in the snapshot-taking procedure arena::out_of_work()) updates
+    // arena::my_pool_state first and only then arena::my_num_workers_requested.
+    // So the check for work absence must be done against the latter field.
+    //
+    // In the time window between decrementing the active threads count and checking
+    // for an outstanding request for workers, a new worker thread may arrive,
+    // finish the remaining work, set the arena state to empty, and leave, decrementing
+    // its refcount and destroying the arena. The current thread would then destroy
+    // the arena a second time. To preclude this, a local copy of the outstanding
+    // request value can be stored before decrementing the active threads count.
+    //
+    // But this technique may cause two other problems. When the stored request is
+    // zero, it is possible that the arena still has threads, and they can generate new
+    // tasks and thus re-establish non-zero requests. Then all the threads can be
+    // revoked (as described above), leaving this thread the last one and causing
+    // it to destroy a non-empty arena.
+    //
+    // The other problem takes place when the stored request is non-zero. Another
+    // thread may complete the work, set the arena state to empty, and leave without
+    // destroying the arena before this thread decrements the refcount. This thread
+    // cannot destroy the arena either. Thus the arena may be "orphaned".
+    //
+    // In both cases we cannot dereference the arena pointer after the refcount is
+    // decremented, as our arena may already be destroyed.
+    //
+    // If this is the external thread, the market is kept alive by the reference
+    // count this thread holds to it. In the case of workers, the market's liveness
+    // is ensured by the RML connection rundown protocol, according to which the
+    // client (i.e. the market) lives until the RML server notifies it about
+    // connection termination, and this notification is fired only after all
+    // workers return to RML.
+    //
+    // Thus, if we decremented the refcount to zero, we ask the market to check the
+    // arena state (including whether it is still alive) under the lock.
+    //
+
+    __TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter");
+
+    // When there are no workers, someone must free the arena, since
+    // without workers no one calls out_of_work().
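+    // Only a departing external thread (ref_param == ref_external) takes this path,
+    // and only while mandatory concurrency is not engaged.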
+ if (ref_param == ref_external && !my_mandatory_concurrency.test()) { + out_of_work(); + } + + threading_control* tc = my_threading_control; + auto tc_client_snapshot = tc->prepare_client_destruction(my_tc_client); + // Release our reference to sync with destroy_client + unsigned remaining_ref = my_references.fetch_sub(ref_param, std::memory_order_release) - ref_param; + // do not access `this` it might be destroyed already + if (remaining_ref == 0) { + if (tc->try_destroy_client(tc_client_snapshot)) { + // We are requested to destroy ourself + free_arena(); + } + } +} + +std::size_t arena::occupy_free_slot_in_range( thread_data& tls, std::size_t lower, std::size_t upper ) { + if ( lower >= upper ) return out_of_arena; + // Start search for an empty slot from the one we occupied the last time + std::size_t index = tls.my_arena_index; + if ( index < lower || index >= upper ) index = tls.my_random.get() % (upper - lower) + lower; + __TBB_ASSERT( index >= lower && index < upper, nullptr); + // Find a free slot + for ( std::size_t i = index; i < upper; ++i ) + if (my_slots[i].try_occupy()) return i; + for ( std::size_t i = lower; i < index; ++i ) + if (my_slots[i].try_occupy()) return i; + return out_of_arena; +} + +template +std::size_t arena::occupy_free_slot(thread_data& tls) { + // Firstly, external threads try to occupy reserved slots + std::size_t index = as_worker ? out_of_arena : occupy_free_slot_in_range( tls, 0, my_num_reserved_slots ); + if ( index == out_of_arena ) { + // Secondly, all threads try to occupy all non-reserved slots + index = occupy_free_slot_in_range(tls, my_num_reserved_slots, my_num_slots ); + // Likely this arena is already saturated + if ( index == out_of_arena ) + return out_of_arena; + } + + atomic_update( my_limit, (unsigned)(index + 1), std::less() ); + return index; +} + +std::uintptr_t arena::calculate_stealing_threshold() { + stack_anchor_type anchor; + return r1::calculate_stealing_threshold(reinterpret_cast(&anchor), my_threading_control->worker_stack_size()); +} + +void arena::process(thread_data& tls) { + governor::set_thread_data(tls); // TODO: consider moving to create_one_job. + __TBB_ASSERT( is_alive(my_guard), nullptr); + __TBB_ASSERT( my_num_slots >= 1, nullptr); + + std::size_t index = occupy_free_slot(tls); + if (index == out_of_arena) { + on_thread_leaving(ref_worker); + return; + } + + __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); + tls.attach_arena(*this, index); + // worker thread enters the dispatch loop to look for a work + tls.my_inbox.set_is_idle(true); + if (tls.my_arena_slot->is_task_pool_published()) { + tls.my_inbox.set_is_idle(false); + } + + task_dispatcher& task_disp = tls.my_arena_slot->default_task_dispatcher(); + tls.enter_task_dispatcher(task_disp, calculate_stealing_threshold()); + __TBB_ASSERT(task_disp.can_steal(), nullptr); + + __TBB_ASSERT( !tls.my_last_observer, "There cannot be notified local observers when entering arena" ); + my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker); + + // Waiting on special object tied to this arena + outermost_worker_waiter waiter(*this); + d1::task* t = tls.my_task_dispatcher->local_wait_for_all(nullptr, waiter); + // For purposes of affinity support, the slot's mailbox is considered idle while no thread is + // attached to it. 
+ tls.my_inbox.set_is_idle(true); + + __TBB_ASSERT_EX(t == nullptr, "Outermost worker must not leave dispatch loop with a task"); + __TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr); + __TBB_ASSERT(tls.my_task_dispatcher == &task_disp, nullptr); + + my_observers.notify_exit_observers(tls.my_last_observer, tls.my_is_worker); + tls.my_last_observer = nullptr; + + tls.leave_task_dispatcher(); + + // Arena slot detach (arena may be used in market::process) + // TODO: Consider moving several calls below into a new method(e.g.detach_arena). + tls.my_arena_slot->release(); + tls.my_arena_slot = nullptr; + tls.my_inbox.detach(); + __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr); + __TBB_ASSERT(is_alive(my_guard), nullptr); + + // In contrast to earlier versions of TBB (before 3.0 U5) now it is possible + // that arena may be temporarily left unpopulated by threads. See comments in + // arena::on_thread_leaving() for more details. + on_thread_leaving(ref_worker); + __TBB_ASSERT(tls.my_arena == this, "my_arena is used as a hint when searching the arena to join"); +} + +arena::arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level) { + __TBB_ASSERT( !my_guard, "improperly allocated arena?" ); + __TBB_ASSERT( sizeof(my_slots[0]) % cache_line_size()==0, "arena::slot size not multiple of cache line size" ); + __TBB_ASSERT( is_aligned(this, cache_line_size()), "arena misaligned" ); + my_threading_control = control; + my_limit = 1; + // Two slots are mandatory: for the external thread, and for 1 worker (required to support starvation resistant tasks). + my_num_slots = num_arena_slots(num_slots, num_reserved_slots); + my_num_reserved_slots = num_reserved_slots; + my_max_num_workers = num_slots-num_reserved_slots; + my_priority_level = priority_level; + my_references = ref_external; // accounts for the external thread + my_observers.my_arena = this; + my_co_cache.init(4 * num_slots); + __TBB_ASSERT ( my_max_num_workers <= my_num_slots, nullptr); + // Initialize the default context. It should be allocated before task_dispatch construction. + my_default_ctx = new (cache_aligned_allocate(sizeof(d1::task_group_context))) + d1::task_group_context{ d1::task_group_context::isolated, d1::task_group_context::fp_settings }; + // Construct slots. Mark internal synchronization elements for the tools. 
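+    // Each slot's default task_dispatcher is placement-constructed in the storage that
+    // immediately follows the slot array; the arena allocation reserves room for one
+    // dispatcher per slot in the same cache-aligned block.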
+ task_dispatcher* base_td_pointer = reinterpret_cast(my_slots + my_num_slots); + for( unsigned i = 0; i < my_num_slots; ++i ) { + // __TBB_ASSERT( !my_slots[i].my_scheduler && !my_slots[i].task_pool, nullptr); + __TBB_ASSERT( !my_slots[i].task_pool_ptr, nullptr); + __TBB_ASSERT( !my_slots[i].my_task_pool_size, nullptr); + mailbox(i).construct(); + my_slots[i].init_task_streams(i); + my_slots[i].my_default_task_dispatcher = new(base_td_pointer + i) task_dispatcher(this); + my_slots[i].my_is_occupied.store(false, std::memory_order_relaxed); + } + my_fifo_task_stream.initialize(my_num_slots); + my_resume_task_stream.initialize(my_num_slots); +#if __TBB_PREVIEW_CRITICAL_TASKS + my_critical_task_stream.initialize(my_num_slots); +#endif + my_mandatory_requests = 0; +} + +arena& arena::allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, + unsigned priority_level) +{ + __TBB_ASSERT( sizeof(base_type) + sizeof(arena_slot) == sizeof(arena), "All arena data fields must go to arena_base" ); + __TBB_ASSERT( sizeof(base_type) % cache_line_size() == 0, "arena slots area misaligned: wrong padding" ); + __TBB_ASSERT( sizeof(mail_outbox) == max_nfs_size, "Mailbox padding is wrong" ); + std::size_t n = allocation_size(num_arena_slots(num_slots, num_reserved_slots)); + unsigned char* storage = (unsigned char*)cache_aligned_allocate(n); + // Zero all slots to indicate that they are empty + std::memset( storage, 0, n ); + + return *new( storage + num_arena_slots(num_slots, num_reserved_slots) * sizeof(mail_outbox) ) + arena(control, num_slots, num_reserved_slots, priority_level); +} + +void arena::free_arena () { + __TBB_ASSERT( is_alive(my_guard), nullptr); + __TBB_ASSERT( !my_references.load(std::memory_order_relaxed), "There are threads in the dying arena" ); + __TBB_ASSERT( !my_total_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" ); + __TBB_ASSERT( is_empty(), "Inconsistent state of a dying arena" ); +#if __TBB_ARENA_BINDING + if (my_numa_binding_observer != nullptr) { + destroy_binding_observer(my_numa_binding_observer); + my_numa_binding_observer = nullptr; + } +#endif /*__TBB_ARENA_BINDING*/ + poison_value( my_guard ); + for ( unsigned i = 0; i < my_num_slots; ++i ) { + // __TBB_ASSERT( !my_slots[i].my_scheduler, "arena slot is not empty" ); + // TODO: understand the assertion and modify + // __TBB_ASSERT( my_slots[i].task_pool == EmptyTaskPool, nullptr); + __TBB_ASSERT( my_slots[i].head == my_slots[i].tail, nullptr); // TODO: replace by is_quiescent_local_task_pool_empty + my_slots[i].free_task_pool(); + mailbox(i).drain(); + my_slots[i].my_default_task_dispatcher->~task_dispatcher(); + } + __TBB_ASSERT(my_fifo_task_stream.empty(), "Not all enqueued tasks were executed"); + __TBB_ASSERT(my_resume_task_stream.empty(), "Not all enqueued tasks were executed"); + // Cleanup coroutines/schedulers cache + my_co_cache.cleanup(); + my_default_ctx->~task_group_context(); + cache_aligned_deallocate(my_default_ctx); +#if __TBB_PREVIEW_CRITICAL_TASKS + __TBB_ASSERT( my_critical_task_stream.empty(), "Not all critical tasks were executed"); +#endif + // Clear enfources synchronization with observe(false) + my_observers.clear(); + + void* storage = &mailbox(my_num_slots-1); + __TBB_ASSERT( my_references.load(std::memory_order_relaxed) == 0, nullptr); + this->~arena(); +#if TBB_USE_ASSERT > 1 + std::memset( storage, 0, allocation_size(my_num_slots) ); +#endif /* TBB_USE_ASSERT */ + cache_aligned_deallocate( storage ); +} + +bool 
arena::has_enqueued_tasks() { + return !my_fifo_task_stream.empty(); +} + +void arena::request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads) { + my_threading_control->adjust_demand(my_tc_client, mandatory_delta, workers_delta); + + if (wakeup_threads) { + // Notify all sleeping threads that work has appeared in the arena. + get_waiting_threads_monitor().notify([&] (market_context context) { + return this == context.my_arena_addr; + }); + } +} + +bool arena::has_tasks() { + // TODO: rework it to return at least a hint about where a task was found; better if the task itself. + std::size_t n = my_limit.load(std::memory_order_acquire); + bool tasks_are_available = false; + for (std::size_t k = 0; k < n && !tasks_are_available; ++k) { + tasks_are_available = !my_slots[k].is_empty(); + } + tasks_are_available = tasks_are_available || has_enqueued_tasks() || !my_resume_task_stream.empty(); +#if __TBB_PREVIEW_CRITICAL_TASKS + tasks_are_available = tasks_are_available || !my_critical_task_stream.empty(); +#endif + return tasks_are_available; +} + +void arena::out_of_work() { + // We should try unset my_pool_state first due to keep arena invariants in consistent state + // Otherwise, we might have my_pool_state = false and my_mandatory_concurrency = true that is broken invariant + bool disable_mandatory = my_mandatory_concurrency.try_clear_if([this] { return !has_enqueued_tasks(); }); + bool release_workers = my_pool_state.try_clear_if([this] { return !has_tasks(); }); + + if (disable_mandatory || release_workers) { + int mandatory_delta = disable_mandatory ? -1 : 0; + int workers_delta = release_workers ? -(int)my_max_num_workers : 0; + + if (disable_mandatory && is_arena_workerless()) { + // We had set workers_delta to 1 when enabled mandatory concurrency, so revert it now + workers_delta = -1; + } + request_workers(mandatory_delta, workers_delta); + } +} + +void arena::set_top_priority(bool is_top_priority) { + my_is_top_priority.store(is_top_priority, std::memory_order_relaxed); +} + +bool arena::is_top_priority() const { + return my_is_top_priority.load(std::memory_order_relaxed); +} + +bool arena::try_join() { + if (num_workers_active() < my_num_workers_allotted.load(std::memory_order_relaxed)) { + my_references += arena::ref_worker; + return true; + } + return false; +} + +void arena::set_allotment(unsigned allotment) { + if (my_num_workers_allotted.load(std::memory_order_relaxed) != allotment) { + my_num_workers_allotted.store(allotment, std::memory_order_relaxed); + } +} + +std::pair arena::update_request(int mandatory_delta, int workers_delta) { + __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); + + int min_workers_request = 0; + int max_workers_request = 0; + + // Calculate min request + my_mandatory_requests += mandatory_delta; + min_workers_request = my_mandatory_requests > 0 ? 1 : 0; + + // Calculate max request + my_total_num_workers_requested += workers_delta; + // Clamp worker request into interval [0, my_max_num_workers] + max_workers_request = clamp(my_total_num_workers_requested, 0, + min_workers_request > 0 && is_arena_workerless() ? 
1 : (int)my_max_num_workers); + + return { min_workers_request, max_workers_request }; +} + +thread_control_monitor& arena::get_waiting_threads_monitor() { + return my_threading_control->get_waiting_threads_monitor(); +} + +void arena::enqueue_task(d1::task& t, d1::task_group_context& ctx, thread_data& td) { + task_group_context_impl::bind_to(ctx, &td); + task_accessor::context(t) = &ctx; + task_accessor::isolation(t) = no_isolation; + my_fifo_task_stream.push( &t, random_lane_selector(td.my_random) ); + advertise_new_work(); +} + +arena& arena::create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level) +{ + __TBB_ASSERT(num_slots > 0, NULL); + __TBB_ASSERT(num_reserved_slots <= num_slots, NULL); + // Add public market reference for an external thread/task_arena (that adds an internal reference in exchange). + arena& a = arena::allocate_arena(control, num_slots, num_reserved_slots, arena_priority_level); + a.my_tc_client = control->create_client(a); + // We should not publish arena until all fields are initialized + control->publish_client(a.my_tc_client); + return a; +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +// Enable task_arena.h +#include "third_party/tbb/task_arena.h" // task_arena_base + +namespace tbb { +namespace detail { +namespace r1 { + +#if TBB_USE_ASSERT +void assert_arena_priority_valid( tbb::task_arena::priority a_priority ) { + bool is_arena_priority_correct = + a_priority == tbb::task_arena::priority::high || + a_priority == tbb::task_arena::priority::normal || + a_priority == tbb::task_arena::priority::low; + __TBB_ASSERT( is_arena_priority_correct, + "Task arena priority should be equal to one of the predefined values." ); +} +#else +void assert_arena_priority_valid( tbb::task_arena::priority ) {} +#endif + +unsigned arena_priority_level( tbb::task_arena::priority a_priority ) { + assert_arena_priority_valid( a_priority ); + return d1::num_priority_levels - unsigned(int(a_priority) / d1::priority_stride); +} + +tbb::task_arena::priority arena_priority( unsigned priority_level ) { + auto priority = tbb::task_arena::priority( + (d1::num_priority_levels - priority_level) * d1::priority_stride + ); + assert_arena_priority_valid( priority ); + return priority; +} + +struct task_arena_impl { + static void initialize(d1::task_arena_base&); + static void terminate(d1::task_arena_base&); + static bool attach(d1::task_arena_base&); + static void execute(d1::task_arena_base&, d1::delegate_base&); + static void wait(d1::task_arena_base&); + static int max_concurrency(const d1::task_arena_base*); + static void enqueue(d1::task&, d1::task_group_context*, d1::task_arena_base*); +}; + +void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) { + task_arena_impl::initialize(ta); +} +void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base& ta) { + task_arena_impl::terminate(ta); +} +bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base& ta) { + return task_arena_impl::attach(ta); +} +void __TBB_EXPORTED_FUNC execute(d1::task_arena_base& ta, d1::delegate_base& d) { + task_arena_impl::execute(ta, d); +} +void __TBB_EXPORTED_FUNC wait(d1::task_arena_base& ta) { + task_arena_impl::wait(ta); +} + +int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base* ta) { + return task_arena_impl::max_concurrency(ta); +} + +void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_arena_base* ta) { + task_arena_impl::enqueue(t, nullptr, ta); +} + +void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& 
ctx, d1::task_arena_base* ta) { + task_arena_impl::enqueue(t, &ctx, ta); +} + +void task_arena_impl::initialize(d1::task_arena_base& ta) { + // Enforce global market initialization to properly initialize soft limit + (void)governor::get_thread_data(); + if (ta.my_max_concurrency < 1) { +#if __TBB_ARENA_BINDING + d1::constraints arena_constraints = d1::constraints{} + .set_core_type(ta.core_type()) + .set_max_threads_per_core(ta.max_threads_per_core()) + .set_numa_id(ta.my_numa_id); + ta.my_max_concurrency = (int)default_concurrency(arena_constraints); +#else /*!__TBB_ARENA_BINDING*/ + ta.my_max_concurrency = (int)governor::default_num_threads(); +#endif /*!__TBB_ARENA_BINDING*/ + } + + __TBB_ASSERT(ta.my_arena.load(std::memory_order_relaxed) == nullptr, "Arena already initialized"); + unsigned priority_level = arena_priority_level(ta.my_priority); + threading_control* thr_control = threading_control::register_public_reference(); + arena& a = arena::create(thr_control, unsigned(ta.my_max_concurrency), ta.my_num_reserved_slots, priority_level); + ta.my_arena.store(&a, std::memory_order_release); +#if __TBB_ARENA_BINDING + a.my_numa_binding_observer = construct_binding_observer( + static_cast(&ta), a.my_num_slots, ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); +#endif /*__TBB_ARENA_BINDING*/ +} + +void task_arena_impl::terminate(d1::task_arena_base& ta) { + arena* a = ta.my_arena.load(std::memory_order_relaxed); + assert_pointer_valid(a); + threading_control::unregister_public_reference(/*blocking_terminate=*/false); + a->on_thread_leaving(arena::ref_external); + ta.my_arena.store(nullptr, std::memory_order_relaxed); +} + +bool task_arena_impl::attach(d1::task_arena_base& ta) { + __TBB_ASSERT(!ta.my_arena.load(std::memory_order_relaxed), nullptr); + thread_data* td = governor::get_thread_data_if_initialized(); + if( td && td->my_arena ) { + arena* a = td->my_arena; + // There is an active arena to attach to. + // It's still used by s, so won't be destroyed right away. + __TBB_ASSERT(a->my_references > 0, nullptr); + a->my_references += arena::ref_external; + ta.my_num_reserved_slots = a->my_num_reserved_slots; + ta.my_priority = arena_priority(a->my_priority_level); + ta.my_max_concurrency = ta.my_num_reserved_slots + a->my_max_num_workers; + __TBB_ASSERT(arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots) == a->my_num_slots, nullptr); + ta.my_arena.store(a, std::memory_order_release); + // increases threading_control's ref count for task_arena + threading_control::register_public_reference(); + return true; + } + return false; +} + +void task_arena_impl::enqueue(d1::task& t, d1::task_group_context* c, d1::task_arena_base* ta) { + thread_data* td = governor::get_thread_data(); // thread data is only needed for FastRandom instance + assert_pointer_valid(td, "thread_data pointer should not be null"); + arena* a = ta ? + ta->my_arena.load(std::memory_order_relaxed) + : td->my_arena + ; + assert_pointer_valid(a, "arena pointer should not be null"); + auto* ctx = c ? c : a->my_default_ctx; + assert_pointer_valid(ctx, "context pointer should not be null"); + // Is there a better place for checking the state of ctx? 
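+    // Note that the assertion below inspects the arena's default context rather than the
+    // context the task is enqueued with, and it is compiled only in assertion-enabled builds.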
+ __TBB_ASSERT(!a->my_default_ctx->is_group_execution_cancelled(), + "The task will not be executed because its task_group_context is cancelled."); + a->enqueue_task(t, *ctx, *td); +} + +class nested_arena_context : no_copy { +public: + nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index) + : m_orig_execute_data_ext(td.my_task_dispatcher->m_execute_data_ext) + { + if (td.my_arena != &nested_arena) { + m_orig_arena = td.my_arena; + m_orig_slot_index = td.my_arena_index; + m_orig_last_observer = td.my_last_observer; + + td.detach_task_dispatcher(); + td.attach_arena(nested_arena, slot_index); + if (td.my_inbox.is_idle_state(true)) + td.my_inbox.set_is_idle(false); + task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); + td.enter_task_dispatcher(task_disp, m_orig_execute_data_ext.task_disp->m_stealing_threshold); + + // If the calling thread occupies the slots out of external thread reserve we need to notify the + // market that this arena requires one worker less. + if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { + td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ -1); + } + + td.my_last_observer = nullptr; + // The task_arena::execute method considers each calling thread as an external thread. + td.my_arena->my_observers.notify_entry_observers(td.my_last_observer, /* worker*/false); + } + + m_task_dispatcher = td.my_task_dispatcher; + m_orig_fifo_tasks_allowed = m_task_dispatcher->allow_fifo_task(true); + m_orig_critical_task_allowed = m_task_dispatcher->m_properties.critical_task_allowed; + m_task_dispatcher->m_properties.critical_task_allowed = true; + + execution_data_ext& ed_ext = td.my_task_dispatcher->m_execute_data_ext; + ed_ext.context = td.my_arena->my_default_ctx; + ed_ext.original_slot = td.my_arena_index; + ed_ext.affinity_slot = d1::no_slot; + ed_ext.task_disp = td.my_task_dispatcher; + ed_ext.isolation = no_isolation; + + __TBB_ASSERT(td.my_arena_slot, nullptr); + __TBB_ASSERT(td.my_arena_slot->is_occupied(), nullptr); + __TBB_ASSERT(td.my_task_dispatcher, nullptr); + } + ~nested_arena_context() { + thread_data& td = *m_task_dispatcher->m_thread_data; + __TBB_ASSERT(governor::is_thread_data_set(&td), nullptr); + m_task_dispatcher->allow_fifo_task(m_orig_fifo_tasks_allowed); + m_task_dispatcher->m_properties.critical_task_allowed = m_orig_critical_task_allowed; + if (m_orig_arena) { + td.my_arena->my_observers.notify_exit_observers(td.my_last_observer, /*worker*/ false); + td.my_last_observer = m_orig_last_observer; + + // Notify the market that this thread releasing a one slot + // that can be used by a worker thread. + if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { + td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ 1); + } + + td.leave_task_dispatcher(); + td.my_arena_slot->release(); + td.my_arena->my_exit_monitors.notify_one(); // do not relax! 
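
nested_arena_context above carries a lot of scheduler bookkeeping (task dispatcher, observers, worker requests), but its core shape is an RAII guard: the constructor saves the calling thread's current binding and rebinds it to the target arena, and the destructor undoes everything in reverse order. A stripped-down sketch of just that shape, with hypothetical ThreadState/Arena stand-ins rather than the real thread_data/arena types:

#include <cstddef>

struct Arena;                       // stand-in for r1::arena
struct ThreadState {                // stand-in for r1::thread_data
    Arena*      arena = nullptr;
    std::size_t slot  = 0;
};

class scoped_arena_switch {
    ThreadState& ts_;
    Arena*       saved_arena_;
    std::size_t  saved_slot_;
public:
    scoped_arena_switch(ThreadState& ts, Arena& nested, std::size_t slot)
        : ts_(ts), saved_arena_(ts.arena), saved_slot_(ts.slot) {
        ts_.arena = &nested;        // bind to the nested arena for the scope's duration
        ts_.slot  = slot;
    }
    ~scoped_arena_switch() {
        ts_.arena = saved_arena_;   // restore the original binding, even on exception
        ts_.slot  = saved_slot_;
    }
    scoped_arena_switch(const scoped_arena_switch&) = delete;
    scoped_arena_switch& operator=(const scoped_arena_switch&) = delete;
};
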
+ + td.attach_arena(*m_orig_arena, m_orig_slot_index); + td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp); + __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); + } + td.my_task_dispatcher->m_execute_data_ext = m_orig_execute_data_ext; + } + +private: + execution_data_ext m_orig_execute_data_ext{}; + arena* m_orig_arena{ nullptr }; + observer_proxy* m_orig_last_observer{ nullptr }; + task_dispatcher* m_task_dispatcher{ nullptr }; + unsigned m_orig_slot_index{}; + bool m_orig_fifo_tasks_allowed{}; + bool m_orig_critical_task_allowed{}; +}; + +class delegated_task : public d1::task { + d1::delegate_base& m_delegate; + concurrent_monitor& m_monitor; + d1::wait_context& m_wait_ctx; + std::atomic m_completed; + d1::task* execute(d1::execution_data& ed) override { + const execution_data_ext& ed_ext = static_cast(ed); + execution_data_ext orig_execute_data_ext = ed_ext.task_disp->m_execute_data_ext; + __TBB_ASSERT(&ed_ext.task_disp->m_execute_data_ext == &ed, + "The execute data shall point to the current task dispatcher execute data"); + __TBB_ASSERT(ed_ext.task_disp->m_execute_data_ext.isolation == no_isolation, nullptr); + + ed_ext.task_disp->m_execute_data_ext.context = ed_ext.task_disp->get_thread_data().my_arena->my_default_ctx; + bool fifo_task_allowed = ed_ext.task_disp->allow_fifo_task(true); + try_call([&] { + m_delegate(); + }).on_completion([&] { + ed_ext.task_disp->m_execute_data_ext = orig_execute_data_ext; + ed_ext.task_disp->allow_fifo_task(fifo_task_allowed); + }); + + finalize(); + return nullptr; + } + d1::task* cancel(d1::execution_data&) override { + finalize(); + return nullptr; + } + void finalize() { + m_wait_ctx.release(); // must precede the wakeup + m_monitor.notify([this] (std::uintptr_t ctx) { + return ctx == std::uintptr_t(&m_delegate); + }); // do not relax, it needs a fence! + m_completed.store(true, std::memory_order_release); + } +public: + delegated_task(d1::delegate_base& d, concurrent_monitor& s, d1::wait_context& wo) + : m_delegate(d), m_monitor(s), m_wait_ctx(wo), m_completed{ false }{} + ~delegated_task() override { + // The destructor can be called earlier than the m_monitor is notified + // because the waiting thread can be released after m_wait_ctx.release_wait. + // To close that race we wait for the m_completed signal. 
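
The destructor comment above describes a lifetime race: the waiting thread can be released by m_wait_ctx.release() and destroy the delegated_task while finalize() is still inside the monitor notification, which is why the destructor spin-waits on m_completed. A reduced model of that fix using only standard atomics (completion_token and its flag names are inventions of the sketch):

#include <atomic>
#include <functional>
#include <thread>

struct completion_token {
    std::atomic<bool> released{false};   // models wait_context::release()
    std::atomic<bool> completed{false};  // models delegated_task::m_completed
};

void finalize(completion_token& t) {
    t.released.store(true, std::memory_order_release);   // wake the waiter first
    // ... the monitor notification would happen here, still touching 't' ...
    t.completed.store(true, std::memory_order_release);   // the last touch of 't'
}

void waiter(completion_token& t) {
    while (!t.released.load(std::memory_order_acquire)) std::this_thread::yield();
    // Without this second wait the waiter could reclaim 't' while finalize() is
    // still between its two stores; this is what ~delegated_task() spin-waits for.
    while (!t.completed.load(std::memory_order_acquire)) std::this_thread::yield();
    // Now it is safe to destroy or reuse the storage behind 't'.
}

int main() {
    completion_token t;
    std::thread producer(finalize, std::ref(t));
    waiter(t);
    producer.join();
    return 0;
}
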
+ spin_wait_until_eq(m_completed, true); + } +}; + +void task_arena_impl::execute(d1::task_arena_base& ta, d1::delegate_base& d) { + arena* a = ta.my_arena.load(std::memory_order_relaxed); + __TBB_ASSERT(a != nullptr, nullptr); + thread_data* td = governor::get_thread_data(); + + bool same_arena = td->my_arena == a; + std::size_t index1 = td->my_arena_index; + if (!same_arena) { + index1 = a->occupy_free_slot(*td); + if (index1 == arena::out_of_arena) { + concurrent_monitor::thread_context waiter((std::uintptr_t)&d); + d1::wait_context wo(1); + d1::task_group_context exec_context(d1::task_group_context::isolated); + task_group_context_impl::copy_fp_settings(exec_context, *a->my_default_ctx); + + delegated_task dt(d, a->my_exit_monitors, wo); + a->enqueue_task( dt, exec_context, *td); + size_t index2 = arena::out_of_arena; + do { + a->my_exit_monitors.prepare_wait(waiter); + if (!wo.continue_execution()) { + a->my_exit_monitors.cancel_wait(waiter); + break; + } + index2 = a->occupy_free_slot(*td); + if (index2 != arena::out_of_arena) { + a->my_exit_monitors.cancel_wait(waiter); + nested_arena_context scope(*td, *a, index2 ); + r1::wait(wo, exec_context); + __TBB_ASSERT(!exec_context.my_exception.load(std::memory_order_relaxed), nullptr); // exception can be thrown above, not deferred + break; + } + a->my_exit_monitors.commit_wait(waiter); + } while (wo.continue_execution()); + if (index2 == arena::out_of_arena) { + // notify a waiting thread even if this thread did not enter arena, + // in case it was woken by a leaving thread but did not need to enter + a->my_exit_monitors.notify_one(); // do not relax! + } + // process possible exception + auto exception = exec_context.my_exception.load(std::memory_order_acquire); + if (exception) { + __TBB_ASSERT(exec_context.is_group_execution_cancelled(), "The task group context with an exception should be canceled."); + exception->throw_self(); + } + __TBB_ASSERT(governor::is_thread_data_set(td), nullptr); + return; + } // if (index1 == arena::out_of_arena) + } // if (!same_arena) + + context_guard_helper context_guard; + context_guard.set_ctx(a->my_default_ctx); + nested_arena_context scope(*td, *a, index1); +#if _WIN64 + try { +#endif + d(); + __TBB_ASSERT(same_arena || governor::is_thread_data_set(td), nullptr); +#if _WIN64 + } catch (...) { + context_guard.restore_default(); + throw; + } +#endif +} + +void task_arena_impl::wait(d1::task_arena_base& ta) { + arena* a = ta.my_arena.load(std::memory_order_relaxed); + __TBB_ASSERT(a != nullptr, nullptr); + thread_data* td = governor::get_thread_data(); + __TBB_ASSERT_EX(td, "Scheduler is not initialized"); + __TBB_ASSERT(td->my_arena != a || td->my_arena_index == 0, "internal_wait is not supported within a worker context" ); + if (a->my_max_num_workers != 0) { + while (a->num_workers_active() || !a->is_empty()) { + yield(); + } + } +} + +int task_arena_impl::max_concurrency(const d1::task_arena_base *ta) { + arena* a = nullptr; + if( ta ) // for special cases of ta->max_concurrency() + a = ta->my_arena.load(std::memory_order_relaxed); + else if( thread_data* td = governor::get_thread_data_if_initialized() ) + a = td->my_arena; // the current arena if any + + if( a ) { // Get parameters from the arena + __TBB_ASSERT( !ta || ta->my_max_concurrency==1, nullptr); + int mandatory_worker = 0; + if (a->is_arena_workerless() && a->my_num_reserved_slots == 1) { + mandatory_worker = a->my_mandatory_concurrency.test() ? 
1 : 0; + } + return a->my_num_reserved_slots + a->my_max_num_workers + mandatory_worker; + } + + if (ta && ta->my_max_concurrency == 1) { + return 1; + } + +#if __TBB_ARENA_BINDING + if (ta) { + d1::constraints arena_constraints = d1::constraints{} + .set_numa_id(ta->my_numa_id) + .set_core_type(ta->core_type()) + .set_max_threads_per_core(ta->max_threads_per_core()); + return (int)default_concurrency(arena_constraints); + } +#endif /*!__TBB_ARENA_BINDING*/ + + __TBB_ASSERT(!ta || ta->my_max_concurrency==d1::task_arena_base::automatic, nullptr); + return int(governor::default_num_threads()); +} + +void isolate_within_arena(d1::delegate_base& d, std::intptr_t isolation) { + // TODO: Decide what to do if the scheduler is not initialized. Is there a use case for it? + thread_data* tls = governor::get_thread_data(); + assert_pointers_valid(tls, tls->my_task_dispatcher); + task_dispatcher* dispatcher = tls->my_task_dispatcher; + isolation_type previous_isolation = dispatcher->m_execute_data_ext.isolation; + try_call([&] { + // We temporarily change the isolation tag of the currently running task. It will be restored in the destructor of the guard. + isolation_type current_isolation = isolation ? isolation : reinterpret_cast(&d); + // Save the current isolation value and set new one + previous_isolation = dispatcher->set_isolation(current_isolation); + // Isolation within this callable + d(); + }).on_completion([&] { + __TBB_ASSERT(governor::get_thread_data()->my_task_dispatcher == dispatcher, nullptr); + dispatcher->set_isolation(previous_isolation); + }); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/arena.h b/third_party/tbb/arena.h new file mode 100644 index 000000000..18c02828e --- /dev/null +++ b/third_party/tbb/arena.h @@ -0,0 +1,511 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_arena_H +#define _TBB_arena_H + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/cstring" + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/spin_mutex.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/intrusive_list.h" +#include "third_party/tbb/task_stream.h" +#include "third_party/tbb/arena_slot.h" +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/observer_proxy.h" +#include "third_party/tbb/thread_control_monitor.h" +#include "third_party/tbb/threading_control_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class task_dispatcher; +class task_group_context; +class threading_control; +class allocate_root_with_context_proxy; + +#if __TBB_ARENA_BINDING +class numa_binding_observer; +#endif /*__TBB_ARENA_BINDING*/ + +//! Bounded coroutines cache LIFO ring buffer +class arena_co_cache { + //! 
Ring buffer storage + task_dispatcher** my_co_scheduler_cache; + //! Current cache index + unsigned my_head; + //! Cache capacity for arena + unsigned my_max_index; + //! Accessor lock for modification operations + tbb::spin_mutex my_co_cache_mutex; + + unsigned next_index() { + return ( my_head == my_max_index ) ? 0 : my_head + 1; + } + + unsigned prev_index() { + return ( my_head == 0 ) ? my_max_index : my_head - 1; + } + + bool internal_empty() { + return my_co_scheduler_cache[prev_index()] == nullptr; + } + + void internal_task_dispatcher_cleanup(task_dispatcher* to_cleanup) { + to_cleanup->~task_dispatcher(); + cache_aligned_deallocate(to_cleanup); + } + +public: + void init(unsigned cache_capacity) { + std::size_t alloc_size = cache_capacity * sizeof(task_dispatcher*); + my_co_scheduler_cache = (task_dispatcher**)cache_aligned_allocate(alloc_size); + std::memset( my_co_scheduler_cache, 0, alloc_size ); + my_head = 0; + my_max_index = cache_capacity - 1; + } + + void cleanup() { + while (task_dispatcher* to_cleanup = pop()) { + internal_task_dispatcher_cleanup(to_cleanup); + } + cache_aligned_deallocate(my_co_scheduler_cache); + } + + //! Insert scheduler to the current available place. + //! Replace an old value, if necessary. + void push(task_dispatcher* s) { + task_dispatcher* to_cleanup = nullptr; + { + tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex); + // Check if we are replacing some existing buffer entrance + if (my_co_scheduler_cache[my_head] != nullptr) { + to_cleanup = my_co_scheduler_cache[my_head]; + } + // Store the cached value + my_co_scheduler_cache[my_head] = s; + // Move head index to the next slot + my_head = next_index(); + } + // Cleanup replaced buffer if any + if (to_cleanup) { + internal_task_dispatcher_cleanup(to_cleanup); + } + } + + //! Get a cached scheduler if any + task_dispatcher* pop() { + tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex); + // No cached coroutine + if (internal_empty()) { + return nullptr; + } + // Move head index to the currently available value + my_head = prev_index(); + // Retrieve the value from the buffer + task_dispatcher* to_return = my_co_scheduler_cache[my_head]; + // Clear the previous entrance value + my_co_scheduler_cache[my_head] = nullptr; + return to_return; + } +}; + +struct stack_anchor_type { + stack_anchor_type() = default; + stack_anchor_type(const stack_anchor_type&) = delete; +}; + +class atomic_flag { + static const std::uintptr_t SET = 1; + static const std::uintptr_t UNSET = 0; + std::atomic my_state{UNSET}; +public: + bool test_and_set() { + std::uintptr_t state = my_state.load(std::memory_order_acquire); + switch (state) { + case SET: + return false; + default: /* busy */ + if (my_state.compare_exchange_strong(state, SET)) { + // We interrupted clear transaction + return false; + } + if (state != UNSET) { + // We lost our epoch + return false; + } + // We are too late but still in the same epoch + __TBB_fallthrough; + case UNSET: + return my_state.compare_exchange_strong(state, SET); + } + } + template + bool try_clear_if(Pred&& pred) { + std::uintptr_t busy = std::uintptr_t(&busy); + std::uintptr_t state = my_state.load(std::memory_order_acquire); + if (state == SET && my_state.compare_exchange_strong(state, busy)) { + if (pred()) { + return my_state.compare_exchange_strong(busy, UNSET); + } + // The result of the next operation is discarded, always false should be returned. 
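
arena_co_cache above is a fixed-capacity LIFO ring of task_dispatcher pointers guarded by a spin mutex: push() evicts and destroys whatever entry it overwrites, and pop() hands back the most recently cached one. A reduced model with std::mutex and a std::vector, templated over a hypothetical element type (capacity must be at least 1):

#include <cstddef>
#include <mutex>
#include <vector>

template <typename T>
class lifo_ring_cache {
    std::vector<T*> buf_;
    std::size_t     head_ = 0;   // next slot to write
    std::mutex      mtx_;

    std::size_t next(std::size_t i) const { return (i + 1 == buf_.size()) ? 0 : i + 1; }
    std::size_t prev(std::size_t i) const { return (i == 0) ? buf_.size() - 1 : i - 1; }
public:
    explicit lifo_ring_cache(std::size_t capacity) : buf_(capacity, nullptr) {}
    ~lifo_ring_cache() { for (T* p : buf_) delete p; }

    void push(T* p) {
        T* evicted = nullptr;
        {
            std::lock_guard<std::mutex> lock(mtx_);
            evicted = buf_[head_];   // may be nullptr if the slot was free
            buf_[head_] = p;
            head_ = next(head_);
        }
        delete evicted;              // destroy the replaced entry outside the lock
    }

    T* pop() {
        std::lock_guard<std::mutex> lock(mtx_);
        std::size_t h = prev(head_);
        T* p = buf_[h];
        if (p) { buf_[h] = nullptr; head_ = h; }
        return p;                    // nullptr when the cache is empty
    }
};
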
+ my_state.compare_exchange_strong(busy, SET); + } + return false; + } + void clear() { + my_state.store(UNSET, std::memory_order_release); + } + bool test(std::memory_order order = std::memory_order_acquire) { + return my_state.load(order) != UNSET; + } +}; + +//! The structure of an arena, except the array of slots. +/** Separated in order to simplify padding. + Intrusive list node base class is used by market to form a list of arenas. **/ +// TODO: Analyze arena_base cache lines placement +struct arena_base : padded { + //! The number of workers that have been marked out by the resource manager to service the arena. + std::atomic my_num_workers_allotted; // heavy use in stealing loop + + //! Reference counter for the arena. + /** Worker and external thread references are counted separately: first several bits are for references + from external thread threads or explicit task_arenas (see arena::ref_external_bits below); + the rest counts the number of workers servicing the arena. */ + std::atomic my_references; // heavy use in stealing loop + + //! The maximal number of currently busy slots. + std::atomic my_limit; // heavy use in stealing loop + + //! Task pool for the tasks scheduled via task::enqueue() method. + /** Such scheduling guarantees eventual execution even if + - new tasks are constantly coming (by extracting scheduled tasks in + relaxed FIFO order); + - the enqueuing thread does not call any of wait_for_all methods. **/ + task_stream my_fifo_task_stream; // heavy use in stealing loop + + //! Task pool for the tasks scheduled via tbb::resume() function. + task_stream my_resume_task_stream; // heavy use in stealing loop + +#if __TBB_PREVIEW_CRITICAL_TASKS + //! Task pool for the tasks with critical property set. + /** Critical tasks are scheduled for execution ahead of other sources (including local task pool + and even bypassed tasks) unless the thread already executes a critical task in an outer + dispatch loop **/ + // used on the hot path of the task dispatch loop + task_stream my_critical_task_stream; +#endif + + //! The total number of workers that are requested from the resource manager. + int my_total_num_workers_requested; + + //! The index in the array of per priority lists of arenas this object is in. + /*const*/ unsigned my_priority_level; + + //! The max priority level of arena in permit manager. + std::atomic my_is_top_priority{false}; + + //! Current task pool state and estimate of available tasks amount. + atomic_flag my_pool_state; + + //! The list of local observers attached to this arena. + observer_list my_observers; + +#if __TBB_ARENA_BINDING + //! Pointer to internal observer that allows to bind threads in arena to certain NUMA node. + numa_binding_observer* my_numa_binding_observer; +#endif /*__TBB_ARENA_BINDING*/ + + // Below are rarely modified members + + threading_control* my_threading_control; + + //! Default task group context. + d1::task_group_context* my_default_ctx; + + //! Waiting object for external threads that cannot join the arena. + concurrent_monitor my_exit_monitors; + + //! Coroutines (task_dispathers) cache buffer + arena_co_cache my_co_cache; + + // arena needs an extra worker despite the arena limit + atomic_flag my_mandatory_concurrency; + // the number of local mandatory concurrency requests + int my_mandatory_requests; + + //! The number of slots in the arena. + unsigned my_num_slots; + //! The number of reserved slots (can be occupied only by external threads). + unsigned my_num_reserved_slots; + //! 
The number of workers requested by the external thread owning the arena. + unsigned my_max_num_workers; + + threading_control_client my_tc_client; + +#if TBB_USE_ASSERT + //! Used to trap accesses to the object after its destruction. + std::uintptr_t my_guard; +#endif /* TBB_USE_ASSERT */ +}; // struct arena_base + +class arena: public padded +{ +public: + using base_type = padded; + + //! Types of work advertised by advertise_new_work() + enum new_work_type { + work_spawned, + wakeup, + work_enqueued + }; + + //! Constructor + arena(threading_control* control, unsigned max_num_workers, unsigned num_reserved_slots, unsigned priority_level); + + //! Allocate an instance of arena. + static arena& allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, + unsigned priority_level); + + static arena& create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level); + + static int unsigned num_arena_slots ( unsigned num_slots, unsigned num_reserved_slots ) { + return num_reserved_slots == 0 ? num_slots : max(2u, num_slots); + } + + static int allocation_size( unsigned num_slots ) { + return sizeof(base_type) + num_slots * (sizeof(mail_outbox) + sizeof(arena_slot) + sizeof(task_dispatcher)); + } + + //! Get reference to mailbox corresponding to given slot_id + mail_outbox& mailbox( d1::slot_id slot ) { + __TBB_ASSERT( slot != d1::no_slot, "affinity should be specified" ); + + return reinterpret_cast(this)[-(int)(slot+1)]; // cast to 'int' is redundant but left for readability + } + + //! Completes arena shutdown, destructs and deallocates it. + void free_arena(); + + //! The number of least significant bits for external references + static const unsigned ref_external_bits = 12; // up to 4095 external and 1M workers + + //! Reference increment values for externals and workers + static const unsigned ref_external = 1; + static const unsigned ref_worker = 1 << ref_external_bits; + + //! The number of workers active in the arena. + unsigned num_workers_active() const { + return my_references.load(std::memory_order_acquire) >> ref_external_bits; + } + + //! Check if the recall is requested by the market. + bool is_recall_requested() const { + return num_workers_active() > my_num_workers_allotted.load(std::memory_order_relaxed); + } + + void request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads = false); + + //! If necessary, raise a flag that there is new job in arena. + template void advertise_new_work(); + + //! Attempts to steal a task from a randomly chosen arena slot + d1::task* steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation); + + //! Get a task from a global starvation resistant queue + template + d1::task* get_stream_task(task_stream& stream, unsigned& hint); + +#if __TBB_PREVIEW_CRITICAL_TASKS + //! Tries to find a critical task in global critical task stream + d1::task* get_critical_task(unsigned& hint, isolation_type isolation); +#endif + + //! Check if there is job anywhere in arena. + void out_of_work(); + + //! enqueue a task into starvation-resistance queue + void enqueue_task(d1::task&, d1::task_group_context&, thread_data&); + + //! Registers the worker with the arena and enters TBB scheduler dispatch loop + void process(thread_data&); + + //! Notification that the thread leaves its arena + + void on_thread_leaving(unsigned ref_param); + + //! Check for the presence of enqueued tasks + bool has_enqueued_tasks(); + + //! 
Check for the presence of any tasks + bool has_tasks(); + + bool is_empty() { return my_pool_state.test() == /* EMPTY */ false; } + + thread_control_monitor& get_waiting_threads_monitor(); + + static const std::size_t out_of_arena = ~size_t(0); + //! Tries to occupy a slot in the arena. On success, returns the slot index; if no slot is available, returns out_of_arena. + template + std::size_t occupy_free_slot(thread_data&); + //! Tries to occupy a slot in the specified range. + std::size_t occupy_free_slot_in_range(thread_data& tls, std::size_t lower, std::size_t upper); + + std::uintptr_t calculate_stealing_threshold(); + + unsigned priority_level() { return my_priority_level; } + + bool has_request() { return my_total_num_workers_requested; } + + unsigned references() const { return my_references.load(std::memory_order_acquire); } + + bool is_arena_workerless() const { return my_max_num_workers == 0; } + + void set_top_priority(bool); + + bool is_top_priority() const; + + bool try_join(); + + void set_allotment(unsigned allotment); + + std::pair update_request(int mandatory_delta, int workers_delta); + + /** Must be the last data field */ + arena_slot my_slots[1]; +}; // class arena + +template +void arena::advertise_new_work() { + bool is_mandatory_needed = false; + bool are_workers_needed = false; + + if (work_type != work_spawned) { + // Local memory fence here and below is required to avoid missed wakeups; see the comment below. + // Starvation resistant tasks require concurrency, so missed wakeups are unacceptable. + atomic_fence_seq_cst(); + } + + if (work_type == work_enqueued && my_num_slots > my_num_reserved_slots) { + is_mandatory_needed = my_mandatory_concurrency.test_and_set(); + } + + // Double-check idiom that, in case of spawning, is deliberately sloppy about memory fences. + // Technically, to avoid missed wakeups, there should be a full memory fence between the point we + // released the task pool (i.e. spawned task) and read the arena's state. However, adding such a + // fence might hurt overall performance more than it helps, because the fence would be executed + // on every task pool release, even when stealing does not occur. Since TBB allows parallelism, + // but never promises parallelism, the missed wakeup is not a correctness problem. + are_workers_needed = my_pool_state.test_and_set(); + + if (is_mandatory_needed || are_workers_needed) { + int mandatory_delta = is_mandatory_needed ? 1 : 0; + int workers_delta = are_workers_needed ? my_max_num_workers : 0; + + if (is_mandatory_needed && is_arena_workerless()) { + // Set workers_delta to 1 to keep arena invariants consistent + workers_delta = 1; + } + + bool wakeup_workers = is_mandatory_needed || are_workers_needed; + request_workers(mandatory_delta, workers_delta, wakeup_workers); + } +} + +inline d1::task* arena::steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation) { + auto slot_num_limit = my_limit.load(std::memory_order_relaxed); + if (slot_num_limit == 1) { + // No slots to steal from + return nullptr; + } + // Try to steal a task from a random victim. + std::size_t k = frnd.get() % (slot_num_limit - 1); + // The following condition excludes the external thread that might have + // already taken our previous place in the arena from the list . + // of potential victims. But since such a situation can take + // place only in case of significant oversubscription, keeping + // the checks simple seems to be preferable to complicating the code. 
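
The stealing path above draws a random index over my_limit - 1 candidates and then shifts the result past the thief's own slot (the ++k adjustment just below), so the distribution over the other slots stays uniform. A standalone sketch of that adjustment, with std::mt19937 standing in for TBB's FastRandom:

#include <cstddef>
#include <random>

// Returns an index in [0, slot_limit) that is never equal to self.
// Assumes slot_limit >= 2 and self < slot_limit, mirroring the early return
// the real code takes when my_limit == 1.
std::size_t pick_victim(std::mt19937& rng, std::size_t slot_limit, std::size_t self) {
    std::uniform_int_distribution<std::size_t> dist(0, slot_limit - 2);
    std::size_t k = dist(rng);   // uniform over slot_limit - 1 candidates
    if (k >= self) ++k;          // shift past our own slot; stays uniform over the rest
    return k;
}
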
+ if (k >= arena_index) { + ++k; // Adjusts random distribution to exclude self + } + arena_slot* victim = &my_slots[k]; + d1::task **pool = victim->task_pool.load(std::memory_order_relaxed); + d1::task *t = nullptr; + if (pool == EmptyTaskPool || !(t = victim->steal_task(*this, isolation, k))) { + return nullptr; + } + if (task_accessor::is_proxy_task(*t)) { + task_proxy &tp = *(task_proxy*)t; + d1::slot_id slot = tp.slot; + t = tp.extract_task(); + if (!t) { + // Proxy was empty, so it's our responsibility to free it + tp.allocator.delete_object(&tp, ed); + return nullptr; + } + // Note affinity is called for any stolen task (proxy or general) + ed.affinity_slot = slot; + } else { + // Note affinity is called for any stolen task (proxy or general) + ed.affinity_slot = d1::any_slot; + } + // Update task owner thread id to identify stealing + ed.original_slot = k; + return t; +} + +template +inline d1::task* arena::get_stream_task(task_stream& stream, unsigned& hint) { + if (stream.empty()) + return nullptr; + return stream.pop(subsequent_lane_selector(hint)); +} + +#if __TBB_PREVIEW_CRITICAL_TASKS +// Retrieves critical task respecting isolation level, if provided. The rule is: +// 1) If no outer critical task and no isolation => take any critical task +// 2) If working on an outer critical task and no isolation => cannot take any critical task +// 3) If no outer critical task but isolated => respect isolation +// 4) If working on an outer critical task and isolated => respect isolation +// Hint is used to keep some LIFO-ness, start search with the lane that was used during push operation. +inline d1::task* arena::get_critical_task(unsigned& hint, isolation_type isolation) { + if (my_critical_task_stream.empty()) + return nullptr; + + if ( isolation != no_isolation ) { + return my_critical_task_stream.pop_specific( hint, isolation ); + } else { + return my_critical_task_stream.pop(preceding_lane_selector(hint)); + } +} +#endif // __TBB_PREVIEW_CRITICAL_TASKS + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_arena_H */ diff --git a/third_party/tbb/arena_slot.cpp b/third_party/tbb/arena_slot.cpp new file mode 100644 index 000000000..8c10cf071 --- /dev/null +++ b/third_party/tbb/arena_slot.cpp @@ -0,0 +1,219 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/arena_slot.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/thread_data.h" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// Arena Slot +//------------------------------------------------------------------------ +d1::task* arena_slot::get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation) { + __TBB_ASSERT(tail.load(std::memory_order_relaxed) <= T || is_local_task_pool_quiescent(), + "Is it safe to get a task at position T?"); + + d1::task* result = task_pool_ptr[T]; + __TBB_ASSERT(!is_poisoned( result ), "The poisoned task is going to be processed"); + + if (!result) { + return nullptr; + } + bool omit = isolation != no_isolation && isolation != task_accessor::isolation(*result); + if (!omit && !task_accessor::is_proxy_task(*result)) { + return result; + } else if (omit) { + tasks_omitted = true; + return nullptr; + } + + task_proxy& tp = static_cast(*result); + d1::slot_id aff_id = tp.slot; + if ( d1::task *t = tp.extract_task() ) { + ed.affinity_slot = aff_id; + return t; + } + // Proxy was empty, so it's our responsibility to free it + tp.allocator.delete_object(&tp, ed); + + if ( tasks_omitted ) { + task_pool_ptr[T] = nullptr; + } + return nullptr; +} + +d1::task* arena_slot::get_task(execution_data_ext& ed, isolation_type isolation) { + __TBB_ASSERT(is_task_pool_published(), nullptr); + // The current task position in the task pool. + std::size_t T0 = tail.load(std::memory_order_relaxed); + // The bounds of available tasks in the task pool. H0 is only used when the head bound is reached. + std::size_t H0 = (std::size_t)-1, T = T0; + d1::task* result = nullptr; + bool task_pool_empty = false; + bool tasks_omitted = false; + do { + __TBB_ASSERT( !result, nullptr ); + // The full fence is required to sync the store of `tail` with the load of `head` (write-read barrier) + T = --tail; + // The acquire load of head is required to guarantee consistency of our task pool + // when a thief rolls back the head. + if ( (std::intptr_t)( head.load(std::memory_order_acquire) ) > (std::intptr_t)T ) { + acquire_task_pool(); + H0 = head.load(std::memory_order_relaxed); + if ( (std::intptr_t)H0 > (std::intptr_t)T ) { + // The thief has not backed off - nothing to grab. + __TBB_ASSERT( H0 == head.load(std::memory_order_relaxed) + && T == tail.load(std::memory_order_relaxed) + && H0 == T + 1, "victim/thief arbitration algorithm failure" ); + reset_task_pool_and_leave(); + // No tasks in the task pool. + task_pool_empty = true; + break; + } else if ( H0 == T ) { + // There is only one task in the task pool. + reset_task_pool_and_leave(); + task_pool_empty = true; + } else { + // Release task pool if there are still some tasks. + // After the release, the tail will be less than T, thus a thief + // will not attempt to get a task at position T. + release_task_pool(); + } + } + result = get_task_impl( T, ed, tasks_omitted, isolation ); + if ( result ) { + poison_pointer( task_pool_ptr[T] ); + break; + } else if ( !tasks_omitted ) { + poison_pointer( task_pool_ptr[T] ); + __TBB_ASSERT( T0 == T+1, nullptr ); + T0 = T; + } + } while ( !result && !task_pool_empty ); + + if ( tasks_omitted ) { + if ( task_pool_empty ) { + // All tasks have been checked. The task pool should be in reset state. + // We just restore the bounds for the available tasks. + // TODO: Does it have sense to move them to the beginning of the task pool? 
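
get_task() above is the owner's side of a head/tail deque: the owner pops from the tail, thieves advance the head, and the pool is only locked when the two indices collide. A much-simplified model that keeps just the bookkeeping and uses a single mutex throughout (the real protocol is optimistic and mostly lock-free; task is an opaque stand-in):

#include <cstddef>
#include <mutex>
#include <vector>

struct task;   // opaque stand-in for d1::task

class simple_task_pool {
    std::vector<task*> pool_;
    std::size_t head_ = 0;   // next index a thief would take
    std::size_t tail_ = 0;   // one past the last task the owner pushed
    std::mutex  mtx_;
public:
    void push(task* t) {                       // owner only
        std::lock_guard<std::mutex> lock(mtx_);
        if (tail_ == pool_.size()) pool_.push_back(t); else pool_[tail_] = t;
        ++tail_;
    }
    task* pop() {                              // owner only: LIFO end
        std::lock_guard<std::mutex> lock(mtx_);
        if (head_ == tail_) { head_ = tail_ = 0; return nullptr; }  // empty: reset bounds
        return pool_[--tail_];
    }
    task* steal() {                            // thieves: FIFO end
        std::lock_guard<std::mutex> lock(mtx_);
        if (head_ == tail_) return nullptr;
        return pool_[head_++];
    }
};
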
+ __TBB_ASSERT( is_quiescent_local_task_pool_reset(), nullptr ); + if ( result ) { + // If we have a task, it should be at H0 position. + __TBB_ASSERT( H0 == T, nullptr ); + ++H0; + } + __TBB_ASSERT( H0 <= T0, nullptr ); + if ( H0 < T0 ) { + // Restore the task pool if there are some tasks. + head.store(H0, std::memory_order_relaxed); + tail.store(T0, std::memory_order_relaxed); + // The release fence is used in publish_task_pool. + publish_task_pool(); + // Synchronize with snapshot as we published some tasks. + ed.task_disp->m_thread_data->my_arena->advertise_new_work(); + } + } else { + // A task has been obtained. We need to make a hole in position T. + __TBB_ASSERT( is_task_pool_published(), nullptr ); + __TBB_ASSERT( result, nullptr ); + task_pool_ptr[T] = nullptr; + tail.store(T0, std::memory_order_release); + // Synchronize with snapshot as we published some tasks. + // TODO: consider some approach not to call wakeup for each time. E.g. check if the tail reached the head. + ed.task_disp->m_thread_data->my_arena->advertise_new_work(); + } + } + + __TBB_ASSERT( (std::intptr_t)tail.load(std::memory_order_relaxed) >= 0, nullptr ); + __TBB_ASSERT( result || tasks_omitted || is_quiescent_local_task_pool_reset(), nullptr ); + return result; +} + +d1::task* arena_slot::steal_task(arena& a, isolation_type isolation, std::size_t slot_index) { + d1::task** victim_pool = lock_task_pool(); + if (!victim_pool) { + return nullptr; + } + d1::task* result = nullptr; + std::size_t H = head.load(std::memory_order_relaxed); // mirror + std::size_t H0 = H; + bool tasks_omitted = false; + do { + // The full fence is required to sync the store of `head` with the load of `tail` (write-read barrier) + H = ++head; + // The acquire load of tail is required to guarantee consistency of victim_pool + // because the owner synchronizes task spawning via tail. + if ((std::intptr_t)H > (std::intptr_t)(tail.load(std::memory_order_acquire))) { + // Stealing attempt failed, deque contents has not been changed by us + head.store( /*dead: H = */ H0, std::memory_order_relaxed ); + __TBB_ASSERT( !result, nullptr ); + goto unlock; + } + result = victim_pool[H-1]; + __TBB_ASSERT( !is_poisoned( result ), nullptr ); + + if (result) { + if (isolation == no_isolation || isolation == task_accessor::isolation(*result)) { + if (!task_accessor::is_proxy_task(*result)) { + break; + } + task_proxy& tp = *static_cast(result); + // If mailed task is likely to be grabbed by its destination thread, skip it. + if (!task_proxy::is_shared(tp.task_and_tag) || !tp.outbox->recipient_is_idle() || a.mailbox(slot_index).recipient_is_idle()) { + break; + } + } + // The task cannot be executed either due to isolation or proxy constraints. + result = nullptr; + tasks_omitted = true; + } else if (!tasks_omitted) { + // Cleanup the task pool from holes until a task is skipped. + __TBB_ASSERT( H0 == H-1, nullptr ); + poison_pointer( victim_pool[H0] ); + H0 = H; + } + } while (!result); + __TBB_ASSERT( result, nullptr ); + + // emit "task was consumed" signal + poison_pointer( victim_pool[H-1] ); + if (tasks_omitted) { + // Some proxies in the task pool have been omitted. Set the stolen task to nullptr. + victim_pool[H-1] = nullptr; + // The release store synchronizes the victim_pool update(the store of nullptr). 
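
The thief's loop above also filters by isolation: under a non-trivial isolation tag only tasks carrying the same tag may be taken, and everything else is recorded as omitted. A minimal sketch of that filter over a flat list of candidates (task_info and the integer encoding of no_isolation are assumptions of the sketch):

#include <cstdint>
#include <vector>

using isolation_tag = std::intptr_t;
constexpr isolation_tag no_isolation = 0;

struct task_info {
    int           id;
    isolation_tag isolation;
};

// Returns the first candidate the thief is allowed to take, or nullptr.
const task_info* first_allowed(const std::vector<task_info>& candidates,
                               isolation_tag thief_isolation) {
    for (const task_info& t : candidates) {
        // Same rule as above: no isolation, or the tags match.
        if (thief_isolation == no_isolation || thief_isolation == t.isolation)
            return &t;
    }
    return nullptr;   // everything was skipped, as in the tasks_omitted path
}
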
+ head.store( /*dead: H = */ H0, std::memory_order_release ); + } +unlock: + unlock_task_pool(victim_pool); + +#if __TBB_PREFETCHING + __TBB_cl_evict(&victim_slot.head); + __TBB_cl_evict(&victim_slot.tail); +#endif + if (tasks_omitted) { + // Synchronize with snapshot as the head and tail can be bumped which can falsely trigger EMPTY state + a.advertise_new_work(); + } + return result; +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/arena_slot.h b/third_party/tbb/arena_slot.h new file mode 100644 index 000000000..3f18342c5 --- /dev/null +++ b/third_party/tbb/arena_slot.h @@ -0,0 +1,415 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_arena_slot_H +#define _TBB_arena_slot_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_task.h" + +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/tbb/misc.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/scheduler_common.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class task_group_context; + +//-------------------------------------------------------------------------------------------------------- +// Arena Slot +//-------------------------------------------------------------------------------------------------------- + +static d1::task** const EmptyTaskPool = nullptr; +static d1::task** const LockedTaskPool = reinterpret_cast(~std::intptr_t(0)); + +struct alignas(max_nfs_size) arena_slot_shared_state { + //! Scheduler of the thread attached to the slot + /** Marks the slot as busy, and is used to iterate through the schedulers belonging to this arena **/ + std::atomic my_is_occupied; + + // Synchronization of access to Task pool + /** Also is used to specify if the slot is empty or locked: + 0 - empty + -1 - locked **/ + std::atomic task_pool; + + //! Index of the first ready task in the deque. + /** Modified by thieves, and by the owner during compaction/reallocation **/ + std::atomic head; +}; + +struct alignas(max_nfs_size) arena_slot_private_state { + //! Hint provided for operations with the container of starvation-resistant tasks. + /** Modified by the owner thread (during these operations). **/ + unsigned hint_for_fifo_stream; + +#if __TBB_PREVIEW_CRITICAL_TASKS + //! Similar to 'hint_for_fifo_stream' but for critical tasks. + unsigned hint_for_critical_stream; +#endif + + //! Similar to 'hint_for_fifo_stream' but for the resume tasks. + unsigned hint_for_resume_stream; + + //! Index of the element following the last ready task in the deque. + /** Modified by the owner thread. **/ + std::atomic tail; + + //! Capacity of the primary task pool (number of elements - pointers to task). + std::size_t my_task_pool_size; + + //! 
Task pool of the scheduler that owns this slot + // TODO: previously was task**__TBB_atomic, but seems like not accessed on other thread + d1::task** task_pool_ptr; +}; + +class arena_slot : private arena_slot_shared_state, private arena_slot_private_state { + friend class arena; + friend class outermost_worker_waiter; + friend class task_dispatcher; + friend class thread_data; + friend class nested_arena_context; + + //! The original task dispather associated with this slot + task_dispatcher* my_default_task_dispatcher; + +#if TBB_USE_ASSERT + void fill_with_canary_pattern ( std::size_t first, std::size_t last ) { + for ( std::size_t i = first; i < last; ++i ) + poison_pointer(task_pool_ptr[i]); + } +#else + void fill_with_canary_pattern ( size_t, std::size_t ) {} +#endif /* TBB_USE_ASSERT */ + + static constexpr std::size_t min_task_pool_size = 64; + + void allocate_task_pool( std::size_t n ) { + std::size_t byte_size = ((n * sizeof(d1::task*) + max_nfs_size - 1) / max_nfs_size) * max_nfs_size; + my_task_pool_size = byte_size / sizeof(d1::task*); + task_pool_ptr = (d1::task**)cache_aligned_allocate(byte_size); + // No need to clear the fresh deque since valid items are designated by the head and tail members. + // But fill it with a canary pattern in the high vigilance debug mode. + fill_with_canary_pattern( 0, my_task_pool_size ); + } + +public: + //! Deallocate task pool that was allocated by means of allocate_task_pool. + void free_task_pool( ) { + // TODO: understand the assertion and modify + // __TBB_ASSERT( !task_pool /* TODO: == EmptyTaskPool */, nullptr); + if( task_pool_ptr ) { + __TBB_ASSERT( my_task_pool_size, nullptr); + cache_aligned_deallocate( task_pool_ptr ); + task_pool_ptr = nullptr; + my_task_pool_size = 0; + } + } + + //! Get a task from the local pool. + /** Called only by the pool owner. + Returns the pointer to the task or nullptr if a suitable task is not found. + Resets the pool if it is empty. **/ + d1::task* get_task(execution_data_ext&, isolation_type); + + //! Steal task from slot's ready pool + d1::task* steal_task(arena&, isolation_type, std::size_t); + + //! Some thread is now the owner of this slot + void occupy() { + __TBB_ASSERT(!my_is_occupied.load(std::memory_order_relaxed), nullptr); + my_is_occupied.store(true, std::memory_order_release); + } + + //! Try to occupy the slot + bool try_occupy() { + return !is_occupied() && my_is_occupied.exchange(true) == false; + } + + //! Some thread is now the owner of this slot + void release() { + __TBB_ASSERT(my_is_occupied.load(std::memory_order_relaxed), nullptr); + my_is_occupied.store(false, std::memory_order_release); + } + + //! 
Spawn newly created tasks + void spawn(d1::task& t) { + std::size_t T = prepare_task_pool(1); + __TBB_ASSERT(is_poisoned(task_pool_ptr[T]), nullptr); + task_pool_ptr[T] = &t; + commit_spawned_tasks(T + 1); + if (!is_task_pool_published()) { + publish_task_pool(); + } + } + + bool is_task_pool_published() const { + return task_pool.load(std::memory_order_relaxed) != EmptyTaskPool; + } + + bool is_empty() const { + return task_pool.load(std::memory_order_relaxed) == EmptyTaskPool || + head.load(std::memory_order_relaxed) >= tail.load(std::memory_order_relaxed); + } + + bool is_occupied() const { + return my_is_occupied.load(std::memory_order_relaxed); + } + + task_dispatcher& default_task_dispatcher() { + __TBB_ASSERT(my_default_task_dispatcher != nullptr, nullptr); + return *my_default_task_dispatcher; + } + + void init_task_streams(unsigned h) { + hint_for_fifo_stream = h; +#if __TBB_RESUMABLE_TASKS + hint_for_resume_stream = h; +#endif +#if __TBB_PREVIEW_CRITICAL_TASKS + hint_for_critical_stream = h; +#endif + } + +#if __TBB_PREVIEW_CRITICAL_TASKS + unsigned& critical_hint() { + return hint_for_critical_stream; + } +#endif +private: + //! Get a task from the local pool at specified location T. + /** Returns the pointer to the task or nullptr if the task cannot be executed, + e.g. proxy has been deallocated or isolation constraint is not met. + tasks_omitted tells if some tasks have been omitted. + Called only by the pool owner. The caller should guarantee that the + position T is not available for a thief. **/ + d1::task* get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation); + + //! Makes sure that the task pool can accommodate at least n more elements + /** If necessary relocates existing task pointers or grows the ready task deque. + * Returns (possible updated) tail index (not accounting for n). **/ + std::size_t prepare_task_pool(std::size_t num_tasks) { + std::size_t T = tail.load(std::memory_order_relaxed); // mirror + if ( T + num_tasks <= my_task_pool_size ) { + return T; + } + + std::size_t new_size = num_tasks; + if ( !my_task_pool_size ) { + __TBB_ASSERT( !is_task_pool_published() && is_quiescent_local_task_pool_reset(), nullptr); + __TBB_ASSERT( !task_pool_ptr, nullptr); + if ( num_tasks < min_task_pool_size ) new_size = min_task_pool_size; + allocate_task_pool( new_size ); + return 0; + } + acquire_task_pool(); + std::size_t H = head.load(std::memory_order_relaxed); // mirror + d1::task** new_task_pool = task_pool_ptr; + __TBB_ASSERT( my_task_pool_size >= min_task_pool_size, nullptr); + // Count not skipped tasks. Consider using std::count_if. + for ( std::size_t i = H; i < T; ++i ) + if ( new_task_pool[i] ) ++new_size; + // If the free space at the beginning of the task pool is too short, we + // are likely facing a pathological single-producer-multiple-consumers + // scenario, and thus it's better to expand the task pool + bool allocate = new_size > my_task_pool_size - min_task_pool_size/4; + if ( allocate ) { + // Grow task pool. As this operation is rare, and its cost is asymptotically + // amortizable, we can tolerate new task pool allocation done under the lock. + if ( new_size < 2 * my_task_pool_size ) + new_size = 2 * my_task_pool_size; + allocate_task_pool( new_size ); // updates my_task_pool_size + } + // Filter out skipped tasks. Consider using std::copy_if. 
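
prepare_task_pool() (its compaction loop continues just below) combines two policies: squeeze out the nullptr holes left by omitted tasks, and grow the pool geometrically when compaction alone would leave too little headroom. A single-threaded sketch of that policy over a std::vector, with hypothetical sizes and no locking:

#include <algorithm>
#include <cstddef>
#include <vector>

struct task;   // opaque stand-in for d1::task

// Ensures capacity for 'extra' more tasks; returns the new tail index.
std::size_t prepare_pool(std::vector<task*>& pool, std::size_t& head, std::size_t& tail,
                         std::size_t extra, std::size_t min_size = 64) {
    if (tail + extra <= pool.size()) return tail;          // enough headroom already

    // Count surviving tasks (holes are nullptr) plus the new ones we need room for.
    std::size_t needed = extra;
    for (std::size_t i = head; i < tail; ++i)
        if (pool[i]) ++needed;

    // If compaction alone would leave too little headroom, grow geometrically.
    bool grow = pool.empty() ||
                needed > pool.size() - std::min(pool.size(), min_size / 4);
    std::size_t new_size = grow ? std::max({needed, 2 * pool.size(), min_size})
                                : pool.size();

    std::vector<task*> fresh(new_size, nullptr);
    std::size_t t1 = 0;
    for (std::size_t i = head; i < tail; ++i)              // compact live tasks to the front
        if (pool[i]) fresh[t1++] = pool[i];
    pool.swap(fresh);
    head = 0;
    tail = t1;
    return t1;                                             // caller appends at the new tail
}
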
+ std::size_t T1 = 0; + for ( std::size_t i = H; i < T; ++i ) { + if ( new_task_pool[i] ) { + task_pool_ptr[T1++] = new_task_pool[i]; + } + } + // Deallocate the previous task pool if a new one has been allocated. + if ( allocate ) + cache_aligned_deallocate( new_task_pool ); + else + fill_with_canary_pattern( T1, tail ); + // Publish the new state. + commit_relocated_tasks( T1 ); + // assert_task_pool_valid(); + return T1; + } + + //! Makes newly spawned tasks visible to thieves + void commit_spawned_tasks(std::size_t new_tail) { + __TBB_ASSERT (new_tail <= my_task_pool_size, "task deque end was overwritten"); + // emit "task was released" signal + // Release fence is necessary to make sure that previously stored task pointers + // are visible to thieves. + tail.store(new_tail, std::memory_order_release); + } + + //! Used by workers to enter the task pool + /** Does not lock the task pool in case if arena slot has been successfully grabbed. **/ + void publish_task_pool() { + __TBB_ASSERT ( task_pool == EmptyTaskPool, "someone else grabbed my arena slot?" ); + __TBB_ASSERT ( head.load(std::memory_order_relaxed) < tail.load(std::memory_order_relaxed), + "entering arena without tasks to share" ); + // Release signal on behalf of previously spawned tasks (when this thread was not in arena yet) + task_pool.store(task_pool_ptr, std::memory_order_release ); + } + + //! Locks the local task pool + /** Garbles task_pool for the duration of the lock. Requires correctly set task_pool_ptr. + ATTENTION: This method is mostly the same as generic_scheduler::lock_task_pool(), with + a little different logic of slot state checks (slot is either locked or points + to our task pool). Thus if either of them is changed, consider changing the counterpart as well. **/ + void acquire_task_pool() { + if (!is_task_pool_published()) { + return; // we are not in arena - nothing to lock + } + bool sync_prepare_done = false; + for( atomic_backoff b;;b.pause() ) { +#if TBB_USE_ASSERT + // Local copy of the arena slot task pool pointer is necessary for the next + // assertion to work correctly to exclude asynchronous state transition effect. + d1::task** tp = task_pool.load(std::memory_order_relaxed); + __TBB_ASSERT( tp == LockedTaskPool || tp == task_pool_ptr, "slot ownership corrupt?" ); +#endif + d1::task** expected = task_pool_ptr; + if( task_pool.load(std::memory_order_relaxed) != LockedTaskPool && + task_pool.compare_exchange_strong(expected, LockedTaskPool ) ) { + // We acquired our own slot + break; + } else if( !sync_prepare_done ) { + // Start waiting + sync_prepare_done = true; + } + // Someone else acquired a lock, so pause and do exponential backoff. + } + __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "not really acquired task pool" ); + } + + //! Unlocks the local task pool + /** Restores task_pool munged by acquire_task_pool. Requires + correctly set task_pool_ptr. **/ + void release_task_pool() { + if ( !(task_pool.load(std::memory_order_relaxed) != EmptyTaskPool) ) + return; // we are not in arena - nothing to unlock + __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "arena slot is not locked" ); + task_pool.store( task_pool_ptr, std::memory_order_release ); + } + + //! Locks victim's task pool, and returns pointer to it. The pointer can be nullptr. + /** Garbles victim_arena_slot->task_pool for the duration of the lock. 
**/ + d1::task** lock_task_pool() { + d1::task** victim_task_pool; + for ( atomic_backoff backoff;; /*backoff pause embedded in the loop*/) { + victim_task_pool = task_pool.load(std::memory_order_relaxed); + // Microbenchmarks demonstrated that aborting stealing attempt when the + // victim's task pool is locked degrade performance. + // NOTE: Do not use comparison of head and tail indices to check for + // the presence of work in the victim's task pool, as they may give + // incorrect indication because of task pool relocations and resizes. + if (victim_task_pool == EmptyTaskPool) { + break; + } + d1::task** expected = victim_task_pool; + if (victim_task_pool != LockedTaskPool && task_pool.compare_exchange_strong(expected, LockedTaskPool) ) { + // We've locked victim's task pool + break; + } + // Someone else acquired a lock, so pause and do exponential backoff. + backoff.pause(); + } + __TBB_ASSERT(victim_task_pool == EmptyTaskPool || + (task_pool.load(std::memory_order_relaxed) == LockedTaskPool && + victim_task_pool != LockedTaskPool), "not really locked victim's task pool?"); + return victim_task_pool; + } + + //! Unlocks victim's task pool + /** Restores victim_arena_slot->task_pool munged by lock_task_pool. **/ + void unlock_task_pool(d1::task** victim_task_pool) { + __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "victim arena slot is not locked"); + __TBB_ASSERT(victim_task_pool != LockedTaskPool, nullptr); + task_pool.store(victim_task_pool, std::memory_order_release); + } + +#if TBB_USE_ASSERT + bool is_local_task_pool_quiescent() const { + d1::task** tp = task_pool.load(std::memory_order_relaxed); + return tp == EmptyTaskPool || tp == LockedTaskPool; + } + + bool is_quiescent_local_task_pool_empty() const { + __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent"); + return head.load(std::memory_order_relaxed) == tail.load(std::memory_order_relaxed); + } + + bool is_quiescent_local_task_pool_reset() const { + __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent"); + return head.load(std::memory_order_relaxed) == 0 && tail.load(std::memory_order_relaxed) == 0; + } +#endif // TBB_USE_ASSERT + + //! Leave the task pool + /** Leaving task pool automatically releases the task pool if it is locked. **/ + void leave_task_pool() { + __TBB_ASSERT(is_task_pool_published(), "Not in arena"); + // Do not reset my_arena_index. It will be used to (attempt to) re-acquire the slot next time + __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when leaving arena"); + __TBB_ASSERT(is_quiescent_local_task_pool_empty(), "Cannot leave arena when the task pool is not empty"); + // No release fence is necessary here as this assignment precludes external + // accesses to the local task pool when becomes visible. Thus it is harmless + // if it gets hoisted above preceding local bookkeeping manipulations. + task_pool.store(EmptyTaskPool, std::memory_order_relaxed); + } + + //! Resets head and tail indices to 0, and leaves task pool + /** The task pool must be locked by the owner (via acquire_task_pool).**/ + void reset_task_pool_and_leave() { + __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when resetting task pool"); + tail.store(0, std::memory_order_relaxed); + head.store(0, std::memory_order_relaxed); + leave_task_pool(); + } + + //! Makes relocated tasks visible to thieves and releases the local task pool. 
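
acquire_task_pool() and lock_task_pool() above are a small spin lock in disguise: the published task_pool pointer doubles as the lock word, and LockedTaskPool is a sentinel pointer value swapped in for the duration of the lock. A standalone sketch of the same idea with std::atomic and a crude backoff (the names and the backoff cap are choices of the sketch):

#include <atomic>
#include <cstdint>
#include <thread>

struct task;   // opaque stand-in

static task** const EmptyPool  = nullptr;
static task** const LockedPool = reinterpret_cast<task**>(~std::uintptr_t(0));

// Locks the pool and returns the pointer that was published there (may be EmptyPool,
// in which case there is nothing to lock).
task** lock_pool(std::atomic<task**>& pool) {
    for (int spins = 1;;) {
        task** observed = pool.load(std::memory_order_relaxed);
        if (observed == EmptyPool) return EmptyPool;
        if (observed != LockedPool &&
            pool.compare_exchange_strong(observed, LockedPool))     // swap in the sentinel
            return observed;
        for (int i = 0; i < spins; ++i) std::this_thread::yield();  // crude backoff
        if (spins < 1024) spins *= 2;
    }
}

void unlock_pool(std::atomic<task**>& pool, task** restored) {
    pool.store(restored, std::memory_order_release);                // republish the pointer
}
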
+ /** Obviously, the task pool must be locked when calling this method. **/ + void commit_relocated_tasks(std::size_t new_tail) { + __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool must be locked when calling commit_relocated_tasks()"); + head.store(0, std::memory_order_relaxed); + // Tail is updated last to minimize probability of a thread making arena + // snapshot being misguided into thinking that this task pool is empty. + tail.store(new_tail, std::memory_order_release); + release_task_pool(); + } +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_arena_slot_H diff --git a/third_party/tbb/assert_impl.h b/third_party/tbb/assert_impl.h new file mode 100644 index 000000000..c958d3a40 --- /dev/null +++ b/third_party/tbb/assert_impl.h @@ -0,0 +1,98 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_assert_impl_H +#define __TBB_assert_impl_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdarg" +#if _MSC_VER && _DEBUG +// MISSING #include +#endif + +#include "third_party/libcxx/mutex" + +#if __TBBMALLOC_BUILD +namespace rml { namespace internal { +#else +namespace tbb { +namespace detail { +namespace r1 { +#endif +// TODO: consider extension for formatted error description string +static void assertion_failure_impl(const char* location, int line, const char* expression, const char* comment) { + + std::fprintf(stderr, "Assertion %s failed (located in the %s function, line in file: %d)\n", + expression, location, line); + + if (comment) { + std::fprintf(stderr, "Detailed description: %s\n", comment); + } +#if _MSC_VER && _DEBUG + if (1 == _CrtDbgReport(_CRT_ASSERT, location, line, "tbb_debug.dll", "%s\r\n%s", expression, comment?comment:"")) { + _CrtDbgBreak(); + } else +#endif + { + std::fflush(stderr); + std::abort(); + } +} + +// Do not move the definition into the assertion_failure function because it will require "magic statics". 
+// It will bring a dependency on C++ runtime on some platforms while assert_impl.h is reused in tbbmalloc +// that should not depend on C++ runtime +static std::atomic assertion_state; + +void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment) { +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + // Workaround for erroneous "unreachable code" during assertion throwing using call_once + #pragma warning (push) + #pragma warning (disable: 4702) +#endif + // We cannot use std::call_once because it brings a dependency on C++ runtime on some platforms + // while assert_impl.h is reused in tbbmalloc that should not depend on C++ runtime + atomic_do_once([&](){ assertion_failure_impl(location, line, expression, comment); }, assertion_state); +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + #pragma warning (pop) +#endif +} + +//! Report a runtime warning. +void runtime_warning( const char* format, ... ) { + char str[1024]; std::memset(str, 0, 1024); + va_list args; va_start(args, format); + vsnprintf( str, 1024-1, format, args); + va_end(args); + fprintf(stderr, "TBB Warning: %s\n", str); +} + +#if __TBBMALLOC_BUILD +}} // namespaces rml::internal +#else +} // namespace r1 +} // namespace detail +} // namespace tbb +#endif + +#endif // __TBB_assert_impl_H + diff --git a/third_party/tbb/blocked_range.h b/third_party/tbb/blocked_range.h new file mode 100644 index 000000000..4f3041ab2 --- /dev/null +++ b/third_party/tbb/blocked_range.h @@ -0,0 +1,171 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_range_H +#define __TBB_blocked_range_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_namespace_injection.h" + +#include "third_party/tbb/version.h" + +namespace tbb { +namespace detail { +namespace d1 { + +/** \page range_req Requirements on range concept + Class \c R implementing the concept of range must define: + - \code R::R( const R& ); \endcode Copy constructor + - \code R::~R(); \endcode Destructor + - \code bool R::is_divisible() const; \endcode True if range can be partitioned into two subranges + - \code bool R::empty() const; \endcode True if range is empty + - \code R::R( R& r, split ); \endcode Split range \c r into two subranges. +**/ + +//! A range over which to iterate. +/** @ingroup algorithms */ +template + __TBB_requires(blocked_range_value) +class blocked_range { +public: + //! Type of a value + /** Called a const_iterator for sake of algorithms that need to treat a blocked_range + as an STL container. */ + using const_iterator = Value; + + //! Type for size of a range + using size_type = std::size_t; + + //! Construct range over half-open interval [begin,end), with the given grainsize. 
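
The Range requirements documented above (copy constructor, destructor, empty(), is_divisible(), and a splitting constructor taking a split tag) are all a user type needs in order to drive the parallel algorithms in this patch. A minimal custom range written against those requirements, halving itself on each split (a sketch, not part of the diff):

#include "third_party/tbb/blocked_range.h"   // brings in tbb::split
#include <cstddef>

class index_range {
    std::size_t begin_, end_, grain_;
public:
    index_range(std::size_t b, std::size_t e, std::size_t grain = 1)
        : begin_(b), end_(e), grain_(grain) {}

    bool empty() const { return begin_ >= end_; }
    bool is_divisible() const { return end_ - begin_ > grain_; }

    // Splitting constructor: takes the upper half of r and leaves the lower half in r.
    index_range(index_range& r, tbb::split)
        : begin_(r.begin_ + (r.end_ - r.begin_) / 2), end_(r.end_), grain_(r.grain_) {
        r.end_ = begin_;
    }

    std::size_t begin() const { return begin_; }
    std::size_t end() const { return end_; }
};
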
+ blocked_range( Value begin_, Value end_, size_type grainsize_=1 ) : + my_end(end_), my_begin(begin_), my_grainsize(grainsize_) + { + __TBB_ASSERT( my_grainsize>0, "grainsize must be positive" ); + } + + //! Beginning of range. + const_iterator begin() const { return my_begin; } + + //! One past last value in range. + const_iterator end() const { return my_end; } + + //! Size of the range + /** Unspecified if end() + __TBB_requires(blocked_range_value && + blocked_range_value) + friend class blocked_range2d; + + template + __TBB_requires(blocked_range_value && + blocked_range_value && + blocked_range_value) + friend class blocked_range3d; + + template + __TBB_requires(blocked_range_value) + friend class blocked_rangeNd_impl; +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_range; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_blocked_range_H */ diff --git a/third_party/tbb/blocked_range2d.h b/third_party/tbb/blocked_range2d.h new file mode 100644 index 000000000..e8f3df03e --- /dev/null +++ b/third_party/tbb/blocked_range2d.h @@ -0,0 +1,112 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_range2d_H +#define __TBB_blocked_range2d_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_range_common.h" + +#include "third_party/tbb/blocked_range.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A 2-dimensional range that models the Range concept. +/** @ingroup algorithms */ +template + __TBB_requires(blocked_range_value && + blocked_range_value) +class blocked_range2d { +public: + //! Type for size of an iteration range + using row_range_type = blocked_range; + using col_range_type = blocked_range; + +private: + row_range_type my_rows; + col_range_type my_cols; + +public: + blocked_range2d( RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize, + ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) : + my_rows(row_begin,row_end,row_grainsize), + my_cols(col_begin,col_end,col_grainsize) + {} + + blocked_range2d( RowValue row_begin, RowValue row_end, + ColValue col_begin, ColValue col_end ) : + my_rows(row_begin,row_end), + my_cols(col_begin,col_end) + {} + + //! True if range is empty + bool empty() const { + // Range is empty if at least one dimension is empty. + return my_rows.empty() || my_cols.empty(); + } + + //! True if range is divisible into two pieces. 
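In everyday use blocked_range and blocked_range2d are passed straight to the parallel algorithms rather than split by hand. A usage sketch follows, assuming tbb::parallel_for from the same library is available; the matrix shape and the grainsize of 16 are arbitrary choices.

    #include "third_party/tbb/blocked_range2d.h"
    #include "third_party/tbb/parallel_for.h"
    #include <cstddef>
    #include <vector>

    // Scale an nrows x ncols matrix in parallel. Each worker receives a 2-D
    // sub-range and walks its rows() and cols() pieces sequentially.
    inline void scale(std::vector<double>& m, int nrows, int ncols, double f) {
        tbb::parallel_for(
            tbb::blocked_range2d<int, int>(0, nrows, 16, 0, ncols, 16),
            [&](const tbb::blocked_range2d<int, int>& r) {
                for (int i = r.rows().begin(); i != r.rows().end(); ++i)
                    for (int j = r.cols().begin(); j != r.cols().end(); ++j)
                        m[static_cast<std::size_t>(i) * ncols + j] *= f;
            });
    }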
+ bool is_divisible() const { + return my_rows.is_divisible() || my_cols.is_divisible(); + } + + blocked_range2d( blocked_range2d& r, split ) : + my_rows(r.my_rows), + my_cols(r.my_cols) + { + split split_obj; + do_split(r, split_obj); + } + + blocked_range2d( blocked_range2d& r, proportional_split& proportion ) : + my_rows(r.my_rows), + my_cols(r.my_cols) + { + do_split(r, proportion); + } + + //! The rows of the iteration space + const row_range_type& rows() const { return my_rows; } + + //! The columns of the iteration space + const col_range_type& cols() const { return my_cols; } + +private: + template + void do_split( blocked_range2d& r, Split& split_obj ) { + if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) { + my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); + } else { + my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj); + } + } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_range2d; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_blocked_range2d_H */ diff --git a/third_party/tbb/blocked_range3d.h b/third_party/tbb/blocked_range3d.h new file mode 100644 index 000000000..dd5e2312f --- /dev/null +++ b/third_party/tbb/blocked_range3d.h @@ -0,0 +1,131 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_range3d_H +#define __TBB_blocked_range3d_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" + +#include "third_party/tbb/blocked_range.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A 3-dimensional range that models the Range concept. +/** @ingroup algorithms */ +template + __TBB_requires(blocked_range_value && + blocked_range_value && + blocked_range_value) +class blocked_range3d { +public: + //! Type for size of an iteration range + using page_range_type = blocked_range; + using row_range_type = blocked_range; + using col_range_type = blocked_range; + +private: + page_range_type my_pages; + row_range_type my_rows; + col_range_type my_cols; + +public: + + blocked_range3d( PageValue page_begin, PageValue page_end, + RowValue row_begin, RowValue row_end, + ColValue col_begin, ColValue col_end ) : + my_pages(page_begin,page_end), + my_rows(row_begin,row_end), + my_cols(col_begin,col_end) + {} + + blocked_range3d( PageValue page_begin, PageValue page_end, typename page_range_type::size_type page_grainsize, + RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize, + ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) : + my_pages(page_begin,page_end,page_grainsize), + my_rows(row_begin,row_end,row_grainsize), + my_cols(col_begin,col_end,col_grainsize) + {} + + //! True if range is empty + bool empty() const { + // Range is empty if at least one dimension is empty. 
+ return my_pages.empty() || my_rows.empty() || my_cols.empty(); + } + + //! True if range is divisible into two pieces. + bool is_divisible() const { + return my_pages.is_divisible() || my_rows.is_divisible() || my_cols.is_divisible(); + } + + blocked_range3d( blocked_range3d& r, split split_obj ) : + my_pages(r.my_pages), + my_rows(r.my_rows), + my_cols(r.my_cols) + { + do_split(r, split_obj); + } + + blocked_range3d( blocked_range3d& r, proportional_split& proportion ) : + my_pages(r.my_pages), + my_rows(r.my_rows), + my_cols(r.my_cols) + { + do_split(r, proportion); + } + + //! The pages of the iteration space + const page_range_type& pages() const { return my_pages; } + + //! The rows of the iteration space + const row_range_type& rows() const { return my_rows; } + + //! The columns of the iteration space + const col_range_type& cols() const { return my_cols; } + +private: + template + void do_split( blocked_range3d& r, Split& split_obj) { + if ( my_pages.size()*double(my_rows.grainsize()) < my_rows.size()*double(my_pages.grainsize()) ) { + if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) { + my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); + } else { + my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj); + } + } else { + if ( my_pages.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_pages.grainsize()) ) { + my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); + } else { + my_pages.my_begin = page_range_type::do_split(r.my_pages, split_obj); + } + } + } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_range3d; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_blocked_range3d_H */ diff --git a/third_party/tbb/blocked_rangeNd.h b/third_party/tbb/blocked_rangeNd.h new file mode 100644 index 000000000..3b48046de --- /dev/null +++ b/third_party/tbb/blocked_rangeNd.h @@ -0,0 +1,148 @@ +// clang-format off +/* + Copyright (c) 2017-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_blocked_rangeNd_H +#define __TBB_blocked_rangeNd_H + +#if !TBB_PREVIEW_BLOCKED_RANGE_ND + #error Set TBB_PREVIEW_BLOCKED_RANGE_ND to include blocked_rangeNd.h +#endif + +#include "third_party/libcxx/algorithm" // std::any_of +#include "third_party/libcxx/array" +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/type_traits" // std::is_same, std::enable_if + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" // index_sequence, make_index_sequence +#include "third_party/tbb/detail/_range_common.h" + +#include "third_party/tbb/blocked_range.h" + +namespace tbb { +namespace detail { +namespace d1 { + +/* + The blocked_rangeNd_impl uses make_index_sequence to automatically generate a ctor with + exactly N arguments of the type tbb::blocked_range. Such ctor provides an opportunity + to use braced-init-list parameters to initialize each dimension. 
+ Use of parameters, whose representation is a braced-init-list, but they're not + std::initializer_list or a reference to one, produces a non-deduced context + within template argument deduction. + + NOTE: blocked_rangeNd must be exactly a templated alias to the blocked_rangeNd_impl + (and not e.g. a derived class), otherwise it would need to declare its own ctor + facing the same problem that the impl class solves. +*/ + +template> + __TBB_requires(blocked_range_value) +class blocked_rangeNd_impl; + +template + __TBB_requires(blocked_range_value) +class blocked_rangeNd_impl> { +public: + //! Type of a value. + using value_type = Value; + +private: + //! Helper type to construct range with N tbb::blocked_range objects. + template + using dim_type_helper = tbb::blocked_range; + +public: + blocked_rangeNd_impl() = delete; + + //! Constructs N-dimensional range over N half-open intervals each represented as tbb::blocked_range. + blocked_rangeNd_impl(const dim_type_helper&... args) : my_dims{ {args...} } {} + + //! Dimensionality of a range. + static constexpr unsigned int ndims() { return N; } + + //! Range in certain dimension. + const tbb::blocked_range& dim(unsigned int dimension) const { + __TBB_ASSERT(dimension < N, "out of bound"); + return my_dims[dimension]; + } + + //------------------------------------------------------------------------ + // Methods that implement Range concept + //------------------------------------------------------------------------ + + //! True if at least one dimension is empty. + bool empty() const { + return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { + return d.empty(); + }); + } + + //! True if at least one dimension is divisible. + bool is_divisible() const { + return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { + return d.is_divisible(); + }); + } + + blocked_rangeNd_impl(blocked_rangeNd_impl& r, proportional_split proportion) : my_dims(r.my_dims) { + do_split(r, proportion); + } + + blocked_rangeNd_impl(blocked_rangeNd_impl& r, split proportion) : my_dims(r.my_dims) { + do_split(r, proportion); + } + +private: + static_assert(N != 0, "zero dimensional blocked_rangeNd can't be constructed"); + + //! Ranges in each dimension. 
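In user code the braced-init-list construction described above looks like the sketch below, assuming TBB_PREVIEW_BLOCKED_RANGE_ND is defined before inclusion; the extents and the grainsize of 8 on the middle dimension are arbitrary.

    #define TBB_PREVIEW_BLOCKED_RANGE_ND 1
    #include "third_party/tbb/blocked_rangeNd.h"

    // Each braced pair or triple converts to a tbb::blocked_range<int>, which is
    // exactly the non-deduced-context trick the constructor relies on.
    tbb::blocked_rangeNd<int, 3> volume{{0, 128}, {0, 64, /*grainsize*/ 8}, {0, 32}};

    static_assert(decltype(volume)::ndims() == 3, "three dimensions");
    // volume.dim(0).size() == 128, volume.dim(1).grainsize() == 8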
+ std::array, N> my_dims; + + template + void do_split(blocked_rangeNd_impl& r, split_type proportion) { + static_assert((std::is_same::value || std::is_same::value), "type of split object is incorrect"); + __TBB_ASSERT(r.is_divisible(), "can't split not divisible range"); + + auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& first, const tbb::blocked_range& second) { + return (first.size() * second.grainsize() < second.size() * first.grainsize()); + }); + + auto r_it = r.my_dims.begin() + (my_it - my_dims.begin()); + + my_it->my_begin = tbb::blocked_range::do_split(*r_it, proportion); + + // (!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin)) equals to + // (my_it->my_begin == r_it->my_end), but we can't use operator== due to Value concept + __TBB_ASSERT(!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin), + "blocked_range has been split incorrectly"); + } +}; + +template +using blocked_rangeNd = blocked_rangeNd_impl; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::blocked_rangeNd; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_blocked_rangeNd_H */ + diff --git a/third_party/tbb/cache_aligned_allocator.h b/third_party/tbb/cache_aligned_allocator.h new file mode 100644 index 000000000..0e79b238b --- /dev/null +++ b/third_party/tbb/cache_aligned_allocator.h @@ -0,0 +1,190 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_cache_aligned_allocator_H +#define __TBB_cache_aligned_allocator_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/utility" + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +// MISSING #include +#endif + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size); +TBB_EXPORT void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p); +TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC cache_line_size(); +} + +namespace d1 { + +template +class cache_aligned_allocator { +public: + using value_type = T; + using propagate_on_container_move_assignment = std::true_type; + + //! Always defined for TBB containers (supported since C++17 for std containers) + using is_always_equal = std::true_type; + + cache_aligned_allocator() = default; + template cache_aligned_allocator(const cache_aligned_allocator&) noexcept {} + + //! Allocate space for n objects, starting on a cache/sector line. + __TBB_nodiscard T* allocate(std::size_t n) { + return static_cast(r1::cache_aligned_allocate(n * sizeof(value_type))); + } + + //! Free block of memory that starts on a cache line + void deallocate(T* p, std::size_t) { + r1::cache_aligned_deallocate(p); + } + + //! Largest value for which method allocate might succeed. 
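A usage sketch for the allocator: it guarantees only that each allocation starts on a cache line, so the element type below is additionally padded with alignas to keep neighbouring slots on separate lines. The figure 64 is a common line size used here for illustration; the allocator itself queries the real value at run time.

    #include "third_party/tbb/cache_aligned_allocator.h"
    #include <vector>

    // One counter per worker. The vector's buffer begins on a cache line and
    // every element occupies a full line, so concurrent writers that own
    // different slots do not false-share.
    struct alignas(64) padded_counter { long hits = 0; };

    std::vector<padded_counter, tbb::cache_aligned_allocator<padded_counter>>
        per_worker_hits(8);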
+ std::size_t max_size() const noexcept { + return (~std::size_t(0) - r1::cache_line_size()) / sizeof(value_type); + } + +#if TBB_ALLOCATOR_TRAITS_BROKEN + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using difference_type = std::ptrdiff_t; + using size_type = std::size_t; + template struct rebind { + using other = cache_aligned_allocator; + }; + template + void construct(U *p, Args&&... args) + { ::new (p) U(std::forward(args)...); } + void destroy(pointer p) { p->~value_type(); } + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } +#endif // TBB_ALLOCATOR_TRAITS_BROKEN +}; + +#if TBB_ALLOCATOR_TRAITS_BROKEN + template<> + class cache_aligned_allocator { + public: + using pointer = void*; + using const_pointer = const void*; + using value_type = void; + template struct rebind { + using other = cache_aligned_allocator; + }; + }; +#endif + +template +bool operator==(const cache_aligned_allocator&, const cache_aligned_allocator&) noexcept { return true; } + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=(const cache_aligned_allocator&, const cache_aligned_allocator&) noexcept { return false; } +#endif + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +//! C++17 memory resource wrapper to ensure cache line size alignment +class cache_aligned_resource : public std::pmr::memory_resource { +public: + cache_aligned_resource() : cache_aligned_resource(std::pmr::get_default_resource()) {} + explicit cache_aligned_resource(std::pmr::memory_resource* upstream) : m_upstream(upstream) {} + + std::pmr::memory_resource* upstream_resource() const { + return m_upstream; + } + +private: + //! We don't know what memory resource set. Use padding to guarantee alignment + void* do_allocate(std::size_t bytes, std::size_t alignment) override { + // TODO: make it common with tbb_allocator.cpp + std::size_t cache_line_alignment = correct_alignment(alignment); + std::size_t space = correct_size(bytes) + cache_line_alignment; + std::uintptr_t base = reinterpret_cast(m_upstream->allocate(space)); + __TBB_ASSERT(base != 0, "Upstream resource returned nullptr."); + + // Round up to the next cache line (align the base address) + std::uintptr_t result = (base + cache_line_alignment) & ~(cache_line_alignment - 1); + __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Can`t store a base pointer to the header"); + __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage"); + + // Record where block actually starts. 
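+        // Worked example with hypothetical numbers: on a machine with 64-byte
+        // cache lines, a request of bytes == 24 and alignment == 16 gives
+        // correct_size(24) == 24 and correct_alignment(16) == 64, so
+        // space == 24 + 64 == 88 bytes are taken from the upstream resource.
+        // If that block lands at base == 0x1008, then
+        // result == (0x1008 + 64) & ~63 == 0x1040, leaving 56 bytes of headroom
+        // in front of result: enough to stash the 8-byte base pointer just
+        // below it while still returning a cache-aligned block of 24 bytes.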
+ (reinterpret_cast(result))[-1] = base; + return reinterpret_cast(result); + } + + void do_deallocate(void* ptr, std::size_t bytes, std::size_t alignment) override { + if (ptr) { + // Recover where block actually starts + std::uintptr_t base = (reinterpret_cast(ptr))[-1]; + m_upstream->deallocate(reinterpret_cast(base), correct_size(bytes) + correct_alignment(alignment)); + } + } + + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { + if (this == &other) { return true; } +#if __TBB_USE_OPTIONAL_RTTI + const cache_aligned_resource* other_res = dynamic_cast(&other); + return other_res && (upstream_resource() == other_res->upstream_resource()); +#else + return false; +#endif + } + + std::size_t correct_alignment(std::size_t alignment) { + __TBB_ASSERT(tbb::detail::is_power_of_two(alignment), "Alignment is not a power of 2"); +#if __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT + std::size_t cache_line_size = std::hardware_destructive_interference_size; +#else + std::size_t cache_line_size = r1::cache_line_size(); +#endif + return alignment < cache_line_size ? cache_line_size : alignment; + } + + std::size_t correct_size(std::size_t bytes) { + // To handle the case, when small size requested. There could be not + // enough space to store the original pointer. + return bytes < sizeof(std::uintptr_t) ? sizeof(std::uintptr_t) : bytes; + } + + std::pmr::memory_resource* m_upstream; +}; + +#endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::cache_aligned_allocator; +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +using detail::d1::cache_aligned_resource; +#endif +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_cache_aligned_allocator_H */ + diff --git a/third_party/tbb/cancellation_disseminator.h b/third_party/tbb/cancellation_disseminator.h new file mode 100644 index 000000000..4ec879718 --- /dev/null +++ b/third_party/tbb/cancellation_disseminator.h @@ -0,0 +1,86 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_cancellation_disseminator_H +#define _TBB_cancellation_disseminator_H + +#include "third_party/tbb/mutex.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/tbb/intrusive_list.h" +#include "third_party/tbb/thread_data.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class cancellation_disseminator { +public: + //! Finds all contexts affected by the state change and propagates the new state to them. + /* The propagation is relayed to the cancellation_disseminator because tasks created by one + external thread can be passed to and executed by other external threads. This means + that context trees can span several arenas at once and thus state change + propagation cannot be generally localized to one arena only. 
+ */ + bool propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, d1::task_group_context& src, uint32_t new_state) { + if (src.my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) { + return true; + } + + // The whole propagation algorithm is under the lock in order to ensure correctness + // in case of concurrent state changes at the different levels of the context tree. + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + // TODO: consider to use double-check idiom + if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state) { + // Another thread has concurrently changed the state. Back down. + return false; + } + + // Advance global state propagation epoch + ++the_context_state_propagation_epoch; + // Propagate to all workers and external threads and sync up their local epochs with the global one + // The whole propagation sequence is locked, thus no contention is expected + for (auto& thr_data : my_threads_list) { + thr_data.propagate_task_group_state(mptr_state, src, new_state); + } + + return true; + } + + void register_thread(thread_data& td) { + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + my_threads_list.push_front(td); + } + + void unregister_thread(thread_data& td) { + threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex); + my_threads_list.remove(td); + } + +private: + using thread_data_list_type = intrusive_list; + using threads_list_mutex_type = d1::mutex; + + threads_list_mutex_type my_threads_list_mutex; + thread_data_list_type my_threads_list; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_cancellation_disseminator_H diff --git a/third_party/tbb/co_context.h b/third_party/tbb/co_context.h new file mode 100644 index 000000000..fe1ddaee2 --- /dev/null +++ b/third_party/tbb/co_context.h @@ -0,0 +1,428 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_co_context_H +#define _TBB_co_context_H + +#include "third_party/tbb/detail/_config.h" + +#if __TBB_RESUMABLE_TASKS + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" + +#if __TBB_RESUMABLE_TASKS_USE_THREADS + +#if _WIN32 || _WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#endif + +#include "third_party/libcxx/condition_variable" +#include "third_party/tbb/governor.h" + +#elif _WIN32 || _WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +// ucontext.h API is deprecated since macOS 10.6 +#if __APPLE__ + #if __INTEL_COMPILER + #pragma warning(push) + #pragma warning(disable:1478) + #elif __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wdeprecated-declarations" + #endif +#endif // __APPLE__ + +// MISSING #include +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/runtime.h" +#include "libc/sysv/consts/map.h" +#include "libc/sysv/consts/mlock.h" +#include "libc/sysv/consts/msync.h" +#include "libc/sysv/consts/posix.h" +#include "libc/sysv/consts/prot.h" +#include "libc/sysv/consts/madv.h" +#include "libc/sysv/consts/mfd.h" +#include "libc/sysv/consts/mremap.h" // mprotect + +#include "third_party/tbb/governor.h" // default_page_size() + +#ifndef MAP_STACK +// macOS* does not define MAP_STACK +#define MAP_STACK 0 +#endif +#ifndef MAP_ANONYMOUS +// macOS* defines MAP_ANON, which is deprecated in Linux*. +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif // _WIN32 || _WIN64 + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_RESUMABLE_TASKS_USE_THREADS + struct coroutine_type { +#if _WIN32 || _WIN64 + using handle_type = HANDLE; +#else + using handle_type = pthread_t; +#endif + + handle_type my_thread; + std::condition_variable my_condvar; + std::mutex my_mutex; + thread_data* my_thread_data{ nullptr }; + bool my_is_active{ true }; + }; +#elif _WIN32 || _WIN64 + typedef LPVOID coroutine_type; +#else + struct coroutine_type { + coroutine_type() : my_context(), my_stack(), my_stack_size() {} + ucontext_t my_context; + void* my_stack; + std::size_t my_stack_size; + }; +#endif + + // Forward declaration of the coroutine API. 
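For orientation, the sketch below shows the public resumable-tasks API that this coroutine machinery ultimately backs, assuming __TBB_RESUMABLE_TASKS is enabled and that tbb::task::suspend / tbb::task::resume are exposed by this tree's task.h; the async_engine type is hypothetical.

    #include "third_party/tbb/task.h"
    #include <thread>

    // Hypothetical asynchronous engine that invokes its callback on another thread.
    struct async_engine {
        template <typename Callback>
        void start(Callback cb) { std::thread(std::move(cb)).detach(); }
    };

    // Suspend the calling TBB task instead of blocking its worker thread; the
    // task is resumed at the recorded suspend point once the engine calls back.
    inline void wait_for_engine(async_engine& e) {
        tbb::task::suspend([&](tbb::task::suspend_point sp) {
            e.start([sp] { tbb::task::resume(sp); });
        });
    }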
+ void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg); + void current_coroutine(coroutine_type& c); + void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine); + void destroy_coroutine(coroutine_type& c); + +class co_context { + enum co_state { + co_invalid, + co_suspended, + co_executing, + co_destroyed + }; + coroutine_type my_coroutine; + co_state my_state; + +public: + co_context(std::size_t stack_size, void* arg) + : my_state(stack_size ? co_suspended : co_executing) + { + if (stack_size) { + __TBB_ASSERT(arg != nullptr, nullptr); + create_coroutine(my_coroutine, stack_size, arg); + } else { + current_coroutine(my_coroutine); + } + } + + ~co_context() { + __TBB_ASSERT(1 << my_state & (1 << co_suspended | 1 << co_executing), nullptr); + if (my_state == co_suspended) { +#if __TBB_RESUMABLE_TASKS_USE_THREADS + my_state = co_executing; +#endif + destroy_coroutine(my_coroutine); + } + my_state = co_destroyed; + } + + void resume(co_context& target) { + // Do not create non-trivial objects on the stack of this function. They might never be destroyed. + __TBB_ASSERT(my_state == co_executing, nullptr); + __TBB_ASSERT(target.my_state == co_suspended, nullptr); + + my_state = co_suspended; + target.my_state = co_executing; + + // 'target' can reference an invalid object after swap_coroutine. Do not access it. + swap_coroutine(my_coroutine, target.my_coroutine); + + __TBB_ASSERT(my_state == co_executing, nullptr); + } +}; + +#if _WIN32 || _WIN64 +/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* arg) noexcept; +#else +/* [[noreturn]] */ void co_local_wait_for_all(unsigned hi, unsigned lo) noexcept; +#endif + +#if __TBB_RESUMABLE_TASKS_USE_THREADS +void handle_perror(int error_code, const char* what); + +inline void check(int error_code, const char* routine) { + if (error_code) { + handle_perror(error_code, routine); + } +} + +using thread_data_t = std::pair; + +#if _WIN32 || _WIN64 +inline unsigned WINAPI coroutine_thread_func(void* d) +#else +inline void* coroutine_thread_func(void* d) +#endif +{ + thread_data_t& data = *static_cast(d); + coroutine_type& c = data.first; + void* arg = data.second; + { + std::unique_lock lock(c.my_mutex); + __TBB_ASSERT(c.my_thread_data == nullptr, nullptr); + c.my_is_active = false; + + // We read the data notify the waiting thread + data.second = nullptr; + c.my_condvar.notify_one(); + + c.my_condvar.wait(lock, [&c] { return c.my_is_active == true; }); + } + __TBB_ASSERT(c.my_thread_data != nullptr, nullptr); + governor::set_thread_data(*c.my_thread_data); + +#if _WIN32 || _WIN64 + co_local_wait_for_all(arg); + + return 0; +#else + std::uintptr_t addr = std::uintptr_t(arg); + unsigned lo = unsigned(addr); + unsigned hi = unsigned(std::uint64_t(addr) >> 32); + __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr); + + co_local_wait_for_all(hi, lo); + + return nullptr; +#endif +}; + +inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) { + thread_data_t data{ c, arg }; + +#if _WIN32 || _WIN64 + c.my_thread = (HANDLE)_beginthreadex(nullptr, unsigned(stack_size), coroutine_thread_func, &data, STACK_SIZE_PARAM_IS_A_RESERVATION, nullptr); + if (!c.my_thread) { + handle_perror(0, "create_coroutine: _beginthreadex failed\n"); + } +#else + pthread_attr_t s; + check(pthread_attr_init(&s), "pthread_attr_init has failed"); + if (stack_size > 0) { + check(pthread_attr_setstacksize(&s, stack_size), "pthread_attr_setstack_size has failed"); + } + check(pthread_create(&c.my_thread, &s, 
coroutine_thread_func, &data), "pthread_create has failed"); + check(pthread_attr_destroy(&s), "pthread_attr_destroy has failed"); +#endif + + // Wait for the just created thread to read the data + std::unique_lock lock(c.my_mutex); + c.my_condvar.wait(lock, [&arg] { return arg == nullptr; }); +} + +inline void current_coroutine(coroutine_type& c) { +#if _WIN32 || _WIN64 + c.my_thread = GetCurrentThread(); +#else + c.my_thread = pthread_self(); +#endif +} + +inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) { + thread_data* td = governor::get_thread_data(); + __TBB_ASSERT(prev_coroutine.my_is_active == true, "The current thread should be active"); + + // Detach our state before notification other thread + // (because we might be notified just after other thread notification) + prev_coroutine.my_thread_data = nullptr; + prev_coroutine.my_is_active = false; + governor::clear_thread_data(); + + { + std::unique_lock lock(new_coroutine.my_mutex); + __TBB_ASSERT(new_coroutine.my_is_active == false, "The sleeping thread should not be active"); + __TBB_ASSERT(new_coroutine.my_thread_data == nullptr, "The sleeping thread should not be active"); + + new_coroutine.my_thread_data = td; + new_coroutine.my_is_active = true; + new_coroutine.my_condvar.notify_one(); + } + + std::unique_lock lock(prev_coroutine.my_mutex); + prev_coroutine.my_condvar.wait(lock, [&prev_coroutine] { return prev_coroutine.my_is_active == true; }); + __TBB_ASSERT(governor::get_thread_data() != nullptr, nullptr); + governor::set_thread_data(*prev_coroutine.my_thread_data); +} + +inline void destroy_coroutine(coroutine_type& c) { + { + std::unique_lock lock(c.my_mutex); + __TBB_ASSERT(c.my_thread_data == nullptr, "The sleeping thread should not be active"); + __TBB_ASSERT(c.my_is_active == false, "The sleeping thread should not be active"); + c.my_is_active = true; + c.my_condvar.notify_one(); + } +#if _WIN32 || _WIN64 + WaitForSingleObject(c.my_thread, INFINITE); + CloseHandle(c.my_thread); +#else + check(pthread_join(c.my_thread, nullptr), "pthread_join has failed"); +#endif +} +#elif _WIN32 || _WIN64 +inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) { + __TBB_ASSERT(arg, nullptr); + c = CreateFiber(stack_size, co_local_wait_for_all, arg); + __TBB_ASSERT(c, nullptr); +} + +inline void current_coroutine(coroutine_type& c) { + c = IsThreadAFiber() ? 
GetCurrentFiber() : + ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH); + __TBB_ASSERT(c, nullptr); +} + +inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) { + if (!IsThreadAFiber()) { + ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH); + } + __TBB_ASSERT(new_coroutine, nullptr); + prev_coroutine = GetCurrentFiber(); + __TBB_ASSERT(prev_coroutine, nullptr); + SwitchToFiber(new_coroutine); +} + +inline void destroy_coroutine(coroutine_type& c) { + __TBB_ASSERT(c, nullptr); + DeleteFiber(c); +} +#else // !(_WIN32 || _WIN64) + +inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) { + const std::size_t REG_PAGE_SIZE = governor::default_page_size(); + const std::size_t page_aligned_stack_size = (stack_size + (REG_PAGE_SIZE - 1)) & ~(REG_PAGE_SIZE - 1); + const std::size_t protected_stack_size = page_aligned_stack_size + 2 * REG_PAGE_SIZE; + + // Allocate the stack with protection property + std::uintptr_t stack_ptr = (std::uintptr_t)mmap(nullptr, protected_stack_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + __TBB_ASSERT((void*)stack_ptr != MAP_FAILED, nullptr); + + // Allow read write on our stack (guarded pages are still protected) + int err = mprotect((void*)(stack_ptr + REG_PAGE_SIZE), page_aligned_stack_size, PROT_READ | PROT_WRITE); + __TBB_ASSERT_EX(!err, nullptr); + + // Remember the stack state + c.my_stack = (void*)(stack_ptr + REG_PAGE_SIZE); + c.my_stack_size = page_aligned_stack_size; + + err = getcontext(&c.my_context); + __TBB_ASSERT_EX(!err, nullptr); + + c.my_context.uc_link = nullptr; + // cast to char* to disable FreeBSD clang-3.4.1 'incompatible type' error + c.my_context.uc_stack.ss_sp = (char*)c.my_stack; + c.my_context.uc_stack.ss_size = c.my_stack_size; + c.my_context.uc_stack.ss_flags = 0; + + typedef void(*coroutine_func_t)(); + + std::uintptr_t addr = std::uintptr_t(arg); + unsigned lo = unsigned(addr); + unsigned hi = unsigned(std::uint64_t(addr) >> 32); + __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr); + + makecontext(&c.my_context, (coroutine_func_t)co_local_wait_for_all, 2, hi, lo); +} + +inline void current_coroutine(coroutine_type& c) { + int err = getcontext(&c.my_context); + __TBB_ASSERT_EX(!err, nullptr); +} + +inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) { + int err = swapcontext(&prev_coroutine.my_context, &new_coroutine.my_context); + __TBB_ASSERT_EX(!err, nullptr); +} + +inline void destroy_coroutine(coroutine_type& c) { + const std::size_t REG_PAGE_SIZE = governor::default_page_size(); + // Free stack memory with guarded pages + munmap((void*)((std::uintptr_t)c.my_stack - REG_PAGE_SIZE), c.my_stack_size + 2 * REG_PAGE_SIZE); + // Clear the stack state afterwards + c.my_stack = nullptr; + c.my_stack_size = 0; +} + +#if __APPLE__ + #if __INTEL_COMPILER + #pragma warning(pop) // 1478 warning + #elif __clang__ + #pragma clang diagnostic pop // "-Wdeprecated-declarations" + #endif +#endif + +#endif // _WIN32 || _WIN64 + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_RESUMABLE_TASKS */ + +#endif /* _TBB_co_context_H */ diff --git a/third_party/tbb/collaborative_call_once.h b/third_party/tbb/collaborative_call_once.h new file mode 100644 index 000000000..b154b6f7f --- /dev/null +++ b/third_party/tbb/collaborative_call_once.h @@ -0,0 +1,236 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + 
you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_collaborative_call_once_H +#define __TBB_collaborative_call_once_H + +#include "third_party/tbb/task_arena.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning (push) + #pragma warning (disable: 4324) +#endif + +constexpr std::uintptr_t collaborative_once_max_references = max_nfs_size; +constexpr std::uintptr_t collaborative_once_references_mask = collaborative_once_max_references-1; + +class alignas(max_nfs_size) collaborative_once_runner : no_copy { + + struct storage_t { + task_arena m_arena{ task_arena::attach{} }; + wait_context m_wait_context{1}; + }; + + std::atomic m_ref_count{0}; + std::atomic m_is_ready{false}; + + // Storage with task_arena and wait_context must be initialized only by winner thread + union { + storage_t m_storage; + }; + + template + void isolated_execute(Fn f) { + auto func = [f] { + f(); + // delegate_base requires bool returning functor while isolate_within_arena ignores the result + return true; + }; + + delegated_function delegate(func); + + r1::isolate_within_arena(delegate, reinterpret_cast(this)); + } + +public: + class lifetime_guard : no_copy { + collaborative_once_runner& m_runner; + public: + lifetime_guard(collaborative_once_runner& r) : m_runner(r) { + m_runner.m_ref_count++; + } + ~lifetime_guard() { + m_runner.m_ref_count--; + } + }; + + collaborative_once_runner() {} + + ~collaborative_once_runner() { + spin_wait_until_eq(m_ref_count, 0, std::memory_order_acquire); + if (m_is_ready.load(std::memory_order_relaxed)) { + m_storage.~storage_t(); + } + } + + std::uintptr_t to_bits() { + return reinterpret_cast(this); + } + + static collaborative_once_runner* from_bits(std::uintptr_t bits) { + __TBB_ASSERT( (bits & collaborative_once_references_mask) == 0, "invalid pointer, last log2(max_nfs_size) bits must be zero" ); + return reinterpret_cast(bits); + } + + template + void run_once(F&& f) { + __TBB_ASSERT(!m_is_ready.load(std::memory_order_relaxed), "storage with task_arena and wait_context is already initialized"); + // Initialize internal state + new(&m_storage) storage_t(); + m_storage.m_arena.execute([&] { + isolated_execute([&] { + task_group_context context{ task_group_context::bound, + task_group_context::default_traits | task_group_context::concurrent_wait }; + + function_stack_task t{ std::forward(f), m_storage.m_wait_context }; + + // Set the ready flag after entering the execute body to prevent + // moonlighting threads from occupying all slots inside the arena. 
+ m_is_ready.store(true, std::memory_order_release); + execute_and_wait(t, context, m_storage.m_wait_context, context); + }); + }); + } + + void assist() noexcept { + // Do not join the arena until the winner thread takes the slot + spin_wait_while_eq(m_is_ready, false); + m_storage.m_arena.execute([&] { + isolated_execute([&] { + // We do not want to get an exception from user functor on moonlighting threads. + // The exception is handled with the winner thread + task_group_context stub_context; + wait(m_storage.m_wait_context, stub_context); + }); + }); + } + +}; + +class collaborative_once_flag : no_copy { + enum state : std::uintptr_t { + uninitialized, + done, +#if TBB_USE_ASSERT + dead +#endif + }; + std::atomic m_state{ state::uninitialized }; + + template + friend void collaborative_call_once(collaborative_once_flag& flag, Fn&& f, Args&&... args); + + void set_completion_state(std::uintptr_t runner_bits, std::uintptr_t desired) { + std::uintptr_t expected = runner_bits; + do { + expected = runner_bits; + // Possible inefficiency: when we start waiting, + // some moonlighting threads might continue coming that will prolong our waiting. + // Fortunately, there are limited number of threads on the system so wait time is limited. + spin_wait_until_eq(m_state, expected); + } while (!m_state.compare_exchange_strong(expected, desired)); + } + + template + void do_collaborative_call_once(Fn&& f) { + std::uintptr_t expected = m_state.load(std::memory_order_acquire); + collaborative_once_runner runner; + + do { + if (expected == state::uninitialized && m_state.compare_exchange_strong(expected, runner.to_bits())) { + // Winner thread + runner.run_once([&] { + try_call([&] { + std::forward(f)(); + }).on_exception([&] { + // Reset the state to uninitialized to allow other threads to try initialization again + set_completion_state(runner.to_bits(), state::uninitialized); + }); + // We successfully executed functor + set_completion_state(runner.to_bits(), state::done); + }); + break; + } else { + // Moonlighting thread: we need to add a reference to the state to prolong runner lifetime. + // However, the maximum number of references are limited with runner alignment. + // So, we use CAS loop and spin_wait to guarantee that references never exceed "max_value". + do { + auto max_value = expected | collaborative_once_references_mask; + expected = spin_wait_while_eq(m_state, max_value); + // "expected > state::done" prevents storing values, when state is uninitialized or done + } while (expected > state::done && !m_state.compare_exchange_strong(expected, expected + 1)); + + if (auto shared_runner = collaborative_once_runner::from_bits(expected & ~collaborative_once_references_mask)) { + collaborative_once_runner::lifetime_guard guard{*shared_runner}; + m_state.fetch_sub(1); + + // The moonlighting threads are not expected to handle exceptions from user functor. + // Therefore, no exception is expected from assist(). + shared_runner->assist(); + } + } + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) != state::dead, + "collaborative_once_flag has been prematurely destroyed"); + } while (expected != state::done); + } + +#if TBB_USE_ASSERT +public: + ~collaborative_once_flag() { + m_state.store(state::dead, std::memory_order_relaxed); + } +#endif +}; + + +template +void collaborative_call_once(collaborative_once_flag& flag, Fn&& fn, Args&&... 
args) { + __TBB_ASSERT(flag.m_state.load(std::memory_order_relaxed) != collaborative_once_flag::dead, + "collaborative_once_flag has been prematurely destroyed"); + if (flag.m_state.load(std::memory_order_acquire) != collaborative_once_flag::done) { + #if __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN + // Using stored_pack to suppress bug in GCC 4.8 + // with parameter pack expansion in lambda + auto stored_pack = save_pack(std::forward(args)...); + auto func = [&] { call(std::forward(fn), std::move(stored_pack)); }; + #else + auto func = [&] { fn(std::forward(args)...); }; + #endif + flag.do_collaborative_call_once(func); + } +} + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) // 4324 warning +#endif + +} // namespace d1 +} // namespace detail + +using detail::d1::collaborative_call_once; +using detail::d1::collaborative_once_flag; +} // namespace tbb + +#endif // __TBB_collaborative_call_once_H diff --git a/third_party/tbb/combinable.h b/third_party/tbb/combinable.h new file mode 100644 index 000000000..63eaf36e7 --- /dev/null +++ b/third_party/tbb/combinable.h @@ -0,0 +1,70 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_combinable_H +#define __TBB_combinable_H + +#include "third_party/tbb/detail/_namespace_injection.h" + +#include "third_party/tbb/enumerable_thread_specific.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +namespace tbb { +namespace detail { +namespace d1 { +/** \name combinable **/ +//@{ +//! Thread-local storage with optional reduction +/** @ingroup containers */ +template +class combinable { + using my_alloc = typename tbb::cache_aligned_allocator; + using my_ets_type = typename tbb::enumerable_thread_specific; + my_ets_type my_ets; + +public: + combinable() = default; + + template + explicit combinable(Finit _finit) : my_ets(_finit) { } + + void clear() { my_ets.clear(); } + + T& local() { return my_ets.local(); } + + T& local(bool& exists) { return my_ets.local(exists); } + + // combine_func_t has signature T(T,T) or T(const T&, const T&) + template + T combine(CombineFunc f_combine) { return my_ets.combine(f_combine); } + + // combine_func_t has signature void(T) or void(const T&) + template + void combine_each(CombineFunc f_combine) { my_ets.combine_each(f_combine); } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::combinable; +} // inline namespace v1 + +} // namespace tbb + +#endif /* __TBB_combinable_H */ + diff --git a/third_party/tbb/concurrent_bounded_queue.cpp b/third_party/tbb/concurrent_bounded_queue.cpp new file mode 100644 index 000000000..6608c59a8 --- /dev/null +++ b/third_party/tbb/concurrent_bounded_queue.cpp @@ -0,0 +1,85 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/concurrent_queue.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/concurrent_monitor.h" + +namespace tbb { +namespace detail { +namespace r1 { + +static constexpr std::size_t monitors_number = 2; + +std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size ) +{ + std::size_t monitors_mem_size = sizeof(concurrent_monitor) * monitors_number; + std::uint8_t* mem = static_cast(cache_aligned_allocate(queue_rep_size + monitors_mem_size)); + + concurrent_monitor* monitors = reinterpret_cast(mem + queue_rep_size); + for (std::size_t i = 0; i < monitors_number; ++i) { + new (monitors + i) concurrent_monitor(); + } + + return mem; +} + +void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size ) +{ + concurrent_monitor* monitors = reinterpret_cast(mem + queue_rep_size); + for (std::size_t i = 0; i < monitors_number; ++i) { + monitors[i].~concurrent_monitor(); + } + + cache_aligned_deallocate(mem); +} + +void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag, + std::ptrdiff_t target, d1::delegate_base& predicate ) +{ + __TBB_ASSERT(monitor_tag < monitors_number, nullptr); + concurrent_monitor& monitor = monitors[monitor_tag]; + + monitor.wait([&] { return !predicate(); }, std::uintptr_t(target)); +} + +void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ) { + concurrent_monitor& items_avail = monitors[d2::cbq_items_avail_tag]; + concurrent_monitor& slots_avail = monitors[d2::cbq_slots_avail_tag]; + + items_avail.abort_all(); + slots_avail.abort_all(); +} + +struct predicate_leq { + std::size_t my_ticket; + predicate_leq( std::size_t ticket ) : my_ticket(ticket) {} + bool operator() ( std::uintptr_t ticket ) const { return static_cast(ticket) <= my_ticket; } +}; + +void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors, + std::size_t monitor_tag, std::size_t ticket) +{ + __TBB_ASSERT(monitor_tag < monitors_number, nullptr); + concurrent_monitor& monitor = monitors[monitor_tag]; + monitor.notify(predicate_leq(ticket)); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/concurrent_hash_map.h b/third_party/tbb/concurrent_hash_map.h new file mode 100644 index 000000000..ae1a0e0a2 --- /dev/null +++ b/third_party/tbb/concurrent_hash_map.h @@ -0,0 +1,1665 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_hash_map_H +#define __TBB_concurrent_hash_map_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_hash_compare.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/spin_rw_mutex.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/utility" // Need std::pair +#include "third_party/libcxx/cstring" // Need std::memset + +namespace tbb { +namespace detail { +namespace d2 { + +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS && __TBB_CPP20_CONCEPTS_PRESENT +template +concept ch_map_rw_scoped_lockable = rw_scoped_lockable && + requires(const typename Mutex::scoped_lock& sl) { + { sl.is_writer() } -> std::convertible_to; +}; +#endif + +template +struct hash_map_node_base : no_copy { + using mutex_type = MutexType; + // Scoped lock type for mutex + using scoped_type = typename MutexType::scoped_lock; + // Next node in chain + hash_map_node_base* next; + mutex_type mutex; +}; + +// Incompleteness flag value +static void* const rehash_req_flag = reinterpret_cast(std::size_t(3)); +// Rehashed empty bucket flag +static void* const empty_rehashed_flag = reinterpret_cast(std::size_t(0)); + +template +bool rehash_required( hash_map_node_base* node_ptr ) { + return reinterpret_cast(node_ptr) == rehash_req_flag; +} + +#if TBB_USE_ASSERT +template +bool empty_rehashed( hash_map_node_base* node_ptr ) { + return reinterpret_cast(node_ptr) == empty_rehashed_flag; +} +#endif + +// base class of concurrent_hash_map + +template +class hash_map_base { +public: + using size_type = std::size_t; + using hashcode_type = std::size_t; + using segment_index_type = std::size_t; + using node_base = hash_map_node_base; + + struct bucket : no_copy { + using mutex_type = MutexType; + using scoped_type = typename mutex_type::scoped_lock; + + bucket() : node_list(nullptr) {} + bucket( node_base* ptr ) : node_list(ptr) {} + + mutex_type mutex; + std::atomic node_list; + }; + + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + using bucket_allocator_type = typename allocator_traits_type::template rebind_alloc; + using bucket_allocator_traits = tbb::detail::allocator_traits; + + // Count of segments in the first block + static constexpr size_type embedded_block = 1; + // Count of segments in the first block + static constexpr size_type embedded_buckets = 1 << embedded_block; + // Count of segments in the first block + static constexpr size_type first_block = 8; //including embedded_block. 
perfect with bucket size 16, so the allocations are power of 4096 + // Size of a pointer / table size + static constexpr size_type pointers_per_table = sizeof(segment_index_type) * 8; // one segment per bit + + using segment_ptr_type = bucket*; + using atomic_segment_type = std::atomic; + using segments_table_type = atomic_segment_type[pointers_per_table]; + + hash_map_base( const allocator_type& alloc ) : my_allocator(alloc), my_mask(embedded_buckets - 1), my_size(0) { + for (size_type i = 0; i != embedded_buckets; ++i) { + my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed); + } + + for (size_type segment_index = 0; segment_index < pointers_per_table; ++segment_index) { + auto argument = segment_index < embedded_block ? my_embedded_segment + segment_base(segment_index) : nullptr; + my_table[segment_index].store(argument, std::memory_order_relaxed); + } + + __TBB_ASSERT( embedded_block <= first_block, "The first block number must include embedded blocks"); + } + + // segment index of given index in the array + static segment_index_type segment_index_of( size_type index ) { + return segment_index_type(tbb::detail::log2( index|1 )); + } + + // the first array index of given segment + static segment_index_type segment_base( segment_index_type k ) { + return (segment_index_type(1) << k & ~segment_index_type(1)); + } + + // segment size except for k == 0 + static size_type segment_size( segment_index_type k ) { + return size_type(1) << k; // fake value for k==0 + } + + // true if ptr is valid pointer + static bool is_valid( void* ptr ) { + return reinterpret_cast(ptr) > uintptr_t(63); + } + + template + void init_buckets_impl( segment_ptr_type ptr, size_type sz, const Args&... args ) { + for (size_type i = 0; i < sz; ++i) { + bucket_allocator_traits::construct(my_allocator, ptr + i, args...); + } + } + + // Initialize buckets + void init_buckets( segment_ptr_type ptr, size_type sz, bool is_initial ) { + if (is_initial) { + init_buckets_impl(ptr, sz); + } else { + init_buckets_impl(ptr, sz, reinterpret_cast(rehash_req_flag)); + } + } + + // Add node n to bucket b + static void add_to_bucket( bucket* b, node_base* n ) { + __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), nullptr); + n->next = b->node_list.load(std::memory_order_relaxed); + b->node_list.store(n, std::memory_order_relaxed); // its under lock and flag is set + } + + const bucket_allocator_type& get_allocator() const { + return my_allocator; + } + + bucket_allocator_type& get_allocator() { + return my_allocator; + } + + // Enable segment + void enable_segment( segment_index_type k, bool is_initial = false ) { + __TBB_ASSERT( k, "Zero segment must be embedded" ); + size_type sz; + __TBB_ASSERT( !is_valid(my_table[k].load(std::memory_order_relaxed)), "Wrong concurrent assignment"); + if (k >= first_block) { + sz = segment_size(k); + segment_ptr_type ptr = nullptr; + try_call( [&] { + ptr = bucket_allocator_traits::allocate(my_allocator, sz); + } ).on_exception( [&] { + my_table[k].store(nullptr, std::memory_order_relaxed); + }); + + __TBB_ASSERT(ptr, nullptr); + init_buckets(ptr, sz, is_initial); + my_table[k].store(ptr, std::memory_order_release); + sz <<= 1;// double it to get entire capacity of the container + } else { // the first block + __TBB_ASSERT( k == embedded_block, "Wrong segment index" ); + sz = segment_size(first_block); + segment_ptr_type ptr = nullptr; + try_call( [&] { + ptr = bucket_allocator_traits::allocate(my_allocator, sz - embedded_buckets); + } ).on_exception( 
[&] { + my_table[k].store(nullptr, std::memory_order_relaxed); + }); + + __TBB_ASSERT(ptr, nullptr); + init_buckets(ptr, sz - embedded_buckets, is_initial); + ptr -= segment_base(embedded_block); + for(segment_index_type i = embedded_block; i < first_block; i++) // calc the offsets + my_table[i].store(ptr + segment_base(i), std::memory_order_release); + } + my_mask.store(sz-1, std::memory_order_release); + } + + void delete_segment( segment_index_type s ) { + segment_ptr_type buckets_ptr = my_table[s].load(std::memory_order_relaxed); + size_type sz = segment_size( s ? s : 1 ); + + size_type deallocate_size = 0; + + if (s >= first_block) { // the first segment or the next + deallocate_size = sz; + } else if (s == embedded_block && embedded_block != first_block) { + deallocate_size = segment_size(first_block) - embedded_buckets; + } + + for (size_type i = 0; i < deallocate_size; ++i) { + bucket_allocator_traits::destroy(my_allocator, buckets_ptr + i); + } + if (deallocate_size != 0) { + bucket_allocator_traits::deallocate(my_allocator, buckets_ptr, deallocate_size); + } + + if (s >= embedded_block) my_table[s].store(nullptr, std::memory_order_relaxed); + } + + // Get bucket by (masked) hashcode + bucket *get_bucket( hashcode_type h ) const noexcept { + segment_index_type s = segment_index_of( h ); + h -= segment_base(s); + segment_ptr_type seg = my_table[s].load(std::memory_order_acquire); + __TBB_ASSERT( is_valid(seg), "hashcode must be cut by valid mask for allocated segments" ); + return &seg[h]; + } + + // detail serial rehashing helper + void mark_rehashed_levels( hashcode_type h ) noexcept { + segment_index_type s = segment_index_of( h ); + while (segment_ptr_type seg = my_table[++s].load(std::memory_order_relaxed)) + if (rehash_required(seg[h].node_list.load(std::memory_order_relaxed))) { + seg[h].node_list.store(reinterpret_cast(empty_rehashed_flag), std::memory_order_relaxed); + mark_rehashed_levels( h + ((hashcode_type)1<node_list.load(std::memory_order_acquire))) { + return true; + } + } + return false; + } + + // Insert a node and check for load factor. @return segment index to enable. 
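+    // A worked example of the check below (illustrative numbers only): with mask == 255
+    // (256 buckets allocated), the insertion that raises my_size to 255 makes sz >= mask,
+    // so new_seg = log2(256) = 8. The thread that wins the compare_exchange on my_table[8]
+    // is the one that later calls enable_segment(8), which adds 256 buckets and publishes
+    // mask = 511; in other words, capacity doubles once the item count catches up with the mask.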
+ segment_index_type insert_new_node( bucket *b, node_base *n, hashcode_type mask ) { + size_type sz = ++my_size; // prefix form is to enforce allocation after the first item inserted + add_to_bucket( b, n ); + // check load factor + if( sz >= mask ) { // TODO: add custom load_factor + segment_index_type new_seg = tbb::detail::log2( mask+1 ); //optimized segment_index_of + __TBB_ASSERT( is_valid(my_table[new_seg-1].load(std::memory_order_relaxed)), "new allocations must not publish new mask until segment has allocated"); + static const segment_ptr_type is_allocating = segment_ptr_type(2); + segment_ptr_type disabled = nullptr; + if (!(my_table[new_seg].load(std::memory_order_acquire)) + && my_table[new_seg].compare_exchange_strong(disabled, is_allocating)) + return new_seg; // The value must be processed + } + return 0; + } + + // Prepare enough segments for number of buckets + void reserve(size_type buckets) { + if( !buckets-- ) return; + bool is_initial = !my_size.load(std::memory_order_relaxed); + for (size_type m = my_mask.load(std::memory_order_relaxed); buckets > m; + m = my_mask.load(std::memory_order_relaxed)) + { + enable_segment( segment_index_of( m+1 ), is_initial ); + } + } + + // Swap hash_map_bases + void internal_swap_content(hash_map_base &table) { + using std::swap; + swap_atomics_relaxed(my_mask, table.my_mask); + swap_atomics_relaxed(my_size, table.my_size); + + for(size_type i = 0; i < embedded_buckets; i++) { + auto temp = my_embedded_segment[i].node_list.load(std::memory_order_relaxed); + my_embedded_segment[i].node_list.store(table.my_embedded_segment[i].node_list.load(std::memory_order_relaxed), + std::memory_order_relaxed); + table.my_embedded_segment[i].node_list.store(temp, std::memory_order_relaxed); + } + for(size_type i = embedded_block; i < pointers_per_table; i++) { + auto temp = my_table[i].load(std::memory_order_relaxed); + my_table[i].store(table.my_table[i].load(std::memory_order_relaxed), + std::memory_order_relaxed); + table.my_table[i].store(temp, std::memory_order_relaxed); + } + } + + void internal_move(hash_map_base&& other) { + my_mask.store(other.my_mask.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_mask.store(embedded_buckets - 1, std::memory_order_relaxed); + + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(0, std::memory_order_relaxed); + + for (size_type i = 0; i < embedded_buckets; ++i) { + my_embedded_segment[i].node_list.store(other.my_embedded_segment[i].node_list, std::memory_order_relaxed); + other.my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed); + } + + for (size_type i = embedded_block; i < pointers_per_table; ++i) { + my_table[i].store(other.my_table[i].load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.my_table[i].store(nullptr, std::memory_order_relaxed); + } + } + +protected: + bucket_allocator_type my_allocator; + // Hash mask = sum of allocated segment sizes - 1 + std::atomic my_mask; + // Size of container in stored items + std::atomic my_size; // It must be in separate cache line from my_mask due to performance effects + // Zero segment + bucket my_embedded_segment[embedded_buckets]; + // Segment pointers table. Also prevents false sharing between my_mask and my_size + segments_table_type my_table; +}; + +template +class hash_map_range; + +// Meets requirements of a forward iterator for STL +// Value is either the T or const T type of the container. 
+template +class hash_map_iterator { + using map_type = Container; + using node = typename Container::node; + using map_base = typename Container::base_type; + using node_base = typename map_base::node_base; + using bucket = typename map_base::bucket; +public: + using value_type = Value; + using size_type = typename Container::size_type; + using difference_type = typename Container::difference_type; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::forward_iterator_tag; + + // Construct undefined iterator + hash_map_iterator(): my_map(), my_index(), my_bucket(), my_node() {} + hash_map_iterator( const hash_map_iterator& other ) : + my_map(other.my_map), + my_index(other.my_index), + my_bucket(other.my_bucket), + my_node(other.my_node) + {} + + hash_map_iterator& operator=( const hash_map_iterator& other ) { + my_map = other.my_map; + my_index = other.my_index; + my_bucket = other.my_bucket; + my_node = other.my_node; + return *this; + } + + Value& operator*() const { + __TBB_ASSERT( map_base::is_valid(my_node), "iterator uninitialized or at end of container?" ); + return my_node->value(); + } + + Value* operator->() const {return &operator*();} + + hash_map_iterator& operator++() { + my_node = static_cast( my_node->next ); + if( !my_node ) advance_to_next_bucket(); + return *this; + } + + // Post increment + hash_map_iterator operator++(int) { + hash_map_iterator old(*this); + operator++(); + return old; + } +private: + template + friend bool operator==( const hash_map_iterator& i, const hash_map_iterator& j ); + + template + friend bool operator!=( const hash_map_iterator& i, const hash_map_iterator& j ); + + template + friend ptrdiff_t operator-( const hash_map_iterator& i, const hash_map_iterator& j ); + + template + friend class hash_map_iterator; + + template + friend class hash_map_range; + + void advance_to_next_bucket() { // TODO?: refactor to iterator_base class + size_t k = my_index+1; + __TBB_ASSERT( my_bucket, "advancing an invalid iterator?"); + while (k <= my_map->my_mask.load(std::memory_order_relaxed)) { + // Following test uses 2's-complement wizardry + if( k&(k-2) ) // not the beginning of a segment + ++my_bucket; + else my_bucket = my_map->get_bucket( k ); + node_base *n = my_bucket->node_list.load(std::memory_order_relaxed); + if( map_base::is_valid(n) ) { + my_node = static_cast(n); + my_index = k; + return; + } + ++k; + } + my_bucket = nullptr; my_node = nullptr; my_index = k; // the end + } + + template + __TBB_requires(tbb::detail::hash_compare && + ch_map_rw_scoped_lockable) +#else + > + __TBB_requires(tbb::detail::hash_compare) +#endif + friend class concurrent_hash_map; + + hash_map_iterator( const Container &map, std::size_t index, const bucket *b, node_base *n ) : + my_map(&map), my_index(index), my_bucket(b), my_node(static_cast(n)) + { + if( b && !map_base::is_valid(n) ) + advance_to_next_bucket(); + } + + // concurrent_hash_map over which we are iterating. 
+ const Container *my_map; + // Index in hash table for current item + size_t my_index; + // Pointer to bucket + const bucket* my_bucket; + // Pointer to node that has current item + node* my_node; +}; + +template +bool operator==( const hash_map_iterator& i, const hash_map_iterator& j ) { + return i.my_node == j.my_node && i.my_map == j.my_map; +} + +template +bool operator!=( const hash_map_iterator& i, const hash_map_iterator& j ) { + return i.my_node != j.my_node || i.my_map != j.my_map; +} + +// Range class used with concurrent_hash_map +template +class hash_map_range { + using map_type = typename Iterator::map_type; +public: + // Type for size of a range + using size_type = std::size_t; + using value_type = typename Iterator::value_type; + using reference = typename Iterator::reference; + using difference_type = typename Iterator::difference_type; + using iterator = Iterator; + + // True if range is empty. + bool empty() const { return my_begin == my_end; } + + // True if range can be partitioned into two subranges. + bool is_divisible() const { + return my_midpoint != my_end; + } + + // Split range. + hash_map_range( hash_map_range& r, split ) : + my_end(r.my_end), + my_grainsize(r.my_grainsize) + { + r.my_end = my_begin = r.my_midpoint; + __TBB_ASSERT( !empty(), "Splitting despite the range is not divisible" ); + __TBB_ASSERT( !r.empty(), "Splitting despite the range is not divisible" ); + set_midpoint(); + r.set_midpoint(); + } + + // Init range with container and grainsize specified + hash_map_range( const map_type &map, size_type grainsize_ = 1 ) : + my_begin( Iterator( map, 0, map.my_embedded_segment, map.my_embedded_segment->node_list.load(std::memory_order_relaxed) ) ), + my_end( Iterator( map, map.my_mask.load(std::memory_order_relaxed) + 1, nullptr, nullptr ) ), + my_grainsize( grainsize_ ) + { + __TBB_ASSERT( grainsize_>0, "grainsize must be positive" ); + set_midpoint(); + } + + Iterator begin() const { return my_begin; } + Iterator end() const { return my_end; } + // The grain size for this range. + size_type grainsize() const { return my_grainsize; } + +private: + Iterator my_begin; + Iterator my_end; + mutable Iterator my_midpoint; + size_t my_grainsize; + // Set my_midpoint to point approximately half way between my_begin and my_end. 
+ void set_midpoint() const; + template friend class hash_map_range; +}; + +template +void hash_map_range::set_midpoint() const { + // Split by groups of nodes + size_t m = my_end.my_index-my_begin.my_index; + if( m > my_grainsize ) { + m = my_begin.my_index + m/2u; + auto b = my_begin.my_map->get_bucket(m); + my_midpoint = Iterator(*my_begin.my_map,m,b,b->node_list.load(std::memory_order_relaxed)); + } else { + my_midpoint = my_end; + } + __TBB_ASSERT( my_begin.my_index <= my_midpoint.my_index, + "my_begin is after my_midpoint" ); + __TBB_ASSERT( my_midpoint.my_index <= my_end.my_index, + "my_midpoint is after my_end" ); + __TBB_ASSERT( my_begin != my_midpoint || my_begin == my_end, + "[my_begin, my_midpoint) range should not be empty" ); +} + +template , + typename Allocator = tbb_allocator> +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + , typename MutexType = spin_rw_mutex + > + __TBB_requires(tbb::detail::hash_compare && + ch_map_rw_scoped_lockable) +#else + > + __TBB_requires(tbb::detail::hash_compare) +#endif +class concurrent_hash_map +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + : protected hash_map_base +#else + : protected hash_map_base +#endif +{ + template + friend class hash_map_iterator; + + template + friend class hash_map_range; + using allocator_traits_type = tbb::detail::allocator_traits; + +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + using base_type = hash_map_base; +#else + using base_type = hash_map_base; +#endif +public: + using key_type = Key; + using mapped_type = T; + // type_identity is needed to disable implicit deduction guides for std::initializer_list constructors + // and copy/move constructor with explicit allocator argument + using allocator_type = tbb::detail::type_identity_t; + using hash_compare_type = tbb::detail::type_identity_t; + using value_type = std::pair; + using size_type = typename base_type::size_type; + using difference_type = std::ptrdiff_t; +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + using mutex_type = MutexType; +#endif + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using reference = value_type&; + using const_reference = const value_type&; + using iterator = hash_map_iterator; + using const_iterator = hash_map_iterator; + using range_type = hash_map_range; + using const_range_type = hash_map_range; + +protected: + static_assert(std::is_same::value, + "value_type of the container must be the same as its allocator's"); + + friend class const_accessor; + class node; + using segment_index_type = typename base_type::segment_index_type; + using segment_ptr_type = typename base_type::segment_ptr_type; + using node_base = typename base_type::node_base; + using bucket = typename base_type::bucket; + using hashcode_type = typename base_type::hashcode_type; + using bucket_allocator_type = typename base_type::bucket_allocator_type; + using node_allocator_type = typename base_type::allocator_traits_type::template rebind_alloc; + using node_allocator_traits = tbb::detail::allocator_traits; + hash_compare_type my_hash_compare; + + class node : public node_base { + public: + node() {} + ~node() {} + pointer storage() { return &my_value; } + value_type& value() { return *storage(); } + private: + union { + value_type my_value; + }; + }; + + void delete_node( node_base *n ) { + node_allocator_type node_allocator(this->get_allocator()); + node_allocator_traits::destroy(node_allocator, static_cast(n)->storage()); + 
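+        // the stored pair was destroyed just above; now destroy the node object itself
+        // and hand its memory back to the node allocator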
node_allocator_traits::destroy(node_allocator, static_cast(n)); + node_allocator_traits::deallocate(node_allocator, static_cast(n), 1); + } + + template + static node* create_node(bucket_allocator_type& allocator, Args&&... args) { + node_allocator_type node_allocator(allocator); + node* node_ptr = node_allocator_traits::allocate(node_allocator, 1); + auto guard = make_raii_guard([&] { + node_allocator_traits::destroy(node_allocator, node_ptr); + node_allocator_traits::deallocate(node_allocator, node_ptr, 1); + }); + + node_allocator_traits::construct(node_allocator, node_ptr); + node_allocator_traits::construct(node_allocator, node_ptr->storage(), std::forward(args)...); + guard.dismiss(); + return node_ptr; + } + + static node* allocate_node_copy_construct(bucket_allocator_type& allocator, const Key &key, const T * t){ + return create_node(allocator, key, *t); + } + + static node* allocate_node_move_construct(bucket_allocator_type& allocator, const Key &key, const T * t){ + return create_node(allocator, key, std::move(*const_cast(t))); + } + + template + static node* allocate_node_default_construct(bucket_allocator_type& allocator, const K &key, const T * ){ + // Emplace construct an empty T object inside the pair + return create_node(allocator, std::piecewise_construct, + std::forward_as_tuple(key), std::forward_as_tuple()); + } + + static node* do_not_allocate_node(bucket_allocator_type& , const Key &, const T * ){ + __TBB_ASSERT(false,"this dummy function should not be called"); + return nullptr; + } + + template + node *search_bucket( const K &key, bucket *b ) const { + node *n = static_cast( b->node_list.load(std::memory_order_relaxed) ); + while (this->is_valid(n) && !my_hash_compare.equal(key, n->value().first)) + n = static_cast( n->next ); + __TBB_ASSERT(!rehash_required(n), "Search can be executed only for rehashed bucket"); + return n; + } + + // bucket accessor is to find, rehash, acquire a lock, and access a bucket + class bucket_accessor : public bucket::scoped_type { + bucket *my_b; + public: + bucket_accessor( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) { acquire( base, h, writer ); } + // find a bucket by masked hashcode, optionally rehash, and acquire the lock + inline void acquire( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) { + my_b = base->get_bucket( h ); + // TODO: actually, notification is unnecessary here, just hiding double-check + if (rehash_required(my_b->node_list.load(std::memory_order_acquire)) + && bucket::scoped_type::try_acquire( my_b->mutex, /*write=*/true ) ) + { + if (rehash_required(my_b->node_list.load(std::memory_order_relaxed))) base->rehash_bucket(my_b, h); // recursive rehashing + } + else bucket::scoped_type::acquire( my_b->mutex, writer ); + __TBB_ASSERT(!rehash_required(my_b->node_list.load(std::memory_order_relaxed)), nullptr); + } + + // get bucket pointer + bucket *operator() () { return my_b; } + }; + + // TODO refactor to hash_base + void rehash_bucket( bucket *b_new, const hashcode_type hash ) { + __TBB_ASSERT( hash > 1, "The lowermost buckets can't be rehashed" ); + b_new->node_list.store(reinterpret_cast(empty_rehashed_flag), std::memory_order_release); // mark rehashed + hashcode_type mask = (hashcode_type(1) << tbb::detail::log2(hash)) - 1; // get parent mask from the topmost bit + bucket_accessor b_old( this, hash & mask ); + + mask = (mask<<1) | 1; // get full mask for new bucket + __TBB_ASSERT( (mask&(mask+1))==0 && (hash & mask) == hash, nullptr ); + restart: + node_base* 
prev = nullptr; + node_base* curr = b_old()->node_list.load(std::memory_order_acquire); + while (this->is_valid(curr)) { + hashcode_type curr_node_hash = my_hash_compare.hash(static_cast(curr)->value().first); + + if ((curr_node_hash & mask) == hash) { + if (!b_old.is_writer()) { + if (!b_old.upgrade_to_writer()) { + goto restart; // node ptr can be invalid due to concurrent erase + } + } + node_base* next = curr->next; + // exclude from b_old + if (prev == nullptr) { + b_old()->node_list.store(curr->next, std::memory_order_relaxed); + } else { + prev->next = curr->next; + } + this->add_to_bucket(b_new, curr); + curr = next; + } else { + prev = curr; + curr = curr->next; + } + } + } + + template + using hash_compare_is_transparent = dependent_bool, U>; + +public: + + class accessor; + // Combines data access, locking, and garbage collection. + class const_accessor : private node::scoped_type /*which derived from no_copy*/ { +#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS + friend class concurrent_hash_map; +#else + friend class concurrent_hash_map; +#endif + friend class accessor; + public: + // Type of value + using value_type = const typename concurrent_hash_map::value_type; + + // True if result is empty. + bool empty() const { return !my_node; } + + // Set to null + void release() { + if( my_node ) { + node::scoped_type::release(); + my_node = nullptr; + } + } + + // Return reference to associated value in hash table. + const_reference operator*() const { + __TBB_ASSERT( my_node, "attempt to dereference empty accessor" ); + return my_node->value(); + } + + // Return pointer to associated value in hash table. + const_pointer operator->() const { + return &operator*(); + } + + // Create empty result + const_accessor() : my_node(nullptr), my_hash() {} + + // Destroy result after releasing the underlying reference. + ~const_accessor() { + my_node = nullptr; // scoped lock's release() is called in its destructor + } + protected: + bool is_writer() { return node::scoped_type::is_writer(); } + node *my_node; + hashcode_type my_hash; + }; + + // Allows write access to elements and combines data access, locking, and garbage collection. + class accessor: public const_accessor { + public: + // Type of value + using value_type = typename concurrent_hash_map::value_type; + + // Return reference to associated value in hash table. + reference operator*() const { + __TBB_ASSERT( this->my_node, "attempt to dereference empty accessor" ); + return this->my_node->value(); + } + + // Return pointer to associated value in hash table. + pointer operator->() const { + return &operator*(); + } + }; + + explicit concurrent_hash_map( const hash_compare_type& compare, const allocator_type& a = allocator_type() ) + : base_type(a) + , my_hash_compare(compare) + {} + + concurrent_hash_map() : concurrent_hash_map(hash_compare_type()) {} + + explicit concurrent_hash_map( const allocator_type& a ) + : concurrent_hash_map(hash_compare_type(), a) + {} + + // Construct empty table with n preallocated buckets. This number serves also as initial concurrency level. 
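+    // Usage sketch (the element types and bucket count here are only an example):
+    //   concurrent_hash_map<std::string, int> table(1024); // pre-reserve ~1024 buckets
+    //   table.insert({"answer", 42});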
+ concurrent_hash_map( size_type n, const allocator_type &a = allocator_type() ) + : concurrent_hash_map(a) + { + this->reserve(n); + } + + concurrent_hash_map( size_type n, const hash_compare_type& compare, const allocator_type& a = allocator_type() ) + : concurrent_hash_map(compare, a) + { + this->reserve(n); + } + + // Copy constructor + concurrent_hash_map( const concurrent_hash_map &table ) + : concurrent_hash_map(node_allocator_traits::select_on_container_copy_construction(table.get_allocator())) + { + try_call( [&] { + internal_copy(table); + }).on_exception( [&] { + this->clear(); + }); + } + + concurrent_hash_map( const concurrent_hash_map &table, const allocator_type &a) + : concurrent_hash_map(a) + { + try_call( [&] { + internal_copy(table); + }).on_exception( [&] { + this->clear(); + }); + } + + // Move constructor + concurrent_hash_map( concurrent_hash_map &&table ) + : concurrent_hash_map(std::move(table.get_allocator())) + { + this->internal_move(std::move(table)); + } + + // Move constructor + concurrent_hash_map( concurrent_hash_map &&table, const allocator_type &a ) + : concurrent_hash_map(a) + { + using is_equal_type = typename node_allocator_traits::is_always_equal; + internal_move_construct_with_allocator(std::move(table), a, is_equal_type()); + } + + // Construction with copying iteration range and given allocator instance + template + concurrent_hash_map( I first, I last, const allocator_type &a = allocator_type() ) + : concurrent_hash_map(a) + { + try_call( [&] { + internal_copy(first, last, std::distance(first, last)); + }).on_exception( [&] { + this->clear(); + }); + } + + template + concurrent_hash_map( I first, I last, const hash_compare_type& compare, const allocator_type& a = allocator_type() ) + : concurrent_hash_map(compare, a) + { + try_call( [&] { + internal_copy(first, last, std::distance(first, last)); + }).on_exception( [&] { + this->clear(); + }); + } + + concurrent_hash_map( std::initializer_list il, const hash_compare_type& compare = hash_compare_type(), const allocator_type& a = allocator_type() ) + : concurrent_hash_map(compare, a) + { + try_call( [&] { + internal_copy(il.begin(), il.end(), il.size()); + }).on_exception( [&] { + this->clear(); + }); + } + + concurrent_hash_map( std::initializer_list il, const allocator_type& a ) + : concurrent_hash_map(il, hash_compare_type(), a) {} + + // Assignment + concurrent_hash_map& operator=( const concurrent_hash_map &table ) { + if( this != &table ) { + clear(); + copy_assign_allocators(this->my_allocator, table.my_allocator); + internal_copy(table); + } + return *this; + } + + // Move Assignment + concurrent_hash_map& operator=( concurrent_hash_map &&table ) { + if( this != &table ) { + using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment; + using is_equal_type = typename node_allocator_traits::is_always_equal; + move_assign_allocators(this->my_allocator, table.my_allocator); + internal_move_assign(std::move(table), tbb::detail::disjunction()); + } + return *this; + } + + // Assignment + concurrent_hash_map& operator=( std::initializer_list il ) { + clear(); + internal_copy(il.begin(), il.end(), il.size()); + return *this; + } + + // Rehashes and optionally resizes the whole table. + /** Useful to optimize performance before or after concurrent operations. + Also enables using of find() and count() concurrent methods in serial context. 
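+        A usage sketch (illustrative only):
+            tbb::concurrent_hash_map<int, int> m;
+            // ... a parallel insert/find phase completes ...
+            m.rehash();           // serial step: finish buckets still marked for rehashing
+            auto c = m.count(1);  // count()/find() may now also be used from serial code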
*/ + void rehash(size_type sz = 0) { + this->reserve(sz); // TODO: add reduction of number of buckets as well + hashcode_type mask = this->my_mask.load(std::memory_order_relaxed); + hashcode_type b = (mask+1)>>1; // size or first index of the last segment + __TBB_ASSERT((b&(b-1))==0, nullptr); // zero or power of 2 + bucket *bp = this->get_bucket( b ); // only the last segment should be scanned for rehashing + for(; b <= mask; b++, bp++ ) { + node_base *n = bp->node_list.load(std::memory_order_relaxed); + __TBB_ASSERT( this->is_valid(n) || empty_rehashed(n) || rehash_required(n), "Broken internal structure" ); + __TBB_ASSERT( *reinterpret_cast(&bp->mutex) == 0, "concurrent or unexpectedly terminated operation during rehash() execution" ); + if (rehash_required(n)) { // rehash bucket, conditional because rehashing of a previous bucket may affect this one + hashcode_type h = b; bucket *b_old = bp; + do { + __TBB_ASSERT( h > 1, "The lowermost buckets can't be rehashed" ); + hashcode_type m = ( hashcode_type(1) << tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit + b_old = this->get_bucket( h &= m ); + } while( rehash_required(b_old->node_list.load(std::memory_order_relaxed)) ); + // now h - is index of the root rehashed bucket b_old + this->mark_rehashed_levels( h ); // mark all non-rehashed children recursively across all segments + node_base* prev = nullptr; + node_base* curr = b_old->node_list.load(std::memory_order_relaxed); + while (this->is_valid(curr)) { + hashcode_type curr_node_hash = my_hash_compare.hash(static_cast(curr)->value().first); + + if ((curr_node_hash & mask) != h) { // should be rehashed + node_base* next = curr->next; + // exclude from b_old + if (prev == nullptr) { + b_old->node_list.store(curr->next, std::memory_order_relaxed); + } else { + prev->next = curr->next; + } + bucket *b_new = this->get_bucket(curr_node_hash & mask); + __TBB_ASSERT(!rehash_required(b_new->node_list.load(std::memory_order_relaxed)), "hash() function changed for key in table or internal error"); + this->add_to_bucket(b_new, curr); + curr = next; + } else { + prev = curr; + curr = curr->next; + } + } + } + } + } + + // Clear table + void clear() { + hashcode_type m = this->my_mask.load(std::memory_order_relaxed); + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + this->my_size.store(0, std::memory_order_relaxed); + segment_index_type s = this->segment_index_of( m ); + __TBB_ASSERT( s+1 == this->pointers_per_table || !this->my_table[s+1].load(std::memory_order_relaxed), "wrong mask or concurrent grow" ); + do { + __TBB_ASSERT(this->is_valid(this->my_table[s].load(std::memory_order_relaxed)), "wrong mask or concurrent grow" ); + segment_ptr_type buckets_ptr = this->my_table[s].load(std::memory_order_relaxed); + size_type sz = this->segment_size( s ? s : 1 ); + for( segment_index_type i = 0; i < sz; i++ ) + for( node_base *n = buckets_ptr[i].node_list.load(std::memory_order_relaxed); + this->is_valid(n); n = buckets_ptr[i].node_list.load(std::memory_order_relaxed) ) + { + buckets_ptr[i].node_list.store(n->next, std::memory_order_relaxed); + delete_node( n ); + } + this->delete_segment(s); + } while(s-- > 0); + this->my_mask.store(this->embedded_buckets - 1, std::memory_order_relaxed); + } + + // Clear table and destroy it. 
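+    // (Not thread-safe: as with clear(), no other operation may run on the table concurrently.)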
+ ~concurrent_hash_map() { clear(); } + + //------------------------------------------------------------------------ + // Parallel algorithm support + //------------------------------------------------------------------------ + range_type range( size_type grainsize=1 ) { + return range_type( *this, grainsize ); + } + const_range_type range( size_type grainsize=1 ) const { + return const_range_type( *this, grainsize ); + } + + //------------------------------------------------------------------------ + // STL support - not thread-safe methods + //------------------------------------------------------------------------ + iterator begin() { return iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); } + const_iterator begin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); } + const_iterator cbegin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); } + iterator end() { return iterator( *this, 0, nullptr, nullptr ); } + const_iterator end() const { return const_iterator( *this, 0, nullptr, nullptr ); } + const_iterator cend() const { return const_iterator( *this, 0, nullptr, nullptr ); } + std::pair equal_range( const Key& key ) { return internal_equal_range( key, end() ); } + std::pair equal_range( const Key& key ) const { return internal_equal_range( key, end() ); } + + template + typename std::enable_if::value, + std::pair>::type equal_range( const K& key ) { + return internal_equal_range(key, end()); + } + + template + typename std::enable_if::value, + std::pair>::type equal_range( const K& key ) const { + return internal_equal_range(key, end()); + } + + // Number of items in table. + size_type size() const { return this->my_size.load(std::memory_order_acquire); } + + // True if size()==0. + __TBB_nodiscard bool empty() const { return size() == 0; } + + // Upper bound on size. + size_type max_size() const { + return allocator_traits_type::max_size(base_type::get_allocator()); + } + + // Returns the current number of buckets + size_type bucket_count() const { return this->my_mask.load(std::memory_order_relaxed) + 1; } + + // return allocator object + allocator_type get_allocator() const { return base_type::get_allocator(); } + + // swap two instances. Iterators are invalidated + void swap(concurrent_hash_map& table) { + using pocs_type = typename node_allocator_traits::propagate_on_container_swap; + using is_equal_type = typename node_allocator_traits::is_always_equal; + swap_allocators(this->my_allocator, table.my_allocator); + internal_swap(table, tbb::detail::disjunction()); + } + + //------------------------------------------------------------------------ + // concurrent map operations + //------------------------------------------------------------------------ + + // Return count of items (0 or 1) + size_type count( const Key &key ) const { + return const_cast(this)->lookup(key, nullptr, nullptr, /*write=*/false, &do_not_allocate_node); + } + + template + typename std::enable_if::value, + size_type>::type count( const K& key ) const { + return const_cast(this)->lookup(key, nullptr, nullptr, /*write=*/false, &do_not_allocate_node); + } + + // Find item and acquire a read lock on the item. + /** Return true if item is found, false otherwise. 
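+        A typical read-side pattern (the surrounding names are illustrative only):
+            concurrent_hash_map<int, std::string>::const_accessor a;
+            if (table.find(a, key)) {
+                read(a->second);   // the element stays read-locked while 'a' holds it
+            }                      // the lock is dropped when 'a' is released or destroyed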
*/ + bool find( const_accessor &result, const Key &key ) const { + result.release(); + return const_cast(this)->lookup(key, nullptr, &result, /*write=*/false, &do_not_allocate_node ); + } + + // Find item and acquire a write lock on the item. + /** Return true if item is found, false otherwise. */ + bool find( accessor &result, const Key &key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &do_not_allocate_node); + } + + template + typename std::enable_if::value, + bool>::type find( const_accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/false, &do_not_allocate_node); + } + + template + typename std::enable_if::value, + bool>::type find( accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &do_not_allocate_node); + } + + // Insert item (if not already present) and acquire a read lock on the item. + /** Returns true if item is new. */ + bool insert( const_accessor &result, const Key &key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/false, &allocate_node_default_construct<>); + } + + // Insert item (if not already present) and acquire a write lock on the item. + /** Returns true if item is new. */ + bool insert( accessor &result, const Key &key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &allocate_node_default_construct<>); + } + + template + typename std::enable_if::value && + std::is_constructible::value, + bool>::type insert( const_accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/false, &allocate_node_default_construct); + } + + template + typename std::enable_if::value && + std::is_constructible::value, + bool>::type insert( accessor& result, const K& key ) { + result.release(); + return lookup(key, nullptr, &result, /*write=*/true, &allocate_node_default_construct); + } + + // Insert item by copying if there is no such key present already and acquire a read lock on the item. + /** Returns true if item is new. */ + bool insert( const_accessor &result, const value_type &value ) { + result.release(); + return lookup(value.first, &value.second, &result, /*write=*/false, &allocate_node_copy_construct); + } + + // Insert item by copying if there is no such key present already and acquire a write lock on the item. + /** Returns true if item is new. */ + bool insert( accessor &result, const value_type &value ) { + result.release(); + return lookup(value.first, &value.second, &result, /*write=*/true, &allocate_node_copy_construct); + } + + // Insert item by copying if there is no such key present already + /** Returns true if item is inserted. */ + bool insert( const value_type &value ) { + return lookup(value.first, &value.second, nullptr, /*write=*/false, &allocate_node_copy_construct); + } + + // Insert item by copying if there is no such key present already and acquire a read lock on the item. + /** Returns true if item is new. */ + bool insert( const_accessor &result, value_type && value ) { + return generic_move_insert(result, std::move(value)); + } + + // Insert item by copying if there is no such key present already and acquire a write lock on the item. + /** Returns true if item is new. */ + bool insert( accessor &result, value_type && value ) { + return generic_move_insert(result, std::move(value)); + } + + // Insert item by copying if there is no such key present already + /** Returns true if item is inserted. 
*/ + bool insert( value_type && value ) { + return generic_move_insert(accessor_not_used(), std::move(value)); + } + + // Insert item by copying if there is no such key present already and acquire a read lock on the item. + /** Returns true if item is new. */ + template + bool emplace( const_accessor &result, Args&&... args ) { + return generic_emplace(result, std::forward(args)...); + } + + // Insert item by copying if there is no such key present already and acquire a write lock on the item. + /** Returns true if item is new. */ + template + bool emplace( accessor &result, Args&&... args ) { + return generic_emplace(result, std::forward(args)...); + } + + // Insert item by copying if there is no such key present already + /** Returns true if item is inserted. */ + template + bool emplace( Args&&... args ) { + return generic_emplace(accessor_not_used(), std::forward(args)...); + } + + // Insert range [first, last) + template + void insert( I first, I last ) { + for ( ; first != last; ++first ) + insert( *first ); + } + + // Insert initializer list + void insert( std::initializer_list il ) { + insert( il.begin(), il.end() ); + } + + // Erase item. + /** Return true if item was erased by particularly this call. */ + bool erase( const Key &key ) { + return internal_erase(key); + } + + template + typename std::enable_if::value, + bool>::type erase( const K& key ) { + return internal_erase(key); + } + + // Erase item by const_accessor. + /** Return true if item was erased by particularly this call. */ + bool erase( const_accessor& item_accessor ) { + return exclude( item_accessor ); + } + + // Erase item by accessor. + /** Return true if item was erased by particularly this call. */ + bool erase( accessor& item_accessor ) { + return exclude( item_accessor ); + } + +protected: + template + node* allocate_node_helper( const K& key, const T* t, AllocateNodeType allocate_node, std::true_type ) { + return allocate_node(base_type::get_allocator(), key, t); + } + + template + node* allocate_node_helper( const K&, const T*, AllocateNodeType, std::false_type ) { + __TBB_ASSERT(false, "allocate_node_helper with std::false_type should never been called"); + return nullptr; + } + + // Insert or find item and optionally acquire a lock on the item. 
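+    // Outline of the protocol implemented below (a reading aid, slightly simplified):
+    //   1. hash the key and read the current mask;
+    //   2. take a bucket_accessor on bucket (hash & mask), rehashing that bucket first if
+    //      it is still marked as requiring a rehash;
+    //   3. search the bucket chain; for an insertion, allocate the node and upgrade to a
+    //      write lock, re-searching after every upgrade/downgrade since other threads may
+    //      have inserted or erased the key in the meantime;
+    //   4. if the mask changed concurrently (check_mask_race), restart from step 1;
+    //   5. optionally lock the found or inserted item into *result, then enable a new
+    //      segment if insert_new_node requested growth.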
+ template + bool lookup( const K &key, const T *t, const_accessor *result, bool write, AllocateNodeType allocate_node, node *tmp_n = nullptr) + { + __TBB_ASSERT( !result || !result->my_node, nullptr ); + bool return_value; + hashcode_type const h = my_hash_compare.hash( key ); + hashcode_type m = this->my_mask.load(std::memory_order_acquire); + segment_index_type grow_segment = 0; + node *n; + restart: + {//lock scope + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + return_value = false; + // get bucket + bucket_accessor b( this, h & m ); + // find a node + n = search_bucket( key, b() ); + if( OpInsert ) { + // [opt] insert a key + if( !n ) { + if( !tmp_n ) { + tmp_n = allocate_node_helper(key, t, allocate_node, std::integral_constant{}); + } + while ( !b.is_writer() && !b.upgrade_to_writer() ) { // TODO: improved insertion + // Rerun search list, in case another thread inserted the intem during the upgrade + n = search_bucket(key, b()); + if (this->is_valid(n)) { // unfortunately, it did + if (!b.downgrade_to_reader()) { + // If the lock was downgraded with reacquiring the mutex + // Rerun search list in case another thread removed the item during the downgrade + n = search_bucket(key, b()); + if (!this->is_valid(n)) { + // Unfortunately, it did + // We need to try upgrading to writer again + continue; + } + } + goto exists; + } + } + + if( this->check_mask_race(h, m) ) + goto restart; // b.release() is done in ~b(). + // insert and set flag to grow the container + grow_segment = this->insert_new_node( b(), n = tmp_n, m ); + tmp_n = nullptr; + return_value = true; + } + } else { // find or count + if( !n ) { + if( this->check_mask_race( h, m ) ) + goto restart; // b.release() is done in ~b(). TODO: replace by continue + return false; + } + return_value = true; + } + exists: + if( !result ) goto check_growth; + // TODO: the following seems as generic/regular operation + // acquire the item + if( !result->try_acquire( n->mutex, write ) ) { + for( tbb::detail::atomic_backoff backoff(true);; ) { + if( result->try_acquire( n->mutex, write ) ) break; + if( !backoff.bounded_pause() ) { + // the wait takes really long, restart the operation + b.release(); + __TBB_ASSERT( !OpInsert || !return_value, "Can't acquire new item in locked bucket?" ); + yield(); + m = this->my_mask.load(std::memory_order_acquire); + goto restart; + } + } + } + }//lock scope + result->my_node = n; + result->my_hash = h; + check_growth: + // [opt] grow the container + if( grow_segment ) { + this->enable_segment( grow_segment ); + } + if( tmp_n ) // if OpInsert only + delete_node( tmp_n ); + return return_value; + } + + struct accessor_not_used { void release(){}}; + friend const_accessor* accessor_location( accessor_not_used const& ){ return nullptr;} + friend const_accessor* accessor_location( const_accessor & a ) { return &a;} + + friend bool is_write_access_needed( accessor const& ) { return true;} + friend bool is_write_access_needed( const_accessor const& ) { return false;} + friend bool is_write_access_needed( accessor_not_used const& ) { return false;} + + template + bool generic_move_insert( Accessor && result, value_type && value ) { + result.release(); + return lookup(value.first, &value.second, accessor_location(result), is_write_access_needed(result), &allocate_node_move_construct); + } + + template + bool generic_emplace( Accessor && result, Args &&... 
args ) { + result.release(); + node * node_ptr = create_node(base_type::get_allocator(), std::forward(args)...); + return lookup(node_ptr->value().first, nullptr, accessor_location(result), is_write_access_needed(result), &do_not_allocate_node, node_ptr); + } + + // delete item by accessor + bool exclude( const_accessor &item_accessor ) { + __TBB_ASSERT( item_accessor.my_node, nullptr ); + node_base *const exclude_node = item_accessor.my_node; + hashcode_type const hash = item_accessor.my_hash; + hashcode_type mask = this->my_mask.load(std::memory_order_acquire); + do { + // get bucket + bucket_accessor b( this, hash & mask, /*writer=*/true ); + node_base* prev = nullptr; + node_base* curr = b()->node_list.load(std::memory_order_relaxed); + + while (curr && curr != exclude_node) { + prev = curr; + curr = curr->next; + } + + if (curr == nullptr) { // someone else was first + if (this->check_mask_race(hash, mask)) + continue; + item_accessor.release(); + return false; + } + __TBB_ASSERT( curr == exclude_node, nullptr ); + // remove from container + if (prev == nullptr) { + b()->node_list.store(curr->next, std::memory_order_relaxed); + } else { + prev->next = curr->next; + } + + this->my_size--; + break; + } while(true); + if (!item_accessor.is_writer()) { // need to get exclusive lock + item_accessor.upgrade_to_writer(); // return value means nothing here + } + + item_accessor.release(); + delete_node(exclude_node); // Only one thread can delete it + return true; + } + + template + bool internal_erase( const K& key ) { + node_base *erase_node; + hashcode_type const hash = my_hash_compare.hash(key); + hashcode_type mask = this->my_mask.load(std::memory_order_acquire); + restart: + {//lock scope + // get bucket + bucket_accessor b( this, hash & mask ); + search: + node_base* prev = nullptr; + erase_node = b()->node_list.load(std::memory_order_relaxed); + while (this->is_valid(erase_node) && !my_hash_compare.equal(key, static_cast(erase_node)->value().first ) ) { + prev = erase_node; + erase_node = erase_node->next; + } + + if (erase_node == nullptr) { // not found, but mask could be changed + if (this->check_mask_race(hash, mask)) + goto restart; + return false; + } else if (!b.is_writer() && !b.upgrade_to_writer()) { + if (this->check_mask_race(hash, mask)) // contended upgrade, check mask + goto restart; + goto search; + } + + // remove from container + if (prev == nullptr) { + b()->node_list.store(erase_node->next, std::memory_order_relaxed); + } else { + prev->next = erase_node->next; + } + this->my_size--; + } + { + typename node::scoped_type item_locker( erase_node->mutex, /*write=*/true ); + } + // note: there should be no threads pretending to acquire this mutex again, do not try to upgrade const_accessor! 
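+        // The node was already unlinked from its bucket above, under the bucket's write lock,
+        // so no new thread can reach it; this transient write acquisition of its mutex merely
+        // waits out any accessor that still held the element when it was unlinked.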
+ delete_node(erase_node); // Only one thread can delete it due to write lock on the bucket + return true; + } + + // Returns an iterator for an item defined by the key, or for the next item after it (if upper==true) + template + std::pair internal_equal_range( const K& key, I end_ ) const { + hashcode_type h = my_hash_compare.hash( key ); + hashcode_type m = this->my_mask.load(std::memory_order_relaxed); + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + h &= m; + bucket *b = this->get_bucket( h ); + while (rehash_required(b->node_list.load(std::memory_order_relaxed))) { + m = ( hashcode_type(1) << tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit + b = this->get_bucket( h &= m ); + } + node *n = search_bucket( key, b ); + if( !n ) + return std::make_pair(end_, end_); + iterator lower(*this, h, b, n), upper(lower); + return std::make_pair(lower, ++upper); + } + + // Copy "source" to *this, where *this must start out empty. + void internal_copy( const concurrent_hash_map& source ) { + hashcode_type mask = source.my_mask.load(std::memory_order_relaxed); + if( this->my_mask.load(std::memory_order_relaxed) == mask ) { // optimized version + this->reserve(source.my_size.load(std::memory_order_relaxed)); // TODO: load_factor? + bucket *dst = nullptr, *src = nullptr; + bool rehashing_required = false; + for( hashcode_type k = 0; k <= mask; k++ ) { + if( k & (k-2) ) ++dst,src++; // not the beginning of a segment + else { dst = this->get_bucket( k ); src = source.get_bucket( k ); } + __TBB_ASSERT(!rehash_required(dst->node_list.load(std::memory_order_relaxed)), "Invalid bucket in destination table"); + node *n = static_cast( src->node_list.load(std::memory_order_relaxed) ); + if (rehash_required(n)) { // source is not rehashed, items are in previous buckets + rehashing_required = true; + dst->node_list.store(reinterpret_cast(rehash_req_flag), std::memory_order_relaxed); + } else for(; n; n = static_cast( n->next ) ) { + node* node_ptr = create_node(base_type::get_allocator(), n->value().first, n->value().second); + this->add_to_bucket( dst, node_ptr); + this->my_size.fetch_add(1, std::memory_order_relaxed); + } + } + if( rehashing_required ) rehash(); + } else internal_copy(source.begin(), source.end(), source.my_size.load(std::memory_order_relaxed)); + } + + template + void internal_copy( I first, I last, size_type reserve_size ) { + this->reserve(reserve_size); // TODO: load_factor? 
+ hashcode_type m = this->my_mask.load(std::memory_order_relaxed); + for(; first != last; ++first) { + hashcode_type h = my_hash_compare.hash( (*first).first ); + bucket *b = this->get_bucket( h & m ); + __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), "Invalid bucket in destination table"); + node* node_ptr = create_node(base_type::get_allocator(), (*first).first, (*first).second); + this->add_to_bucket( b, node_ptr ); + ++this->my_size; // TODO: replace by non-atomic op + } + } + + void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type&, + /*is_always_equal=*/std::true_type ) + { + this->internal_move(std::move(other)); + } + + void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type& a, + /*is_always_equal=*/std::false_type ) + { + if (a == other.get_allocator()){ + this->internal_move(std::move(other)); + } else { + try_call( [&] { + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()), + other.size()); + }).on_exception( [&] { + this->clear(); + }); + } + } + + void internal_move_assign( concurrent_hash_map&& other, + /*is_always_equal || POCMA = */std::true_type) + { + this->internal_move(std::move(other)); + } + + void internal_move_assign(concurrent_hash_map&& other, /*is_always_equal=*/ std::false_type) { + if (this->my_allocator == other.my_allocator) { + this->internal_move(std::move(other)); + } else { + //do per element move + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()), + other.size()); + } + } + + void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::true_type) { + this->internal_swap_content(other); + } + + void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::false_type) { + __TBB_ASSERT(this->my_allocator == other.my_allocator, nullptr); + this->internal_swap_content(other); + } + + // Fast find when no concurrent erasure is used. For internal use inside TBB only! + /** Return pointer to item with given key, or nullptr if no such item exists. + Must not be called concurrently with erasure operations. 
*/ + const_pointer internal_fast_find( const Key& key ) const { + hashcode_type h = my_hash_compare.hash( key ); + hashcode_type m = this->my_mask.load(std::memory_order_acquire); + node *n; + restart: + __TBB_ASSERT((m&(m+1))==0, "data structure is invalid"); + bucket *b = this->get_bucket( h & m ); + // TODO: actually, notification is unnecessary here, just hiding double-check + if (rehash_required(b->node_list.load(std::memory_order_acquire))) + { + typename bucket::scoped_type lock; + if( lock.try_acquire( b->mutex, /*write=*/true ) ) { + if (rehash_required(b->node_list.load(std::memory_order_relaxed))) + const_cast(this)->rehash_bucket( b, h & m ); //recursive rehashing + } + else lock.acquire( b->mutex, /*write=*/false ); + __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), nullptr); + } + n = search_bucket( key, b ); + if( n ) + return n->storage(); + else if( this->check_mask_race( h, m ) ) + goto restart; + return nullptr; + } +}; + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename Alloc = tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_hash_map( It, It, HashCompare = HashCompare(), Alloc = Alloc() ) +-> concurrent_hash_map, iterator_mapped_t, HashCompare, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_hash_map( It, It, Alloc ) +-> concurrent_hash_map, iterator_mapped_t, d1::tbb_hash_compare>, Alloc>; + +template >, + typename Alloc = tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_hash_map( std::initializer_list>, HashCompare = HashCompare(), Alloc = Alloc() ) +-> concurrent_hash_map, T, HashCompare, Alloc>; + +template >> +concurrent_hash_map( std::initializer_list>, Alloc ) +-> concurrent_hash_map, T, d1::tbb_hash_compare>, Alloc>; + +#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ + +template +inline bool operator==(const concurrent_hash_map &a, const concurrent_hash_map &b) { + if(a.size() != b.size()) return false; + typename concurrent_hash_map::const_iterator i(a.begin()), i_end(a.end()); + typename concurrent_hash_map::const_iterator j, j_end(b.end()); + for(; i != i_end; ++i) { + j = b.equal_range(i->first).first; + if( j == j_end || !(i->second == j->second) ) return false; + } + return true; +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +inline bool operator!=(const concurrent_hash_map &a, const concurrent_hash_map &b) +{ return !(a == b); } +#endif // !__TBB_CPP20_COMPARISONS_PRESENT + +template +inline void swap(concurrent_hash_map &a, concurrent_hash_map &b) +{ a.swap( b ); } + +} // namespace d2 +} // namespace detail + +inline namespace v1 { + using detail::split; + using detail::d2::concurrent_hash_map; + using detail::d1::tbb_hash_compare; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_concurrent_hash_map_H */ diff --git a/third_party/tbb/concurrent_lru_cache.h b/third_party/tbb/concurrent_lru_cache.h new file mode 100644 index 000000000..d8fe096b4 --- /dev/null +++ b/third_party/tbb/concurrent_lru_cache.h @@ -0,0 +1,375 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_lru_cache_H +#define __TBB_concurrent_lru_cache_H + +#if ! TBB_PREVIEW_CONCURRENT_LRU_CACHE + #error Set TBB_PREVIEW_CONCURRENT_LRU_CACHE to include concurrent_lru_cache.h +#endif + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_aggregator.h" + +#include "third_party/libcxx/map" // for std::map +#include "third_party/libcxx/list" // for std::list +#include "third_party/libcxx/utility" // for std::make_pair +#include "third_party/libcxx/algorithm" // for std::find +#include "third_party/libcxx/atomic" // for std::atomic + +namespace tbb { + +namespace detail { +namespace d1 { + +//----------------------------------------------------------------------------- +// Concurrent LRU cache +//----------------------------------------------------------------------------- + +template +class concurrent_lru_cache : no_assign { +// incapsulated helper classes +private: + struct handle_object; + struct storage_map_value_type; + + struct aggregator_operation; + struct retrieve_aggregator_operation; + struct signal_end_of_usage_aggregator_operation; + +// typedefs +public: + using key_type = KeyT; + using value_type = ValT; + using pointer = ValT*; + using reference = ValT&; + using const_pointer = const ValT*; + using const_reference = const ValT&; + + using value_function_type = KeyToValFunctorT; + using handle = handle_object; +private: + using lru_cache_type = concurrent_lru_cache; + + using storage_map_type = std::map; + using storage_map_iterator_type = typename storage_map_type::iterator; + using storage_map_pointer_type = typename storage_map_type::pointer; + using storage_map_reference_type = typename storage_map_type::reference; + + using history_list_type = std::list; + using history_list_iterator_type = typename history_list_type::iterator; + + using aggregator_operation_type = aggregator_operation; + using aggregator_function_type = aggregating_functor; + using aggregator_type = aggregator; + + friend class aggregating_functor; + +// fields +private: + value_function_type my_value_function; + aggregator_type my_aggregator; + + storage_map_type my_storage_map; // storage map for used objects + history_list_type my_history_list; // history list for unused objects + const std::size_t my_history_list_capacity; // history list's allowed capacity + +// interface +public: + + concurrent_lru_cache(value_function_type value_function, std::size_t cache_capacity) + : my_value_function(value_function), my_history_list_capacity(cache_capacity) { + my_aggregator.initialize_handler(aggregator_function_type(this)); + } + + handle operator[](key_type key) { + retrieve_aggregator_operation op(key); + my_aggregator.execute(&op); + + if (op.is_new_value_needed()) { + op.result().second.my_value = my_value_function(key); + op.result().second.my_is_ready.store(true, std::memory_order_release); + } else { + spin_wait_while_eq(op.result().second.my_is_ready, false); + } + + return handle(*this, op.result()); + } + +private: + + void handle_operations(aggregator_operation* op_list) { + while (op_list) { + op_list->cast_and_handle(*this); + 
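+            // operations queued by other threads are handled here one at a time; storing 1
+            // into 'status' below is what lets the thread that issued the operation proceed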
            aggregator_operation* prev_op = op_list;
+            op_list = op_list->next;
+
+            (prev_op->status).store(1, std::memory_order_release);
+        }
+    }
+
+    void signal_end_of_usage(storage_map_reference_type map_record_ref) {
+        signal_end_of_usage_aggregator_operation op(map_record_ref);
+        my_aggregator.execute(&op);
+    }
+
+    void signal_end_of_usage_serial(storage_map_reference_type map_record_ref) {
+        storage_map_iterator_type map_it = my_storage_map.find(map_record_ref.first);
+
+        __TBB_ASSERT(map_it != my_storage_map.end(),
+            "cache should not return past-end iterators to outer world");
+        __TBB_ASSERT(&(*map_it) == &map_record_ref,
+            "dangling reference has been returned to outside world: data race?");
+        __TBB_ASSERT(std::find(my_history_list.begin(), my_history_list.end(), map_it) == my_history_list.end(),
+            "an object in use should not be in the list of unused objects");
+
+        // if it was the last reference, put the item into the LRU history
+        if (! --(map_it->second.my_ref_counter)) {
+            // if the LRU history is full, evict the oldest items to get space
+            if (my_history_list.size() >= my_history_list_capacity) {
+                if (my_history_list_capacity == 0) {
+                    // the LRU history capacity is zero, so there is no need to keep the element in history
+                    my_storage_map.erase(map_it);
+                    return;
+                }
+                std::size_t number_of_elements_to_evict = 1 + my_history_list.size() - my_history_list_capacity;
+
+                for (std::size_t i = 0; i < number_of_elements_to_evict; ++i) {
+                    storage_map_iterator_type map_it_to_evict = my_history_list.back();
+
+                    __TBB_ASSERT(map_it_to_evict->second.my_ref_counter == 0,
+                        "an item to be evicted should not have live references");
+
+                    // TODO: can we use forward_list instead of list? pop_front / insert_after last
+                    my_history_list.pop_back();
+                    my_storage_map.erase(map_it_to_evict);
+                }
+            }
+
+            // TODO: can we use forward_list instead of list? pop_front / insert_after last
+            my_history_list.push_front(map_it);
+            map_it->second.my_history_list_iterator = my_history_list.begin();
+        }
+    }
+
+    storage_map_reference_type retrieve_serial(key_type key, bool& is_new_value_needed) {
+        storage_map_iterator_type map_it = my_storage_map.find(key);
+
+        if (map_it == my_storage_map.end()) {
+            map_it = my_storage_map.emplace_hint(
+                map_it, std::piecewise_construct, std::make_tuple(key), std::make_tuple(value_type(), 0, my_history_list.end(), false));
+            is_new_value_needed = true;
+        } else {
+            history_list_iterator_type list_it = map_it->second.my_history_list_iterator;
+            if (list_it != my_history_list.end()) {
+                __TBB_ASSERT(map_it->second.my_ref_counter == 0,
+                    "an item to be evicted should not have live references");
+
+                // Item is going to be used. Therefore it is not subject to eviction,
+                // so we remove it from LRU history.
+ my_history_list.erase(list_it); + map_it->second.my_history_list_iterator = my_history_list.end(); + } + } + + ++(map_it->second.my_ref_counter); + return *map_it; + } +}; + +//----------------------------------------------------------------------------- +// Value type for storage map in concurrent LRU cache +//----------------------------------------------------------------------------- + +template +struct concurrent_lru_cache::storage_map_value_type { +//typedefs +public: + using ref_counter_type = std::size_t; + +// fields +public: + value_type my_value; + ref_counter_type my_ref_counter; + history_list_iterator_type my_history_list_iterator; + std::atomic my_is_ready; + +// interface +public: + storage_map_value_type( + value_type const& value, ref_counter_type ref_counter, + history_list_iterator_type history_list_iterator, bool is_ready) + : my_value(value), my_ref_counter(ref_counter), + my_history_list_iterator(history_list_iterator), my_is_ready(is_ready) {} +}; + +//----------------------------------------------------------------------------- +// Handle object for operator[] in concurrent LRU cache +//----------------------------------------------------------------------------- + +template +struct concurrent_lru_cache::handle_object { +// fields +private: + lru_cache_type* my_lru_cache_ptr; + storage_map_pointer_type my_map_record_ptr; + +// interface +public: + handle_object() + : my_lru_cache_ptr(nullptr), my_map_record_ptr(nullptr) {} + handle_object(lru_cache_type& lru_cache_ref, storage_map_reference_type map_record_ref) + : my_lru_cache_ptr(&lru_cache_ref), my_map_record_ptr(&map_record_ref) {} + + handle_object(handle_object&) = delete; + void operator=(handle_object&) = delete; + + handle_object(handle_object&& other) + : my_lru_cache_ptr(other.my_lru_cache_ptr), my_map_record_ptr(other.my_map_record_ptr) { + + __TBB_ASSERT( + (other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) || + (other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr), + "invalid state of moving object?"); + + other.my_lru_cache_ptr = nullptr; + other.my_map_record_ptr = nullptr; + } + + handle_object& operator=(handle_object&& other) { + __TBB_ASSERT( + (other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) || + (other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr), + "invalid state of moving object?"); + + if (my_lru_cache_ptr) + my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr); + + my_lru_cache_ptr = other.my_lru_cache_ptr; + my_map_record_ptr = other.my_map_record_ptr; + other.my_lru_cache_ptr = nullptr; + other.my_map_record_ptr = nullptr; + + return *this; + } + + ~handle_object() { + if (my_lru_cache_ptr) + my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr); + } + + operator bool() const { + return (my_lru_cache_ptr && my_map_record_ptr); + } + + value_type& value() { + __TBB_ASSERT(my_lru_cache_ptr, "get value from already moved object?"); + __TBB_ASSERT(my_map_record_ptr, "get value from an invalid or already moved object?"); + + return my_map_record_ptr->second.my_value; + } +}; + +//----------------------------------------------------------------------------- +// Aggregator operation for aggregator type in concurrent LRU cache +//----------------------------------------------------------------------------- + +template +struct concurrent_lru_cache::aggregator_operation + : aggregated_operation { +// incapsulated helper classes +public: + enum class op_type { retrieve, signal_end_of_usage }; + +// 
fields +private: + op_type my_op; + +// interface +public: + aggregator_operation(op_type op) : my_op(op) {} + + // TODO: aggregator_operation can be implemented + // - as a statically typed variant type or CRTP? (static, dependent on the use case) + // - or use pointer to function and apply_visitor (dynamic) + // - or use virtual functions (dynamic) + void cast_and_handle(lru_cache_type& lru_cache_ref) { + if (my_op == op_type::retrieve) + static_cast(this)->handle(lru_cache_ref); + else + static_cast(this)->handle(lru_cache_ref); + } +}; + +template +struct concurrent_lru_cache::retrieve_aggregator_operation + : aggregator_operation, private no_assign { +public: + key_type my_key; + storage_map_pointer_type my_map_record_ptr; + bool my_is_new_value_needed; + +public: + retrieve_aggregator_operation(key_type key) + : aggregator_operation(aggregator_operation::op_type::retrieve), + my_key(key), my_map_record_ptr(nullptr), my_is_new_value_needed(false) {} + + void handle(lru_cache_type& lru_cache_ref) { + my_map_record_ptr = &lru_cache_ref.retrieve_serial(my_key, my_is_new_value_needed); + } + + storage_map_reference_type result() { + __TBB_ASSERT(my_map_record_ptr, "Attempt to call result() before calling handle()"); + return *my_map_record_ptr; + } + + bool is_new_value_needed() { return my_is_new_value_needed; } +}; + +template +struct concurrent_lru_cache::signal_end_of_usage_aggregator_operation + : aggregator_operation, private no_assign { + +private: + storage_map_reference_type my_map_record_ref; + +public: + signal_end_of_usage_aggregator_operation(storage_map_reference_type map_record_ref) + : aggregator_operation(aggregator_operation::op_type::signal_end_of_usage), + my_map_record_ref(map_record_ref) {} + + void handle(lru_cache_type& lru_cache_ref) { + lru_cache_ref.signal_end_of_usage_serial(my_map_record_ref); + } +}; + +// TODO: if we have guarantees that KeyToValFunctorT always have +// ValT as a return type and KeyT as an argument type +// we can deduce template parameters of concurrent_lru_cache +// by pattern matching on KeyToValFunctorT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + +using detail::d1::concurrent_lru_cache; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_lru_cache_H diff --git a/third_party/tbb/concurrent_map.h b/third_party/tbb/concurrent_map.h new file mode 100644 index 000000000..55e2f3568 --- /dev/null +++ b/third_party/tbb/concurrent_map.h @@ -0,0 +1,351 @@ +// clang-format off +/* + Copyright (c) 2019-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_map_H +#define __TBB_concurrent_map_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_skip_list.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/utility" + +namespace tbb { +namespace detail { +namespace d2 { + +template +struct map_traits { + static constexpr std::size_t max_level = RandomGenerator::max_level; + using random_level_generator_type = RandomGenerator; + using key_type = Key; + using mapped_type = Value; + using compare_type = KeyCompare; + using value_type = std::pair; + using reference = value_type&; + using const_reference = const value_type&; + using allocator_type = Allocator; + + static constexpr bool allow_multimapping = AllowMultimapping; + + class value_compare { + public: + bool operator()(const value_type& lhs, const value_type& rhs) const { + return comp(lhs.first, rhs.first); + } + + protected: + value_compare(compare_type c) : comp(c) {} + + friend struct map_traits; + + compare_type comp; + }; + + static value_compare value_comp(compare_type comp) { return value_compare(comp); } + + static const key_type& get_key(const_reference val) { + return val.first; + } +}; // struct map_traits + +template +class concurrent_multimap; + +template , typename Allocator = tbb::tbb_allocator>> +class concurrent_map : public concurrent_skip_list, Allocator, false>> { + using base_type = concurrent_skip_list, Allocator, false>>; +public: + using key_type = Key; + using mapped_type = Value; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base type + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_map() = default; + concurrent_map( const concurrent_map& ) = default; + concurrent_map( const concurrent_map& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_map( concurrent_map&& ) = default; + concurrent_map( concurrent_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_map& operator=( const concurrent_map& ) = default; + concurrent_map& operator=( concurrent_map&& ) = default; + + concurrent_map& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + // Observers + mapped_type& at(const key_type& key) { + iterator it = this->find(key); + + if (it == this->end()) { + throw_exception(exception_id::invalid_key); + } + return it->second; + } + + const mapped_type& at(const key_type& key) const { + return const_cast(this)->at(key); + } + + mapped_type& operator[](const key_type& key) { + iterator it = this->find(key); + + if (it == this->end()) { + it = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first; + } + return 
it->second; + } + + mapped_type& operator[](key_type&& key) { + iterator it = this->find(key); + + if (it == this->end()) { + it = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first; + } + return it->second; + } + + using base_type::insert; + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) + { + return this->emplace(std::forward

<P>(value)); + } + + template<typename P> + typename std::enable_if<std::is_constructible<value_type, P&&>::value, + iterator>::type insert( const_iterator hint, P&& value ) + { + return this->emplace_hint(hint, std::forward<P>

(value)); + } + + template + void merge(concurrent_map& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_map&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_multimap& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multimap&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_map + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_map( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_map, iterator_mapped_t, Comp, Alloc>; + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_map( std::initializer_list>, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_map, T, Comp, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_map( It, It, Alloc ) +-> concurrent_map, iterator_mapped_t, + std::less>, Alloc>; + +template >> +concurrent_map( std::initializer_list>, Alloc ) +-> concurrent_map, T, std::less>, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_map& lhs, + concurrent_map& rhs ) +{ + lhs.swap(rhs); +} + +template , typename Allocator = tbb::tbb_allocator>> +class concurrent_multimap : public concurrent_skip_list, Allocator, true>> { + using base_type = concurrent_skip_list, Allocator, true>>; +public: + using key_type = Key; + using mapped_type = Value; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base_type + using base_type::base_type; + using base_type::insert; + + // Required for implicit deduction guides + concurrent_multimap() = default; + concurrent_multimap( const concurrent_multimap& ) = default; + concurrent_multimap( const concurrent_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_multimap( concurrent_multimap&& ) = default; + concurrent_multimap( concurrent_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_multimap& operator=( const concurrent_multimap& ) = default; + concurrent_multimap& operator=( concurrent_multimap&& ) = default; + + concurrent_multimap& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) + { + return this->emplace(std::forward

<P>(value)); + } + + template<typename P> + typename std::enable_if<std::is_constructible<value_type, P&&>::value, + iterator>::type insert( const_iterator hint, P&& value ) + { + return this->emplace_hint(hint, std::forward<P>

(value)); + } + + template + void merge(concurrent_multimap& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multimap&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_map& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_map&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_multimap + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multimap( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multimap, iterator_mapped_t, Comp, Alloc>; + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multimap( std::initializer_list>, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multimap, T, Comp, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_multimap( It, It, Alloc ) +-> concurrent_multimap, iterator_mapped_t, + std::less>, Alloc>; + +template >> +concurrent_multimap( std::initializer_list>, Alloc ) +-> concurrent_multimap, T, std::less>, Alloc>; + + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_multimap& lhs, + concurrent_multimap& rhs ) +{ + lhs.swap(rhs); +} + +} // namespace d2 +} // namespace detail + +inline namespace v1 { + +using detail::d2::concurrent_map; +using detail::d2::concurrent_multimap; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_map_H diff --git a/third_party/tbb/concurrent_monitor.h b/third_party/tbb/concurrent_monitor.h new file mode 100644 index 000000000..539706ed1 --- /dev/null +++ b/third_party/tbb/concurrent_monitor.h @@ -0,0 +1,489 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_monitor_H +#define __TBB_concurrent_monitor_H + +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/concurrent_monitor_mutex.h" +#include "third_party/tbb/semaphore.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +//! 
Circular doubly-linked list with sentinel +/** head.next points to the front and head.prev points to the back */ +class circular_doubly_linked_list_with_sentinel { +public: + struct base_node { + base_node* next; + base_node* prev; + + constexpr base_node(base_node* n, base_node* p) : next(n), prev(p) {} + explicit base_node() : next((base_node*)(uintptr_t)0xcdcdcdcd), prev((base_node*)(uintptr_t)0xcdcdcdcd) {} + }; + + // ctor + constexpr circular_doubly_linked_list_with_sentinel() : count(0), head(&head, &head) {} + + circular_doubly_linked_list_with_sentinel(const circular_doubly_linked_list_with_sentinel&) = delete; + circular_doubly_linked_list_with_sentinel& operator=(const circular_doubly_linked_list_with_sentinel&) = delete; + + inline std::size_t size() const { return count.load(std::memory_order_relaxed); } + inline bool empty() const { return size() == 0; } + inline base_node* front() const { return head.next; } + inline base_node* last() const { return head.prev; } + inline const base_node* end() const { return &head; } + + //! add to the back of the list + inline void add( base_node* n ) { + count.store(count.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + n->prev = head.prev; + n->next = &head; + head.prev->next = n; + head.prev = n; + } + + //! remove node 'n' + inline void remove( base_node& n ) { + __TBB_ASSERT(count.load(std::memory_order_relaxed) > 0, "attempt to remove an item from an empty list"); + count.store(count.load( std::memory_order_relaxed ) - 1, std::memory_order_relaxed); + n.prev->next = n.next; + n.next->prev = n.prev; + } + + //! move all elements to 'lst' and initialize the 'this' list + inline void flush_to( circular_doubly_linked_list_with_sentinel& lst ) { + const std::size_t l_count = size(); + if (l_count > 0) { + lst.count.store(l_count, std::memory_order_relaxed); + lst.head.next = head.next; + lst.head.prev = head.prev; + head.next->prev = &lst.head; + head.prev->next = &lst.head; + clear(); + } + } + + void clear() { + head.next = &head; + head.prev = &head; + count.store(0, std::memory_order_relaxed); + } +private: + std::atomic count; + base_node head; +}; + +using base_list = circular_doubly_linked_list_with_sentinel; +using base_node = circular_doubly_linked_list_with_sentinel::base_node; + +template +class concurrent_monitor_base; + +template +class wait_node : public base_node { +public: + +#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900 + wait_node(Context ctx) : my_context(ctx), my_is_in_list(false) {} +#else + wait_node(Context ctx) : my_context(ctx) {} +#endif + + virtual ~wait_node() = default; + + virtual void init() { + __TBB_ASSERT(!my_initialized, nullptr); + my_initialized = true; + } + + virtual void wait() = 0; + + virtual void reset() { + __TBB_ASSERT(my_skipped_wakeup, nullptr); + my_skipped_wakeup = false; + } + + virtual void notify() = 0; + +protected: + friend class concurrent_monitor_base; + friend class thread_data; + + Context my_context{}; +#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900 + std::atomic my_is_in_list; +#else + std::atomic my_is_in_list{false}; +#endif + + bool my_initialized{false}; + bool my_skipped_wakeup{false}; + bool my_aborted{false}; + unsigned my_epoch{0}; +}; + +template +class sleep_node : public wait_node { + using base_type = wait_node; +public: + using base_type::base_type; + + ~sleep_node() override { + if (this->my_initialized) { + if (this->my_skipped_wakeup) semaphore().P(); + semaphore().~binary_semaphore(); + } + } + + 
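+    // The binary_semaphore lives in the raw `sema` storage below and is constructed
+    // lazily by init(), so this accessor is only meaningful once my_initialized is set.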
binary_semaphore& semaphore() { return *sema.begin(); } + + void init() override { + if (!this->my_initialized) { + new (sema.begin()) binary_semaphore; + base_type::init(); + } + } + + void wait() override { + __TBB_ASSERT(this->my_initialized, + "Use of commit_wait() without prior prepare_wait()"); + semaphore().P(); + __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?"); + if (this->my_aborted) + throw_exception(exception_id::user_abort); + } + + void reset() override { + base_type::reset(); + semaphore().P(); + } + + void notify() override { + semaphore().V(); + } + +private: + tbb::detail::aligned_space sema; +}; + +//! concurrent_monitor +/** fine-grained concurrent_monitor implementation */ +template +class concurrent_monitor_base { +public: + //! ctor + constexpr concurrent_monitor_base() {} + //! dtor + ~concurrent_monitor_base() = default; + + concurrent_monitor_base(const concurrent_monitor_base&) = delete; + concurrent_monitor_base& operator=(const concurrent_monitor_base&) = delete; + + //! prepare wait by inserting 'thr' into the wait queue + void prepare_wait( wait_node& node) { + // TODO: consider making even more lazy instantiation of the semaphore, that is only when it is actually needed, e.g. move it in node::wait() + if (!node.my_initialized) { + node.init(); + } + // this is good place to pump previous skipped wakeup + else if (node.my_skipped_wakeup) { + node.reset(); + } + + node.my_is_in_list.store(true, std::memory_order_relaxed); + + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + node.my_epoch = my_epoch.load(std::memory_order_relaxed); + my_waitset.add(&node); + } + + // Prepare wait guarantees Write Read memory barrier. + // In C++ only full fence covers this type of barrier. + atomic_fence_seq_cst(); + } + + //! Commit wait if event count has not changed; otherwise, cancel wait. + /** Returns true if committed, false if canceled. */ + inline bool commit_wait( wait_node& node ) { + const bool do_it = node.my_epoch == my_epoch.load(std::memory_order_relaxed); + // this check is just an optimization + if (do_it) { + node.wait(); + } else { + cancel_wait( node ); + } + return do_it; + } + + //! Cancel the wait. Removes the thread from the wait queue if not removed yet. + void cancel_wait( wait_node& node ) { + // possible skipped wakeup will be pumped in the following prepare_wait() + node.my_skipped_wakeup = true; + // try to remove node from waitset + // Cancel wait guarantees acquire memory barrier. + bool in_list = node.my_is_in_list.load(std::memory_order_acquire); + if (in_list) { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + if (node.my_is_in_list.load(std::memory_order_relaxed)) { + my_waitset.remove(node); + // node is removed from waitset, so there will be no wakeup + node.my_is_in_list.store(false, std::memory_order_relaxed); + node.my_skipped_wakeup = false; + } + } + } + + //! Wait for a condition to be satisfied with waiting-on my_context + template + bool wait(Pred&& pred, NodeType&& node) { + prepare_wait(node); + while (!guarded_call(std::forward(pred), node)) { + if (commit_wait(node)) { + return true; + } + + prepare_wait(node); + } + + cancel_wait(node); + return false; + } + + //! Notify one thread about the event + void notify_one() { + atomic_fence_seq_cst(); + notify_one_relaxed(); + } + + //! Notify one thread about the event. Relaxed version. 
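+    //! The relaxed form is intended for callers that have already established the
+    //! required ordering; notify_one() above issues atomic_fence_seq_cst() first.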
+ void notify_one_relaxed() { + if (my_waitset.empty()) { + return; + } + + base_node* n; + const base_node* end = my_waitset.end(); + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + n = my_waitset.front(); + if (n != end) { + my_waitset.remove(*n); + to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); + } + } + + if (n != end) { + to_wait_node(n)->notify(); + } + } + + //! Notify all waiting threads of the event + void notify_all() { + atomic_fence_seq_cst(); + notify_all_relaxed(); + } + + // ! Notify all waiting threads of the event; Relaxed version + void notify_all_relaxed() { + if (my_waitset.empty()) { + return; + } + + base_list temp; + const base_node* end; + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + // TODO: Possible optimization, don't change node state under lock, just do flush + my_waitset.flush_to(temp); + end = temp.end(); + for (base_node* n = temp.front(); n != end; n = n->next) { + to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); + } + } + + base_node* nxt; + for (base_node* n = temp.front(); n != end; n=nxt) { + nxt = n->next; + to_wait_node(n)->notify(); + } +#if TBB_USE_ASSERT + temp.clear(); +#endif + } + + //! Notify waiting threads of the event that satisfies the given predicate + template + void notify( const P& predicate ) { + atomic_fence_seq_cst(); + notify_relaxed( predicate ); + } + + //! Notify waiting threads of the event that satisfies the given predicate; + //! the predicate is called under the lock. Relaxed version. + template + void notify_relaxed( const P& predicate ) { + if (my_waitset.empty()) { + return; + } + + base_list temp; + base_node* nxt; + const base_node* end = my_waitset.end(); + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed); + for (base_node* n = my_waitset.last(); n != end; n = nxt) { + nxt = n->prev; + auto* node = static_cast*>(n); + if (predicate(node->my_context)) { + my_waitset.remove(*n); + node->my_is_in_list.store(false, std::memory_order_relaxed); + temp.add(n); + } + } + } + + end = temp.end(); + for (base_node* n=temp.front(); n != end; n = nxt) { + nxt = n->next; + to_wait_node(n)->notify(); + } +#if TBB_USE_ASSERT + temp.clear(); +#endif + } + + //! Notify waiting threads of the event that satisfies the given predicate; + //! the predicate is called under the lock. Relaxed version. + template + void notify_one_relaxed( const P& predicate ) { + if (my_waitset.empty()) { + return; + } + + base_node* tmp = nullptr; + base_node* next{}; + const base_node* end = my_waitset.end(); + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed); + for (base_node* n = my_waitset.last(); n != end; n = next) { + next = n->prev; + auto* node = static_cast*>(n); + if (predicate(node->my_context)) { + my_waitset.remove(*n); + node->my_is_in_list.store(false, std::memory_order_relaxed); + tmp = n; + break; + } + } + } + + if (tmp) { + to_wait_node(tmp)->notify(); + } + } + + //! Abort any sleeping threads at the time of the call + void abort_all() { + atomic_fence_seq_cst(); + abort_all_relaxed(); + } + + //! 
Abort any sleeping threads at the time of the call; Relaxed version + void abort_all_relaxed() { + if (my_waitset.empty()) { + return; + } + + base_list temp; + const base_node* end; + { + concurrent_monitor_mutex::scoped_lock l(my_mutex); + my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + my_waitset.flush_to(temp); + end = temp.end(); + for (base_node* n = temp.front(); n != end; n = n->next) { + to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); + } + } + + base_node* nxt; + for (base_node* n = temp.front(); n != end; n = nxt) { + nxt = n->next; + to_wait_node(n)->my_aborted = true; + to_wait_node(n)->notify(); + } +#if TBB_USE_ASSERT + temp.clear(); +#endif + } + + void destroy() { + this->abort_all(); + my_mutex.destroy(); + __TBB_ASSERT(this->my_waitset.empty(), "waitset not empty?"); + } + +private: + template + bool guarded_call(Pred&& predicate, NodeType& node) { + bool res = false; + tbb::detail::d0::try_call( [&] { + res = std::forward(predicate)(); + }).on_exception( [&] { + cancel_wait(node); + }); + + return res; + } + + concurrent_monitor_mutex my_mutex{}; + base_list my_waitset{}; + std::atomic my_epoch{}; + + wait_node* to_wait_node( base_node* node ) { return static_cast*>(node); } +}; + +class concurrent_monitor : public concurrent_monitor_base { + using base_type = concurrent_monitor_base; +public: + using base_type::base_type; + + ~concurrent_monitor() { + destroy(); + } + + /** per-thread descriptor for concurrent_monitor */ + using thread_context = sleep_node; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_concurrent_monitor_H */ diff --git a/third_party/tbb/concurrent_monitor_mutex.h b/third_party/tbb/concurrent_monitor_mutex.h new file mode 100644 index 000000000..14d6317e7 --- /dev/null +++ b/third_party/tbb/concurrent_monitor_mutex.h @@ -0,0 +1,114 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_monitor_mutex_H +#define __TBB_monitor_mutex_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/semaphore.h" + +#include "third_party/libcxx/mutex" + +namespace tbb { +namespace detail { +namespace r1 { + +class concurrent_monitor_mutex { +public: + using scoped_lock = std::lock_guard; + + constexpr concurrent_monitor_mutex() {} + + ~concurrent_monitor_mutex() = default; + + void destroy() { +#if !__TBB_USE_FUTEX + if (my_init_flag.load(std::memory_order_relaxed)) { + get_semaphore().~semaphore(); + } +#endif + } + + void lock() { + auto wakeup_condition = [&] { + return my_flag.load(std::memory_order_relaxed) == 0; + }; + + while (my_flag.exchange(1)) { + if (!timed_spin_wait_until(wakeup_condition)) { + ++my_waiters; + while (!wakeup_condition()) { + wait(); + } + --my_waiters; + } + } + } + + void unlock() { + my_flag.exchange(0); // full fence, so the next load is relaxed + if (my_waiters.load(std::memory_order_relaxed)) { + wakeup(); + } + } + +private: + void wait() { +#if __TBB_USE_FUTEX + futex_wait(&my_flag, 1); +#else + get_semaphore().P(); +#endif + } + + void wakeup() { +#if __TBB_USE_FUTEX + futex_wakeup_one(&my_flag); +#else + get_semaphore().V(); +#endif + } + + // The flag should be int for the futex operations + std::atomic my_flag{0}; + std::atomic my_waiters{0}; + +#if !__TBB_USE_FUTEX + semaphore& get_semaphore() { + if (!my_init_flag.load(std::memory_order_acquire)) { + std::lock_guard lock(my_init_mutex); + if (!my_init_flag.load(std::memory_order_relaxed)) { + new (my_semaphore.begin()) semaphore(); + my_init_flag.store(true, std::memory_order_release); + } + } + + return *my_semaphore.begin(); + } + + static std::mutex my_init_mutex; + std::atomic my_init_flag{false}; + aligned_space my_semaphore{}; +#endif +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_monitor_mutex_H diff --git a/third_party/tbb/concurrent_priority_queue.h b/third_party/tbb/concurrent_priority_queue.h new file mode 100644 index 000000000..86e915dee --- /dev/null +++ b/third_party/tbb/concurrent_priority_queue.h @@ -0,0 +1,491 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_priority_queue_H +#define __TBB_concurrent_priority_queue_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_aggregator.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/libcxx/vector" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace d1 { + +template , typename Allocator = cache_aligned_allocator> +class concurrent_priority_queue { +public: + using value_type = T; + using reference = T&; + using const_reference = const T&; + + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + + concurrent_priority_queue() : concurrent_priority_queue(allocator_type{}) {} + + explicit concurrent_priority_queue( const allocator_type& alloc ) + : mark(0), my_size(0), my_compare(), data(alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + explicit concurrent_priority_queue( const Compare& compare, const allocator_type& alloc = allocator_type() ) + : mark(0), my_size(0), my_compare(compare), data(alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + explicit concurrent_priority_queue( size_type init_capacity, const allocator_type& alloc = allocator_type() ) + : mark(0), my_size(0), my_compare(), data(alloc) + { + data.reserve(init_capacity); + my_aggregator.initialize_handler(functor{this}); + } + + explicit concurrent_priority_queue( size_type init_capacity, const Compare& compare, const allocator_type& alloc = allocator_type() ) + : mark(0), my_size(0), my_compare(compare), data(alloc) + { + data.reserve(init_capacity); + my_aggregator.initialize_handler(functor{this}); + } + + template + concurrent_priority_queue( InputIterator begin, InputIterator end, const Compare& compare, const allocator_type& alloc = allocator_type() ) + : mark(0), my_compare(compare), data(begin, end, alloc) + { + my_aggregator.initialize_handler(functor{this}); + heapify(); + my_size.store(data.size(), std::memory_order_relaxed); + } + + template + concurrent_priority_queue( InputIterator begin, InputIterator end, const allocator_type& alloc = allocator_type() ) + : concurrent_priority_queue(begin, end, Compare(), alloc) {} + + concurrent_priority_queue( std::initializer_list init, const Compare& compare, const allocator_type& alloc = allocator_type() ) + : concurrent_priority_queue(init.begin(), init.end(), compare, alloc) {} + + concurrent_priority_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ) + : concurrent_priority_queue(init, Compare(), alloc) {} + + concurrent_priority_queue( const concurrent_priority_queue& other ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), + data(other.data) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue( const concurrent_priority_queue& other, const allocator_type& alloc ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), 
my_compare(other.my_compare), + data(other.data, alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue( concurrent_priority_queue&& other ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), + data(std::move(other.data)) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue( concurrent_priority_queue&& other, const allocator_type& alloc ) + : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), + data(std::move(other.data), alloc) + { + my_aggregator.initialize_handler(functor{this}); + } + + concurrent_priority_queue& operator=( const concurrent_priority_queue& other ) { + if (this != &other) { + data = other.data; + mark = other.mark; + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + return *this; + } + + concurrent_priority_queue& operator=( concurrent_priority_queue&& other ) { + if (this != &other) { + // TODO: check if exceptions from std::vector::operator=(vector&&) should be handled separately + data = std::move(other.data); + mark = other.mark; + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + return *this; + } + + concurrent_priority_queue& operator=( std::initializer_list init ) { + assign(init.begin(), init.end()); + return *this; + } + + template + void assign( InputIterator begin, InputIterator end ) { + data.assign(begin, end); + mark = 0; + my_size.store(data.size(), std::memory_order_relaxed); + heapify(); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + /* Returned value may not reflect results of pending operations. + This operation reads shared data and will trigger a race condition. */ + __TBB_nodiscard bool empty() const { return size() == 0; } + + // Returns the current number of elements contained in the queue + /* Returned value may not reflect results of pending operations. + This operation reads shared data and will trigger a race condition. */ + size_type size() const { return my_size.load(std::memory_order_relaxed); } + + /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ + void push( const value_type& value ) { + cpq_operation op_data(value, PUSH_OP); + my_aggregator.execute(&op_data); + if (op_data.status == FAILED) + throw_exception(exception_id::bad_alloc); + } + + /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ + void push( value_type&& value ) { + cpq_operation op_data(value, PUSH_RVALUE_OP); + my_aggregator.execute(&op_data); + if (op_data.status == FAILED) + throw_exception(exception_id::bad_alloc); + } + + /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ + template + void emplace( Args&&... args ) { + // TODO: support uses allocator construction in this place + push(value_type(std::forward(args)...)); + } + + // Gets a reference to and removes highest priority element + /* If a highest priority element was found, sets elem and returns true, + otherwise returns false. + This operation can be safely used concurrently with other push, try_pop or emplace operations. 
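+
+       A minimal usage sketch (illustrative only, not part of the original header;
+       assumes the default std::less comparator):
+
+           tbb::concurrent_priority_queue<int> q;
+           q.push(3); q.push(5); q.push(1);
+           int top;
+           if (q.try_pop(top)) {
+               // top == 5: the largest element has the highest priority
+           }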
*/ + bool try_pop( value_type& value ) { + cpq_operation op_data(value, POP_OP); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // This operation affects the whole container => it is not thread-safe + void clear() { + data.clear(); + mark = 0; + my_size.store(0, std::memory_order_relaxed); + } + + // This operation affects the whole container => it is not thread-safe + void swap( concurrent_priority_queue& other ) { + if (this != &other) { + using std::swap; + swap(data, other.data); + swap(mark, other.mark); + + size_type sz = my_size.load(std::memory_order_relaxed); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(sz, std::memory_order_relaxed); + } + } + + allocator_type get_allocator() const { return data.get_allocator(); } +private: + enum operation_type {INVALID_OP, PUSH_OP, POP_OP, PUSH_RVALUE_OP}; + enum operation_status {WAIT = 0, SUCCEEDED, FAILED}; + + class cpq_operation : public aggregated_operation { + public: + operation_type type; + union { + value_type* elem; + size_type sz; + }; + cpq_operation( const value_type& value, operation_type t ) + : type(t), elem(const_cast(&value)) {} + }; // class cpq_operation + + class functor { + concurrent_priority_queue* my_cpq; + public: + functor() : my_cpq(nullptr) {} + functor( concurrent_priority_queue* cpq ) : my_cpq(cpq) {} + + void operator()(cpq_operation* op_list) { + __TBB_ASSERT(my_cpq != nullptr, "Invalid functor"); + my_cpq->handle_operations(op_list); + } + }; // class functor + + void handle_operations( cpq_operation* op_list ) { + call_itt_notify(acquired, this); + cpq_operation* tmp, *pop_list = nullptr; + __TBB_ASSERT(mark == data.size(), nullptr); + + // First pass processes all constant (amortized; reallocation may happen) time pushes and pops. + while(op_list) { + // ITT note: &(op_list->status) tag is used to cover accesses to op_list + // node. This thread is going to handle the operation, and so will acquire it + // and perform the associated operation w/o triggering a race condition; the + // thread that created the operation is waiting on the status field, so when + // this thread is done with the operation, it will perform a + // store_with_release to give control back to the waiting thread in + // aggregator::insert_operation. 
+ // TODO: enable + call_itt_notify(acquired, &(op_list->status)); + __TBB_ASSERT(op_list->type != INVALID_OP, nullptr); + + tmp = op_list; + op_list = op_list->next.load(std::memory_order_relaxed); + if (tmp->type == POP_OP) { + if (mark < data.size() && + my_compare(data[0], data.back())) + { + // there are newly pushed elems and the last one is higher than top + *(tmp->elem) = std::move(data.back()); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + + data.pop_back(); + __TBB_ASSERT(mark <= data.size(), nullptr); + } else { // no convenient item to pop; postpone + tmp->next.store(pop_list, std::memory_order_relaxed); + pop_list = tmp; + } + } else { // PUSH_OP or PUSH_RVALUE_OP + __TBB_ASSERT(tmp->type == PUSH_OP || tmp->type == PUSH_RVALUE_OP, "Unknown operation"); +#if TBB_USE_EXCEPTIONS + try +#endif + { + if (tmp->type == PUSH_OP) { + push_back_helper(*(tmp->elem)); + } else { + data.push_back(std::move(*(tmp->elem))); + } + my_size.store(my_size.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + } +#if TBB_USE_EXCEPTIONS + catch(...) { + tmp->status.store(uintptr_t(FAILED), std::memory_order_release); + } +#endif + } + } + + // Second pass processes pop operations + while(pop_list) { + tmp = pop_list; + pop_list = pop_list->next.load(std::memory_order_relaxed); + __TBB_ASSERT(tmp->type == POP_OP, nullptr); + if (data.empty()) { + tmp->status.store(uintptr_t(FAILED), std::memory_order_release); + } else { + __TBB_ASSERT(mark <= data.size(), nullptr); + if (mark < data.size() && + my_compare(data[0], data.back())) + { + // there are newly pushed elems and the last one is higher than top + *(tmp->elem) = std::move(data.back()); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + data.pop_back(); + } else { // extract top and push last element down heap + *(tmp->elem) = std::move(data[0]); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); + reheap(); + } + } + } + + // heapify any leftover pushed elements before doing the next + // batch of operations + if (mark < data.size()) heapify(); + __TBB_ASSERT(mark == data.size(), nullptr); + call_itt_notify(releasing, this); + } + + // Merge unsorted elements into heap + void heapify() { + if (!mark && data.size() > 0) mark = 1; + for (; mark < data.size(); ++mark) { + // for each unheapified element under size + size_type cur_pos = mark; + value_type to_place = std::move(data[mark]); + do { // push to_place up the heap + size_type parent = (cur_pos - 1) >> 1; + if (!my_compare(data[parent], to_place)) + break; + data[cur_pos] = std::move(data[parent]); + cur_pos = parent; + } while(cur_pos); + data[cur_pos] = std::move(to_place); + } + } + + // Re-heapify after an extraction + // Re-heapify by pushing last element down the heap from the root. 
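+    // The sift-down below stays within the heapified prefix [0, mark); the hole left
+    // at the root is filled with the vector's last element before it is popped.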
+ void reheap() { + size_type cur_pos = 0, child = 1; + + while(child < mark) { + size_type target = child; + if (child + 1 < mark && my_compare(data[child], data[child + 1])) + ++target; + // target now has the higher priority child + if (my_compare(data[target], data.back())) + break; + data[cur_pos] = std::move(data[target]); + cur_pos = target; + child = (cur_pos << 1) + 1; + } + if (cur_pos != data.size() - 1) + data[cur_pos] = std::move(data.back()); + data.pop_back(); + if (mark > data.size()) mark = data.size(); + } + + void push_back_helper( const T& value ) { + push_back_helper_impl(value, std::is_copy_constructible{}); + } + + void push_back_helper_impl( const T& value, /*is_copy_constructible = */std::true_type ) { + data.push_back(value); + } + + void push_back_helper_impl( const T&, /*is_copy_constructible = */std::false_type ) { + __TBB_ASSERT(false, "error: calling tbb::concurrent_priority_queue.push(const value_type&) for move-only type"); + } + + using aggregator_type = aggregator; + + aggregator_type my_aggregator; + // Padding added to avoid false sharing + char padding1[max_nfs_size - sizeof(aggregator_type)]; + // The point at which unsorted elements begin + size_type mark; + std::atomic my_size; + Compare my_compare; + + // Padding added to avoid false sharing + char padding2[max_nfs_size - (2*sizeof(size_type)) - sizeof(Compare)]; + //! Storage for the heap of elements in queue, plus unheapified elements + /** data has the following structure: + + binary unheapified + heap elements + ____|_______|____ + | | | + v v v + [_|...|_|_|...|_| |...| ] + 0 ^ ^ ^ + | | |__capacity + | |__my_size + |__mark + + Thus, data stores the binary heap starting at position 0 through + mark-1 (it may be empty). Then there are 0 or more elements + that have not yet been inserted into the heap, in positions + mark through my_size-1. 
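+        my_size mirrors data.size() in a separate atomic so that size() and empty()
+        can be read without entering the aggregator.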
*/ + + using vector_type = std::vector; + vector_type data; + + friend bool operator==( const concurrent_priority_queue& lhs, + const concurrent_priority_queue& rhs ) + { + return lhs.data == rhs.data; + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_priority_queue& lhs, + const concurrent_priority_queue& rhs ) + { + return !(lhs == rhs); + } +#endif +}; // class concurrent_priority_queue + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename Alloc = tbb::cache_aligned_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_priority_queue( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_priority_queue, Comp, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_priority_queue( It, It, Alloc ) +-> concurrent_priority_queue, std::less>, Alloc>; + +template , + typename Alloc = tbb::cache_aligned_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_priority_queue( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_priority_queue; + +template >> +concurrent_priority_queue( std::initializer_list, Alloc ) +-> concurrent_priority_queue, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_priority_queue& lhs, + concurrent_priority_queue& rhs ) +{ + lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail +inline namespace v1 { +using detail::d1::concurrent_priority_queue; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_priority_queue_H diff --git a/third_party/tbb/concurrent_queue.h b/third_party/tbb/concurrent_queue.h new file mode 100644 index 000000000..2cceab80f --- /dev/null +++ b/third_party/tbb/concurrent_queue.h @@ -0,0 +1,701 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_queue_H +#define __TBB_concurrent_queue_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_queue_base.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +namespace tbb { +namespace detail { +namespace d2 { + +template +std::pair internal_try_pop_impl(void* dst, QueueRep& queue, Allocator& alloc ) { + ticket_type ticket{}; + do { + // Basically, we need to read `head_counter` before `tail_counter`. To achieve it we build happens-before on `head_counter` + ticket = queue.head_counter.load(std::memory_order_acquire); + do { + if (static_cast(queue.tail_counter.load(std::memory_order_relaxed) - ticket) <= 0) { // queue is empty + // Queue is empty + return { false, ticket }; + } + // Queue had item with ticket k when we looked. Attempt to get that item. + // Another thread snatched the item, retry. 
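+            // On CAS failure `ticket` is reloaded with the current head_counter and
+            // the loop re-checks whether the queue became empty.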
+ } while (!queue.head_counter.compare_exchange_strong(ticket, ticket + 1)); + } while (!queue.choose(ticket).pop(dst, ticket, queue, alloc)); + return { true, ticket }; +} + +// A high-performance thread-safe non-blocking concurrent queue. +// Multiple threads may each push and pop concurrently. +// Assignment construction is not allowed. +template > +class concurrent_queue { + using allocator_traits_type = tbb::detail::allocator_traits; + using queue_representation_type = concurrent_queue_rep; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + using queue_allocator_traits = tbb::detail::allocator_traits; +public: + using size_type = std::size_t; + using value_type = T; + using reference = T&; + using const_reference = const T&; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using iterator = concurrent_queue_iterator; + using const_iterator = concurrent_queue_iterator; + + concurrent_queue() : concurrent_queue(allocator_type()) {} + + explicit concurrent_queue(const allocator_type& a) : + my_allocator(a), my_queue_representation(nullptr) + { + my_queue_representation = static_cast(r1::cache_aligned_allocate(sizeof(queue_representation_type))); + queue_allocator_traits::construct(my_allocator, my_queue_representation); + + __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" ); + } + + template + concurrent_queue(InputIterator begin, InputIterator end, const allocator_type& a = allocator_type()) : + concurrent_queue(a) + { + for (; begin != end; ++begin) + push(*begin); + } + + concurrent_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ) : + concurrent_queue(init.begin(), init.end(), alloc) + {} + + concurrent_queue(const concurrent_queue& src, const allocator_type& a) : + concurrent_queue(a) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + concurrent_queue(const concurrent_queue& src) : + concurrent_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator())) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + // Move constructors + concurrent_queue(concurrent_queue&& src) : + concurrent_queue(std::move(src.my_allocator)) + { + internal_swap(src); + } + + concurrent_queue(concurrent_queue&& src, const allocator_type& a) : + concurrent_queue(a) + { + // checking that memory allocated by one instance of allocator can be deallocated + // with another + if (my_allocator == src.my_allocator) { + internal_swap(src); + } else { + // allocators are different => performing per-element move + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + src.clear(); + } + } + + // Destroy queue + ~concurrent_queue() { + clear(); + my_queue_representation->clear(my_allocator); + queue_allocator_traits::destroy(my_allocator, my_queue_representation); + r1::cache_aligned_deallocate(my_queue_representation); + } + + concurrent_queue& operator=( const 
concurrent_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + my_allocator = other.my_allocator; + my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); + } + return *this; + } + + concurrent_queue& operator=( concurrent_queue&& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + if (my_allocator == other.my_allocator) { + internal_swap(other); + } else { + my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); + other.clear(); + my_allocator = std::move(other.my_allocator); + } + } + return *this; + } + + concurrent_queue& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + template + void assign( InputIterator first, InputIterator last ) { + concurrent_queue src(first, last); + clear(); + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + void swap ( concurrent_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_swap + __TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators"); + internal_swap(other); + } + + // Enqueue an item at tail of queue. + void push(const T& value) { + internal_push(value); + } + + void push(T&& value) { + internal_push(std::move(value)); + } + + template + void emplace( Args&&... args ) { + internal_push(std::forward(args)...); + } + + // Attempt to dequeue an item from head of queue. + /** Does not wait for item to become available. + Returns true if successful; false otherwise. */ + bool try_pop( T& result ) { + return internal_try_pop(&result); + } + + // Return the number of items in the queue; thread unsafe + size_type unsafe_size() const { + std::ptrdiff_t size = my_queue_representation->size(); + return size < 0 ? 0 : size_type(size); + } + + // Equivalent to size()==0. + __TBB_nodiscard bool empty() const { + return my_queue_representation->empty(); + } + + // Clear the queue. not thread-safe. + void clear() { + my_queue_representation->clear(my_allocator); + } + + // Return allocator object + allocator_type get_allocator() const { return my_allocator; } + + //------------------------------------------------------------------------ + // The iterators are intended only for debugging. They are slow and not thread safe. + //------------------------------------------------------------------------ + + iterator unsafe_begin() { return concurrent_queue_iterator_provider::get(*this); } + iterator unsafe_end() { return iterator(); } + const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_end() const { return const_iterator(); } + const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_cend() const { return const_iterator(); } + +private: + void internal_swap(concurrent_queue& src) { + using std::swap; + swap(my_queue_representation, src.my_queue_representation); + } + + template + void internal_push( Args&&... 
args ) { + ticket_type k = my_queue_representation->tail_counter++; + my_queue_representation->choose(k).push(k, *my_queue_representation, my_allocator, std::forward(args)...); + } + + bool internal_try_pop( void* dst ) { + return internal_try_pop_impl(dst, *my_queue_representation, my_allocator).first; + } + + template + friend class concurrent_queue_iterator; + + static void copy_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for copy construction + new (location) value_type(*static_cast(src)); + // queue_allocator_traits::construct(my_allocator, location, *static_cast(src)); + } + + static void move_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for move construction + new (location) value_type(std::move(*static_cast(const_cast(src)))); + } + + queue_allocator_type my_allocator; + queue_representation_type* my_queue_representation; + + friend void swap( concurrent_queue& lhs, concurrent_queue& rhs ) { + lhs.swap(rhs); + } + + friend bool operator==( const concurrent_queue& lhs, const concurrent_queue& rhs ) { + return lhs.unsafe_size() == rhs.unsafe_size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_queue& lhs, const concurrent_queue& rhs ) { + return !(lhs == rhs); + } +#endif // __TBB_CPP20_COMPARISONS_PRESENT +}; // class concurrent_queue + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// Deduction guide for the constructor from two iterators +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_queue( It, It, Alloc = Alloc() ) +-> concurrent_queue, Alloc>; + +#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ + +class concurrent_monitor; + +// The concurrent monitor tags for concurrent_bounded_queue. +static constexpr std::size_t cbq_slots_avail_tag = 0; +static constexpr std::size_t cbq_items_avail_tag = 1; +} // namespace d2 + + +namespace r1 { + class concurrent_monitor; + + TBB_EXPORT std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size ); + TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size ); + TBB_EXPORT void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ); + TBB_EXPORT void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag + , std::size_t ticket ); + TBB_EXPORT void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag, + std::ptrdiff_t target, d1::delegate_base& predicate ); +} // namespace r1 + + +namespace d2 { +// A high-performance thread-safe blocking concurrent bounded queue. +// Supports boundedness and blocking semantics. +// Multiple threads may each push and pop concurrently. +// Assignment construction is not allowed. 
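+//
+// Illustrative usage sketch (not part of the upstream header): a minimal example
+// of how the two queue types defined in this file are typically driven, assuming
+// the include layout used by this patch.
+//
+//   #include "third_party/tbb/concurrent_queue.h"
+//
+//   void queue_usage_sketch() {
+//       tbb::concurrent_queue<int> q;            // unbounded, non-blocking
+//       q.push(1);
+//       int v;
+//       if (q.try_pop(v)) { /* v == 1 */ }       // returns false when the queue is empty
+//
+//       tbb::concurrent_bounded_queue<int> bq;   // bounded, blocking
+//       bq.set_capacity(4);                      // push() waits once 4 items are queued
+//       bq.push(42);
+//       int out;
+//       bq.pop(out);                             // waits until an item is available
+//       if (!bq.try_push(7)) { /* queue was full */ }
+//       bq.abort();                              // blocked push()/pop() throw user_abort
+//   }
+//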
+template > +class concurrent_bounded_queue { + using allocator_traits_type = tbb::detail::allocator_traits; + using queue_representation_type = concurrent_queue_rep; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + using queue_allocator_traits = tbb::detail::allocator_traits; + + template + void internal_wait(r1::concurrent_monitor* monitors, std::size_t monitor_tag, std::ptrdiff_t target, FuncType pred) { + d1::delegated_function func(pred); + r1::wait_bounded_queue_monitor(monitors, monitor_tag, target, func); + } +public: + using size_type = std::ptrdiff_t; + using value_type = T; + using reference = T&; + using const_reference = const T&; + using difference_type = std::ptrdiff_t; + + using allocator_type = Allocator; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using iterator = concurrent_queue_iterator; + using const_iterator = concurrent_queue_iterator ; + + concurrent_bounded_queue() : concurrent_bounded_queue(allocator_type()) {} + + explicit concurrent_bounded_queue( const allocator_type& a ) : + my_allocator(a), my_capacity(0), my_abort_counter(0), my_queue_representation(nullptr) + { + my_queue_representation = reinterpret_cast( + r1::allocate_bounded_queue_rep(sizeof(queue_representation_type))); + my_monitors = reinterpret_cast(my_queue_representation + 1); + queue_allocator_traits::construct(my_allocator, my_queue_representation); + my_capacity = std::size_t(-1) / (queue_representation_type::item_size > 1 ? queue_representation_type::item_size : 2); + + __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" ); + __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" ); + } + + template + concurrent_bounded_queue( InputIterator begin, InputIterator end, const allocator_type& a = allocator_type() ) : + concurrent_bounded_queue(a) + { + for (; begin != end; ++begin) + push(*begin); + } + + concurrent_bounded_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ): + concurrent_bounded_queue(init.begin(), init.end(), alloc) + {} + + concurrent_bounded_queue( const concurrent_bounded_queue& src, const allocator_type& a ) : + concurrent_bounded_queue(a) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + concurrent_bounded_queue( const concurrent_bounded_queue& src ) : + concurrent_bounded_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator())) + { + my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); + } + + // Move constructors + concurrent_bounded_queue( concurrent_bounded_queue&& src ) : + concurrent_bounded_queue(std::move(src.my_allocator)) + { + internal_swap(src); + } + + concurrent_bounded_queue( concurrent_bounded_queue&& src, const allocator_type& a ) : + concurrent_bounded_queue(a) + { + // checking that memory allocated by one instance of allocator can be deallocated + // with another + if (my_allocator == src.my_allocator) { + internal_swap(src); + } else { + // allocators are different => performing per-element move + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); 
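+            // The moved-from queue is then cleared explicitly: its elements were
+            // moved out one by one, so it is not left holding moved-from items.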
+ src.clear(); + } + } + + // Destroy queue + ~concurrent_bounded_queue() { + clear(); + my_queue_representation->clear(my_allocator); + queue_allocator_traits::destroy(my_allocator, my_queue_representation); + r1::deallocate_bounded_queue_rep(reinterpret_cast(my_queue_representation), + sizeof(queue_representation_type)); + } + + concurrent_bounded_queue& operator=( const concurrent_bounded_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + my_allocator = other.my_allocator; + my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); + } + return *this; + } + + concurrent_bounded_queue& operator=( concurrent_bounded_queue&& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + if (my_allocator == other.my_allocator) { + internal_swap(other); + } else { + my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); + other.clear(); + my_allocator = std::move(other.my_allocator); + } + } + return *this; + } + + concurrent_bounded_queue& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + template + void assign( InputIterator first, InputIterator last ) { + concurrent_bounded_queue src(first, last); + clear(); + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + void swap ( concurrent_bounded_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_swap + __TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators"); + internal_swap(other); + } + + // Enqueue an item at tail of queue. + void push( const T& value ) { + internal_push(value); + } + + void push( T&& value ) { + internal_push(std::move(value)); + } + + // Enqueue an item at tail of queue if queue is not already full. + // Does not wait for queue to become not full. + // Returns true if item is pushed; false if queue was already full. + bool try_push( const T& value ) { + return internal_push_if_not_full(value); + } + + bool try_push( T&& value ) { + return internal_push_if_not_full(std::move(value)); + } + + template + void emplace( Args&&... args ) { + internal_push(std::forward(args)...); + } + + template + bool try_emplace( Args&&... args ) { + return internal_push_if_not_full(std::forward(args)...); + } + + // Attempt to dequeue an item from head of queue. + void pop( T& result ) { + internal_pop(&result); + } + + /** Does not wait for item to become available. + Returns true if successful; false otherwise. */ + bool try_pop( T& result ) { + return internal_pop_if_present(&result); + } + + void abort() { + internal_abort(); + } + + // Return the number of items in the queue; thread unsafe + std::ptrdiff_t size() const { + return my_queue_representation->size(); + } + + void set_capacity( size_type new_capacity ) { + std::ptrdiff_t c = new_capacity < 0 ? infinite_capacity : new_capacity; + my_capacity = c; + } + + size_type capacity() const { + return my_capacity; + } + + // Equivalent to size()==0. + __TBB_nodiscard bool empty() const { + return my_queue_representation->empty(); + } + + // Clear the queue. not thread-safe. 
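+    // (All concurrent pushes and pops must have completed before clear() is
+    // called; the representation is torn down without any synchronization.)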
+ void clear() { + my_queue_representation->clear(my_allocator); + } + + // Return allocator object + allocator_type get_allocator() const { return my_allocator; } + + //------------------------------------------------------------------------ + // The iterators are intended only for debugging. They are slow and not thread safe. + //------------------------------------------------------------------------ + + iterator unsafe_begin() { return concurrent_queue_iterator_provider::get(*this); } + iterator unsafe_end() { return iterator(); } + const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_end() const { return const_iterator(); } + const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get(*this); } + const_iterator unsafe_cend() const { return const_iterator(); } + +private: + void internal_swap( concurrent_bounded_queue& src ) { + std::swap(my_queue_representation, src.my_queue_representation); + std::swap(my_monitors, src.my_monitors); + } + + static constexpr std::ptrdiff_t infinite_capacity = std::ptrdiff_t(~size_type(0) / 2); + + template + void internal_push( Args&&... args ) { + unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed); + ticket_type ticket = my_queue_representation->tail_counter++; + std::ptrdiff_t target = ticket - my_capacity; + + if (static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target) { // queue is full + auto pred = [&] { + if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) { + throw_exception(exception_id::user_abort); + } + + return static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target; + }; + + try_call( [&] { + internal_wait(my_monitors, cbq_slots_avail_tag, target, pred); + }).on_exception( [&] { + my_queue_representation->choose(ticket).abort_push(ticket, *my_queue_representation, my_allocator); + }); + + } + __TBB_ASSERT((static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) > target), nullptr); + my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward(args)...); + r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket); + } + + template + bool internal_push_if_not_full( Args&&... args ) { + ticket_type ticket = my_queue_representation->tail_counter.load(std::memory_order_relaxed); + do { + if (static_cast(ticket - my_queue_representation->head_counter.load(std::memory_order_relaxed)) >= my_capacity) { + // Queue is full + return false; + } + // Queue had empty slot with ticket k when we looked. Attempt to claim that slot. + // Another thread claimed the slot, so retry. 
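+            // On CAS failure, compare_exchange_strong reloads `ticket` with the tail
+            // value installed by another thread, so the capacity check above is retried
+            // against fresh state; on success this thread owns slot `ticket`.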
+ } while (!my_queue_representation->tail_counter.compare_exchange_strong(ticket, ticket + 1)); + + my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward(args)...); + r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket); + return true; + } + + void internal_pop( void* dst ) { + std::ptrdiff_t target; + // This loop is a single pop operation; abort_counter should not be re-read inside + unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed); + + do { + target = my_queue_representation->head_counter++; + if (static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target) { + auto pred = [&] { + if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) { + throw_exception(exception_id::user_abort); + } + + return static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target; + }; + + try_call( [&] { + internal_wait(my_monitors, cbq_items_avail_tag, target, pred); + }).on_exception( [&] { + my_queue_representation->head_counter--; + }); + } + __TBB_ASSERT(static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) > target, nullptr); + } while (!my_queue_representation->choose(target).pop(dst, target, *my_queue_representation, my_allocator)); + + r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, target); + } + + bool internal_pop_if_present( void* dst ) { + bool present{}; + ticket_type ticket{}; + std::tie(present, ticket) = internal_try_pop_impl(dst, *my_queue_representation, my_allocator); + + if (present) { + r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, ticket); + } + return present; + } + + void internal_abort() { + ++my_abort_counter; + r1::abort_bounded_queue_monitors(my_monitors); + } + + static void copy_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for copy construction + new (location) value_type(*static_cast(src)); + } + + static void move_construct_item(T* location, const void* src) { + // TODO: use allocator_traits for move construction + new (location) value_type(std::move(*static_cast(const_cast(src)))); + } + + template + friend class concurrent_queue_iterator; + + queue_allocator_type my_allocator; + std::ptrdiff_t my_capacity; + std::atomic my_abort_counter; + queue_representation_type* my_queue_representation; + + r1::concurrent_monitor* my_monitors; + + friend void swap( concurrent_bounded_queue& lhs, concurrent_bounded_queue& rhs ) { + lhs.swap(rhs); + } + + friend bool operator==( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { + return lhs.size() == rhs.size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { + return !(lhs == rhs); + } +#endif // __TBB_CPP20_COMPARISONS_PRESENT +}; // class concurrent_bounded_queue + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// Deduction guide for the constructor from two iterators +template >> +concurrent_bounded_queue( It, It, Alloc = Alloc() ) +-> concurrent_bounded_queue, Alloc>; + +#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ + +} //namespace d2 +} // namesapce detail + +inline namespace v1 { + +using detail::d2::concurrent_queue; +using detail::d2::concurrent_bounded_queue; +using detail::r1::user_abort; +using detail::r1::bad_last_alloc; + +} // inline namespace 
v1 +} // namespace tbb + +#endif // __TBB_concurrent_queue_H diff --git a/third_party/tbb/concurrent_set.h b/third_party/tbb/concurrent_set.h new file mode 100644 index 000000000..f1c8babdd --- /dev/null +++ b/third_party/tbb/concurrent_set.h @@ -0,0 +1,268 @@ +// clang-format off +/* + Copyright (c) 2019-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_set_H +#define __TBB_concurrent_set_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_skip_list.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/utility" + +namespace tbb { +namespace detail { +namespace d2 { + +template +struct set_traits { + static constexpr std::size_t max_level = RandomGenerator::max_level; + using random_level_generator_type = RandomGenerator; + using key_type = Key; + using value_type = key_type; + using compare_type = KeyCompare; + using value_compare = compare_type; + using reference = value_type&; + using const_reference = const value_type&; + using allocator_type = Allocator; + + static constexpr bool allow_multimapping = AllowMultimapping; + + static const key_type& get_key(const_reference val) { + return val; + } + + static value_compare value_comp(compare_type comp) { return comp; } +}; // struct set_traits + +template +class concurrent_multiset; + +template , typename Allocator = tbb::tbb_allocator> +class concurrent_set : public concurrent_skip_list, Allocator, false>> { + using base_type = concurrent_skip_list, Allocator, false>>; +public: + using key_type = Key; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base_type + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_set() = default; + concurrent_set( const concurrent_set& ) = default; + concurrent_set( const concurrent_set& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_set( concurrent_set&& ) = default; + concurrent_set( concurrent_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_set& operator=( const concurrent_set& ) = default; + concurrent_set& operator=( concurrent_set&& ) = default; + + concurrent_set& operator=( std::initializer_list il ) { + base_type::operator= (il); + return 
*this; + } + + template + void merge(concurrent_set& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_set&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_multiset& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multiset&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_set + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_set( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_set, Comp, Alloc>; + +template , + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_set( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_set; + +template >, + typename = std::enable_if_t>> +concurrent_set( It, It, Alloc ) +-> concurrent_set, + std::less>, Alloc>; + +template >> +concurrent_set( std::initializer_list, Alloc ) +-> concurrent_set, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_set& lhs, + concurrent_set& rhs ) +{ + lhs.swap(rhs); +} + +template , typename Allocator = tbb::tbb_allocator> +class concurrent_multiset : public concurrent_skip_list, Allocator, true>> { + using base_type = concurrent_skip_list, Allocator, true>>; +public: + using key_type = Key; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using key_compare = Compare; + using value_compare = typename base_type::value_compare; + using allocator_type = Allocator; + + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + using node_type = typename base_type::node_type; + + // Include constructors of base_type; + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_multiset() = default; + concurrent_multiset( const concurrent_multiset& ) = default; + concurrent_multiset( const concurrent_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_multiset( concurrent_multiset&& ) = default; + concurrent_multiset( concurrent_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_multiset& operator=( const concurrent_multiset& ) = default; + concurrent_multiset& operator=( concurrent_multiset&& ) = default; + + concurrent_multiset& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + void merge(concurrent_set& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_set&& source) { + this->internal_merge(std::move(source)); + } + + template + void merge(concurrent_multiset& source) { + this->internal_merge(source); + } + + template + void merge(concurrent_multiset&& source) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_multiset + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = 
std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multiset( It, It, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multiset, Comp, Alloc>; + +template , + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_multiset( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) +-> concurrent_multiset; + +template >, + typename = std::enable_if_t>> +concurrent_multiset( It, It, Alloc ) +-> concurrent_multiset, std::less>, Alloc>; + +template >> +concurrent_multiset( std::initializer_list, Alloc ) +-> concurrent_multiset, Alloc>; + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_multiset& lhs, + concurrent_multiset& rhs ) +{ + lhs.swap(rhs); +} + +} // namespace d2 +} // namespace detail + +inline namespace v1 { + +using detail::d2::concurrent_set; +using detail::d2::concurrent_multiset; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_set_H diff --git a/third_party/tbb/concurrent_unordered_map.h b/third_party/tbb/concurrent_unordered_map.h new file mode 100644 index 000000000..d9fce65d6 --- /dev/null +++ b/third_party/tbb/concurrent_unordered_map.h @@ -0,0 +1,415 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_unordered_map_H +#define __TBB_concurrent_unordered_map_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_unordered_base.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/libcxx/functional" + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct concurrent_unordered_map_traits { + using value_type = std::pair; + using key_type = Key; + using allocator_type = Allocator; + using hash_compare_type = hash_compare; + static constexpr bool allow_multimapping = AllowMultimapping; + + static constexpr const key_type& get_key( const value_type& value ) { + return value.first; + } +}; // struct concurrent_unordered_map_traits + +template +class concurrent_unordered_multimap; + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> > +class concurrent_unordered_map + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_map_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using mapped_type = T; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include constructors of base type + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_unordered_map() = default; + concurrent_unordered_map( const concurrent_unordered_map& ) = default; + concurrent_unordered_map( const concurrent_unordered_map& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_map( concurrent_unordered_map&& ) = default; + concurrent_unordered_map( concurrent_unordered_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_map& operator=( const concurrent_unordered_map& ) = default; + concurrent_unordered_map& operator=( concurrent_unordered_map&& ) = default; + + concurrent_unordered_map& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + // Observers + mapped_type& operator[]( const key_type& key ) { + iterator where = this->find(key); + + if (where == this->end()) { + where = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first; + } + return where->second; + } + + mapped_type& operator[]( key_type&& key ) { + iterator where = this->find(key); + + if (where == this->end()) { + where = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first; + } + return where->second; + } + + mapped_type& at( const key_type& key ) { + iterator where = this->find(key); + + if (where == this->end()) { + 
throw_exception(exception_id::invalid_key); + } + return where->second; + } + + const mapped_type& at( const key_type& key ) const { + const_iterator where = this->find(key); + + if (where == this->end()) { + throw_exception(exception_id::out_of_range); + } + return where->second; + } + + using base_type::insert; + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) { + return this->emplace(std::forward
<P>
(value)); + } + + template + typename std::enable_if::value, + iterator>::type insert( const_iterator hint, P&& value ) { + return this->emplace_hint(hint, std::forward
<P>
(value)); + } + + template + void merge( concurrent_unordered_map& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_map&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multimap& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multimap&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_map + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( It, It, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_map, iterator_mapped_t, Hash, KeyEq, Alloc>; + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( std::initializer_list>, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_map, T, Hash, KeyEq, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_map( It, It, std::size_t, Alloc ) +-> concurrent_unordered_map, iterator_mapped_t, + std::hash>, + std::equal_to>, Alloc>; + +// TODO: investigate if a deduction guide for concurrent_unordered_map(It, It, Alloc) is needed + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_map, iterator_mapped_t, + Hash, std::equal_to>, Alloc>; + +template >> +concurrent_unordered_map( std::initializer_list>, std::size_t, Alloc ) +-> concurrent_unordered_map, T, std::hash>, + std::equal_to>, Alloc>; + +template >> +concurrent_unordered_map( std::initializer_list>, Alloc ) +-> concurrent_unordered_map, T, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_map( std::initializer_list>, std::size_t, Hash, Alloc ) +-> concurrent_unordered_map, T, Hash, + std::equal_to>, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. 
+// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_map( concurrent_unordered_map, Alloc ) +-> concurrent_unordered_map; +#endif + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_map& lhs, + concurrent_unordered_map& rhs ) { + lhs.swap(rhs); +} + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> > +class concurrent_unordered_multimap + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_map_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using mapped_type = T; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include constructors of base type + using base_type::base_type; + using base_type::insert; + + // Required for implicit deduction guides + concurrent_unordered_multimap() = default; + concurrent_unordered_multimap( const concurrent_unordered_multimap& ) = default; + concurrent_unordered_multimap( const concurrent_unordered_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_multimap( concurrent_unordered_multimap&& ) = default; + concurrent_unordered_multimap( concurrent_unordered_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_multimap& operator=( const concurrent_unordered_multimap& ) = default; + concurrent_unordered_multimap& operator=( concurrent_unordered_multimap&& ) = default; + + concurrent_unordered_multimap& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + typename std::enable_if::value, + std::pair>::type insert( P&& value ) { + return this->emplace(std::forward
<P>
(value)); + } + + template + typename std::enable_if::value, + iterator>::type insert( const_iterator hint, P&& value ) { + return this->emplace_hint(hint, std::forward(value)); + } + + template + void merge( concurrent_unordered_map& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_map&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multimap& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multimap&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_multimap + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multimap, iterator_mapped_t, Hash, KeyEq, Alloc>; + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( std::initializer_list>, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multimap, T, Hash, KeyEq, Alloc>; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_multimap( It, It, std::size_t, Alloc ) +-> concurrent_unordered_multimap, iterator_mapped_t, + std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multimap, iterator_mapped_t, Hash, + std::equal_to>, Alloc>; + +template >> +concurrent_unordered_multimap( std::initializer_list>, std::size_t, Alloc ) +-> concurrent_unordered_multimap, T, std::hash>, + std::equal_to>, Alloc>; + +template >> +concurrent_unordered_multimap( std::initializer_list>, Alloc ) +-> concurrent_unordered_multimap, T, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multimap( std::initializer_list>, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multimap, T, Hash, + std::equal_to>, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. 
+// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_multimap( concurrent_unordered_multimap, Alloc ) +-> concurrent_unordered_multimap; +#endif +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_multimap& lhs, + concurrent_unordered_multimap& rhs ) { + lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + +using detail::d1::concurrent_unordered_map; +using detail::d1::concurrent_unordered_multimap; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_unordered_map_H diff --git a/third_party/tbb/concurrent_unordered_set.h b/third_party/tbb/concurrent_unordered_set.h new file mode 100644 index 000000000..ff53ac024 --- /dev/null +++ b/third_party/tbb/concurrent_unordered_set.h @@ -0,0 +1,334 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_concurrent_unordered_set_H +#define __TBB_concurrent_unordered_set_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_concurrent_unordered_base.h" +#include "third_party/tbb/tbb_allocator.h" + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct concurrent_unordered_set_traits { + using key_type = Key; + using value_type = key_type; + using allocator_type = Allocator; + using hash_compare_type = hash_compare; + static constexpr bool allow_multimapping = AllowMultimapping; + + static constexpr const key_type& get_key( const value_type& value ) { + return value; + } +}; // class concurrent_unordered_set_traits + +template +class concurrent_unordered_multiset; + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> +class concurrent_unordered_set + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_set_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include 
constructors of base_type; + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_unordered_set() = default; + concurrent_unordered_set( const concurrent_unordered_set& ) = default; + concurrent_unordered_set( const concurrent_unordered_set& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_set( concurrent_unordered_set&& ) = default; + concurrent_unordered_set( concurrent_unordered_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_set& operator=( const concurrent_unordered_set& ) = default; + concurrent_unordered_set& operator=( concurrent_unordered_set&& ) = default; + + concurrent_unordered_set& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + void merge( concurrent_unordered_set& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_set&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multiset& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multiset&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_set + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_set, Hash, KeyEq, Alloc>; + +template , + typename KeyEq = std::equal_to, + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( std::initializer_list, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_set; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_set( It, It, std::size_t, Alloc ) +-> concurrent_unordered_set, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_set, Hash, std::equal_to>, Alloc>; + +template >> +concurrent_unordered_set( std::initializer_list, std::size_t, Alloc ) +-> concurrent_unordered_set, std::equal_to, Alloc>; + +template >> +concurrent_unordered_set( std::initializer_list, Alloc ) +-> concurrent_unordered_set, std::equal_to, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_set( std::initializer_list, std::size_t, Hash, Alloc ) +-> concurrent_unordered_set, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. 
+// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_set( concurrent_unordered_set, Alloc ) +-> concurrent_unordered_set; +#endif +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_set& lhs, + concurrent_unordered_set& rhs ) { + lhs.swap(rhs); +} + +template , typename KeyEqual = std::equal_to, + typename Allocator = tbb::tbb_allocator> +class concurrent_unordered_multiset + : public concurrent_unordered_base> +{ + using traits_type = concurrent_unordered_set_traits; + using base_type = concurrent_unordered_base; +public: + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + using difference_type = typename base_type::difference_type; + using hasher = typename base_type::hasher; + using key_equal = typename base_type::key_equal; + using allocator_type = typename base_type::allocator_type; + using reference = typename base_type::reference; + using const_reference = typename base_type::const_reference; + using pointer = typename base_type::pointer; + using const_pointer = typename base_type::const_pointer; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + using local_iterator = typename base_type::local_iterator; + using const_local_iterator = typename base_type::const_local_iterator; + using node_type = typename base_type::node_type; + + // Include constructors of base_type; + using base_type::base_type; + + // Required for implicit deduction guides + concurrent_unordered_multiset() = default; + concurrent_unordered_multiset( const concurrent_unordered_multiset& ) = default; + concurrent_unordered_multiset( const concurrent_unordered_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {} + concurrent_unordered_multiset( concurrent_unordered_multiset&& ) = default; + concurrent_unordered_multiset( concurrent_unordered_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} + // Required to respect the rule of 5 + concurrent_unordered_multiset& operator=( const concurrent_unordered_multiset& ) = default; + concurrent_unordered_multiset& operator=( concurrent_unordered_multiset&& ) = default; + + concurrent_unordered_multiset& operator=( std::initializer_list il ) { + base_type::operator= (il); + return *this; + } + + template + void merge( concurrent_unordered_set& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_set&& source ) { + this->internal_merge(std::move(source)); + } + + template + void merge( concurrent_unordered_multiset& source ) { + this->internal_merge(source); + } + + template + void merge( concurrent_unordered_multiset&& source ) { + this->internal_merge(std::move(source)); + } +}; // class concurrent_unordered_multiset + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template >, + typename KeyEq = std::equal_to>, + typename Alloc = tbb::tbb_allocator>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multiset, Hash, KeyEq, Alloc>; + +template , + typename KeyEq = 
std::equal_to, + typename Alloc = tbb::tbb_allocator, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( std::initializer_list, std::size_t = {}, + Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) +-> concurrent_unordered_multiset; + +template >, + typename = std::enable_if_t>> +concurrent_unordered_multiset( It, It, std::size_t, Alloc ) +-> concurrent_unordered_multiset, std::hash>, + std::equal_to>, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( It, It, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multiset, Hash, std::equal_to>, Alloc>; + +template >> +concurrent_unordered_multiset( std::initializer_list, std::size_t, Alloc ) +-> concurrent_unordered_multiset, std::equal_to, Alloc>; + +template >> +concurrent_unordered_multiset( std::initializer_list, Alloc ) +-> concurrent_unordered_multiset, std::equal_to, Alloc>; + +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_unordered_multiset( std::initializer_list, std::size_t, Hash, Alloc ) +-> concurrent_unordered_multiset, Alloc>; + +#if __APPLE__ && __TBB_CLANG_VERSION == 100000 +// An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 +// due to an issue with generating an implicit deduction guide for these constructors under several strange surcumstances. +// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides +// The issue reproduces only on this version of the compiler +template +concurrent_unordered_multiset( concurrent_unordered_multiset, Alloc ) +-> concurrent_unordered_multiset; +#endif +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +template +void swap( concurrent_unordered_multiset& lhs, + concurrent_unordered_multiset& rhs ) { + lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + +using detail::d1::concurrent_unordered_set; +using detail::d1::concurrent_unordered_multiset; +using detail::split; + +} // inline namespace v1 +} // namespace tbb + +#endif // __TBB_concurrent_unordered_set_H diff --git a/third_party/tbb/concurrent_vector.h b/third_party/tbb/concurrent_vector.h new file mode 100644 index 000000000..012dbe931 --- /dev/null +++ b/third_party/tbb/concurrent_vector.h @@ -0,0 +1,1130 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_concurrent_vector_H +#define __TBB_concurrent_vector_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_segment_table.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/utility" // std::move_if_noexcept +#include "third_party/libcxx/algorithm" +#if __TBB_CPP20_COMPARISONS_PRESENT +#include "third_party/libcxx/compare" +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template +class vector_iterator { + using vector_type = Vector; + +public: + using value_type = Value; + using size_type = typename vector_type::size_type; + using difference_type = typename vector_type::difference_type; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::random_access_iterator_tag; + + template + friend vector_iterator operator+( typename vector_iterator::difference_type, const vector_iterator& ); + + template + friend typename vector_iterator::difference_type operator-( const vector_iterator&, const vector_iterator& ); + + template + friend bool operator==( const vector_iterator&, const vector_iterator& ); + + template + friend bool operator<( const vector_iterator&, const vector_iterator& ); + + template + friend class vector_iterator; + + template + friend class concurrent_vector; + +private: + vector_iterator( const vector_type& vector, size_type index, value_type* item = nullptr ) + : my_vector(const_cast(&vector)), my_index(index), my_item(item) + {} + +public: + vector_iterator() : my_vector(nullptr), my_index(~size_type(0)), my_item(nullptr) + {} + + vector_iterator( const vector_iterator& other ) + : my_vector(other.my_vector), my_index(other.my_index), my_item(other.my_item) + {} + + vector_iterator& operator=( const vector_iterator& other ) { + my_vector = other.my_vector; + my_index = other.my_index; + my_item = other.my_item; + return *this; + } + + vector_iterator operator+( difference_type offset ) const { + return vector_iterator(*my_vector, my_index + offset); + } + + vector_iterator& operator+=( difference_type offset ) { + my_index += offset; + my_item = nullptr; + return *this; + } + + vector_iterator operator-( difference_type offset ) const { + return vector_iterator(*my_vector, my_index - offset); + } + + vector_iterator& operator-=( difference_type offset ) { + my_index -= offset; + my_item = nullptr; + return *this; + } + + reference operator*() const { + value_type *item = my_item; + if (item == nullptr) { + item = &my_vector->internal_subscript(my_index); + } else { + __TBB_ASSERT(item == &my_vector->internal_subscript(my_index), "corrupt cache"); + } + return *item; + } + + pointer operator->() const { return &(operator*()); } + + reference operator[]( difference_type k ) const { + return my_vector->internal_subscript(my_index + k); + } + + vector_iterator& operator++() { + ++my_index; + if (my_item != nullptr) { + if (vector_type::is_first_element_in_segment(my_index)) { + // If the iterator crosses a segment boundary, the pointer become invalid + // as possibly next segment is in another memory location + my_item = nullptr; + } else { + ++my_item; + } + } + return *this; + } + + vector_iterator operator++(int) { + vector_iterator result 
= *this; + ++(*this); + return result; + } + + vector_iterator& operator--() { + __TBB_ASSERT(my_index > 0, "operator--() applied to iterator already at beginning of concurrent_vector"); + --my_index; + if (my_item != nullptr) { + if (vector_type::is_first_element_in_segment(my_index)) { + // If the iterator crosses a segment boundary, the pointer become invalid + // as possibly next segment is in another memory location + my_item = nullptr; + } else { + --my_item; + } + } + return *this; + } + + vector_iterator operator--(int) { + vector_iterator result = *this; + --(*this); + return result; + } + +private: + // concurrent_vector over which we are iterating. + vector_type* my_vector; + + // Index into the vector + size_type my_index; + + // Caches my_vector *it; + // If my_item == nullptr cached value is not available use internal_subscript(my_index) + mutable value_type* my_item; +}; // class vector_iterator + +template +vector_iterator operator+( typename vector_iterator::difference_type offset, + const vector_iterator& v ) +{ + return vector_iterator(*v.my_vector, v.my_index + offset); +} + +template +typename vector_iterator::difference_type operator-( const vector_iterator& i, + const vector_iterator& j ) +{ + using difference_type = typename vector_iterator::difference_type; + return static_cast(i.my_index) - static_cast(j.my_index); +} + +template +bool operator==( const vector_iterator& i, const vector_iterator& j ) { + return i.my_vector == j.my_vector && i.my_index == j.my_index; +} + +template +bool operator!=( const vector_iterator& i, const vector_iterator& j ) { + return !(i == j); +} + +template +bool operator<( const vector_iterator& i, const vector_iterator& j ) { + return i.my_index < j.my_index; +} + +template +bool operator>( const vector_iterator& i, const vector_iterator& j ) { + return j < i; +} + +template +bool operator>=( const vector_iterator& i, const vector_iterator& j ) { + return !(i < j); +} + +template +bool operator<=( const vector_iterator& i, const vector_iterator& j ) { + return !(j < i); +} + +static constexpr std::size_t embedded_table_num_segments = 3; + +template > +class concurrent_vector + : private segment_table, embedded_table_num_segments> +{ + using self_type = concurrent_vector; + using base_type = segment_table; + + friend class segment_table; + + template + class generic_range_type : public tbb::blocked_range { + using base_type = tbb::blocked_range; + public: + using value_type = T; + using reference = T&; + using const_reference = const T&; + using iterator = Iterator; + using difference_type = std::ptrdiff_t; + + using base_type::base_type; + + template + generic_range_type( const generic_range_type& r) : blocked_range(r.begin(), r.end(), r.grainsize()) {} + generic_range_type( generic_range_type& r, split ) : blocked_range(r, split()) {} + }; // class generic_range_type + + static_assert(std::is_same::value, + "value_type of the container must be the same as its allocator's"); + using allocator_traits_type = tbb::detail::allocator_traits; + // Segment table for concurrent_vector can be extended + static constexpr bool allow_table_extending = true; + static constexpr bool is_noexcept_assignment = allocator_traits_type::propagate_on_container_move_assignment::value || + allocator_traits_type::is_always_equal::value; + static constexpr bool is_noexcept_swap = allocator_traits_type::propagate_on_container_swap::value || + allocator_traits_type::is_always_equal::value; + +public: + using value_type = T; + using allocator_type = Allocator; + 
using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = value_type&; + using const_reference = const value_type&; + + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using iterator = vector_iterator; + using const_iterator = vector_iterator; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + using range_type = generic_range_type; + using const_range_type = generic_range_type; + + concurrent_vector() : concurrent_vector(allocator_type()) {} + + explicit concurrent_vector( const allocator_type& alloc ) noexcept + : base_type(alloc) + {} + + explicit concurrent_vector( size_type count, const value_type& value, + const allocator_type& alloc = allocator_type() ) + : concurrent_vector(alloc) + { + try_call( [&] { + grow_by(count, value); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + explicit concurrent_vector( size_type count, const allocator_type& alloc = allocator_type() ) + : concurrent_vector(alloc) + { + try_call( [&] { + grow_by(count); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + template + concurrent_vector( InputIterator first, InputIterator last, const allocator_type& alloc = allocator_type() ) + : concurrent_vector(alloc) + { + try_call( [&] { + grow_by(first, last); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + concurrent_vector( const concurrent_vector& other ) + : base_type(segment_table_allocator_traits::select_on_container_copy_construction(other.get_allocator())) + { + try_call( [&] { + grow_by(other.begin(), other.end()); + } ).on_exception( [&] { + base_type::clear(); + }); + } + + concurrent_vector( const concurrent_vector& other, const allocator_type& alloc ) + : base_type(other, alloc) {} + + concurrent_vector(concurrent_vector&& other) noexcept + : base_type(std::move(other)) + {} + + concurrent_vector( concurrent_vector&& other, const allocator_type& alloc ) + : base_type(std::move(other), alloc) + {} + + concurrent_vector( std::initializer_list init, + const allocator_type& alloc = allocator_type() ) + : concurrent_vector(init.begin(), init.end(), alloc) + {} + + ~concurrent_vector() {} + + // Assignment + concurrent_vector& operator=( const concurrent_vector& other ) { + base_type::operator=(other); + return *this; + } + + concurrent_vector& operator=( concurrent_vector&& other ) noexcept(is_noexcept_assignment) { + base_type::operator=(std::move(other)); + return *this; + } + + concurrent_vector& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + void assign( size_type count, const value_type& value ) { + destroy_elements(); + grow_by(count, value); + } + + template + typename std::enable_if::value, void>::type + assign( InputIterator first, InputIterator last ) { + destroy_elements(); + grow_by(first, last); + } + + void assign( std::initializer_list init ) { + destroy_elements(); + assign(init.begin(), init.end()); + } + + // Concurrent growth + iterator grow_by( size_type delta ) { + return internal_grow_by_delta(delta); + } + + iterator grow_by( size_type delta, const value_type& value ) { + return internal_grow_by_delta(delta, value); + } + + template + typename std::enable_if::value, iterator>::type + grow_by( ForwardIterator first, ForwardIterator last ) { + auto delta = std::distance(first, last); + return internal_grow_by_delta(delta, first, last); + } + + iterator grow_by( std::initializer_list init ) { 
+ return grow_by(init.begin(), init.end()); + } + + iterator grow_to_at_least( size_type n ) { + return internal_grow_to_at_least(n); + } + iterator grow_to_at_least( size_type n, const value_type& value ) { + return internal_grow_to_at_least(n, value); + } + + iterator push_back( const value_type& item ) { + return internal_emplace_back(item); + } + + iterator push_back( value_type&& item ) { + return internal_emplace_back(std::move(item)); + } + + template + iterator emplace_back( Args&&... args ) { + return internal_emplace_back(std::forward(args)...); + } + + // Items access + reference operator[]( size_type index ) { + return internal_subscript(index); + } + const_reference operator[]( size_type index ) const { + return internal_subscript(index); + } + + reference at( size_type index ) { + return internal_subscript_with_exceptions(index); + } + const_reference at( size_type index ) const { + return internal_subscript_with_exceptions(index); + } + + // Get range for iterating with parallel algorithms + range_type range( size_t grainsize = 1 ) { + return range_type(begin(), end(), grainsize); + } + + // Get const range for iterating with parallel algorithms + const_range_type range( size_t grainsize = 1 ) const { + return const_range_type(begin(), end(), grainsize); + } + + reference front() { + return internal_subscript(0); + } + + const_reference front() const { + return internal_subscript(0); + } + + reference back() { + return internal_subscript(size() - 1); + } + + const_reference back() const { + return internal_subscript(size() - 1); + } + + // Iterators + iterator begin() { return iterator(*this, 0); } + const_iterator begin() const { return const_iterator(*this, 0); } + const_iterator cbegin() const { return const_iterator(*this, 0); } + + iterator end() { return iterator(*this, size()); } + const_iterator end() const { return const_iterator(*this, size()); } + const_iterator cend() const { return const_iterator(*this, size()); } + + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } + const_reverse_iterator crbegin() const { return const_reverse_iterator(cend()); } + + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } + const_reverse_iterator crend() const { return const_reverse_iterator(cbegin()); } + + allocator_type get_allocator() const { + return base_type::get_allocator(); + } + + // Storage + bool empty() const noexcept { + return 0 == size(); + } + + size_type size() const noexcept { + return std::min(this->my_size.load(std::memory_order_acquire), capacity()); + } + + size_type max_size() const noexcept { + return allocator_traits_type::max_size(base_type::get_allocator()); + } + + size_type capacity() const noexcept { + return base_type::capacity(); + } + + void reserve( size_type n ) { + if (n == 0) return; + + if (n > max_size()) { + tbb::detail::throw_exception(exception_id::reservation_length_error); + } + + this->assign_first_block_if_necessary(this->segment_index_of(n - 1) + 1); + base_type::reserve(n); + } + + void resize( size_type n ) { + internal_resize(n); + } + + void resize( size_type n, const value_type& val ) { + internal_resize(n, val); + } + + void shrink_to_fit() { + internal_compact(); + } + + void swap(concurrent_vector& other) noexcept(is_noexcept_swap) { + base_type::swap(other); + } + + void clear() { + destroy_elements(); + } + +private: + using segment_type = 
typename base_type::segment_type; + using segment_table_type = typename base_type::segment_table_type; + using segment_table_allocator_traits = typename base_type::segment_table_allocator_traits; + using segment_index_type = typename base_type::segment_index_type; + + using segment_element_type = typename base_type::value_type; + using segment_element_allocator_type = typename allocator_traits_type::template rebind_alloc; + using segment_element_allocator_traits = tbb::detail::allocator_traits; + + segment_table_type allocate_long_table( const typename base_type::atomic_segment* embedded_table, size_type start_index ) { + __TBB_ASSERT(start_index <= this->embedded_table_size, "Start index out of embedded table"); + + // If other threads are trying to set pointers in the short segment, wait for them to finish their + // assignments before we copy the short segment to the long segment. Note: grow_to_at_least depends on it + for (segment_index_type i = 0; this->segment_base(i) < start_index; ++i) { + spin_wait_while_eq(embedded_table[i], segment_type(nullptr)); + } + + // It is possible that the table was extend by a thread allocating first_block, need to check this. + if (this->get_table() != embedded_table) { + return nullptr; + } + + // Allocate long segment table and fill with null pointers + segment_table_type new_segment_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), this->pointers_per_long_table); + // Copy segment pointers from the embedded table + for (size_type segment_index = 0; segment_index < this->pointers_per_embedded_table; ++segment_index) { + segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index], + embedded_table[segment_index].load(std::memory_order_relaxed)); + } + for (size_type segment_index = this->pointers_per_embedded_table; segment_index < this->pointers_per_long_table; ++segment_index) { + segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index], nullptr); + } + + return new_segment_table; + } + + // create_segment function is required by the segment_table base class + segment_type create_segment( segment_table_type table, segment_index_type seg_index, size_type index ) { + size_type first_block = this->my_first_block.load(std::memory_order_relaxed); + // First block allocation + if (seg_index < first_block) { + // If 0 segment is already allocated, then it remains to wait until the segments are filled to requested + if (table[0].load(std::memory_order_acquire) != nullptr) { + spin_wait_while_eq(table[seg_index], segment_type(nullptr)); + return nullptr; + } + + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + segment_type new_segment = nullptr; + size_type first_block_size = this->segment_size(first_block); + try_call( [&] { + new_segment = segment_element_allocator_traits::allocate(segment_allocator, first_block_size); + } ).on_exception( [&] { + segment_type disabled_segment = nullptr; + if (table[0].compare_exchange_strong(disabled_segment, this->segment_allocation_failure_tag)) { + size_type end_segment = table == this->my_embedded_table ? 
this->pointers_per_embedded_table : first_block; + for (size_type i = 1; i < end_segment; ++i) { + table[i].store(this->segment_allocation_failure_tag, std::memory_order_release); + } + } + }); + + segment_type disabled_segment = nullptr; + if (table[0].compare_exchange_strong(disabled_segment, new_segment)) { + this->extend_table_if_necessary(table, 0, first_block_size); + for (size_type i = 1; i < first_block; ++i) { + table[i].store(new_segment, std::memory_order_release); + } + + // Other threads can wait on a snapshot of an embedded table, need to fill it. + for (size_type i = 1; i < first_block && i < this->pointers_per_embedded_table; ++i) { + this->my_embedded_table[i].store(new_segment, std::memory_order_release); + } + } else if (new_segment != this->segment_allocation_failure_tag) { + // Deallocate the memory + segment_element_allocator_traits::deallocate(segment_allocator, new_segment, first_block_size); + // 0 segment is already allocated, then it remains to wait until the segments are filled to requested + spin_wait_while_eq(table[seg_index], segment_type(nullptr)); + } + } else { + size_type offset = this->segment_base(seg_index); + if (index == offset) { + __TBB_ASSERT(table[seg_index].load(std::memory_order_relaxed) == nullptr, "Only this thread can enable this segment"); + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + segment_type new_segment = this->segment_allocation_failure_tag; + try_call( [&] { + new_segment = segment_element_allocator_traits::allocate(segment_allocator,this->segment_size(seg_index)); + // Shift base address to simplify access by index + new_segment -= this->segment_base(seg_index); + } ).on_completion( [&] { + table[seg_index].store(new_segment, std::memory_order_release); + }); + } else { + spin_wait_while_eq(table[seg_index], segment_type(nullptr)); + } + } + return nullptr; + } + + // Returns the number of elements in the segment to be destroy + size_type number_of_elements_in_segment( segment_index_type seg_index ) { + size_type curr_vector_size = this->my_size.load(std::memory_order_relaxed); + size_type curr_segment_base = this->segment_base(seg_index); + + if (seg_index == 0) { + return std::min(curr_vector_size, this->segment_size(seg_index)); + } else { + // Perhaps the segment is allocated, but there are no elements in it. + if (curr_vector_size < curr_segment_base) { + return 0; + } + return curr_segment_base * 2 > curr_vector_size ? curr_vector_size - curr_segment_base : curr_segment_base; + } + } + + segment_type nullify_segment( segment_table_type table, size_type segment_index ) { + segment_type target_segment = table[segment_index].load(std::memory_order_relaxed); + if (segment_index >= this->my_first_block) { + table[segment_index].store(nullptr, std::memory_order_relaxed); + } else { + if (segment_index == 0) { + for (size_type i = 0; i < this->my_first_block; ++i) { + table[i].store(nullptr, std::memory_order_relaxed); + } + } + } + + return target_segment; + } + + void deallocate_segment( segment_type address, segment_index_type seg_index ) { + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + size_type first_block = this->my_first_block.load(std::memory_order_relaxed); + if (seg_index >= first_block) { + segment_element_allocator_traits::deallocate(segment_allocator, address, this->segment_size(seg_index)); + } + else if (seg_index == 0) { + size_type elements_to_deallocate = first_block > 0 ? 
this->segment_size(first_block) : this->segment_size(0); + segment_element_allocator_traits::deallocate(segment_allocator, address, elements_to_deallocate); + } + } + + // destroy_segment function is required by the segment_table base class + void destroy_segment( segment_type address, segment_index_type seg_index ) { + size_type elements_to_destroy = number_of_elements_in_segment(seg_index); + segment_element_allocator_type segment_allocator(base_type::get_allocator()); + + for (size_type i = 0; i < elements_to_destroy; ++i) { + segment_element_allocator_traits::destroy(segment_allocator, address + i); + } + + deallocate_segment(address, seg_index); + } + + // copy_segment function is required by the segment_table base class + void copy_segment( segment_index_type seg_index, segment_type from, segment_type to ) { + size_type i = 0; + try_call( [&] { + for (; i != number_of_elements_in_segment(seg_index); ++i) { + segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, from[i]); + } + } ).on_exception( [&] { + // Zero-initialize items left not constructed after the exception + zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i); + + segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed)); + auto table = this->get_table(); + for (segment_index_type j = seg_index + 1; j != last_segment; ++j) { + auto curr_segment = table[j].load(std::memory_order_relaxed); + if (curr_segment) { + zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j)); + } + } + this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed); + }); + } + + // move_segment function is required by the segment_table base class + void move_segment( segment_index_type seg_index, segment_type from, segment_type to ) { + size_type i = 0; + try_call( [&] { + for (; i != number_of_elements_in_segment(seg_index); ++i) { + segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, std::move(from[i])); + } + } ).on_exception( [&] { + // Zero-initialize items left not constructed after the exception + zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i); + + segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed)); + auto table = this->get_table(); + for (segment_index_type j = seg_index + 1; j != last_segment; ++j) { + auto curr_segment = table[j].load(std::memory_order_relaxed); + if (curr_segment) { + zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j)); + } + } + this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed); + }); + } + + static constexpr bool is_first_element_in_segment( size_type index ) { + // An element is the first in a segment if its index is equal to a power of two + return is_power_of_two_at_least(index, 2); + } + + const_reference internal_subscript( size_type index ) const { + return const_cast(this)->internal_subscript(index); + } + + reference internal_subscript( size_type index ) { + __TBB_ASSERT(index < this->my_size.load(std::memory_order_relaxed), "Invalid subscript index"); + return base_type::template internal_subscript(index); + } + + const_reference internal_subscript_with_exceptions( size_type index ) const { + return const_cast(this)->internal_subscript_with_exceptions(index); + } + + reference internal_subscript_with_exceptions( size_type index ) { + if (index >= 
this->my_size.load(std::memory_order_acquire)) { + tbb::detail::throw_exception(exception_id::out_of_range); + } + + segment_table_type table = this->my_segment_table.load(std::memory_order_acquire); + + size_type seg_index = this->segment_index_of(index); + if (base_type::number_of_segments(table) < seg_index) { + tbb::detail::throw_exception(exception_id::out_of_range); + } + + if (table[seg_index] <= this->segment_allocation_failure_tag) { + tbb::detail::throw_exception(exception_id::out_of_range); + } + + return base_type::template internal_subscript(index); + } + + static void zero_unconstructed_elements( pointer start, size_type count ) { + std::memset(static_cast(start), 0, count * sizeof(value_type)); + } + + template + iterator internal_emplace_back( Args&&... args ) { + size_type old_size = this->my_size++; + this->assign_first_block_if_necessary(default_first_block_size); + auto element_address = &base_type::template internal_subscript(old_size); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + zero_unconstructed_elements(element_address, /*count =*/1); + }); + + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, std::forward(args)...); + value_guard.dismiss(); + return iterator(*this, old_size, element_address); + } + + template + void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, const Args&... args ) { + static_assert(sizeof...(Args) < 2, "Too many parameters"); + for (size_type idx = start_idx; idx < end_idx; ++idx) { + auto element_address = &base_type::template internal_subscript(idx); + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard( [&] { + segment_index_type last_allocated_segment = this->find_last_allocated_segment(table); + size_type segment_size = this->segment_size(last_allocated_segment); + end_idx = end_idx < segment_size ? end_idx : segment_size; + for (size_type i = idx; i < end_idx; ++i) { + zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1); + } + }); + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, args...); + value_guard.dismiss(); + } + } + + template + void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, ForwardIterator first, ForwardIterator ) { + for (size_type idx = start_idx; idx < end_idx; ++idx) { + auto element_address = &base_type::template internal_subscript(idx); + try_call( [&] { + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, *first++); + } ).on_exception( [&] { + segment_index_type last_allocated_segment = this->find_last_allocated_segment(table); + size_type segment_size = this->segment_size(last_allocated_segment); + end_idx = end_idx < segment_size ? end_idx : segment_size; + for (size_type i = idx; i < end_idx; ++i) { + zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1); + } + }); + } + } + + template + iterator internal_grow( size_type start_idx, size_type end_idx, const Args&... 
args ) { + this->assign_first_block_if_necessary(this->segment_index_of(end_idx - 1) + 1); + size_type seg_index = this->segment_index_of(end_idx - 1); + segment_table_type table = this->get_table(); + this->extend_table_if_necessary(table, start_idx, end_idx); + + if (seg_index > this->my_first_block.load(std::memory_order_relaxed)) { + // So that other threads be able to work with the last segment of grow_by, allocate it immediately. + // If the last segment is not less than the first block + if (table[seg_index].load(std::memory_order_relaxed) == nullptr) { + size_type first_element = this->segment_base(seg_index); + if (first_element >= start_idx && first_element < end_idx) { + segment_type segment = table[seg_index].load(std::memory_order_relaxed); + base_type::enable_segment(segment, table, seg_index, first_element); + } + } + } + + internal_loop_construct(table, start_idx, end_idx, args...); + + return iterator(*this, start_idx, &base_type::template internal_subscript(start_idx)); + } + + + template + iterator internal_grow_by_delta( size_type delta, const Args&... args ) { + if (delta == size_type(0)) { + return end(); + } + size_type start_idx = this->my_size.fetch_add(delta); + size_type end_idx = start_idx + delta; + + return internal_grow(start_idx, end_idx, args...); + } + + template + iterator internal_grow_to_at_least( size_type new_size, const Args&... args ) { + size_type old_size = this->my_size.load(std::memory_order_relaxed); + if (new_size == size_type(0)) return iterator(*this, 0); + while (old_size < new_size && !this->my_size.compare_exchange_weak(old_size, new_size)) + {} + + int delta = static_cast(new_size) - static_cast(old_size); + if (delta > 0) { + return internal_grow(old_size, new_size, args...); + } + + size_type end_segment = this->segment_index_of(new_size - 1); + + // Check/wait for segments allocation completes + if (end_segment >= this->pointers_per_embedded_table && + this->get_table() == this->my_embedded_table) + { + spin_wait_while_eq(this->my_segment_table, this->my_embedded_table); + } + + for (segment_index_type seg_idx = 0; seg_idx <= end_segment; ++seg_idx) { + if (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) { + atomic_backoff backoff(true); + while (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) { + backoff.pause(); + } + } + } + + #if TBB_USE_DEBUG + size_type cap = capacity(); + __TBB_ASSERT( cap >= new_size, nullptr); + #endif + return iterator(*this, size()); + } + + template + void internal_resize( size_type n, const Args&... 
args ) { + if (n == 0) { + clear(); + return; + } + + size_type old_size = this->my_size.load(std::memory_order_acquire); + if (n > old_size) { + reserve(n); + grow_to_at_least(n, args...); + } else { + if (old_size == n) { + return; + } + size_type last_segment = this->segment_index_of(old_size - 1); + // Delete segments + for (size_type seg_idx = this->segment_index_of(n - 1) + 1; seg_idx <= last_segment; ++seg_idx) { + this->delete_segment(seg_idx); + } + + // If n > segment_size(n) => we need to destroy all of the items in the first segment + // Otherwise, we need to destroy only items with the index < n + size_type n_segment = this->segment_index_of(n - 1); + size_type last_index_to_destroy = std::min(this->segment_base(n_segment) + this->segment_size(n_segment), old_size); + // Destroy elements in curr segment + for (size_type idx = n; idx < last_index_to_destroy; ++idx) { + segment_table_allocator_traits::destroy(base_type::get_allocator(), &base_type::template internal_subscript(idx)); + } + this->my_size.store(n, std::memory_order_release); + } + } + + void destroy_elements() { + allocator_type alloc(base_type::get_allocator()); + for (size_type i = 0; i < this->my_size.load(std::memory_order_relaxed); ++i) { + allocator_traits_type::destroy(alloc, &base_type::template internal_subscript(i)); + } + this->my_size.store(0, std::memory_order_relaxed); + } + + static bool incompact_predicate( size_type size ) { + // memory page size + const size_type page_size = 4096; + return size < page_size || ((size - 1) % page_size < page_size / 2 && size < page_size * 128); + } + + void internal_compact() { + const size_type curr_size = this->my_size.load(std::memory_order_relaxed); + segment_table_type table = this->get_table(); + const segment_index_type k_end = this->find_last_allocated_segment(table); // allocated segments + const segment_index_type k_stop = curr_size ? this->segment_index_of(curr_size - 1) + 1 : 0; // number of segments to store existing items: 0=>0; 1,2=>1; 3,4=>2; [5-8]=>3;.. 
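+        // [editorial note, not upstream TBB] Sketch of the power-of-two segment
+        // layout that the k_stop comment above relies on; the authoritative
+        // formulas are segment_index_of()/segment_base()/segment_size() in
+        // detail/_segment_table.h:
+        //
+        //   element index : 0 1 | 2 3 | 4 5 6 7 | 8 ... 15 | ...
+        //   segment index :  0  |  1  |    2    |     3    | ...
+        //
+        // i.e. segment 0 holds two elements and each later segment k starts at
+        // base 2^k and holds 2^k elements, so curr_size elements occupy
+        // segment_index_of(curr_size - 1) + 1 segments (1,2=>1; 3,4=>2; 5..8=>3).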
+ const segment_index_type first_block = this->my_first_block; // number of merged segments, getting values from atomics + + segment_index_type k = first_block; + if (k_stop < first_block) { + k = k_stop; + } + else { + while (k < k_stop && incompact_predicate(this->segment_size(k) * sizeof(value_type))) k++; + } + + if (k_stop == k_end && k == first_block) { + return; + } + + // First segment optimization + if (k != first_block && k) { + size_type max_block = std::max(first_block, k); + + auto buffer_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), max_block); + + for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { + segment_table_allocator_traits::construct(base_type::get_allocator(), &buffer_table[seg_idx], + table[seg_idx].load(std::memory_order_relaxed)); + table[seg_idx].store(nullptr, std::memory_order_relaxed); + } + + this->my_first_block.store(k, std::memory_order_relaxed); + size_type index = 0; + try_call( [&] { + for (; index < std::min(this->segment_size(max_block), curr_size); ++index) { + auto element_address = &static_cast(this)->operator[](index); + segment_index_type seg_idx = this->segment_index_of(index); + segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, + std::move_if_noexcept(buffer_table[seg_idx].load(std::memory_order_relaxed)[index])); + } + } ).on_exception( [&] { + segment_element_allocator_type allocator(base_type::get_allocator()); + for (size_type i = 0; i < index; ++i) { + auto element_adress = &this->operator[](i); + segment_element_allocator_traits::destroy(allocator, element_adress); + } + segment_element_allocator_traits::deallocate(allocator, + table[0].load(std::memory_order_relaxed), this->segment_size(max_block)); + + for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { + table[seg_idx].store(buffer_table[seg_idx].load(std::memory_order_relaxed), + std::memory_order_relaxed); + buffer_table[seg_idx].store(nullptr, std::memory_order_relaxed); + } + segment_table_allocator_traits::deallocate(base_type::get_allocator(), + buffer_table, max_block); + this->my_first_block.store(first_block, std::memory_order_relaxed); + }); + + // Need to correct deallocate old segments + // Method destroy_segment respect active first_block, therefore, + // in order for the segment deletion to work correctly, set the first_block size that was earlier, + // destroy the unnecessary segments. 
+ this->my_first_block.store(first_block, std::memory_order_relaxed); + for (size_type seg_idx = max_block; seg_idx > 0 ; --seg_idx) { + auto curr_segment = buffer_table[seg_idx - 1].load(std::memory_order_relaxed); + if (curr_segment != nullptr) { + destroy_segment(buffer_table[seg_idx - 1].load(std::memory_order_relaxed) + this->segment_base(seg_idx - 1), + seg_idx - 1); + } + } + + this->my_first_block.store(k, std::memory_order_relaxed); + + for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { + segment_table_allocator_traits::destroy(base_type::get_allocator(), &buffer_table[seg_idx]); + } + + segment_table_allocator_traits::deallocate(base_type::get_allocator(), buffer_table, max_block); + } + // free unnecessary segments allocated by reserve() call + if (k_stop < k_end) { + for (size_type seg_idx = k_end; seg_idx != k_stop; --seg_idx) { + if (table[seg_idx - 1].load(std::memory_order_relaxed) != nullptr) { + this->delete_segment(seg_idx - 1); + } + } + if (!k) this->my_first_block.store(0, std::memory_order_relaxed); + } + } + + // Lever for adjusting the size of first_block at the very first insertion. + // TODO: consider >1 value, check performance + static constexpr size_type default_first_block_size = 1; + + template + friend class vector_iterator; +}; // class concurrent_vector + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// Deduction guide for the constructor from two iterators +template >, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +concurrent_vector( It, It, Alloc = Alloc() ) +-> concurrent_vector, Alloc>; +#endif + +template +void swap(concurrent_vector &lhs, + concurrent_vector &rhs) +{ + lhs.swap(rhs); +} + +template +bool operator==(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return !(lhs == rhs); +} +#endif // !__TBB_CPP20_COMPARISONS_PRESENT + +#if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT +template +tbb::detail::synthesized_three_way_result::value_type> +operator<=>(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), + rhs.begin(), rhs.end(), + tbb::detail::synthesized_three_way_comparator{}); +} + +#else + +template +bool operator<(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + +template +bool operator<=(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return !(rhs < lhs); +} + +template +bool operator>(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return rhs < lhs; +} + +template +bool operator>=(const concurrent_vector &lhs, + const concurrent_vector &rhs) +{ + return !(lhs < rhs); +} +#endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::concurrent_vector; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_concurrent_vector_H diff --git a/third_party/tbb/detail/_aggregator.h b/third_party/tbb/detail/_aggregator.h new file mode 100644 index 000000000..bc263885c --- /dev/null +++ b/third_party/tbb/detail/_aggregator.h @@ -0,0 +1,177 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the 
Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + + +#ifndef __TBB_detail__aggregator_H +#define __TBB_detail__aggregator_H + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/libcxx/atomic" +#if !__TBBMALLOC_BUILD // TODO: check this macro with TBB Malloc +#include "third_party/tbb/profiling.h" +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +// Base class for aggregated operation +template +class aggregated_operation { +public: + // Zero value means "wait" status, all other values are "user" specified values and + // are defined into the scope of a class which uses "status" + std::atomic status; + + std::atomic next; + aggregated_operation() : status{}, next(nullptr) {} +}; // class aggregated_operation + +// Aggregator base class +/* An aggregator for collecting operations coming from multiple sources and executing + them serially on a single thread. OperationType must be derived from + aggregated_operation. The parameter HandlerType is a functor that will be passed the + list of operations and is expected to handle each operation appropriately, setting the + status of each operation to non-zero. */ +template +class aggregator_generic { +public: + aggregator_generic() : pending_operations(nullptr), handler_busy(false) {} + + // Execute an operation + /* Places an operation into the waitlist (pending_operations), and either handles the list, + or waits for the operation to complete, or returns. + The long_life_time parameter specifies the life time of the given operation object. + Operations with long_life_time == true may be accessed after execution. + A "short" life time operation (long_life_time == false) can be destroyed + during execution, and so any access to it after it was put into the waitlist, + including status check, is invalid. As a consequence, waiting for completion + of such operation causes undefined behavior. */ + template + void execute( OperationType* op, HandlerType& handle_operations, bool long_life_time = true ) { + // op->status should be read before inserting the operation into the + // aggregator waitlist since it can become invalid after executing a + // handler (if the operation has 'short' life time.) + const uintptr_t status = op->status.load(std::memory_order_relaxed); + + // ITT note: &(op->status) tag is used to cover accesses to this op node. This + // thread has created the operation, and now releases it so that the handler + // thread may handle the associated operation w/o triggering a race condition; + // thus this tag will be acquired just before the operation is handled in the + // handle_operations functor. + call_itt_notify(releasing, &(op->status)); + // insert the operation in the queue. 
+ OperationType* res = pending_operations.load(std::memory_order_relaxed); + do { + op->next.store(res, std::memory_order_relaxed); + } while (!pending_operations.compare_exchange_strong(res, op)); + if (!res) { // first in the list; handle the operations + // ITT note: &pending_operations tag covers access to the handler_busy flag, + // which this waiting handler thread will try to set before entering + // handle_operations. + call_itt_notify(acquired, &pending_operations); + start_handle_operations(handle_operations); + // The operation with 'short' life time can already be destroyed + if (long_life_time) + __TBB_ASSERT(op->status.load(std::memory_order_relaxed), nullptr); + } + // Not first; wait for op to be ready + else if (!status) { // operation is blocking here. + __TBB_ASSERT(long_life_time, "Waiting for an operation object that might be destroyed during processing"); + call_itt_notify(prepare, &(op->status)); + spin_wait_while_eq(op->status, uintptr_t(0)); + } + } + +private: + // Trigger the handling of operations when the handler is free + template + void start_handle_operations( HandlerType& handle_operations ) { + OperationType* op_list; + + // ITT note: &handler_busy tag covers access to pending_operations as it is passed + // between active and waiting handlers. Below, the waiting handler waits until + // the active handler releases, and the waiting handler acquires &handler_busy as + // it becomes the active_handler. The release point is at the end of this + // function, when all operations in pending_operations have been handled by the + // owner of this aggregator. + call_itt_notify(prepare, &handler_busy); + // get the handler_busy: + // only one thread can possibly spin here at a time + spin_wait_until_eq(handler_busy, uintptr_t(0)); + call_itt_notify(acquired, &handler_busy); + // acquire fence not necessary here due to causality rule and surrounding atomics + handler_busy.store(1, std::memory_order_relaxed); + + // ITT note: &pending_operations tag covers access to the handler_busy flag + // itself. Capturing the state of the pending_operations signifies that + // handler_busy has been set and a new active handler will now process that list's + // operations. 
+ call_itt_notify(releasing, &pending_operations); + // grab pending_operations + op_list = pending_operations.exchange(nullptr); + + // handle all the operations + handle_operations(op_list); + + // release the handler + handler_busy.store(0, std::memory_order_release); + } + + // An atomically updated list (aka mailbox) of pending operations + std::atomic pending_operations; + // Controls threads access to handle_operations + std::atomic handler_busy; +}; // class aggregator_generic + +template +class aggregator : public aggregator_generic { + HandlerType handle_operations; +public: + aggregator() = default; + + void initialize_handler( HandlerType h ) { handle_operations = h; } + + void execute(OperationType* op) { + aggregator_generic::execute(op, handle_operations); + } +}; // class aggregator + +// the most-compatible friend declaration (vs, gcc, icc) is +// template friend class aggregating_functor; +template +class aggregating_functor { + AggregatingClass* my_object{nullptr}; +public: + aggregating_functor() = default; + aggregating_functor( AggregatingClass* object ) : my_object(object) { + __TBB_ASSERT(my_object, nullptr); + } + + void operator()( OperationList* op_list ) { + __TBB_ASSERT(my_object, nullptr); + my_object->handle_operations(op_list); + } +}; // class aggregating_functor + + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__aggregator_H diff --git a/third_party/tbb/detail/_aligned_space.h b/third_party/tbb/detail/_aligned_space.h new file mode 100644 index 000000000..9a5addba4 --- /dev/null +++ b/third_party/tbb/detail/_aligned_space.h @@ -0,0 +1,47 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef __TBB_aligned_space_H +#define __TBB_aligned_space_H + +#include "third_party/libcxx/cstddef" + +#include "third_party/tbb/detail/_template_helpers.h" + +namespace tbb { +namespace detail { +inline namespace d0 { + +//! Block of space aligned sufficiently to construct an array T with N elements. +/** The elements are not constructed or destroyed by this class. + @ingroup memory_allocation */ +template +class aligned_space { + alignas(alignof(T)) std::uint8_t aligned_array[N * sizeof(T)]; + +public: + //! Pointer to beginning of array + T* begin() const { return punned_cast(&aligned_array); } + + //! Pointer to one past last element in array. + T* end() const { return begin() + N; } +}; + +} // namespace d0 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_aligned_space_H */ diff --git a/third_party/tbb/detail/_allocator_traits.h b/third_party/tbb/detail/_allocator_traits.h new file mode 100644 index 000000000..366cc63d1 --- /dev/null +++ b/third_party/tbb/detail/_allocator_traits.h @@ -0,0 +1,108 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__allocator_traits_H +#define __TBB_detail__allocator_traits_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +inline namespace d0 { + +#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT +// Struct is_always_equal_detector provides the member type "type" which is +// Allocator::is_always_equal if it is present, std::false_type otherwise +template +struct is_always_equal_detector { + using type = std::false_type; +}; + +template +struct is_always_equal_detector> +{ + using type = typename Allocator::is_always_equal; +}; +#endif // !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT + +template +class allocator_traits : public std::allocator_traits +{ + using base_type = std::allocator_traits; +public: +#if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT + using is_always_equal = typename is_always_equal_detector::type; +#endif + + template + using rebind_traits = typename tbb::detail::allocator_traits>; +}; // struct allocator_traits + +template +void copy_assign_allocators_impl( Allocator& lhs, const Allocator& rhs, /*pocca = */std::true_type ) { + lhs = rhs; +} + +template +void copy_assign_allocators_impl( Allocator&, const Allocator&, /*pocca = */ std::false_type ) {} + +// Copy assigns allocators only if propagate_on_container_copy_assignment is true +template +void copy_assign_allocators( Allocator& lhs, const Allocator& rhs ) { + using pocca_type = typename allocator_traits::propagate_on_container_copy_assignment; + copy_assign_allocators_impl(lhs, rhs, pocca_type()); +} + +template +void move_assign_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocma = */ std::true_type ) { + lhs = std::move(rhs); +} + +template +void move_assign_allocators_impl( Allocator&, Allocator&, /*pocma = */ std::false_type ) {} + +// Move assigns allocators only if propagate_on_container_move_assignment is true +template +void move_assign_allocators( Allocator& lhs, Allocator& rhs ) { + using pocma_type = typename allocator_traits::propagate_on_container_move_assignment; + move_assign_allocators_impl(lhs, rhs, pocma_type()); +} + +template +void swap_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocs = */ std::true_type ) { + using std::swap; + swap(lhs, rhs); +} + +template +void swap_allocators_impl( Allocator&, Allocator&, /*pocs = */ std::false_type ) {} + +// Swaps allocators only if propagate_on_container_swap is true +template +void swap_allocators( Allocator& lhs, Allocator& rhs ) { + using pocs_type = typename allocator_traits::propagate_on_container_swap; + swap_allocators_impl(lhs, rhs, pocs_type()); +} + +} // inline namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__allocator_traits_H diff --git a/third_party/tbb/detail/_assert.h b/third_party/tbb/detail/_assert.h new file mode 100644 index 000000000..0d1210860 --- /dev/null +++ b/third_party/tbb/detail/_assert.h @@ -0,0 +1,65 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed 
under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__assert_H +#define __TBB_detail__assert_H + +#include "third_party/tbb/detail/_config.h" + +#if __TBBMALLOC_BUILD +namespace rml { namespace internal { +#else +namespace tbb { +namespace detail { +namespace r1 { +#endif +//! Process an assertion failure. +/** Normally called from __TBB_ASSERT macro. + If assertion handler is null, print message for assertion failure and abort. + Otherwise call the assertion handler. */ +TBB_EXPORT void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment); +#if __TBBMALLOC_BUILD +}} // namespaces rml::internal +#else +} // namespace r1 +} // namespace detail +} // namespace tbb +#endif + +#if __TBBMALLOC_BUILD +//! Release version of assertions +#define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : rml::internal::assertion_failure(__func__,__LINE__,#predicate,message)) +#else +#define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : tbb::detail::r1::assertion_failure(__func__,__LINE__,#predicate,message)) +#endif + +#if TBB_USE_ASSERT + //! Assert that predicate is true. + /** If predicate is false, print assertion failure message. + If the comment argument is not nullptr, it is printed as part of the failure message. + The comment argument has no other effect. */ + #define __TBB_ASSERT(predicate,message) __TBB_ASSERT_RELEASE(predicate,message) + //! "Extended" version + #define __TBB_ASSERT_EX __TBB_ASSERT +#else + //! No-op version of __TBB_ASSERT. + #define __TBB_ASSERT(predicate,comment) ((void)0) + //! "Extended" version is useful to suppress warnings if a variable is only used with an assert + #define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate))) +#endif // TBB_USE_ASSERT + +#endif // __TBB_detail__assert_H diff --git a/third_party/tbb/detail/_attach.h b/third_party/tbb/detail/_attach.h new file mode 100644 index 000000000..ddf21d590 --- /dev/null +++ b/third_party/tbb/detail/_attach.h @@ -0,0 +1,33 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__attach_H +#define __TBB_detail__attach_H + +#include "third_party/tbb/detail/_config.h" + +namespace tbb { +namespace detail { +namespace d1 { + + struct attach {}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__attach_H diff --git a/third_party/tbb/detail/_concurrent_queue_base.h b/third_party/tbb/detail/_concurrent_queue_base.h new file mode 100644 index 000000000..2cec3e168 --- /dev/null +++ b/third_party/tbb/detail/_concurrent_queue_base.h @@ -0,0 +1,651 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__concurrent_queue_base_H +#define __TBB_detail__concurrent_queue_base_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_machine.h" +#include "third_party/tbb/detail/_allocator_traits.h" + +#include "third_party/tbb/profiling.h" +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/cache_aligned_allocator.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d2 { + +using ticket_type = std::size_t; + +template +inline bool is_valid_page(const Page p) { + return reinterpret_cast(p) > 1; +} + +template +struct concurrent_queue_rep; + +template +class micro_queue_pop_finalizer; + +#if _MSC_VER && !defined(__INTEL_COMPILER) +// unary minus operator applied to unsigned type, result still unsigned +#pragma warning( push ) +#pragma warning( disable: 4146 ) +#endif + +// A queue using simple locking. +// For efficiency, this class has no constructor. +// The caller is expected to zero-initialize it. +template +class micro_queue { +private: + using queue_rep_type = concurrent_queue_rep; + using self_type = micro_queue; +public: + using size_type = std::size_t; + using value_type = T; + using reference = value_type&; + using const_reference = const value_type&; + + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + + static constexpr size_type item_size = sizeof(T); + static constexpr size_type items_per_page = item_size <= 8 ? 32 : + item_size <= 16 ? 16 : + item_size <= 32 ? 8 : + item_size <= 64 ? 4 : + item_size <= 128 ? 
2 : 1; + + struct padded_page { + padded_page() {} + ~padded_page() {} + + reference operator[] (std::size_t index) { + __TBB_ASSERT(index < items_per_page, "Index out of range"); + return items[index]; + } + + const_reference operator[] (std::size_t index) const { + __TBB_ASSERT(index < items_per_page, "Index out of range"); + return items[index]; + } + + padded_page* next{ nullptr }; + std::atomic mask{}; + + union { + value_type items[items_per_page]; + }; + }; // struct padded_page + + using page_allocator_type = typename allocator_traits_type::template rebind_alloc; +protected: + using page_allocator_traits = tbb::detail::allocator_traits; + +public: + using item_constructor_type = void (*)(value_type* location, const void* src); + micro_queue() = default; + micro_queue( const micro_queue& ) = delete; + micro_queue& operator=( const micro_queue& ) = delete; + + size_type prepare_page( ticket_type k, queue_rep_type& base, page_allocator_type page_allocator, + padded_page*& p ) { + __TBB_ASSERT(p == nullptr, "Invalid page argument for prepare_page"); + k &= -queue_rep_type::n_queue; + size_type index = modulo_power_of_two(k / queue_rep_type::n_queue, items_per_page); + if (!index) { + try_call( [&] { + p = page_allocator_traits::allocate(page_allocator, 1); + }).on_exception( [&] { + ++base.n_invalid_entries; + invalidate_page( k ); + }); + page_allocator_traits::construct(page_allocator, p); + } + + spin_wait_until_my_turn(tail_counter, k, base); + d1::call_itt_notify(d1::acquired, &tail_counter); + + if (p) { + spin_mutex::scoped_lock lock( page_mutex ); + padded_page* q = tail_page.load(std::memory_order_relaxed); + if (is_valid_page(q)) { + q->next = p; + } else { + head_page.store(p, std::memory_order_relaxed); + } + tail_page.store(p, std::memory_order_relaxed); + } else { + p = tail_page.load(std::memory_order_relaxed); + } + return index; + } + + template + void push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator, Args&&... args ) + { + padded_page* p = nullptr; + page_allocator_type page_allocator(allocator); + size_type index = prepare_page(k, base, page_allocator, p); + __TBB_ASSERT(p != nullptr, "Page was not prepared"); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + ++base.n_invalid_entries; + d1::call_itt_notify(d1::releasing, &tail_counter); + tail_counter.fetch_add(queue_rep_type::n_queue); + }); + + page_allocator_traits::construct(page_allocator, &(*p)[index], std::forward(args)...); + // If no exception was thrown, mark item as present. 
+ p->mask.store(p->mask.load(std::memory_order_relaxed) | uintptr_t(1) << index, std::memory_order_relaxed); + d1::call_itt_notify(d1::releasing, &tail_counter); + + value_guard.dismiss(); + tail_counter.fetch_add(queue_rep_type::n_queue); + } + + void abort_push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) { + padded_page* p = nullptr; + prepare_page(k, base, allocator, p); + ++base.n_invalid_entries; + tail_counter.fetch_add(queue_rep_type::n_queue); + } + + bool pop( void* dst, ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) { + k &= -queue_rep_type::n_queue; + spin_wait_until_eq(head_counter, k); + d1::call_itt_notify(d1::acquired, &head_counter); + spin_wait_while_eq(tail_counter, k); + d1::call_itt_notify(d1::acquired, &tail_counter); + padded_page *p = head_page.load(std::memory_order_relaxed); + __TBB_ASSERT( p, nullptr ); + size_type index = modulo_power_of_two( k/queue_rep_type::n_queue, items_per_page ); + bool success = false; + { + page_allocator_type page_allocator(allocator); + micro_queue_pop_finalizer finalizer(*this, page_allocator, + k + queue_rep_type::n_queue, index == items_per_page - 1 ? p : nullptr ); + if (p->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) { + success = true; + assign_and_destroy_item(dst, *p, index); + } else { + --base.n_invalid_entries; + } + } + return success; + } + + micro_queue& assign( const micro_queue& src, queue_allocator_type& allocator, + item_constructor_type construct_item ) + { + head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + + const padded_page* srcp = src.head_page.load(std::memory_order_relaxed); + if( is_valid_page(srcp) ) { + ticket_type g_index = head_counter.load(std::memory_order_relaxed); + size_type n_items = (tail_counter.load(std::memory_order_relaxed) - head_counter.load(std::memory_order_relaxed)) + / queue_rep_type::n_queue; + size_type index = modulo_power_of_two(head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page); + size_type end_in_first_page = (index+n_items < items_per_page) ? 
(index + n_items) : items_per_page; + + try_call( [&] { + head_page.store(make_copy(allocator, srcp, index, end_in_first_page, g_index, construct_item), std::memory_order_relaxed); + }).on_exception( [&] { + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + }); + padded_page* cur_page = head_page.load(std::memory_order_relaxed); + + try_call( [&] { + if (srcp != src.tail_page.load(std::memory_order_relaxed)) { + for (srcp = srcp->next; srcp != src.tail_page.load(std::memory_order_relaxed); srcp=srcp->next ) { + cur_page->next = make_copy( allocator, srcp, 0, items_per_page, g_index, construct_item ); + cur_page = cur_page->next; + } + + __TBB_ASSERT(srcp == src.tail_page.load(std::memory_order_relaxed), nullptr ); + size_type last_index = modulo_power_of_two(tail_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page); + if( last_index==0 ) last_index = items_per_page; + + cur_page->next = make_copy( allocator, srcp, 0, last_index, g_index, construct_item ); + cur_page = cur_page->next; + } + tail_page.store(cur_page, std::memory_order_relaxed); + }).on_exception( [&] { + padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); + tail_page.store(invalid_page, std::memory_order_relaxed); + }); + } else { + head_page.store(nullptr, std::memory_order_relaxed); + tail_page.store(nullptr, std::memory_order_relaxed); + } + return *this; + } + + padded_page* make_copy( queue_allocator_type& allocator, const padded_page* src_page, size_type begin_in_page, + size_type end_in_page, ticket_type& g_index, item_constructor_type construct_item ) + { + page_allocator_type page_allocator(allocator); + padded_page* new_page = page_allocator_traits::allocate(page_allocator, 1); + new_page->next = nullptr; + new_page->mask.store(src_page->mask.load(std::memory_order_relaxed), std::memory_order_relaxed); + for (; begin_in_page!=end_in_page; ++begin_in_page, ++g_index) { + if (new_page->mask.load(std::memory_order_relaxed) & uintptr_t(1) << begin_in_page) { + copy_item(*new_page, begin_in_page, *src_page, begin_in_page, construct_item); + } + } + return new_page; + } + + void invalidate_page( ticket_type k ) { + // Append an invalid page at address 1 so that no more pushes are allowed. 
+ padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); + { + spin_mutex::scoped_lock lock( page_mutex ); + tail_counter.store(k + queue_rep_type::n_queue + 1, std::memory_order_relaxed); + padded_page* q = tail_page.load(std::memory_order_relaxed); + if (is_valid_page(q)) { + q->next = invalid_page; + } else { + head_page.store(invalid_page, std::memory_order_relaxed); + } + tail_page.store(invalid_page, std::memory_order_relaxed); + } + } + + padded_page* get_head_page() { + return head_page.load(std::memory_order_relaxed); + } + + void clear(queue_allocator_type& allocator, padded_page* new_head = nullptr, padded_page* new_tail = nullptr) { + padded_page* curr_page = get_head_page(); + size_type index = (head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue) % items_per_page; + page_allocator_type page_allocator(allocator); + + while (curr_page && is_valid_page(curr_page)) { + while (index != items_per_page) { + if (curr_page->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) { + page_allocator_traits::destroy(page_allocator, &curr_page->operator[](index)); + } + ++index; + } + + index = 0; + padded_page* next_page = curr_page->next; + page_allocator_traits::destroy(page_allocator, curr_page); + page_allocator_traits::deallocate(page_allocator, curr_page, 1); + curr_page = next_page; + } + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + head_page.store(new_head, std::memory_order_relaxed); + tail_page.store(new_tail, std::memory_order_relaxed); + } + + void clear_and_invalidate(queue_allocator_type& allocator) { + padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); + clear(allocator, invalid_page, invalid_page); + } + +private: + // template + friend class micro_queue_pop_finalizer; + + // Class used to ensure exception-safety of method "pop" + class destroyer { + value_type& my_value; + public: + destroyer( reference value ) : my_value(value) {} + destroyer( const destroyer& ) = delete; + destroyer& operator=( const destroyer& ) = delete; + ~destroyer() {my_value.~T();} + }; // class destroyer + + void copy_item( padded_page& dst, size_type dindex, const padded_page& src, size_type sindex, + item_constructor_type construct_item ) + { + auto& src_item = src[sindex]; + construct_item( &dst[dindex], static_cast(&src_item) ); + } + + void assign_and_destroy_item( void* dst, padded_page& src, size_type index ) { + auto& from = src[index]; + destroyer d(from); + *static_cast(dst) = std::move(from); + } + + void spin_wait_until_my_turn( std::atomic& counter, ticket_type k, queue_rep_type& rb ) const { + for (atomic_backoff b{};; b.pause()) { + ticket_type c = counter.load(std::memory_order_acquire); + if (c == k) return; + else if (c & 1) { + ++rb.n_invalid_entries; + throw_exception( exception_id::bad_last_alloc); + } + } + } + + std::atomic head_page{}; + std::atomic head_counter{}; + + std::atomic tail_page{}; + std::atomic tail_counter{}; + + spin_mutex page_mutex{}; +}; // class micro_queue + +#if _MSC_VER && !defined(__INTEL_COMPILER) +#pragma warning( pop ) +#endif // warning 4146 is back + +template +class micro_queue_pop_finalizer { +public: + using padded_page = typename Container::padded_page; + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + + micro_queue_pop_finalizer( Container& queue, Allocator& alloc, ticket_type k, padded_page* p ) : + my_ticket_type(k), my_queue(queue), my_page(p), allocator(alloc) + {} + + 
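+    // [editorial note, not upstream TBB] This RAII helper is what keeps
+    // micro_queue::pop() exception-safe: whatever happens while the popped value
+    // is moved out to the caller, the destructor below publishes my_ticket_type
+    // as the new head_counter (release store) and, when the consumed item was
+    // the last one on its page, unlinks, destroys and deallocates that page.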
micro_queue_pop_finalizer( const micro_queue_pop_finalizer& ) = delete; + micro_queue_pop_finalizer& operator=( const micro_queue_pop_finalizer& ) = delete; + + ~micro_queue_pop_finalizer() { + padded_page* p = my_page; + if( is_valid_page(p) ) { + spin_mutex::scoped_lock lock( my_queue.page_mutex ); + padded_page* q = p->next; + my_queue.head_page.store(q, std::memory_order_relaxed); + if( !is_valid_page(q) ) { + my_queue.tail_page.store(nullptr, std::memory_order_relaxed); + } + } + my_queue.head_counter.store(my_ticket_type, std::memory_order_release); + if ( is_valid_page(p) ) { + allocator_traits_type::destroy(allocator, static_cast(p)); + allocator_traits_type::deallocate(allocator, static_cast(p), 1); + } + } +private: + ticket_type my_ticket_type; + Container& my_queue; + padded_page* my_page; + Allocator& allocator; +}; // class micro_queue_pop_finalizer + +#if _MSC_VER && !defined(__INTEL_COMPILER) +// structure was padded due to alignment specifier +#pragma warning( push ) +#pragma warning( disable: 4324 ) +#endif + +template +struct concurrent_queue_rep { + using self_type = concurrent_queue_rep; + using size_type = std::size_t; + using micro_queue_type = micro_queue; + using allocator_type = Allocator; + using allocator_traits_type = tbb::detail::allocator_traits; + using padded_page = typename micro_queue_type::padded_page; + using page_allocator_type = typename micro_queue_type::page_allocator_type; + using item_constructor_type = typename micro_queue_type::item_constructor_type; +private: + using page_allocator_traits = tbb::detail::allocator_traits; + using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; + +public: + // must be power of 2 + static constexpr size_type n_queue = 8; + // Approximately n_queue/golden ratio + static constexpr size_type phi = 3; + static constexpr size_type item_size = micro_queue_type::item_size; + static constexpr size_type items_per_page = micro_queue_type::items_per_page; + + concurrent_queue_rep() {} + + concurrent_queue_rep( const concurrent_queue_rep& ) = delete; + concurrent_queue_rep& operator=( const concurrent_queue_rep& ) = delete; + + void clear( queue_allocator_type& alloc ) { + for (size_type index = 0; index < n_queue; ++index) { + array[index].clear(alloc); + } + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + n_invalid_entries.store(0, std::memory_order_relaxed); + } + + void assign( const concurrent_queue_rep& src, queue_allocator_type& alloc, item_constructor_type construct_item ) { + head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); + n_invalid_entries.store(src.n_invalid_entries.load(std::memory_order_relaxed), std::memory_order_relaxed); + + // copy or move micro_queues + size_type queue_idx = 0; + try_call( [&] { + for (; queue_idx < n_queue; ++queue_idx) { + array[queue_idx].assign(src.array[queue_idx], alloc, construct_item); + } + }).on_exception( [&] { + for (size_type i = 0; i < queue_idx + 1; ++i) { + array[i].clear_and_invalidate(alloc); + } + head_counter.store(0, std::memory_order_relaxed); + tail_counter.store(0, std::memory_order_relaxed); + n_invalid_entries.store(0, std::memory_order_relaxed); + }); + + __TBB_ASSERT(head_counter.load(std::memory_order_relaxed) == src.head_counter.load(std::memory_order_relaxed) && + tail_counter.load(std::memory_order_relaxed) == 
src.tail_counter.load(std::memory_order_relaxed), + "the source concurrent queue should not be concurrently modified." ); + } + + bool empty() const { + ticket_type tc = tail_counter.load(std::memory_order_acquire); + ticket_type hc = head_counter.load(std::memory_order_relaxed); + // if tc!=r.tail_counter, the queue was not empty at some point between the two reads. + return tc == tail_counter.load(std::memory_order_relaxed) && + std::ptrdiff_t(tc - hc - n_invalid_entries.load(std::memory_order_relaxed)) <= 0; + } + + std::ptrdiff_t size() const { + __TBB_ASSERT(sizeof(std::ptrdiff_t) <= sizeof(size_type), nullptr); + std::ptrdiff_t hc = head_counter.load(std::memory_order_acquire); + std::ptrdiff_t tc = tail_counter.load(std::memory_order_relaxed); + std::ptrdiff_t nie = n_invalid_entries.load(std::memory_order_relaxed); + + return tc - hc - nie; + } + + friend class micro_queue; + + // Map ticket_type to an array index + static size_type index( ticket_type k ) { + return k * phi % n_queue; + } + + micro_queue_type& choose( ticket_type k ) { + // The formula here approximates LRU in a cache-oblivious way. + return array[index(k)]; + } + + alignas(max_nfs_size) micro_queue_type array[n_queue]; + + alignas(max_nfs_size) std::atomic head_counter{}; + alignas(max_nfs_size) std::atomic tail_counter{}; + alignas(max_nfs_size) std::atomic n_invalid_entries{}; +}; // class concurrent_queue_rep + +#if _MSC_VER && !defined(__INTEL_COMPILER) +#pragma warning( pop ) +#endif + +template +class concurrent_queue_iterator_base { + using queue_rep_type = concurrent_queue_rep; + using padded_page = typename queue_rep_type::padded_page; +protected: + concurrent_queue_iterator_base() = default; + + concurrent_queue_iterator_base( const concurrent_queue_iterator_base& other ) { + assign(other); + } + + concurrent_queue_iterator_base( queue_rep_type* queue_rep ) + : my_queue_rep(queue_rep), + my_head_counter(my_queue_rep->head_counter.load(std::memory_order_relaxed)) + { + for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) { + my_array[i] = my_queue_rep->array[i].get_head_page(); + } + + if (!get_item(my_item, my_head_counter)) advance(); + } + + void assign( const concurrent_queue_iterator_base& other ) { + my_item = other.my_item; + my_queue_rep = other.my_queue_rep; + + if (my_queue_rep != nullptr) { + my_head_counter = other.my_head_counter; + + for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) { + my_array[i] = other.my_array[i]; + } + } + } + + void advance() { + __TBB_ASSERT(my_item, "Attempt to increment iterator past end of the queue"); + std::size_t k = my_head_counter; +#if TBB_USE_ASSERT + Value* tmp; + get_item(tmp, k); + __TBB_ASSERT(my_item == tmp, nullptr); +#endif + std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page); + if (i == my_queue_rep->items_per_page - 1) { + padded_page*& root = my_array[queue_rep_type::index(k)]; + root = root->next; + } + // Advance k + my_head_counter = ++k; + if (!get_item(my_item, k)) advance(); + } + + concurrent_queue_iterator_base& operator=( const concurrent_queue_iterator_base& other ) { + this->assign(other); + return *this; + } + + bool get_item( Value*& item, std::size_t k ) { + if (k == my_queue_rep->tail_counter.load(std::memory_order_relaxed)) { + item = nullptr; + return true; + } else { + padded_page* p = my_array[queue_rep_type::index(k)]; + __TBB_ASSERT(p, nullptr); + std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page); + item = &(*p)[i]; + return 
(p->mask & uintptr_t(1) << i) != 0; + } + } + + Value* my_item{ nullptr }; + queue_rep_type* my_queue_rep{ nullptr }; + ticket_type my_head_counter{}; + padded_page* my_array[queue_rep_type::n_queue]{}; +}; // class concurrent_queue_iterator_base + +struct concurrent_queue_iterator_provider { + template + static Iterator get( const Container& container ) { + return Iterator(container); + } +}; // struct concurrent_queue_iterator_provider + +template +class concurrent_queue_iterator : public concurrent_queue_iterator_base::type, Allocator> { + using base_type = concurrent_queue_iterator_base::type, Allocator>; +public: + using value_type = Value; + using pointer = value_type*; + using reference = value_type&; + using difference_type = std::ptrdiff_t; + using iterator_category = std::forward_iterator_tag; + + concurrent_queue_iterator() = default; + + /** If Value==Container::value_type, then this routine is the copy constructor. + If Value==const Container::value_type, then this routine is a conversion constructor. */ + concurrent_queue_iterator( const concurrent_queue_iterator& other ) + : base_type(other) {} + +private: + concurrent_queue_iterator( const Container& container ) + : base_type(container.my_queue_representation) {} +public: + concurrent_queue_iterator& operator=( const concurrent_queue_iterator& other ) { + this->assign(other); + return *this; + } + + reference operator*() const { + return *static_cast(this->my_item); + } + + pointer operator->() const { return &operator*(); } + + concurrent_queue_iterator& operator++() { + this->advance(); + return *this; + } + + concurrent_queue_iterator operator++(int) { + concurrent_queue_iterator tmp = *this; + ++*this; + return tmp; + } + + friend bool operator==( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) { + return lhs.my_item == rhs.my_item; + } + + friend bool operator!=( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) { + return lhs.my_item != rhs.my_item; + } +private: + friend struct concurrent_queue_iterator_provider; +}; // class concurrent_queue_iterator + +} // namespace d2 +} // namespace detail +} // tbb + +#endif // __TBB_detail__concurrent_queue_base_H diff --git a/third_party/tbb/detail/_concurrent_skip_list.h b/third_party/tbb/detail/_concurrent_skip_list.h new file mode 100644 index 000000000..df1f80f07 --- /dev/null +++ b/third_party/tbb/detail/_concurrent_skip_list.h @@ -0,0 +1,1291 @@ +// clang-format off +/* + Copyright (c) 2019-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__concurrent_skip_list_H +#define __TBB_detail__concurrent_skip_list_H + +#if !defined(__TBB_concurrent_map_H) && !defined(__TBB_concurrent_set_H) +#error Do not #include this internal file directly; use public TBB headers instead. 
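As the guard above states, this header is reached only through the public containers; concurrent_map and concurrent_set are thin traits wrappers around the skip list defined below. A hedged usage sketch of that public surface, assuming the header paths as vendored in this patch and the usual oneTBB semantics:

#include <cstdio>
#include "third_party/tbb/concurrent_set.h"
#include "third_party/tbb/parallel_for.h"

int main() {
    tbb::concurrent_set<int> s;  // ordered container backed by concurrent_skip_list

    // Concurrent insertion is thread-safe; duplicates are rejected because the
    // set is a unique container (allow_multimapping == false).
    tbb::parallel_for(0, 1000, [&](int i) { s.insert(i % 100); });

    std::printf("size = %zu, contains(42) = %d\n", s.size(), int(s.contains(42)));
    return 0;
}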
+#endif + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_node_handle.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/enumerable_thread_specific.h" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/array" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/random" // Need std::geometric_distribution +#include "third_party/libcxx/algorithm" // Need std::equal and std::lexicographical_compare +#include "third_party/libcxx/cstdint" +#if __TBB_CPP20_COMPARISONS_PRESENT +#include "third_party/libcxx/compare" +#endif + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4127) // warning C4127: conditional expression is constant +#endif + +namespace tbb { +namespace detail { +namespace d2 { + +template +class skip_list_node { + using node_ptr = skip_list_node*; +public: + using value_type = Value; + using atomic_node_ptr = std::atomic; + using size_type = std::size_t; + using container_allocator_type = Allocator; + + using reference = value_type&; + using const_reference = const value_type&; +private: + using allocator_traits = tbb::detail::allocator_traits; + + // Allocator is the same as the container allocator=> allocates unitptr_t + // It is required to rebind it to value_type to get the correct pointer and const_pointer + using value_allocator_traits = typename allocator_traits::template rebind_traits; +public: + using pointer = typename value_allocator_traits::pointer; + using const_pointer = typename value_allocator_traits::const_pointer; + + //In perfect world these constructor and destructor would have been private, + //however this seems technically impractical due to use of allocator_traits. 
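The create()/destroy() pair that follows implements a manual flexible-array layout: a single allocation holds the node header immediately followed by height atomic next pointers, and get_atomic_next() reaches them by treating this + 1 as the start of that array. A minimal sketch of the same layout trick using plain operator new (illustrative only; the real code goes through allocator_traits and a byte allocator):

#include <atomic>
#include <cstddef>
#include <memory>
#include <new>

struct node {
    std::size_t height;
    // The per-level pointers live directly behind the node object.
    std::atomic<node*>* levels() { return reinterpret_cast<std::atomic<node*>*>(this + 1); }
};

static node* make_node(std::size_t height) {
    static_assert(alignof(node) >= alignof(std::atomic<node*>), "layout assumption");
    void* raw = ::operator new(sizeof(node) + height * sizeof(std::atomic<node*>));
    node* n = new (raw) node{height};
    for (std::size_t l = 0; l < height; ++l)
        new (&n->levels()[l]) std::atomic<node*>(nullptr);  // construct each level pointer
    return n;
}

static void free_node(node* n) {
    for (std::size_t l = 0; l < n->height; ++l)
        std::destroy_at(&n->levels()[l]);
    n->~node();
    ::operator delete(n);
}

int main() {
    node* n = make_node(4);
    n->levels()[0].store(nullptr, std::memory_order_relaxed);
    free_node(n);
    return 0;
}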
+ + //Should not be called directly, instead use create method + skip_list_node( size_type levels ) + : my_height(levels), my_index_number(0) + {} + + //Should not be called directly, instead use destroy method + ~skip_list_node() {} + + skip_list_node( const skip_list_node& ) = delete; + skip_list_node( skip_list_node&& ) = delete; + skip_list_node& operator=( const skip_list_node& ) = delete; + skip_list_node& operator=( skip_list_node&& ) = delete; + + static skip_list_node* create( container_allocator_type& alloc, size_type height ) { + size_type sz = calc_node_size(height); + static_assert(std::is_same::value, "skip_list_node assumes that passed in allocator operates on bytes"); + auto* node = reinterpret_cast(allocator_traits::allocate(alloc, sz)); + + //Construct the node itself + allocator_traits::construct(alloc, node, height); + + //Construct the level pointers + for (size_type l = 0; l < height; ++l) { + allocator_traits::construct(alloc, &node->get_atomic_next(l), nullptr); + } + + return node; + } + + static void destroy( container_allocator_type& alloc, skip_list_node* node ) { + //Destroy the level pointers + for (size_type l = 0; l < node->height(); ++l) { + allocator_traits::destroy(alloc, &node->atomic_next(l)); + } + size_type sz = calc_node_size(node->height()); + // Destroy the node itself + allocator_traits::destroy(alloc, node); + + // Deallocate the node + allocator_traits::deallocate(alloc, reinterpret_cast(node), sz); + } + + + pointer storage() { + return &my_value; + } + + reference value() { + return *storage(); + } + + node_ptr next( size_type level ) const { + node_ptr res = get_atomic_next(level).load(std::memory_order_acquire); + __TBB_ASSERT(res == nullptr || res->height() > level, "Broken internal structure"); + return res; + } + + atomic_node_ptr& atomic_next( size_type level ) { + atomic_node_ptr& res = get_atomic_next(level); +#if TBB_USE_DEBUG + node_ptr node = res.load(std::memory_order_acquire); + __TBB_ASSERT(node == nullptr || node->height() > level, "Broken internal structure"); +#endif + return res; + } + + void set_next( size_type level, node_ptr n ) { + __TBB_ASSERT(n == nullptr || n->height() > level, "Broken internal structure"); + get_atomic_next(level).store(n, std::memory_order_relaxed); + } + + size_type height() const { + return my_height; + } + + void set_index_number( size_type index_num ) { + my_index_number = index_num; + } + + size_type index_number() const { + return my_index_number; + } + +private: + static size_type calc_node_size( size_type height ) { + static_assert(alignof(skip_list_node) >= alignof(atomic_node_ptr), "Incorrect alignment"); + return sizeof(skip_list_node) + height * sizeof(atomic_node_ptr); + } + + atomic_node_ptr& get_atomic_next( size_type level ) { + atomic_node_ptr* arr = reinterpret_cast(this + 1); + return arr[level]; + } + + const atomic_node_ptr& get_atomic_next( size_type level ) const { + const atomic_node_ptr* arr = reinterpret_cast(this + 1); + return arr[level]; + } + + union { + value_type my_value; + }; + size_type my_height; + size_type my_index_number; +}; // class skip_list_node + +template +class skip_list_iterator { + using node_type = NodeType; + using node_ptr = node_type*; +public: + using iterator_category = std::forward_iterator_tag; + using value_type = ValueType; + + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + skip_list_iterator() : skip_list_iterator(nullptr) {} + + skip_list_iterator( const skip_list_iterator& other ) 
+ : my_node_ptr(other.my_node_ptr) {} + + skip_list_iterator& operator=( const skip_list_iterator& other ) { + my_node_ptr = other.my_node_ptr; + return *this; + } + + reference operator*() const { return my_node_ptr->value(); } + pointer operator->() const { return my_node_ptr->storage(); } + + skip_list_iterator& operator++() { + __TBB_ASSERT(my_node_ptr != nullptr, nullptr); + my_node_ptr = my_node_ptr->next(0); + return *this; + } + + skip_list_iterator operator++(int) { + skip_list_iterator tmp = *this; + ++*this; + return tmp; + } + +private: + skip_list_iterator(node_type* n) : my_node_ptr(n) {} + + node_ptr my_node_ptr; + + template + friend class concurrent_skip_list; + + template + friend class skip_list_iterator; + + friend class const_range; + friend class range; + + friend bool operator==( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) { + return lhs.my_node_ptr == rhs.my_node_ptr; + } + + friend bool operator!=( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) { + return lhs.my_node_ptr != rhs.my_node_ptr; + } +}; // class skip_list_iterator + +template +class concurrent_skip_list { +protected: + using container_traits = Traits; + using self_type = concurrent_skip_list; + using allocator_type = typename container_traits::allocator_type; + using allocator_traits_type = tbb::detail::allocator_traits; + using key_compare = typename container_traits::compare_type; + using value_compare = typename container_traits::value_compare; + using key_type = typename container_traits::key_type; + using value_type = typename container_traits::value_type; + static_assert(std::is_same::value, + "value_type of the container should be the same as its allocator"); + + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + static constexpr size_type max_level = container_traits::max_level; + + using node_allocator_type = typename allocator_traits_type::template rebind_alloc; + using node_allocator_traits = tbb::detail::allocator_traits; + + using list_node_type = skip_list_node; + using node_type = d1::node_handle; + + using iterator = skip_list_iterator; + using const_iterator = skip_list_iterator; + + using reference = value_type&; + using const_reference = const value_type&; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using random_level_generator_type = typename container_traits::random_level_generator_type; + + using node_ptr = list_node_type*; + + using array_type = std::array; +private: + template + using is_transparent = dependent_bool, T>; +public: + static constexpr bool allow_multimapping = container_traits::allow_multimapping; + + concurrent_skip_list() : my_head_ptr(nullptr), my_size(0), my_max_height(0) {} + + explicit concurrent_skip_list( const key_compare& comp, const allocator_type& alloc = allocator_type() ) + : my_node_allocator(alloc), my_compare(comp), my_head_ptr(nullptr), my_size(0), my_max_height(0) {} + + explicit concurrent_skip_list( const allocator_type& alloc ) + : concurrent_skip_list(key_compare(), alloc) {} + + template + concurrent_skip_list( InputIterator first, InputIterator last, const key_compare& comp = key_compare(), + const allocator_type& alloc = allocator_type() ) + : concurrent_skip_list(comp, alloc) + { + internal_copy(first, last); + } + + template + concurrent_skip_list( InputIterator first, InputIterator last, const allocator_type& alloc ) + : concurrent_skip_list(first, last, key_compare(), alloc) {} + + 
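The constructor family above mirrors std::set: comparator, allocator, iterator-range, and initializer_list forms. A short hedged example of those forms through the public concurrent_set wrapper (assuming, as elsewhere in this patch, that the wrapper simply forwards to these constructors):

#include "third_party/tbb/concurrent_set.h"
#include <cstdio>
#include <functional>
#include <vector>

int main() {
    // initializer_list plus a custom comparator, as declared above
    tbb::concurrent_set<int, std::greater<int>> desc({3, 1, 4, 1, 5}, std::greater<int>());

    // iterator-range constructor
    std::vector<int> v{10, 20, 30};
    tbb::concurrent_set<int> from_range(v.begin(), v.end());

    std::printf("largest = %d, copied = %zu\n", *desc.begin(), from_range.size());
    return 0;
}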
concurrent_skip_list( std::initializer_list init, const key_compare& comp = key_compare(), + const allocator_type& alloc = allocator_type() ) + : concurrent_skip_list(init.begin(), init.end(), comp, alloc) {} + + concurrent_skip_list( std::initializer_list init, const allocator_type& alloc ) + : concurrent_skip_list(init, key_compare(), alloc) {} + + concurrent_skip_list( const concurrent_skip_list& other ) + : my_node_allocator(node_allocator_traits::select_on_container_copy_construction(other.get_allocator())), + my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr), + my_size(0), my_max_height(0) + { + internal_copy(other); + __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container"); + } + + concurrent_skip_list( const concurrent_skip_list& other, const allocator_type& alloc ) + : my_node_allocator(alloc), my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr), + my_size(0), my_max_height(0) + { + internal_copy(other); + __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container"); + } + + concurrent_skip_list( concurrent_skip_list&& other ) + : my_node_allocator(std::move(other.my_node_allocator)), my_compare(other.my_compare), + my_rng(std::move(other.my_rng)), my_head_ptr(nullptr) // my_head_ptr would be stored in internal_move + { + internal_move(std::move(other)); + } + + concurrent_skip_list( concurrent_skip_list&& other, const allocator_type& alloc ) + : my_node_allocator(alloc), my_compare(other.my_compare), + my_rng(std::move(other.my_rng)), my_head_ptr(nullptr) + { + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_move_construct_with_allocator(std::move(other), is_always_equal()); + } + + ~concurrent_skip_list() { + clear(); + delete_head(); + } + + concurrent_skip_list& operator=( const concurrent_skip_list& other ) { + if (this != &other) { + clear(); + copy_assign_allocators(my_node_allocator, other.my_node_allocator); + my_compare = other.my_compare; + my_rng = other.my_rng; + internal_copy(other); + } + return *this; + } + + concurrent_skip_list& operator=( concurrent_skip_list&& other ) { + if (this != &other) { + clear(); + delete_head(); + + my_compare = std::move(other.my_compare); + my_rng = std::move(other.my_rng); + + move_assign_allocators(my_node_allocator, other.my_node_allocator); + using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment; + using is_always_equal = typename node_allocator_traits::is_always_equal; + internal_move_assign(std::move(other), tbb::detail::disjunction()); + } + return *this; + } + + concurrent_skip_list& operator=( std::initializer_list il ) + { + clear(); + insert(il.begin(),il.end()); + return *this; + } + + std::pair insert( const value_type& value ) { + return internal_insert(value); + } + + std::pair insert( value_type&& value ) { + return internal_insert(std::move(value)); + } + + iterator insert( const_iterator, const_reference value ) { + // Ignore hint + return insert(value).first; + } + + iterator insert( const_iterator, value_type&& value ) { + // Ignore hint + return insert(std::move(value)).first; + } + + template + void insert( InputIterator first, InputIterator last ) { + while (first != last) { + insert(*first); + ++first; + } + } + + void insert( std::initializer_list init ) { + insert(init.begin(), init.end()); + } + + std::pair insert( node_type&& nh ) { + if (!nh.empty()) { + auto insert_node = d1::node_handle_accessor::get_node_ptr(nh); + std::pair insert_result = 
internal_insert_node(insert_node); + if (insert_result.second) { + d1::node_handle_accessor::deactivate(nh); + } + return insert_result; + } + return std::pair(end(), false); + } + + iterator insert( const_iterator, node_type&& nh ) { + // Ignore hint + return insert(std::move(nh)).first; + } + + template + std::pair emplace( Args&&... args ) { + return internal_insert(std::forward(args)...); + } + + template + iterator emplace_hint( const_iterator, Args&&... args ) { + // Ignore hint + return emplace(std::forward(args)...).first; + } + + iterator unsafe_erase( iterator pos ) { + std::pair extract_result = internal_extract(pos); + if (extract_result.first) { // node was extracted + delete_value_node(extract_result.first); + return extract_result.second; + } + return end(); + } + + iterator unsafe_erase( const_iterator pos ) { + return unsafe_erase(get_iterator(pos)); + } + + iterator unsafe_erase( const_iterator first, const_iterator last ) { + while (first != last) { + // Unsafe erase returns the iterator which follows the erased one + first = unsafe_erase(first); + } + return get_iterator(first); + } + + size_type unsafe_erase( const key_type& key ) { + return internal_erase(key); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + size_type>::type unsafe_erase( const K& key ) + { + return internal_erase(key); + } + + node_type unsafe_extract( const_iterator pos ) { + std::pair extract_result = internal_extract(pos); + return extract_result.first ? d1::node_handle_accessor::construct(extract_result.first) : node_type(); + } + + node_type unsafe_extract( iterator pos ) { + return unsafe_extract(const_iterator(pos)); + } + + node_type unsafe_extract( const key_type& key ) { + return unsafe_extract(find(key)); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + node_type>::type unsafe_extract( const K& key ) + { + return unsafe_extract(find(key)); + } + + iterator lower_bound( const key_type& key ) { + return iterator(internal_get_bound(key, my_compare)); + } + + const_iterator lower_bound( const key_type& key ) const { + return const_iterator(internal_get_bound(key, my_compare)); + } + + template + typename std::enable_if::value, iterator>::type lower_bound( const K& key ) { + return iterator(internal_get_bound(key, my_compare)); + } + + template + typename std::enable_if::value, const_iterator>::type lower_bound( const K& key ) const { + return const_iterator(internal_get_bound(key, my_compare)); + } + + iterator upper_bound( const key_type& key ) { + return iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + const_iterator upper_bound( const key_type& key ) const { + return const_iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + template + typename std::enable_if::value, iterator>::type upper_bound( const K& key ) { + return iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + template + typename std::enable_if::value, const_iterator>::type upper_bound( const K& key ) const { + return const_iterator(internal_get_bound(key, not_greater_compare(my_compare))); + } + + iterator find( const key_type& key ) { + return iterator(internal_find(key)); + } + + const_iterator find( const key_type& key ) const { + return const_iterator(internal_find(key)); + } + + template + typename std::enable_if::value, iterator>::type find( const K& key ) { + return iterator(internal_find(key)); + } + + 
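The template overloads above are enabled only for transparent comparators (is_transparent), which allows lookups with a key-like type that is never converted to key_type. A hedged sketch through the public concurrent_set wrapper, assuming std::less<> as the transparent comparator:

#include "third_party/tbb/concurrent_set.h"
#include <cstdio>
#include <string>

int main() {
    // std::less<> defines is_transparent, so the heterogeneous find/count/contains
    // overloads participate and the const char* probe is compared directly.
    tbb::concurrent_set<std::string, std::less<>> names;
    names.insert("alice");
    names.insert("bob");

    const char* probe = "alice";
    std::printf("contains = %d, count = %zu\n", int(names.contains(probe)), names.count("bob"));
    return 0;
}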
template + typename std::enable_if::value, const_iterator>::type find( const K& key ) const { + return const_iterator(internal_find(key)); + } + + size_type count( const key_type& key ) const { + return internal_count(key); + } + + template + typename std::enable_if::value, size_type>::type count( const K& key ) const { + return internal_count(key); + } + + bool contains( const key_type& key ) const { + return find(key) != end(); + } + + template + typename std::enable_if::value, bool>::type contains( const K& key ) const { + return find(key) != end(); + } + + void clear() noexcept { + // clear is not thread safe - load can be relaxed + node_ptr head = my_head_ptr.load(std::memory_order_relaxed); + + if (head == nullptr) return; // Head is not allocated => container is empty + + node_ptr current = head->next(0); + + // Delete all value nodes in the container + while (current) { + node_ptr next = current->next(0); + delete_value_node(current); + current = next; + } + + for (size_type level = 0; level < head->height(); ++level) { + head->set_next(level, nullptr); + } + + my_size.store(0, std::memory_order_relaxed); + my_max_height.store(0, std::memory_order_relaxed); + } + + iterator begin() { + return iterator(internal_begin()); + } + + const_iterator begin() const { + return const_iterator(internal_begin()); + } + + const_iterator cbegin() const { + return const_iterator(internal_begin()); + } + + iterator end() { + return iterator(nullptr); + } + + const_iterator end() const { + return const_iterator(nullptr); + } + + const_iterator cend() const { + return const_iterator(nullptr); + } + + size_type size() const { + return my_size.load(std::memory_order_relaxed); + } + + size_type max_size() const { + return node_allocator_traits::max_size(my_node_allocator); + } + + __TBB_nodiscard bool empty() const { + return 0 == size(); + } + + allocator_type get_allocator() const { + return my_node_allocator; + } + + void swap(concurrent_skip_list& other) { + if (this != &other) { + using pocs_type = typename node_allocator_traits::propagate_on_container_swap; + using is_always_equal = typename node_allocator_traits::is_always_equal; + internal_swap(other, tbb::detail::disjunction()); + } + } + + std::pair equal_range(const key_type& key) { + return internal_equal_range(key); + } + + std::pair equal_range(const key_type& key) const { + return internal_equal_range(key); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) { + return internal_equal_range(key); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) const { + return internal_equal_range(key); + } + + key_compare key_comp() const { return my_compare; } + + value_compare value_comp() const { return container_traits::value_comp(my_compare); } + + class const_range_type { + public: + using size_type = typename concurrent_skip_list::size_type; + using difference_type = typename concurrent_skip_list::difference_type; + using iterator = typename concurrent_skip_list::const_iterator; + using value_type = typename iterator::value_type; + using reference = typename iterator::reference; + + bool empty() const { + return my_begin.my_node_ptr ? (my_begin.my_node_ptr->next(0) == my_end.my_node_ptr) + : true; + } + + bool is_divisible() const { + return my_begin.my_node_ptr && my_level != 0 + ? 
my_begin.my_node_ptr->next(my_level - 1) != my_end.my_node_ptr + : false; + } + + size_type size() const { return std::distance(my_begin, my_end); } + + const_range_type( const_range_type& r, split) + : my_end(r.my_end) { + if (r.empty()) { + __TBB_ASSERT(my_end.my_node_ptr == nullptr, nullptr); + my_begin = my_end; + my_level = 0; + } else { + my_begin = iterator(r.my_begin.my_node_ptr->next(r.my_level - 1)); + my_level = my_begin.my_node_ptr->height(); + } + r.my_end = my_begin; + } + + const_range_type( const concurrent_skip_list& l) + : my_end(l.end()), my_begin(l.begin()), + my_level(my_begin.my_node_ptr ? my_begin.my_node_ptr->height() : 0) {} + + iterator begin() const { return my_begin; } + iterator end() const { return my_end; } + size_type grainsize() const { return 1; } + + private: + const_iterator my_end; + const_iterator my_begin; + size_type my_level; + }; // class const_range_type + + class range_type : public const_range_type { + public: + using iterator = typename concurrent_skip_list::iterator; + using value_type = typename iterator::value_type; + using reference = typename iterator::reference; + + range_type(range_type& r, split) : const_range_type(r, split()) {} + range_type(const concurrent_skip_list& l) : const_range_type(l) {} + + iterator begin() const { + node_ptr node = const_range_type::begin().my_node_ptr; + return iterator(node); + } + + iterator end() const { + node_ptr node = const_range_type::end().my_node_ptr; + return iterator(node); + } + }; // class range_type + + range_type range() { return range_type(*this); } + const_range_type range() const { return const_range_type(*this); } + +private: + node_ptr internal_begin() const { + node_ptr head = get_head(); + return head == nullptr ? head : head->next(0); + } + + void internal_move(concurrent_skip_list&& other) { + my_head_ptr.store(other.my_head_ptr.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_head_ptr.store(nullptr, std::memory_order_relaxed); + + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(0, std::memory_order_relaxed); + + my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_max_height.store(0, std::memory_order_relaxed); + } + + void internal_move_construct_with_allocator(concurrent_skip_list&& other, + /*is_always_equal = */std::true_type) { + internal_move(std::move(other)); + } + + void internal_move_construct_with_allocator(concurrent_skip_list&& other, + /*is_always_equal = */std::false_type) { + if (my_node_allocator == other.get_allocator()) { + internal_move(std::move(other)); + } else { + my_size.store(0, std::memory_order_relaxed); + my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed); + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end())); + } + } + + static const key_type& get_key( node_ptr n ) { + __TBB_ASSERT(n, nullptr); + return container_traits::get_key(static_cast(n)->value()); + } + + template + bool found( node_ptr node, const K& key ) const { + return node != nullptr && !my_compare(key, get_key(node)); + } + + template + node_ptr internal_find(const K& key) const { + return allow_multimapping ? 
internal_find_multi(key) : internal_find_unique(key); + } + + template + node_ptr internal_find_multi( const K& key ) const { + node_ptr prev = get_head(); + if (prev == nullptr) return nullptr; // If the head node is not allocated - exit + + node_ptr curr = nullptr; + node_ptr old_curr = curr; + + for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) { + curr = internal_find_position(h - 1, prev, key, my_compare); + + if (curr != old_curr && found(curr, key)) { + return curr; + } + old_curr = curr; + } + return nullptr; + } + + template + node_ptr internal_find_unique( const K& key ) const { + const_iterator it = lower_bound(key); + return (it == end() || my_compare(key, container_traits::get_key(*it))) ? nullptr : it.my_node_ptr; + } + + template + size_type internal_count( const K& key ) const { + if (allow_multimapping) { + // TODO: reimplement without double traversal + std::pair r = equal_range(key); + return std::distance(r.first, r.second); + } + return size_type(contains(key) ? 1 : 0); + } + + template + std::pair internal_equal_range(const K& key) const { + iterator lb = get_iterator(lower_bound(key)); + auto result = std::make_pair(lb, lb); + + // If the lower bound points to the node with the requested key + if (found(lb.my_node_ptr, key)) { + + if (!allow_multimapping) { + // For unique containers - move the second iterator forward and exit + ++result.second; + } else { + // For multi containers - find the upper bound starting from the lower bound + node_ptr prev = lb.my_node_ptr; + node_ptr curr = nullptr; + not_greater_compare cmp(my_compare); + + // Start from the lower bound of the range + for (size_type h = prev->height(); h > 0; --h) { + curr = prev->next(h - 1); + while (curr && cmp(get_key(curr), key)) { + prev = curr; + // If the height of the next node is greater than the current one - jump to its height + if (h < curr->height()) { + h = curr->height(); + } + curr = prev->next(h - 1); + } + } + result.second = iterator(curr); + } + } + + return result; + } + + // Finds position on the level using comparator cmp starting from the node prev + template + node_ptr internal_find_position( size_type level, node_ptr& prev, const K& key, + const Comparator& cmp ) const { + __TBB_ASSERT(level < prev->height(), "Wrong level to find position"); + node_ptr curr = prev->next(level); + + while (curr && cmp(get_key(curr), key)) { + prev = curr; + __TBB_ASSERT(level < prev->height(), nullptr); + curr = prev->next(level); + } + + return curr; + } + + // The same as previous overload, but allows index_number comparison + template + node_ptr internal_find_position( size_type level, node_ptr& prev, node_ptr node, + const Comparator& cmp ) const { + __TBB_ASSERT(level < prev->height(), "Wrong level to find position"); + node_ptr curr = prev->next(level); + + while (curr && cmp(get_key(curr), get_key(node))) { + if (allow_multimapping && cmp(get_key(node), get_key(curr)) && curr->index_number() > node->index_number()) { + break; + } + + prev = curr; + __TBB_ASSERT(level < prev->height(), nullptr); + curr = prev->next(level); + } + return curr; + } + + template + void fill_prev_curr_arrays(array_type& prev_nodes, array_type& curr_nodes, node_ptr node, const key_type& key, + const Comparator& cmp, node_ptr head ) { + + size_type curr_max_height = my_max_height.load(std::memory_order_acquire); + size_type node_height = node->height(); + if (curr_max_height < node_height) { + std::fill(prev_nodes.begin() + curr_max_height, prev_nodes.begin() + node_height, head); + 
std::fill(curr_nodes.begin() + curr_max_height, curr_nodes.begin() + node_height, nullptr); + } + + node_ptr prev = head; + for (size_type level = curr_max_height; level > 0; --level) { + node_ptr curr = internal_find_position(level - 1, prev, key, cmp); + prev_nodes[level - 1] = prev; + curr_nodes[level - 1] = curr; + } + } + + void fill_prev_array_for_existing_node( array_type& prev_nodes, node_ptr node ) { + node_ptr head = create_head_if_necessary(); + prev_nodes.fill(head); + + node_ptr prev = head; + for (size_type level = node->height(); level > 0; --level) { + while (prev->next(level - 1) != node) { + prev = prev->next(level - 1); + } + prev_nodes[level - 1] = prev; + } + } + + struct not_greater_compare { + const key_compare& my_less_compare; + + not_greater_compare( const key_compare& less_compare ) : my_less_compare(less_compare) {} + + template + bool operator()( const K1& first, const K2& second ) const { + return !my_less_compare(second, first); + } + }; + + not_greater_compare select_comparator( /*allow_multimapping = */ std::true_type ) { + return not_greater_compare(my_compare); + } + + key_compare select_comparator( /*allow_multimapping = */ std::false_type ) { + return my_compare; + } + + template + std::pair internal_insert( Args&&... args ) { + node_ptr new_node = create_value_node(std::forward(args)...); + std::pair insert_result = internal_insert_node(new_node); + if (!insert_result.second) { + delete_value_node(new_node); + } + return insert_result; + } + + std::pair internal_insert_node( node_ptr new_node ) { + array_type prev_nodes; + array_type curr_nodes; + size_type new_height = new_node->height(); + auto compare = select_comparator(std::integral_constant{}); + + node_ptr head_node = create_head_if_necessary(); + + for (;;) { + fill_prev_curr_arrays(prev_nodes, curr_nodes, new_node, get_key(new_node), compare, head_node); + + node_ptr prev = prev_nodes[0]; + node_ptr next = curr_nodes[0]; + + if (allow_multimapping) { + new_node->set_index_number(prev->index_number() + 1); + } else { + if (found(next, get_key(new_node))) { + return std::pair(iterator(next), false); + } + } + + new_node->set_next(0, next); + if (!prev->atomic_next(0).compare_exchange_strong(next, new_node)) { + continue; + } + + // If the node was successfully linked on the first level - it will be linked on other levels + // Insertion cannot fail starting from this point + + // If the height of inserted node is greater than maximum - increase maximum + size_type max_height = my_max_height.load(std::memory_order_acquire); + for (;;) { + if (new_height <= max_height || my_max_height.compare_exchange_strong(max_height, new_height)) { + // If the maximum was successfully updated by current thread + // or by an other thread for the value, greater or equal to new_height + break; + } + } + + for (std::size_t level = 1; level < new_height; ++level) { + // Link the node on upper levels + for (;;) { + prev = prev_nodes[level]; + next = static_cast(curr_nodes[level]); + + new_node->set_next(level, next); + __TBB_ASSERT(new_node->height() > level, "Internal structure break"); + if (prev->atomic_next(level).compare_exchange_strong(next, new_node)) { + break; + } + + for (size_type lev = level; lev != new_height; ++lev ) { + curr_nodes[lev] = internal_find_position(lev, prev_nodes[lev], new_node, compare); + } + } + } + ++my_size; + return std::pair(iterator(new_node), true); + } + } + + template + node_ptr internal_get_bound( const K& key, const Comparator& cmp ) const { + node_ptr prev = get_head(); + if 
(prev == nullptr) return nullptr; // If the head node is not allocated - exit + + node_ptr curr = nullptr; + + for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) { + curr = internal_find_position(h - 1, prev, key, cmp); + } + + return curr; + } + + template + size_type internal_erase( const K& key ) { + auto eq = equal_range(key); + size_type old_size = size(); + unsafe_erase(eq.first, eq.second); + return old_size - size(); + } + + // Returns node_ptr to the extracted node and node_ptr to the next node after the extracted + std::pair internal_extract( const_iterator it ) { + std::pair result(nullptr, nullptr); + if ( it != end() ) { + array_type prev_nodes; + + node_ptr erase_node = it.my_node_ptr; + node_ptr next_node = erase_node->next(0); + fill_prev_array_for_existing_node(prev_nodes, erase_node); + + for (size_type level = 0; level < erase_node->height(); ++level) { + prev_nodes[level]->set_next(level, erase_node->next(level)); + erase_node->set_next(level, nullptr); + } + my_size.fetch_sub(1, std::memory_order_relaxed); + + result.first = erase_node; + result.second = next_node; + } + return result; + } + +protected: + template + void internal_merge( SourceType&& source ) { + using source_type = typename std::decay::type; + using source_iterator = typename source_type::iterator; + static_assert((std::is_same::value), "Incompatible containers cannot be merged"); + + for (source_iterator it = source.begin(); it != source.end();) { + source_iterator where = it++; + if (allow_multimapping || !contains(container_traits::get_key(*where))) { + node_type handle = source.unsafe_extract(where); + __TBB_ASSERT(!handle.empty(), "Extracted handle in merge is empty"); + + if (!insert(std::move(handle)).second) { + __TBB_ASSERT(!handle.empty(), "Handle should not be empty if insert fails"); + //If the insertion fails - return the node into source + source.insert(std::move(handle)); + } + __TBB_ASSERT(handle.empty(), "Node handle should be empty after the insertion"); + } + } + } + +private: + void internal_copy( const concurrent_skip_list& other ) { + internal_copy(other.begin(), other.end()); + } + + template + void internal_copy( Iterator first, Iterator last ) { + try_call([&] { + for (auto it = first; it != last; ++it) { + insert(*it); + } + }).on_exception([&] { + clear(); + delete_head(); + }); + } + + node_ptr create_node( size_type height ) { + return list_node_type::create(my_node_allocator, height); + } + + template + node_ptr create_value_node( Args&&... 
args ) { + node_ptr node = create_node(my_rng()); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + delete_node(node); + }); + + // Construct the value inside the node + node_allocator_traits::construct(my_node_allocator, node->storage(), std::forward(args)...); + value_guard.dismiss(); + return node; + } + + node_ptr create_head_node() { + return create_node(max_level); + } + + void delete_head() { + node_ptr head = my_head_ptr.load(std::memory_order_relaxed); + if (head != nullptr) { + delete_node(head); + my_head_ptr.store(nullptr, std::memory_order_relaxed); + } + } + + void delete_node( node_ptr node ) { + list_node_type::destroy(my_node_allocator, node); + } + + void delete_value_node( node_ptr node ) { + // Destroy the value inside the node + node_allocator_traits::destroy(my_node_allocator, node->storage()); + delete_node(node); + } + + node_ptr get_head() const { + return my_head_ptr.load(std::memory_order_acquire); + } + + node_ptr create_head_if_necessary() { + node_ptr current_head = get_head(); + if (current_head == nullptr) { + // Head node was not created - create it + node_ptr new_head = create_head_node(); + if (my_head_ptr.compare_exchange_strong(current_head, new_head)) { + current_head = new_head; + } else { + // If an other thread has already created the head node - destroy new_head + // current_head now points to the actual head node + delete_node(new_head); + } + } + __TBB_ASSERT(my_head_ptr.load(std::memory_order_relaxed) != nullptr, nullptr); + __TBB_ASSERT(current_head != nullptr, nullptr); + return current_head; + } + + static iterator get_iterator( const_iterator it ) { + return iterator(it.my_node_ptr); + } + + void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::true_type ) { + internal_move(std::move(other)); + } + + void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::false_type ) { + if (my_node_allocator == other.my_node_allocator) { + internal_move(std::move(other)); + } else { + internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end())); + } + } + + void internal_swap_fields( concurrent_skip_list& other ) { + using std::swap; + swap_allocators(my_node_allocator, other.my_node_allocator); + swap(my_compare, other.my_compare); + swap(my_rng, other.my_rng); + + swap_atomics_relaxed(my_head_ptr, other.my_head_ptr); + swap_atomics_relaxed(my_size, other.my_size); + swap_atomics_relaxed(my_max_height, other.my_max_height); + } + + void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::true_type ) { + internal_swap_fields(other); + } + + void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::false_type ) { + __TBB_ASSERT(my_node_allocator == other.my_node_allocator, "Swapping with unequal allocators is not allowed"); + internal_swap_fields(other); + } + + node_allocator_type my_node_allocator; + key_compare my_compare; + random_level_generator_type my_rng; + std::atomic my_head_ptr; + std::atomic my_size; + std::atomic my_max_height; + + template + friend class concurrent_skip_list; +}; // class concurrent_skip_list + +template +bool operator==( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + if (lhs.size() != rhs.size()) return false; +#if _MSC_VER + // Passing "unchecked" iterators to std::equal with 3 parameters + // causes compiler warnings. 
+ // The workaround is to use overload with 4 parameters, which is + // available since C++14 - minimally supported version on MSVC + return std::equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +#else + return std::equal(lhs.begin(), lhs.end(), rhs.begin()); +#endif +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return !(lhs == rhs); +} +#endif + +#if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT +template +tbb::detail::synthesized_three_way_result +operator<=>( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), + rhs.begin(), rhs.end(), + tbb::detail::synthesized_three_way_comparator{}); +} +#else +template +bool operator<( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + +template +bool operator>( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return rhs < lhs; +} + +template +bool operator<=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return !(rhs < lhs); +} + +template +bool operator>=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { + return !(lhs < rhs); +} +#endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT + +// Generates a number from the interval [0, MaxLevel). +template +class concurrent_geometric_level_generator { +public: + static constexpr std::size_t max_level = MaxLevel; + // TODO: modify the algorithm to accept other values of max_level + static_assert(max_level == 32, "Incompatible max_level for rng"); + + concurrent_geometric_level_generator() : engines(std::minstd_rand::result_type(time(nullptr))) {} + + std::size_t operator()() { + // +1 is required to pass at least 1 into log2 (log2(0) is undefined) + // -1 is required to have an ability to return 0 from the generator (max_level - log2(2^31) - 1) + std::size_t result = max_level - std::size_t(tbb::detail::log2(engines.local()() + 1)) - 1; + __TBB_ASSERT(result <= max_level, nullptr); + return result; + } + +private: + tbb::enumerable_thread_specific engines; +}; + +} // namespace d2 + +} // namespace detail +} // namespace tbb + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4127 is back +#endif + +#endif // __TBB_detail__concurrent_skip_list_H diff --git a/third_party/tbb/detail/_concurrent_unordered_base.h b/third_party/tbb/detail/_concurrent_unordered_base.h new file mode 100644 index 000000000..9dd0ad499 --- /dev/null +++ b/third_party/tbb/detail/_concurrent_unordered_base.h @@ -0,0 +1,1515 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__concurrent_unordered_base_H +#define __TBB_detail__concurrent_unordered_base_H + +#if !defined(__TBB_concurrent_unordered_map_H) && !defined(__TBB_concurrent_unordered_set_H) +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_containers_helpers.h" +#include "third_party/tbb/detail/_segment_table.h" +#include "third_party/tbb/detail/_hash_compare.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_node_handle.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/initializer_list" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/algorithm" + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4127) // warning C4127: conditional expression is constant +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template +class concurrent_unordered_base; + +template +class solist_iterator { +private: + using node_ptr = typename Container::value_node_ptr; + template + friend class split_ordered_list; + template + friend class solist_iterator; + template + friend class concurrent_unordered_base; + template + friend bool operator==( const solist_iterator& i, const solist_iterator& j ); + template + friend bool operator!=( const solist_iterator& i, const solist_iterator& j ); +public: + using value_type = Value; + using difference_type = typename Container::difference_type; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::forward_iterator_tag; + + solist_iterator() : my_node_ptr(nullptr) {} + solist_iterator( const solist_iterator& other ) + : my_node_ptr(other.my_node_ptr) {} + + solist_iterator& operator=( const solist_iterator& other ) { + my_node_ptr = other.my_node_ptr; + return *this; + } + + reference operator*() const { + return my_node_ptr->value(); + } + + pointer operator->() const { + return my_node_ptr->storage(); + } + + solist_iterator& operator++() { + auto next_node = my_node_ptr->next(); + while(next_node && next_node->is_dummy()) { + next_node = next_node->next(); + } + my_node_ptr = static_cast(next_node); + return *this; + } + + solist_iterator operator++(int) { + solist_iterator tmp = *this; + ++*this; + return tmp; + } + +private: + solist_iterator( node_ptr pnode ) : my_node_ptr(pnode) {} + + node_ptr get_node_ptr() const { return my_node_ptr; } + + node_ptr my_node_ptr; +}; + +template +bool operator==( const solist_iterator& i, const solist_iterator& j ) { + return i.my_node_ptr == j.my_node_ptr; +} + +template +bool operator!=( const solist_iterator& i, const solist_iterator& j ) { + return i.my_node_ptr != j.my_node_ptr; +} + +template +class list_node { +public: + using node_ptr = list_node*; + using sokey_type = SokeyType; + + list_node(sokey_type key) : my_next(nullptr), my_order_key(key) {} + + void init( sokey_type key ) { + my_order_key = key; + } + + sokey_type order_key() const { + return my_order_key; + } + + bool is_dummy() { + // The last bit of order key is unset for dummy nodes + return (my_order_key & 0x1) == 0; + } + + node_ptr 
next() const { + return my_next.load(std::memory_order_acquire); + } + + void set_next( node_ptr next_node ) { + my_next.store(next_node, std::memory_order_release); + } + + bool try_set_next( node_ptr expected_next, node_ptr new_next ) { + return my_next.compare_exchange_strong(expected_next, new_next); + } + +private: + std::atomic my_next; + sokey_type my_order_key; +}; // class list_node + +template +class value_node : public list_node +{ +public: + using base_type = list_node; + using sokey_type = typename base_type::sokey_type; + using value_type = ValueType; + + value_node( sokey_type ord_key ) : base_type(ord_key) {} + ~value_node() {} + value_type* storage() { + return reinterpret_cast(&my_value); + } + + value_type& value() { + return *storage(); + } + +private: + using aligned_storage_type = typename std::aligned_storage::type; + aligned_storage_type my_value; +}; // class value_node + +template +class concurrent_unordered_base { + using self_type = concurrent_unordered_base; + using traits_type = Traits; + using hash_compare_type = typename traits_type::hash_compare_type; + class unordered_segment_table; +public: + using value_type = typename traits_type::value_type; + using key_type = typename traits_type::key_type; + using allocator_type = typename traits_type::allocator_type; + +private: + using allocator_traits_type = tbb::detail::allocator_traits; + // TODO: check assert conditions for different C++ standards + static_assert(std::is_same::value, + "value_type of the container must be the same as its allocator"); + using sokey_type = std::size_t; + +public: + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using iterator = solist_iterator; + using const_iterator = solist_iterator; + using local_iterator = iterator; + using const_local_iterator = const_iterator; + + using reference = value_type&; + using const_reference = const value_type&; + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + using hasher = typename hash_compare_type::hasher; + using key_equal = typename hash_compare_type::key_equal; + +private: + using list_node_type = list_node; + using value_node_type = value_node; + using node_ptr = list_node_type*; + using value_node_ptr = value_node_type*; + + using value_node_allocator_type = typename allocator_traits_type::template rebind_alloc; + using node_allocator_type = typename allocator_traits_type::template rebind_alloc; + + using node_allocator_traits = tbb::detail::allocator_traits; + using value_node_allocator_traits = tbb::detail::allocator_traits; + + static constexpr size_type round_up_to_power_of_two( size_type bucket_count ) { + return size_type(1) << size_type(tbb::detail::log2(uintptr_t(bucket_count == 0 ? 
1 : bucket_count) * 2 - 1)); + } + + template + using is_transparent = dependent_bool, T>; +public: + using node_type = node_handle; + + explicit concurrent_unordered_base( size_type bucket_count, const hasher& hash = hasher(), + const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) + : my_size(0), + my_bucket_count(round_up_to_power_of_two(bucket_count)), + my_max_load_factor(float(initial_max_load_factor)), + my_hash_compare(hash, equal), + my_head(sokey_type(0)), + my_segments(alloc) {} + + concurrent_unordered_base() : concurrent_unordered_base(initial_bucket_count) {} + + concurrent_unordered_base( size_type bucket_count, const allocator_type& alloc ) + : concurrent_unordered_base(bucket_count, hasher(), key_equal(), alloc) {} + + concurrent_unordered_base( size_type bucket_count, const hasher& hash, const allocator_type& alloc ) + : concurrent_unordered_base(bucket_count, hash, key_equal(), alloc) {} + + explicit concurrent_unordered_base( const allocator_type& alloc ) + : concurrent_unordered_base(initial_bucket_count, hasher(), key_equal(), alloc) {} + + template + concurrent_unordered_base( InputIterator first, InputIterator last, + size_type bucket_count = initial_bucket_count, const hasher& hash = hasher(), + const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) + : concurrent_unordered_base(bucket_count, hash, equal, alloc) + { + insert(first, last); + } + + template + concurrent_unordered_base( InputIterator first, InputIterator last, + size_type bucket_count, const allocator_type& alloc ) + : concurrent_unordered_base(first, last, bucket_count, hasher(), key_equal(), alloc) {} + + template + concurrent_unordered_base( InputIterator first, InputIterator last, + size_type bucket_count, const hasher& hash, const allocator_type& alloc ) + : concurrent_unordered_base(first, last, bucket_count, hash, key_equal(), alloc) {} + + concurrent_unordered_base( const concurrent_unordered_base& other ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + my_max_load_factor(other.my_max_load_factor), + my_hash_compare(other.my_hash_compare), + my_head(other.my_head.order_key()), + my_segments(other.my_segments) + { + try_call( [&] { + internal_copy(other); + } ).on_exception( [&] { + clear(); + }); + } + + concurrent_unordered_base( const concurrent_unordered_base& other, const allocator_type& alloc ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + my_max_load_factor(other.my_max_load_factor), + my_hash_compare(other.my_hash_compare), + my_head(other.my_head.order_key()), + my_segments(other.my_segments, alloc) + { + try_call( [&] { + internal_copy(other); + } ).on_exception( [&] { + clear(); + }); + } + + concurrent_unordered_base( concurrent_unordered_base&& other ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + my_max_load_factor(std::move(other.my_max_load_factor)), + my_hash_compare(std::move(other.my_hash_compare)), + my_head(other.my_head.order_key()), + my_segments(std::move(other.my_segments)) + { + move_content(std::move(other)); + } + + concurrent_unordered_base( concurrent_unordered_base&& other, const allocator_type& alloc ) + : my_size(other.my_size.load(std::memory_order_relaxed)), + my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), + 
my_max_load_factor(std::move(other.my_max_load_factor)), + my_hash_compare(std::move(other.my_hash_compare)), + my_head(other.my_head.order_key()), + my_segments(std::move(other.my_segments), alloc) + { + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_move_construct_with_allocator(std::move(other), alloc, is_always_equal()); + } + + concurrent_unordered_base( std::initializer_list init, + size_type bucket_count = initial_bucket_count, + const hasher& hash = hasher(), const key_equal& equal = key_equal(), + const allocator_type& alloc = allocator_type() ) + : concurrent_unordered_base(init.begin(), init.end(), bucket_count, hash, equal, alloc) {} + + concurrent_unordered_base( std::initializer_list init, + size_type bucket_count, const allocator_type& alloc ) + : concurrent_unordered_base(init, bucket_count, hasher(), key_equal(), alloc) {} + + concurrent_unordered_base( std::initializer_list init, + size_type bucket_count, const hasher& hash, const allocator_type& alloc ) + : concurrent_unordered_base(init, bucket_count, hash, key_equal(), alloc) {} + + ~concurrent_unordered_base() { + internal_clear(); + } + + concurrent_unordered_base& operator=( const concurrent_unordered_base& other ) { + if (this != &other) { + clear(); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_max_load_factor = other.my_max_load_factor; + my_hash_compare = other.my_hash_compare; + my_segments = other.my_segments; + internal_copy(other); // TODO: guards for exceptions? + } + return *this; + } + + concurrent_unordered_base& operator=( concurrent_unordered_base&& other ) noexcept(unordered_segment_table::is_noexcept_assignment) { + if (this != &other) { + clear(); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_max_load_factor = std::move(other.my_max_load_factor); + my_hash_compare = std::move(other.my_hash_compare); + my_segments = std::move(other.my_segments); + + using pocma_type = typename allocator_traits_type::propagate_on_container_move_assignment; + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_move_assign(std::move(other), tbb::detail::disjunction()); + } + return *this; + } + + concurrent_unordered_base& operator=( std::initializer_list init ) { + clear(); + insert(init); + return *this; + } + + void swap( concurrent_unordered_base& other ) noexcept(unordered_segment_table::is_noexcept_swap) { + if (this != &other) { + using pocs_type = typename allocator_traits_type::propagate_on_container_swap; + using is_always_equal = typename allocator_traits_type::is_always_equal; + internal_swap(other, tbb::detail::disjunction()); + } + } + + allocator_type get_allocator() const noexcept { return my_segments.get_allocator(); } + + iterator begin() noexcept { return iterator(first_value_node(&my_head)); } + const_iterator begin() const noexcept { return const_iterator(first_value_node(const_cast(&my_head))); } + const_iterator cbegin() const noexcept { return const_iterator(first_value_node(const_cast(&my_head))); } + + iterator end() noexcept { return iterator(nullptr); } + const_iterator end() const noexcept { return const_iterator(nullptr); } + const_iterator cend() const noexcept { return const_iterator(nullptr); } + + 
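+    // Illustrative usage sketch (editorial addition, not upstream TBB code).
+    // concurrent_unordered_base is the shared engine behind
+    // tbb::concurrent_unordered_map / tbb::concurrent_unordered_set; assuming the
+    // vendored header third_party/tbb/concurrent_unordered_map.h, concurrent
+    // insert() and find() are safe, while the unsafe_-prefixed members are not:
+    //
+    //   #include "third_party/tbb/concurrent_unordered_map.h"
+    //   #include <thread>
+    //   #include <vector>
+    //   #include <cstdio>
+    //
+    //   int main() {
+    //       tbb::concurrent_unordered_map<int, int> table;
+    //       std::vector<std::thread> workers;
+    //       for (int t = 0; t < 4; ++t) {
+    //           workers.emplace_back([&table, t] {
+    //               for (int i = 0; i < 1000; ++i)
+    //                   table.insert({t * 1000 + i, i});   // thread-safe insertion
+    //           });
+    //       }
+    //       for (auto& w : workers) w.join();
+    //       auto it = table.find(2042);                    // thread-safe lookup
+    //       if (it != table.end())
+    //           std::printf("size=%zu value=%d\n", table.size(), it->second);
+    //       table.unsafe_erase(2042);   // "unsafe_": must not race with other operations
+    //       return 0;
+    //   }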
__TBB_nodiscard bool empty() const noexcept { return size() == 0; } + size_type size() const noexcept { return my_size.load(std::memory_order_relaxed); } + size_type max_size() const noexcept { return allocator_traits_type::max_size(get_allocator()); } + + void clear() noexcept { + internal_clear(); + } + + std::pair insert( const value_type& value ) { + return internal_insert_value(value); + } + + std::pair insert( value_type&& value ) { + return internal_insert_value(std::move(value)); + } + + iterator insert( const_iterator, const value_type& value ) { + // Ignore hint + return insert(value).first; + } + + iterator insert( const_iterator, value_type&& value ) { + // Ignore hint + return insert(std::move(value)).first; + } + + template + void insert( InputIterator first, InputIterator last ) { + for (; first != last; ++first) { + insert(*first); + } + } + + void insert( std::initializer_list init ) { + insert(init.begin(), init.end()); + } + + std::pair insert( node_type&& nh ) { + if (!nh.empty()) { + value_node_ptr insert_node = node_handle_accessor::get_node_ptr(nh); + auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { + insert_node->init(order_key); + return insert_node; + }; + auto insert_result = internal_insert(insert_node->value(), init_node); + if (insert_result.inserted) { + // If the insertion succeeded - set node handle to the empty state + __TBB_ASSERT(insert_result.remaining_node == nullptr, + "internal_insert_node should not return the remaining node if the insertion succeeded"); + node_handle_accessor::deactivate(nh); + } + return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; + } + return {end(), false}; + } + + iterator insert( const_iterator, node_type&& nh ) { + // Ignore hint + return insert(std::move(nh)).first; + } + + template + std::pair emplace( Args&&... args ) { + // Create a node with temporary order_key 0, which will be reinitialize + // in internal_insert after the hash calculation + value_node_ptr insert_node = create_node(0, std::forward(args)...); + + auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { + insert_node->init(order_key); + return insert_node; + }; + + auto insert_result = internal_insert(insert_node->value(), init_node); + + if (!insert_result.inserted) { + // If the insertion failed - destroy the node which was created + insert_node->init(split_order_key_regular(1)); + destroy_node(insert_node); + } + + return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; + } + + template + iterator emplace_hint( const_iterator, Args&&... 
args ) { + // Ignore hint + return emplace(std::forward(args)...).first; + } + + iterator unsafe_erase( const_iterator pos ) { + return iterator(first_value_node(internal_erase(pos.get_node_ptr()))); + } + + iterator unsafe_erase( iterator pos ) { + return iterator(first_value_node(internal_erase(pos.get_node_ptr()))); + } + + iterator unsafe_erase( const_iterator first, const_iterator last ) { + while(first != last) { + first = unsafe_erase(first); + } + return iterator(first.get_node_ptr()); + } + + size_type unsafe_erase( const key_type& key ) { + return internal_erase_by_key(key); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + size_type>::type unsafe_erase( const K& key ) + { + return internal_erase_by_key(key); + } + + node_type unsafe_extract( const_iterator pos ) { + internal_extract(pos.get_node_ptr()); + return node_handle_accessor::construct(pos.get_node_ptr()); + } + + node_type unsafe_extract( iterator pos ) { + internal_extract(pos.get_node_ptr()); + return node_handle_accessor::construct(pos.get_node_ptr()); + } + + node_type unsafe_extract( const key_type& key ) { + iterator item = find(key); + return item == end() ? node_type() : unsafe_extract(item); + } + + template + typename std::enable_if::value + && !std::is_convertible::value + && !std::is_convertible::value, + node_type>::type unsafe_extract( const K& key ) + { + iterator item = find(key); + return item == end() ? node_type() : unsafe_extract(item); + } + + // Lookup functions + iterator find( const key_type& key ) { + value_node_ptr result = internal_find(key); + return result == nullptr ? end() : iterator(result); + } + + const_iterator find( const key_type& key ) const { + value_node_ptr result = const_cast(this)->internal_find(key); + return result == nullptr ? end() : const_iterator(result); + } + + template + typename std::enable_if::value, iterator>::type find( const K& key ) { + value_node_ptr result = internal_find(key); + return result == nullptr ? end() : iterator(result); + } + + template + typename std::enable_if::value, const_iterator>::type find( const K& key ) const { + value_node_ptr result = const_cast(this)->internal_find(key); + return result == nullptr ? 
end() : const_iterator(result); + } + + std::pair equal_range( const key_type& key ) { + auto result = internal_equal_range(key); + return std::make_pair(iterator(result.first), iterator(result.second)); + } + + std::pair equal_range( const key_type& key ) const { + auto result = const_cast(this)->internal_equal_range(key); + return std::make_pair(const_iterator(result.first), const_iterator(result.second)); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) { + auto result = internal_equal_range(key); + return std::make_pair(iterator(result.first), iterator(result.second)); + } + + template + typename std::enable_if::value, std::pair>::type equal_range( const K& key ) const { + auto result = const_cast(this)->internal_equal_range(key); + return std::make_pair(iterator(result.first), iterator(result.second)); + } + + size_type count( const key_type& key ) const { + return internal_count(key); + } + + template + typename std::enable_if::value, size_type>::type count( const K& key ) const { + return internal_count(key); + } + + bool contains( const key_type& key ) const { + return find(key) != end(); + } + + template + typename std::enable_if::value, bool>::type contains( const K& key ) const { + return find(key) != end(); + } + + // Bucket interface + local_iterator unsafe_begin( size_type n ) { + return local_iterator(first_value_node(get_bucket(n))); + } + + const_local_iterator unsafe_begin( size_type n ) const { + auto bucket_begin = first_value_node(const_cast(this)->get_bucket(n)); + return const_local_iterator(bucket_begin); + } + + const_local_iterator unsafe_cbegin( size_type n ) const { + auto bucket_begin = first_value_node(const_cast(this)->get_bucket(n)); + return const_local_iterator(bucket_begin); + } + + local_iterator unsafe_end( size_type n ) { + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : local_iterator(nullptr); + } + + const_local_iterator unsafe_end( size_type n ) const { + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr); + } + + const_local_iterator unsafe_cend( size_type n ) const { + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr); + } + + size_type unsafe_bucket_count() const { return my_bucket_count.load(std::memory_order_relaxed); } + + size_type unsafe_max_bucket_count() const { + return max_size(); + } + + size_type unsafe_bucket_size( size_type n ) const { + return size_type(std::distance(unsafe_begin(n), unsafe_end(n))); + } + + size_type unsafe_bucket( const key_type& key ) const { + return my_hash_compare(key) % my_bucket_count.load(std::memory_order_relaxed); + } + + // Hash policy + float load_factor() const { + return float(size() / float(my_bucket_count.load(std::memory_order_acquire))); + } + + float max_load_factor() const { return my_max_load_factor; } + + void max_load_factor( float mlf ) { + if (mlf != mlf || mlf < 0) { + tbb::detail::throw_exception(exception_id::invalid_load_factor); + } + my_max_load_factor = mlf; + } // TODO: unsafe? + + void rehash( size_type bucket_count ) { + size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire); + if (current_bucket_count < bucket_count) { + // TODO: do we need do-while here? 
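+            // Editorial note: only a single compare_exchange_strong is attempted here, so if
+            // another thread grows my_bucket_count concurrently the requested bucket_count may
+            // not be reached; reserve() below handles the same situation with a retry loop.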
+ my_bucket_count.compare_exchange_strong(current_bucket_count, round_up_to_power_of_two(bucket_count)); + } + } + + void reserve( size_type elements_count ) { + size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire); + size_type necessary_bucket_count = current_bucket_count; + + // max_load_factor() is currently unsafe, so we can assume that my_max_load_factor + // would not be changed during the calculation + // TODO: Log2 seems useful here + while (necessary_bucket_count * max_load_factor() < elements_count) { + necessary_bucket_count <<= 1; + } + + while (!my_bucket_count.compare_exchange_strong(current_bucket_count, necessary_bucket_count)) { + if (current_bucket_count >= necessary_bucket_count) + break; + } + } + + // Observers + hasher hash_function() const { return my_hash_compare.hash_function(); } + key_equal key_eq() const { return my_hash_compare.key_eq(); } + + class const_range_type { + private: + const concurrent_unordered_base& my_instance; + node_ptr my_begin_node; // may be node* const + node_ptr my_end_node; + mutable node_ptr my_midpoint_node; + public: + using size_type = typename concurrent_unordered_base::size_type; + using value_type = typename concurrent_unordered_base::value_type; + using reference = typename concurrent_unordered_base::reference; + using difference_type = typename concurrent_unordered_base::difference_type; + using iterator = typename concurrent_unordered_base::const_iterator; + + bool empty() const { return my_begin_node == my_end_node; } + + bool is_divisible() const { + return my_midpoint_node != my_end_node; + } + + size_type grainsize() const { return 1; } + + const_range_type( const_range_type& range, split ) + : my_instance(range.my_instance), + my_begin_node(range.my_midpoint_node), + my_end_node(range.my_end_node) + { + range.my_end_node = my_begin_node; + __TBB_ASSERT(!empty(), "Splitting despite the range is not divisible"); + __TBB_ASSERT(!range.empty(), "Splitting despite the range is not divisible"); + set_midpoint(); + range.set_midpoint(); + } + + iterator begin() const { return iterator(my_instance.first_value_node(my_begin_node)); } + iterator end() const { return iterator(my_instance.first_value_node(my_end_node)); } + + const_range_type( const concurrent_unordered_base& table ) + : my_instance(table), my_begin_node(my_instance.first_value_node(const_cast(&table.my_head))), my_end_node(nullptr) + { + set_midpoint(); + } + private: + void set_midpoint() const { + if (empty()) { + my_midpoint_node = my_end_node; + } else { + sokey_type invalid_key = ~sokey_type(0); + sokey_type begin_key = my_begin_node != nullptr ? my_begin_node->order_key() : invalid_key; + sokey_type end_key = my_end_node != nullptr ? 
my_end_node->order_key() : invalid_key; + + size_type mid_bucket = reverse_bits(begin_key + (end_key - begin_key) / 2) % + my_instance.my_bucket_count.load(std::memory_order_relaxed); + while( my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed) == nullptr) { + mid_bucket = my_instance.get_parent(mid_bucket); + } + if (reverse_bits(mid_bucket) > begin_key) { + // Found a dummy node between begin and end + my_midpoint_node = my_instance.first_value_node( + my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed)); + } else { + // Didn't find a dummy node between begin and end + my_midpoint_node = my_end_node; + } + } + } + }; // class const_range_type + + class range_type : public const_range_type { + public: + using iterator = typename concurrent_unordered_base::iterator; + using const_range_type::const_range_type; + + iterator begin() const { return iterator(const_range_type::begin().get_node_ptr()); } + iterator end() const { return iterator(const_range_type::end().get_node_ptr()); } + }; // class range_type + + // Parallel iteration + range_type range() { + return range_type(*this); + } + + const_range_type range() const { + return const_range_type(*this); + } +protected: + static constexpr bool allow_multimapping = traits_type::allow_multimapping; + +private: + static constexpr size_type initial_bucket_count = 8; + static constexpr float initial_max_load_factor = 4; // TODO: consider 1? + static constexpr size_type pointers_per_embedded_table = sizeof(size_type) * 8 - 1; + + class unordered_segment_table + : public segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> + { + using self_type = unordered_segment_table; + using atomic_node_ptr = std::atomic; + using base_type = segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; + using segment_type = typename base_type::segment_type; + using base_allocator_type = typename base_type::allocator_type; + + using segment_allocator_type = typename allocator_traits_type::template rebind_alloc; + using segment_allocator_traits = tbb::detail::allocator_traits; + public: + // Segment table for unordered containers should not be extended in the wait- free implementation + static constexpr bool allow_table_extending = false; + static constexpr bool is_noexcept_assignment = std::is_nothrow_move_assignable::value && + std::is_nothrow_move_assignable::value && + segment_allocator_traits::is_always_equal::value; + static constexpr bool is_noexcept_swap = tbb::detail::is_nothrow_swappable::value && + tbb::detail::is_nothrow_swappable::value && + segment_allocator_traits::is_always_equal::value; + + // TODO: using base_type::base_type is not compiling on Windows and Intel Compiler - investigate + unordered_segment_table( const base_allocator_type& alloc = base_allocator_type() ) + : base_type(alloc) {} + + unordered_segment_table( const unordered_segment_table& ) = default; + + unordered_segment_table( const unordered_segment_table& other, const base_allocator_type& alloc ) + : base_type(other, alloc) {} + + unordered_segment_table( unordered_segment_table&& ) = default; + + unordered_segment_table( unordered_segment_table&& other, const base_allocator_type& alloc ) + : base_type(std::move(other), alloc) {} + + unordered_segment_table& operator=( const unordered_segment_table& ) = default; + + unordered_segment_table& operator=( unordered_segment_table&& ) = default; + + segment_type create_segment( typename base_type::segment_table_type, typename 
base_type::segment_index_type segment_index, size_type ) { + segment_allocator_type alloc(this->get_allocator()); + size_type seg_size = this->segment_size(segment_index); + segment_type new_segment = segment_allocator_traits::allocate(alloc, seg_size); + for (size_type i = 0; i != seg_size; ++i) { + segment_allocator_traits::construct(alloc, new_segment + i, nullptr); + } + return new_segment; + } + + segment_type nullify_segment( typename base_type::segment_table_type table, size_type segment_index ) { + segment_type target_segment = table[segment_index].load(std::memory_order_relaxed); + table[segment_index].store(nullptr, std::memory_order_relaxed); + return target_segment; + } + + // deallocate_segment is required by the segment_table base class, but + // in unordered, it is also necessary to call the destructor during deallocation + void deallocate_segment( segment_type address, size_type index ) { + destroy_segment(address, index); + } + + void destroy_segment( segment_type address, size_type index ) { + segment_allocator_type alloc(this->get_allocator()); + for (size_type i = 0; i != this->segment_size(index); ++i) { + segment_allocator_traits::destroy(alloc, address + i); + } + segment_allocator_traits::deallocate(alloc, address, this->segment_size(index)); + } + + + void copy_segment( size_type index, segment_type, segment_type to ) { + if (index == 0) { + // The first element in the first segment is embedded into the table (my_head) + // so the first pointer should not be stored here + // It would be stored during move ctor/assignment operation + to[1].store(nullptr, std::memory_order_relaxed); + } else { + for (size_type i = 0; i != this->segment_size(index); ++i) { + to[i].store(nullptr, std::memory_order_relaxed); + } + } + } + + void move_segment( size_type index, segment_type from, segment_type to ) { + if (index == 0) { + // The first element in the first segment is embedded into the table (my_head) + // so the first pointer should not be stored here + // It would be stored during move ctor/assignment operation + to[1].store(from[1].load(std::memory_order_relaxed), std::memory_order_relaxed); + } else { + for (size_type i = 0; i != this->segment_size(index); ++i) { + to[i].store(from[i].load(std::memory_order_relaxed), std::memory_order_relaxed); + from[i].store(nullptr, std::memory_order_relaxed); + } + } + } + + // allocate_long_table is required by the segment_table base class, but unused for unordered containers + typename base_type::segment_table_type allocate_long_table( const typename base_type::atomic_segment*, size_type ) { + __TBB_ASSERT(false, "This method should never been called"); + // TableType is a pointer + return nullptr; + } + + // destroy_elements is required by the segment_table base class, but unused for unordered containers + // this function call but do nothing + void destroy_elements() {} + }; // struct unordered_segment_table + + void internal_clear() { + // TODO: consider usefulness of two versions of clear() - with dummy nodes deallocation and without it + node_ptr next = my_head.next(); + node_ptr curr = next; + + my_head.set_next(nullptr); + + while (curr != nullptr) { + next = curr->next(); + destroy_node(curr); + curr = next; + } + + my_size.store(0, std::memory_order_relaxed); + my_segments.clear(); + } + + void destroy_node( node_ptr node ) { + if (node->is_dummy()) { + node_allocator_type dummy_node_allocator(my_segments.get_allocator()); + // Destroy the node + node_allocator_traits::destroy(dummy_node_allocator, node); + // Deallocate 
the memory + node_allocator_traits::deallocate(dummy_node_allocator, node, 1); + } else { + // GCC 11.1 issues a warning here that incorrect destructor might be called for dummy_nodes + #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 130000 ) && !__clang__ && !__INTEL_COMPILER + volatile + #endif + value_node_ptr val_node = static_cast(node); + value_node_allocator_type value_node_allocator(my_segments.get_allocator()); + // Destroy the value + value_node_allocator_traits::destroy(value_node_allocator, val_node->storage()); + // Destroy the node + value_node_allocator_traits::destroy(value_node_allocator, val_node); + // Deallocate the memory + value_node_allocator_traits::deallocate(value_node_allocator, val_node, 1); + } + } + + struct internal_insert_return_type { + // If the insertion failed - the remaining_node points to the node, which was failed to insert + // This node can be allocated in process of insertion + value_node_ptr remaining_node; + // If the insertion failed - node_with_equal_key points to the node in the list with the + // key, equivalent to the inserted, otherwise it points to the node, which was inserted. + value_node_ptr node_with_equal_key; + // Insertion status + // NOTE: if it is true - remaining_node should be nullptr + bool inserted; + }; // struct internal_insert_return_type + + // Inserts the value into the split ordered list + template + std::pair internal_insert_value( ValueType&& value ) { + + auto create_value_node = [&value, this]( sokey_type order_key )->value_node_ptr { + return create_node(order_key, std::forward(value)); + }; + + auto insert_result = internal_insert(value, create_value_node); + + if (insert_result.remaining_node != nullptr) { + // If the insertion fails - destroy the node which was failed to insert if it exist + __TBB_ASSERT(!insert_result.inserted, + "remaining_node should be nullptr if the node was successfully inserted"); + destroy_node(insert_result.remaining_node); + } + + return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; + } + + // Inserts the node into the split ordered list + // Creates a node using the specified callback after the place for insertion was found + // Returns internal_insert_return_type object, where: + // - If the insertion succeeded: + // - remaining_node is nullptr + // - node_with_equal_key point to the inserted node + // - inserted is true + // - If the insertion failed: + // - remaining_node points to the node, that was failed to insert if it was created. 
+ // nullptr if the node was not created, because the requested key was already + // presented in the list + // - node_with_equal_key point to the element in the list with the key, equivalent to + // to the requested key + // - inserted is false + template + internal_insert_return_type internal_insert( ValueType&& value, CreateInsertNode create_insert_node ) { + static_assert(std::is_same::type, value_type>::value, + "Incorrect type in internal_insert"); + const key_type& key = traits_type::get_key(value); + sokey_type hash_key = sokey_type(my_hash_compare(key)); + + sokey_type order_key = split_order_key_regular(hash_key); + node_ptr prev = prepare_bucket(hash_key); + __TBB_ASSERT(prev != nullptr, "Invalid head node"); + + auto search_result = search_after(prev, order_key, key); + + if (search_result.second) { + return internal_insert_return_type{ nullptr, search_result.first, false }; + } + + value_node_ptr new_node = create_insert_node(order_key); + node_ptr curr = search_result.first; + + while (!try_insert(prev, new_node, curr)) { + search_result = search_after(prev, order_key, key); + if (search_result.second) { + return internal_insert_return_type{ new_node, search_result.first, false }; + } + curr = search_result.first; + } + + auto sz = my_size.fetch_add(1); + adjust_table_size(sz + 1, my_bucket_count.load(std::memory_order_acquire)); + return internal_insert_return_type{ nullptr, static_cast(new_node), true }; + } + + // Searches the node with the key, equivalent to key with requested order key after the node prev + // Returns the existing node and true if the node is already in the list + // Returns the first node with the order key, greater than requested and false if the node is not presented in the list + std::pair search_after( node_ptr& prev, sokey_type order_key, const key_type& key ) { + // NOTE: static_cast(curr) should be done only after we would ensure + // that the node is not a dummy node + + node_ptr curr = prev->next(); + + while (curr != nullptr && (curr->order_key() < order_key || + (curr->order_key() == order_key && !my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)))) + { + prev = curr; + curr = curr->next(); + } + + if (curr != nullptr && curr->order_key() == order_key && !allow_multimapping) { + return { static_cast(curr), true }; + } + return { static_cast(curr), false }; + } + + void adjust_table_size( size_type total_elements, size_type current_size ) { + // Grow the table by a factor of 2 if possible and needed + if ( (float(total_elements) / float(current_size)) > my_max_load_factor ) { + // Double the size of the hash only if size hash not changed in between loads + my_bucket_count.compare_exchange_strong(current_size, 2u * current_size); + } + } + + node_ptr insert_dummy_node( node_ptr parent_dummy_node, sokey_type order_key ) { + node_ptr prev_node = parent_dummy_node; + + node_ptr dummy_node = create_dummy_node(order_key); + node_ptr next_node; + + do { + next_node = prev_node->next(); + // Move forward through the list while the order key is less than requested + while (next_node != nullptr && next_node->order_key() < order_key) { + prev_node = next_node; + next_node = next_node->next(); + } + + if (next_node != nullptr && next_node->order_key() == order_key) { + // Another dummy node with the same order key was inserted by another thread + // Destroy the node and exit + destroy_node(dummy_node); + return next_node; + } + } while (!try_insert(prev_node, dummy_node, next_node)); + + return dummy_node; + } + + // Try to insert a 
node between prev_node and expected next + // If the next is not equal to expected next - return false + static bool try_insert( node_ptr prev_node, node_ptr new_node, node_ptr current_next_node ) { + new_node->set_next(current_next_node); + return prev_node->try_set_next(current_next_node, new_node); + } + + // Returns the bucket, associated with the hash_key + node_ptr prepare_bucket( sokey_type hash_key ) { + size_type bucket = hash_key % my_bucket_count.load(std::memory_order_acquire); + return get_bucket(bucket); + } + + // Initialize the corresponding bucket if it is not initialized + node_ptr get_bucket( size_type bucket_index ) { + if (my_segments[bucket_index].load(std::memory_order_acquire) == nullptr) { + init_bucket(bucket_index); + } + return my_segments[bucket_index].load(std::memory_order_acquire); + } + + void init_bucket( size_type bucket ) { + if (bucket == 0) { + // Atomicaly store the first bucket into my_head + node_ptr disabled = nullptr; + my_segments[0].compare_exchange_strong(disabled, &my_head); + return; + } + + size_type parent_bucket = get_parent(bucket); + + while (my_segments[parent_bucket].load(std::memory_order_acquire) == nullptr) { + // Initialize all of the parent buckets + init_bucket(parent_bucket); + } + + __TBB_ASSERT(my_segments[parent_bucket].load(std::memory_order_acquire) != nullptr, "Parent bucket should be initialized"); + node_ptr parent = my_segments[parent_bucket].load(std::memory_order_acquire); + + // Insert dummy node into the list + node_ptr dummy_node = insert_dummy_node(parent, split_order_key_dummy(bucket)); + // TODO: consider returning pair to avoid store operation if the bucket was stored by an other thread + // or move store to insert_dummy_node + // Add dummy_node into the segment table + my_segments[bucket].store(dummy_node, std::memory_order_release); + } + + node_ptr create_dummy_node( sokey_type order_key ) { + node_allocator_type dummy_node_allocator(my_segments.get_allocator()); + node_ptr dummy_node = node_allocator_traits::allocate(dummy_node_allocator, 1); + node_allocator_traits::construct(dummy_node_allocator, dummy_node, order_key); + return dummy_node; + } + + template + value_node_ptr create_node( sokey_type order_key, Args&&... 
args ) { + value_node_allocator_type value_node_allocator(my_segments.get_allocator()); + // Allocate memory for the value_node + value_node_ptr new_node = value_node_allocator_traits::allocate(value_node_allocator, 1); + // Construct the node + value_node_allocator_traits::construct(value_node_allocator, new_node, order_key); + + // try_call API is not convenient here due to broken + // variadic capture on GCC 4.8.5 + auto value_guard = make_raii_guard([&] { + value_node_allocator_traits::destroy(value_node_allocator, new_node); + value_node_allocator_traits::deallocate(value_node_allocator, new_node, 1); + }); + + // Construct the value in the node + value_node_allocator_traits::construct(value_node_allocator, new_node->storage(), std::forward(args)...); + value_guard.dismiss(); + return new_node; + } + + value_node_ptr first_value_node( node_ptr first_node ) const { + while (first_node != nullptr && first_node->is_dummy()) { + first_node = first_node->next(); + } + return static_cast(first_node); + } + + // Unsafe method, which removes the node from the list and returns the next node + node_ptr internal_erase( value_node_ptr node_to_erase ) { + __TBB_ASSERT(node_to_erase != nullptr, "Invalid iterator for erase"); + node_ptr next_node = node_to_erase->next(); + internal_extract(node_to_erase); + destroy_node(node_to_erase); + return next_node; + } + + template + size_type internal_erase_by_key( const K& key ) { + // TODO: consider reimplementation without equal_range - it is not effective to perform lookup over a bucket + // for each unsafe_erase call + auto eq_range = equal_range(key); + size_type erased_count = 0; + + for (auto it = eq_range.first; it != eq_range.second;) { + it = unsafe_erase(it); + ++erased_count; + } + return erased_count; + } + + // Unsafe method, which extracts the node from the list + void internal_extract( value_node_ptr node_to_extract ) { + const key_type& key = traits_type::get_key(node_to_extract->value()); + sokey_type hash_key = sokey_type(my_hash_compare(key)); + + node_ptr prev_node = prepare_bucket(hash_key); + + for (node_ptr node = prev_node->next(); node != nullptr; prev_node = node, node = node->next()) { + if (node == node_to_extract) { + unlink_node(prev_node, node, node_to_extract->next()); + my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); + return; + } + __TBB_ASSERT(node->order_key() <= node_to_extract->order_key(), + "node, which is going to be extracted should be presented in the list"); + } + } + +protected: + template + void internal_merge( SourceType&& source ) { + static_assert(std::is_same::type::node_type>::value, + "Incompatible containers cannot be merged"); + + for (node_ptr source_prev = &source.my_head; source_prev->next() != nullptr;) { + if (!source_prev->next()->is_dummy()) { + value_node_ptr curr = static_cast(source_prev->next()); + // If the multimapping is allowed, or the key is not presented + // in the *this container - extract the node from the list + if (allow_multimapping || !contains(traits_type::get_key(curr->value()))) { + node_ptr next_node = curr->next(); + source.unlink_node(source_prev, curr, next_node); + + // Remember the old order key + sokey_type old_order_key = curr->order_key(); + + // Node handle with curr cannot be used directly in insert call, because + // the destructor of node_type will destroy curr + node_type curr_node = node_handle_accessor::construct(curr); + + // If the insertion fails - return ownership of the node to the source + if 
(!insert(std::move(curr_node)).second) { + __TBB_ASSERT(!allow_multimapping, "Insertion should succeed for multicontainer"); + __TBB_ASSERT(source_prev->next() == next_node, + "Concurrent operations with the source container in merge are prohibited"); + + // Initialize the node with the old order key, because the order key + // can change during the insertion + curr->init(old_order_key); + __TBB_ASSERT(old_order_key >= source_prev->order_key() && + (next_node == nullptr || old_order_key <= next_node->order_key()), + "Wrong nodes order in the source container"); + // Merge is unsafe for source container, so the insertion back can be done without compare_exchange + curr->set_next(next_node); + source_prev->set_next(curr); + source_prev = curr; + node_handle_accessor::deactivate(curr_node); + } else { + source.my_size.fetch_sub(1, std::memory_order_relaxed); + } + } else { + source_prev = curr; + } + } else { + source_prev = source_prev->next(); + } + } + } + +private: + // Unsafe method, which unlinks the node between prev and next + void unlink_node( node_ptr prev_node, node_ptr node_to_unlink, node_ptr next_node ) { + __TBB_ASSERT(prev_node->next() == node_to_unlink && + node_to_unlink->next() == next_node, + "erasing and extracting nodes from the containers are unsafe in concurrent mode"); + prev_node->set_next(next_node); + node_to_unlink->set_next(nullptr); + } + + template + value_node_ptr internal_find( const K& key ) { + sokey_type hash_key = sokey_type(my_hash_compare(key)); + sokey_type order_key = split_order_key_regular(hash_key); + + node_ptr curr = prepare_bucket(hash_key); + + while (curr != nullptr) { + if (curr->order_key() > order_key) { + // If the order key is greater than the requested order key, + // the element is not in the hash table + return nullptr; + } else if (curr->order_key() == order_key && + my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)) { + // The fact that order keys match does not mean that the element is found. + // Key function comparison has to be performed to check whether this is the + // right element. If not, keep searching while order key is the same. + return static_cast(curr); + } + curr = curr->next(); + } + + return nullptr; + } + + template + std::pair internal_equal_range( const K& key ) { + sokey_type hash_key = sokey_type(my_hash_compare(key)); + sokey_type order_key = split_order_key_regular(hash_key); + + node_ptr curr = prepare_bucket(hash_key); + + while (curr != nullptr) { + if (curr->order_key() > order_key) { + // If the order key is greater than the requested order key, + // the element is not in the hash table + return std::make_pair(nullptr, nullptr); + } else if (curr->order_key() == order_key && + my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)) { + value_node_ptr first = static_cast(curr); + node_ptr last = first; + do { + last = last->next(); + } while (allow_multimapping && last != nullptr && !last->is_dummy() && + my_hash_compare(traits_type::get_key(static_cast(last)->value()), key)); + return std::make_pair(first, first_value_node(last)); + } + curr = curr->next(); + } + return {nullptr, nullptr}; + } + + template + size_type internal_count( const K& key ) const { + if (allow_multimapping) { + // TODO: consider reimplementing the internal_equal_range with elements counting to avoid std::distance + auto eq_range = equal_range(key); + return std::distance(eq_range.first, eq_range.second); + } else { + return contains(key) ? 
1 : 0; + } + } + + void internal_copy( const concurrent_unordered_base& other ) { + node_ptr last_node = &my_head; + my_segments[0].store(&my_head, std::memory_order_relaxed); + + for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) { + node_ptr new_node; + if (!node->is_dummy()) { + // The node in the right table contains a value + new_node = create_node(node->order_key(), static_cast(node)->value()); + } else { + // The node in the right table is a dummy node + new_node = create_dummy_node(node->order_key()); + my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed); + } + + last_node->set_next(new_node); + last_node = new_node; + } + } + + void internal_move( concurrent_unordered_base&& other ) { + node_ptr last_node = &my_head; + my_segments[0].store(&my_head, std::memory_order_relaxed); + + for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) { + node_ptr new_node; + if (!node->is_dummy()) { + // The node in the right table contains a value + new_node = create_node(node->order_key(), std::move(static_cast(node)->value())); + } else { + // TODO: do we need to destroy a dummy node in the right container? + // The node in the right table is a dummy_node + new_node = create_dummy_node(node->order_key()); + my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed); + } + + last_node->set_next(new_node); + last_node = new_node; + } + } + + void move_content( concurrent_unordered_base&& other ) { + // NOTE: allocators should be equal + my_head.set_next(other.my_head.next()); + other.my_head.set_next(nullptr); + my_segments[0].store(&my_head, std::memory_order_relaxed); + + other.my_bucket_count.store(initial_bucket_count, std::memory_order_relaxed); + other.my_max_load_factor = initial_max_load_factor; + other.my_size.store(0, std::memory_order_relaxed); + } + + void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type&, + /*is_always_equal = */std::true_type ) { + // Allocators are always equal - no need to compare for equality + move_content(std::move(other)); + } + + void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type& alloc, + /*is_always_equal = */std::false_type ) { + // Allocators are not always equal + if (alloc == other.my_segments.get_allocator()) { + move_content(std::move(other)); + } else { + try_call( [&] { + internal_move(std::move(other)); + } ).on_exception( [&] { + clear(); + }); + } + } + + // Move assigns the hash table to other is any instances of allocator_type are always equal + // or propagate_on_container_move_assignment is true + void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::true_type ) { + move_content(std::move(other)); + } + + // Move assigns the hash table to other is any instances of allocator_type are not always equal + // and propagate_on_container_move_assignment is false + void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::false_type ) { + if (my_segments.get_allocator() == other.my_segments.get_allocator()) { + move_content(std::move(other)); + } else { + // TODO: guards for exceptions + internal_move(std::move(other)); + } + } + + void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = */std::true_type ) { + internal_swap_fields(other); + } + + void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = 
*/std::false_type ) { + __TBB_ASSERT(my_segments.get_allocator() == other.my_segments.get_allocator(), + "Swapping with unequal allocators is not allowed"); + internal_swap_fields(other); + } + + void internal_swap_fields( concurrent_unordered_base& other ) { + node_ptr first_node = my_head.next(); + my_head.set_next(other.my_head.next()); + other.my_head.set_next(first_node); + + size_type current_size = my_size.load(std::memory_order_relaxed); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_size.store(current_size, std::memory_order_relaxed); + + size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); + my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.my_bucket_count.store(bucket_count, std::memory_order_relaxed); + + using std::swap; + swap(my_max_load_factor, other.my_max_load_factor); + swap(my_hash_compare, other.my_hash_compare); + my_segments.swap(other.my_segments); + + // swap() method from segment table swaps all of the segments including the first segment + // We should restore it to my_head. Without it the first segment of the container will point + // to other.my_head. + my_segments[0].store(&my_head, std::memory_order_relaxed); + other.my_segments[0].store(&other.my_head, std::memory_order_relaxed); + } + + // A regular order key has its original hash value reversed and the last bit set + static constexpr sokey_type split_order_key_regular( sokey_type hash ) { + return reverse_bits(hash) | 0x1; + } + + // A dummy order key has its original hash value reversed and the last bit unset + static constexpr sokey_type split_order_key_dummy( sokey_type hash ) { + return reverse_bits(hash) & ~sokey_type(0x1); + } + + size_type get_parent( size_type bucket ) const { + // Unset bucket's most significant turned-on bit + __TBB_ASSERT(bucket != 0, "Unable to get_parent of the bucket 0"); + size_type msb = tbb::detail::log2(bucket); + return bucket & ~(size_type(1) << msb); + } + + size_type get_next_bucket_index( size_type bucket ) const { + size_type bits = tbb::detail::log2(my_bucket_count.load(std::memory_order_relaxed)); + size_type reversed_next = reverse_n_bits(bucket, bits) + 1; + return reverse_n_bits(reversed_next, bits); + } + + std::atomic my_size; + std::atomic my_bucket_count; + float my_max_load_factor; + hash_compare_type my_hash_compare; + + list_node_type my_head; // Head node for split ordered list + unordered_segment_table my_segments; // Segment table of pointers to nodes + + template + friend class solist_iterator; + + template + friend class concurrent_unordered_base; +}; // class concurrent_unordered_base + +template +bool operator==( const concurrent_unordered_base& lhs, + const concurrent_unordered_base& rhs ) { + if (&lhs == &rhs) { return true; } + if (lhs.size() != rhs.size()) { return false; } + +#if _MSC_VER + // Passing "unchecked" iterators to std::permutation with 3 parameters + // causes compiler warnings. 
+ // The workaround is to use overload with 4 parameters, which is + // available since C++14 - minimally supported version on MSVC + return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +#else + return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin()); +#endif +} + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +bool operator!=( const concurrent_unordered_base& lhs, + const concurrent_unordered_base& rhs ) { + return !(lhs == rhs); +} +#endif + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4127 is back +#endif + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__concurrent_unordered_base_H diff --git a/third_party/tbb/detail/_config.h b/third_party/tbb/detail/_config.h new file mode 100644 index 000000000..ae3383243 --- /dev/null +++ b/third_party/tbb/detail/_config.h @@ -0,0 +1,530 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__config_H +#define __TBB_detail__config_H + +/** This header is supposed to contain macro definitions only. + The macros defined here are intended to control such aspects of TBB build as + - presence of compiler features + - compilation modes + - feature sets + - known compiler/platform issues +**/ + +/* Check which standard library we use. */ +#include "third_party/libcxx/cstddef" + +#ifdef __has_include +#if __has_include() +#include "third_party/libcxx/version" +#endif +#endif + +#include "third_party/tbb/detail/_export.h" + +#if _MSC_VER + #define __TBB_EXPORTED_FUNC __cdecl + #define __TBB_EXPORTED_METHOD __thiscall +#else + #define __TBB_EXPORTED_FUNC + #define __TBB_EXPORTED_METHOD +#endif + +#if defined(_MSVC_LANG) + #define __TBB_LANG _MSVC_LANG +#else + #define __TBB_LANG __cplusplus +#endif // _MSVC_LANG + +#define __TBB_CPP14_PRESENT (__TBB_LANG >= 201402L) +#define __TBB_CPP17_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP20_PRESENT (__TBB_LANG >= 202002L) + +#if __INTEL_COMPILER || _MSC_VER + #define __TBB_NOINLINE(decl) __declspec(noinline) decl +#elif __GNUC__ + #define __TBB_NOINLINE(decl) decl __attribute__ ((noinline)) +#else + #define __TBB_NOINLINE(decl) decl +#endif + +#define __TBB_STRING_AUX(x) #x +#define __TBB_STRING(x) __TBB_STRING_AUX(x) + +// Note that when ICC or Clang is in use, __TBB_GCC_VERSION might not fully match +// the actual GCC version on the system. +#define __TBB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) + +/* Check which standard library we use. */ + +// Prior to GCC 7, GNU libstdc++ did not have a convenient version macro. +// Therefore we use different ways to detect its version. +#ifdef TBB_USE_GLIBCXX_VERSION + // The version is explicitly specified in our public TBB_USE_GLIBCXX_VERSION macro. + // Its format should match the __TBB_GCC_VERSION above, e.g. 70301 for libstdc++ coming with GCC 7.3.1. 
+ #define __TBB_GLIBCXX_VERSION TBB_USE_GLIBCXX_VERSION +#elif _GLIBCXX_RELEASE && _GLIBCXX_RELEASE != __GNUC__ + // Reported versions of GCC and libstdc++ do not match; trust the latter + #define __TBB_GLIBCXX_VERSION (_GLIBCXX_RELEASE*10000) +#elif __GLIBCPP__ || __GLIBCXX__ + // The version macro is not defined or matches the GCC version; use __TBB_GCC_VERSION + #define __TBB_GLIBCXX_VERSION __TBB_GCC_VERSION +#endif + +#if __clang__ + // according to clang documentation, version can be vendor specific + #define __TBB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) +#endif + +/** Macro helpers **/ + +#define __TBB_CONCAT_AUX(A,B) A##B +// The additional level of indirection is needed to expand macros A and B (not to get the AB macro). +// See [cpp.subst] and [cpp.concat] for more details. +#define __TBB_CONCAT(A,B) __TBB_CONCAT_AUX(A,B) +// The IGNORED argument and comma are needed to always have 2 arguments (even when A is empty). +#define __TBB_IS_MACRO_EMPTY(A,IGNORED) __TBB_CONCAT_AUX(__TBB_MACRO_EMPTY,A) +#define __TBB_MACRO_EMPTY 1 + +#if _M_X64 || _M_ARM64 + #define __TBB_W(name) name##64 +#else + #define __TBB_W(name) name +#endif + +/** User controlled TBB features & modes **/ + +#ifndef TBB_USE_DEBUG + /* + There are four cases that are supported: + 1. "_DEBUG is undefined" means "no debug"; + 2. "_DEBUG defined to something that is evaluated to 0" (including "garbage", as per [cpp.cond]) means "no debug"; + 3. "_DEBUG defined to something that is evaluated to a non-zero value" means "debug"; + 4. "_DEBUG defined to nothing (empty)" means "debug". + */ + #ifdef _DEBUG + // Check if _DEBUG is empty. + #define __TBB_IS__DEBUG_EMPTY (__TBB_IS_MACRO_EMPTY(_DEBUG,IGNORED)==__TBB_MACRO_EMPTY) + #if __TBB_IS__DEBUG_EMPTY + #define TBB_USE_DEBUG 1 + #else + #define TBB_USE_DEBUG _DEBUG + #endif // __TBB_IS__DEBUG_EMPTY + #else + #define TBB_USE_DEBUG 0 + #endif // _DEBUG +#endif // TBB_USE_DEBUG + +#ifndef TBB_USE_ASSERT + #define TBB_USE_ASSERT TBB_USE_DEBUG +#endif // TBB_USE_ASSERT + +#ifndef TBB_USE_PROFILING_TOOLS +#if TBB_USE_DEBUG + #define TBB_USE_PROFILING_TOOLS 2 +#else // TBB_USE_DEBUG + #define TBB_USE_PROFILING_TOOLS 0 +#endif // TBB_USE_DEBUG +#endif // TBB_USE_PROFILING_TOOLS + +// Exceptions support cases +#if !(__EXCEPTIONS || defined(_CPPUNWIND) || __SUNPRO_CC) + #if TBB_USE_EXCEPTIONS + #error Compilation settings do not support exception handling. Please do not set TBB_USE_EXCEPTIONS macro or set it to 0. 
+ #elif !defined(TBB_USE_EXCEPTIONS) + #define TBB_USE_EXCEPTIONS 0 + #endif +#elif !defined(TBB_USE_EXCEPTIONS) + #define TBB_USE_EXCEPTIONS 1 +#endif + +/** Preprocessor symbols to determine HW architecture **/ + +#if _WIN32 || _WIN64 + #if defined(_M_X64) || defined(__x86_64__) // the latter for MinGW support + #define __TBB_x86_64 1 + #elif defined(_M_IA64) + #define __TBB_ipf 1 + #elif defined(_M_IX86) || defined(__i386__) // the latter for MinGW support + #define __TBB_x86_32 1 + #else + #define __TBB_generic_arch 1 + #endif +#else /* Assume generic Unix */ + #if __x86_64__ + #define __TBB_x86_64 1 + #elif __ia64__ + #define __TBB_ipf 1 + #elif __i386__||__i386 // __i386 is for Sun OS + #define __TBB_x86_32 1 + #else + #define __TBB_generic_arch 1 + #endif +#endif + +/** Windows API or POSIX API **/ + +#if _WIN32 || _WIN64 + #define __TBB_USE_WINAPI 1 +#else + #define __TBB_USE_POSIX 1 +#endif + +/** Internal TBB features & modes **/ + +/** __TBB_DYNAMIC_LOAD_ENABLED describes the system possibility to load shared libraries at run time **/ +#ifndef __TBB_DYNAMIC_LOAD_ENABLED + #define __TBB_DYNAMIC_LOAD_ENABLED 1 +#endif + +/** __TBB_WIN8UI_SUPPORT enables support of Windows* Store Apps and limit a possibility to load + shared libraries at run time only from application container **/ +#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP + #define __TBB_WIN8UI_SUPPORT 1 +#else + #define __TBB_WIN8UI_SUPPORT 0 +#endif + +/** __TBB_WEAK_SYMBOLS_PRESENT denotes that the system supports the weak symbol mechanism **/ +#ifndef __TBB_WEAK_SYMBOLS_PRESENT + #define __TBB_WEAK_SYMBOLS_PRESENT ( !_WIN32 && !__APPLE__ && !__sun && (__TBB_GCC_VERSION >= 40000 || __INTEL_COMPILER ) ) +#endif + +/** Presence of compiler features **/ + +#if __clang__ && !__INTEL_COMPILER + #define __TBB_USE_OPTIONAL_RTTI __has_feature(cxx_rtti) +#elif defined(_CPPRTTI) + #define __TBB_USE_OPTIONAL_RTTI 1 +#else + #define __TBB_USE_OPTIONAL_RTTI (__GXX_RTTI || __RTTI || __INTEL_RTTI__) +#endif + +/** Address sanitizer detection **/ +#ifdef __SANITIZE_ADDRESS__ + #define __TBB_USE_ADDRESS_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(address_sanitizer) + #define __TBB_USE_ADDRESS_SANITIZER 1 +#endif +#endif + +/** Library features presence macros **/ + +#define __TBB_CPP14_INTEGER_SEQUENCE_PRESENT (__TBB_LANG >= 201402L) +#define __TBB_CPP17_INVOKE_PRESENT (__TBB_LANG >= 201703L) + +// TODO: Remove the condition(__INTEL_COMPILER > 2021) from the __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +// macro when this feature start working correctly on this compiler. 
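+// Editorial sketch of how the detection macros defined just below are typically
+// consumed (my_wrapper is a hypothetical type used only for illustration, not
+// part of this patch):
+//
+//   template <typename T>
+//   struct my_wrapper { my_wrapper(const T&) {} };
+//
+//   #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
+//   template <typename T>
+//   my_wrapper(const T&) -> my_wrapper<T>;   // only declare guides when the compiler supports them
+//   #endif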
+#if __INTEL_COMPILER && (!_MSC_VER || __INTEL_CXX11_MOVE__) + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__INTEL_COMPILER > 2021 && __TBB_LANG >= 201703L) + #define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition +#elif __clang__ + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__has_feature(cxx_variable_templates)) + #define __TBB_CPP20_CONCEPTS_PRESENT 0 // TODO: add a mechanism for future addition + #ifdef __cpp_deduction_guides + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201611L) + #else + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT 0 + #endif +#elif __GNUC__ + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L && __TBB_GCC_VERSION >= 50000) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201606L) + #define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 201709L && __TBB_GCC_VERSION >= 100201) +#elif _MSC_VER + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (_MSC_FULL_VER >= 190023918 && (!__INTEL_COMPILER || __INTEL_COMPILER >= 1700)) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (_MSC_VER >= 1914 && __TBB_LANG >= 201703L && (!__INTEL_COMPILER || __INTEL_COMPILER > 2021)) + #define __TBB_CPP20_CONCEPTS_PRESENT (_MSC_VER >= 1923 && __TBB_LANG >= 202002L) // TODO: INTEL_COMPILER? +#else + #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L) + #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__TBB_LANG >= 201703L) + #define __TBB_CPP20_CONCEPTS_PRESENT (__TBB_LANG >= 202002L) +#endif + +// GCC4.8 on RHEL7 does not support std::get_new_handler +#define __TBB_CPP11_GET_NEW_HANDLER_PRESENT (_MSC_VER >= 1900 || __TBB_GLIBCXX_VERSION >= 40900 && __GXX_EXPERIMENTAL_CXX0X__ || _LIBCPP_VERSION) +// GCC4.8 on RHEL7 does not support std::is_trivially_copyable +#define __TBB_CPP11_TYPE_PROPERTIES_PRESENT (_LIBCPP_VERSION || _MSC_VER >= 1700 || (__TBB_GLIBCXX_VERSION >= 50000 && __GXX_EXPERIMENTAL_CXX0X__)) + +#define __TBB_CPP17_MEMORY_RESOURCE_PRESENT (_MSC_VER >= 1913 && (__TBB_LANG > 201402L) || \ + __TBB_GLIBCXX_VERSION >= 90000 && __TBB_LANG >= 201703L) +#define __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT (_MSC_VER >= 1911) +#define __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP17_IS_SWAPPABLE_PRESENT (__TBB_LANG >= 201703L) + +#if defined(__cpp_impl_three_way_comparison) && defined(__cpp_lib_three_way_comparison) + #define __TBB_CPP20_COMPARISONS_PRESENT ((__cpp_impl_three_way_comparison >= 201907L) && (__cpp_lib_three_way_comparison >= 201907L)) +#else + #define __TBB_CPP20_COMPARISONS_PRESENT 0 +#endif + +#define __TBB_RESUMABLE_TASKS (!__TBB_WIN8UI_SUPPORT && !__ANDROID__ && !__QNXNTO__ && (!__linux__ || __GLIBC__)) + +/* This macro marks incomplete code or comments describing ideas which are considered for the future. + * See also for plain comment with TODO and FIXME marks for small improvement opportunities. + */ +#define __TBB_TODO 0 + +/* Check which standard library we use. */ +/* __TBB_SYMBOL is defined only while processing exported symbols list where C++ is not allowed. 
*/ +#if !defined(__TBB_SYMBOL) && !__TBB_CONFIG_PREPROC_ONLY + #include "third_party/libcxx/cstddef" +#endif + +/** Target OS is either iOS* or iOS* simulator **/ +#if __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ + #define __TBB_IOS 1 +#endif + +#if __APPLE__ + #if __INTEL_COMPILER && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1099 \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101000 + // ICC does not correctly set the macro if -mmacosx-min-version is not specified + #define __TBB_MACOS_TARGET_VERSION (100000 + 10*(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ - 1000)) + #else + #define __TBB_MACOS_TARGET_VERSION __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ + #endif +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) + #define __TBB_GCC_WARNING_IGNORED_ATTRIBUTES_PRESENT (__TBB_GCC_VERSION >= 60100) +#endif + +#if __GNUC__ && !__INTEL_COMPILER && !__clang__ + #define __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN (__TBB_GCC_VERSION <= 40805) +#endif + +#define __TBB_CPP17_FALLTHROUGH_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP17_NODISCARD_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_FALLTHROUGH_PRESENT (__TBB_GCC_VERSION >= 70000 && !__INTEL_COMPILER) + +#if __TBB_CPP17_FALLTHROUGH_PRESENT + #define __TBB_fallthrough [[fallthrough]] +#elif __TBB_FALLTHROUGH_PRESENT + #define __TBB_fallthrough __attribute__ ((fallthrough)) +#else + #define __TBB_fallthrough +#endif + +#if __TBB_CPP17_NODISCARD_PRESENT + #define __TBB_nodiscard [[nodiscard]] +#elif __clang__ || __GNUC__ + #define __TBB_nodiscard __attribute__((warn_unused_result)) +#else + #define __TBB_nodiscard +#endif + +#define __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT (_MSC_VER >= 1900 || __GLIBCXX__ && __cpp_lib_uncaught_exceptions \ + || _LIBCPP_VERSION >= 3700 && (!__TBB_MACOS_TARGET_VERSION || __TBB_MACOS_TARGET_VERSION >= 101200)) + +#define __TBB_TSX_INTRINSICS_PRESENT (__RTM__ || __INTEL_COMPILER || (_MSC_VER>=1700 && (__TBB_x86_64 || __TBB_x86_32))) + +#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || __TBB_GCC_VERSION >= 110000 || __TBB_CLANG_VERSION >= 120000) \ + && (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) && !__ANDROID__) + +/** Internal TBB features & modes **/ + +/** __TBB_SOURCE_DIRECTLY_INCLUDED is a mode used in whitebox testing when + it's necessary to test internal functions not exported from TBB DLLs +**/ +#if (_WIN32||_WIN64) && (__TBB_SOURCE_DIRECTLY_INCLUDED || TBB_USE_PREVIEW_BINARY) + #define __TBB_NO_IMPLICIT_LINKAGE 1 + #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 +#endif + +#if (__TBB_BUILD || __TBBMALLOC_BUILD || __TBBMALLOCPROXY_BUILD || __TBBBIND_BUILD) && !defined(__TBB_NO_IMPLICIT_LINKAGE) + #define __TBB_NO_IMPLICIT_LINKAGE 1 +#endif + +#if _MSC_VER + #if !__TBB_NO_IMPLICIT_LINKAGE + #ifdef _DEBUG + #pragma comment(lib, "tbb12_debug.lib") + #else + #pragma comment(lib, "tbb12.lib") + #endif + #endif +#endif + +#ifndef __TBB_SCHEDULER_OBSERVER + #define __TBB_SCHEDULER_OBSERVER 1 +#endif /* __TBB_SCHEDULER_OBSERVER */ + +#ifndef __TBB_FP_CONTEXT + #define __TBB_FP_CONTEXT 1 +#endif /* __TBB_FP_CONTEXT */ + +#define __TBB_RECYCLE_TO_ENQUEUE __TBB_BUILD // keep non-official + +#ifndef __TBB_ARENA_OBSERVER + #define __TBB_ARENA_OBSERVER __TBB_SCHEDULER_OBSERVER +#endif /* __TBB_ARENA_OBSERVER */ + +#ifndef __TBB_ARENA_BINDING + #define __TBB_ARENA_BINDING 1 +#endif + +#ifndef __TBB_ENQUEUE_ENFORCED_CONCURRENCY + #define __TBB_ENQUEUE_ENFORCED_CONCURRENCY 1 +#endif + +#if !defined(__TBB_SURVIVE_THREAD_SWITCH) && \ + (_WIN32 
|| _WIN64 || __APPLE__ || (defined(__unix__) && !__ANDROID__)) + #define __TBB_SURVIVE_THREAD_SWITCH 1 +#endif /* __TBB_SURVIVE_THREAD_SWITCH */ + +#ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES + #define TBB_PREVIEW_FLOW_GRAPH_FEATURES __TBB_CPF_BUILD +#endif + +#ifndef __TBB_DEFAULT_PARTITIONER + #define __TBB_DEFAULT_PARTITIONER tbb::auto_partitioner +#endif + +#ifndef __TBB_FLOW_TRACE_CODEPTR + #define __TBB_FLOW_TRACE_CODEPTR __TBB_CPF_BUILD +#endif + +// Intel(R) C++ Compiler starts analyzing usages of the deprecated content at the template +// instantiation site, which is too late for suppression of the corresponding messages for internal +// stuff. +#if !defined(__INTEL_COMPILER) && (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) + #if (__TBB_LANG >= 201402L && (!defined(_MSC_VER) || _MSC_VER >= 1920)) + #define __TBB_DEPRECATED [[deprecated]] + #define __TBB_DEPRECATED_MSG(msg) [[deprecated(msg)]] + #elif _MSC_VER + #define __TBB_DEPRECATED __declspec(deprecated) + #define __TBB_DEPRECATED_MSG(msg) __declspec(deprecated(msg)) + #elif (__GNUC__ && __TBB_GCC_VERSION >= 40805) || __clang__ + #define __TBB_DEPRECATED __attribute__((deprecated)) + #define __TBB_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) + #endif +#endif // !defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) + +#if !defined(__TBB_DEPRECATED) + #define __TBB_DEPRECATED + #define __TBB_DEPRECATED_MSG(msg) +#elif !defined(__TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES) + // Suppress deprecated messages from self + #define __TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES 1 +#endif + +#if defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) && (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) + #define __TBB_DEPRECATED_VERBOSE __TBB_DEPRECATED + #define __TBB_DEPRECATED_VERBOSE_MSG(msg) __TBB_DEPRECATED_MSG(msg) +#else + #define __TBB_DEPRECATED_VERBOSE + #define __TBB_DEPRECATED_VERBOSE_MSG(msg) +#endif // (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) + +#if (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) && !(__TBB_LANG >= 201103L || _MSC_VER >= 1900) + #pragma message("TBB Warning: Support for C++98/03 is deprecated. Please use the compiler that supports C++11 features at least.") +#endif + +#ifdef _VARIADIC_MAX + #define __TBB_VARIADIC_MAX _VARIADIC_MAX +#else + #if _MSC_VER == 1700 + #define __TBB_VARIADIC_MAX 5 // VS11 setting, issue resolved in VS12 + #elif _MSC_VER == 1600 + #define __TBB_VARIADIC_MAX 10 // VS10 setting + #else + #define __TBB_VARIADIC_MAX 15 + #endif +#endif + +#if __SANITIZE_THREAD__ + #define __TBB_USE_THREAD_SANITIZER 1 +#elif defined(__has_feature) +#if __has_feature(thread_sanitizer) + #define __TBB_USE_THREAD_SANITIZER 1 +#endif +#endif + +#ifndef __TBB_USE_SANITIZERS +#define __TBB_USE_SANITIZERS (__TBB_USE_THREAD_SANITIZER || __TBB_USE_ADDRESS_SANITIZER) +#endif + +#ifndef __TBB_RESUMABLE_TASKS_USE_THREADS +#define __TBB_RESUMABLE_TASKS_USE_THREADS __TBB_USE_SANITIZERS +#endif + +#ifndef __TBB_USE_CONSTRAINTS +#define __TBB_USE_CONSTRAINTS 1 +#endif + +#ifndef __TBB_STRICT_CONSTRAINTS +#define __TBB_STRICT_CONSTRAINTS 1 +#endif + +#if __TBB_CPP20_CONCEPTS_PRESENT && __TBB_USE_CONSTRAINTS + #define __TBB_requires(...) requires __VA_ARGS__ +#else // __TBB_CPP20_CONCEPTS_PRESENT + #define __TBB_requires(...) +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +/** Macros of the form __TBB_XXX_BROKEN denote known issues that are caused by + the bugs in compilers, standard or OS specific libraries. 
They should be + removed as soon as the corresponding bugs are fixed or the buggy OS/compiler + versions go out of the support list. +**/ + +// Some STL containers not support allocator traits in old GCC versions +#if __GXX_EXPERIMENTAL_CXX0X__ && __TBB_GLIBCXX_VERSION <= 50301 + #define TBB_ALLOCATOR_TRAITS_BROKEN 1 +#endif + +// GCC 4.8 C++ standard library implements std::this_thread::yield as no-op. +#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900 + #define __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN 1 +#endif + +/** End of __TBB_XXX_BROKEN macro section **/ + +#if defined(_MSC_VER) && _MSC_VER>=1500 && !defined(__INTEL_COMPILER) + // A macro to suppress erroneous or benign "unreachable code" MSVC warning (4702) + #define __TBB_MSVC_UNREACHABLE_CODE_IGNORED 1 +#endif + +// Many OS versions (Android 4.0.[0-3] for example) need workaround for dlopen to avoid non-recursive loader lock hang +// Setting the workaround for all compile targets ($APP_PLATFORM) below Android 4.4 (android-19) +#if __ANDROID__ + // MISSING #include +#endif + +#define __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING (TBB_PREVIEW_FLOW_GRAPH_FEATURES) + +#ifndef __TBB_PREVIEW_CRITICAL_TASKS +#define __TBB_PREVIEW_CRITICAL_TASKS 1 +#endif + +#ifndef __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +#define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES) +#endif + +#if TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS +#define __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 +#endif + +#if TBB_PREVIEW_TASK_GROUP_EXTENSIONS || __TBB_BUILD +#define __TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1 +#endif + +#endif // __TBB_detail__config_H diff --git a/third_party/tbb/detail/_containers_helpers.h b/third_party/tbb/detail/_containers_helpers.h new file mode 100644 index 000000000..a583a911c --- /dev/null +++ b/third_party/tbb/detail/_containers_helpers.h @@ -0,0 +1,68 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__containers_helpers_H +#define __TBB_detail__containers_helpers_H + +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/functional" + +namespace tbb { +namespace detail { +inline namespace d0 { + +template +struct comp_is_transparent : std::false_type {}; + +template +struct comp_is_transparent> : std::true_type {}; + +template +struct has_transparent_key_equal : std::false_type { using type = KeyEqual; }; + +template +struct has_transparent_key_equal> : std::true_type { + using type = typename Hasher::transparent_key_equal; + static_assert(comp_is_transparent::value, "Hash::transparent_key_equal::is_transparent is not valid or does not denote a type."); + static_assert((std::is_same>::value || + std::is_same::value), "KeyEqual is a different type than equal_to or Hash::transparent_key_equal."); + }; + +struct is_iterator_impl { +template +using iter_traits_category = typename std::iterator_traits::iterator_category; + +template +using input_iter_category = typename std::enable_if>::value>::type; +}; // struct is_iterator_impl + +template +using is_input_iterator = supports; + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template +inline constexpr bool is_input_iterator_v = is_input_iterator::value; +#endif + +} // inline namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__containers_helpers_H diff --git a/third_party/tbb/detail/_exception.h b/third_party/tbb/detail/_exception.h new file mode 100644 index 000000000..e209862f2 --- /dev/null +++ b/third_party/tbb/detail/_exception.h @@ -0,0 +1,89 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__exception_H +#define __TBB__exception_H + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/libcxx/new" // std::bad_alloc +#include "third_party/libcxx/exception" // std::exception +#include "third_party/libcxx/stdexcept" // std::runtime_error + +namespace tbb { +namespace detail { +inline namespace d0 { +enum class exception_id { + bad_alloc = 1, + bad_last_alloc, + user_abort, + nonpositive_step, + out_of_range, + reservation_length_error, + missing_wait, + invalid_load_factor, + invalid_key, + bad_tagged_msg_cast, + unsafe_wait, + last_entry +}; +} // namespace d0 + +#if _MSC_VER + #pragma warning(disable: 4275) +#endif + +namespace r1 { +//! Exception for concurrent containers +class TBB_EXPORT bad_last_alloc : public std::bad_alloc { +public: + const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; +}; + +//! Exception for user-initiated abort +class TBB_EXPORT user_abort : public std::exception { +public: + const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; +}; + +//! 
Exception for missing wait on structured_task_group +class TBB_EXPORT missing_wait : public std::exception { +public: + const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; +}; + +//! Exception for impossible finalization of task_sheduler_handle +class TBB_EXPORT unsafe_wait : public std::runtime_error { +public: + unsafe_wait(const char* msg) : std::runtime_error(msg) {} +}; + +//! Gathers all throw operators in one place. +/** Its purpose is to minimize code bloat that can be caused by throw operators + scattered in multiple places, especially in templates. **/ +TBB_EXPORT void __TBB_EXPORTED_FUNC throw_exception ( exception_id ); +} // namespace r1 + +inline namespace d0 { +using r1::throw_exception; +} // namespace d0 + +} // namespace detail +} // namespace tbb + +#endif // __TBB__exception_H + diff --git a/third_party/tbb/detail/_export.h b/third_party/tbb/detail/_export.h new file mode 100644 index 000000000..515095917 --- /dev/null +++ b/third_party/tbb/detail/_export.h @@ -0,0 +1,47 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__export_H +#define __TBB_detail__export_H + +#if defined(__MINGW32__) + #define _EXPORT __declspec(dllexport) +#elif defined(_WIN32) || defined(__unix__) || defined(__APPLE__) // Use .def files for these + #define _EXPORT +#else + #error "Unknown platform/compiler" +#endif + +#if __TBB_BUILD + #define TBB_EXPORT _EXPORT +#else + #define TBB_EXPORT +#endif + +#if __TBBMALLOC_BUILD + #define TBBMALLOC_EXPORT _EXPORT +#else + #define TBBMALLOC_EXPORT +#endif + +#if __TBBBIND_BUILD + #define TBBBIND_EXPORT _EXPORT +#else + #define TBBBIND_EXPORT +#endif + +#endif diff --git a/third_party/tbb/detail/_flow_graph_body_impl.h b/third_party/tbb/detail/_flow_graph_body_impl.h new file mode 100644 index 000000000..8515c94be --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_body_impl.h @@ -0,0 +1,386 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_body_impl_H +#define __TBB__flow_graph_body_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 (in flow_graph.h) + +typedef std::uint64_t tag_value; + + +// TODO revamp: find out if there is already helper for has_policy. 
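The templates that follow implement a compile-time membership test: has_policy asks whether a given policy type occurs anywhere in a Policy<...> pack (possibly nested). As a rough, self-contained sketch of the same idiom, with deliberately hypothetical names rather than the ones used by TBB:

#include <type_traits>

template <typename... Policies> struct PolicyList {};   // stand-in for the Policy<...> pack below

// Primary template: does Wanted occur among the listed policy types?
template <typename Wanted, typename... Rest> struct contains_policy;

template <typename Wanted>
struct contains_policy<Wanted> : std::false_type {};    // empty list: not found

template <typename Wanted, typename First, typename... Rest>
struct contains_policy<Wanted, First, Rest...>
    : std::integral_constant<bool, std::is_same<Wanted, First>::value ||
                                   contains_policy<Wanted, Rest...>::value> {};

// A wrapped PolicyList is unwrapped and searched.
template <typename Wanted, typename... Policies>
struct contains_policy<Wanted, PolicyList<Policies...>>
    : contains_policy<Wanted, Policies...> {};

static_assert( contains_policy<int,  PolicyList<char, int>>::value, "present in the pack");
static_assert(!contains_policy<long, PolicyList<char, int>>::value, "absent from the pack");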
+template struct Policy {}; + +template struct has_policy; + +template +struct has_policy : + std::integral_constant::value || + has_policy::value> {}; + +template +struct has_policy : + std::integral_constant::value> {}; + +template +struct has_policy > : has_policy {}; + +namespace graph_policy_namespace { + + struct rejecting { }; + struct reserving { }; + struct queueing { }; + struct lightweight { }; + + // K == type of field used for key-matching. Each tag-matching port will be provided + // functor that, given an object accepted by the port, will return the + /// field of type K being used for matching. + template::type > > + __TBB_requires(tbb::detail::hash_compare) + struct key_matching { + typedef K key_type; + typedef typename std::decay::type base_key_type; + typedef KHash hash_compare_type; + }; + + // old tag_matching join's new specifier + typedef key_matching tag_matching; + + // Aliases for Policy combinations + typedef Policy queueing_lightweight; + typedef Policy rejecting_lightweight; + +} // namespace graph_policy_namespace + +// -------------- function_body containers ---------------------- + +//! A functor that takes no input and generates a value of type Output +template< typename Output > +class input_body : no_assign { +public: + virtual ~input_body() {} + virtual Output operator()(flow_control& fc) = 0; + virtual input_body* clone() = 0; +}; + +//! The leaf for input_body +template< typename Output, typename Body> +class input_body_leaf : public input_body { +public: + input_body_leaf( const Body &_body ) : body(_body) { } + Output operator()(flow_control& fc) override { return body(fc); } + input_body_leaf* clone() override { + return new input_body_leaf< Output, Body >(body); + } + Body get_body() { return body; } +private: + Body body; +}; + +//! A functor that takes an Input and generates an Output +template< typename Input, typename Output > +class function_body : no_assign { +public: + virtual ~function_body() {} + virtual Output operator()(const Input &input) = 0; + virtual function_body* clone() = 0; +}; + +//! the leaf for function_body +template +class function_body_leaf : public function_body< Input, Output > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + Output operator()(const Input &i) override { return tbb::detail::invoke(body,i); } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< Input, Output, B >(body); + } +private: + B body; +}; + +//! the leaf for function_body specialized for Input and output of continue_msg +template +class function_body_leaf< continue_msg, continue_msg, B> : public function_body< continue_msg, continue_msg > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + continue_msg operator()( const continue_msg &i ) override { + body(i); + return i; + } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< continue_msg, continue_msg, B >(body); + } +private: + B body; +}; + +//! the leaf for function_body specialized for Output of continue_msg +template +class function_body_leaf< Input, continue_msg, B> : public function_body< Input, continue_msg > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + continue_msg operator()(const Input &i) override { + body(i); + return continue_msg(); + } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< Input, continue_msg, B >(body); + } +private: + B body; +}; + +//! 
the leaf for function_body specialized for Input of continue_msg +template +class function_body_leaf< continue_msg, Output, B > : public function_body< continue_msg, Output > { +public: + function_body_leaf( const B &_body ) : body(_body) { } + Output operator()(const continue_msg &i) override { + return body(i); + } + B get_body() { return body; } + function_body_leaf* clone() override { + return new function_body_leaf< continue_msg, Output, B >(body); + } +private: + B body; +}; + +//! function_body that takes an Input and a set of output ports +template +class multifunction_body : no_assign { +public: + virtual ~multifunction_body () {} + virtual void operator()(const Input &/* input*/, OutputSet &/*oset*/) = 0; + virtual multifunction_body* clone() = 0; + virtual void* get_body_ptr() = 0; +}; + +//! leaf for multifunction. OutputSet can be a std::tuple or a vector. +template +class multifunction_body_leaf : public multifunction_body { +public: + multifunction_body_leaf(const B &_body) : body(_body) { } + void operator()(const Input &input, OutputSet &oset) override { + tbb::detail::invoke(body, input, oset); // body may explicitly put() to one or more of oset. + } + void* get_body_ptr() override { return &body; } + multifunction_body_leaf* clone() override { + return new multifunction_body_leaf(body); + } + +private: + B body; +}; + +// ------ function bodies for hash_buffers and key-matching joins. + +template +class type_to_key_function_body : no_assign { + public: + virtual ~type_to_key_function_body() {} + virtual Output operator()(const Input &input) = 0; // returns an Output + virtual type_to_key_function_body* clone() = 0; +}; + +// specialization for ref output +template +class type_to_key_function_body : no_assign { + public: + virtual ~type_to_key_function_body() {} + virtual const Output & operator()(const Input &input) = 0; // returns a const Output& + virtual type_to_key_function_body* clone() = 0; +}; + +template +class type_to_key_function_body_leaf : public type_to_key_function_body { +public: + type_to_key_function_body_leaf( const B &_body ) : body(_body) { } + Output operator()(const Input &i) override { return tbb::detail::invoke(body, i); } + type_to_key_function_body_leaf* clone() override { + return new type_to_key_function_body_leaf< Input, Output, B>(body); + } +private: + B body; +}; + +template +class type_to_key_function_body_leaf : public type_to_key_function_body< Input, Output&> { +public: + type_to_key_function_body_leaf( const B &_body ) : body(_body) { } + const Output& operator()(const Input &i) override { + return tbb::detail::invoke(body, i); + } + type_to_key_function_body_leaf* clone() override { + return new type_to_key_function_body_leaf< Input, Output&, B>(body); + } +private: + B body; +}; + +// --------------------------- end of function_body containers ------------------------ + +// --------------------------- node task bodies --------------------------------------- + +//! 
A task that calls a node's forward_task function +template< typename NodeType > +class forward_task_bypass : public graph_task { + NodeType &my_node; +public: + forward_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n + , node_priority_t node_priority = no_priority + ) : graph_task(g, allocator, node_priority), + my_node(n) {} + + task* execute(execution_data& ed) override { + graph_task* next_task = my_node.forward_task(); + if (SUCCESSFULLY_ENQUEUED == next_task) + next_task = nullptr; + else if (next_task) + next_task = prioritize_task(my_node.graph_reference(), *next_task); + finalize(ed); + return next_task; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +}; + +//! A task that calls a node's apply_body_bypass function, passing in an input of type Input +// return the task* unless it is SUCCESSFULLY_ENQUEUED, in which case return nullptr +template< typename NodeType, typename Input > +class apply_body_task_bypass : public graph_task { + NodeType &my_node; + Input my_input; +public: + + apply_body_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n, const Input &i + , node_priority_t node_priority = no_priority + ) : graph_task(g, allocator, node_priority), + my_node(n), my_input(i) {} + + task* execute(execution_data& ed) override { + graph_task* next_task = my_node.apply_body_bypass( my_input ); + if (SUCCESSFULLY_ENQUEUED == next_task) + next_task = nullptr; + else if (next_task) + next_task = prioritize_task(my_node.graph_reference(), *next_task); + finalize(ed); + return next_task; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +}; + +//! A task that calls a node's apply_body_bypass function with no input +template< typename NodeType > +class input_node_task_bypass : public graph_task { + NodeType &my_node; +public: + input_node_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n ) + : graph_task(g, allocator), my_node(n) {} + + task* execute(execution_data& ed) override { + graph_task* next_task = my_node.apply_body_bypass( ); + if (SUCCESSFULLY_ENQUEUED == next_task) + next_task = nullptr; + else if (next_task) + next_task = prioritize_task(my_node.graph_reference(), *next_task); + finalize(ed); + return next_task; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +}; + +// ------------------------ end of node task bodies ----------------------------------- + +template +class threshold_regulator; + +template +class threshold_regulator::value>::type> + : public receiver, no_copy +{ + T* my_node; +protected: + + graph_task* try_put_task( const DecrementType& value ) override { + graph_task* result = my_node->decrement_counter( value ); + if( !result ) + result = SUCCESSFULLY_ENQUEUED; + return result; + } + + graph& graph_reference() const override { + return my_node->my_graph; + } + + template friend class limiter_node; + void reset_receiver( reset_flags ) {} + +public: + threshold_regulator(T* owner) : my_node(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } +}; + +template +class threshold_regulator : public continue_receiver, no_copy { + + T *my_node; + + graph_task* execute() override { + return my_node->decrement_counter( 1 ); + } + +protected: + + graph& graph_reference() const override { + return my_node->my_graph; + } + +public: + + typedef continue_msg input_type; + typedef continue_msg output_type; + threshold_regulator(T* owner) + : 
continue_receiver( /*number_of_predecessors=*/0, no_priority ), my_node(owner) + { + // Do not work with the passed pointer here as it may not be fully initialized yet + } +}; + +#endif // __TBB__flow_graph_body_impl_H diff --git a/third_party/tbb/detail/_flow_graph_cache_impl.h b/third_party/tbb/detail/_flow_graph_cache_impl.h new file mode 100644 index 000000000..b75545324 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_cache_impl.h @@ -0,0 +1,435 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_cache_impl_H +#define __TBB__flow_graph_cache_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 (in flow_graph.h) + +//! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock. +template< typename T, typename M=spin_mutex > +class node_cache { + public: + + typedef size_t size_type; + + bool empty() { + typename mutex_type::scoped_lock lock( my_mutex ); + return internal_empty(); + } + + void add( T &n ) { + typename mutex_type::scoped_lock lock( my_mutex ); + internal_push(n); + } + + void remove( T &n ) { + typename mutex_type::scoped_lock lock( my_mutex ); + for ( size_t i = internal_size(); i != 0; --i ) { + T &s = internal_pop(); + if ( &s == &n ) + break; // only remove one predecessor per request + internal_push(s); + } + } + + void clear() { + while( !my_q.empty()) (void)my_q.pop(); + } + +protected: + + typedef M mutex_type; + mutex_type my_mutex; + std::queue< T * > my_q; + + // Assumes lock is held + inline bool internal_empty( ) { + return my_q.empty(); + } + + // Assumes lock is held + inline size_type internal_size( ) { + return my_q.size(); + } + + // Assumes lock is held + inline void internal_push( T &n ) { + my_q.push(&n); + } + + // Assumes lock is held + inline T &internal_pop() { + T *v = my_q.front(); + my_q.pop(); + return *v; + } + +}; + +//! A cache of predecessors that only supports try_get +template< typename T, typename M=spin_mutex > +class predecessor_cache : public node_cache< sender, M > { +public: + typedef M mutex_type; + typedef T output_type; + typedef sender predecessor_type; + typedef receiver successor_type; + + predecessor_cache( successor_type* owner ) : my_owner( owner ) { + __TBB_ASSERT( my_owner, "predecessor_cache should have an owner." 
); + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + bool get_item( output_type& v ) { + + bool msg = false; + + do { + predecessor_type *src; + { + typename mutex_type::scoped_lock lock(this->my_mutex); + if ( this->internal_empty() ) { + break; + } + src = &this->internal_pop(); + } + + // Try to get from this sender + msg = src->try_get( v ); + + if (msg == false) { + // Relinquish ownership of the edge + register_successor(*src, *my_owner); + } else { + // Retain ownership of the edge + this->add(*src); + } + } while ( msg == false ); + return msg; + } + + // If we are removing arcs (rf_clear_edges), call clear() rather than reset(). + void reset() { + for(;;) { + predecessor_type *src; + { + if (this->internal_empty()) break; + src = &this->internal_pop(); + } + register_successor(*src, *my_owner); + } + } + +protected: + successor_type* my_owner; +}; + +//! An cache of predecessors that supports requests and reservations +template< typename T, typename M=spin_mutex > +class reservable_predecessor_cache : public predecessor_cache< T, M > { +public: + typedef M mutex_type; + typedef T output_type; + typedef sender predecessor_type; + typedef receiver successor_type; + + reservable_predecessor_cache( successor_type* owner ) + : predecessor_cache(owner), reserved_src(nullptr) + { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + bool try_reserve( output_type &v ) { + bool msg = false; + + do { + predecessor_type* pred = nullptr; + { + typename mutex_type::scoped_lock lock(this->my_mutex); + if ( reserved_src.load(std::memory_order_relaxed) || this->internal_empty() ) + return false; + + pred = &this->internal_pop(); + reserved_src.store(pred, std::memory_order_relaxed); + } + + // Try to get from this sender + msg = pred->try_reserve( v ); + + if (msg == false) { + typename mutex_type::scoped_lock lock(this->my_mutex); + // Relinquish ownership of the edge + register_successor( *pred, *this->my_owner ); + reserved_src.store(nullptr, std::memory_order_relaxed); + } else { + // Retain ownership of the edge + this->add( *pred); + } + } while ( msg == false ); + + return msg; + } + + bool try_release() { + reserved_src.load(std::memory_order_relaxed)->try_release(); + reserved_src.store(nullptr, std::memory_order_relaxed); + return true; + } + + bool try_consume() { + reserved_src.load(std::memory_order_relaxed)->try_consume(); + reserved_src.store(nullptr, std::memory_order_relaxed); + return true; + } + + void reset() { + reserved_src.store(nullptr, std::memory_order_relaxed); + predecessor_cache::reset(); + } + + void clear() { + reserved_src.store(nullptr, std::memory_order_relaxed); + predecessor_cache::clear(); + } + +private: + std::atomic reserved_src; +}; + + +//! 
An abstract cache of successors +template +class successor_cache : no_copy { +protected: + + typedef M mutex_type; + mutex_type my_mutex; + + typedef receiver successor_type; + typedef receiver* pointer_type; + typedef sender owner_type; + // TODO revamp: introduce heapified collection of successors for strict priorities + typedef std::list< pointer_type > successors_type; + successors_type my_successors; + + owner_type* my_owner; + +public: + successor_cache( owner_type* owner ) : my_owner(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + virtual ~successor_cache() {} + + void register_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + if( r.priority() != no_priority ) + my_successors.push_front( &r ); + else + my_successors.push_back( &r ); + } + + void remove_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + for ( typename successors_type::iterator i = my_successors.begin(); + i != my_successors.end(); ++i ) { + if ( *i == & r ) { + my_successors.erase(i); + break; + } + } + } + + bool empty() { + typename mutex_type::scoped_lock l(my_mutex, false); + return my_successors.empty(); + } + + void clear() { + my_successors.clear(); + } + + virtual graph_task* try_put_task( const T& t ) = 0; +}; // successor_cache + +//! An abstract cache of successors, specialized to continue_msg +template +class successor_cache< continue_msg, M > : no_copy { +protected: + + typedef M mutex_type; + mutex_type my_mutex; + + typedef receiver successor_type; + typedef receiver* pointer_type; + typedef sender owner_type; + typedef std::list< pointer_type > successors_type; + successors_type my_successors; + owner_type* my_owner; + +public: + successor_cache( sender* owner ) : my_owner(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + virtual ~successor_cache() {} + + void register_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + if( r.priority() != no_priority ) + my_successors.push_front( &r ); + else + my_successors.push_back( &r ); + __TBB_ASSERT( my_owner, "Cache of successors must have an owner." ); + if ( r.is_continue_receiver() ) { + r.register_predecessor( *my_owner ); + } + } + + void remove_successor( successor_type& r ) { + typename mutex_type::scoped_lock l(my_mutex, true); + for ( successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) { + if ( *i == &r ) { + __TBB_ASSERT(my_owner, "Cache of successors must have an owner."); + // TODO: check if we need to test for continue_receiver before removing from r. + r.remove_predecessor( *my_owner ); + my_successors.erase(i); + break; + } + } + } + + bool empty() { + typename mutex_type::scoped_lock l(my_mutex, false); + return my_successors.empty(); + } + + void clear() { + my_successors.clear(); + } + + virtual graph_task* try_put_task( const continue_msg& t ) = 0; +}; // successor_cache< continue_msg > + +//! 
A cache of successors that are broadcast to +template +class broadcast_cache : public successor_cache { + typedef successor_cache base_type; + typedef M mutex_type; + typedef typename successor_cache::successors_type successors_type; + +public: + + broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + // as above, but call try_put_task instead, and return the last task we received (if any) + graph_task* try_put_task( const T &t ) override { + graph_task * last_task = nullptr; + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); + typename successors_type::iterator i = this->my_successors.begin(); + while ( i != this->my_successors.end() ) { + graph_task *new_task = (*i)->try_put_task(t); + // workaround for icc bug + graph& graph_ref = (*i)->graph_reference(); + last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary + if(new_task) { + ++i; + } + else { // failed + if ( (*i)->register_predecessor(*this->my_owner) ) { + i = this->my_successors.erase(i); + } else { + ++i; + } + } + } + return last_task; + } + + // call try_put_task and return list of received tasks + bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) { + bool is_at_least_one_put_successful = false; + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); + typename successors_type::iterator i = this->my_successors.begin(); + while ( i != this->my_successors.end() ) { + graph_task * new_task = (*i)->try_put_task(t); + if(new_task) { + ++i; + if(new_task != SUCCESSFULLY_ENQUEUED) { + tasks.push_back(*new_task); + } + is_at_least_one_put_successful = true; + } + else { // failed + if ( (*i)->register_predecessor(*this->my_owner) ) { + i = this->my_successors.erase(i); + } else { + ++i; + } + } + } + return is_at_least_one_put_successful; + } +}; + +//! A cache of successors that are put in a round-robin fashion +template +class round_robin_cache : public successor_cache { + typedef successor_cache base_type; + typedef size_t size_type; + typedef M mutex_type; + typedef typename successor_cache::successors_type successors_type; + +public: + + round_robin_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + size_type size() { + typename mutex_type::scoped_lock l(this->my_mutex, false); + return this->my_successors.size(); + } + + graph_task* try_put_task( const T &t ) override { + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); + typename successors_type::iterator i = this->my_successors.begin(); + while ( i != this->my_successors.end() ) { + graph_task* new_task = (*i)->try_put_task(t); + if ( new_task ) { + return new_task; + } else { + if ( (*i)->register_predecessor(*this->my_owner) ) { + i = this->my_successors.erase(i); + } + else { + ++i; + } + } + } + return nullptr; + } +}; + +#endif // __TBB__flow_graph_cache_impl_H diff --git a/third_party/tbb/detail/_flow_graph_impl.h b/third_party/tbb/detail/_flow_graph_impl.h new file mode 100644 index 000000000..38ee6bf9e --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_impl.h @@ -0,0 +1,477 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_impl_H +#define __TBB_flow_graph_impl_H + +// // MISSING #include "../config.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/task_arena.h" +#include "third_party/tbb/flow_graph_abstractions.h" + +#include "third_party/tbb/concurrent_priority_queue.h" + +#include "third_party/libcxx/list" + +namespace tbb { +namespace detail { + +namespace d1 { + +class graph_task; +static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1; +typedef unsigned int node_priority_t; +static const node_priority_t no_priority = node_priority_t(0); + +class graph; +class graph_node; + +template +class graph_iterator { + friend class graph; + friend class graph_node; +public: + typedef size_t size_type; + typedef GraphNodeType value_type; + typedef GraphNodeType* pointer; + typedef GraphNodeType& reference; + typedef const GraphNodeType& const_reference; + typedef std::forward_iterator_tag iterator_category; + + //! Copy constructor + graph_iterator(const graph_iterator& other) : + my_graph(other.my_graph), current_node(other.current_node) + {} + + //! Assignment + graph_iterator& operator=(const graph_iterator& other) { + if (this != &other) { + my_graph = other.my_graph; + current_node = other.current_node; + } + return *this; + } + + //! Dereference + reference operator*() const; + + //! Dereference + pointer operator->() const; + + //! Equality + bool operator==(const graph_iterator& other) const { + return ((my_graph == other.my_graph) && (current_node == other.current_node)); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + //! Inequality + bool operator!=(const graph_iterator& other) const { return !(operator==(other)); } +#endif + + //! Pre-increment + graph_iterator& operator++() { + internal_forward(); + return *this; + } + + //! Post-increment + graph_iterator operator++(int) { + graph_iterator result = *this; + operator++(); + return result; + } + +private: + // the graph over which we are iterating + GraphContainerType *my_graph; + // pointer into my_graph's my_nodes list + pointer current_node; + + //! Private initializing constructor for begin() and end() iterators + graph_iterator(GraphContainerType *g, bool begin); + void internal_forward(); +}; // class graph_iterator + +// flags to modify the behavior of the graph reset(). Can be combined. +enum reset_flags { + rf_reset_protocol = 0, + rf_reset_bodies = 1 << 0, // delete the current node body, reset to a copy of the initial node body. + rf_clear_edges = 1 << 1 // delete edges +}; + +void activate_graph(graph& g); +void deactivate_graph(graph& g); +bool is_graph_active(graph& g); +graph_task* prioritize_task(graph& g, graph_task& arena_task); +void spawn_in_graph_arena(graph& g, graph_task& arena_task); +void enqueue_in_graph_arena(graph &g, graph_task& arena_task); + +class graph; + +//! Base class for tasks generated by graph nodes. 
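The plumbing declared above (activate_graph, prioritize_task, spawn_in_graph_arena, and the graph_task hierarchy defined next) is what runs when ordinary user code drives a flow graph. A minimal end-to-end sketch against the public API, assuming the public header lands at third_party/tbb/flow_graph.h in this tree:

#include "third_party/tbb/flow_graph.h"   // assumed public-header path for this patch
#include <iostream>

int main() {
    tbb::flow::graph g;

    // A successful try_put() on a node ends up creating a graph_task and handing it to
    // spawn_in_graph_arena()/prioritize_task(), declared above, for execution in g's arena.
    tbb::flow::function_node<int, int> doubler(
        g, tbb::flow::unlimited, [](int v) { return 2 * v; });
    tbb::flow::function_node<int> printer(
        g, tbb::flow::serial, [](int v) { std::cout << v << '\n'; });

    tbb::flow::make_edge(doubler, printer);
    for (int i = 0; i < 4; ++i)
        doubler.try_put(i);

    g.wait_for_all();   // returns once the graph's wait_context bookkeeping drains
    return 0;
}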
+class graph_task : public task { +public: + graph_task(graph& g, small_object_allocator& allocator + , node_priority_t node_priority = no_priority + ) + : my_graph(g) + , priority(node_priority) + , my_allocator(allocator) + {} + graph& my_graph; // graph instance the task belongs to + // TODO revamp: rename to my_priority + node_priority_t priority; + template + void destruct_and_deallocate(const execution_data& ed); +protected: + template + void finalize(const execution_data& ed); +private: + // To organize task_list + graph_task* my_next{ nullptr }; + small_object_allocator my_allocator; + // TODO revamp: elaborate internal interfaces to avoid friends declarations + friend class graph_task_list; + friend graph_task* prioritize_task(graph& g, graph_task& gt); +}; + +struct graph_task_comparator { + bool operator()(const graph_task* left, const graph_task* right) { + return left->priority < right->priority; + } +}; + +typedef tbb::concurrent_priority_queue graph_task_priority_queue_t; + +class priority_task_selector : public task { +public: + priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator) + : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {} + task* execute(execution_data& ed) override { + next_task(); + __TBB_ASSERT(my_task, nullptr); + task* t_next = my_task->execute(ed); + my_allocator.delete_object(this, ed); + return t_next; + } + task* cancel(execution_data& ed) override { + if (!my_task) { + next_task(); + } + __TBB_ASSERT(my_task, nullptr); + task* t_next = my_task->cancel(ed); + my_allocator.delete_object(this, ed); + return t_next; + } +private: + void next_task() { + // TODO revamp: hold functors in priority queue instead of real tasks + bool result = my_priority_queue.try_pop(my_task); + __TBB_ASSERT_EX(result, "Number of critical tasks for scheduler and tasks" + " in graph's priority queue mismatched"); + __TBB_ASSERT(my_task && my_task != SUCCESSFULLY_ENQUEUED, + "Incorrect task submitted to graph priority queue"); + __TBB_ASSERT(my_task->priority != no_priority, + "Tasks from graph's priority queue must have priority"); + } + + graph_task_priority_queue_t& my_priority_queue; + small_object_allocator my_allocator; + graph_task* my_task; +}; + +template class run_and_put_task; +template class run_task; + +//******************************************************************************** +// graph tasks helpers +//******************************************************************************** + +//! The list of graph tasks +class graph_task_list : no_copy { +private: + graph_task* my_first; + graph_task** my_next_ptr; +public: + //! Construct empty list + graph_task_list() : my_first(nullptr), my_next_ptr(&my_first) {} + + //! True if list is empty; false otherwise. + bool empty() const { return !my_first; } + + //! Push task onto back of list. + void push_back(graph_task& task) { + task.my_next = nullptr; + *my_next_ptr = &task; + my_next_ptr = &task.my_next; + } + + //! Pop the front task from the list. + graph_task& pop_front() { + __TBB_ASSERT(!empty(), "attempt to pop item from empty task_list"); + graph_task* result = my_first; + my_first = result->my_next; + if (!my_first) { + my_next_ptr = &my_first; + } + return *result; + } +}; + +//! 
The graph class +/** This class serves as a handle to the graph */ +class graph : no_copy, public graph_proxy { + friend class graph_node; + + void prepare_task_arena(bool reinit = false) { + if (reinit) { + __TBB_ASSERT(my_task_arena, "task arena is nullptr"); + my_task_arena->terminate(); + my_task_arena->initialize(task_arena::attach()); + } + else { + __TBB_ASSERT(my_task_arena == nullptr, "task arena is not nullptr"); + my_task_arena = new task_arena(task_arena::attach()); + } + if (!my_task_arena->is_active()) // failed to attach + my_task_arena->initialize(); // create a new, default-initialized arena + __TBB_ASSERT(my_task_arena->is_active(), "task arena is not active"); + } + +public: + //! Constructs a graph with isolated task_group_context + graph(); + + //! Constructs a graph with use_this_context as context + explicit graph(task_group_context& use_this_context); + + //! Destroys the graph. + /** Calls wait_for_all, then destroys the root task and context. */ + ~graph(); + + //! Used to register that an external entity may still interact with the graph. + /** The graph will not return from wait_for_all until a matching number of release_wait calls is + made. */ + void reserve_wait() override; + + //! Deregisters an external entity that may have interacted with the graph. + /** The graph will not return from wait_for_all until all the number of reserve_wait calls + matches the number of release_wait calls. */ + void release_wait() override; + + //! Wait until graph is idle and the number of release_wait calls equals to the number of + //! reserve_wait calls. + /** The waiting thread will go off and steal work while it is blocked in the wait_for_all. */ + void wait_for_all() { + cancelled = false; + caught_exception = false; + try_call([this] { + my_task_arena->execute([this] { + wait(my_wait_context, *my_context); + }); + cancelled = my_context->is_group_execution_cancelled(); + }).on_exception([this] { + my_context->reset(); + caught_exception = true; + cancelled = true; + }); + // TODO: the "if" condition below is just a work-around to support the concurrent wait + // mode. The cancellation and exception mechanisms are still broken in this mode. + // Consider using task group not to re-implement the same functionality. + if (!(my_context->traits() & task_group_context::concurrent_wait)) { + my_context->reset(); // consistent with behavior in catch() + } + } + + // TODO revamp: consider adding getter for task_group_context. + + // ITERATORS + template + friend class graph_iterator; + + // Graph iterator typedefs + typedef graph_iterator iterator; + typedef graph_iterator const_iterator; + + // Graph iterator constructors + //! start iterator + iterator begin(); + //! end iterator + iterator end(); + //! start const iterator + const_iterator begin() const; + //! end const iterator + const_iterator end() const; + //! start const iterator + const_iterator cbegin() const; + //! end const iterator + const_iterator cend() const; + + // thread-unsafe state reset. + void reset(reset_flags f = rf_reset_protocol); + + //! cancels execution of the associated task_group_context + void cancel(); + + //! 
return status of graph execution + bool is_cancelled() { return cancelled; } + bool exception_thrown() { return caught_exception; } + +private: + wait_context my_wait_context; + task_group_context *my_context; + bool own_context; + bool cancelled; + bool caught_exception; + bool my_is_active; + + graph_node *my_nodes, *my_nodes_last; + + tbb::spin_mutex nodelist_mutex; + void register_node(graph_node *n); + void remove_node(graph_node *n); + + task_arena* my_task_arena; + + graph_task_priority_queue_t my_priority_queue; + + friend void activate_graph(graph& g); + friend void deactivate_graph(graph& g); + friend bool is_graph_active(graph& g); + friend graph_task* prioritize_task(graph& g, graph_task& arena_task); + friend void spawn_in_graph_arena(graph& g, graph_task& arena_task); + friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task); + + friend class task_arena_base; + +}; // class graph + +template +inline void graph_task::destruct_and_deallocate(const execution_data& ed) { + auto allocator = my_allocator; + // TODO: investigate if direct call of derived destructor gives any benefits. + this->~graph_task(); + allocator.deallocate(static_cast(this), ed); +} + +template +inline void graph_task::finalize(const execution_data& ed) { + graph& g = my_graph; + destruct_and_deallocate(ed); + g.release_wait(); +} + +//******************************************************************************** +// end of graph tasks helpers +//******************************************************************************** + + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +class get_graph_helper; +#endif + +//! The base of all graph nodes. +class graph_node : no_copy { + friend class graph; + template + friend class graph_iterator; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + friend class get_graph_helper; +#endif + +protected: + graph& my_graph; + graph& graph_reference() const { + // TODO revamp: propagate graph_reference() method to all the reference places. + return my_graph; + } + graph_node* next = nullptr; + graph_node* prev = nullptr; +public: + explicit graph_node(graph& g); + + virtual ~graph_node(); + +protected: + // performs the reset on an individual node. + virtual void reset_node(reset_flags f = rf_reset_protocol) = 0; +}; // class graph_node + +inline void activate_graph(graph& g) { + g.my_is_active = true; +} + +inline void deactivate_graph(graph& g) { + g.my_is_active = false; +} + +inline bool is_graph_active(graph& g) { + return g.my_is_active; +} + +inline graph_task* prioritize_task(graph& g, graph_task& gt) { + if( no_priority == gt.priority ) + return > + + //! Non-preemptive priority pattern. The original task is submitted as a work item to the + //! priority queue, and a new critical task is created to take and execute a work item with + //! the highest known priority. The reference counting responsibility is transferred (via + //! allocate_continuation) to the new task. + task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); + __TBB_ASSERT( critical_task, "bad_alloc?" ); + g.my_priority_queue.push(>); + using tbb::detail::d1::submit; + submit( *critical_task, *g.my_task_arena, *g.my_context, /*as_critical=*/true ); + return nullptr; +} + +//! 
Spawns a task inside graph arena +inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) { + if (is_graph_active(g)) { + task* gt = prioritize_task(g, arena_task); + if( !gt ) + return; + + __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr); + submit( *gt, *g.my_task_arena, *g.my_context +#if __TBB_PREVIEW_CRITICAL_TASKS + , /*as_critical=*/false +#endif + ); + } +} + +// TODO revamp: unify *_in_graph_arena functions + +//! Enqueues a task inside graph arena +inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) { + if (is_graph_active(g)) { + __TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" ); + + // TODO revamp: decide on the approach that does not postpone critical task + if( task* gt = prioritize_task(g, arena_task) ) + submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false); + } +} + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_flow_graph_impl_H diff --git a/third_party/tbb/detail/_flow_graph_indexer_impl.h b/third_party/tbb/detail/_flow_graph_indexer_impl.h new file mode 100644 index 000000000..fdb4f6ab6 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_indexer_impl.h @@ -0,0 +1,352 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_indexer_impl_H +#define __TBB__flow_graph_indexer_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 + +#include "third_party/tbb/detail/_flow_graph_types_impl.h" + + // Output of the indexer_node is a tbb::flow::tagged_msg, and will be of + // the form tagged_msg + // where the value of tag will indicate which result was put to the + // successor. 
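On the consuming side, the tag carried by the tagged_msg is the index of the input port that produced the value, so a successor can dispatch on tag() and extract the payload with cast_to. A short usage sketch against the public API (include path assumed to match this patch's layout):

#include "third_party/tbb/flow_graph.h"   // assumed public-header path for this patch
#include <iostream>

int main() {
    using namespace tbb::flow;
    graph g;

    indexer_node<int, float> merge(g);                    // output_type is a tagged_msg
    using msg_t = indexer_node<int, float>::output_type;

    function_node<msg_t> sink(g, serial, [](const msg_t& m) {
        if (m.tag() == 0)                                 // value arrived on input port 0
            std::cout << "int: "   << cast_to<int>(m)   << '\n';
        else                                              // value arrived on input port 1
            std::cout << "float: " << cast_to<float>(m) << '\n';
    });

    make_edge(merge, sink);
    input_port<0>(merge).try_put(42);
    input_port<1>(merge).try_put(1.5f);
    g.wait_for_all();
    return 0;
}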
+ + template + graph_task* do_try_put(const T &v, void *p) { + typename IndexerNodeBaseType::output_type o(K, v); + return reinterpret_cast(p)->try_put_task(&o); + } + + template + struct indexer_helper { + template + static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { + typedef typename std::tuple_element::type T; + graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + std::get(my_input).set_up(p, indexer_node_put_task, g); + indexer_helper::template set_indexer_node_pointer(my_input, p, g); + } + }; + + template + struct indexer_helper { + template + static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { + typedef typename std::tuple_element<0, TupleTypes>::type T; + graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + std::get<0>(my_input).set_up(p, indexer_node_put_task, g); + } + }; + + template + class indexer_input_port : public receiver { + private: + void* my_indexer_ptr; + typedef graph_task* (* forward_function_ptr)(T const &, void* ); + forward_function_ptr my_try_put_task; + graph* my_graph; + public: + void set_up(void* p, forward_function_ptr f, graph& g) { + my_indexer_ptr = p; + my_try_put_task = f; + my_graph = &g; + } + + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task(const T &v) override { + return my_try_put_task(v, my_indexer_ptr); + } + + graph& graph_reference() const override { + return *my_graph; + } + }; + + template + class indexer_node_FE { + public: + static const int N = std::tuple_size::value; + typedef OutputType output_type; + typedef InputTuple input_type; + + // Some versions of Intel(R) C++ Compiler fail to generate an implicit constructor for the class which has std::tuple as a member. + indexer_node_FE() : my_inputs() {} + + input_type &input_ports() { return my_inputs; } + protected: + input_type my_inputs; + }; + + //! 
indexer_node_base + template + class indexer_node_base : public graph_node, public indexer_node_FE, + public sender { + protected: + using graph_node::my_graph; + public: + static const size_t N = std::tuple_size::value; + typedef OutputType output_type; + typedef StructTypes tuple_types; + typedef typename sender::successor_type successor_type; + typedef indexer_node_FE input_ports_type; + + private: + // ----------- Aggregator ------------ + enum op_type { reg_succ, rem_succ, try__put_task + }; + typedef indexer_node_base class_type; + + class indexer_node_base_operation : public aggregated_operation { + public: + char type; + union { + output_type const *my_arg; + successor_type *my_succ; + graph_task* bypass_t; + }; + indexer_node_base_operation(const output_type* e, op_type t) : + type(char(t)), my_arg(e) {} + indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)), + my_succ(const_cast(&s)) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(indexer_node_base_operation* op_list) { + indexer_node_base_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + + case reg_succ: + my_successors.register_successor(*(current->my_succ)); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + + case rem_succ: + my_successors.remove_successor(*(current->my_succ)); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case try__put_task: { + current->bypass_t = my_successors.try_put_task(*(current->my_arg)); + current->status.store( SUCCEEDED, std::memory_order_release); // return of try_put_task actual return value + } + break; + } + } + } + // ---------- end aggregator ----------- + public: + indexer_node_base(graph& g) : graph_node(g), input_ports_type(), my_successors(this) { + indexer_helper::set_indexer_node_pointer(this->my_inputs, this, g); + my_aggregator.initialize_handler(handler_type(this)); + } + + indexer_node_base(const indexer_node_base& other) + : graph_node(other.my_graph), input_ports_type(), sender(), my_successors(this) + { + indexer_helper::set_indexer_node_pointer(this->my_inputs, this, other.my_graph); + my_aggregator.initialize_handler(handler_type(this)); + } + + bool register_successor(successor_type &r) override { + indexer_node_base_operation op_data(r, reg_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + bool remove_successor( successor_type &r) override { + indexer_node_base_operation op_data(r, rem_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + graph_task* try_put_task(output_type const *v) { // not a virtual method in this class + indexer_node_base_operation op_data(v, try__put_task); + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + + protected: + void reset_node(reset_flags f) override { + if(f & rf_clear_edges) { + my_successors.clear(); + } + } + + private: + broadcast_cache my_successors; + }; //indexer_node_base + + + template struct input_types; + + template + struct input_types<1, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef tagged_msg type; + }; + + template + struct input_types<2, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef tagged_msg type; + }; + + template + struct 
input_types<3, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef tagged_msg type; + }; + + template + struct input_types<4, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef tagged_msg type; + }; + + template + struct input_types<5, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef tagged_msg type; + }; + + template + struct input_types<6, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef tagged_msg type; + }; + + template + struct input_types<7, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef tagged_msg type; + }; + + + template + struct input_types<8, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef typename std::tuple_element<7, InputTuple>::type eighth_type; + typedef tagged_msg type; + }; + + + template + struct input_types<9, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef typename std::tuple_element<7, InputTuple>::type eighth_type; + typedef typename std::tuple_element<8, InputTuple>::type nineth_type; + typedef 
tagged_msg type; + }; + + template + struct input_types<10, InputTuple> { + typedef typename std::tuple_element<0, InputTuple>::type first_type; + typedef typename std::tuple_element<1, InputTuple>::type second_type; + typedef typename std::tuple_element<2, InputTuple>::type third_type; + typedef typename std::tuple_element<3, InputTuple>::type fourth_type; + typedef typename std::tuple_element<4, InputTuple>::type fifth_type; + typedef typename std::tuple_element<5, InputTuple>::type sixth_type; + typedef typename std::tuple_element<6, InputTuple>::type seventh_type; + typedef typename std::tuple_element<7, InputTuple>::type eighth_type; + typedef typename std::tuple_element<8, InputTuple>::type nineth_type; + typedef typename std::tuple_element<9, InputTuple>::type tenth_type; + typedef tagged_msg type; + }; + + // type generators + template + struct indexer_types : public input_types::value, OutputTuple> { + static const int N = std::tuple_size::value; + typedef typename input_types::type output_type; + typedef typename wrap_tuple_elements::type input_ports_type; + typedef indexer_node_FE indexer_FE_type; + typedef indexer_node_base indexer_base_type; + }; + + template + class unfolded_indexer_node : public indexer_types::indexer_base_type { + public: + typedef typename indexer_types::input_ports_type input_ports_type; + typedef OutputTuple tuple_types; + typedef typename indexer_types::output_type output_type; + private: + typedef typename indexer_types::indexer_base_type base_type; + public: + unfolded_indexer_node(graph& g) : base_type(g) {} + unfolded_indexer_node(const unfolded_indexer_node &other) : base_type(other) {} + }; + +#endif /* __TBB__flow_graph_indexer_impl_H */ diff --git a/third_party/tbb/detail/_flow_graph_item_buffer_impl.h b/third_party/tbb/detail/_flow_graph_item_buffer_impl.h new file mode 100644 index 000000000..6cce8b62c --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_item_buffer_impl.h @@ -0,0 +1,280 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_item_buffer_impl_H +#define __TBB__flow_graph_item_buffer_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +#include "third_party/tbb/detail/_aligned_space.h" + +// in namespace tbb::flow::interfaceX (included in _flow_graph_node_impl.h) + +//! Expandable buffer of items. The possible operations are push, pop, +//* tests for empty and so forth. No mutual exclusion is built in. +//* objects are constructed into and explicitly-destroyed. get_my_item gives +// a read-only reference to the item in the buffer. set_my_item may be called +// with either an empty or occupied slot. 
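// A minimal usage sketch, assuming only the public API from <tbb/flow_graph.h>;
// queue_node is one of the public buffering nodes that sit on top of the
// item_buffer defined below. Names and values here are illustrative.
#include <tbb/flow_graph.h>
#include <iostream>

int main() {
    tbb::flow::graph g;
    tbb::flow::queue_node<int> q(g);            // FIFO buffer of ints
    for (int i = 0; i < 4; ++i) q.try_put(i);   // items stay buffered, no successor attached
    g.wait_for_all();
    int v;
    while (q.try_get(v)) std::cout << v << ' '; // pops in FIFO order: 0 1 2 3
    std::cout << '\n';
    return 0;
}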
+ +template > +class item_buffer { +public: + typedef T item_type; + enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 }; +protected: + typedef size_t size_type; + typedef std::pair aligned_space_item; + typedef aligned_space buffer_item_type; + typedef typename allocator_traits::template rebind_alloc allocator_type; + buffer_item_type *my_array; + size_type my_array_size; + static const size_type initial_buffer_size = 4; + size_type my_head; + size_type my_tail; + + bool buffer_empty() const { return my_head == my_tail; } + + aligned_space_item &item(size_type i) { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + return *my_array[i & (my_array_size - 1) ].begin(); + } + + const aligned_space_item &item(size_type i) const { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + return *my_array[i & (my_array_size-1)].begin(); + } + + bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); } +#if TBB_USE_ASSERT + bool my_item_reserved(size_type i) const { return item(i).second == reserved_item; } +#endif + + // object management in buffer + const item_type &get_my_item(size_t i) const { + __TBB_ASSERT(my_item_valid(i),"attempt to get invalid item"); + item_type* itm = const_cast(reinterpret_cast(&item(i).first)); + return *itm; + } + + // may be called with an empty slot or a slot that has already been constructed into. + void set_my_item(size_t i, const item_type &o) { + if(item(i).second != no_item) { + destroy_item(i); + } + new(&(item(i).first)) item_type(o); + item(i).second = has_item; + } + + // destructively-fetch an object from the buffer + void fetch_item(size_t i, item_type &o) { + __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); + o = get_my_item(i); // could have std::move assign semantics + destroy_item(i); + } + + // move an existing item from one slot to another. The moved-to slot must be unoccupied, + // the moved-from slot must exist and not be reserved. The after, from will be empty, + // to will be occupied but not reserved + void move_item(size_t to, size_t from) { + __TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot"); + __TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot"); + set_my_item(to, get_my_item(from)); // could have std::move semantics + destroy_item(from); + + } + + // put an item in an empty slot. 
Return true if successful, else false + bool place_item(size_t here, const item_type &me) { +#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES + if(my_item_valid(here)) return false; +#endif + set_my_item(here, me); + return true; + } + + // could be implemented with std::move semantics + void swap_items(size_t i, size_t j) { + __TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)"); + item_type temp = get_my_item(i); + set_my_item(i, get_my_item(j)); + set_my_item(j, temp); + } + + void destroy_item(size_type i) { + __TBB_ASSERT(my_item_valid(i), "destruction of invalid item"); + item(i).first.~item_type(); + item(i).second = no_item; + } + + // returns the front element + const item_type& front() const + { + __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item"); + return get_my_item(my_head); + } + + // returns the back element + const item_type& back() const + { + __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch head non-item"); + return get_my_item(my_tail - 1); + } + + // following methods are for reservation of the front of a buffer. + void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); item(i).second = reserved_item; } + void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); item(i).second = has_item; } + + void destroy_front() { destroy_item(my_head); ++my_head; } + void destroy_back() { destroy_item(my_tail-1); --my_tail; } + + // we have to be able to test against a new tail value without changing my_tail + // grow_array doesn't work if we change my_tail when the old array is too small + size_type size(size_t new_tail = 0) { return (new_tail ? new_tail : my_tail) - my_head; } + size_type capacity() { return my_array_size; } + // sequencer_node does not use this method, so we don't + // need a version that passes in the new_tail value. + bool buffer_full() { return size() >= capacity(); } + + //! Grows the internal array. + void grow_my_array( size_t minimum_size ) { + // test that we haven't made the structure inconsistent. + __TBB_ASSERT(capacity() >= my_tail - my_head, "total items exceed capacity"); + size_type new_size = my_array_size ? 2*my_array_size : initial_buffer_size; + while( new_sizesecond = no_item; } + + for( size_type i=my_head; ifirst); + (void)new(new_space) item_type(get_my_item(i)); + new_array[i&(new_size-1)].begin()->second = item(i).second; + } + } + + clean_up_buffer(/*reset_pointers*/false); + + my_array = new_array; + my_array_size = new_size; + } + + bool push_back(item_type &v) { + if(buffer_full()) { + grow_my_array(size() + 1); + } + set_my_item(my_tail, v); + ++my_tail; + return true; + } + + bool pop_back(item_type &v) { + if (!my_item_valid(my_tail-1)) { + return false; + } + v = this->back(); + destroy_back(); + return true; + } + + bool pop_front(item_type &v) { + if(!my_item_valid(my_head)) { + return false; + } + v = this->front(); + destroy_front(); + return true; + } + + // This is used both for reset and for grow_my_array. In the case of grow_my_array + // we want to retain the values of the head and tail. 
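// A standalone sketch, with simplified and assumed names, of the masked-index
// storage and power-of-two growth used by item(), grow_my_array() and
// push_back() above: the logical indices my_head/my_tail only ever grow, and
// the slot for logical index i is array[i & (capacity - 1)].
#include <cassert>
#include <cstddef>
#include <vector>

struct ring_sketch {
    std::vector<int> slots = std::vector<int>(4);   // capacity stays a power of two
    std::size_t head = 0, tail = 0;

    std::size_t capacity() const { return slots.size(); }
    std::size_t size() const { return tail - head; }

    void grow(std::size_t minimum) {
        std::size_t new_cap = capacity();
        while (new_cap < minimum) new_cap *= 2;
        std::vector<int> bigger(new_cap);
        for (std::size_t i = head; i < tail; ++i)                     // re-home live items,
            bigger[i & (new_cap - 1)] = slots[i & (capacity() - 1)];  // keeping their logical index
        slots.swap(bigger);
    }

    void push_back(int v) {
        if (size() >= capacity()) grow(size() + 1);
        slots[tail & (capacity() - 1)] = v;
        ++tail;
    }

    int pop_front() {
        assert(size() != 0);
        int v = slots[head & (capacity() - 1)];
        ++head;
        return v;
    }
};

int main() {
    ring_sketch r;
    for (int i = 0; i < 10; ++i) r.push_back(i);    // forces growth 4 -> 8 -> 16
    for (int i = 0; i < 10; ++i) assert(r.pop_front() == i);
    return 0;
}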
+ void clean_up_buffer(bool reset_pointers) { + if (my_array) { + for( size_type i=my_head; i > +class reservable_item_buffer : public item_buffer { +protected: + using item_buffer::my_item_valid; + using item_buffer::my_head; + +public: + reservable_item_buffer() : item_buffer(), my_reserved(false) {} + void reset() {my_reserved = false; item_buffer::reset(); } +protected: + + bool reserve_front(T &v) { + if(my_reserved || !my_item_valid(this->my_head)) return false; + my_reserved = true; + // reserving the head + v = this->front(); + this->reserve_item(this->my_head); + return true; + } + + void consume_front() { + __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item"); + this->destroy_front(); + my_reserved = false; + } + + void release_front() { + __TBB_ASSERT(my_reserved, "Attempt to release a non-reserved item"); + this->release_item(this->my_head); + my_reserved = false; + } + + bool my_reserved; +}; + +#endif // __TBB__flow_graph_item_buffer_impl_H diff --git a/third_party/tbb/detail/_flow_graph_join_impl.h b/third_party/tbb/detail/_flow_graph_join_impl.h new file mode 100644 index 000000000..1253a0662 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_join_impl.h @@ -0,0 +1,1709 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_join_impl_H +#define __TBB__flow_graph_join_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included into namespace tbb::detail::d1 + + struct forwarding_base : no_assign { + forwarding_base(graph &g) : graph_ref(g) {} + virtual ~forwarding_base() {} + graph& graph_ref; + }; + + struct queueing_forwarding_base : forwarding_base { + using forwarding_base::forwarding_base; + // decrement_port_count may create a forwarding task. If we cannot handle the task + // ourselves, ask decrement_port_count to deal with it. + virtual graph_task* decrement_port_count(bool handle_task) = 0; + }; + + struct reserving_forwarding_base : forwarding_base { + using forwarding_base::forwarding_base; + // decrement_port_count may create a forwarding task. If we cannot handle the task + // ourselves, ask decrement_port_count to deal with it. + virtual graph_task* decrement_port_count() = 0; + virtual void increment_port_count() = 0; + }; + + // specialization that lets us keep a copy of the current_key for building results. + // KeyType can be a reference type. 
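// The forwarding_base hierarchy above backs the queueing and reserving join
// policies. A minimal sketch of the corresponding public API, assuming only
// <tbb/flow_graph.h>; the queueing policy shown is the join_node default, and
// all names and values are illustrative.
#include <tbb/flow_graph.h>
#include <iostream>
#include <tuple>

int main() {
    using namespace tbb::flow;
    graph g;
    join_node<std::tuple<int, float>, queueing> j(g);
    function_node<std::tuple<int, float>> consume(
        g, unlimited,
        [](const std::tuple<int, float>& t) {
            std::cout << std::get<0>(t) << " / " << std::get<1>(t) << '\n';
        });
    make_edge(j, consume);
    input_port<0>(j).try_put(1);       // waits in port 0's queue
    input_port<1>(j).try_put(2.5f);    // completes the tuple (1, 2.5f)
    g.wait_for_all();
    return 0;
}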
+ template + struct matching_forwarding_base : public forwarding_base { + typedef typename std::decay::type current_key_type; + matching_forwarding_base(graph &g) : forwarding_base(g) { } + virtual graph_task* increment_key_count(current_key_type const & /*t*/) = 0; + current_key_type current_key; // so ports can refer to FE's desired items + }; + + template< int N > + struct join_helper { + + template< typename TupleType, typename PortType > + static inline void set_join_node_pointer(TupleType &my_input, PortType *port) { + std::get( my_input ).set_join_node_pointer(port); + join_helper::set_join_node_pointer( my_input, port ); + } + template< typename TupleType > + static inline void consume_reservations( TupleType &my_input ) { + std::get( my_input ).consume(); + join_helper::consume_reservations( my_input ); + } + + template< typename TupleType > + static inline void release_my_reservation( TupleType &my_input ) { + std::get( my_input ).release(); + } + + template + static inline void release_reservations( TupleType &my_input) { + join_helper::release_reservations(my_input); + release_my_reservation(my_input); + } + + template< typename InputTuple, typename OutputTuple > + static inline bool reserve( InputTuple &my_input, OutputTuple &out) { + if ( !std::get( my_input ).reserve( std::get( out ) ) ) return false; + if ( !join_helper::reserve( my_input, out ) ) { + release_my_reservation( my_input ); + return false; + } + return true; + } + + template + static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { + bool res = std::get(my_input).get_item(std::get(out) ); // may fail + return join_helper::get_my_item(my_input, out) && res; // do get on other inputs before returning + } + + template + static inline bool get_items(InputTuple &my_input, OutputTuple &out) { + return get_my_item(my_input, out); + } + + template + static inline void reset_my_port(InputTuple &my_input) { + join_helper::reset_my_port(my_input); + std::get(my_input).reset_port(); + } + + template + static inline void reset_ports(InputTuple& my_input) { + reset_my_port(my_input); + } + + template + static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { + std::get(my_input).set_my_key_func(std::get(my_key_funcs)); + std::get(my_key_funcs) = nullptr; + join_helper::set_key_functors(my_input, my_key_funcs); + } + + template< typename KeyFuncTuple> + static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { + __TBB_ASSERT( + std::get(other_inputs).get_my_key_func(), + "key matching join node should not be instantiated without functors." 
+ ); + std::get(my_inputs).set_my_key_func(std::get(other_inputs).get_my_key_func()->clone()); + join_helper::copy_key_functors(my_inputs, other_inputs); + } + + template + static inline void reset_inputs(InputTuple &my_input, reset_flags f) { + join_helper::reset_inputs(my_input, f); + std::get(my_input).reset_receiver(f); + } + }; // join_helper + + template< > + struct join_helper<1> { + + template< typename TupleType, typename PortType > + static inline void set_join_node_pointer(TupleType &my_input, PortType *port) { + std::get<0>( my_input ).set_join_node_pointer(port); + } + + template< typename TupleType > + static inline void consume_reservations( TupleType &my_input ) { + std::get<0>( my_input ).consume(); + } + + template< typename TupleType > + static inline void release_my_reservation( TupleType &my_input ) { + std::get<0>( my_input ).release(); + } + + template + static inline void release_reservations( TupleType &my_input) { + release_my_reservation(my_input); + } + + template< typename InputTuple, typename OutputTuple > + static inline bool reserve( InputTuple &my_input, OutputTuple &out) { + return std::get<0>( my_input ).reserve( std::get<0>( out ) ); + } + + template + static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { + return std::get<0>(my_input).get_item(std::get<0>(out)); + } + + template + static inline bool get_items(InputTuple &my_input, OutputTuple &out) { + return get_my_item(my_input, out); + } + + template + static inline void reset_my_port(InputTuple &my_input) { + std::get<0>(my_input).reset_port(); + } + + template + static inline void reset_ports(InputTuple& my_input) { + reset_my_port(my_input); + } + + template + static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { + std::get<0>(my_input).set_my_key_func(std::get<0>(my_key_funcs)); + std::get<0>(my_key_funcs) = nullptr; + } + + template< typename KeyFuncTuple> + static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { + __TBB_ASSERT( + std::get<0>(other_inputs).get_my_key_func(), + "key matching join node should not be instantiated without functors." + ); + std::get<0>(my_inputs).set_my_key_func(std::get<0>(other_inputs).get_my_key_func()->clone()); + } + template + static inline void reset_inputs(InputTuple &my_input, reset_flags f) { + std::get<0>(my_input).reset_receiver(f); + } + }; // join_helper<1> + + //! 
The two-phase join port + template< typename T > + class reserving_port : public receiver { + public: + typedef T input_type; + typedef typename receiver::predecessor_type predecessor_type; + + private: + // ----------- Aggregator ------------ + enum op_type { reg_pred, rem_pred, res_item, rel_res, con_res + }; + typedef reserving_port class_type; + + class reserving_port_operation : public aggregated_operation { + public: + char type; + union { + T *my_arg; + predecessor_type *my_pred; + }; + reserving_port_operation(const T& e, op_type t) : + type(char(t)), my_arg(const_cast(&e)) {} + reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)), + my_pred(const_cast(&s)) {} + reserving_port_operation(op_type t) : type(char(t)) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(reserving_port_operation* op_list) { + reserving_port_operation *current; + bool was_missing_predecessors = false; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case reg_pred: + was_missing_predecessors = my_predecessors.empty(); + my_predecessors.add(*(current->my_pred)); + if ( was_missing_predecessors ) { + (void) my_join->decrement_port_count(); // may try to forward + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case rem_pred: + if ( !my_predecessors.empty() ) { + my_predecessors.remove(*(current->my_pred)); + if ( my_predecessors.empty() ) // was the last predecessor + my_join->increment_port_count(); + } + // TODO: consider returning failure if there were no predecessors to remove + current->status.store( SUCCEEDED, std::memory_order_release ); + break; + case res_item: + if ( reserved ) { + current->status.store( FAILED, std::memory_order_release); + } + else if ( my_predecessors.try_reserve( *(current->my_arg) ) ) { + reserved = true; + current->status.store( SUCCEEDED, std::memory_order_release); + } else { + if ( my_predecessors.empty() ) { + my_join->increment_port_count(); + } + current->status.store( FAILED, std::memory_order_release); + } + break; + case rel_res: + reserved = false; + my_predecessors.try_release( ); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case con_res: + reserved = false; + my_predecessors.try_consume( ); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + } + } + } + + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task( const T & ) override { + return nullptr; + } + + graph& graph_reference() const override { + return my_join->graph_ref; + } + + public: + + //! Constructor + reserving_port() : my_join(nullptr), my_predecessors(this), reserved(false) { + my_aggregator.initialize_handler(handler_type(this)); + } + + // copy constructor + reserving_port(const reserving_port& /* other */) = delete; + + void set_join_node_pointer(reserving_forwarding_base *join) { + my_join = join; + } + + //! Add a predecessor + bool register_predecessor( predecessor_type &src ) override { + reserving_port_operation op_data(src, reg_pred); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + //! 
Remove a predecessor + bool remove_predecessor( predecessor_type &src ) override { + reserving_port_operation op_data(src, rem_pred); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + //! Reserve an item from the port + bool reserve( T &v ) { + reserving_port_operation op_data(v, res_item); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + //! Release the port + void release( ) { + reserving_port_operation op_data(rel_res); + my_aggregator.execute(&op_data); + } + + //! Complete use of the port + void consume( ) { + reserving_port_operation op_data(con_res); + my_aggregator.execute(&op_data); + } + + void reset_receiver( reset_flags f) { + if(f & rf_clear_edges) my_predecessors.clear(); + else + my_predecessors.reset(); + reserved = false; + __TBB_ASSERT(!(f&rf_clear_edges) || my_predecessors.empty(), "port edges not removed"); + } + + private: +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + friend class get_graph_helper; +#endif + + reserving_forwarding_base *my_join; + reservable_predecessor_cache< T, null_mutex > my_predecessors; + bool reserved; + }; // reserving_port + + //! queueing join_port + template + class queueing_port : public receiver, public item_buffer { + public: + typedef T input_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef queueing_port class_type; + + // ----------- Aggregator ------------ + private: + enum op_type { get__item, res_port, try__put_task + }; + + class queueing_port_operation : public aggregated_operation { + public: + char type; + T my_val; + T* my_arg; + graph_task* bypass_t; + // constructor for value parameter + queueing_port_operation(const T& e, op_type t) : + type(char(t)), my_val(e), my_arg(nullptr) + , bypass_t(nullptr) + {} + // constructor for pointer parameter + queueing_port_operation(const T* p, op_type t) : + type(char(t)), my_arg(const_cast(p)) + , bypass_t(nullptr) + {} + // constructor with no parameter + queueing_port_operation(op_type t) : type(char(t)), my_arg(nullptr) + , bypass_t(nullptr) + {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(queueing_port_operation* op_list) { + queueing_port_operation *current; + bool was_empty; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case try__put_task: { + graph_task* rtask = nullptr; + was_empty = this->buffer_empty(); + this->push_back(current->my_val); + if (was_empty) rtask = my_join->decrement_port_count(false); + else + rtask = SUCCESSFULLY_ENQUEUED; + current->bypass_t = rtask; + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + case get__item: + if(!this->buffer_empty()) { + __TBB_ASSERT(current->my_arg, nullptr); + *(current->my_arg) = this->front(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + else { + current->status.store( FAILED, std::memory_order_release); + } + break; + case res_port: + __TBB_ASSERT(this->my_item_valid(this->my_head), "No item to reset"); + this->destroy_front(); + if(this->my_item_valid(this->my_head)) { + (void)my_join->decrement_port_count(true); + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + } + } + } + // ------------ End Aggregator --------------- + + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task(const 
T &v) override { + queueing_port_operation op_data(v, try__put_task); + my_aggregator.execute(&op_data); + __TBB_ASSERT(op_data.status == SUCCEEDED || !op_data.bypass_t, "inconsistent return from aggregator"); + if(!op_data.bypass_t) return SUCCESSFULLY_ENQUEUED; + return op_data.bypass_t; + } + + graph& graph_reference() const override { + return my_join->graph_ref; + } + + public: + + //! Constructor + queueing_port() : item_buffer() { + my_join = nullptr; + my_aggregator.initialize_handler(handler_type(this)); + } + + //! copy constructor + queueing_port(const queueing_port& /* other */) = delete; + + //! record parent for tallying available items + void set_join_node_pointer(queueing_forwarding_base *join) { + my_join = join; + } + + bool get_item( T &v ) { + queueing_port_operation op_data(&v, get__item); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // reset_port is called when item is accepted by successor, but + // is initiated by join_node. + void reset_port() { + queueing_port_operation op_data(res_port); + my_aggregator.execute(&op_data); + return; + } + + void reset_receiver(reset_flags) { + item_buffer::reset(); + } + + private: +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + friend class get_graph_helper; +#endif + + queueing_forwarding_base *my_join; + }; // queueing_port + +#include "third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h" + + template + struct count_element { + K my_key; + size_t my_value; + }; + + // method to access the key in the counting table + // the ref has already been removed from K + template< typename K > + struct key_to_count_functor { + typedef count_element table_item_type; + const K& operator()(const table_item_type& v) { return v.my_key; } + }; + + // the ports can have only one template parameter. 
We wrap the types needed in + // a traits type + template< class TraitsType > + class key_matching_port : + public receiver, + public hash_buffer< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, + typename TraitsType::KHash > { + public: + typedef TraitsType traits; + typedef key_matching_port class_type; + typedef typename TraitsType::T input_type; + typedef typename TraitsType::K key_type; + typedef typename std::decay::type noref_key_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename TraitsType::TtoK type_to_key_func_type; + typedef typename TraitsType::KHash hash_compare_type; + typedef hash_buffer< key_type, input_type, type_to_key_func_type, hash_compare_type > buffer_type; + + private: +// ----------- Aggregator ------------ + private: + enum op_type { try__put, get__item, res_port + }; + + class key_matching_port_operation : public aggregated_operation { + public: + char type; + input_type my_val; + input_type *my_arg; + // constructor for value parameter + key_matching_port_operation(const input_type& e, op_type t) : + type(char(t)), my_val(e), my_arg(nullptr) {} + // constructor for pointer parameter + key_matching_port_operation(const input_type* p, op_type t) : + type(char(t)), my_arg(const_cast(p)) {} + // constructor with no parameter + key_matching_port_operation(op_type t) : type(char(t)), my_arg(nullptr) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + void handle_operations(key_matching_port_operation* op_list) { + key_matching_port_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case try__put: { + bool was_inserted = this->insert_with_key(current->my_val); + // return failure if a duplicate insertion occurs + current->status.store( was_inserted ? 
SUCCEEDED : FAILED, std::memory_order_release); + } + break; + case get__item: + // use current_key from FE for item + __TBB_ASSERT(current->my_arg, nullptr); + if(!this->find_with_key(my_join->current_key, *(current->my_arg))) { + __TBB_ASSERT(false, "Failed to find item corresponding to current_key."); + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case res_port: + // use current_key from FE for item + this->delete_with_key(my_join->current_key); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + } + } + } +// ------------ End Aggregator --------------- + protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task(const input_type& v) override { + key_matching_port_operation op_data(v, try__put); + graph_task* rtask = nullptr; + my_aggregator.execute(&op_data); + if(op_data.status == SUCCEEDED) { + rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn + // rtask has to reflect the return status of the try_put + if(!rtask) rtask = SUCCESSFULLY_ENQUEUED; + } + return rtask; + } + + graph& graph_reference() const override { + return my_join->graph_ref; + } + + public: + + key_matching_port() : receiver(), buffer_type() { + my_join = nullptr; + my_aggregator.initialize_handler(handler_type(this)); + } + + // copy constructor + key_matching_port(const key_matching_port& /*other*/) = delete; +#if __INTEL_COMPILER <= 2021 + // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited + // class while the parent class has the virtual keyword for the destrocutor. + virtual +#endif + ~key_matching_port() { } + + void set_join_node_pointer(forwarding_base *join) { + my_join = dynamic_cast*>(join); + } + + void set_my_key_func(type_to_key_func_type *f) { this->set_key_func(f); } + + type_to_key_func_type* get_my_key_func() { return this->get_key_func(); } + + bool get_item( input_type &v ) { + // aggregator uses current_key from FE for Key + key_matching_port_operation op_data(&v, get__item); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // reset_port is called when item is accepted by successor, but + // is initiated by join_node. + void reset_port() { + key_matching_port_operation op_data(res_port); + my_aggregator.execute(&op_data); + return; + } + + void reset_receiver(reset_flags ) { + buffer_type::reset(); + } + + private: + // my_join forwarding base used to count number of inputs that + // received key. + matching_forwarding_base *my_join; + }; // key_matching_port + + using namespace graph_policy_namespace; + + template + class join_node_base; + + //! 
join_node_FE : implements input port policy + template + class join_node_FE; + + template + class join_node_FE : public reserving_forwarding_base { + private: + static const int N = std::tuple_size::value; + typedef OutputTuple output_type; + typedef InputTuple input_type; + typedef join_node_base base_node_type; // for forwarding + public: + join_node_FE(graph &g) : reserving_forwarding_base(g), my_node(nullptr) { + ports_with_no_inputs = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + join_node_FE(const join_node_FE& other) : reserving_forwarding_base((other.reserving_forwarding_base::graph_ref)), my_node(nullptr) { + ports_with_no_inputs = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } + + void increment_port_count() override { + ++ports_with_no_inputs; + } + + // if all input_ports have predecessors, spawn forward to try and consume tuples + graph_task* decrement_port_count() override { + if(ports_with_no_inputs.fetch_sub(1) == 1) { + if(is_graph_active(this->graph_ref)) { + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); + graph_ref.reserve_wait(); + spawn_in_graph_arena(this->graph_ref, *t); + } + } + return nullptr; + } + + input_type &input_ports() { return my_inputs; } + + protected: + + void reset( reset_flags f) { + // called outside of parallel contexts + ports_with_no_inputs = N; + join_helper::reset_inputs(my_inputs, f); + } + + // all methods on input ports should be called under mutual exclusion from join_node_base. + + bool tuple_build_may_succeed() { + return !ports_with_no_inputs; + } + + bool try_to_make_tuple(output_type &out) { + if(ports_with_no_inputs) return false; + return join_helper::reserve(my_inputs, out); + } + + void tuple_accepted() { + join_helper::consume_reservations(my_inputs); + } + void tuple_rejected() { + join_helper::release_reservations(my_inputs); + } + + input_type my_inputs; + base_node_type *my_node; + std::atomic ports_with_no_inputs; + }; // join_node_FE + + template + class join_node_FE : public queueing_forwarding_base { + public: + static const int N = std::tuple_size::value; + typedef OutputTuple output_type; + typedef InputTuple input_type; + typedef join_node_base base_node_type; // for forwarding + + join_node_FE(graph &g) : queueing_forwarding_base(g), my_node(nullptr) { + ports_with_no_items = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + join_node_FE(const join_node_FE& other) : queueing_forwarding_base((other.queueing_forwarding_base::graph_ref)), my_node(nullptr) { + ports_with_no_items = N; + join_helper::set_join_node_pointer(my_inputs, this); + } + + // needed for forwarding + void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } + + void reset_port_count() { + ports_with_no_items = N; + } + + // if all input_ports have items, spawn forward to try and consume tuples + graph_task* decrement_port_count(bool handle_task) override + { + if(ports_with_no_items.fetch_sub(1) == 1) { + if(is_graph_active(this->graph_ref)) { + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); + graph_ref.reserve_wait(); + if( !handle_task ) + return t; + spawn_in_graph_arena(this->graph_ref, *t); + } + } + return nullptr; + } + + input_type &input_ports() { return my_inputs; } + + protected: + + void reset( 
reset_flags f) { + reset_port_count(); + join_helper::reset_inputs(my_inputs, f ); + } + + // all methods on input ports should be called under mutual exclusion from join_node_base. + + bool tuple_build_may_succeed() { + return !ports_with_no_items; + } + + bool try_to_make_tuple(output_type &out) { + if(ports_with_no_items) return false; + return join_helper::get_items(my_inputs, out); + } + + void tuple_accepted() { + reset_port_count(); + join_helper::reset_ports(my_inputs); + } + void tuple_rejected() { + // nothing to do. + } + + input_type my_inputs; + base_node_type *my_node; + std::atomic ports_with_no_items; + }; // join_node_FE + + // key_matching join front-end. + template + class join_node_FE, InputTuple, OutputTuple> : public matching_forwarding_base, + // buffer of key value counts + public hash_buffer< // typedefed below to key_to_count_buffer_type + typename std::decay::type&, // force ref type on K + count_element::type>, + type_to_key_function_body< + count_element::type>, + typename std::decay::type& >, + KHash >, + // buffer of output items + public item_buffer { + public: + static const int N = std::tuple_size::value; + typedef OutputTuple output_type; + typedef InputTuple input_type; + typedef K key_type; + typedef typename std::decay::type unref_key_type; + typedef KHash key_hash_compare; + // must use K without ref. + typedef count_element count_element_type; + // method that lets us refer to the key of this type. + typedef key_to_count_functor key_to_count_func; + typedef type_to_key_function_body< count_element_type, unref_key_type&> TtoK_function_body_type; + typedef type_to_key_function_body_leaf TtoK_function_body_leaf_type; + // this is the type of the special table that keeps track of the number of discrete + // elements corresponding to each key that we've seen. + typedef hash_buffer< unref_key_type&, count_element_type, TtoK_function_body_type, key_hash_compare > + key_to_count_buffer_type; + typedef item_buffer output_buffer_type; + typedef join_node_base, InputTuple, OutputTuple> base_node_type; // for forwarding + typedef matching_forwarding_base forwarding_base_type; + +// ----------- Aggregator ------------ + // the aggregator is only needed to serialize the access to the hash table. + // and the output_buffer_type base class + private: + enum op_type { res_count, inc_count, may_succeed, try_make }; + typedef join_node_FE, InputTuple, OutputTuple> class_type; + + class key_matching_FE_operation : public aggregated_operation { + public: + char type; + unref_key_type my_val; + output_type* my_output; + graph_task* bypass_t; + // constructor for value parameter + key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e), + my_output(nullptr), bypass_t(nullptr) {} + key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {} + // constructor with no parameter + key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator my_aggregator; + + // called from aggregator, so serialized + // returns a task pointer if the a task would have been enqueued but we asked that + // it be returned. Otherwise returns nullptr. 
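// A standalone sketch, with assumed names and a plain std::map in place of the
// hash_buffer, of the key-counting protocol that increment_key_count() and
// fill_output_buffer() below implement: each port bumps a per-key counter, and
// once a key has arrived on every port the matched tuple is emitted.
#include <cassert>
#include <cstddef>
#include <map>
#include <optional>
#include <string>
#include <tuple>

struct key_match_sketch {
    static constexpr std::size_t N = 2;            // number of input ports
    std::map<int, std::size_t> seen;               // key -> how many ports delivered it
    std::map<int, int> port0;                      // per-port item storage by key
    std::map<int, std::string> port1;

    std::optional<std::tuple<int, std::string>> put0(int key, int v) {
        port0[key] = v;
        return bump(key);
    }
    std::optional<std::tuple<int, std::string>> put1(int key, std::string v) {
        port1[key] = std::move(v);
        return bump(key);
    }
    std::optional<std::tuple<int, std::string>> bump(int key) {
        if (++seen[key] < N) return std::nullopt;  // some port is still missing this key
        std::tuple<int, std::string> out{port0[key], port1[key]};
        seen.erase(key); port0.erase(key); port1.erase(key);
        return out;
    }
};

int main() {
    key_match_sketch j;
    assert(!j.put0(7, 42));                 // only one port has key 7 so far
    auto out = j.put1(7, "forty-two");      // second port completes the match
    assert(out && std::get<0>(*out) == 42 && std::get<1>(*out) == "forty-two");
    return 0;
}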
+ graph_task* fill_output_buffer(unref_key_type &t) { + output_type l_out; + graph_task* rtask = nullptr; + bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref); + this->current_key = t; + this->delete_with_key(this->current_key); // remove the key + if(join_helper::get_items(my_inputs, l_out)) { // <== call back + this->push_back(l_out); + if(do_fwd) { // we enqueue if receiving an item from predecessor, not if successor asks for item + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + rtask = allocator.new_object(this->graph_ref, allocator, *my_node); + this->graph_ref.reserve_wait(); + do_fwd = false; + } + // retire the input values + join_helper::reset_ports(my_inputs); // <== call back + } + else { + __TBB_ASSERT(false, "should have had something to push"); + } + return rtask; + } + + void handle_operations(key_matching_FE_operation* op_list) { + key_matching_FE_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case res_count: // called from BE + { + this->destroy_front(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + case inc_count: { // called from input ports + count_element_type *p = nullptr; + unref_key_type &t = current->my_val; + if(!(this->find_ref_with_key(t,p))) { + count_element_type ev; + ev.my_key = t; + ev.my_value = 0; + this->insert_with_key(ev); + bool found = this->find_ref_with_key(t, p); + __TBB_ASSERT_EX(found, "should find key after inserting it"); + } + if(++(p->my_value) == size_t(N)) { + current->bypass_t = fill_output_buffer(t); + } + } + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case may_succeed: // called from BE + current->status.store( this->buffer_empty() ? FAILED : SUCCEEDED, std::memory_order_release); + break; + case try_make: // called from BE + if(this->buffer_empty()) { + current->status.store( FAILED, std::memory_order_release); + } + else { + *(current->my_output) = this->front(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + } + } + } +// ------------ End Aggregator --------------- + + public: + template + join_node_FE(graph &g, FunctionTuple &TtoK_funcs) : forwarding_base_type(g), my_node(nullptr) { + join_helper::set_join_node_pointer(my_inputs, this); + join_helper::set_key_functors(my_inputs, TtoK_funcs); + my_aggregator.initialize_handler(handler_type(this)); + TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func()); + this->set_key_func(cfb); + } + + join_node_FE(const join_node_FE& other) : forwarding_base_type((other.forwarding_base_type::graph_ref)), key_to_count_buffer_type(), + output_buffer_type() { + my_node = nullptr; + join_helper::set_join_node_pointer(my_inputs, this); + join_helper::copy_key_functors(my_inputs, const_cast(other.my_inputs)); + my_aggregator.initialize_handler(handler_type(this)); + TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func()); + this->set_key_func(cfb); + } + + // needed for forwarding + void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } + + void reset_port_count() { // called from BE + key_matching_FE_operation op_data(res_count); + my_aggregator.execute(&op_data); + return; + } + + // if all input_ports have items, spawn forward to try and consume tuples + // return a task if we are asked and did create one. 
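// The key_matching front end above and below is what the public
// join_node<Tuple, key_matching<K>> constructor with per-port key functions
// ultimately drives. A minimal usage sketch, assuming only <tbb/flow_graph.h>;
// the order/payment types, the key lambdas and the values are illustrative.
#include <tbb/flow_graph.h>
#include <iostream>
#include <string>
#include <tuple>

struct order   { int id; std::string item; };
struct payment { int id; double amount; };

int main() {
    using namespace tbb::flow;
    graph g;
    // Tuples are assembled from messages whose key functions return equal ints,
    // regardless of arrival order.
    join_node<std::tuple<order, payment>, key_matching<int>> j(
        g,
        [](const order& o)   { return o.id; },
        [](const payment& p) { return p.id; });
    function_node<std::tuple<order, payment>> report(
        g, unlimited,
        [](const std::tuple<order, payment>& t) {
            std::cout << std::get<0>(t).item << " paid "
                      << std::get<1>(t).amount << '\n';
        });
    make_edge(j, report);
    input_port<0>(j).try_put(order{1, "book"});
    input_port<1>(j).try_put(payment{1, 9.99});
    g.wait_for_all();
    return 0;
}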
+ graph_task *increment_key_count(unref_key_type const & t) override { // called from input_ports + key_matching_FE_operation op_data(t, inc_count); + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + + input_type &input_ports() { return my_inputs; } + + protected: + + void reset( reset_flags f ) { + // called outside of parallel contexts + join_helper::reset_inputs(my_inputs, f); + + key_to_count_buffer_type::reset(); + output_buffer_type::reset(); + } + + // all methods on input ports should be called under mutual exclusion from join_node_base. + + bool tuple_build_may_succeed() { // called from back-end + key_matching_FE_operation op_data(may_succeed); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + // cannot lock while calling back to input_ports. current_key will only be set + // and reset under the aggregator, so it will remain consistent. + bool try_to_make_tuple(output_type &out) { + key_matching_FE_operation op_data(&out,try_make); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + void tuple_accepted() { + reset_port_count(); // reset current_key after ports reset. + } + + void tuple_rejected() { + // nothing to do. + } + + input_type my_inputs; // input ports + base_node_type *my_node; + }; // join_node_FE, InputTuple, OutputTuple> + + //! join_node_base + template + class join_node_base : public graph_node, public join_node_FE, + public sender { + protected: + using graph_node::my_graph; + public: + typedef OutputTuple output_type; + + typedef typename sender::successor_type successor_type; + typedef join_node_FE input_ports_type; + using input_ports_type::tuple_build_may_succeed; + using input_ports_type::try_to_make_tuple; + using input_ports_type::tuple_accepted; + using input_ports_type::tuple_rejected; + + private: + // ----------- Aggregator ------------ + enum op_type { reg_succ, rem_succ, try__get, do_fwrd, do_fwrd_bypass + }; + typedef join_node_base class_type; + + class join_node_base_operation : public aggregated_operation { + public: + char type; + union { + output_type *my_arg; + successor_type *my_succ; + }; + graph_task* bypass_t; + join_node_base_operation(const output_type& e, op_type t) : type(char(t)), + my_arg(const_cast(&e)), bypass_t(nullptr) {} + join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), + my_succ(const_cast(&s)), bypass_t(nullptr) {} + join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {} + }; + + typedef aggregating_functor handler_type; + friend class aggregating_functor; + bool forwarder_busy; + aggregator my_aggregator; + + void handle_operations(join_node_base_operation* op_list) { + join_node_base_operation *current; + while(op_list) { + current = op_list; + op_list = op_list->next; + switch(current->type) { + case reg_succ: { + my_successors.register_successor(*(current->my_succ)); + if(tuple_build_may_succeed() && !forwarder_busy && is_graph_active(my_graph)) { + small_object_allocator allocator{}; + typedef forward_task_bypass< join_node_base > task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + spawn_in_graph_arena(my_graph, *t); + forwarder_busy = true; + } + current->status.store( SUCCEEDED, std::memory_order_release); + } + break; + case rem_succ: + my_successors.remove_successor(*(current->my_succ)); + current->status.store( SUCCEEDED, std::memory_order_release); + break; + case try__get: + if(tuple_build_may_succeed()) { + 
if(try_to_make_tuple(*(current->my_arg))) { + tuple_accepted(); + current->status.store( SUCCEEDED, std::memory_order_release); + } + else current->status.store( FAILED, std::memory_order_release); + } + else current->status.store( FAILED, std::memory_order_release); + break; + case do_fwrd_bypass: { + bool build_succeeded; + graph_task *last_task = nullptr; + output_type out; + // forwarding must be exclusive, because try_to_make_tuple and tuple_accepted + // are separate locked methods in the FE. We could conceivably fetch the front + // of the FE queue, then be swapped out, have someone else consume the FE's + // object, then come back, forward, and then try to remove it from the queue + // again. Without reservation of the FE, the methods accessing it must be locked. + // We could remember the keys of the objects we forwarded, and then remove + // them from the input ports after forwarding is complete? + if(tuple_build_may_succeed()) { // checks output queue of FE + do { + build_succeeded = try_to_make_tuple(out); // fetch front_end of queue + if(build_succeeded) { + graph_task *new_task = my_successors.try_put_task(out); + last_task = combine_tasks(my_graph, last_task, new_task); + if(new_task) { + tuple_accepted(); + } + else { + tuple_rejected(); + build_succeeded = false; + } + } + } while(build_succeeded); + } + current->bypass_t = last_task; + current->status.store( SUCCEEDED, std::memory_order_release); + forwarder_busy = false; + } + break; + } + } + } + // ---------- end aggregator ----------- + public: + join_node_base(graph &g) + : graph_node(g), input_ports_type(g), forwarder_busy(false), my_successors(this) + { + input_ports_type::set_my_node(this); + my_aggregator.initialize_handler(handler_type(this)); + } + + join_node_base(const join_node_base& other) : + graph_node(other.graph_node::my_graph), input_ports_type(other), + sender(), forwarder_busy(false), my_successors(this) + { + input_ports_type::set_my_node(this); + my_aggregator.initialize_handler(handler_type(this)); + } + + template + join_node_base(graph &g, FunctionTuple f) + : graph_node(g), input_ports_type(g, f), forwarder_busy(false), my_successors(this) + { + input_ports_type::set_my_node(this); + my_aggregator.initialize_handler(handler_type(this)); + } + + bool register_successor(successor_type &r) override { + join_node_base_operation op_data(r, reg_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + bool remove_successor( successor_type &r) override { + join_node_base_operation op_data(r, rem_succ); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + bool try_get( output_type &v) override { + join_node_base_operation op_data(v, try__get); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } + + protected: + void reset_node(reset_flags f) override { + input_ports_type::reset(f); + if(f & rf_clear_edges) my_successors.clear(); + } + + private: + broadcast_cache my_successors; + + friend class forward_task_bypass< join_node_base >; + graph_task *forward_task() { + join_node_base_operation op_data(do_fwrd_bypass); + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + + }; // join_node_base + + // join base class type generator + template class PT, typename OutputTuple, typename JP> + struct join_base { + typedef join_node_base::type, OutputTuple> type; + }; + + template + struct join_base > { + typedef key_matching key_traits_type; + typedef K key_type; + typedef KHash key_hash_compare; + typedef 
join_node_base< key_traits_type, + // ports type + typename wrap_key_tuple_elements::type, + OutputTuple > type; + }; + + //! unfolded_join_node : passes input_ports_type to join_node_base. We build the input port type + // using tuple_element. The class PT is the port type (reserving_port, queueing_port, key_matching_port) + // and should match the typename. + + template class PT, typename OutputTuple, typename JP> + class unfolded_join_node : public join_base::type { + public: + typedef typename wrap_tuple_elements::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base base_type; + public: + unfolded_join_node(graph &g) : base_type(g) {} + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + template + struct key_from_message_body { + K operator()(const T& t) const { + return key_from_message(t); + } + }; + // Adds const to reference type + template + struct key_from_message_body { + const K& operator()(const T& t) const { + return key_from_message(t); + } + }; +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + // key_matching unfolded_join_node. This must be a separate specialization because the constructors + // differ. + + template + class unfolded_join_node<2,key_matching_port,OutputTuple,key_matching > : public + join_base<2,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + public: + typedef typename wrap_key_tuple_elements<2,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base, input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef std::tuple< f0_p, f1_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1) + ) ) { + static_assert(std::tuple_size::value == 2, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + + template + class unfolded_join_node<3,key_matching_port,OutputTuple,key_matching > : public + join_base<3,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + public: + typedef typename wrap_key_tuple_elements<3,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base, input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef std::tuple< f0_p, f1_p, f2_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new 
type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2) + ) ) { + static_assert(std::tuple_size::value == 3, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + + template + class unfolded_join_node<4,key_matching_port,OutputTuple,key_matching > : public + join_base<4,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + public: + typedef typename wrap_key_tuple_elements<4,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base, input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3) + ) ) { + static_assert(std::tuple_size::value == 4, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + + template + class unfolded_join_node<5,key_matching_port,OutputTuple,key_matching > : public + join_base<5,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + public: + typedef typename wrap_key_tuple_elements<5,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p > func_initializer_type; + public: +#if 
__TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4) + ) ) { + static_assert(std::tuple_size::value == 5, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; + +#if __TBB_VARIADIC_MAX >= 6 + template + class unfolded_join_node<6,key_matching_port,OutputTuple,key_matching > : public + join_base<6,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + public: + typedef typename wrap_key_tuple_elements<6,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5) + : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5) + ) ) { + static_assert(std::tuple_size::value == 6, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 7 + template + class unfolded_join_node<7,key_matching_port,OutputTuple,key_matching > : public + 
join_base<7,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + public: + typedef typename wrap_key_tuple_elements<7,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6) + ) ) { + static_assert(std::tuple_size::value == 7, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 8 + template + class unfolded_join_node<8,key_matching_port,OutputTuple,key_matching > : public + join_base<8,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + typedef typename std::tuple_element<7, OutputTuple>::type T7; + public: + typedef typename wrap_key_tuple_elements<8,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef 
type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef type_to_key_function_body *f7_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6, Body7 body7) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6), + new type_to_key_function_body_leaf(body7) + ) ) { + static_assert(std::tuple_size::value == 8, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 9 + template + class unfolded_join_node<9,key_matching_port,OutputTuple,key_matching > : public + join_base<9,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + typedef typename std::tuple_element<7, OutputTuple>::type T7; + typedef typename std::tuple_element<8, OutputTuple>::type T8; + public: + typedef typename wrap_key_tuple_elements<9,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef type_to_key_function_body *f7_p; + typedef type_to_key_function_body *f8_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf 
>(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6, Body7 body7, Body8 body8) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6), + new type_to_key_function_body_leaf(body7), + new type_to_key_function_body_leaf(body8) + ) ) { + static_assert(std::tuple_size::value == 9, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + +#if __TBB_VARIADIC_MAX >= 10 + template + class unfolded_join_node<10,key_matching_port,OutputTuple,key_matching > : public + join_base<10,key_matching_port,OutputTuple,key_matching >::type { + typedef typename std::tuple_element<0, OutputTuple>::type T0; + typedef typename std::tuple_element<1, OutputTuple>::type T1; + typedef typename std::tuple_element<2, OutputTuple>::type T2; + typedef typename std::tuple_element<3, OutputTuple>::type T3; + typedef typename std::tuple_element<4, OutputTuple>::type T4; + typedef typename std::tuple_element<5, OutputTuple>::type T5; + typedef typename std::tuple_element<6, OutputTuple>::type T6; + typedef typename std::tuple_element<7, OutputTuple>::type T7; + typedef typename std::tuple_element<8, OutputTuple>::type T8; + typedef typename std::tuple_element<9, OutputTuple>::type T9; + public: + typedef typename wrap_key_tuple_elements<10,key_matching_port,key_matching,OutputTuple>::type input_ports_type; + typedef OutputTuple output_type; + private: + typedef join_node_base , input_ports_type, output_type > base_type; + typedef type_to_key_function_body *f0_p; + typedef type_to_key_function_body *f1_p; + typedef type_to_key_function_body *f2_p; + typedef type_to_key_function_body *f3_p; + typedef type_to_key_function_body *f4_p; + typedef type_to_key_function_body *f5_p; + typedef type_to_key_function_body *f6_p; + typedef type_to_key_function_body *f7_p; + typedef type_to_key_function_body *f8_p; + typedef type_to_key_function_body *f9_p; + typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p, f9_p > func_initializer_type; + public: +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + unfolded_join_node(graph &g) : base_type(g, + func_initializer_type( + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()), + new 
type_to_key_function_body_leaf >(key_from_message_body()), + new type_to_key_function_body_leaf >(key_from_message_body()) + ) ) { + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + template + unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, + Body5 body5, Body6 body6, Body7 body7, Body8 body8, Body9 body9) : base_type(g, func_initializer_type( + new type_to_key_function_body_leaf(body0), + new type_to_key_function_body_leaf(body1), + new type_to_key_function_body_leaf(body2), + new type_to_key_function_body_leaf(body3), + new type_to_key_function_body_leaf(body4), + new type_to_key_function_body_leaf(body5), + new type_to_key_function_body_leaf(body6), + new type_to_key_function_body_leaf(body7), + new type_to_key_function_body_leaf(body8), + new type_to_key_function_body_leaf(body9) + ) ) { + static_assert(std::tuple_size::value == 10, "wrong number of body initializers"); + } + unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} + }; +#endif + + //! templated function to refer to input ports of the join node + template + typename std::tuple_element::type &input_port(JNT &jn) { + return std::get(jn.input_ports()); + } + +#endif // __TBB__flow_graph_join_impl_H diff --git a/third_party/tbb/detail/_flow_graph_node_impl.h b/third_party/tbb/detail/_flow_graph_node_impl.h new file mode 100644 index 000000000..20a3741d0 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_node_impl.h @@ -0,0 +1,775 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_node_impl_H +#define __TBB__flow_graph_node_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +#include "third_party/tbb/detail/_flow_graph_item_buffer_impl.h" + +template< typename T, typename A > +class function_input_queue : public item_buffer { +public: + bool empty() const { + return this->buffer_empty(); + } + + const T& front() const { + return this->item_buffer::front(); + } + + void pop() { + this->destroy_front(); + } + + bool push( T& t ) { + return this->push_back( t ); + } +}; + +//! Input and scheduling for a function node that takes a type Input as input +// The only up-ref is apply_body_impl, which should implement the function +// call and any handling of the result. +template< typename Input, typename Policy, typename A, typename ImplType > +class function_input_base : public receiver, no_assign { + enum op_type {reg_pred, rem_pred, try_fwd, tryput_bypass, app_body_bypass, occupy_concurrency + }; + typedef function_input_base class_type; + +public: + + //! 
The input type of this receiver + typedef Input input_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef predecessor_cache predecessor_cache_type; + typedef function_input_queue input_queue_type; + typedef typename allocator_traits::template rebind_alloc allocator_type; + static_assert(!has_policy::value || !has_policy::value, ""); + + //! Constructor for function_input_base + function_input_base( graph &g, size_t max_concurrency, node_priority_t a_priority, bool is_no_throw ) + : my_graph_ref(g), my_max_concurrency(max_concurrency) + , my_concurrency(0), my_priority(a_priority), my_is_no_throw(is_no_throw) + , my_queue(!has_policy::value ? new input_queue_type() : nullptr) + , my_predecessors(this) + , forwarder_busy(false) + { + my_aggregator.initialize_handler(handler_type(this)); + } + + //! Copy constructor + function_input_base( const function_input_base& src ) + : function_input_base(src.my_graph_ref, src.my_max_concurrency, src.my_priority, src.my_is_no_throw) {} + + //! Destructor + // The queue is allocated by the constructor for {multi}function_node. + // TODO: pass the graph_buffer_policy to the base so it can allocate the queue instead. + // This would be an interface-breaking change. + virtual ~function_input_base() { + delete my_queue; + my_queue = nullptr; + } + + graph_task* try_put_task( const input_type& t) override { + if ( my_is_no_throw ) + return try_put_task_impl(t, has_policy()); + else + return try_put_task_impl(t, std::false_type()); + } + + //! Adds src to the list of cached predecessors. + bool register_predecessor( predecessor_type &src ) override { + operation_type op_data(reg_pred); + op_data.r = &src; + my_aggregator.execute(&op_data); + return true; + } + + //! Removes src from the list of cached predecessors. 
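// (Illustrative sketch, not from the TBB sources.) register_predecessor() above
// and remove_predecessor() below never touch the predecessor cache directly:
// each call packages a reg_pred / rem_pred operation and hands it to
// my_aggregator, and whichever thread currently owns the pending-operation list
// applies the whole batch in handle_operations() further down. The standalone
// toy below models that protocol under invented names (mini_op,
// mini_aggregator) and omits TBB's waiting/backoff machinery.

#include <atomic>
#include <thread>

struct mini_op {
    std::atomic<int> status{0};   // 0 = pending, 1 = handled
    mini_op* next = nullptr;
    int kind = 0;                 // stands in for reg_pred, rem_pred, ...
};

class mini_aggregator {
    std::atomic<mini_op*> pending_{nullptr};
    std::atomic<bool> handler_busy_{false};
public:
    template <typename Handler>
    void execute(mini_op* op, Handler handle) {
        // Publish the operation on a lock-free LIFO list.
        mini_op* old_head = pending_.load(std::memory_order_relaxed);
        do {
            op->next = old_head;
        } while (!pending_.compare_exchange_weak(old_head, op,
                                                 std::memory_order_release,
                                                 std::memory_order_relaxed));
        if (old_head == nullptr) {
            // The list was empty, so this thread becomes the handler; wait for
            // any previous handler to retire so batches stay serialized.
            while (handler_busy_.exchange(true, std::memory_order_acquire))
                std::this_thread::yield();
            mini_op* batch = pending_.exchange(nullptr, std::memory_order_acquire);
            while (batch) {
                mini_op* next = batch->next;   // read next before publishing status
                handle(batch);                 // runs with exclusive access to shared state
                batch->status.store(1, std::memory_order_release);
                batch = next;
            }
            handler_busy_.store(false, std::memory_order_release);
        } else {
            // Another thread owns the batch; just wait for our operation.
            while (op->status.load(std::memory_order_acquire) == 0)
                std::this_thread::yield();
        }
    }
};

// A caller would do something like:
//     mini_op op; op.kind = 1;
//     agg.execute(&op, [](mini_op* o) { /* mutate the shared structure */ });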
+ bool remove_predecessor( predecessor_type &src ) override { + operation_type op_data(rem_pred); + op_data.r = &src; + my_aggregator.execute(&op_data); + return true; + } + +protected: + + void reset_function_input_base( reset_flags f) { + my_concurrency = 0; + if(my_queue) { + my_queue->reset(); + } + reset_receiver(f); + forwarder_busy = false; + } + + graph& my_graph_ref; + const size_t my_max_concurrency; + size_t my_concurrency; + node_priority_t my_priority; + const bool my_is_no_throw; + input_queue_type *my_queue; + predecessor_cache my_predecessors; + + void reset_receiver( reset_flags f) { + if( f & rf_clear_edges) my_predecessors.clear(); + else + my_predecessors.reset(); + __TBB_ASSERT(!(f & rf_clear_edges) || my_predecessors.empty(), "function_input_base reset failed"); + } + + graph& graph_reference() const override { + return my_graph_ref; + } + + graph_task* try_get_postponed_task(const input_type& i) { + operation_type op_data(i, app_body_bypass); // tries to pop an item or get_item + my_aggregator.execute(&op_data); + return op_data.bypass_t; + } + +private: + + friend class apply_body_task_bypass< class_type, input_type >; + friend class forward_task_bypass< class_type >; + + class operation_type : public aggregated_operation< operation_type > { + public: + char type; + union { + input_type *elem; + predecessor_type *r; + }; + graph_task* bypass_t; + operation_type(const input_type& e, op_type t) : + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) {} + operation_type(op_type t) : type(char(t)), r(nullptr), bypass_t(nullptr) {} + }; + + bool forwarder_busy; + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator< handler_type, operation_type > my_aggregator; + + graph_task* perform_queued_requests() { + graph_task* new_task = nullptr; + if(my_queue) { + if(!my_queue->empty()) { + ++my_concurrency; + new_task = create_body_task(my_queue->front()); + + my_queue->pop(); + } + } + else { + input_type i; + if(my_predecessors.get_item(i)) { + ++my_concurrency; + new_task = create_body_task(i); + } + } + return new_task; + } + void handle_operations(operation_type *op_list) { + operation_type* tmp; + while (op_list) { + tmp = op_list; + op_list = op_list->next; + switch (tmp->type) { + case reg_pred: + my_predecessors.add(*(tmp->r)); + tmp->status.store(SUCCEEDED, std::memory_order_release); + if (!forwarder_busy) { + forwarder_busy = true; + spawn_forward_task(); + } + break; + case rem_pred: + my_predecessors.remove(*(tmp->r)); + tmp->status.store(SUCCEEDED, std::memory_order_release); + break; + case app_body_bypass: { + tmp->bypass_t = nullptr; + __TBB_ASSERT(my_max_concurrency != 0, nullptr); + --my_concurrency; + if(my_concurrencybypass_t = perform_queued_requests(); + tmp->status.store(SUCCEEDED, std::memory_order_release); + } + break; + case tryput_bypass: internal_try_put_task(tmp); break; + case try_fwd: internal_forward(tmp); break; + case occupy_concurrency: + if (my_concurrency < my_max_concurrency) { + ++my_concurrency; + tmp->status.store(SUCCEEDED, std::memory_order_release); + } else { + tmp->status.store(FAILED, std::memory_order_release); + } + break; + } + } + } + + //! 
Put to the node, but return the task instead of enqueueing it + void internal_try_put_task(operation_type *op) { + __TBB_ASSERT(my_max_concurrency != 0, nullptr); + if (my_concurrency < my_max_concurrency) { + ++my_concurrency; + graph_task * new_task = create_body_task(*(op->elem)); + op->bypass_t = new_task; + op->status.store(SUCCEEDED, std::memory_order_release); + } else if ( my_queue && my_queue->push(*(op->elem)) ) { + op->bypass_t = SUCCESSFULLY_ENQUEUED; + op->status.store(SUCCEEDED, std::memory_order_release); + } else { + op->bypass_t = nullptr; + op->status.store(FAILED, std::memory_order_release); + } + } + + //! Creates tasks for postponed messages if available and if concurrency allows + void internal_forward(operation_type *op) { + op->bypass_t = nullptr; + if (my_concurrency < my_max_concurrency) + op->bypass_t = perform_queued_requests(); + if(op->bypass_t) + op->status.store(SUCCEEDED, std::memory_order_release); + else { + forwarder_busy = false; + op->status.store(FAILED, std::memory_order_release); + } + } + + graph_task* internal_try_put_bypass( const input_type& t ) { + operation_type op_data(t, tryput_bypass); + my_aggregator.execute(&op_data); + if( op_data.status == SUCCEEDED ) { + return op_data.bypass_t; + } + return nullptr; + } + + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type ) { + if( my_max_concurrency == 0 ) { + return apply_body_bypass(t); + } else { + operation_type check_op(t, occupy_concurrency); + my_aggregator.execute(&check_op); + if( check_op.status == SUCCEEDED ) { + return apply_body_bypass(t); + } + return internal_try_put_bypass(t); + } + } + + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type ) { + if( my_max_concurrency == 0 ) { + return create_body_task(t); + } else { + return internal_try_put_bypass(t); + } + } + + //! Applies the body to the provided input + // then decides if more work is available + graph_task* apply_body_bypass( const input_type &i ) { + return static_cast(this)->apply_body_impl_bypass(i); + } + + //! allocates a task to apply a body + graph_task* create_body_task( const input_type &input ) { + if (!is_graph_active(my_graph_ref)) { + return nullptr; + } + // TODO revamp: extract helper for common graph task allocation part + small_object_allocator allocator{}; + typedef apply_body_task_bypass task_type; + graph_task* t = allocator.new_object( my_graph_ref, allocator, *this, input, my_priority ); + graph_reference().reserve_wait(); + return t; + } + + //! This is executed by an enqueued task, the "forwarder" + graph_task* forward_task() { + operation_type op_data(try_fwd); + graph_task* rval = nullptr; + do { + op_data.status = WAIT; + my_aggregator.execute(&op_data); + if(op_data.status == SUCCEEDED) { + graph_task* ttask = op_data.bypass_t; + __TBB_ASSERT( ttask && ttask != SUCCESSFULLY_ENQUEUED, nullptr); + rval = combine_tasks(my_graph_ref, rval, ttask); + } + } while (op_data.status == SUCCEEDED); + return rval; + } + + inline graph_task* create_forward_task() { + if (!is_graph_active(my_graph_ref)) { + return nullptr; + } + small_object_allocator allocator{}; + typedef forward_task_bypass task_type; + graph_task* t = allocator.new_object( graph_reference(), allocator, *this, my_priority ); + graph_reference().reserve_wait(); + return t; + } + + //! 
Spawns a task that calls forward() + inline void spawn_forward_task() { + graph_task* tp = create_forward_task(); + if(tp) { + spawn_in_graph_arena(graph_reference(), *tp); + } + } + + node_priority_t priority() const override { return my_priority; } +}; // function_input_base + +//! Implements methods for a function node that takes a type Input as input and sends +// a type Output to its successors. +template< typename Input, typename Output, typename Policy, typename A> +class function_input : public function_input_base > { +public: + typedef Input input_type; + typedef Output output_type; + typedef function_body function_body_type; + typedef function_input my_class; + typedef function_input_base base_type; + typedef function_input_queue input_queue_type; + + // constructor + template + function_input( + graph &g, size_t max_concurrency, Body& body, node_priority_t a_priority ) + : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type()))) + , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) + , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) { + } + + //! Copy constructor + function_input( const function_input& src ) : + base_type(src), + my_body( src.my_init_body->clone() ), + my_init_body(src.my_init_body->clone() ) { + } +#if __INTEL_COMPILER <= 2021 + // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited + // class while the parent class has the virtual keyword for the destrocutor. + virtual +#endif + ~function_input() { + delete my_body; + delete my_init_body; + } + + template< typename Body > + Body copy_function_object() { + function_body_type &body_ref = *this->my_body; + return dynamic_cast< function_body_leaf & >(body_ref).get_body(); + } + + output_type apply_body_impl( const input_type& i) { + // There is an extra copied needed to capture the + // body execution without the try_put + fgt_begin_body( my_body ); + output_type v = tbb::detail::invoke(*my_body, i); + fgt_end_body( my_body ); + return v; + } + + //TODO: consider moving into the base class + graph_task* apply_body_impl_bypass( const input_type &i) { + output_type v = apply_body_impl(i); + graph_task* postponed_task = nullptr; + if( base_type::my_max_concurrency != 0 ) { + postponed_task = base_type::try_get_postponed_task(i); + __TBB_ASSERT( !postponed_task || postponed_task != SUCCESSFULLY_ENQUEUED, nullptr); + } + if( postponed_task ) { + // make the task available for other workers since we do not know successors' + // execution policy + spawn_in_graph_arena(base_type::graph_reference(), *postponed_task); + } + graph_task* successor_task = successors().try_put_task(v); +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (push) +#pragma warning (disable: 4127) /* suppress conditional expression is constant */ +#endif + if(has_policy::value) { +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (pop) +#endif + if(!successor_task) { + // Return confirmative status since current + // node's body has been executed anyway + successor_task = SUCCESSFULLY_ENQUEUED; + } + } + return successor_task; + } + +protected: + + void reset_function_input(reset_flags f) { + base_type::reset_function_input_base(f); + if(f & rf_reset_bodies) { + function_body_type *tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + } + + function_body_type *my_body; + function_body_type *my_init_body; + virtual broadcast_cache &successors() = 0; + +}; // function_input + + +// helper templates to clear 
the successor edges of the output ports of an multifunction_node +template struct clear_element { + template static void clear_this(P &p) { + (void)std::get(p).successors().clear(); + clear_element::clear_this(p); + } +#if TBB_USE_ASSERT + template static bool this_empty(P &p) { + if(std::get(p).successors().empty()) + return clear_element::this_empty(p); + return false; + } +#endif +}; + +template<> struct clear_element<1> { + template static void clear_this(P &p) { + (void)std::get<0>(p).successors().clear(); + } +#if TBB_USE_ASSERT + template static bool this_empty(P &p) { + return std::get<0>(p).successors().empty(); + } +#endif +}; + +template +struct init_output_ports { + template + static OutputTuple call(graph& g, const std::tuple&) { + return OutputTuple(Args(g)...); + } +}; // struct init_output_ports + +//! Implements methods for a function node that takes a type Input as input +// and has a tuple of output ports specified. +template< typename Input, typename OutputPortSet, typename Policy, typename A> +class multifunction_input : public function_input_base > { +public: + static const int N = std::tuple_size::value; + typedef Input input_type; + typedef OutputPortSet output_ports_type; + typedef multifunction_body multifunction_body_type; + typedef multifunction_input my_class; + typedef function_input_base base_type; + typedef function_input_queue input_queue_type; + + // constructor + template + multifunction_input(graph &g, size_t max_concurrency,Body& body, node_priority_t a_priority ) + : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type(), my_output_ports))) + , my_body( new multifunction_body_leaf(body) ) + , my_init_body( new multifunction_body_leaf(body) ) + , my_output_ports(init_output_ports::call(g, my_output_ports)){ + } + + //! Copy constructor + multifunction_input( const multifunction_input& src ) : + base_type(src), + my_body( src.my_init_body->clone() ), + my_init_body(src.my_init_body->clone() ), + my_output_ports( init_output_ports::call(src.my_graph_ref, my_output_ports) ) { + } + + ~multifunction_input() { + delete my_body; + delete my_init_body; + } + + template< typename Body > + Body copy_function_object() { + multifunction_body_type &body_ref = *this->my_body; + return *static_cast(dynamic_cast< multifunction_body_leaf & >(body_ref).get_body_ptr()); + } + + // for multifunction nodes we do not have a single successor as such. So we just tell + // the task we were successful. + //TODO: consider moving common parts with implementation in function_input into separate function + graph_task* apply_body_impl_bypass( const input_type &i ) { + fgt_begin_body( my_body ); + (*my_body)(i, my_output_ports); + fgt_end_body( my_body ); + graph_task* ttask = nullptr; + if(base_type::my_max_concurrency != 0) { + ttask = base_type::try_get_postponed_task(i); + } + return ttask ? 
ttask : SUCCESSFULLY_ENQUEUED; + } + + output_ports_type &output_ports(){ return my_output_ports; } + +protected: + + void reset(reset_flags f) { + base_type::reset_function_input_base(f); + if(f & rf_clear_edges)clear_element::clear_this(my_output_ports); + if(f & rf_reset_bodies) { + multifunction_body_type* tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + __TBB_ASSERT(!(f & rf_clear_edges) || clear_element::this_empty(my_output_ports), "multifunction_node reset failed"); + } + + multifunction_body_type *my_body; + multifunction_body_type *my_init_body; + output_ports_type my_output_ports; + +}; // multifunction_input + +// template to refer to an output port of a multifunction_node +template +typename std::tuple_element::type &output_port(MOP &op) { + return std::get(op.output_ports()); +} + +inline void check_task_and_spawn(graph& g, graph_task* t) { + if (t && t != SUCCESSFULLY_ENQUEUED) { + spawn_in_graph_arena(g, *t); + } +} + +// helper structs for split_node +template +struct emit_element { + template + static graph_task* emit_this(graph& g, const T &t, P &p) { + // TODO: consider to collect all the tasks in task_list and spawn them all at once + graph_task* last_task = std::get(p).try_put_task(std::get(t)); + check_task_and_spawn(g, last_task); + return emit_element::emit_this(g,t,p); + } +}; + +template<> +struct emit_element<1> { + template + static graph_task* emit_this(graph& g, const T &t, P &p) { + graph_task* last_task = std::get<0>(p).try_put_task(std::get<0>(t)); + check_task_and_spawn(g, last_task); + return SUCCESSFULLY_ENQUEUED; + } +}; + +//! Implements methods for an executable node that takes continue_msg as input +template< typename Output, typename Policy> +class continue_input : public continue_receiver { +public: + + //! The input type of this receiver + typedef continue_msg input_type; + + //! 
The output type of this receiver + typedef Output output_type; + typedef function_body function_body_type; + typedef continue_input class_type; + + template< typename Body > + continue_input( graph &g, Body& body, node_priority_t a_priority ) + : continue_receiver(/*number_of_predecessors=*/0, a_priority) + , my_graph_ref(g) + , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) + , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) + { } + + template< typename Body > + continue_input( graph &g, int number_of_predecessors, + Body& body, node_priority_t a_priority ) + : continue_receiver( number_of_predecessors, a_priority ) + , my_graph_ref(g) + , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) + , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) + { } + + continue_input( const continue_input& src ) : continue_receiver(src), + my_graph_ref(src.my_graph_ref), + my_body( src.my_init_body->clone() ), + my_init_body( src.my_init_body->clone() ) {} + + ~continue_input() { + delete my_body; + delete my_init_body; + } + + template< typename Body > + Body copy_function_object() { + function_body_type &body_ref = *my_body; + return dynamic_cast< function_body_leaf & >(body_ref).get_body(); + } + + void reset_receiver( reset_flags f) override { + continue_receiver::reset_receiver(f); + if(f & rf_reset_bodies) { + function_body_type *tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + } + +protected: + + graph& my_graph_ref; + function_body_type *my_body; + function_body_type *my_init_body; + + virtual broadcast_cache &successors() = 0; + + friend class apply_body_task_bypass< class_type, continue_msg >; + + //! Applies the body to the provided input + graph_task* apply_body_bypass( input_type ) { + // There is an extra copied needed to capture the + // body execution without the try_put + fgt_begin_body( my_body ); + output_type v = (*my_body)( continue_msg() ); + fgt_end_body( my_body ); + return successors().try_put_task( v ); + } + + graph_task* execute() override { + if(!is_graph_active(my_graph_ref)) { + return nullptr; + } +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (push) +#pragma warning (disable: 4127) /* suppress conditional expression is constant */ +#endif + if(has_policy::value) { +#if _MSC_VER && !__INTEL_COMPILER +#pragma warning (pop) +#endif + return apply_body_bypass( continue_msg() ); + } + else { + small_object_allocator allocator{}; + typedef apply_body_task_bypass task_type; + graph_task* t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); + graph_reference().reserve_wait(); + return t; + } + } + + graph& graph_reference() const override { + return my_graph_ref; + } +}; // continue_input + +//! Implements methods for both executable and function nodes that puts Output to its successors +template< typename Output > +class function_output : public sender { +public: + + template friend struct clear_element; + typedef Output output_type; + typedef typename sender::successor_type successor_type; + typedef broadcast_cache broadcast_cache_type; + + function_output(graph& g) : my_successors(this), my_graph_ref(g) {} + function_output(const function_output& other) = delete; + + //! Adds a new successor to this node + bool register_successor( successor_type &r ) override { + successors().register_successor( r ); + return true; + } + + //! 
Removes a successor from this node + bool remove_successor( successor_type &r ) override { + successors().remove_successor( r ); + return true; + } + + broadcast_cache_type &successors() { return my_successors; } + + graph& graph_reference() const { return my_graph_ref; } +protected: + broadcast_cache_type my_successors; + graph& my_graph_ref; +}; // function_output + +template< typename Output > +class multifunction_output : public function_output { +public: + typedef Output output_type; + typedef function_output base_type; + using base_type::my_successors; + + multifunction_output(graph& g) : base_type(g) {} + multifunction_output(const multifunction_output& other) : base_type(other.my_graph_ref) {} + + bool try_put(const output_type &i) { + graph_task *res = try_put_task(i); + if( !res ) return false; + if( res != SUCCESSFULLY_ENQUEUED ) { + // wrapping in task_arena::execute() is not needed since the method is called from + // inside task::execute() + spawn_in_graph_arena(graph_reference(), *res); + } + return true; + } + + using base_type::graph_reference; + +protected: + + graph_task* try_put_task(const output_type &i) { + return my_successors.try_put_task(i); + } + + template friend struct emit_element; + +}; // multifunction_output + +//composite_node +template +void add_nodes_impl(CompositeType*, bool) {} + +template< typename CompositeType, typename NodeType1, typename... NodeTypes > +void add_nodes_impl(CompositeType *c_node, bool visible, const NodeType1& n1, const NodeTypes&... n) { + void *addr = const_cast(&n1); + + fgt_alias_port(c_node, addr, visible); + add_nodes_impl(c_node, visible, n...); +} + +#endif // __TBB__flow_graph_node_impl_H diff --git a/third_party/tbb/detail/_flow_graph_node_set_impl.h b/third_party/tbb/detail/_flow_graph_node_set_impl.h new file mode 100644 index 000000000..993e4fee7 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_node_set_impl.h @@ -0,0 +1,266 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_node_set_impl_H +#define __TBB_flow_graph_node_set_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. 
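// Illustrative usage sketch, not part of the upstream sources: the
// follows()/precedes() helpers defined in this header let a node be built from
// a set of neighbouring nodes instead of a plain graph reference, with the
// connecting edges made implicitly. This assumes the preview feature is
// enabled (__TBB_PREVIEW_FLOW_GRAPH_NODE_SET, normally switched on by defining
// TBB_PREVIEW_FLOW_GRAPH_FEATURES before including the headers); node names
// below are invented.

#include "third_party/tbb/flow_graph.h"

inline void node_set_sketch() {
    using namespace tbb::flow;
    graph g;
    function_node<int, int> doubled(g, unlimited, [](int v) { return 2 * v; });
    function_node<int, int> squared(g, unlimited, [](int v) { return v * v; });

    // Constructed from follows(...): the graph is taken from the set and an
    // edge is made from each listed predecessor, so no make_edge calls are needed.
    function_node<int, int> sink(follows(doubled, squared), serial,
                                 [](int v) { return v; });

    // precedes(...) is the mirror image: the new node is wired to each listed
    // successor instead.
    function_node<int, int> source(precedes(doubled, squared), unlimited,
                                   [](int v) { return v + 1; });

    source.try_put(1);
    g.wait_for_all();
}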
+#endif + +// Included in namespace tbb::detail::d1 (in flow_graph.h) + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +// Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get +// Seems like the well-formed expression in trailing decltype is treated as ill-formed +// TODO: investigate problems with decltype in trailing return types or find the cross-platform solution +#define __TBB_MSVC_DISABLE_TRAILING_DECLTYPE (_MSC_VER >= 1900) + +namespace order { +struct undefined {}; +struct following {}; +struct preceding {}; +} + +class get_graph_helper { +public: + // TODO: consider making graph_reference() public and consistent interface to get a reference to the graph + // and remove get_graph_helper + template + static graph& get(const T& object) { + return get_impl(object, std::is_base_of()); + } + +private: + // Get graph from the object of type derived from graph_node + template + static graph& get_impl(const T& object, std::true_type) { + return static_cast(&object)->my_graph; + } + + template + static graph& get_impl(const T& object, std::false_type) { + return object.graph_reference(); + } +}; + +template +struct node_set { + typedef Order order_type; + + std::tuple nodes; + node_set(Nodes&... ns) : nodes(ns...) {} + + template + node_set(const node_set& set) : nodes(set.nodes) {} + + graph& graph_reference() const { + return get_graph_helper::get(std::get<0>(nodes)); + } +}; + +namespace alias_helpers { +template using output_type = typename T::output_type; +template using output_ports_type = typename T::output_ports_type; +template using input_type = typename T::input_type; +template using input_ports_type = typename T::input_ports_type; +} // namespace alias_helpers + +template +using has_output_type = supports; + +template +using has_input_type = supports; + +template +using has_input_ports_type = supports; + +template +using has_output_ports_type = supports; + +template +struct is_sender : std::is_base_of, T> {}; + +template +struct is_receiver : std::is_base_of, T> {}; + +template +struct is_async_node : std::false_type {}; + +template +struct is_async_node> : std::true_type {}; + +template +node_set +follows(FirstPredecessor& first_predecessor, Predecessors&... predecessors) { + static_assert((conjunction, + has_output_type...>::value), + "Not all node's predecessors has output_type typedef"); + static_assert((conjunction, is_sender...>::value), + "Not all node's predecessors are senders"); + return node_set(first_predecessor, predecessors...); +} + +template +node_set +follows(node_set& predecessors_set) { + static_assert((conjunction...>::value), + "Not all nodes in the set has output_type typedef"); + static_assert((conjunction...>::value), + "Not all nodes in the set are senders"); + return node_set(predecessors_set); +} + +template +node_set +precedes(FirstSuccessor& first_successor, Successors&... successors) { + static_assert((conjunction, + has_input_type...>::value), + "Not all node's successors has input_type typedef"); + static_assert((conjunction, is_receiver...>::value), + "Not all node's successors are receivers"); + return node_set(first_successor, successors...); +} + +template +node_set +precedes(node_set& successors_set) { + static_assert((conjunction...>::value), + "Not all nodes in the set has input_type typedef"); + static_assert((conjunction...>::value), + "Not all nodes in the set are receivers"); + return node_set(successors_set); +} + +template +node_set +make_node_set(Node& first_node, Nodes&... 
nodes) { + return node_set(first_node, nodes...); +} + +template +class successor_selector { + template + static auto get_impl(NodeType& node, std::true_type) -> decltype(input_port(node)) { + return input_port(node); + } + + template + static NodeType& get_impl(NodeType& node, std::false_type) { return node; } + +public: + template +#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE + static auto& get(NodeType& node) +#else + static auto get(NodeType& node) -> decltype(get_impl(node, has_input_ports_type())) +#endif + { + return get_impl(node, has_input_ports_type()); + } +}; + +template +class predecessor_selector { + template + static auto internal_get(NodeType& node, std::true_type) -> decltype(output_port(node)) { + return output_port(node); + } + + template + static NodeType& internal_get(NodeType& node, std::false_type) { return node;} + + template +#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE + static auto& get_impl(NodeType& node, std::false_type) +#else + static auto get_impl(NodeType& node, std::false_type) -> decltype(internal_get(node, has_output_ports_type())) +#endif + { + return internal_get(node, has_output_ports_type()); + } + + template + static AsyncNode& get_impl(AsyncNode& node, std::true_type) { return node; } + +public: + template +#if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE + static auto& get(NodeType& node) +#else + static auto get(NodeType& node) -> decltype(get_impl(node, is_async_node())) +#endif + { + return get_impl(node, is_async_node()); + } +}; + +template +class make_edges_helper { +public: + template + static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) { + make_edge(std::get(predecessors), successor_selector::get(node)); + make_edges_helper::connect_predecessors(predecessors, node); + } + + template + static void connect_successors(NodeType& node, SuccessorsTuple& successors) { + make_edge(predecessor_selector::get(node), std::get(successors)); + make_edges_helper::connect_successors(node, successors); + } +}; + +template<> +struct make_edges_helper<0> { + template + static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) { + make_edge(std::get<0>(predecessors), successor_selector<0>::get(node)); + } + + template + static void connect_successors(NodeType& node, SuccessorsTuple& successors) { + make_edge(predecessor_selector<0>::get(node), std::get<0>(successors)); + } +}; + +// TODO: consider adding an overload for making edges between node sets +template +void make_edges(const node_set& s, NodeType& node) { + const std::size_t SetSize = std::tuple_size::value; + make_edges_helper::connect_predecessors(s.nodes, node); +} + +template +void make_edges(NodeType& node, const node_set& s) { + const std::size_t SetSize = std::tuple_size::value; + make_edges_helper::connect_successors(node, s.nodes); +} + +template +void make_edges_in_order(const node_set& ns, NodeType& node) { + make_edges(ns, node); +} + +template +void make_edges_in_order(const node_set& ns, NodeType& node) { + make_edges(node, ns); +} + +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +#endif // __TBB_flow_graph_node_set_impl_H diff --git a/third_party/tbb/detail/_flow_graph_nodes_deduction.h b/third_party/tbb/detail/_flow_graph_nodes_deduction.h new file mode 100644 index 000000000..4eaa7a8b4 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_nodes_deduction.h @@ -0,0 +1,278 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except 
in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_nodes_deduction_H +#define __TBB_flow_graph_nodes_deduction_H + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct declare_body_types { + using input_type = Input; + using output_type = Output; +}; + +struct NoInputBody {}; + +template +struct declare_body_types { + using output_type = Output; +}; + +template struct body_types; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +struct body_types : declare_body_types {}; + +template +using input_t = typename body_types::input_type; + +template +using output_t = typename body_types::output_type; + +template +auto decide_on_operator_overload(Output (T::*name)(const Input&) const)->decltype(name); + +template +auto decide_on_operator_overload(Output (T::*name)(const Input&))->decltype(name); + +template +auto decide_on_operator_overload(Output (T::*name)(Input&) const)->decltype(name); + +template +auto decide_on_operator_overload(Output (T::*name)(Input&))->decltype(name); + +template +auto decide_on_operator_overload(Output (*name)(const Input&))->decltype(name); + +template +auto decide_on_operator_overload(Output (*name)(Input&))->decltype(name); + +template +decltype(decide_on_operator_overload(&Body::operator())) decide_on_callable_type(int); + +template +decltype(decide_on_operator_overload(std::declval())) decide_on_callable_type(...); + +// Deduction guides for Flow Graph nodes + +template +input_node(GraphOrSet&&, Body) +->input_node(0))>>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +struct decide_on_set; + +template +struct decide_on_set> { + using type = typename Node::output_type; +}; + +template +struct decide_on_set> { + using type = typename Node::input_type; +}; + +template +using decide_on_set_t = typename decide_on_set>::type; + +template +broadcast_node(const NodeSet&) +->broadcast_node>; + +template +buffer_node(const NodeSet&) +->buffer_node>; + +template +queue_node(const NodeSet&) +->queue_node>; +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +sequencer_node(GraphOrProxy&&, Sequencer) +->sequencer_node(0))>>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +priority_queue_node(const NodeSet&, const Compare&) +->priority_queue_node, Compare>; + +template +priority_queue_node(const NodeSet&) +->priority_queue_node, std::less>>; +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +struct join_key { + using type = Key; +}; + +template +struct join_key { + using type = T&; +}; + +template +using join_key_t = typename join_key::type; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +join_node(const node_set&, Policy) +->join_node, + Policy>; + +template 
+join_node(const node_set&, Policy) +->join_node; + +template +join_node(const node_set) +->join_node, + queueing>; + +template +join_node(const node_set) +->join_node; +#endif + +template +join_node(GraphOrProxy&&, Body, Bodies...) +->join_node(0))>, + input_t(0))>...>, + key_matching(0))>>>>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +indexer_node(const node_set&) +->indexer_node; +#endif + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +template +limiter_node(const NodeSet&, size_t) +->limiter_node>; + +template +split_node(const node_set&) +->split_node; + +template +split_node(const node_set&) +->split_node>; + +#endif + +template +function_node(GraphOrSet&&, + size_t, Body, + Policy, node_priority_t = no_priority) +->function_node(0))>, + output_t(0))>, + Policy>; + +template +function_node(GraphOrSet&&, size_t, + Body, node_priority_t = no_priority) +->function_node(0))>, + output_t(0))>, + queueing>; + +template +struct continue_output { + using type = Output; +}; + +template <> +struct continue_output { + using type = continue_msg; +}; + +template +using continue_output_t = typename continue_output::type; + +template +continue_node(GraphOrSet&&, Body, + Policy, node_priority_t = no_priority) +->continue_node>, + Policy>; + +template +continue_node(GraphOrSet&&, + int, Body, + Policy, node_priority_t = no_priority) +->continue_node>, + Policy>; + +template +continue_node(GraphOrSet&&, + Body, node_priority_t = no_priority) +->continue_node>, Policy>; + +template +continue_node(GraphOrSet&&, int, + Body, node_priority_t = no_priority) +->continue_node>, + Policy>; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + +template +overwrite_node(const NodeSet&) +->overwrite_node>; + +template +write_once_node(const NodeSet&) +->write_once_node>; +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +#endif // __TBB_flow_graph_nodes_deduction_H diff --git a/third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h b/third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h new file mode 100644 index 000000000..68ce59b96 --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_tagged_buffer_impl.h @@ -0,0 +1,258 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// a hash table buffer that can expand, and can support as many deletions as +// additions, list-based, with elements of list held in array (for destruction +// management), multiplicative hashing (like ets). No synchronization built-in. +// + +#ifndef __TBB__flow_graph_hash_buffer_impl_H +#define __TBB__flow_graph_hash_buffer_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. 
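// Illustrative sketch, not part of the upstream sources: with C++17 class
// template argument deduction, the guides above let the node template
// arguments be inferred from the body's call signature (function and variable
// names below are invented).

#include "third_party/tbb/flow_graph.h"

inline void deduction_guides_sketch() {
    using namespace tbb::flow;
    graph g;

    // Deduced as function_node<int, double, queueing>: input_t comes from the
    // lambda's (const int&) parameter, output_t from its double return type.
    function_node halver(g, unlimited, [](const int& v) -> double { return v / 2.0; });

    // A body callable with continue_msg and returning void deduces
    // continue_node<continue_msg> via continue_output_t.
    continue_node tick(g, [](const continue_msg&) {});

    halver.try_put(7);
    tick.try_put(continue_msg());
    g.wait_for_all();
}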
+#endif + +// included in namespace tbb::flow::interfaceX::internal + +// elements in the table are a simple list; we need pointer to next element to +// traverse the chain +template +struct buffer_element_type { + // the second parameter below is void * because we can't forward-declare the type + // itself, so we just reinterpret_cast below. + typedef typename aligned_pair::type type; +}; + +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename Allocator=tbb::cache_aligned_allocator< typename aligned_pair::type > + > +class hash_buffer : public HashCompare { +public: + static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table + typedef ValueType value_type; + typedef typename buffer_element_type< value_type >::type element_type; + typedef value_type *pointer_type; + typedef element_type *list_array_type; // array we manage manually + typedef list_array_type *pointer_array_type; + typedef typename std::allocator_traits::template rebind_alloc pointer_array_allocator_type; + typedef typename std::allocator_traits::template rebind_alloc elements_array_allocator; + typedef typename std::decay::type Knoref; + +private: + ValueToKey *my_key; + size_t my_size; + size_t nelements; + pointer_array_type pointer_array; // pointer_array[my_size] + list_array_type elements_array; // elements_array[my_size / 2] + element_type* free_list; + + size_t mask() { return my_size - 1; } + + void set_up_free_list( element_type **p_free_list, list_array_type la, size_t sz) { + for(size_t i=0; i < sz - 1; ++i ) { // construct free list + la[i].second = &(la[i+1]); + } + la[sz-1].second = nullptr; + *p_free_list = (element_type *)&(la[0]); + } + + // cleanup for exceptions + struct DoCleanup { + pointer_array_type *my_pa; + list_array_type *my_elements; + size_t my_size; + + DoCleanup(pointer_array_type &pa, list_array_type &my_els, size_t sz) : + my_pa(&pa), my_elements(&my_els), my_size(sz) { } + ~DoCleanup() { + if(my_pa) { + size_t dont_care = 0; + internal_free_buffer(*my_pa, *my_elements, my_size, dont_care); + } + } + }; + + // exception-safety requires we do all the potentially-throwing operations first + void grow_array() { + size_t new_size = my_size*2; + size_t new_nelements = nelements; // internal_free_buffer zeroes this + list_array_type new_elements_array = nullptr; + pointer_array_type new_pointer_array = nullptr; + list_array_type new_free_list = nullptr; + { + DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size); + new_elements_array = elements_array_allocator().allocate(my_size); + new_pointer_array = pointer_array_allocator_type().allocate(new_size); + for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = nullptr; + set_up_free_list(&new_free_list, new_elements_array, my_size ); + + for(size_t i=0; i < my_size; ++i) { + for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->second)) { + value_type *ov = reinterpret_cast(&(op->first)); + // could have std::move semantics + internal_insert_with_key(new_pointer_array, new_size, new_free_list, *ov); + } + } + my_cleanup.my_pa = nullptr; + my_cleanup.my_elements = nullptr; + } + + internal_free_buffer(pointer_array, elements_array, my_size, nelements); + free_list = new_free_list; + pointer_array = new_pointer_array; + elements_array = new_elements_array; + my_size = new_size; + nelements = new_nelements; + } 
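// Illustrative sketch, not part of the upstream sources: a much simplified
// model of the policy this buffer implements -- separate chaining over a
// power-of-two bucket array that doubles once the element count exceeds half
// the bucket count (the nelements*2 > my_size test in insert_with_key() below).
// The real class additionally recycles nodes through free_list and manages
// raw, allocator-provided storage; names below are invented.

#include <cstddef>
#include <forward_list>
#include <functional>
#include <utility>
#include <vector>

template <typename Key, typename Value, typename Hash = std::hash<Key>>
class toy_chained_table {
    std::vector<std::forward_list<std::pair<Key, Value>>> buckets_;
    std::size_t count_ = 0;
    Hash hash_;

    std::size_t mask() const { return buckets_.size() - 1; }   // size is a power of two

    void rehash_doubled() {
        std::vector<std::forward_list<std::pair<Key, Value>>> bigger(buckets_.size() * 2);
        for (auto& chain : buckets_)
            for (auto& kv : chain)
                bigger[hash_(kv.first) & (bigger.size() - 1)].push_front(std::move(kv));
        buckets_.swap(bigger);
    }

public:
    toy_chained_table() : buckets_(8) {}   // mirrors INITIAL_SIZE above

    void insert(const Key& k, const Value& v) {
        ++count_;
        if (count_ * 2 > buckets_.size()) rehash_doubled();   // keep load factor <= 1/2
        buckets_[hash_(k) & mask()].push_front({k, v});
    }

    const Value* find(const Key& k) const {
        for (const auto& kv : buckets_[hash_(k) & mask()])
            if (kv.first == k) return &kv.second;
        return nullptr;
    }
};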
+ + // v should have perfect forwarding if std::move implemented. + // we use this method to move elements in grow_array, so can't use class fields + void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list, + const value_type &v) { + size_t l_mask = p_sz-1; + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + size_t h = this->hash(tbb::detail::invoke(*my_key, v)) & l_mask; + __TBB_ASSERT(p_free_list, "Error: free list not set up."); + element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second); + (void) new(&(my_elem->first)) value_type(v); + my_elem->second = p_pointer_array[h]; + p_pointer_array[h] = my_elem; + } + + void internal_initialize_buffer() { + pointer_array = pointer_array_allocator_type().allocate(my_size); + for(size_t i = 0; i < my_size; ++i) pointer_array[i] = nullptr; + elements_array = elements_array_allocator().allocate(my_size / 2); + set_up_free_list(&free_list, elements_array, my_size / 2); + } + + // made static so an enclosed class can use to properly dispose of the internals + static void internal_free_buffer( pointer_array_type &pa, list_array_type &el, size_t &sz, size_t &ne ) { + if(pa) { + for(size_t i = 0; i < sz; ++i ) { + element_type *p_next; + for( element_type *p = pa[i]; p; p = p_next) { + p_next = (element_type *)p->second; + // TODO revamp: make sure type casting is correct. + void* ptr = (void*)(p->first); +#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER + suppress_unused_warning(ptr); +#endif + ((value_type*)ptr)->~value_type(); + } + } + pointer_array_allocator_type().deallocate(pa, sz); + pa = nullptr; + } + // Separate test (if allocation of pa throws, el may be allocated. + // but no elements will be constructed.) + if(el) { + elements_array_allocator().deallocate(el, sz / 2); + el = nullptr; + } + sz = INITIAL_SIZE; + ne = 0; + } + +public: + hash_buffer() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { + internal_initialize_buffer(); + } + + ~hash_buffer() { + internal_free_buffer(pointer_array, elements_array, my_size, nelements); + delete my_key; + my_key = nullptr; + } + hash_buffer(const hash_buffer&) = delete; + hash_buffer& operator=(const hash_buffer&) = delete; + + void reset() { + internal_free_buffer(pointer_array, elements_array, my_size, nelements); + internal_initialize_buffer(); + } + + // Take ownership of func object allocated with new. + // This method is only used internally, so can't be misused by user. + void set_key_func(ValueToKey *vtk) { my_key = vtk; } + // pointer is used to clone() + ValueToKey* get_key_func() { return my_key; } + + bool insert_with_key(const value_type &v) { + pointer_type p = nullptr; + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + if(find_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { + p->~value_type(); + (void) new(p) value_type(v); // copy-construct into the space + return false; + } + ++nelements; + if(nelements*2 > my_size) grow_array(); + internal_insert_with_key(pointer_array, my_size, free_list, v); + return true; + } + + // returns true and sets v to array element if found, else returns false. 
+ bool find_ref_with_key(const Knoref& k, pointer_type &v) { + size_t i = this->hash(k) & mask(); + for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) { + pointer_type pv = reinterpret_cast(&(p->first)); + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + if(this->equal(tbb::detail::invoke(*my_key, *pv), k)) { + v = pv; + return true; + } + } + return false; + } + + bool find_with_key( const Knoref& k, value_type &v) { + value_type *p; + if(find_ref_with_key(k, p)) { + v = *p; + return true; + } + else + return false; + } + + void delete_with_key(const Knoref& k) { + size_t h = this->hash(k) & mask(); + element_type* prev = nullptr; + for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) { + value_type *vp = reinterpret_cast(&(p->first)); + __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); + if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) { + vp->~value_type(); + if(prev) prev->second = p->second; + else pointer_array[h] = (element_type *)(p->second); + p->second = free_list; + free_list = p; + --nelements; + return; + } + } + __TBB_ASSERT(false, "key not found for delete"); + } +}; +#endif // __TBB__flow_graph_hash_buffer_impl_H diff --git a/third_party/tbb/detail/_flow_graph_trace_impl.h b/third_party/tbb/detail/_flow_graph_trace_impl.h new file mode 100644 index 000000000..dc9c857be --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_trace_impl.h @@ -0,0 +1,365 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _FGT_GRAPH_TRACE_IMPL_H +#define _FGT_GRAPH_TRACE_IMPL_H + +#include "third_party/tbb/profiling.h" +#if (_MSC_VER >= 1900) + // MISSING #include +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template< typename T > class sender; +template< typename T > class receiver; + +#if TBB_USE_PROFILING_TOOLS + #if __TBB_FLOW_TRACE_CODEPTR + #if (_MSC_VER >= 1900) + #define CODEPTR() (_ReturnAddress()) + #elif __TBB_GCC_VERSION >= 40800 + #define CODEPTR() ( __builtin_return_address(0)) + #else + #define CODEPTR() nullptr + #endif + #else + #define CODEPTR() nullptr + #endif /* __TBB_FLOW_TRACE_CODEPTR */ + +static inline void fgt_alias_port(void *node, void *p, bool visible) { + if(visible) + itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); + else + itt_relation_add( ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); +} + +static inline void fgt_composite ( void* codeptr, void *node, void *graph ) { + itt_make_task_group( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); + suppress_unused_warning( codeptr ); +#if __TBB_FLOW_TRACE_CODEPTR + if (codeptr != nullptr) { + register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + } +#endif +} + +static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) { + itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); +} + +static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) { + itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); +} + +template +void alias_input_port(void *node, receiver* port, string_resource_index name_index) { + // TODO: Make fgt_internal_alias_input_port a function template? + fgt_internal_alias_input_port( node, port, name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_input_alias_helper { + static void alias_port( void *node, PortsTuple &ports ) { + alias_input_port( node, &(std::get(ports)), static_cast(FLOW_INPUT_PORT_0 + N - 1) ); + fgt_internal_input_alias_helper::alias_port( node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_input_alias_helper { + static void alias_port( void * /* node */, PortsTuple & /* ports */ ) { } +}; + +template +void alias_output_port(void *node, sender* port, string_resource_index name_index) { + // TODO: Make fgt_internal_alias_output_port a function template? 
+ fgt_internal_alias_output_port( node, static_cast(port), name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_output_alias_helper { + static void alias_port( void *node, PortsTuple &ports ) { + alias_output_port( node, &(std::get(ports)), static_cast(FLOW_OUTPUT_PORT_0 + N - 1) ); + fgt_internal_output_alias_helper::alias_port( node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_output_alias_helper { + static void alias_port( void * /*node*/, PortsTuple &/*ports*/ ) { + } +}; + +static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) { + itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); +} + +static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) { + itt_make_task_group(ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); + suppress_unused_warning( codeptr ); +#if __TBB_FLOW_TRACE_CODEPTR + if (codeptr != nullptr) { + register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + } +#endif +} + +template +void register_input_port(void *node, receiver* port, string_resource_index name_index) { + // TODO: Make fgt_internal_create_input_port a function template? + fgt_internal_create_input_port(node, static_cast(port), name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_input_helper { + static void register_port( void *node, PortsTuple &ports ) { + register_input_port( node, &(std::get(ports)), static_cast(FLOW_INPUT_PORT_0 + N - 1) ); + fgt_internal_input_helper::register_port( node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_input_helper { + static void register_port( void *node, PortsTuple &ports ) { + register_input_port( node, &(std::get<0>(ports)), FLOW_INPUT_PORT_0 ); + } +}; + +template +void register_output_port(void* codeptr, void *node, sender* port, string_resource_index name_index) { + // TODO: Make fgt_internal_create_output_port a function template? 
+ fgt_internal_create_output_port( codeptr, node, static_cast(port), name_index); +} + +template < typename PortsTuple, int N > +struct fgt_internal_output_helper { + static void register_port( void* codeptr, void *node, PortsTuple &ports ) { + register_output_port( codeptr, node, &(std::get(ports)), static_cast(FLOW_OUTPUT_PORT_0 + N - 1) ); + fgt_internal_output_helper::register_port( codeptr, node, ports ); + } +}; + +template < typename PortsTuple > +struct fgt_internal_output_helper { + static void register_port( void* codeptr, void *node, PortsTuple &ports ) { + register_output_port( codeptr, node, &(std::get<0>(ports)), FLOW_OUTPUT_PORT_0 ); + } +}; + +template< typename NodeType > +void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) { + void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) ); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); +} + +template< typename NodeType > +void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) { + void *addr = const_cast(node); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); +} + +template< typename NodeType > +static inline void fgt_node_desc( const NodeType *node, const char *desc ) { + void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) ); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); +} + +static inline void fgt_graph_desc( const void *g, const char *desc ) { + void *addr = const_cast< void *>(g); + itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); +} + +static inline void fgt_body( void *node, void *body ) { + itt_relation_add( ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); +} + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) { + itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); + fgt_internal_output_helper::register_port(codeptr, input_port, ports ); +} + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) { + itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); + fgt_internal_output_helper::register_port( codeptr, input_port, ports ); + fgt_body( input_port, body ); +} + +template< int N, typename PortsTuple > +static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) { + itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); + fgt_internal_input_helper::register_port( output_port, ports ); +} + +static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) { + itt_make_task_group( ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); + suppress_unused_warning( codeptr ); +#if __TBB_FLOW_TRACE_CODEPTR + if (codeptr != nullptr) { + register_node_addr(ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, 
&codeptr); + } +#endif +} + +static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) { + itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); +} + +static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) { + itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); + fgt_body( output_port, body ); +} + +static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port ) { + fgt_node( codeptr, t, g, output_port ); + fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 ); +} + +static inline void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port, void *body ) { + fgt_node_with_body( codeptr, t, g, output_port, body ); + fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 ); +} + + +static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *decrement_port, void *output_port ) { + fgt_node( codeptr, t, g, input_port, output_port ); + fgt_internal_create_input_port( output_port, decrement_port, FLOW_INPUT_PORT_1 ); +} + +static inline void fgt_make_edge( void *output_port, void *input_port ) { + itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); +} + +static inline void fgt_remove_edge( void *output_port, void *input_port ) { + itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); +} + +static inline void fgt_graph( void *g ) { + itt_make_task_group( ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); +} + +static inline void fgt_begin_body( void *body ) { + itt_task_begin( ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); +} + +static inline void fgt_end_body( void * ) { + itt_task_end( ITT_DOMAIN_FLOW ); +} + +static inline void fgt_async_try_put_begin( void *node, void *port ) { + itt_task_begin( ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); +} + +static inline void fgt_async_try_put_end( void *, void * ) { + itt_task_end( ITT_DOMAIN_FLOW ); +} + +static inline void fgt_async_reserve( void *node, void *graph ) { + itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); +} + +static inline void fgt_async_commit( void *node, void * /*graph*/) { + itt_region_end( ITT_DOMAIN_FLOW, node, FLOW_NODE ); +} + +static inline void fgt_reserve_wait( void *graph ) { + itt_region_begin( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); +} + +static inline void fgt_release_wait( void *graph ) { + itt_region_end( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); +} + +#else // TBB_USE_PROFILING_TOOLS + +#define CODEPTR() nullptr + +static inline void fgt_alias_port(void * /*node*/, void * /*p*/, bool /*visible*/ ) { } + +static inline void fgt_composite ( void* /*codeptr*/, void * /*node*/, void * /*graph*/ ) { } + +static inline void fgt_graph( void * /*g*/ ) { } + +template< typename NodeType > +static inline void fgt_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } + +template< typename NodeType > +static inline 
void fgt_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } + +static inline void fgt_graph_desc( const void * /*g*/, const char * /*desc*/ ) { } + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/ ) { } + +template< int N, typename PortsTuple > +static inline void fgt_multioutput_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/, void * /*body*/ ) { } + +template< int N, typename PortsTuple > +static inline void fgt_multiinput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, PortsTuple & /*ports*/, void * /*output_port*/ ) { } + +static inline void fgt_multiinput_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*node*/, void * /*graph*/ ) { } + +static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/ ) { } +static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*decrement_port*/, void * /*output_port*/ ) { } + +static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*output_port*/, void * /*body*/ ) { } +static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/, void * /*body*/ ) { } + +static inline void fgt_make_edge( void * /*output_port*/, void * /*input_port*/ ) { } +static inline void fgt_remove_edge( void * /*output_port*/, void * /*input_port*/ ) { } + +static inline void fgt_begin_body( void * /*body*/ ) { } +static inline void fgt_end_body( void * /*body*/) { } + +static inline void fgt_async_try_put_begin( void * /*node*/, void * /*port*/ ) { } +static inline void fgt_async_try_put_end( void * /*node*/ , void * /*port*/ ) { } +static inline void fgt_async_reserve( void * /*node*/, void * /*graph*/ ) { } +static inline void fgt_async_commit( void * /*node*/, void * /*graph*/ ) { } +static inline void fgt_reserve_wait( void * /*graph*/ ) { } +static inline void fgt_release_wait( void * /*graph*/ ) { } + +template< typename NodeType > +void fgt_multiinput_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } + +template < typename PortsTuple, int N > +struct fgt_internal_input_alias_helper { + static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { } +}; + +template < typename PortsTuple, int N > +struct fgt_internal_output_alias_helper { + static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { } +}; + +#endif // TBB_USE_PROFILING_TOOLS + +} // d1 +} // namespace detail +} // namespace tbb + +#endif // _FGT_GRAPH_TRACE_IMPL_H diff --git a/third_party/tbb/detail/_flow_graph_types_impl.h b/third_party/tbb/detail/_flow_graph_types_impl.h new file mode 100644 index 000000000..3de282c3b --- /dev/null +++ b/third_party/tbb/detail/_flow_graph_types_impl.h @@ -0,0 +1,408 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__flow_graph_types_impl_H +#define __TBB__flow_graph_types_impl_H + +#ifndef __TBB_flow_graph_H +#error Do not #include this internal file directly; use public TBB headers instead. +#endif + +// included in namespace tbb::detail::d1 + +// the change to key_matching (adding a K and KHash template parameter, making it a class) +// means we have to pass this data to the key_matching_port. All the ports have only one +// template parameter, so we have to wrap the following types in a trait: +// +// . K == key_type +// . KHash == hash and compare for Key +// . TtoK == function_body that given an object of T, returns its K +// . T == type accepted by port, and stored in the hash table +// +// The port will have an additional parameter on node construction, which is a function_body +// that accepts a const T& and returns a K which is the field in T which is its K. +template +struct KeyTrait { + typedef Kp K; + typedef Tp T; + typedef type_to_key_function_body TtoK; + typedef KHashp KHash; +}; + +// wrap each element of a tuple in a template, and make a tuple of the result. +template class PT, typename TypeTuple> +struct wrap_tuple_elements; + +// A wrapper that generates the traits needed for each port of a key-matching join, +// and the type of the tuple of input ports. +template class PT, typename KeyTraits, typename TypeTuple> +struct wrap_key_tuple_elements; + +template class PT, typename... Args> +struct wrap_tuple_elements >{ + typedef typename std::tuple... > type; +}; + +template class PT, typename KeyTraits, typename... Args> +struct wrap_key_tuple_elements > { + typedef typename KeyTraits::key_type K; + typedef typename KeyTraits::hash_compare_type KHash; + typedef typename std::tuple >... > type; +}; + +template< int... S > class sequence {}; + +template< int N, int... S > +struct make_sequence : make_sequence < N - 1, N - 1, S... > {}; + +template< int... S > +struct make_sequence < 0, S... > { + typedef sequence type; +}; + +//! type mimicking std::pair but with trailing fill to ensure each element of an array +//* will have the correct alignment +template +struct type_plus_align { + char first[sizeof(T1)]; + T2 second; + char fill1[REM]; +}; + +template +struct type_plus_align { + char first[sizeof(T1)]; + T2 second; +}; + +template struct alignment_of { + typedef struct { char t; U padded; } test_alignment; + static const size_t value = sizeof(test_alignment) - sizeof(U); +}; + +// T1, T2 are actual types stored. The space defined for T1 in the type returned +// is a char array of the correct size. Type T2 should be trivially-constructible, +// T1 must be explicitly managed. +template +struct aligned_pair { + static const size_t t1_align = alignment_of::value; + static const size_t t2_align = alignment_of::value; + typedef type_plus_align just_pair; + static const size_t max_align = t1_align < t2_align ? t2_align : t1_align; + static const size_t extra_bytes = sizeof(just_pair) % max_align; + static const size_t remainder = extra_bytes ? 
max_align - extra_bytes : 0; +public: + typedef type_plus_align type; +}; // aligned_pair + +// support for variant type +// type we use when we're not storing a value +struct default_constructed { }; + +// type which contains another type, tests for what type is contained, and references to it. +// Wrapper +// void CopyTo( void *newSpace) : builds a Wrapper copy of itself in newSpace + +// struct to allow us to copy and test the type of objects +struct WrapperBase { + virtual ~WrapperBase() {} + virtual void CopyTo(void* /*newSpace*/) const = 0; +}; + +// Wrapper contains a T, with the ability to test what T is. The Wrapper can be +// constructed from a T, can be copy-constructed from another Wrapper, and can be +// examined via value(), but not modified. +template +struct Wrapper: public WrapperBase { + typedef T value_type; + typedef T* pointer_type; +private: + T value_space; +public: + const value_type &value() const { return value_space; } + +private: + Wrapper(); + + // on exception will ensure the Wrapper will contain only a trivially-constructed object + struct _unwind_space { + pointer_type space; + _unwind_space(pointer_type p) : space(p) {} + ~_unwind_space() { + if(space) (void) new (space) Wrapper(default_constructed()); + } + }; +public: + explicit Wrapper( const T& other ) : value_space(other) { } + explicit Wrapper(const Wrapper& other) = delete; + + void CopyTo(void* newSpace) const override { + _unwind_space guard((pointer_type)newSpace); + (void) new(newSpace) Wrapper(value_space); + guard.space = nullptr; + } + ~Wrapper() { } +}; + +// specialization for array objects +template +struct Wrapper : public WrapperBase { + typedef T value_type; + typedef T* pointer_type; + // space must be untyped. + typedef T ArrayType[N]; +private: + // The space is not of type T[N] because when copy-constructing, it would be + // default-initialized and then copied to in some fashion, resulting in two + // constructions and one destruction per element. If the type is char[ ], we + // placement new into each element, resulting in one construction per element. + static const size_t space_size = sizeof(ArrayType); + char value_space[space_size]; + + + // on exception will ensure the already-built objects will be destructed + // (the value_space is a char array, so it is already trivially-destructible.) 
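
The _unwind_class guard declared next performs that unwind as an RAII object. The underlying exception-safety pattern, shown here as a standalone try/catch sketch with made-up names (and assuming the destination storage is suitably aligned for T), is:

    #include <cstddef>
    #include <new>

    // Copy-construct N elements of T into raw storage. If any copy throws,
    // destroy the prefix that was already built, in reverse order, before
    // re-throwing, so the buffer is never left holding a partial array.
    template <typename T, std::size_t N>
    void copy_construct_array(void* raw, const T (&src)[N]) {
        T* dst = static_cast<T*>(raw);
        std::size_t built = 0;
        try {
            for (; built < N; ++built)
                ::new (static_cast<void*>(dst + built)) T(src[built]);
        } catch (...) {
            while (built > 0) dst[--built].~T();
            throw;
        }
    }

Wrapper goes one step further: after destroying the prefix, its guard also rebuilds a default_constructed Wrapper in the space, so the surrounding variant is still left in a destructible state.
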
+ struct _unwind_class { + pointer_type space; + int already_built; + _unwind_class(pointer_type p) : space(p), already_built(0) {} + ~_unwind_class() { + if(space) { + for(size_t i = already_built; i > 0 ; --i ) space[i-1].~value_type(); + (void) new(space) Wrapper(default_constructed()); + } + } + }; +public: + const ArrayType &value() const { + char *vp = const_cast(value_space); + return reinterpret_cast(*vp); + } + +private: + Wrapper(); +public: + // have to explicitly construct because other decays to a const value_type* + explicit Wrapper(const ArrayType& other) { + _unwind_class guard((pointer_type)value_space); + pointer_type vp = reinterpret_cast(&value_space); + for(size_t i = 0; i < N; ++i ) { + (void) new(vp++) value_type(other[i]); + ++(guard.already_built); + } + guard.space = nullptr; + } + explicit Wrapper(const Wrapper& other) : WrapperBase() { + // we have to do the heavy lifting to copy contents + _unwind_class guard((pointer_type)value_space); + pointer_type dp = reinterpret_cast(value_space); + pointer_type sp = reinterpret_cast(const_cast(other.value_space)); + for(size_t i = 0; i < N; ++i, ++dp, ++sp) { + (void) new(dp) value_type(*sp); + ++(guard.already_built); + } + guard.space = nullptr; + } + + void CopyTo(void* newSpace) const override { + (void) new(newSpace) Wrapper(*this); // exceptions handled in copy constructor + } + + ~Wrapper() { + // have to destroy explicitly in reverse order + pointer_type vp = reinterpret_cast(&value_space); + for(size_t i = N; i > 0 ; --i ) vp[i-1].~value_type(); + } +}; + +// given a tuple, return the type of the element that has the maximum alignment requirement. +// Given a tuple and that type, return the number of elements of the object with the max +// alignment requirement that is at least as big as the largest object in the tuple. + +template struct pick_one; +template struct pick_one { typedef T1 type; }; +template struct pick_one { typedef T2 type; }; + +template< template class Selector, typename T1, typename T2 > +struct pick_max { + typedef typename pick_one< (Selector::value > Selector::value), T1, T2 >::type type; +}; + +template struct size_of { static const int value = sizeof(T); }; + +template< size_t N, class Tuple, template class Selector > struct pick_tuple_max { + typedef typename pick_tuple_max::type LeftMaxType; + typedef typename std::tuple_element::type ThisType; + typedef typename pick_max::type type; +}; + +template< class Tuple, template class Selector > struct pick_tuple_max<0, Tuple, Selector> { + typedef typename std::tuple_element<0, Tuple>::type type; +}; + +// is the specified type included in a tuple? +template +struct is_element_of { + typedef typename std::tuple_element::type T_i; + static const bool value = std::is_same::value || is_element_of::value; +}; + +template +struct is_element_of { + typedef typename std::tuple_element<0, Tuple>::type T_i; + static const bool value = std::is_same::value; +}; + +// allow the construction of types that are listed tuple. If a disallowed type +// construction is written, a method involving this type is created. The +// type has no definition, so a syntax error is generated. 
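
Spelled out with complete template syntax and hypothetical names (the real declarations follow immediately below), the trick amounts to this:

    #include <new>

    // Declared but never defined: naming a member of it is a compile-time error.
    template <typename T> struct error_type_not_in_allowed_set;

    template <bool Allowed, typename T> struct construct_if;

    template <typename T>
    struct construct_if<true, T> {   // allowed type: placement-construct it for real
        static void apply(void* space, const T& x) { ::new (space) T(x); }
    };

    template <typename T>
    struct construct_if<false, T> {  // disallowed type: instantiating apply() names the
        static void apply(void*, const T& x) {  // incomplete type, so compilation fails
            error_type_not_in_allowed_set<T>::report(x);
        }
    };

Because the false branch is only instantiated when someone actually tries to store a type that is not in the tuple, well-formed uses never even see the undefined template.
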
+template struct ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple; + +template struct do_if; +template +struct do_if { + static void construct(void *mySpace, const T& x) { + (void) new(mySpace) Wrapper(x); + } +}; +template +struct do_if { + static void construct(void * /*mySpace*/, const T& x) { + // This method is instantiated when the type T does not match any of the + // element types in the Tuple in variant. + ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple::bad_type(x); + } +}; + +// Tuple tells us the allowed types that variant can hold. It determines the alignment of the space in +// Wrapper, and how big Wrapper is. +// +// the object can only be tested for type, and a read-only reference can be fetched by cast_to(). + +using tbb::detail::punned_cast; +struct tagged_null_type {}; +template +class tagged_msg { + typedef std::tuple= 6 + , T5 + #endif + #if __TBB_VARIADIC_MAX >= 7 + , T6 + #endif + #if __TBB_VARIADIC_MAX >= 8 + , T7 + #endif + #if __TBB_VARIADIC_MAX >= 9 + , T8 + #endif + #if __TBB_VARIADIC_MAX >= 10 + , T9 + #endif + > Tuple; + +private: + class variant { + static const size_t N = std::tuple_size::value; + typedef typename pick_tuple_max::type AlignType; + typedef typename pick_tuple_max::type MaxSizeType; + static const size_t MaxNBytes = (sizeof(Wrapper)+sizeof(AlignType)-1); + static const size_t MaxNElements = MaxNBytes/sizeof(AlignType); + typedef aligned_space SpaceType; + SpaceType my_space; + static const size_t MaxSize = sizeof(SpaceType); + + public: + variant() { (void) new(&my_space) Wrapper(default_constructed()); } + + template + variant( const T& x ) { + do_if::value>::construct(&my_space,x); + } + + variant(const variant& other) { + const WrapperBase * h = punned_cast(&(other.my_space)); + h->CopyTo(&my_space); + } + + // assignment must destroy and re-create the Wrapper type, as there is no way + // to create a Wrapper-to-Wrapper assign even if we find they agree in type. 
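
The operator= that follows does exactly that: run the stored Wrapper's destructor, then let the source rebuild itself in the freed buffer through the virtual CopyTo(). A minimal sketch of the same type-erased copy machinery (illustrative names, a fixed 64-byte buffer instead of the aligned_space sizing above, and the same punned-pointer assumption that the erased object starts at the buffer address):

    #include <cstddef>
    #include <new>

    struct erased_base {
        virtual ~erased_base() = default;
        virtual void copy_to(void* space) const = 0;   // placement-copy *this into space
    };

    template <typename T>
    struct erased : erased_base {
        T value;
        explicit erased(const T& v) : value(v) {}
        void copy_to(void* space) const override { ::new (space) erased<T>(value); }
    };

    // Single-slot, copyable variant in the spirit of the class above: copy
    // construction and assignment both go through the virtual copy_to().
    class tiny_variant {
        alignas(std::max_align_t) unsigned char space_[64];
        // Like punned_cast: relies on the erased<T> object living at the buffer address.
        erased_base* self() { return reinterpret_cast<erased_base*>(space_); }
        const erased_base* self() const { return reinterpret_cast<const erased_base*>(space_); }
    public:
        template <typename T>
        explicit tiny_variant(const T& v) {
            static_assert(sizeof(erased<T>) <= sizeof(space_), "type too large for the buffer");
            ::new (space_) erased<T>(v);
        }
        tiny_variant(const tiny_variant& other) { other.self()->copy_to(space_); }
        tiny_variant& operator=(const tiny_variant& rhs) {
            if (this != &rhs) {
                self()->~erased_base();        // destroy whatever is currently held...
                rhs.self()->copy_to(space_);   // ...then rebuild a copy of rhs in place
            }
            return *this;
        }
        ~tiny_variant() { self()->~erased_base(); }
    };
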
+ void operator=( const variant& rhs ) { + if(&rhs != this) { + WrapperBase *h = punned_cast(&my_space); + h->~WrapperBase(); + const WrapperBase *ch = punned_cast(&(rhs.my_space)); + ch->CopyTo(&my_space); + } + } + + template + const U& variant_cast_to() const { + const Wrapper *h = dynamic_cast*>(punned_cast(&my_space)); + if(!h) { + throw_exception(exception_id::bad_tagged_msg_cast); + } + return h->value(); + } + template + bool variant_is_a() const { return dynamic_cast*>(punned_cast(&my_space)) != nullptr; } + + bool variant_is_default_constructed() const {return variant_is_a();} + + ~variant() { + WrapperBase *h = punned_cast(&my_space); + h->~WrapperBase(); + } + }; //class variant + + TagType my_tag; + variant my_msg; + +public: + tagged_msg(): my_tag(TagType(~0)), my_msg(){} + + template + tagged_msg(T const &index, R const &value) : my_tag(index), my_msg(value) {} + + template + tagged_msg(T const &index, R (&value)[N]) : my_tag(index), my_msg(value) {} + + void set_tag(TagType const &index) {my_tag = index;} + TagType tag() const {return my_tag;} + + template + const V& cast_to() const {return my_msg.template variant_cast_to();} + + template + bool is_a() const {return my_msg.template variant_is_a();} + + bool is_default_constructed() const {return my_msg.variant_is_default_constructed();} +}; //class tagged_msg + +// template to simplify cast and test for tagged_msg in template contexts +template +const V& cast_to(T const &t) { return t.template cast_to(); } + +template +bool is_a(T const &t) { return t.template is_a(); } + +enum op_stat { WAIT = 0, SUCCEEDED, FAILED }; + +#endif /* __TBB__flow_graph_types_impl_H */ diff --git a/third_party/tbb/detail/_hash_compare.h b/third_party/tbb/detail/_hash_compare.h new file mode 100644 index 000000000..2ad1551d0 --- /dev/null +++ b/third_party/tbb/detail/_hash_compare.h @@ -0,0 +1,148 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__hash_compare_H +#define __TBB_detail__hash_compare_H + +#include "third_party/libcxx/functional" + +#include "third_party/tbb/detail/_containers_helpers.h" + +namespace tbb { +namespace detail { +namespace d1 { + +template +class hash_compare { + using is_transparent_hash = has_transparent_key_equal; +public: + using hasher = Hash; + using key_equal = typename is_transparent_hash::type; + + hash_compare() = default; + hash_compare( hasher hash, key_equal equal ) : my_hasher(hash), my_equal(equal) {} + + std::size_t operator()( const Key& key ) const { + return std::size_t(my_hasher(key)); + } + + bool operator()( const Key& key1, const Key& key2 ) const { + return my_equal(key1, key2); + } + + template ::type> + std::size_t operator()( const K& key ) const { + return std::size_t(my_hasher(key)); + } + + template ::type> + bool operator()( const K1& key1, const K2& key2 ) const { + return my_equal(key1, key2); + } + + hasher hash_function() const { + return my_hasher; + } + + key_equal key_eq() const { + return my_equal; + } + + +private: + hasher my_hasher; + key_equal my_equal; +}; // class hash_compare + +//! hash_compare that is default argument for concurrent_hash_map +template +class tbb_hash_compare { +public: + std::size_t hash( const Key& a ) const { return my_hash_func(a); } +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (push) +// MSVC 2015 throws a strange warning: 'std::size_t': forcing value to bool 'true' or 'false' +#pragma warning (disable: 4800) +#endif + bool equal( const Key& a, const Key& b ) const { return my_key_equal(a, b); } +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (pop) +#endif +private: + std::hash my_hash_func; + std::equal_to my_key_equal; +}; + +} // namespace d1 +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +template +concept hash_compare = std::copy_constructible && + requires( const std::remove_reference_t& hc, const Key& key1, const Key& key2 ) { + { hc.hash(key1) } -> std::same_as; + { hc.equal(key1, key2) } -> std::convertible_to; + }; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +} // namespace detail +} // namespace tbb + +#if TBB_DEFINE_STD_HASH_SPECIALIZATIONS + +namespace std { + +template +struct hash> { +public: + std::size_t operator()( const std::pair& p ) const { + return first_hash(p.first) ^ second_hash(p.second); + } + +private: + std::hash first_hash; + std::hash second_hash; +}; // struct hash + +// Apple clang and MSVC defines their own specializations for std::hash> +#if !(_LIBCPP_VERSION) && !(_CPPLIB_VER) + +template +struct hash> { +public: + std::size_t operator()( const std::basic_string& s ) const { + std::size_t h = 0; + for ( const CharT* c = s.c_str(); *c; ++c ) { + h = h * hash_multiplier ^ char_hash(*c); + } + return h; + } + +private: + static constexpr std::size_t hash_multiplier = tbb::detail::select_size_t_constant<2654435769U, 11400714819323198485ULL>::value; + + std::hash char_hash; +}; // struct hash + +#endif // !(_LIBCPP_VERSION || _CPPLIB_VER) + +} // namespace std + +#endif // TBB_DEFINE_STD_HASH_SPECIALIZATIONS + +#endif // __TBB_detail__hash_compare_H diff --git a/third_party/tbb/detail/_intrusive_list_node.h b/third_party/tbb/detail/_intrusive_list_node.h new file mode 100644 index 000000000..d3e1e506b --- /dev/null +++ b/third_party/tbb/detail/_intrusive_list_node.h @@ -0,0 +1,42 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you 
may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_detail__intrusive_list_node_H +#define _TBB_detail__intrusive_list_node_H + +namespace tbb { +namespace detail { +namespace d1 { + +//! Data structure to be inherited by the types that can form intrusive lists. +/** Intrusive list is formed by means of the member_intrusive_list template class. + Note that type T must derive from intrusive_list_node either publicly or + declare instantiation member_intrusive_list as a friend. + This class implements a limited subset of std::list interface. **/ +struct intrusive_list_node { + intrusive_list_node* my_prev_node{}; + intrusive_list_node* my_next_node{}; +#if TBB_USE_ASSERT + intrusive_list_node() { my_prev_node = my_next_node = this; } +#endif /* TBB_USE_ASSERT */ +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_detail__intrusive_list_node_H diff --git a/third_party/tbb/detail/_machine.h b/third_party/tbb/detail/_machine.h new file mode 100644 index 000000000..5e9df02ba --- /dev/null +++ b/third_party/tbb/detail/_machine.h @@ -0,0 +1,397 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__machine_H +#define __TBB_detail__machine_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/climits" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/cstddef" + +#ifdef _WIN32 +// MISSING #include +#ifdef __TBBMALLOC_BUILD +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" // SwitchToThread() +#endif +#ifdef _MSC_VER +#if __TBB_x86_64 || __TBB_x86_32 +#pragma intrinsic(__rdtsc) +#endif +#endif +#endif +#if __TBB_x86_64 || __TBB_x86_32 +#include "third_party/intel/immintrin.internal.h" // _mm_pause +#endif +#if (_WIN32) +#include "libc/math.h" +#include "libc/runtime/fenv.h" // _control87 +#endif + +#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN +#include "libc/calls/calls.h" +#include "libc/calls/struct/cpuset.h" +#include "libc/calls/struct/sched_param.h" +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/sched.h" // sched_yield +#else +#include "third_party/libcxx/thread" // std::this_thread::yield() +#endif + +namespace tbb { +namespace detail { +inline namespace d0 { + +//-------------------------------------------------------------------------------------------------- +// Yield implementation +//-------------------------------------------------------------------------------------------------- + +#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN +static inline void yield() { + int err = sched_yield(); + __TBB_ASSERT_EX(err == 0, "sched_yield has failed"); +} +#elif __TBBMALLOC_BUILD && _WIN32 +// Use Windows API for yield in tbbmalloc to avoid dependency on C++ runtime with some implementations. 
+static inline void yield() { + SwitchToThread(); +} +#else +using std::this_thread::yield; +#endif + +//-------------------------------------------------------------------------------------------------- +// atomic_fence_seq_cst implementation +//-------------------------------------------------------------------------------------------------- + +static inline void atomic_fence_seq_cst() { +#if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11 + unsigned char dummy = 0u; + __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory"); +#else + std::atomic_thread_fence(std::memory_order_seq_cst); +#endif +} + +//-------------------------------------------------------------------------------------------------- +// Pause implementation +//-------------------------------------------------------------------------------------------------- + +static inline void machine_pause(int32_t delay) { +#if __TBB_x86_64 || __TBB_x86_32 + while (delay-- > 0) { _mm_pause(); } +#elif __ARM_ARCH_7A__ || __aarch64__ + while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); } +#else /* Generic */ + (void)delay; // suppress without including _template_helpers.h + yield(); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// tbb::detail::log2() implementation +//////////////////////////////////////////////////////////////////////////////////////////////////// +// TODO: Use log2p1() function that will be available in C++20 standard + +#if defined(__GNUC__) || defined(__clang__) +namespace gnu_builtins { + inline uintptr_t clz(unsigned int x) { return static_cast(__builtin_clz(x)); } + inline uintptr_t clz(unsigned long int x) { return static_cast(__builtin_clzl(x)); } + inline uintptr_t clz(unsigned long long int x) { return static_cast(__builtin_clzll(x)); } +} +#elif defined(_MSC_VER) +#pragma intrinsic(__TBB_W(_BitScanReverse)) +namespace msvc_intrinsics { + static inline uintptr_t bit_scan_reverse(uintptr_t i) { + unsigned long j; + __TBB_W(_BitScanReverse)( &j, i ); + return j; + } +} +#endif + +template +constexpr std::uintptr_t number_of_bits() { + return sizeof(T) * CHAR_BIT; +} + +// logarithm is the index of the most significant non-zero bit +static inline uintptr_t machine_log2(uintptr_t x) { +#if defined(__GNUC__) || defined(__clang__) + // If P is a power of 2 and x() - 1) ^ gnu_builtins::clz(x); +#elif defined(_MSC_VER) + return msvc_intrinsics::bit_scan_reverse(x); +#elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__ + uintptr_t j, i = x; + __asm__("bsr %1,%0" : "=r"(j) : "r"(i)); + return j; +#elif __powerpc__ || __POWERPC__ + #if __TBB_WORDSIZE==8 + __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x)); + return 63 - static_cast(x); + #else + __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x)); + return 31 - static_cast(x); + #endif /*__TBB_WORDSIZE*/ +#elif __sparc + uint64_t count; + // one hot encode + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + x |= (x >> 32); + // count 1's + __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) ); + return count - 1; +#else + intptr_t result = 0; + + if( sizeof(x) > 4 && (uintptr_t tmp = x >> 32) ) { x = tmp; result += 32; } + if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; } + if( uintptr_t tmp = x >> 8 ) { x = tmp; result += 8; } + if( uintptr_t tmp = x >> 4 ) { x = tmp; result += 4; } + if( uintptr_t tmp = x >> 2 ) { x = tmp; result += 2; } + + return (x & 2) ? 
result + 1 : result; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// tbb::detail::reverse_bits() implementation +//////////////////////////////////////////////////////////////////////////////////////////////////// +#if TBB_USE_CLANG_BITREVERSE_BUILTINS +namespace llvm_builtins { + inline uint8_t builtin_bitreverse(uint8_t x) { return __builtin_bitreverse8 (x); } + inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); } + inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); } + inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); } +} +#else // generic +template +struct reverse { + static const T byte_table[256]; +}; + +template +const T reverse::byte_table[256] = { + 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, + 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, + 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, + 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, + 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, + 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, + 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, + 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, + 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, + 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, + 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, + 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, + 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, + 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, + 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, + 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF +}; + +inline unsigned char reverse_byte(unsigned char src) { + return reverse::byte_table[src]; +} +#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS + +template +T machine_reverse_bits(T src) { +#if TBB_USE_CLANG_BITREVERSE_BUILTINS + return builtin_bitreverse(fixed_width_cast(src)); +#else /* Generic */ + T dst; + unsigned char *original = reinterpret_cast(&src); + unsigned char *reversed = reinterpret_cast(&dst); + + for ( int i = sizeof(T) - 1; i >= 0; i-- ) { + reversed[i] = reverse_byte( original[sizeof(T) - i - 1] ); + } + + return dst; +#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS +} + +} // inline namespace d0 + +namespace d1 { + +#if (_WIN32) +// API to retrieve/update FPU control setting +#define __TBB_CPU_CTL_ENV_PRESENT 1 +struct cpu_ctl_env { + unsigned int x87cw{}; +#if (__TBB_x86_64) + // Changing the infinity mode or the floating-point precision is not supported on x64. + // The attempt causes an assertion. 
See + // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2 + static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC; +#else + static constexpr unsigned int X87CW_CONTROL_MASK = ~0U; +#endif +#if (__TBB_x86_32 || __TBB_x86_64) + unsigned int mxcsr{}; + static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */ +#endif + + bool operator!=( const cpu_ctl_env& ctl ) const { + return +#if (__TBB_x86_32 || __TBB_x86_64) + mxcsr != ctl.mxcsr || +#endif + x87cw != ctl.x87cw; + } + void get_env() { + x87cw = _control87(0, 0); +#if (__TBB_x86_32 || __TBB_x86_64) + mxcsr = _mm_getcsr(); +#endif + } + void set_env() const { + _control87(x87cw, X87CW_CONTROL_MASK); +#if (__TBB_x86_32 || __TBB_x86_64) + _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK); +#endif + } +}; +#elif (__TBB_x86_32 || __TBB_x86_64) +// API to retrieve/update FPU control setting +#define __TBB_CPU_CTL_ENV_PRESENT 1 +struct cpu_ctl_env { + int mxcsr{}; + short x87cw{}; + static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */ + + bool operator!=(const cpu_ctl_env& ctl) const { + return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw; + } + void get_env() { + __asm__ __volatile__( + "stmxcsr %0\n\t" + "fstcw %1" + : "=m"(mxcsr), "=m"(x87cw) + ); + mxcsr &= MXCSR_CONTROL_MASK; + } + void set_env() const { + __asm__ __volatile__( + "ldmxcsr %0\n\t" + "fldcw %1" + : : "m"(mxcsr), "m"(x87cw) + ); + } +}; +#endif + +} // namespace d1 + +} // namespace detail +} // namespace tbb + +#if !__TBB_CPU_CTL_ENV_PRESENT +#include "libc/runtime/fenv.h" + +#include "third_party/libcxx/cstring" + +namespace tbb { +namespace detail { + +namespace r1 { +void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size); +void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p); +} // namespace r1 + +namespace d1 { + +class cpu_ctl_env { + fenv_t *my_fenv_ptr; +public: + cpu_ctl_env() : my_fenv_ptr(nullptr) {} + ~cpu_ctl_env() { + if ( my_fenv_ptr ) + r1::cache_aligned_deallocate( (void*)my_fenv_ptr ); + } + // It is possible not to copy memory but just to copy pointers but the following issues should be addressed: + // 1. The arena lifetime and the context lifetime are independent; + // 2. The user is allowed to recapture different FPU settings to context so 'current FPU settings' inside + // dispatch loop may become invalid. + // But do we really want to improve the fenv implementation? It seems to be better to replace the fenv implementation + // with a platform specific implementation. + cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(nullptr) { + *this = src; + } + cpu_ctl_env& operator=( const cpu_ctl_env &src ) { + __TBB_ASSERT( src.my_fenv_ptr, nullptr); + if ( !my_fenv_ptr ) + my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t)); + *my_fenv_ptr = *src.my_fenv_ptr; + return *this; + } + bool operator!=( const cpu_ctl_env &ctl ) const { + __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." ); + __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." ); + return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) ); + } + void get_env () { + if ( !my_fenv_ptr ) + my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t)); + fegetenv( my_fenv_ptr ); + } + const cpu_ctl_env& set_env () const { + __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." 
); + fesetenv( my_fenv_ptr ); + return *this; + } +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* !__TBB_CPU_CTL_ENV_PRESENT */ + +#endif // __TBB_detail__machine_H diff --git a/third_party/tbb/detail/_mutex_common.h b/third_party/tbb/detail/_mutex_common.h new file mode 100644 index 000000000..3392df0f5 --- /dev/null +++ b/third_party/tbb/detail/_mutex_common.h @@ -0,0 +1,62 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__mutex_common_H +#define __TBB_detail__mutex_common_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" + +#if __TBB_CPP20_CONCEPTS_PRESENT +// MISSING #include + +namespace tbb { +namespace detail { +inline namespace d0 { + +template +concept mutex_scoped_lock = std::default_initializable && + std::constructible_from && + requires( Lock& lock, Mutex& mutex ) { + lock.acquire(mutex); + { lock.try_acquire(mutex) } -> adaptive_same_as; + lock.release(); + }; + +template +concept rw_mutex_scoped_lock = mutex_scoped_lock && + std::constructible_from && + requires( Lock& lock, Mutex& mutex ) { + lock.acquire(mutex, false); + { lock.try_acquire(mutex, false) } -> adaptive_same_as; + { lock.upgrade_to_writer() } -> adaptive_same_as; + { lock.downgrade_to_reader() } -> adaptive_same_as; + }; + +template +concept scoped_lockable = mutex_scoped_lock; + +template +concept rw_scoped_lockable = scoped_lockable && + rw_mutex_scoped_lock; + +} // namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_CPP20_CONCEPTS_PRESENT +#endif // __TBB_detail__mutex_common_H diff --git a/third_party/tbb/detail/_namespace_injection.h b/third_party/tbb/detail/_namespace_injection.h new file mode 100644 index 000000000..6b61e4f0d --- /dev/null +++ b/third_party/tbb/detail/_namespace_injection.h @@ -0,0 +1,25 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// All public entities of the OneAPI Spec are available under oneapi namespace + +// Define tbb namespace first as it might not be known yet +namespace tbb {} + +namespace oneapi { +namespace tbb = ::tbb; +} diff --git a/third_party/tbb/detail/_node_handle.h b/third_party/tbb/detail/_node_handle.h new file mode 100644 index 000000000..29bec49af --- /dev/null +++ b/third_party/tbb/detail/_node_handle.h @@ -0,0 +1,163 @@ +// clang-format off +/* + Copyright (c) 2019-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__node_handle_H +#define __TBB_detail__node_handle_H + +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_assert.h" + +namespace tbb { +namespace detail { +namespace d1 { + +// A structure to access private node handle methods in internal TBB classes +// Regular friend declaration is not convenient because classes which use node handle +// can be placed in the different versioning namespaces. +struct node_handle_accessor { + template + static typename NodeHandleType::node* get_node_ptr( NodeHandleType& nh ) { + return nh.get_node_ptr(); + } + + template + static NodeHandleType construct( typename NodeHandleType::node* node_ptr ) { + return NodeHandleType{node_ptr}; + } + + template + static void deactivate( NodeHandleType& nh ) { + nh.deactivate(); + } +}; // struct node_handle_accessor + +template +class node_handle_base { +public: + using allocator_type = Allocator; +protected: + using node = Node; + using allocator_traits_type = tbb::detail::allocator_traits; +public: + + node_handle_base() : my_node(nullptr), my_allocator() {} + node_handle_base(node_handle_base&& nh) : my_node(nh.my_node), + my_allocator(std::move(nh.my_allocator)) { + nh.my_node = nullptr; + } + + __TBB_nodiscard bool empty() const { return my_node == nullptr; } + explicit operator bool() const { return my_node != nullptr; } + + ~node_handle_base() { internal_destroy(); } + + node_handle_base& operator=( node_handle_base&& nh ) { + internal_destroy(); + my_node = nh.my_node; + move_assign_allocators(my_allocator, nh.my_allocator); + nh.deactivate(); + return *this; + } + + void swap( node_handle_base& nh ) { + using std::swap; + swap(my_node, nh.my_node); + swap_allocators(my_allocator, nh.my_allocator); + } + + allocator_type get_allocator() const { + return my_allocator; + } + +protected: + node_handle_base( node* n ) : my_node(n) {} + + void internal_destroy() { + if(my_node != nullptr) { + allocator_traits_type::destroy(my_allocator, my_node->storage()); + typename allocator_traits_type::template rebind_alloc node_allocator(my_allocator); + node_allocator.deallocate(my_node, 1); + } + } + + node* get_node_ptr() { return my_node; } + + void deactivate() { my_node = nullptr; } + + node* my_node; + allocator_type my_allocator; +}; + +// node handle for maps +template +class node_handle : public node_handle_base { + using base_type = node_handle_base; +public: + using key_type = Key; + using mapped_type = typename 
Value::second_type; + using allocator_type = typename base_type::allocator_type; + + node_handle() = default; + + key_type& key() const { + __TBB_ASSERT(!this->empty(), "Cannot get key from the empty node_type object"); + return *const_cast(&(this->my_node->value().first)); + } + + mapped_type& mapped() const { + __TBB_ASSERT(!this->empty(), "Cannot get mapped value from the empty node_type object"); + return this->my_node->value().second; + } + +private: + friend struct node_handle_accessor; + + node_handle( typename base_type::node* n ) : base_type(n) {} +}; // class node_handle + +// node handle for sets +template +class node_handle : public node_handle_base { + using base_type = node_handle_base; +public: + using value_type = Key; + using allocator_type = typename base_type::allocator_type; + + node_handle() = default; + + value_type& value() const { + __TBB_ASSERT(!this->empty(), "Cannot get value from the empty node_type object"); + return *const_cast(&(this->my_node->value())); + } + +private: + friend struct node_handle_accessor; + + node_handle( typename base_type::node* n ) : base_type(n) {} +}; // class node_handle + +template +void swap( node_handle& lhs, + node_handle& rhs ) { + return lhs.swap(rhs); +} + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__node_handle_H diff --git a/third_party/tbb/detail/_pipeline_filters.h b/third_party/tbb/detail/_pipeline_filters.h new file mode 100644 index 000000000..48f453dbb --- /dev/null +++ b/third_party/tbb/detail/_pipeline_filters.h @@ -0,0 +1,456 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_filters_H +#define __TBB_parallel_filters_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_pipeline_filters_deduction.h" +#include "third_party/tbb/tbb_allocator.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" + +namespace tbb { +namespace detail { + +namespace d1 { +class base_filter; +} + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&); +class pipeline; +class stage_task; +class input_buffer; +} + +namespace d1 { +class filter_node; + +//! A stage in a pipeline. +/** @ingroup algorithms */ +class base_filter{ +private: + //! Value used to mark "not in pipeline" + static base_filter* not_in_pipeline() { return reinterpret_cast(std::intptr_t(-1)); } +public: + //! The lowest bit 0 is for parallel vs serial + static constexpr unsigned int filter_is_serial = 0x1; + + //! 2nd bit distinguishes ordered vs unordered filters. + static constexpr unsigned int filter_is_out_of_order = 0x1<<1; + + //! 
3rd bit marks input filters emitting small objects + static constexpr unsigned int filter_may_emit_null = 0x1<<2; + + base_filter(const base_filter&) = delete; + base_filter& operator=(const base_filter&) = delete; + +protected: + explicit base_filter( unsigned int m ) : + next_filter_in_pipeline(not_in_pipeline()), + my_input_buffer(nullptr), + my_filter_mode(m), + my_pipeline(nullptr) + {} + + // signal end-of-input for concrete_filters + void set_end_of_input() { + r1::set_end_of_input(*this); + } + +public: + //! True if filter is serial. + bool is_serial() const { + return bool( my_filter_mode & filter_is_serial ); + } + + //! True if filter must receive stream in order. + bool is_ordered() const { + return (my_filter_mode & filter_is_serial) && !(my_filter_mode & filter_is_out_of_order); + } + + //! true if an input filter can emit null + bool object_may_be_null() { + return ( my_filter_mode & filter_may_emit_null ) == filter_may_emit_null; + } + + //! Operate on an item from the input stream, and return item for output stream. + /** Returns nullptr if filter is a sink. */ + virtual void* operator()( void* item ) = 0; + + //! Destroy filter. + virtual ~base_filter() {}; + + //! Destroys item if pipeline was cancelled. + /** Required to prevent memory leaks. + Note it can be called concurrently even for serial filters.*/ + virtual void finalize( void* /*item*/ ) {} + +private: + //! Pointer to next filter in the pipeline. + base_filter* next_filter_in_pipeline; + + //! Buffer for incoming tokens, or nullptr if not required. + /** The buffer is required if the filter is serial. */ + r1::input_buffer* my_input_buffer; + + friend class r1::stage_task; + friend class r1::pipeline; + friend void r1::set_end_of_input(d1::base_filter&); + + //! Storage for filter mode and dynamically checked implementation version. + const unsigned int my_filter_mode; + + //! Pointer to the pipeline. + r1::pipeline* my_pipeline; +}; + +template +class concrete_filter; + +//! input_filter control to signal end-of-input for parallel_pipeline +class flow_control { + bool is_pipeline_stopped = false; + flow_control() = default; + template friend class concrete_filter; + template + __TBB_requires(std::copyable) + friend class input_node; +public: + void stop() { is_pipeline_stopped = true; } +}; + +// Emulate std::is_trivially_copyable (false positives not allowed, false negatives suboptimal but safe). 
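+// (Annotation added in this port, not in the upstream sources.) Together with
+// use_allocator<T> below, this trait decides how a token travels between filters:
+// a type that fits in a void* and is trivially copyable (e.g. int, any pointer) is
+// bit-copied through the pipeline's void* slot, while anything larger or non-trivial
+// (e.g. std::string) is heap-allocated via r1::allocate_memory and passed by pointer;
+// see the token_helper specializations below.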
+#if __TBB_CPP11_TYPE_PROPERTIES_PRESENT +template using tbb_trivially_copyable = std::is_trivially_copyable; +#else +template struct tbb_trivially_copyable { enum { value = false }; }; +template struct tbb_trivially_copyable < T* > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < bool > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < char > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < signed char > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < short > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < int > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < long > { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < long long> { enum { value = true }; }; +template<> struct tbb_trivially_copyable { enum { value = true }; }; +template<> struct tbb_trivially_copyable < float > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < double > { enum { value = true }; }; +template<> struct tbb_trivially_copyable < long double > { enum { value = true }; }; +#endif // __TBB_CPP11_TYPE_PROPERTIES_PRESENT + +template +struct use_allocator { + static constexpr bool value = sizeof(T) > sizeof(void *) || !tbb_trivially_copyable::value; +}; + +// A helper class to customize how a type is passed between filters. +// Usage: token_helper::value> +template struct token_helper; + +// using tbb_allocator +template +struct token_helper { + using pointer = T*; + using value_type = T; + static pointer create_token(value_type && source) { + return new (r1::allocate_memory(sizeof(T))) T(std::move(source)); + } + static value_type & token(pointer & t) { return *t; } + static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast(ref); } + static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast(ref); } + static void destroy_token(pointer token) { + token->~value_type(); + r1::deallocate_memory(token); + } +}; + +// pointer specialization +template +struct token_helper { + using pointer = T*; + using value_type = T*; + static pointer create_token(const value_type & source) { return source; } + static value_type & token(pointer & t) { return t; } + static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast(ref); } + static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast(ref); } + static void destroy_token( pointer /*token*/) {} +}; + +// converting type to and from void*, passing objects directly +template +struct token_helper { + typedef union { + T actual_value; + void * void_overlay; + } type_to_void_ptr_map; + using pointer = T; // not really a pointer in this case. 
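+    // (Annotation added in this port.) The token is overlaid onto a void* through the
+    // union above, so small trivially-copyable values travel by value with no allocation.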
+ using value_type = T; + static pointer create_token(const value_type & source) { return source; } + static value_type & token(pointer & t) { return t; } + static void * cast_to_void_ptr(pointer ref) { + type_to_void_ptr_map mymap; + mymap.void_overlay = nullptr; + mymap.actual_value = ref; + return mymap.void_overlay; + } + static pointer cast_from_void_ptr(void * ref) { + type_to_void_ptr_map mymap; + mymap.void_overlay = ref; + return mymap.actual_value; + } + static void destroy_token( pointer /*token*/) {} +}; + +// intermediate +template +class concrete_filter: public base_filter { + const Body& my_body; + using input_helper = token_helper::value>; + using input_pointer = typename input_helper::pointer; + using output_helper = token_helper::value>; + using output_pointer = typename output_helper::pointer; + + void* operator()(void* input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + output_pointer temp_output = output_helper::create_token(tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input)))); + input_helper::destroy_token(temp_input); + return output_helper::cast_to_void_ptr(temp_output); + } + + void finalize(void * input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + input_helper::destroy_token(temp_input); + } + +public: + concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} +}; + +// input +template +class concrete_filter: public base_filter { + const Body& my_body; + using output_helper = token_helper::value>; + using output_pointer = typename output_helper::pointer; + + void* operator()(void*) override { + flow_control control; + output_pointer temp_output = output_helper::create_token(my_body(control)); + if(control.is_pipeline_stopped) { + output_helper::destroy_token(temp_output); + set_end_of_input(); + return nullptr; + } + return output_helper::cast_to_void_ptr(temp_output); + } + +public: + concrete_filter(unsigned int m, const Body& body) : + base_filter(m | filter_may_emit_null), + my_body(body) + {} +}; + +// output +template +class concrete_filter: public base_filter { + const Body& my_body; + using input_helper = token_helper::value>; + using input_pointer = typename input_helper::pointer; + + void* operator()(void* input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input))); + input_helper::destroy_token(temp_input); + return nullptr; + } + void finalize(void* input) override { + input_pointer temp_input = input_helper::cast_from_void_ptr(input); + input_helper::destroy_token(temp_input); + } + +public: + concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} +}; + +template +class concrete_filter: public base_filter { + const Body& my_body; + + void* operator()(void*) override { + flow_control control; + my_body(control); + void* output = control.is_pipeline_stopped ? 
nullptr : (void*)(std::intptr_t)-1; + return output; + } +public: + concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} +}; + +class filter_node_ptr { + filter_node * my_node; + +public: + filter_node_ptr() : my_node(nullptr) {} + filter_node_ptr(filter_node *); + ~filter_node_ptr(); + filter_node_ptr(const filter_node_ptr &); + filter_node_ptr(filter_node_ptr &&); + void operator=(filter_node *); + void operator=(const filter_node_ptr &); + void operator=(filter_node_ptr &&); + filter_node& operator*() const; + operator bool() const; +}; + +//! Abstract base class that represents a node in a parse tree underlying a filter class. +/** These nodes are always heap-allocated and can be shared by filter objects. */ +class filter_node { + /** Count must be atomic because it is hidden state for user, but might be shared by threads. */ + std::atomic ref_count; +public: + filter_node_ptr left; + filter_node_ptr right; +protected: + filter_node() : ref_count(0), left(nullptr), right(nullptr) { +#ifdef __TBB_TEST_FILTER_NODE_COUNT + ++(__TBB_TEST_FILTER_NODE_COUNT); +#endif + } +public: + filter_node(const filter_node_ptr& x, const filter_node_ptr& y) : filter_node(){ + left = x; + right = y; + } + filter_node(const filter_node&) = delete; + filter_node& operator=(const filter_node&) = delete; + + //! Add concrete_filter to pipeline + virtual base_filter* create_filter() const { + __TBB_ASSERT(false, "method of non-leaf was called"); + return nullptr; + } + + //! Increment reference count + void add_ref() { ref_count.fetch_add(1, std::memory_order_relaxed); } + + //! Decrement reference count and delete if it becomes zero. + void remove_ref() { + __TBB_ASSERT(ref_count>0,"ref_count underflow"); + if( ref_count.fetch_sub(1, std::memory_order_relaxed) == 1 ) { + this->~filter_node(); + r1::deallocate_memory(this); + } + } + + virtual ~filter_node() { +#ifdef __TBB_TEST_FILTER_NODE_COUNT + --(__TBB_TEST_FILTER_NODE_COUNT); +#endif + } +}; + +inline filter_node_ptr::filter_node_ptr(filter_node * nd) : my_node(nd) { + if (my_node) { + my_node->add_ref(); + } +} + +inline filter_node_ptr::~filter_node_ptr() { + if (my_node) { + my_node->remove_ref(); + } +} + +inline filter_node_ptr::filter_node_ptr(const filter_node_ptr & rhs) : my_node(rhs.my_node) { + if (my_node) { + my_node->add_ref(); + } +} + +inline filter_node_ptr::filter_node_ptr(filter_node_ptr && rhs) : my_node(rhs.my_node) { + rhs.my_node = nullptr; +} + +inline void filter_node_ptr::operator=(filter_node * rhs) { + // Order of operations below carefully chosen so that reference counts remain correct + // in unlikely event that remove_ref throws exception. + filter_node* old = my_node; + my_node = rhs; + if (my_node) { + my_node->add_ref(); + } + if (old) { + old->remove_ref(); + } +} + +inline void filter_node_ptr::operator=(const filter_node_ptr & rhs) { + *this = rhs.my_node; +} + +inline void filter_node_ptr::operator=(filter_node_ptr && rhs) { + filter_node* old = my_node; + my_node = rhs.my_node; + rhs.my_node = nullptr; + if (old) { + old->remove_ref(); + } +} + +inline filter_node& filter_node_ptr::operator*() const{ + __TBB_ASSERT(my_node,"nullptr node is used"); + return *my_node; +} + +inline filter_node_ptr::operator bool() const { + return my_node != nullptr; +} + +//! Node in parse tree representing result of make_filter. 
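+//! (Annotation added in this port, not in the upstream sources.) Leaves are what
+//! make_filter produces; operator& on filters builds the inner filter_node objects.
+//! A minimal usage sketch, assuming user-provided produce()/consume() helpers:
+//!
+//!   tbb::parallel_pipeline(/*max_number_of_live_tokens=*/8,
+//!       tbb::make_filter<void, int>(tbb::filter_mode::serial_in_order,
+//!           [](tbb::flow_control& fc) -> int {
+//!               int v;
+//!               if (!produce(v)) { fc.stop(); return 0; }
+//!               return v;
+//!           })
+//!     & tbb::make_filter<int, void>(tbb::filter_mode::parallel,
+//!           [](int v) { consume(v); }));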
+template +class filter_node_leaf: public filter_node { + const unsigned int my_mode; + const Body my_body; + base_filter* create_filter() const override { + return new(r1::allocate_memory(sizeof(concrete_filter))) concrete_filter(my_mode,my_body); + } +public: + filter_node_leaf( unsigned int m, const Body& b ) : my_mode(m), my_body(b) {} +}; + + +template ::input_type> +using filter_input = typename std::conditional::value, void, Input>::type; + +template +using filter_output = typename filter_body_types::output_type; + +} // namespace d1 +} // namespace detail +} // namespace tbb + + +#endif /* __TBB_parallel_filters_H */ diff --git a/third_party/tbb/detail/_pipeline_filters_deduction.h b/third_party/tbb/detail/_pipeline_filters_deduction.h new file mode 100644 index 000000000..ad183f4cb --- /dev/null +++ b/third_party/tbb/detail/_pipeline_filters_deduction.h @@ -0,0 +1,47 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__pipeline_filters_deduction_H +#define __TBB__pipeline_filters_deduction_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace d1 { + +template +struct declare_filter_types { + using input_type = typename std::remove_const::type>::type; + using output_type = typename std::remove_const::type>::type; +}; + +template struct filter_body_types; + +template +struct filter_body_types : declare_filter_types {}; + +template +struct filter_body_types : declare_filter_types {}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB__pipeline_filters_deduction_H diff --git a/third_party/tbb/detail/_range_common.h b/third_party/tbb/detail/_range_common.h new file mode 100644 index 000000000..15f4d2bea --- /dev/null +++ b/third_party/tbb/detail/_range_common.h @@ -0,0 +1,131 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__range_common_H +#define __TBB_detail__range_common_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +#if __TBB_CPP20_CONCEPTS_PRESENT +// MISSING #include +#endif +#include "third_party/libcxx/iterator" + +namespace tbb { +namespace detail { +inline namespace d0 { + +//! Dummy type that distinguishes splitting constructor from copy constructor. 
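+//! (Annotation added in this port, not in the upstream sources.) A user-defined Range
+//! opts into splitting by providing a constructor of the form
+//!   Range( Range& other, split )
+//! which takes roughly half of `other` for the newly constructed object;
+//! blocked_range is the canonical model of this pattern.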
+/** + * See description of parallel_for and parallel_reduce for example usages. + * @ingroup algorithms + */ +class split {}; + +//! Type enables transmission of splitting proportion from partitioners to range objects +/** + * In order to make use of such facility Range objects must implement + * splitting constructor with this type passed. + */ +class proportional_split : no_assign { +public: + proportional_split(size_t _left = 1, size_t _right = 1) : my_left(_left), my_right(_right) { } + + size_t left() const { return my_left; } + size_t right() const { return my_right; } + + // used when range does not support proportional split + explicit operator split() const { return split(); } + +private: + size_t my_left, my_right; +}; + +template +struct range_split_object_provider { + template + static split get( PartitionerSplitType& ) { return split(); } +}; + +template +struct range_split_object_provider::value>::type> { + template + static PartitionerSplitType& get( PartitionerSplitType& split_obj ) { return split_obj; } +}; + +template +auto get_range_split_object( PartitionerSplitType& split_obj ) +-> decltype(range_split_object_provider::get(split_obj)) { + return range_split_object_provider::get(split_obj); +} + +template +using range_iterator_type = decltype(std::begin(std::declval())); + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +using iterator_reference_type = typename std::iterator_traits::reference; + +template +using range_reference_type = iterator_reference_type>; + +template +concept blocked_range_value = std::copyable && + requires( const std::remove_reference_t& lhs, const std::remove_reference_t& rhs ) { + { lhs < rhs } -> relaxed_convertible_to; + { lhs - rhs } -> std::convertible_to; + { lhs + (rhs - lhs) } -> std::convertible_to; + }; + +template +concept splittable = std::constructible_from; + +template +concept tbb_range = std::copy_constructible && + splittable && + requires( const std::remove_reference_t& range ) { + { range.empty() } -> relaxed_convertible_to; + { range.is_divisible() } -> relaxed_convertible_to; + }; + +template +constexpr bool iterator_concept_helper( std::input_iterator_tag ) { + return std::input_iterator; +} + +template +constexpr bool iterator_concept_helper( std::random_access_iterator_tag ) { + return std::random_access_iterator; +} + +template +concept iterator_satisfies = requires (IteratorTag tag) { + requires iterator_concept_helper(tag); +}; + +template +concept container_based_sequence = requires( Sequence& seq ) { + { std::begin(seq) } -> iterator_satisfies; + { std::end(seq) } -> iterator_satisfies; +}; +#endif // __TBB_CPP20_CONCEPTS_PRESENT +} // namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__range_common_H diff --git a/third_party/tbb/detail/_rtm_mutex.h b/third_party/tbb/detail/_rtm_mutex.h new file mode 100644 index 000000000..0633bb6f6 --- /dev/null +++ b/third_party/tbb/detail/_rtm_mutex.h @@ -0,0 +1,163 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__rtm_mutex_impl_H +#define __TBB__rtm_mutex_impl_H + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/spin_mutex.h" + +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +namespace r1 { +struct rtm_mutex_impl; +} +namespace d1 { + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning (push) + #pragma warning (disable: 4324) +#endif + +/** A rtm_mutex is an speculation-enabled spin mutex. + It should be used for locking short critical sections where the lock is + contended but the data it protects are not. If zero-initialized, the + mutex is considered unheld. + @ingroup synchronization */ +class alignas(max_nfs_size) rtm_mutex : private spin_mutex { +private: + enum class rtm_state { + rtm_none, + rtm_transacting, + rtm_real + }; +public: + //! Constructors + rtm_mutex() noexcept { + create_itt_sync(this, "tbb::speculative_spin_mutex", ""); + } + + //! Destructor + ~rtm_mutex() = default; + + //! Represents acquisition of a mutex. + class scoped_lock { + public: + friend class rtm_mutex; + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) {} + + //! Acquire lock on given mutex. + scoped_lock(rtm_mutex& m) : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) { + acquire(m); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if(m_transaction_state != rtm_state::rtm_none) { + release(); + } + } + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + //! Acquire lock on given mutex. + void acquire(rtm_mutex& m); + + //! Try acquire lock on given mutex. + bool try_acquire(rtm_mutex& m); + + //! Release lock + void release(); + + private: + rtm_mutex* m_mutex; + rtm_state m_transaction_state; + friend r1::rtm_mutex_impl; + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; +private: + friend r1::rtm_mutex_impl; +}; // end of rtm_mutex +} // namespace d1 + +namespace r1 { + //! Internal acquire lock. + // only_speculate == true if we're doing a try_lock, else false. + TBB_EXPORT void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&, bool only_speculate = false); + //! Internal try_acquire lock. + TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&); + //! Internal release lock. + TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock&); +} // namespace r1 + +namespace d1 { +//! Acquire lock on given mutex. +inline void rtm_mutex::scoped_lock::acquire(rtm_mutex& m) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + r1::acquire(m, *this); +} + +//! Try acquire lock on given mutex. +inline bool rtm_mutex::scoped_lock::try_acquire(rtm_mutex& m) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + return r1::try_acquire(m, *this); +} + +//! 
Release lock +inline void rtm_mutex::scoped_lock::release() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + __TBB_ASSERT(m_transaction_state != rtm_state::rtm_none, "lock is not acquired"); + return r1::release(*this); +} + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) // 4324 warning +#endif + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(rtm_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(rtm_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif // WIN +#else +inline void set_name(rtm_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(rtm_mutex&, const wchar_t*) {} +#endif // WIN +#endif + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB__rtm_mutex_impl_H */ diff --git a/third_party/tbb/detail/_rtm_rw_mutex.h b/third_party/tbb/detail/_rtm_rw_mutex.h new file mode 100644 index 000000000..2f2d53e49 --- /dev/null +++ b/third_party/tbb/detail/_rtm_rw_mutex.h @@ -0,0 +1,216 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__rtm_rw_mutex_H +#define __TBB_detail__rtm_rw_mutex_H + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/spin_rw_mutex.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { + +namespace r1 { +struct rtm_rw_mutex_impl; +} + +namespace d1 { + +constexpr std::size_t speculation_granularity = 64; +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning (push) + #pragma warning (disable: 4324) +#endif + +//! Fast, unfair, spinning speculation-enabled reader-writer lock with backoff and writer-preference +/** @ingroup synchronization */ +class alignas(max_nfs_size) rtm_rw_mutex : private spin_rw_mutex { + friend struct r1::rtm_rw_mutex_impl; +private: + enum class rtm_type { + rtm_not_in_mutex, + rtm_transacting_reader, + rtm_transacting_writer, + rtm_real_reader, + rtm_real_writer + }; +public: + //! Constructors + rtm_rw_mutex() noexcept : write_flag(false) { + create_itt_sync(this, "tbb::speculative_spin_rw_mutex", ""); + } + + //! Destructor + ~rtm_rw_mutex() = default; + + //! Represents acquisition of a mutex. + class scoped_lock { + friend struct r1::rtm_rw_mutex_impl; + public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) {} + + //! Acquire lock on given mutex. + scoped_lock(rtm_rw_mutex& m, bool write = true) : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) { + acquire(m, write); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if(m_transaction_state != rtm_type::rtm_not_in_mutex) { + release(); + } + } + + //! 
No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + //! Acquire lock on given mutex. + inline void acquire(rtm_rw_mutex& m, bool write = true); + + //! Try acquire lock on given mutex. + inline bool try_acquire(rtm_rw_mutex& m, bool write = true); + + //! Release lock + inline void release(); + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + inline bool upgrade_to_writer(); + + //! Downgrade writer to become a reader. + inline bool downgrade_to_reader(); + + inline bool is_writer() const; + private: + rtm_rw_mutex* m_mutex; + rtm_type m_transaction_state; + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + +private: + alignas(speculation_granularity) std::atomic write_flag; +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) // 4324 warning +#endif + +} // namespace d1 + +namespace r1 { + //! Internal acquire write lock. + // only_speculate == true if we're doing a try_lock, else false. + TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false); + //! Internal acquire read lock. + // only_speculate == true if we're doing a try_lock, else false. + TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false); + //! Internal upgrade reader to become a writer. + TBB_EXPORT bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock&); + //! Internal downgrade writer to become a reader. + TBB_EXPORT bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock&); + //! Internal try_acquire write lock. + TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&); + //! Internal try_acquire read lock. + TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&); + //! Internal release lock. + TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock&); +} + +namespace d1 { +//! Acquire lock on given mutex. +void rtm_rw_mutex::scoped_lock::acquire(rtm_rw_mutex& m, bool write) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + if (write) { + r1::acquire_writer(m, *this); + } else { + r1::acquire_reader(m, *this); + } +} + +//! Try acquire lock on given mutex. +bool rtm_rw_mutex::scoped_lock::try_acquire(rtm_rw_mutex& m, bool write) { + __TBB_ASSERT(!m_mutex, "lock is already acquired"); + if (write) { + return r1::try_acquire_writer(m, *this); + } else { + return r1::try_acquire_reader(m, *this); + } +} + +//! Release lock +void rtm_rw_mutex::scoped_lock::release() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + __TBB_ASSERT(m_transaction_state != rtm_type::rtm_not_in_mutex, "lock is not acquired"); + return r1::release(*this); +} + +//! Upgrade reader to become a writer. +/** Returns whether the upgrade happened without releasing and re-acquiring the lock */ +bool rtm_rw_mutex::scoped_lock::upgrade_to_writer() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + if (m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer) { + return true; // Already a writer + } + return r1::upgrade(*this); +} + +//! Downgrade writer to become a reader. 
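+/** (Annotation added in this port.) Returns true immediately when the lock is already
+    held as a reader; otherwise the request is forwarded to the r1::downgrade entry point. */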
+bool rtm_rw_mutex::scoped_lock::downgrade_to_reader() { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + if (m_transaction_state == rtm_type::rtm_transacting_reader || m_transaction_state == rtm_type::rtm_real_reader) { + return true; // Already a reader + } + return r1::downgrade(*this); +} + +bool rtm_rw_mutex::scoped_lock::is_writer() const { + __TBB_ASSERT(m_mutex, "lock is not acquired"); + return m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer; +} + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(rtm_rw_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(rtm_rw_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif // WIN +#else +inline void set_name(rtm_rw_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(rtm_rw_mutex&, const wchar_t*) {} +#endif // WIN +#endif + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__rtm_rw_mutex_H diff --git a/third_party/tbb/detail/_scoped_lock.h b/third_party/tbb/detail/_scoped_lock.h new file mode 100644 index 000000000..640d15d10 --- /dev/null +++ b/third_party/tbb/detail/_scoped_lock.h @@ -0,0 +1,175 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail_scoped_lock_H +#define __TBB_detail_scoped_lock_H + +namespace tbb { +namespace detail { +namespace d1 { + +// unique_scoped_lock supposes that Mutex operations never throw +template +class unique_scoped_lock { + //! Points to currently held Mutex, or nullptr if no lock is held. + Mutex* m_mutex{}; + +public: + //! Construct without acquiring a Mutex. + constexpr unique_scoped_lock() noexcept : m_mutex(nullptr) {} + + //! Construct and acquire lock on a Mutex. + unique_scoped_lock(Mutex& m) { + acquire(m); + } + + //! No Copy + unique_scoped_lock(const unique_scoped_lock&) = delete; + unique_scoped_lock& operator=(const unique_scoped_lock&) = delete; + + //! Acquire lock. + void acquire(Mutex& m) { + __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); + m_mutex = &m; + m.lock(); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_acquire(Mutex& m) { + __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); + bool succeed = m.try_lock(); + if (succeed) { + m_mutex = &m; + } + return succeed; + } + + //! Release lock + void release() { + __TBB_ASSERT(m_mutex, "release on Mutex::unique_scoped_lock that is not holding a lock"); + m_mutex->unlock(); + m_mutex = nullptr; + } + + //! Destroy lock. If holding a lock, releases the lock first. + ~unique_scoped_lock() { + if (m_mutex) { + release(); + } + } +}; + +// rw_scoped_lock supposes that Mutex operations never throw +template +class rw_scoped_lock { +public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. 
*/ + constexpr rw_scoped_lock() noexcept {} + + //! Acquire lock on given mutex. + rw_scoped_lock(Mutex& m, bool write = true) { + acquire(m, write); + } + + //! Release lock (if lock is held). + ~rw_scoped_lock() { + if (m_mutex) { + release(); + } + } + + //! No Copy + rw_scoped_lock(const rw_scoped_lock&) = delete; + rw_scoped_lock& operator=(const rw_scoped_lock&) = delete; + + //! Acquire lock on given mutex. + void acquire(Mutex& m, bool write = true) { + __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); + m_is_writer = write; + m_mutex = &m; + if (write) { + m_mutex->lock(); + } else { + m_mutex->lock_shared(); + } + } + + //! Try acquire lock on given mutex. + bool try_acquire(Mutex& m, bool write = true) { + bool succeed = write ? m.try_lock() : m.try_lock_shared(); + if (succeed) { + m_mutex = &m; + m_is_writer = write; + } + return succeed; + } + + //! Release lock. + void release() { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + Mutex* m = m_mutex; + m_mutex = nullptr; + + if (m_is_writer) { + m->unlock(); + } else { + m->unlock_shared(); + } + } + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade_to_writer() { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + if (m_is_writer) { + return true; // Already a writer + } + m_is_writer = true; + return m_mutex->upgrade(); + } + + //! Downgrade writer to become a reader. + bool downgrade_to_reader() { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + if (m_is_writer) { + m_mutex->downgrade(); + m_is_writer = false; + } + return true; + } + + bool is_writer() const { + __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); + return m_is_writer; + } + +protected: + //! The pointer to the current mutex that is held, or nullptr if no mutex is held. + Mutex* m_mutex {nullptr}; + + //! If mutex != nullptr, then is_writer is true if holding a writer lock, false if holding a reader lock. + /** Not defined if not holding a lock. */ + bool m_is_writer {false}; +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail_scoped_lock_H diff --git a/third_party/tbb/detail/_segment_table.h b/third_party/tbb/detail/_segment_table.h new file mode 100644 index 000000000..a9f570a72 --- /dev/null +++ b/third_party/tbb/detail/_segment_table.h @@ -0,0 +1,567 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__segment_table_H +#define __TBB_detail__segment_table_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/cstring" + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4127) // warning C4127: conditional expression is constant +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +template +class segment_table { +public: + using value_type = T; + using segment_type = T*; + using atomic_segment = std::atomic; + using segment_table_type = atomic_segment*; + + using size_type = std::size_t; + using segment_index_type = std::size_t; + + using allocator_type = Allocator; + + using allocator_traits_type = tbb::detail::allocator_traits; + using segment_table_allocator_type = typename allocator_traits_type::template rebind_alloc; +protected: + using segment_table_allocator_traits = tbb::detail::allocator_traits; + using derived_type = DerivedType; + + static constexpr size_type pointers_per_embedded_table = PointersPerEmbeddedTable; + static constexpr size_type pointers_per_long_table = sizeof(size_type) * 8; +public: + segment_table( const allocator_type& alloc = allocator_type() ) + : my_segment_table_allocator(alloc), my_segment_table(nullptr) + , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + } + + segment_table( const segment_table& other ) + : my_segment_table_allocator(segment_table_allocator_traits:: + select_on_container_copy_construction(other.my_segment_table_allocator)) + , my_segment_table(nullptr), my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + try_call( [&] { + internal_transfer(other, copy_segment_body_type{*this}); + } ).on_exception( [&] { + clear(); + }); + } + + segment_table( const segment_table& other, const allocator_type& alloc ) + : my_segment_table_allocator(alloc), my_segment_table(nullptr) + , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + try_call( [&] { + internal_transfer(other, copy_segment_body_type{*this}); + } ).on_exception( [&] { + clear(); + }); + } + + segment_table( segment_table&& other ) + : my_segment_table_allocator(std::move(other.my_segment_table_allocator)), my_segment_table(nullptr) + , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + internal_move(std::move(other)); + } + + segment_table( segment_table&& other, const allocator_type& alloc ) + : my_segment_table_allocator(alloc), my_segment_table(nullptr), my_first_block{} + , my_size{}, my_segment_table_allocation_failed{} + { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + 
zero_table(my_embedded_table, pointers_per_embedded_table); + using is_equal_type = typename segment_table_allocator_traits::is_always_equal; + internal_move_construct_with_allocator(std::move(other), alloc, is_equal_type()); + } + + ~segment_table() { + clear(); + } + + segment_table& operator=( const segment_table& other ) { + if (this != &other) { + copy_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator); + internal_transfer(other, copy_segment_body_type{*this}); + } + return *this; + } + + segment_table& operator=( segment_table&& other ) + noexcept(derived_type::is_noexcept_assignment) + { + using pocma_type = typename segment_table_allocator_traits::propagate_on_container_move_assignment; + using is_equal_type = typename segment_table_allocator_traits::is_always_equal; + + if (this != &other) { + move_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator); + internal_move_assign(std::move(other), tbb::detail::disjunction()); + } + return *this; + } + + void swap( segment_table& other ) + noexcept(derived_type::is_noexcept_swap) + { + using is_equal_type = typename segment_table_allocator_traits::is_always_equal; + using pocs_type = typename segment_table_allocator_traits::propagate_on_container_swap; + + if (this != &other) { + swap_allocators(my_segment_table_allocator, other.my_segment_table_allocator); + internal_swap(other, tbb::detail::disjunction()); + } + } + + segment_type get_segment( segment_index_type index ) const { + return get_table()[index] + segment_base(index); + } + + value_type& operator[]( size_type index ) { + return internal_subscript(index); + } + + const value_type& operator[]( size_type index ) const { + return const_cast(this)->internal_subscript(index); + } + + const segment_table_allocator_type& get_allocator() const { + return my_segment_table_allocator; + } + + segment_table_allocator_type& get_allocator() { + return my_segment_table_allocator; + } + + void enable_segment( segment_type& segment, segment_table_type table, segment_index_type seg_index, size_type index ) { + // Allocate new segment + segment_type new_segment = self()->create_segment(table, seg_index, index); + if (new_segment != nullptr) { + // Store (new_segment - segment_base) into the segment table to allow access to the table by index via + // my_segment_table[segment_index_of(index)][index] + segment_type disabled_segment = nullptr; + if (!table[seg_index].compare_exchange_strong(disabled_segment, new_segment - segment_base(seg_index))) { + // compare_exchange failed => some other thread has already enabled this segment + // Deallocate the memory + self()->deallocate_segment(new_segment, seg_index); + } + } + + segment = table[seg_index].load(std::memory_order_acquire); + __TBB_ASSERT(segment != nullptr, "If create_segment returned nullptr, the element should be stored in the table"); + } + + void delete_segment( segment_index_type seg_index ) { + segment_type segment_to_delete = self()->nullify_segment(get_table(), seg_index); + if (segment_to_delete == segment_allocation_failure_tag) { + return; + } + + segment_to_delete += segment_base(seg_index); + + // Deallocate the segment + self()->destroy_segment(segment_to_delete, seg_index); + } + + size_type number_of_segments( segment_table_type table ) const { + // Check for an active table, if it is embedded table - return the number of embedded segments + // Otherwise - return the maximum number of segments + return table == my_embedded_table ? 
pointers_per_embedded_table : pointers_per_long_table; + } + + size_type capacity() const noexcept { + segment_table_type table = get_table(); + size_type num_segments = number_of_segments(table); + for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) { + // Check if the pointer is valid (allocated) + if (table[seg_index].load(std::memory_order_relaxed) <= segment_allocation_failure_tag) { + return segment_base(seg_index); + } + } + return segment_base(num_segments); + } + + size_type find_last_allocated_segment( segment_table_type table ) const noexcept { + size_type end = 0; + size_type num_segments = number_of_segments(table); + for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) { + // Check if the pointer is valid (allocated) + if (table[seg_index].load(std::memory_order_relaxed) > segment_allocation_failure_tag) { + end = seg_index + 1; + } + } + return end; + } + + void reserve( size_type n ) { + if (n > allocator_traits_type::max_size(my_segment_table_allocator)) { + throw_exception(exception_id::reservation_length_error); + } + + size_type size = my_size.load(std::memory_order_relaxed); + segment_index_type start_seg_idx = size == 0 ? 0 : segment_index_of(size - 1) + 1; + for (segment_index_type seg_idx = start_seg_idx; segment_base(seg_idx) < n; ++seg_idx) { + size_type first_index = segment_base(seg_idx); + internal_subscript(first_index); + } + } + + void clear() { + clear_segments(); + clear_table(); + my_size.store(0, std::memory_order_relaxed); + my_first_block.store(0, std::memory_order_relaxed); + } + + void clear_segments() { + segment_table_type current_segment_table = get_table(); + for (size_type i = number_of_segments(current_segment_table); i != 0; --i) { + if (current_segment_table[i - 1].load(std::memory_order_relaxed) != nullptr) { + // If the segment was enabled - disable and deallocate it + delete_segment(i - 1); + } + } + } + + void clear_table() { + segment_table_type current_segment_table = get_table(); + if (current_segment_table != my_embedded_table) { + // If the active table is not the embedded one - deallocate the active table + for (size_type i = 0; i != pointers_per_long_table; ++i) { + segment_table_allocator_traits::destroy(my_segment_table_allocator, ¤t_segment_table[i]); + } + + segment_table_allocator_traits::deallocate(my_segment_table_allocator, current_segment_table, pointers_per_long_table); + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + zero_table(my_embedded_table, pointers_per_embedded_table); + } + } + + void extend_table_if_necessary(segment_table_type& table, size_type start_index, size_type end_index) { + // extend_segment_table if an active table is an embedded table + // and the requested index is not in the embedded table + if (table == my_embedded_table && end_index > embedded_table_size) { + if (start_index <= embedded_table_size) { + try_call([&] { + table = self()->allocate_long_table(my_embedded_table, start_index); + // It is possible that the table was extended by the thread that allocated first_block. + // In this case it is necessary to re-read the current table. 
+ + if (table) { + my_segment_table.store(table, std::memory_order_release); + } else { + table = my_segment_table.load(std::memory_order_acquire); + } + }).on_exception([&] { + my_segment_table_allocation_failed.store(true, std::memory_order_relaxed); + }); + } else { + atomic_backoff backoff; + do { + if (my_segment_table_allocation_failed.load(std::memory_order_relaxed)) { + throw_exception(exception_id::bad_alloc); + } + backoff.pause(); + table = my_segment_table.load(std::memory_order_acquire); + } while (table == my_embedded_table); + } + } + } + + // Return the segment where index is stored + static constexpr segment_index_type segment_index_of( size_type index ) { + return size_type(tbb::detail::log2(uintptr_t(index|1))); + } + + // Needed to calculate the offset in segment + static constexpr size_type segment_base( size_type index ) { + return size_type(1) << index & ~size_type(1); + } + + // Return size of the segment + static constexpr size_type segment_size( size_type index ) { + return index == 0 ? 2 : size_type(1) << index; + } + +private: + + derived_type* self() { + return static_cast(this); + } + + struct copy_segment_body_type { + void operator()( segment_index_type index, segment_type from, segment_type to ) const { + my_instance.self()->copy_segment(index, from, to); + } + segment_table& my_instance; + }; + + struct move_segment_body_type { + void operator()( segment_index_type index, segment_type from, segment_type to ) const { + my_instance.self()->move_segment(index, from, to); + } + segment_table& my_instance; + }; + + // Transgers all segments from the other table + template + void internal_transfer( const segment_table& other, TransferBody transfer_segment ) { + static_cast(this)->destroy_elements(); + + assign_first_block_if_necessary(other.my_first_block.load(std::memory_order_relaxed)); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + + segment_table_type other_table = other.get_table(); + size_type end_segment_size = segment_size(other.find_last_allocated_segment(other_table)); + + // If an exception occurred in other, then the size may be greater than the size of the end segment. + size_type other_size = end_segment_size < other.my_size.load(std::memory_order_relaxed) ? + other.my_size.load(std::memory_order_relaxed) : end_segment_size; + other_size = my_segment_table_allocation_failed ? 
embedded_table_size : other_size; + + for (segment_index_type i = 0; segment_base(i) < other_size; ++i) { + // If the segment in other table is enabled - transfer it + if (other_table[i].load(std::memory_order_relaxed) == segment_allocation_failure_tag) + { + my_size = segment_base(i); + break; + } else if (other_table[i].load(std::memory_order_relaxed) != nullptr) { + internal_subscript(segment_base(i)); + transfer_segment(i, other.get_table()[i].load(std::memory_order_relaxed) + segment_base(i), + get_table()[i].load(std::memory_order_relaxed) + segment_base(i)); + } + } + } + + // Moves the other segment table + // Only equal allocators are allowed + void internal_move( segment_table&& other ) { + // NOTE: allocators should be equal + clear(); + my_first_block.store(other.my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + // If an active table in other is embedded - restore all of the embedded segments + if (other.get_table() == other.my_embedded_table) { + for ( size_type i = 0; i != pointers_per_embedded_table; ++i ) { + segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed); + my_embedded_table[i].store(other_segment, std::memory_order_relaxed); + other.my_embedded_table[i].store(nullptr, std::memory_order_relaxed); + } + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + } else { + my_segment_table.store(other.my_segment_table, std::memory_order_relaxed); + other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed); + zero_table(other.my_embedded_table, pointers_per_embedded_table); + } + other.my_size.store(0, std::memory_order_relaxed); + } + + // Move construct the segment table with the allocator object + // if any instances of allocator_type are always equal + void internal_move_construct_with_allocator( segment_table&& other, const allocator_type&, + /*is_always_equal = */ std::true_type ) { + internal_move(std::move(other)); + } + + // Move construct the segment table with the allocator object + // if any instances of allocator_type are always equal + void internal_move_construct_with_allocator( segment_table&& other, const allocator_type& alloc, + /*is_always_equal = */ std::false_type ) { + if (other.my_segment_table_allocator == alloc) { + // If allocators are equal - restore pointers + internal_move(std::move(other)); + } else { + // If allocators are not equal - perform per element move with reallocation + try_call( [&] { + internal_transfer(other, move_segment_body_type{*this}); + } ).on_exception( [&] { + clear(); + }); + } + } + + // Move assigns the segment table to other is any instances of allocator_type are always equal + // or propagate_on_container_move_assignment is true + void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::true_type ) { + internal_move(std::move(other)); + } + + // Move assigns the segment table to other is any instances of allocator_type are not always equal + // and propagate_on_container_move_assignment is false + void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::false_type ) { + if (my_segment_table_allocator == other.my_segment_table_allocator) { + // If allocators are equal - restore pointers + internal_move(std::move(other)); + } else { + // If allocators are not equal - perform per element move with reallocation + internal_transfer(other, move_segment_body_type{*this}); + } 
+ } + + // Swaps two segment tables if any instances of allocator_type are always equal + // or propagate_on_container_swap is true + void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::true_type ) { + internal_swap_fields(other); + } + + // Swaps two segment tables if any instances of allocator_type are not always equal + // and propagate_on_container_swap is false + // According to the C++ standard, swapping of two containers with unequal allocators + // is an undefined behavior scenario + void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::false_type ) { + __TBB_ASSERT(my_segment_table_allocator == other.my_segment_table_allocator, + "Swapping with unequal allocators is not allowed"); + internal_swap_fields(other); + } + + void internal_swap_fields( segment_table& other ) { + // If an active table in either *this segment table or other is an embedded one - swaps the embedded tables + if (get_table() == my_embedded_table || + other.get_table() == other.my_embedded_table) { + + for (size_type i = 0; i != pointers_per_embedded_table; ++i) { + segment_type current_segment = my_embedded_table[i].load(std::memory_order_relaxed); + segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed); + + my_embedded_table[i].store(other_segment, std::memory_order_relaxed); + other.my_embedded_table[i].store(current_segment, std::memory_order_relaxed); + } + } + + segment_table_type current_segment_table = get_table(); + segment_table_type other_segment_table = other.get_table(); + + // If an active table is an embedded one - + // store an active table in other to the embedded one from other + if (current_segment_table == my_embedded_table) { + other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed); + } else { + // Otherwise - store it to the active segment table + other.my_segment_table.store(current_segment_table, std::memory_order_relaxed); + } + + // If an active table in other segment table is an embedded one - + // store an active table in other to the embedded one from *this + if (other_segment_table == other.my_embedded_table) { + my_segment_table.store(my_embedded_table, std::memory_order_relaxed); + } else { + // Otherwise - store it to the active segment table in other + my_segment_table.store(other_segment_table, std::memory_order_relaxed); + } + auto first_block = other.my_first_block.load(std::memory_order_relaxed); + other.my_first_block.store(my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_first_block.store(first_block, std::memory_order_relaxed); + + auto size = other.my_size.load(std::memory_order_relaxed); + other.my_size.store(my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); + my_size.store(size, std::memory_order_relaxed); + } + +protected: + // A flag indicates that an exception was throws during segment allocations + const segment_type segment_allocation_failure_tag = reinterpret_cast(1); + static constexpr size_type embedded_table_size = segment_size(pointers_per_embedded_table); + + template + value_type& internal_subscript( size_type index ) { + segment_index_type seg_index = segment_index_of(index); + segment_table_type table = my_segment_table.load(std::memory_order_acquire); + segment_type segment = nullptr; + + if (allow_out_of_range_access) { + if (derived_type::allow_table_extending) { + extend_table_if_necessary(table, index, index + 1); + } + + segment = table[seg_index].load(std::memory_order_acquire); + // If 
the required segment is disabled - enable it + if (segment == nullptr) { + enable_segment(segment, table, seg_index, index); + } + // Check if an exception was thrown during segment allocation + if (segment == segment_allocation_failure_tag) { + throw_exception(exception_id::bad_alloc); + } + } else { + segment = table[seg_index].load(std::memory_order_acquire); + } + __TBB_ASSERT(segment != nullptr, nullptr); + + return segment[index]; + } + + void assign_first_block_if_necessary(segment_index_type index) { + size_type zero = 0; + if (this->my_first_block.load(std::memory_order_relaxed) == zero) { + this->my_first_block.compare_exchange_strong(zero, index); + } + } + + void zero_table( segment_table_type table, size_type count ) { + for (size_type i = 0; i != count; ++i) { + table[i].store(nullptr, std::memory_order_relaxed); + } + } + + segment_table_type get_table() const { + return my_segment_table.load(std::memory_order_acquire); + } + + segment_table_allocator_type my_segment_table_allocator; + std::atomic my_segment_table; + atomic_segment my_embedded_table[pointers_per_embedded_table]; + // Number of segments in first block + std::atomic my_first_block; + // Number of elements in table + std::atomic my_size; + // Flag to indicate failed extend table + std::atomic my_segment_table_allocation_failed; +}; // class segment_table + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4127 is back +#endif + +#endif // __TBB_detail__segment_table_H diff --git a/third_party/tbb/detail/_small_object_pool.h b/third_party/tbb/detail/_small_object_pool.h new file mode 100644 index 000000000..114858597 --- /dev/null +++ b/third_party/tbb/detail/_small_object_pool.h @@ -0,0 +1,109 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB__small_object_pool_H +#define __TBB__small_object_pool_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" + +#include "third_party/tbb/profiling.h" +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { + +namespace d1 { +class small_object_pool { +protected: + small_object_pool() = default; +}; +struct execution_data; +} + +namespace r1 { +TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes, + const d1::execution_data& ed); +TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes); +TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes, + const d1::execution_data& ed); +TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes); +} + +namespace d1 { +class small_object_allocator { +public: + template + Type* new_object(execution_data& ed, Args&&... args) { + void* allocated_object = r1::allocate(m_pool, sizeof(Type), ed); + + auto constructed_object = new(allocated_object) Type(std::forward(args)...); + return constructed_object; + } + + template + Type* new_object(Args&&... args) { + void* allocated_object = r1::allocate(m_pool, sizeof(Type)); + + auto constructed_object = new(allocated_object) Type(std::forward(args)...); + return constructed_object; + } + + template + void delete_object(Type* object, const execution_data& ed) { + // Copy this since it can be a member of the passed object and + // unintentionally destroyed when Type destructor is called below + small_object_allocator alloc = *this; + object->~Type(); + alloc.deallocate(object, ed); + } + + template + void delete_object(Type* object) { + // Copy this since it can be a member of the passed object and + // unintentionally destroyed when Type destructor is called below + small_object_allocator alloc = *this; + object->~Type(); + alloc.deallocate(object); + } + + template + void deallocate(Type* ptr, const execution_data& ed) { + call_itt_task_notify(destroy, ptr); + + __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call"); + r1::deallocate(*m_pool, ptr, sizeof(Type), ed); + } + + template + void deallocate(Type* ptr) { + call_itt_task_notify(destroy, ptr); + + __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call"); + r1::deallocate(*m_pool, ptr, sizeof(Type)); + } +private: + small_object_pool* m_pool{}; +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB__small_object_pool_H */ diff --git a/third_party/tbb/detail/_string_resource.h b/third_party/tbb/detail/_string_resource.h new file mode 100644 index 000000000..d1dd46d1b --- /dev/null +++ b/third_party/tbb/detail/_string_resource.h @@ -0,0 +1,79 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +TBB_STRING_RESOURCE(ALGORITHM, "tbb_algorithm") +TBB_STRING_RESOURCE(PARALLEL_FOR, "tbb_parallel_for") +TBB_STRING_RESOURCE(PARALLEL_FOR_EACH, "tbb_parallel_for_each") +TBB_STRING_RESOURCE(PARALLEL_INVOKE, "tbb_parallel_invoke") +TBB_STRING_RESOURCE(PARALLEL_REDUCE, "tbb_parallel_reduce") +TBB_STRING_RESOURCE(PARALLEL_SCAN, "tbb_parallel_scan") +TBB_STRING_RESOURCE(PARALLEL_SORT, "tbb_parallel_sort") +TBB_STRING_RESOURCE(PARALLEL_PIPELINE, "tbb_parallel_pipeline") +TBB_STRING_RESOURCE(CUSTOM_CTX, "tbb_custom") + +TBB_STRING_RESOURCE(FLOW_NULL, "null") +TBB_STRING_RESOURCE(FLOW_BROADCAST_NODE, "broadcast_node") +TBB_STRING_RESOURCE(FLOW_BUFFER_NODE, "buffer_node") +TBB_STRING_RESOURCE(FLOW_CONTINUE_NODE, "continue_node") +TBB_STRING_RESOURCE(FLOW_FUNCTION_NODE, "function_node") +TBB_STRING_RESOURCE(FLOW_JOIN_NODE_QUEUEING, "join_node (queueing)") +TBB_STRING_RESOURCE(FLOW_JOIN_NODE_RESERVING, "join_node (reserving)") +TBB_STRING_RESOURCE(FLOW_JOIN_NODE_TAG_MATCHING, "join_node (tag_matching)") +TBB_STRING_RESOURCE(FLOW_LIMITER_NODE, "limiter_node") +TBB_STRING_RESOURCE(FLOW_MULTIFUNCTION_NODE, "multifunction_node") +TBB_STRING_RESOURCE(FLOW_OVERWRITE_NODE, "overwrite_node") +TBB_STRING_RESOURCE(FLOW_PRIORITY_QUEUE_NODE, "priority_queue_node") +TBB_STRING_RESOURCE(FLOW_QUEUE_NODE, "queue_node") +TBB_STRING_RESOURCE(FLOW_SEQUENCER_NODE, "sequencer_node") +TBB_STRING_RESOURCE(FLOW_INPUT_NODE, "input_node") +TBB_STRING_RESOURCE(FLOW_SPLIT_NODE, "split_node") +TBB_STRING_RESOURCE(FLOW_WRITE_ONCE_NODE, "write_once_node") +TBB_STRING_RESOURCE(FLOW_INDEXER_NODE, "indexer_node") +TBB_STRING_RESOURCE(FLOW_COMPOSITE_NODE, "composite_node") +TBB_STRING_RESOURCE(FLOW_ASYNC_NODE, "async_node") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT, "input_port") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_0, "input_port_0") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_1, "input_port_1") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_2, "input_port_2") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_3, "input_port_3") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_4, "input_port_4") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_5, "input_port_5") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_6, "input_port_6") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_7, "input_port_7") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_8, "input_port_8") +TBB_STRING_RESOURCE(FLOW_INPUT_PORT_9, "input_port_9") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT, "output_port") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_0, "output_port_0") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_1, "output_port_1") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_2, "output_port_2") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_3, "output_port_3") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_4, "output_port_4") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_5, "output_port_5") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_6, "output_port_6") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_7, "output_port_7") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_8, "output_port_8") +TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_9, "output_port_9") +TBB_STRING_RESOURCE(FLOW_OBJECT_NAME, "object_name") +TBB_STRING_RESOURCE(FLOW_BODY, "body") +TBB_STRING_RESOURCE(FLOW_GRAPH, "graph") +TBB_STRING_RESOURCE(FLOW_NODE, "node") +TBB_STRING_RESOURCE(FLOW_TASKS, "tbb_flow_graph") +TBB_STRING_RESOURCE(USER_EVENT, "user_event") + +#if __TBB_FLOW_TRACE_CODEPTR +TBB_STRING_RESOURCE(CODE_ADDRESS, "code_address") +#endif diff --git a/third_party/tbb/detail/_task.h b/third_party/tbb/detail/_task.h new file mode 100644 index 000000000..0413c0bed 
--- /dev/null +++ b/third_party/tbb/detail/_task.h @@ -0,0 +1,233 @@ +// clang-format off +/* + Copyright (c) 2020-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB__task_H +#define __TBB__task_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/climits" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/mutex" + +namespace tbb { +namespace detail { + +namespace d1 { +using slot_id = unsigned short; +constexpr slot_id no_slot = slot_id(~0); +constexpr slot_id any_slot = slot_id(~1); + +class task; +class wait_context; +class task_group_context; +struct execution_data; +} + +namespace r1 { +//! Task spawn/wait entry points +TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx); +TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id); +TBB_EXPORT void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx); +TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx); +TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*); +TBB_EXPORT d1::task_group_context* __TBB_EXPORTED_FUNC current_context(); + +// Do not place under __TBB_RESUMABLE_TASKS. It is a stub for unsupported platforms. +struct suspend_point_type; +using suspend_callback_type = void(*)(void*, suspend_point_type*); +//! The resumable tasks entry points +TBB_EXPORT void __TBB_EXPORTED_FUNC suspend(suspend_callback_type suspend_callback, void* user_callback); +TBB_EXPORT void __TBB_EXPORTED_FUNC resume(suspend_point_type* tag); +TBB_EXPORT suspend_point_type* __TBB_EXPORTED_FUNC current_suspend_point(); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_waiters(std::uintptr_t wait_ctx_addr); + +class thread_data; +class task_dispatcher; +class external_waiter; +struct task_accessor; +struct task_arena_impl; +} // namespace r1 + +namespace d1 { + +class task_arena; +using suspend_point = r1::suspend_point_type*; + +#if __TBB_RESUMABLE_TASKS +template +static void suspend_callback(void* user_callback, suspend_point sp) { + // Copy user function to a new stack after the context switch to avoid a race when the previous + // suspend point is resumed while the user_callback is being called. 
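+    // A minimal usage sketch of the suspend()/resume() pair defined just below
+    // (illustrative only; `async_engine` and its submit() call are hypothetical
+    // names, not part of these sources):
+    //
+    //   d1::suspend([&](d1::suspend_point sp) {
+    //       // Hand the suspend point to some asynchronous activity; the
+    //       // suspended task continues once that activity calls resume(sp).
+    //       async_engine.submit([sp] { d1::resume(sp); });
+    //   });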
+ F user_callback_copy = *static_cast(user_callback); + user_callback_copy(sp); +} + +template +void suspend(F f) { + r1::suspend(&suspend_callback, &f); +} + +inline void resume(suspend_point tag) { + r1::resume(tag); +} +#endif /* __TBB_RESUMABLE_TASKS */ + +// TODO align wait_context on cache lane +class wait_context { + static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1); + + std::uint64_t m_version_and_traits{1}; + std::atomic m_ref_count{}; + + void add_reference(std::int64_t delta) { + call_itt_task_notify(releasing, this); + std::uint64_t r = m_ref_count.fetch_add(static_cast(delta)) + static_cast(delta); + + __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected"); + + if (!r) { + // Some external waiters or coroutine waiters sleep in wait list + // Should to notify them that work is done + std::uintptr_t wait_ctx_addr = std::uintptr_t(this); + r1::notify_waiters(wait_ctx_addr); + } + } + + bool continue_execution() const { + std::uint64_t r = m_ref_count.load(std::memory_order_acquire); + __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected"); + return r > 0; + } + + friend class r1::thread_data; + friend class r1::task_dispatcher; + friend class r1::external_waiter; + friend class task_group; + friend class task_group_base; + friend struct r1::task_arena_impl; + friend struct r1::suspend_point_type; +public: + // Despite the internal reference count is uin64_t we limit the user interface with uint32_t + // to preserve a part of the internal reference count for special needs. + wait_context(std::uint32_t ref_count) : m_ref_count{ref_count} { suppress_unused_warning(m_version_and_traits); } + wait_context(const wait_context&) = delete; + + ~wait_context() { + __TBB_ASSERT(!continue_execution(), nullptr); + } + + void reserve(std::uint32_t delta = 1) { + add_reference(delta); + } + + void release(std::uint32_t delta = 1) { + add_reference(-std::int64_t(delta)); + } +}; + +struct execution_data { + task_group_context* context{}; + slot_id original_slot{}; + slot_id affinity_slot{}; +}; + +inline task_group_context* context(const execution_data& ed) { + return ed.context; +} + +inline slot_id original_slot(const execution_data& ed) { + return ed.original_slot; +} + +inline slot_id affinity_slot(const execution_data& ed) { + return ed.affinity_slot; +} + +inline slot_id execution_slot(const execution_data& ed) { + return r1::execution_slot(&ed); +} + +inline bool is_same_affinity(const execution_data& ed) { + return affinity_slot(ed) == no_slot || affinity_slot(ed) == execution_slot(ed); +} + +inline bool is_stolen(const execution_data& ed) { + return original_slot(ed) != execution_slot(ed); +} + +inline void spawn(task& t, task_group_context& ctx) { + call_itt_task_notify(releasing, &t); + r1::spawn(t, ctx); +} + +inline void spawn(task& t, task_group_context& ctx, slot_id id) { + call_itt_task_notify(releasing, &t); + r1::spawn(t, ctx, id); +} + +inline void execute_and_wait(task& t, task_group_context& t_ctx, wait_context& wait_ctx, task_group_context& w_ctx) { + r1::execute_and_wait(t, t_ctx, wait_ctx, w_ctx); + call_itt_task_notify(acquired, &wait_ctx); + call_itt_task_notify(destroy, &wait_ctx); +} + +inline void wait(wait_context& wait_ctx, task_group_context& ctx) { + r1::wait(wait_ctx, ctx); + call_itt_task_notify(acquired, &wait_ctx); + call_itt_task_notify(destroy, &wait_ctx); +} + +using r1::current_context; + +class task_traits { + std::uint64_t m_version_and_traits{}; + friend struct r1::task_accessor; +}; + +//! 
Alignment for a task object +static constexpr std::size_t task_alignment = 64; + +//! Base class for user-defined tasks. +/** @ingroup task_scheduling */ +class alignas(task_alignment) task : public task_traits { +protected: + virtual ~task() = default; + +public: + virtual task* execute(execution_data&) = 0; + virtual task* cancel(execution_data&) = 0; + +private: + std::uint64_t m_reserved[6]{}; + friend struct r1::task_accessor; +}; +static_assert(sizeof(task) == task_alignment, "task size is broken"); + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB__task_H */ diff --git a/third_party/tbb/detail/_task_handle.h b/third_party/tbb/detail/_task_handle.h new file mode 100644 index 000000000..c7bf32992 --- /dev/null +++ b/third_party/tbb/detail/_task_handle.h @@ -0,0 +1,123 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + + +#ifndef __TBB_task_handle_H +#define __TBB_task_handle_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/libcxx/memory" + +namespace tbb { +namespace detail { + +namespace d1 { class task_group_context; class wait_context; struct execution_data; } +namespace d2 { + +class task_handle; + +class task_handle_task : public d1::task { + std::uint64_t m_version_and_traits{}; + d1::wait_context& m_wait_ctx; + d1::task_group_context& m_ctx; + d1::small_object_allocator m_allocator; +public: + void finalize(const d1::execution_data* ed = nullptr) { + if (ed) { + m_allocator.delete_object(this, *ed); + } else { + m_allocator.delete_object(this); + } + } + + task_handle_task(d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : m_wait_ctx(wo) + , m_ctx(ctx) + , m_allocator(alloc) { + suppress_unused_warning(m_version_and_traits); + } + + ~task_handle_task() override { + m_wait_ctx.release(); + } + + d1::task_group_context& ctx() const { return m_ctx; } +}; + + +class task_handle { + struct task_handle_task_finalizer_t{ + void operator()(task_handle_task* p){ p->finalize(); } + }; + using handle_impl_t = std::unique_ptr; + + handle_impl_t m_handle = {nullptr}; +public: + task_handle() = default; + task_handle(task_handle&&) = default; + task_handle& operator=(task_handle&&) = default; + + explicit operator bool() const noexcept { return static_cast(m_handle); } + + friend bool operator==(task_handle const& th, std::nullptr_t) noexcept; + friend bool operator==(std::nullptr_t, task_handle const& th) noexcept; + + friend bool operator!=(task_handle const& th, std::nullptr_t) noexcept; + friend bool operator!=(std::nullptr_t, task_handle const& th) noexcept; + +private: + friend struct task_handle_accessor; + + task_handle(task_handle_task* t) : m_handle {t}{}; + + d1::task* release() { + return m_handle.release(); + } +}; + +struct task_handle_accessor { +static task_handle 
construct(task_handle_task* t) { return {t}; } +static d1::task* release(task_handle& th) { return th.release(); } +static d1::task_group_context& ctx_of(task_handle& th) { + __TBB_ASSERT(th.m_handle, "ctx_of does not expect empty task_handle."); + return th.m_handle->ctx(); +} +}; + +inline bool operator==(task_handle const& th, std::nullptr_t) noexcept { + return th.m_handle == nullptr; +} +inline bool operator==(std::nullptr_t, task_handle const& th) noexcept { + return th.m_handle == nullptr; +} + +inline bool operator!=(task_handle const& th, std::nullptr_t) noexcept { + return th.m_handle != nullptr; +} + +inline bool operator!=(std::nullptr_t, task_handle const& th) noexcept { + return th.m_handle != nullptr; +} + +} // namespace d2 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_task_handle_H */ diff --git a/third_party/tbb/detail/_template_helpers.h b/third_party/tbb/detail/_template_helpers.h new file mode 100644 index 000000000..e27ff363e --- /dev/null +++ b/third_party/tbb/detail/_template_helpers.h @@ -0,0 +1,404 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__template_helpers_H +#define __TBB_detail__template_helpers_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_config.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/iterator" + +namespace tbb { +namespace detail { +inline namespace d0 { + +// An internal implementation of void_t, which can be used in SFINAE contexts +template +struct void_impl { + using type = void; +}; // struct void_impl + +template +using void_t = typename void_impl::type; + +// Generic SFINAE helper for expression checks, based on the idea demonstrated in ISO C++ paper n4502 +template class... Checks> +struct supports_impl { + using type = std::false_type; +}; + +template class... Checks> +struct supports_impl...>, Checks...> { + using type = std::true_type; +}; + +template class... Checks> +using supports = typename supports_impl::type; + +//! A template to select either 32-bit or 64-bit constant as compile time, depending on machine word size. +template +struct select_size_t_constant { + // Explicit cast is needed to avoid compiler warnings about possible truncation. + // The value of the right size, which is selected by ?:, is anyway not truncated or promoted. + static const std::size_t value = static_cast((sizeof(std::size_t)==sizeof(u)) ? u : ull); +}; + +// TODO: do we really need it? +//! Cast between unrelated pointer types. +/** This method should be used sparingly as a last resort for dealing with + situations that inherently break strict ISO C++ aliasing rules. 
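+    For example (an illustrative sketch, not taken from these sources):
+        float f = 1.0f;
+        std::uint32_t* bits = punned_cast<std::uint32_t*>(&f);
+    reinterprets the storage of f through an unrelated pointer type.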
*/ +// T is a pointer type because it will be explicitly provided by the programmer as a template argument; +// U is a referent type to enable the compiler to check that "ptr" is a pointer, deducing U in the process. +template +inline T punned_cast( U* ptr ) { + std::uintptr_t x = reinterpret_cast(ptr); + return reinterpret_cast(x); +} + +template +struct padded_base : T { + char pad[S - R]; +}; +template struct padded_base : T {}; + +//! Pads type T to fill out to a multiple of cache line size. +template +struct padded : padded_base {}; + +#if __TBB_CPP14_INTEGER_SEQUENCE_PRESENT + +using std::index_sequence; +using std::make_index_sequence; + +#else + +template class index_sequence {}; + +template +struct make_index_sequence_impl : make_index_sequence_impl < N - 1, N - 1, S... > {}; + +template +struct make_index_sequence_impl <0, S...> { + using type = index_sequence; +}; + +template +using make_index_sequence = typename make_index_sequence_impl::type; + +#endif /* __TBB_CPP14_INTEGER_SEQUENCE_PRESENT */ + +#if __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT +using std::conjunction; +using std::disjunction; +#else // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT + +template +struct conjunction : std::true_type {}; + +template +struct conjunction + : std::conditional, First>::type {}; + +template +struct conjunction : T {}; + +template +struct disjunction : std::false_type {}; + +template +struct disjunction + : std::conditional>::type {}; + +template +struct disjunction : T {}; + +#endif // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT + +template +using iterator_value_t = typename std::iterator_traits::value_type; + +template +using iterator_key_t = typename std::remove_const::first_type>::type; + +template +using iterator_mapped_t = typename iterator_value_t::second_type; + +template +using iterator_alloc_pair_t = std::pair>::type, + iterator_mapped_t>; + +template using alloc_value_type = typename A::value_type; +template using alloc_ptr_t = typename std::allocator_traits::pointer; +template using has_allocate = decltype(std::declval&>() = std::declval().allocate(0)); +template using has_deallocate = decltype(std::declval().deallocate(std::declval>(), 0)); + +// alloc_value_type should be checked first, because it can be used in other checks +template +using is_allocator = supports; + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template +inline constexpr bool is_allocator_v = is_allocator::value; +#endif + +// Template class in which the "type" determines the type of the element number N in pack Args +template +struct pack_element { + using type = void; +}; + +template +struct pack_element { + using type = typename pack_element::type; +}; + +template +struct pack_element<0, T, Args...> { + using type = T; +}; + +template +using pack_element_t = typename pack_element::type; + +template +class raii_guard { +public: + static_assert( + std::is_nothrow_copy_constructible::value && + std::is_nothrow_move_constructible::value, + "Throwing an exception during the Func copy or move construction cause an unexpected behavior." 
+ ); + + raii_guard( Func f ) noexcept : my_func(f), is_active(true) {} + + raii_guard( raii_guard&& g ) noexcept : my_func(std::move(g.my_func)), is_active(g.is_active) { + g.is_active = false; + } + + ~raii_guard() { + if (is_active) { + my_func(); + } + } + + void dismiss() { + is_active = false; + } +private: + Func my_func; + bool is_active; +}; // class raii_guard + +template +raii_guard make_raii_guard( Func f ) { + return raii_guard(f); +} + +template +struct try_call_proxy { + try_call_proxy( Body b ) : body(b) {} + + template + void on_exception( OnExceptionBody on_exception_body ) { + auto guard = make_raii_guard(on_exception_body); + body(); + guard.dismiss(); + } + + template + void on_completion(OnCompletionBody on_completion_body) { + auto guard = make_raii_guard(on_completion_body); + body(); + } + + Body body; +}; // struct try_call_proxy + +// Template helper function for API +// try_call(lambda1).on_exception(lambda2) +// Executes lambda1 and if it throws an exception - executes lambda2 +template +try_call_proxy try_call( Body b ) { + return try_call_proxy(b); +} + +#if __TBB_CPP17_IS_SWAPPABLE_PRESENT +using std::is_nothrow_swappable; +using std::is_swappable; +#else // __TBB_CPP17_IS_SWAPPABLE_PRESENT +namespace is_swappable_detail { +using std::swap; + +template +using has_swap = decltype(swap(std::declval(), std::declval())); + +#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER +// Workaround for VS2015: it fails to instantiate noexcept(...) inside std::integral_constant. +template +struct noexcept_wrapper { + static const bool value = noexcept(swap(std::declval(), std::declval())); +}; +template +struct is_nothrow_swappable_impl : std::integral_constant::value> {}; +#else +template +struct is_nothrow_swappable_impl : std::integral_constant(), std::declval()))> {}; +#endif +} + +template +struct is_swappable : supports {}; + +template +struct is_nothrow_swappable + : conjunction, is_swappable_detail::is_nothrow_swappable_impl> {}; +#endif // __TBB_CPP17_IS_SWAPPABLE_PRESENT + +//! Allows to store a function parameter pack as a variable and later pass it to another function +template< typename... Types > +struct stored_pack; + +template<> +struct stored_pack<> +{ + using pack_type = stored_pack<>; + stored_pack() {} + + // Friend front-end functions + template< typename F, typename Pack > friend void call(F&& f, Pack&& p); + template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p); + +protected: + // Ideally, ref-qualified non-static methods would be used, + // but that would greatly reduce the set of compilers where it works. + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, const pack_type& /*pack*/, Preceding&&... params) { + return std::forward(f)(std::forward(params)...); + } + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, pack_type&& /*pack*/, Preceding&&... params) { + return std::forward(f)(std::forward(params)...); + } +}; + +template< typename T, typename... Types > +struct stored_pack : stored_pack +{ + using pack_type = stored_pack; + using pack_remainder = stored_pack; + + // Since lifetime of original values is out of control, copies should be made. + // Thus references should be stripped away from the deduced type. + typename std::decay::type leftmost_value; + + // Here rvalue references act in the same way as forwarding references, + // as long as class template parameters were deduced via forwarding references. 
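+    // Illustrative use of the pack helpers declared further below (a sketch,
+    // not part of the original sources): the arguments are captured by value
+    // here and forwarded to a callable later.
+    //
+    //   auto fn = [](int i, const std::string& s) { suppress_unused_warning(i, s); };
+    //   call(fn, save_pack(42, std::string("payload")));  // invokes fn(42, "payload")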
+ stored_pack(T&& t, Types&&... types) + : pack_remainder(std::forward(types)...), leftmost_value(std::forward(t)) {} + + // Friend front-end functions + template< typename F, typename Pack > friend void call(F&& f, Pack&& p); + template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p); + +protected: + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, pack_type& pack, Preceding&&... params) { + return pack_remainder::template call( + std::forward(f), static_cast(pack), + std::forward(params)... , pack.leftmost_value + ); + } + + template< typename Ret, typename F, typename... Preceding > + static Ret call(F&& f, pack_type&& pack, Preceding&&... params) { + return pack_remainder::template call( + std::forward(f), static_cast(pack), + std::forward(params)... , std::move(pack.leftmost_value) + ); + } +}; + +//! Calls the given function with arguments taken from a stored_pack +template< typename F, typename Pack > +void call(F&& f, Pack&& p) { + std::decay::type::template call(std::forward(f), std::forward(p)); +} + +template< typename Ret, typename F, typename Pack > +Ret call_and_return(F&& f, Pack&& p) { + return std::decay::type::template call(std::forward(f), std::forward(p)); +} + +template< typename... Types > +stored_pack save_pack(Types&&... types) { + return stored_pack(std::forward(types)...); +} + +// A structure with the value which is equal to Trait::value +// but can be used in the immediate context due to parameter T +template +struct dependent_bool : std::integral_constant {}; + +template +struct body_arg_detector; + +template +struct body_arg_detector { + using arg_type = Arg; +}; + +template +struct body_arg_detector { + using arg_type = Arg; +}; + +template +struct argument_detector; + +template +struct argument_detector { + using type = typename body_arg_detector::arg_type; +}; + +template +struct argument_detector { + using type = Arg; +}; + +// Detects the argument type of callable, works for callable with one argument. +template +using argument_type_of = typename argument_detector::type>::type; + +template +struct type_identity { + using type = T; +}; + +template +using type_identity_t = typename type_identity::type; + +} // inline namespace d0 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__template_helpers_H diff --git a/third_party/tbb/detail/_utils.h b/third_party/tbb/detail/_utils.h new file mode 100644 index 000000000..09fb02561 --- /dev/null +++ b/third_party/tbb/detail/_utils.h @@ -0,0 +1,394 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_detail__utils_H +#define __TBB_detail__utils_H + +#include "third_party/libcxx/type_traits" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/functional" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_machine.h" + +namespace tbb { +namespace detail { +inline namespace d0 { + +//! Utility template function to prevent "unused" warnings by various compilers. +template void suppress_unused_warning(T&&...) {} + +//! Compile-time constant that is upper bound on cache line/sector size. +/** It should be used only in situations where having a compile-time upper + bound is more useful than a run-time exact answer. + @ingroup memory_allocation */ +constexpr size_t max_nfs_size = 128; +constexpr std::size_t max_nfs_size_exp = 7; +static_assert(1 << max_nfs_size_exp == max_nfs_size, "max_nfs_size_exp must be a log2(max_nfs_size)"); + +//! Class that implements exponential backoff. +class atomic_backoff { + //! Time delay, in units of "pause" instructions. + /** Should be equal to approximately the number of "pause" instructions + that take the same time as an context switch. Must be a power of two.*/ + static constexpr std::int32_t LOOPS_BEFORE_YIELD = 16; + std::int32_t count; + +public: + // In many cases, an object of this type is initialized eagerly on hot path, + // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ } + // For this reason, the construction cost must be very small! + atomic_backoff() : count(1) {} + // This constructor pauses immediately; do not use on hot paths! + atomic_backoff(bool) : count(1) { pause(); } + + //! No Copy + atomic_backoff(const atomic_backoff&) = delete; + atomic_backoff& operator=(const atomic_backoff&) = delete; + + //! Pause for a while. + void pause() { + if (count <= LOOPS_BEFORE_YIELD) { + machine_pause(count); + // Pause twice as long the next time. + count *= 2; + } else { + // Pause is so long that we might as well yield CPU to scheduler. + yield(); + } + } + + //! Pause for a few times and return false if saturated. + bool bounded_pause() { + machine_pause(count); + if (count < LOOPS_BEFORE_YIELD) { + // Pause twice as long the next time. + count *= 2; + return true; + } else { + return false; + } + } + + void reset() { + count = 1; + } +}; + +//! Spin WHILE the condition is true. +/** T and U should be comparable types. */ +template +T spin_wait_while(const std::atomic& location, C comp, std::memory_order order) { + atomic_backoff backoff; + T snapshot = location.load(order); + while (comp(snapshot)) { + backoff.pause(); + snapshot = location.load(order); + } + return snapshot; +} + +//! Spin WHILE the value of the variable is equal to a given value +/** T and U should be comparable types. */ +template +T spin_wait_while_eq(const std::atomic& location, const U value, std::memory_order order = std::memory_order_acquire) { + return spin_wait_while(location, [&value](T t) { return t == value; }, order); +} + +//! Spin UNTIL the value of the variable is equal to a given value +/** T and U should be comparable types. */ +template +T spin_wait_until_eq(const std::atomic& location, const U value, std::memory_order order = std::memory_order_acquire) { + return spin_wait_while(location, [&value](T t) { return t != value; }, order); +} + +//! Spin UNTIL the condition returns true or spinning time is up. +/** Returns what the passed functor returned last time it was invoked. 
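+    For example (illustrative), a caller such as
+        if (!timed_spin_wait_until([&] { return flag.load(std::memory_order_acquire); })) {
+            // the spin budget ran out; fall back to a blocking wait
+        }
+    tries the short spin phase first and takes the slow path only if it fails.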
*/ +template +bool timed_spin_wait_until(Condition condition) { + // 32 pauses + 32 yields are meausered as balanced spin time before sleep. + bool finish = condition(); + for (int i = 1; !finish && i < 32; finish = condition(), i *= 2) { + machine_pause(i); + } + for (int i = 32; !finish && i < 64; finish = condition(), ++i) { + yield(); + } + return finish; +} + +template +T clamp(T value, T lower_bound, T upper_bound) { + __TBB_ASSERT(lower_bound <= upper_bound, "Incorrect bounds"); + return value > lower_bound ? (value > upper_bound ? upper_bound : value) : lower_bound; +} + +template +std::uintptr_t log2(T in) { + __TBB_ASSERT(in > 0, "The logarithm of a non-positive value is undefined."); + return machine_log2(in); +} + +template +T reverse_bits(T src) { + return machine_reverse_bits(src); +} + +template +T reverse_n_bits(T src, std::size_t n) { + __TBB_ASSERT(n != 0, "Reverse for 0 bits is undefined behavior."); + return reverse_bits(src) >> (number_of_bits() - n); +} + +// A function to check if passed integer is a power of two +template +constexpr bool is_power_of_two( IntegerType arg ) { + static_assert(std::is_integral::value, + "An argument for is_power_of_two should be integral type"); + return arg && (0 == (arg & (arg - 1))); +} + +// A function to determine if passed integer is a power of two +// at least as big as another power of two, i.e. for strictly positive i and j, +// with j being a power of two, determines whether i==j< +constexpr bool is_power_of_two_at_least(ArgIntegerType arg, DivisorIntegerType divisor) { + // Divisor should be a power of two + static_assert(std::is_integral::value, + "An argument for is_power_of_two_at_least should be integral type"); + return 0 == (arg & (arg - divisor)); +} + +// A function to compute arg modulo divisor where divisor is a power of 2. +template +inline ArgIntegerType modulo_power_of_two(ArgIntegerType arg, DivisorIntegerType divisor) { + __TBB_ASSERT( is_power_of_two(divisor), "Divisor should be a power of two" ); + return arg & (divisor - 1); +} + +//! A function to check if passed in pointer is aligned on a specific border +template +constexpr bool is_aligned(T* pointer, std::uintptr_t alignment) { + return 0 == (reinterpret_cast(pointer) & (alignment - 1)); +} + +#if TBB_USE_ASSERT +static void* const poisoned_ptr = reinterpret_cast(-1); + +//! Set p to invalid pointer value. +template +inline void poison_pointer( T* &p ) { p = reinterpret_cast(poisoned_ptr); } + +template +inline void poison_pointer(std::atomic& p) { p.store(reinterpret_cast(poisoned_ptr), std::memory_order_relaxed); } + +/** Expected to be used in assertions only, thus no empty form is defined. **/ +template +inline bool is_poisoned( T* p ) { return p == reinterpret_cast(poisoned_ptr); } + +template +inline bool is_poisoned(const std::atomic& p) { return is_poisoned(p.load(std::memory_order_relaxed)); } +#else +template +inline void poison_pointer(T&) {/*do nothing*/} +#endif /* !TBB_USE_ASSERT */ + +template +bool assert_pointer_valid(T* p, const char* comment = nullptr) { + suppress_unused_warning(p, comment); + __TBB_ASSERT(p != nullptr, comment); + __TBB_ASSERT(!is_poisoned(p), comment); +#if !(_MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER) + __TBB_ASSERT(is_aligned(p, alignment == 0 ? alignof(T) : alignment), comment); +#endif + // Returns something to simplify assert_pointers_valid implementation. + return true; +} + +template +void assert_pointers_valid(Args*... 
p) { + // suppress_unused_warning is used as an evaluation context for the variadic pack. + suppress_unused_warning(assert_pointer_valid(p)...); +} + +//! Base class for types that should not be assigned. +class no_assign { +public: + void operator=(const no_assign&) = delete; + no_assign(const no_assign&) = default; + no_assign() = default; +}; + +//! Base class for types that should not be copied or assigned. +class no_copy: no_assign { +public: + no_copy(const no_copy&) = delete; + no_copy() = default; +}; + +template +void swap_atomics_relaxed(std::atomic& lhs, std::atomic& rhs){ + T tmp = lhs.load(std::memory_order_relaxed); + lhs.store(rhs.load(std::memory_order_relaxed), std::memory_order_relaxed); + rhs.store(tmp, std::memory_order_relaxed); +} + +//! One-time initialization states +enum class do_once_state { + uninitialized = 0, ///< No execution attempts have been undertaken yet + pending, ///< A thread is executing associated do-once routine + executed, ///< Do-once routine has been executed + initialized = executed ///< Convenience alias +}; + +//! One-time initialization function +/** /param initializer Pointer to function without arguments + The variant that returns bool is used for cases when initialization can fail + and it is OK to continue execution, but the state should be reset so that + the initialization attempt was repeated the next time. + /param state Shared state associated with initializer that specifies its + initialization state. Must be initially set to #uninitialized value + (e.g. by means of default static zero initialization). **/ +template +void atomic_do_once( const F& initializer, std::atomic& state ) { + // The loop in the implementation is necessary to avoid race when thread T2 + // that arrived in the middle of initialization attempt by another thread T1 + // has just made initialization possible. + // In such a case T2 has to rely on T1 to initialize, but T1 may already be past + // the point where it can recognize the changed conditions. 
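+    // Typical call site (a sketch; g_state and init_globals are illustrative
+    // names, not part of these sources):
+    //
+    //   static std::atomic<do_once_state> g_state;  // zero-initialized => uninitialized
+    //   atomic_do_once([] { init_globals(); }, g_state);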
+ do_once_state expected_state; + while ( state.load( std::memory_order_acquire ) != do_once_state::executed ) { + if( state.load( std::memory_order_relaxed ) == do_once_state::uninitialized ) { + expected_state = do_once_state::uninitialized; +#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910 + using enum_type = typename std::underlying_type::type; + if( ((std::atomic&)state).compare_exchange_strong( (enum_type&)expected_state, (enum_type)do_once_state::pending ) ) { +#else + if( state.compare_exchange_strong( expected_state, do_once_state::pending ) ) { +#endif + run_initializer( initializer, state ); + break; + } + } + spin_wait_while_eq( state, do_once_state::pending ); + } +} + +// Run the initializer which can not fail +template +void run_initializer(const Functor& f, std::atomic& state ) { + f(); + state.store(do_once_state::executed, std::memory_order_release); +} + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +concept boolean_testable_impl = std::convertible_to; + +template +concept boolean_testable = boolean_testable_impl && requires( T&& t ) { + { !std::forward(t) } -> boolean_testable_impl; + }; + +#if __TBB_CPP20_COMPARISONS_PRESENT +struct synthesized_three_way_comparator { + template + auto operator()( const T1& lhs, const T2& rhs ) const + requires requires { + { lhs < rhs } -> boolean_testable; + { rhs < lhs } -> boolean_testable; + } + { + if constexpr (std::three_way_comparable_with) { + return lhs <=> rhs; + } else { + if (lhs < rhs) { + return std::weak_ordering::less; + } + if (rhs < lhs) { + return std::weak_ordering::greater; + } + return std::weak_ordering::equivalent; + } + } +}; // struct synthesized_three_way_comparator + +template +using synthesized_three_way_result = decltype(synthesized_three_way_comparator{}(std::declval(), + std::declval())); + +#endif // __TBB_CPP20_COMPARISONS_PRESENT + +// Check if the type T is implicitly OR explicitly convertible to U +template +concept relaxed_convertible_to = std::constructible_from; + +template +concept adaptive_same_as = +#if __TBB_STRICT_CONSTRAINTS + std::same_as; +#else + std::convertible_to; +#endif +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +template +auto invoke(F&& f, Args&&... args) +#if __TBB_CPP17_INVOKE_PRESENT + noexcept(std::is_nothrow_invocable_v) + -> std::invoke_result_t +{ + return std::invoke(std::forward(f), std::forward(args)...); +} +#else // __TBB_CPP17_INVOKE_PRESENT + noexcept(noexcept(std::forward(f)(std::forward(args)...))) + -> decltype(std::forward(f)(std::forward(args)...)) +{ + return std::forward(f)(std::forward(args)...); +} +#endif // __TBB_CPP17_INVOKE_PRESENT + +} // namespace d0 + +namespace d1 { + +class delegate_base { +public: + virtual bool operator()() const = 0; + virtual ~delegate_base() {} +}; + +template +class delegated_function : public delegate_base { +public: + delegated_function(FuncType& f) : my_func(f) {} + + bool operator()() const override { + return my_func(); + } + +private: + FuncType &my_func; +}; +} // namespace d1 + +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__utils_H diff --git a/third_party/tbb/detail/_waitable_atomic.h b/third_party/tbb/detail/_waitable_atomic.h new file mode 100644 index 000000000..992f9a112 --- /dev/null +++ b/third_party/tbb/detail/_waitable_atomic.h @@ -0,0 +1,105 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_detail__address_waiters_H +#define __TBB_detail__address_waiters_H + +#include "third_party/tbb/detail/_utils.h" + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC wait_on_address(void* address, d1::delegate_base& wakeup_condition, std::uintptr_t context); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address(void* address, std::uintptr_t context); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_one(void* address); +TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_all(void* address); +} // namespace r1 + +namespace d1 { + +template +void adaptive_wait_on_address(void* address, Predicate wakeup_condition, std::uintptr_t context) { + if (!timed_spin_wait_until(wakeup_condition)) { + d1::delegated_function pred(wakeup_condition); + r1::wait_on_address(address, pred, context); + } +} + +template +class waitable_atomic { +public: + waitable_atomic() = default; + + explicit waitable_atomic(T value) : my_atomic(value) {} + + waitable_atomic(const waitable_atomic&) = delete; + waitable_atomic& operator=(const waitable_atomic&) = delete; + + T load(std::memory_order order) const noexcept { + return my_atomic.load(order); + } + + T exchange(T desired) noexcept { + return my_atomic.exchange(desired); + } + + void wait(T old, std::uintptr_t context, std::memory_order order) { + auto wakeup_condition = [&] { return my_atomic.load(order) != old; }; + if (!timed_spin_wait_until(wakeup_condition)) { + // We need to use while here, because notify_all() will wake up all threads + // But predicate for them might be false + d1::delegated_function pred(wakeup_condition); + do { + r1::wait_on_address(this, pred, context); + } while (!wakeup_condition()); + } + } + + void wait_until(T expected, std::uintptr_t context, std::memory_order order) { + auto wakeup_condition = [&] { return my_atomic.load(order) == expected; }; + if (!timed_spin_wait_until(wakeup_condition)) { + // We need to use while here, because notify_all() will wake up all threads + // But predicate for them might be false + d1::delegated_function pred(wakeup_condition); + do { + r1::wait_on_address(this, pred, context); + } while (!wakeup_condition()); + } + } + + void notify_relaxed(std::uintptr_t context) { + r1::notify_by_address(this, context); + } + + void notify_one_relaxed() { + r1::notify_by_address_one(this); + } + + // TODO: consider adding following interfaces: + // store(desired, memory_order) + // notify_all_relaxed() + +private: + std::atomic my_atomic{}; +}; + +} // namespace d1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_detail__address_waiters_H diff --git a/third_party/tbb/dynamic_link.cpp b/third_party/tbb/dynamic_link.cpp new file mode 100644 index 000000000..c20e88f2c --- /dev/null +++ b/third_party/tbb/dynamic_link.cpp @@ -0,0 +1,516 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/dynamic_link.h" +#include "third_party/tbb/environment.h" + +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_utils.h" + +/* + This file is used by both TBB and OpenMP RTL. Do not use __TBB_ASSERT() macro + and runtime_warning() function because they are not available in OpenMP. Use + __TBB_ASSERT_EX and DYNAMIC_LINK_WARNING instead. +*/ + +#include "third_party/libcxx/cstdarg" // va_list etc. +#include "third_party/libcxx/cstring" // strrchr +#if _WIN32 + #include "libc/mem/mem.h" + + // Unify system calls + #define dlopen( name, flags ) LoadLibrary( name ) + #define dlsym( handle, name ) GetProcAddress( handle, name ) + #define dlclose( handle ) ( ! FreeLibrary( handle ) ) + #define dlerror() GetLastError() +#ifndef PATH_MAX + #define PATH_MAX MAX_PATH +#endif +#else /* _WIN32 */ + #include "libc/runtime/dlfcn.h" + #include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" + + #include "third_party/libcxx/climits" + #include "third_party/libcxx/cstdlib" +#endif /* _WIN32 */ + +#if __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED + //TODO: use function attribute for weak symbols instead of the pragma. + #pragma weak dlopen + #pragma weak dlsym + #pragma weak dlclose +#endif /* __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED */ + + +#define __USE_STATIC_DL_INIT ( !__ANDROID__ ) + + +/* +dynamic_link is a common interface for searching for required symbols in an +executable and dynamic libraries. + +dynamic_link provides certain guarantees: + 1. Either all or none of the requested symbols are resolved. Moreover, if + symbols are not resolved, the dynamic_link_descriptor table is not modified; + 2. All returned symbols have secured lifetime: this means that none of them + can be invalidated until dynamic_unlink is called; + 3. Any loaded library is loaded only via the full path. The full path is that + from which the runtime itself was loaded. (This is done to avoid security + issues caused by loading libraries from insecure paths). + +dynamic_link searches for the requested symbols in three stages, stopping as +soon as all of the symbols have been resolved. + + 1. Search the global scope: + a. On Windows: dynamic_link tries to obtain the handle of the requested + library and if it succeeds it resolves the symbols via that handle. + b. On Linux: dynamic_link tries to search for the symbols in the global + scope via the main program handle. If the symbols are present in the global + scope their lifetime is not guaranteed (since dynamic_link does not know + anything about the library from which they are exported). Therefore it + tries to "pin" the symbols by obtaining the library name and reopening it. 
+ dlopen may fail to reopen the library in two cases: + i. The symbols are exported from the executable. Currently dynamic _link + cannot handle this situation, so it will not find these symbols in this + step. + ii. The necessary library has been unloaded and cannot be reloaded. It + seems there is nothing that can be done in this case. No symbols are + returned. + + 2. Dynamic load: an attempt is made to load the requested library via the + full path. + The full path used is that from which the runtime itself was loaded. If the + library can be loaded, then an attempt is made to resolve the requested + symbols in the newly loaded library. + If the symbols are not found the library is unloaded. + + 3. Weak symbols: if weak symbols are available they are returned. +*/ + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED + +#if !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED + // Report runtime errors and continue. + #define DYNAMIC_LINK_WARNING dynamic_link_warning + static void dynamic_link_warning( dynamic_link_error_t code, ... ) { + suppress_unused_warning(code); + } // library_warning +#endif /* !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED */ + + static bool resolve_symbols( dynamic_link_handle module, const dynamic_link_descriptor descriptors[], std::size_t required ) + { + if ( !module ) + return false; + + #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */ + if ( !dlsym ) return false; + #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */ + + const std::size_t n_desc=20; // Usually we don't have more than 20 descriptors per library + __TBB_ASSERT_EX( required <= n_desc, "Too many descriptors is required" ); + if ( required > n_desc ) return false; + pointer_to_handler h[n_desc]; + + for ( std::size_t k = 0; k < required; ++k ) { + dynamic_link_descriptor const & desc = descriptors[k]; + pointer_to_handler addr = (pointer_to_handler)dlsym( module, desc.name ); + if ( !addr ) { + return false; + } + h[k] = addr; + } + + // Commit the entry points. + // Cannot use memset here, because the writes must be atomic. + for( std::size_t k = 0; k < required; ++k ) + *descriptors[k].handler = h[k]; + return true; + } + +#if __TBB_WIN8UI_SUPPORT + bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle*, int flags ) { + dynamic_link_handle tmp_handle = nullptr; + TCHAR wlibrary[256]; + if ( MultiByteToWideChar(CP_UTF8, 0, library, -1, wlibrary, 255) == 0 ) return false; + if ( flags & DYNAMIC_LINK_LOAD ) + tmp_handle = LoadPackagedLibrary( wlibrary, 0 ); + if (tmp_handle != nullptr){ + return resolve_symbols(tmp_handle, descriptors, required); + }else{ + return false; + } + } + void dynamic_unlink( dynamic_link_handle ) {} + void dynamic_unlink_all() {} +#else +#if __TBB_DYNAMIC_LOAD_ENABLED +/* + There is a security issue on Windows: LoadLibrary() may load and execute malicious code. + See http://www.microsoft.com/technet/security/advisory/2269637.mspx for details. + To avoid the issue, we have to pass full path (not just library name) to LoadLibrary. This + function constructs full path to the specified library (it is assumed the library located + side-by-side with the tbb.dll. + + The function constructs absolute path for given relative path. Important: Base directory is not + current one, it is the directory tbb.dll loaded from. 
+ + Example: + Let us assume "tbb.dll" is located in "c:\program files\common\intel\" directory, e.g. + absolute path of the library is "c:\program files\common\intel\tbb.dll". Absolute path for + "tbbmalloc.dll" would be "c:\program files\common\intel\tbbmalloc.dll". Absolute path for + "malloc\tbbmalloc.dll" would be "c:\program files\common\intel\malloc\tbbmalloc.dll". +*/ + + // Struct handle_storage is used by dynamic_link routine to store handles of + // all loaded or pinned dynamic libraries. When TBB is shut down, it calls + // dynamic_unlink_all() that unloads modules referenced by handle_storage. + // This struct should not have any constructors since it may be used before + // the constructor is called. + #define MAX_LOADED_MODULES 8 // The number of maximum possible modules which can be loaded + + using atomic_incrementer = std::atomic; + + static struct handles_t { + atomic_incrementer my_size; + dynamic_link_handle my_handles[MAX_LOADED_MODULES]; + + void add(const dynamic_link_handle &handle) { + const std::size_t ind = my_size++; + __TBB_ASSERT_EX( ind < MAX_LOADED_MODULES, "Too many modules are loaded" ); + my_handles[ind] = handle; + } + + void free() { + const std::size_t size = my_size; + for (std::size_t i=0; i( PATH_MAX ) ); + if ( drc == 0 ) { // Error occurred. + int err = GetLastError(); + DYNAMIC_LINK_WARNING( dl_sys_fail, "GetModuleFileName", err ); + return; + } + if ( drc >= PATH_MAX ) { // Buffer too short. + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + return; + } + // Find the position of the last backslash. + char *backslash = std::strrchr( ap_data._path, '\\' ); + + if ( !backslash ) { // Backslash not found. + __TBB_ASSERT_EX( backslash != nullptr, "Unbelievable."); + return; + } + __TBB_ASSERT_EX( backslash >= ap_data._path, "Unbelievable."); + ap_data._len = (std::size_t)(backslash - ap_data._path) + 1; + *(backslash+1) = 0; + #else + // Get the library path + Dl_info dlinfo; + int res = dladdr( (void*)&dynamic_link, &dlinfo ); // any function inside the library can be used for the address + if ( !res ) { + char const * err = dlerror(); + DYNAMIC_LINK_WARNING( dl_sys_fail, "dladdr", err ); + return; + } else { + __TBB_ASSERT_EX( dlinfo.dli_fname!=nullptr, "Unbelievable." ); + } + + char const *slash = std::strrchr( dlinfo.dli_fname, '/' ); + std::size_t fname_len=0; + if ( slash ) { + __TBB_ASSERT_EX( slash >= dlinfo.dli_fname, "Unbelievable."); + fname_len = (std::size_t)(slash - dlinfo.dli_fname) + 1; + } + + std::size_t rc; + if ( dlinfo.dli_fname[0]=='/' ) { + // The library path is absolute + rc = 0; + ap_data._len = 0; + } else { + // The library path is relative so get the current working directory + if ( !getcwd( ap_data._path, sizeof(ap_data._path)/sizeof(ap_data._path[0]) ) ) { + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + return; + } + ap_data._len = std::strlen( ap_data._path ); + ap_data._path[ap_data._len++]='/'; + rc = ap_data._len; + } + + if ( fname_len>0 ) { + ap_data._len += fname_len; + if ( ap_data._len>PATH_MAX ) { + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + ap_data._len=0; + return; + } + std::strncpy( ap_data._path+rc, dlinfo.dli_fname, fname_len ); + ap_data._path[ap_data._len]=0; + } + #endif /* _WIN32 */ + } + + static void init_dl_data() { + init_ap_data(); + } + + /* + The function constructs absolute path for given relative path. Important: Base directory is not + current one, it is the directory libtbb.so loaded from. 
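+    For example (illustrative): if libtbb.so was loaded from /opt/intel/lib/, then
+    abs_path( "libtbbmalloc.so.2", buf, len ) writes "/opt/intel/lib/libtbbmalloc.so.2"
+    into buf, provided len is large enough.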
+ + Arguments: + in name -- Name of a file (may be with relative path; it must not be an absolute one). + out path -- Buffer to save result (absolute path) to. + in len -- Size of buffer. + ret -- 0 -- Error occurred. + > len -- Buffer too short, required size returned. + otherwise -- Ok, number of characters (incl. terminating null) written to buffer. + */ + static std::size_t abs_path( char const * name, char * path, std::size_t len ) { + if ( ap_data._len == 0 ) + return 0; + + std::size_t name_len = std::strlen( name ); + std::size_t full_len = name_len+ap_data._len; + if ( full_len < len ) { + __TBB_ASSERT( ap_data._path[ap_data._len] == 0, nullptr); + __TBB_ASSERT( std::strlen(ap_data._path) == ap_data._len, nullptr); + std::strncpy( path, ap_data._path, ap_data._len + 1 ); + __TBB_ASSERT( path[ap_data._len] == 0, nullptr); + std::strncat( path, name, len - ap_data._len ); + __TBB_ASSERT( std::strlen(path) == full_len, nullptr); + } + return full_len+1; // +1 for null character + } +#endif // __TBB_DYNAMIC_LOAD_ENABLED + void init_dynamic_link_data() { + #if __TBB_DYNAMIC_LOAD_ENABLED + std::call_once( init_dl_data_state, init_dl_data ); + #endif + } + + #if __USE_STATIC_DL_INIT + // ap_data structure is initialized with current directory on Linux. + // So it should be initialized as soon as possible since the current directory may be changed. + // static_init_ap_data object provides this initialization during library loading. + static struct static_init_dl_data_t { + static_init_dl_data_t() { + init_dynamic_link_data(); + } + } static_init_dl_data; + #endif + + #if __TBB_WEAK_SYMBOLS_PRESENT + static bool weak_symbol_link( const dynamic_link_descriptor descriptors[], std::size_t required ) + { + // Check if the required entries are present in what was loaded into our process. + for ( std::size_t k = 0; k < required; ++k ) + if ( !descriptors[k].ptr ) + return false; + // Commit the entry points. 
+ for ( std::size_t k = 0; k < required; ++k ) + *descriptors[k].handler = (pointer_to_handler) descriptors[k].ptr; + return true; + } + #else + static bool weak_symbol_link( const dynamic_link_descriptor[], std::size_t ) { + return false; + } + #endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + + void dynamic_unlink( dynamic_link_handle handle ) { + #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */ + if ( !dlclose ) return; + #endif + if ( handle ) { + dlclose( handle ); + } + } + + void dynamic_unlink_all() { + #if __TBB_DYNAMIC_LOAD_ENABLED + handles.free(); + #endif + } + + static dynamic_link_handle global_symbols_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required ) { + dynamic_link_handle library_handle{}; +#if _WIN32 + auto res = GetModuleHandleEx(0, library, &library_handle); + __TBB_ASSERT_EX((res && library_handle) || (!res && !library_handle), nullptr); +#else /* _WIN32 */ + #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */ + if ( !dlopen ) return 0; + #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */ + // RTLD_GLOBAL - to guarantee that old TBB will find the loaded library + // RTLD_NOLOAD - not to load the library without the full path + library_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL | RTLD_NOLOAD); +#endif /* _WIN32 */ + if (library_handle) { + if (!resolve_symbols(library_handle, descriptors, required)) { + dynamic_unlink(library_handle); + library_handle = nullptr; + } + } + return library_handle; + } + + static void save_library_handle( dynamic_link_handle src, dynamic_link_handle *dst ) { + __TBB_ASSERT_EX( src, "The library handle to store must be non-zero" ); + if ( dst ) + *dst = src; + #if __TBB_DYNAMIC_LOAD_ENABLED + else + handles.add( src ); + #endif /* __TBB_DYNAMIC_LOAD_ENABLED */ + } + +#if !_WIN32 + int loading_flags(bool local_binding) { + int flags = RTLD_NOW; + if (local_binding) { + flags = flags | RTLD_LOCAL; +#if (__linux__ && __GLIBC__) && !__TBB_USE_SANITIZERS + if( !GetBoolEnvironmentVariable("TBB_ENABLE_SANITIZERS") ) { + flags = flags | RTLD_DEEPBIND; + } +#endif + } else { + flags = flags | RTLD_GLOBAL; + } + return flags; + } +#endif + + dynamic_link_handle dynamic_load( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, bool local_binding ) { + ::tbb::detail::suppress_unused_warning( library, descriptors, required, local_binding ); +#if __TBB_DYNAMIC_LOAD_ENABLED + std::size_t const len = PATH_MAX + 1; + char path[ len ]; + std::size_t rc = abs_path( library, path, len ); + if ( 0 < rc && rc <= len ) { +#if _WIN32 + // Prevent Windows from displaying silly message boxes if it fails to load library + // (e.g. 
because of MS runtime problems - one of those crazy manifest related ones) + UINT prev_mode = SetErrorMode (SEM_FAILCRITICALERRORS); +#endif /* _WIN32 */ + // The second argument (loading_flags) is ignored on Windows + dynamic_link_handle library_handle = dlopen( path, loading_flags(local_binding) ); +#if _WIN32 + SetErrorMode (prev_mode); +#endif /* _WIN32 */ + if( library_handle ) { + if( !resolve_symbols( library_handle, descriptors, required ) ) { + // The loaded library does not contain all the expected entry points + dynamic_unlink( library_handle ); + library_handle = nullptr; + } + } else + DYNAMIC_LINK_WARNING( dl_lib_not_found, path, dlerror() ); + return library_handle; + } else if ( rc>len ) + DYNAMIC_LINK_WARNING( dl_buff_too_small ); + // rc == 0 means failing of init_ap_data so the warning has already been issued. + +#endif /* __TBB_DYNAMIC_LOAD_ENABLED */ + return nullptr; + } + + bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle *handle, int flags ) { + init_dynamic_link_data(); + + // TODO: May global_symbols_link find weak symbols? + dynamic_link_handle library_handle = ( flags & DYNAMIC_LINK_GLOBAL ) ? global_symbols_link( library, descriptors, required ) : nullptr; + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (push) +// MSVC 2015 warning: 'int': forcing value to bool 'true' or 'false' +#pragma warning (disable: 4800) +#endif + if ( !library_handle && ( flags & DYNAMIC_LINK_LOAD ) ) + library_handle = dynamic_load( library, descriptors, required, flags & DYNAMIC_LINK_LOCAL ); + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#pragma warning (pop) +#endif + if ( !library_handle && ( flags & DYNAMIC_LINK_WEAK ) ) + return weak_symbol_link( descriptors, required ); + + if ( library_handle ) { + save_library_handle( library_handle, handle ); + return true; + } + return false; + } + +#endif /*__TBB_WIN8UI_SUPPORT*/ +#else /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */ + bool dynamic_link( const char*, const dynamic_link_descriptor*, std::size_t, dynamic_link_handle *handle, int ) { + if ( handle ) + *handle=0; + return false; + } + void dynamic_unlink( dynamic_link_handle ) {} + void dynamic_unlink_all() {} +#endif /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/dynamic_link.h b/third_party/tbb/dynamic_link.h new file mode 100644 index 000000000..a7af0072c --- /dev/null +++ b/third_party/tbb/dynamic_link.h @@ -0,0 +1,137 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_dynamic_link +#define __TBB_dynamic_link + +// Support for dynamic loading entry points from other shared libraries. + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/mutex" + +/** By default, symbols declared and defined here go into namespace tbb::internal. 
+ To put them in other namespace, define macros OPEN_INTERNAL_NAMESPACE + and CLOSE_INTERNAL_NAMESPACE to override the following default definitions. **/ + +#include "third_party/libcxx/cstddef" +#ifdef _WIN32 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif /* _WIN32 */ + +namespace tbb { +namespace detail { +namespace r1 { + +//! Type definition for a pointer to a void somefunc(void) +typedef void (*pointer_to_handler)(); + +//! The helper to construct dynamic_link_descriptor structure +// Double cast through the void* in DLD macro is necessary to +// prevent warnings from some compilers (g++ 4.1) +#if __TBB_WEAK_SYMBOLS_PRESENT +#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h), (pointer_to_handler)&s} +#define DLD_NOWEAK(s,h) {#s, (pointer_to_handler*)(void*)(&h), nullptr} +#else +#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h)} +#define DLD_NOWEAK(s,h) DLD(s,h) +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ +//! Association between a handler name and location of pointer to it. +struct dynamic_link_descriptor { + //! Name of the handler + const char* name; + //! Pointer to the handler + pointer_to_handler* handler; +#if __TBB_WEAK_SYMBOLS_PRESENT + //! Weak symbol + pointer_to_handler ptr; +#endif +}; + +#if _WIN32 +using dynamic_link_handle = HMODULE; +#else +using dynamic_link_handle = void*; +#endif /* _WIN32 */ + +const int DYNAMIC_LINK_GLOBAL = 0x01; +const int DYNAMIC_LINK_LOAD = 0x02; +const int DYNAMIC_LINK_WEAK = 0x04; +const int DYNAMIC_LINK_LOCAL = 0x08; + +const int DYNAMIC_LINK_LOCAL_BINDING = DYNAMIC_LINK_LOCAL | DYNAMIC_LINK_LOAD; +const int DYNAMIC_LINK_DEFAULT = DYNAMIC_LINK_GLOBAL | DYNAMIC_LINK_LOAD | DYNAMIC_LINK_WEAK; + +//! Fill in dynamically linked handlers. +/** 'library' is the name of the requested library. It should not contain a full + path since dynamic_link adds the full path (from which the runtime itself + was loaded) to the library name. + 'required' is the number of the initial entries in the array descriptors[] + that have to be found in order for the call to succeed. If the library and + all the required handlers are found, then the corresponding handler + pointers are set, and the return value is true. Otherwise the original + array of descriptors is left untouched and the return value is false. + 'required' is limited by 20 (exceeding of this value will result in failure + to load the symbols and the return value will be false). + 'handle' is the handle of the library if it is loaded. Otherwise it is left + untouched. + 'flags' is the set of DYNAMIC_LINK_* flags. Each of the DYNAMIC_LINK_* flags + allows its corresponding linking stage. 
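+
+    A minimal usage sketch (an illustrative addition, not part of the upstream
+    sources; the library name and entry points below are hypothetical):
+
+      extern "C" void foo_v1();
+      extern "C" void bar_v1();
+      static void (*foo_handler)() = nullptr;
+      static void (*bar_handler)() = nullptr;
+      static const dynamic_link_descriptor FooLinkTable[] = {
+          DLD( foo_v1, foo_handler ),
+          DLD( bar_v1, bar_handler )
+      };
+      // With the default flags this tries the global scope first, then loads the
+      // library from the runtime's own directory, then falls back to weak symbols.
+      bool linked = dynamic_link( "libfoo.so.2", FooLinkTable, 2 );
+      // On success both handler pointers refer to the resolved entry points.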
+**/ +bool dynamic_link( const char* library, + const dynamic_link_descriptor descriptors[], + std::size_t required, + dynamic_link_handle* handle = nullptr, + int flags = DYNAMIC_LINK_DEFAULT ); + +void dynamic_unlink( dynamic_link_handle handle ); + +void dynamic_unlink_all(); + +enum dynamic_link_error_t { + dl_success = 0, + dl_lib_not_found, // char const * lib, dlerr_t err + dl_sym_not_found, // char const * sym, dlerr_t err + // Note: dlerr_t depends on OS: it is char const * on Linux* and macOS*, int on Windows*. + dl_sys_fail, // char const * func, int err + dl_buff_too_small // none +}; // dynamic_link_error_t + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_dynamic_link */ diff --git a/third_party/tbb/enumerable_thread_specific.h b/third_party/tbb/enumerable_thread_specific.h new file mode 100644 index 000000000..0bef0393d --- /dev/null +++ b/third_party/tbb/enumerable_thread_specific.h @@ -0,0 +1,1135 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_enumerable_thread_specific_H +#define __TBB_enumerable_thread_specific_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_aligned_space.h" + +#include "third_party/tbb/concurrent_vector.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/thread" +#include "third_party/libcxx/cstring" // memcpy +#include "third_party/libcxx/cstddef" // std::ptrdiff_t + +#include "third_party/tbb/task.h" // for task::suspend_point + +#if _WIN32 || _WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#else +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +//! 
enum for selecting between single key and key-per-instance versions +enum ets_key_usage_type { + ets_key_per_instance + , ets_no_key +#if __TBB_RESUMABLE_TASKS + , ets_suspend_aware +#endif +}; + +// Forward declaration to use in internal classes +template +class enumerable_thread_specific; + +template +struct internal_ets_key_selector { + using key_type = std::thread::id; + static key_type current_key() { + return std::this_thread::get_id(); + } +}; + +// Intel Compiler on OSX cannot create atomics objects that instantiated from non-fundamental types +#if __INTEL_COMPILER && __APPLE__ +template<> +struct internal_ets_key_selector { + using key_type = std::size_t; + static key_type current_key() { + auto id = std::this_thread::get_id(); + return reinterpret_cast(id); + } +}; +#endif + +template +struct ets_key_selector : internal_ets_key_selector {}; + +#if __TBB_RESUMABLE_TASKS +template <> +struct ets_key_selector { + using key_type = suspend_point; + static key_type current_key() { + return r1::current_suspend_point(); + } +}; +#endif + +template +class ets_base : detail::no_copy { +protected: + using key_type = typename ets_key_selector::key_type; + +public: + struct slot; + struct array { + array* next; + std::size_t lg_size; + slot& at( std::size_t k ) { + return (reinterpret_cast(reinterpret_cast(this+1)))[k]; + } + std::size_t size() const { return std::size_t(1) << lg_size; } + std::size_t mask() const { return size() - 1; } + std::size_t start( std::size_t h ) const { + return h >> (8 * sizeof(std::size_t) - lg_size); + } + }; + struct slot { + std::atomic key; + void* ptr; + bool empty() const { return key.load(std::memory_order_relaxed) == key_type(); } + bool match( key_type k ) const { return key.load(std::memory_order_relaxed) == k; } + bool claim( key_type k ) { + // TODO: maybe claim ptr, because key_type is not guaranteed to fit into word size + key_type expected = key_type(); + return key.compare_exchange_strong(expected, k); + } + }; + +protected: + //! Root of linked list of arrays of decreasing size. + /** nullptr if and only if my_count==0. + Each array in the list is half the size of its predecessor. */ + std::atomic my_root; + std::atomic my_count; + + virtual void* create_local() = 0; + virtual void* create_array(std::size_t _size) = 0; // _size in bytes + virtual void free_array(void* ptr, std::size_t _size) = 0; // _size in bytes + + array* allocate( std::size_t lg_size ) { + std::size_t n = std::size_t(1) << lg_size; + array* a = static_cast(create_array(sizeof(array) + n * sizeof(slot))); + a->lg_size = lg_size; + std::memset( a + 1, 0, n * sizeof(slot) ); + return a; + } + void deallocate(array* a) { + std::size_t n = std::size_t(1) << (a->lg_size); + free_array( static_cast(a), std::size_t(sizeof(array) + n * sizeof(slot)) ); + } + + ets_base() : my_root{nullptr}, my_count{0} {} + virtual ~ets_base(); // g++ complains if this is not virtual + + void* table_lookup( bool& exists ); + void table_clear(); + // The following functions are not used in concurrent context, + // so we don't need synchronization and ITT annotations there. 
+ template + void table_elementwise_copy( const ets_base& other, + void*(*add_element)(ets_base&, void*) ) { + __TBB_ASSERT(!my_root.load(std::memory_order_relaxed), nullptr); + __TBB_ASSERT(!my_count.load(std::memory_order_relaxed), nullptr); + if( !other.my_root.load(std::memory_order_relaxed) ) return; + array* root = allocate(other.my_root.load(std::memory_order_relaxed)->lg_size); + my_root.store(root, std::memory_order_relaxed); + root->next = nullptr; + my_count.store(other.my_count.load(std::memory_order_relaxed), std::memory_order_relaxed); + std::size_t mask = root->mask(); + for( array* r = other.my_root.load(std::memory_order_relaxed); r; r = r->next ) { + for( std::size_t i = 0; i < r->size(); ++i ) { + slot& s1 = r->at(i); + if( !s1.empty() ) { + for( std::size_t j = root->start(std::hash{}(s1.key.load(std::memory_order_relaxed))); ; j = (j+1)&mask ) { + slot& s2 = root->at(j); + if( s2.empty() ) { + s2.ptr = add_element(static_cast&>(*this), s1.ptr); + s2.key.store(s1.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + break; + } + else if( s2.match(s1.key.load(std::memory_order_relaxed)) ) + break; + } + } + } + } + } + void table_swap( ets_base& other ) { + __TBB_ASSERT(this!=&other, "Don't swap an instance with itself"); + swap_atomics_relaxed(my_root, other.my_root); + swap_atomics_relaxed(my_count, other.my_count); + } +}; + +template +ets_base::~ets_base() { + __TBB_ASSERT(!my_root.load(std::memory_order_relaxed), nullptr); +} + +template +void ets_base::table_clear() { + while ( array* r = my_root.load(std::memory_order_relaxed) ) { + my_root.store(r->next, std::memory_order_relaxed); + deallocate(r); + } + my_count.store(0, std::memory_order_relaxed); +} + +template +void* ets_base::table_lookup( bool& exists ) { + const key_type k = ets_key_selector::current_key(); + + __TBB_ASSERT(k != key_type(), nullptr); + void* found; + std::size_t h = std::hash{}(k); + for( array* r = my_root.load(std::memory_order_acquire); r; r = r->next ) { + call_itt_notify(acquired,r); + std::size_t mask=r->mask(); + for(std::size_t i = r->start(h); ;i=(i+1)&mask) { + slot& s = r->at(i); + if( s.empty() ) break; + if( s.match(k) ) { + if( r == my_root.load(std::memory_order_acquire) ) { + // Success at top level + exists = true; + return s.ptr; + } else { + // Success at some other level. Need to insert at top level. + exists = true; + found = s.ptr; + goto insert; + } + } + } + } + // Key does not yet exist. The density of slots in the table does not exceed 0.5, + // for if this will occur a new table is allocated with double the current table + // size, which is swapped in as the new root table. So an empty slot is guaranteed. + exists = false; + found = create_local(); + { + std::size_t c = ++my_count; + array* r = my_root.load(std::memory_order_acquire); + call_itt_notify(acquired,r); + if( !r || c > r->size()/2 ) { + std::size_t s = r ? r->lg_size : 2; + while( c > std::size_t(1)<<(s-1) ) ++s; + array* a = allocate(s); + for(;;) { + a->next = r; + call_itt_notify(releasing,a); + array* new_r = r; + if( my_root.compare_exchange_strong(new_r, a) ) break; + call_itt_notify(acquired, new_r); + __TBB_ASSERT(new_r != nullptr, nullptr); + if( new_r->lg_size >= s ) { + // Another thread inserted an equal or bigger array, so our array is superfluous. + deallocate(a); + break; + } + r = new_r; + } + } + } + insert: + // Whether a slot has been found in an older table, or if it has been inserted at this level, + // it has already been accounted for in the total. 
Guaranteed to be room for it, and it is + // not present, so search for empty slot and use it. + array* ir = my_root.load(std::memory_order_acquire); + call_itt_notify(acquired, ir); + std::size_t mask = ir->mask(); + for(std::size_t i = ir->start(h);; i = (i+1)&mask) { + slot& s = ir->at(i); + if( s.empty() ) { + if( s.claim(k) ) { + s.ptr = found; + return found; + } + } + } +} + +//! Specialization that exploits native TLS +template <> +class ets_base: public ets_base { + using super = ets_base; +#if _WIN32||_WIN64 +#if __TBB_WIN8UI_SUPPORT + using tls_key_t = DWORD; + void create_key() { my_key = FlsAlloc(nullptr); } + void destroy_key() { FlsFree(my_key); } + void set_tls(void * value) { FlsSetValue(my_key, (LPVOID)value); } + void* get_tls() { return (void *)FlsGetValue(my_key); } +#else + using tls_key_t = DWORD; + void create_key() { my_key = TlsAlloc(); } + void destroy_key() { TlsFree(my_key); } + void set_tls(void * value) { TlsSetValue(my_key, (LPVOID)value); } + void* get_tls() { return (void *)TlsGetValue(my_key); } +#endif +#else + using tls_key_t = pthread_key_t; + void create_key() { pthread_key_create(&my_key, nullptr); } + void destroy_key() { pthread_key_delete(my_key); } + void set_tls( void * value ) const { pthread_setspecific(my_key, value); } + void* get_tls() const { return pthread_getspecific(my_key); } +#endif + tls_key_t my_key; + virtual void* create_local() override = 0; + virtual void* create_array(std::size_t _size) override = 0; // _size in bytes + virtual void free_array(void* ptr, std::size_t _size) override = 0; // size in bytes +protected: + ets_base() {create_key();} + ~ets_base() {destroy_key();} + void* table_lookup( bool& exists ) { + void* found = get_tls(); + if( found ) { + exists=true; + } else { + found = super::table_lookup(exists); + set_tls(found); + } + return found; + } + void table_clear() { + destroy_key(); + create_key(); + super::table_clear(); + } + void table_swap( ets_base& other ) { + using std::swap; + __TBB_ASSERT(this!=&other, "Don't swap an instance with itself"); + swap(my_key, other.my_key); + super::table_swap(other); + } +}; + +//! Random access iterator for traversing the thread local copies. +template< typename Container, typename Value > +class enumerable_thread_specific_iterator +{ + //! current position in the concurrent_vector + + Container *my_container; + typename Container::size_type my_index; + mutable Value *my_value; + + template + friend bool operator==( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ); + + template + friend bool operator<( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ); + + template + friend std::ptrdiff_t operator-( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ); + + template + friend class enumerable_thread_specific_iterator; + +public: + //! STL support + using difference_type = std::ptrdiff_t; + using value_type = Value; + using pointer = Value*; + using reference = Value&; + using iterator_category = std::random_access_iterator_tag; + + enumerable_thread_specific_iterator( const Container &container, typename Container::size_type index ) : + my_container(&const_cast(container)), my_index(index), my_value(nullptr) {} + + //! 
Default constructor + enumerable_thread_specific_iterator() : my_container(nullptr), my_index(0), my_value(nullptr) {} + + template + enumerable_thread_specific_iterator( const enumerable_thread_specific_iterator& other ) : + my_container( other.my_container ), my_index( other.my_index), my_value( const_cast(other.my_value) ) {} + + enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset ) const { + return enumerable_thread_specific_iterator(*my_container, my_index + offset); + } + + friend enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset, enumerable_thread_specific_iterator v ) { + return enumerable_thread_specific_iterator(*v.my_container, v.my_index + offset); + } + + enumerable_thread_specific_iterator &operator+=( std::ptrdiff_t offset ) { + my_index += offset; + my_value = nullptr; + return *this; + } + + enumerable_thread_specific_iterator operator-( std::ptrdiff_t offset ) const { + return enumerable_thread_specific_iterator( *my_container, my_index-offset ); + } + + enumerable_thread_specific_iterator &operator-=( std::ptrdiff_t offset ) { + my_index -= offset; + my_value = nullptr; + return *this; + } + + Value& operator*() const { + Value* value = my_value; + if( !value ) { + value = my_value = (*my_container)[my_index].value(); + } + __TBB_ASSERT( value==(*my_container)[my_index].value(), "corrupt cache" ); + return *value; + } + + Value& operator[]( std::ptrdiff_t k ) const { + return *(*my_container)[my_index + k].value(); + } + + Value* operator->() const {return &operator*();} + + enumerable_thread_specific_iterator& operator++() { + ++my_index; + my_value = nullptr; + return *this; + } + + enumerable_thread_specific_iterator& operator--() { + --my_index; + my_value = nullptr; + return *this; + } + + //! Post increment + enumerable_thread_specific_iterator operator++(int) { + enumerable_thread_specific_iterator result = *this; + ++my_index; + my_value = nullptr; + return result; + } + + //! 
Post decrement + enumerable_thread_specific_iterator operator--(int) { + enumerable_thread_specific_iterator result = *this; + --my_index; + my_value = nullptr; + return result; + } +}; + +template +bool operator==( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return i.my_index == j.my_index && i.my_container == j.my_container; +} + +template +bool operator!=( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return !(i==j); +} + +template +bool operator<( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return i.my_index +bool operator>( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return j +bool operator>=( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return !(i +bool operator<=( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return !(j +std::ptrdiff_t operator-( const enumerable_thread_specific_iterator& i, + const enumerable_thread_specific_iterator& j ) { + return i.my_index-j.my_index; +} + +template +class segmented_iterator +{ + template + friend bool operator==(const segmented_iterator& i, const segmented_iterator& j); + + template + friend bool operator!=(const segmented_iterator& i, const segmented_iterator& j); + + template + friend class segmented_iterator; + +public: + segmented_iterator() {my_segcont = nullptr;} + + segmented_iterator( const SegmentedContainer& _segmented_container ) : + my_segcont(const_cast(&_segmented_container)), + outer_iter(my_segcont->end()) { } + + ~segmented_iterator() {} + + using InnerContainer = typename SegmentedContainer::value_type; + using inner_iterator = typename InnerContainer::iterator; + using outer_iterator = typename SegmentedContainer::iterator; + + // STL support + // TODO: inherit all types from segmented container? + using difference_type = std::ptrdiff_t; + using value_type = Value; + using size_type = typename SegmentedContainer::size_type; + using pointer = Value*; + using reference = Value&; + using iterator_category = std::input_iterator_tag; + + // Copy Constructor + template + segmented_iterator(const segmented_iterator& other) : + my_segcont(other.my_segcont), + outer_iter(other.outer_iter), + // can we assign a default-constructed iterator to inner if we're at the end? + inner_iter(other.inner_iter) + {} + + // assignment + template + segmented_iterator& operator=( const segmented_iterator& other) { + my_segcont = other.my_segcont; + outer_iter = other.outer_iter; + if(outer_iter != my_segcont->end()) inner_iter = other.inner_iter; + return *this; + } + + // allow assignment of outer iterator to segmented iterator. Once it is + // assigned, move forward until a non-empty inner container is found or + // the end of the outer container is reached. 
+ segmented_iterator& operator=(const outer_iterator& new_outer_iter) { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + // check that this iterator points to something inside the segmented container + for(outer_iter = new_outer_iter ;outer_iter!=my_segcont->end(); ++outer_iter) { + if( !outer_iter->empty() ) { + inner_iter = outer_iter->begin(); + break; + } + } + return *this; + } + + // pre-increment + segmented_iterator& operator++() { + advance_me(); + return *this; + } + + // post-increment + segmented_iterator operator++(int) { + segmented_iterator tmp = *this; + operator++(); + return tmp; + } + + bool operator==(const outer_iterator& other_outer) const { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + return (outer_iter == other_outer && + (outer_iter == my_segcont->end() || inner_iter == outer_iter->begin())); + } + + bool operator!=(const outer_iterator& other_outer) const { + return !operator==(other_outer); + + } + + // (i)* RHS + reference operator*() const { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + __TBB_ASSERT(outer_iter != my_segcont->end(), "Dereferencing a pointer at end of container"); + __TBB_ASSERT(inner_iter != outer_iter->end(), nullptr); // should never happen + return *inner_iter; + } + + // i-> + pointer operator->() const { return &operator*();} + +private: + SegmentedContainer* my_segcont; + outer_iterator outer_iter; + inner_iterator inner_iter; + + void advance_me() { + __TBB_ASSERT(my_segcont != nullptr, nullptr); + __TBB_ASSERT(outer_iter != my_segcont->end(), nullptr); // not true if there are no inner containers + __TBB_ASSERT(inner_iter != outer_iter->end(), nullptr); // not true if the inner containers are all empty. + ++inner_iter; + while(inner_iter == outer_iter->end() && ++outer_iter != my_segcont->end()) { + inner_iter = outer_iter->begin(); + } + } +}; // segmented_iterator + +template +bool operator==( const segmented_iterator& i, + const segmented_iterator& j ) { + if(i.my_segcont != j.my_segcont) return false; + if(i.my_segcont == nullptr) return true; + if(i.outer_iter != j.outer_iter) return false; + if(i.outer_iter == i.my_segcont->end()) return true; + return i.inner_iter == j.inner_iter; +} + +// != +template +bool operator!=( const segmented_iterator& i, + const segmented_iterator& j ) { + return !(i==j); +} + +template +struct construct_by_default: no_assign { + void construct(void*where) {new(where) T();} // C++ note: the () in T() ensure zero initialization. + construct_by_default( int ) {} +}; + +template +struct construct_by_exemplar: no_assign { + const T exemplar; + void construct(void*where) {new(where) T(exemplar);} + construct_by_exemplar( const T& t ) : exemplar(t) {} + construct_by_exemplar( T&& t ) : exemplar(std::move(t)) {} +}; + +template +struct construct_by_finit: no_assign { + Finit f; + void construct(void* where) {new(where) T(f());} + construct_by_finit( Finit&& f_ ) : f(std::move(f_)) {} +}; + +template +struct construct_by_args: no_assign { + stored_pack pack; + void construct(void* where) { + call( [where](const typename std::decay
<P>
::type&... args ){ + new(where) T(args...); + }, pack ); + } + construct_by_args( P&& ... args ) : pack(std::forward
<P>
(args)...) {} +}; + +// storage for initialization function pointer +// TODO: consider removing the template parameter T here and in callback_leaf +class callback_base { +public: + // Clone *this + virtual callback_base* clone() const = 0; + // Destruct and free *this + virtual void destroy() = 0; + // Need virtual destructor to satisfy GCC compiler warning + virtual ~callback_base() { } + // Construct T at where + virtual void construct(void* where) = 0; +}; + +template +class callback_leaf: public callback_base, Constructor { + template callback_leaf( P&& ... params ) : Constructor(std::forward
<P>
(params)...) {} + // TODO: make the construction/destruction consistent (use allocator.construct/destroy) + using my_allocator_type = typename tbb::tbb_allocator; + + callback_base* clone() const override { + return make(*this); + } + + void destroy() override { + my_allocator_type alloc; + tbb::detail::allocator_traits::destroy(alloc, this); + tbb::detail::allocator_traits::deallocate(alloc, this, 1); + } + + void construct(void* where) override { + Constructor::construct(where); + } + +public: + template + static callback_base* make( P&& ... params ) { + void* where = my_allocator_type().allocate(1); + return new(where) callback_leaf( std::forward
<P>
(params)... ); + } +}; + +//! Template for recording construction of objects in table +/** All maintenance of the space will be done explicitly on push_back, + and all thread local copies must be destroyed before the concurrent + vector is deleted. + + The flag is_built is initialized to false. When the local is + successfully-constructed, set the flag to true or call value_committed(). + If the constructor throws, the flag will be false. +*/ +template +struct ets_element { + detail::aligned_space my_space; + bool is_built; + ets_element() { is_built = false; } // not currently-built + U* value() { return my_space.begin(); } + U* value_committed() { is_built = true; return my_space.begin(); } + ~ets_element() { + if(is_built) { + my_space.begin()->~U(); + is_built = false; + } + } +}; + +// A predicate that can be used for a compile-time compatibility check of ETS instances +// Ideally, it should have been declared inside the ETS class, but unfortunately +// in that case VS2013 does not enable the variadic constructor. +template struct is_compatible_ets : std::false_type {}; +template +struct is_compatible_ets< T, enumerable_thread_specific > : std::is_same {}; + +// A predicate that checks whether, for a variable 'foo' of type T, foo() is a valid expression +template using has_empty_braces_operator = decltype(std::declval()()); +template using is_callable_no_args = supports; + +//! The enumerable_thread_specific container +/** enumerable_thread_specific has the following properties: + - thread-local copies are lazily created, with default, exemplar or function initialization. + - thread-local copies do not move (during lifetime, and excepting clear()) so the address of a copy is invariant. + - the contained objects need not have operator=() defined if combine is not used. + - enumerable_thread_specific containers may be copy-constructed or assigned. + - thread-local copies can be managed by hash-table, or can be accessed via TLS storage for speed. + - outside of parallel contexts, the contents of all thread-local copies are accessible by iterator or using combine or combine_each methods + +@par Segmented iterator + When the thread-local objects are containers with input_iterators defined, a segmented iterator may + be used to iterate over all the elements of all thread-local copies. + +@par combine and combine_each + - Both methods are defined for enumerable_thread_specific. + - combine() requires the type T have operator=() defined. + - neither method modifies the contents of the object (though there is no guarantee that the applied methods do not modify the object.) + - Both are evaluated in serial context (the methods are assumed to be non-benign.) + +@ingroup containers */ +template , + ets_key_usage_type ETS_key_type=ets_no_key > +class enumerable_thread_specific: ets_base { + + template friend class enumerable_thread_specific; + + using padded_element = padded>; + + //! 
A generic range, used to create range objects from the iterators + template + class generic_range_type: public blocked_range { + public: + using value_type = T; + using reference = T&; + using const_reference = const T&; + using iterator = I; + using difference_type = std::ptrdiff_t; + + generic_range_type( I begin_, I end_, std::size_t grainsize_ = 1) : blocked_range(begin_,end_,grainsize_) {} + template + generic_range_type( const generic_range_type& r) : blocked_range(r.begin(),r.end(),r.grainsize()) {} + generic_range_type( generic_range_type& r, split ) : blocked_range(r,split()) {} + }; + + using allocator_traits_type = tbb::detail::allocator_traits; + + using padded_allocator_type = typename allocator_traits_type::template rebind_alloc; + using internal_collection_type = tbb::concurrent_vector< padded_element, padded_allocator_type >; + + callback_base *my_construct_callback; + + internal_collection_type my_locals; + + // TODO: consider unifying the callback mechanism for all create_local* methods below + // (likely non-compatible and requires interface version increase) + void* create_local() override { + padded_element& lref = *my_locals.grow_by(1); + my_construct_callback->construct(lref.value()); + return lref.value_committed(); + } + + static void* create_local_by_copy( ets_base& base, void* p ) { + enumerable_thread_specific& ets = static_cast(base); + padded_element& lref = *ets.my_locals.grow_by(1); + new(lref.value()) T(*static_cast(p)); + return lref.value_committed(); + } + + static void* create_local_by_move( ets_base& base, void* p ) { + enumerable_thread_specific& ets = static_cast(base); + padded_element& lref = *ets.my_locals.grow_by(1); + new(lref.value()) T(std::move(*static_cast(p))); + return lref.value_committed(); + } + + using array_allocator_type = typename allocator_traits_type::template rebind_alloc; + + // _size is in bytes + void* create_array(std::size_t _size) override { + std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t); + return array_allocator_type().allocate(nelements); + } + + void free_array( void* _ptr, std::size_t _size) override { + std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t); + array_allocator_type().deallocate( reinterpret_cast(_ptr),nelements); + } + +public: + + //! Basic types + using value_type = T; + using allocator_type = Allocator; + using size_type = typename internal_collection_type::size_type; + using difference_type = typename internal_collection_type::difference_type; + using reference = value_type&; + using const_reference = const value_type&; + + using pointer = typename allocator_traits_type::pointer; + using const_pointer = typename allocator_traits_type::const_pointer; + + // Iterator types + using iterator = enumerable_thread_specific_iterator; + using const_iterator = enumerable_thread_specific_iterator; + + // Parallel range types + using range_type = generic_range_type; + using const_range_type = generic_range_type; + + //! Default constructor. Each local instance of T is default constructed. + enumerable_thread_specific() : my_construct_callback( + callback_leaf >::make(/*dummy argument*/0) + ){} + + //! Constructor with initializer functor. Each local instance of T is constructed by T(finit()). + template ::type>::value>::type> + explicit enumerable_thread_specific( Finit finit ) : my_construct_callback( + callback_leaf >::make( std::move(finit) ) + ){} + + //! Constructor with exemplar. Each local instance of T is copy-constructed from the exemplar. 
+ explicit enumerable_thread_specific( const T& exemplar ) : my_construct_callback( + callback_leaf >::make( exemplar ) + ){} + + explicit enumerable_thread_specific( T&& exemplar ) : my_construct_callback( + callback_leaf >::make( std::move(exemplar) ) + ){} + + //! Variadic constructor with initializer arguments. Each local instance of T is constructed by T(args...) + template ::type>::value + && !is_compatible_ets::type>::value + && !std::is_same::type>::value + >::type> + enumerable_thread_specific( P1&& arg1, P&& ... args ) : my_construct_callback( + callback_leaf >::make( std::forward(arg1), std::forward
<P>
(args)... ) + ){} + + //! Destructor + ~enumerable_thread_specific() { + if(my_construct_callback) my_construct_callback->destroy(); + // Deallocate the hash table before overridden free_array() becomes inaccessible + this->ets_base::table_clear(); + } + + //! returns reference to local, discarding exists + reference local() { + bool exists; + return local(exists); + } + + //! Returns reference to calling thread's local copy, creating one if necessary + reference local(bool& exists) { + void* ptr = this->table_lookup(exists); + return *(T*)ptr; + } + + //! Get the number of local copies + size_type size() const { return my_locals.size(); } + + //! true if there have been no local copies created + bool empty() const { return my_locals.empty(); } + + //! begin iterator + iterator begin() { return iterator( my_locals, 0 ); } + //! end iterator + iterator end() { return iterator(my_locals, my_locals.size() ); } + + //! begin const iterator + const_iterator begin() const { return const_iterator(my_locals, 0); } + + //! end const iterator + const_iterator end() const { return const_iterator(my_locals, my_locals.size()); } + + //! Get range for parallel algorithms + range_type range( std::size_t grainsize=1 ) { return range_type( begin(), end(), grainsize ); } + + //! Get const range for parallel algorithms + const_range_type range( std::size_t grainsize=1 ) const { return const_range_type( begin(), end(), grainsize ); } + + //! Destroys local copies + void clear() { + my_locals.clear(); + this->table_clear(); + // callback is not destroyed + } + +private: + template + void internal_copy(const enumerable_thread_specific& other) { + // this tests is_compatible_ets + static_assert( (is_compatible_ets::type>::value), "is_compatible_ets fails" ); + // Initialize my_construct_callback first, so that it is valid even if rest of this routine throws an exception. + my_construct_callback = other.my_construct_callback->clone(); + __TBB_ASSERT(my_locals.size()==0, nullptr); + my_locals.reserve(other.size()); + this->table_elementwise_copy( other, create_local_by_copy ); + } + + void internal_swap(enumerable_thread_specific& other) { + using std::swap; + __TBB_ASSERT( this!=&other, nullptr); + swap(my_construct_callback, other.my_construct_callback); + // concurrent_vector::swap() preserves storage space, + // so addresses to the vector kept in ETS hash table remain valid. 
+ swap(my_locals, other.my_locals); + this->ets_base::table_swap(other); + } + + template + void internal_move(enumerable_thread_specific&& other) { + static_assert( (is_compatible_ets::type>::value), "is_compatible_ets fails" ); + my_construct_callback = other.my_construct_callback; + other.my_construct_callback = nullptr; + __TBB_ASSERT(my_locals.size()==0, nullptr); + my_locals.reserve(other.size()); + this->table_elementwise_copy( other, create_local_by_move ); + } + +public: + enumerable_thread_specific( const enumerable_thread_specific& other ) + : ets_base() /* prevents GCC warnings with -Wextra */ + { + internal_copy(other); + } + + template + enumerable_thread_specific( const enumerable_thread_specific& other ) + { + internal_copy(other); + } + + enumerable_thread_specific( enumerable_thread_specific&& other ) : my_construct_callback() + { + // TODO: use internal_move correctly here + internal_swap(other); + } + + template + enumerable_thread_specific( enumerable_thread_specific&& other ) : my_construct_callback() + { + internal_move(std::move(other)); + } + + enumerable_thread_specific& operator=( const enumerable_thread_specific& other ) + { + if( this != &other ) { + this->clear(); + my_construct_callback->destroy(); + internal_copy( other ); + } + return *this; + } + + template + enumerable_thread_specific& operator=( const enumerable_thread_specific& other ) + { + __TBB_ASSERT( static_cast(this)!=static_cast(&other), nullptr); // Objects of different types + this->clear(); + my_construct_callback->destroy(); + internal_copy(other); + return *this; + } + + enumerable_thread_specific& operator=( enumerable_thread_specific&& other ) + { + if( this != &other ) { + // TODO: use internal_move correctly here + internal_swap(other); + } + return *this; + } + + template + enumerable_thread_specific& operator=( enumerable_thread_specific&& other ) + { + __TBB_ASSERT( static_cast(this)!=static_cast(&other), nullptr); // Objects of different types + this->clear(); + my_construct_callback->destroy(); + internal_move(std::move(other)); + return *this; + } + + // CombineFunc has signature T(T,T) or T(const T&, const T&) + template + T combine(CombineFunc f_combine) { + if(begin() == end()) { + ets_element location; + my_construct_callback->construct(location.value()); + return *location.value_committed(); + } + const_iterator ci = begin(); + T my_result = *ci; + while(++ci != end()) + my_result = f_combine( my_result, *ci ); + return my_result; + } + + // combine_func_t takes T by value or by [const] reference, and returns nothing + template + void combine_each(CombineFunc f_combine) { + for(iterator ci = begin(); ci != end(); ++ci) { + f_combine( *ci ); + } + } + +}; // enumerable_thread_specific + +template< typename Container > +class flattened2d { + // This intermediate typedef is to address issues with VC7.1 compilers + using conval_type = typename Container::value_type; + +public: + //! 
Basic types + using size_type = typename conval_type::size_type; + using difference_type = typename conval_type::difference_type; + using allocator_type = typename conval_type::allocator_type; + using value_type = typename conval_type::value_type; + using reference = typename conval_type::reference; + using const_reference = typename conval_type::const_reference; + using pointer = typename conval_type::pointer; + using const_pointer = typename conval_type::const_pointer; + + using iterator = segmented_iterator; + using const_iterator = segmented_iterator; + + flattened2d( const Container &c, typename Container::const_iterator b, typename Container::const_iterator e ) : + my_container(const_cast(&c)), my_begin(b), my_end(e) { } + + explicit flattened2d( const Container &c ) : + my_container(const_cast(&c)), my_begin(c.begin()), my_end(c.end()) { } + + iterator begin() { return iterator(*my_container) = my_begin; } + iterator end() { return iterator(*my_container) = my_end; } + const_iterator begin() const { return const_iterator(*my_container) = my_begin; } + const_iterator end() const { return const_iterator(*my_container) = my_end; } + + size_type size() const { + size_type tot_size = 0; + for(typename Container::const_iterator i = my_begin; i != my_end; ++i) { + tot_size += i->size(); + } + return tot_size; + } + +private: + Container *my_container; + typename Container::const_iterator my_begin; + typename Container::const_iterator my_end; +}; + +template +flattened2d flatten2d(const Container &c, const typename Container::const_iterator b, const typename Container::const_iterator e) { + return flattened2d(c, b, e); +} + +template +flattened2d flatten2d(const Container &c) { + return flattened2d(c); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::enumerable_thread_specific; +using detail::d1::flattened2d; +using detail::d1::flatten2d; +// ets enum keys +using detail::d1::ets_key_usage_type; +using detail::d1::ets_key_per_instance; +using detail::d1::ets_no_key; +#if __TBB_RESUMABLE_TASKS +using detail::d1::ets_suspend_aware; +#endif +} // inline namespace v1 + +} // namespace tbb + +#endif // __TBB_enumerable_thread_specific_H + diff --git a/third_party/tbb/environment.h b/third_party/tbb/environment.h new file mode 100644 index 000000000..15442a7f6 --- /dev/null +++ b/third_party/tbb/environment.h @@ -0,0 +1,82 @@ +// clang-format off +/* + Copyright (c) 2018-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_tbb_environment_H +#define __TBB_tbb_environment_H + +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cerrno" +#include "third_party/libcxx/cctype" + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_WIN8UI_SUPPORT +static inline bool GetBoolEnvironmentVariable( const char * ) { + return false; +} + +static inline long GetIntegralEnvironmentVariable( const char * ) { + return -1; +} +#else /* __TBB_WIN8UI_SUPPORT */ +static inline bool GetBoolEnvironmentVariable( const char * name ) { + if ( const char* s = std::getenv(name) ) { + // The result is defined as true only if the environment variable contains + // no characters except one '1' character and an arbitrary number of spaces + // (including the absence of spaces). + size_t index = std::strspn(s, " "); + if (s[index] != '1') return false; + index++; + // Memory access after incrementing is safe, since the getenv() returns a + // null-terminated string, and even if the character getting by index is '1', + // and this character is the end of string, after incrementing we will get + // an index of character, that contains '\0' + index += std::strspn(&s[index], " "); + return !s[index]; + } + return false; +} + +static inline long GetIntegralEnvironmentVariable( const char * name ) { + if ( const char* s = std::getenv(name) ) { + char* end = nullptr; + errno = 0; + long value = std::strtol(s, &end, 10); + + // We have exceeded the range, value is negative or string is incovertable + if ( errno == ERANGE || value < 0 || end==s ) { + return -1; + } + for ( ; *end != '\0'; end++ ) { + if ( !std::isspace(*end) ) { + return -1; + } + } + return value; + } + return -1; +} +#endif /* __TBB_WIN8UI_SUPPORT */ + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_tbb_environment_H diff --git a/third_party/tbb/exception.cpp b/third_party/tbb/exception.cpp new file mode 100644 index 000000000..7668598f4 --- /dev/null +++ b/third_party/tbb/exception.cpp @@ -0,0 +1,167 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/stdexcept" // std::runtime_error +#include "third_party/libcxx/new" +#include "third_party/libcxx/stdexcept" + +#define __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN \ + (__GLIBCXX__ && __TBB_GLIBCXX_VERSION>=40700 && __TBB_GLIBCXX_VERSION<60000 && TBB_USE_EXCEPTIONS) + +#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN +// GCC ABI declarations necessary for a workaround +// MISSING #include +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +const char* bad_last_alloc::what() const noexcept(true) { return "bad allocation in previous or concurrent attempt"; } +const char* user_abort::what() const noexcept(true) { return "User-initiated abort has terminated this operation"; } +const char* missing_wait::what() const noexcept(true) { return "wait() was not called on the structured_task_group"; } + +#if TBB_USE_EXCEPTIONS + template + /*[[noreturn]]*/ void do_throw_noexcept(F throw_func) noexcept { + throw_func(); + } + + /*[[noreturn]]*/ void do_throw_noexcept(void (*throw_func)()) noexcept { + throw_func(); +#if __GNUC__ == 7 + // In release, GCC 7 loses noexcept attribute during tail call optimization. + // The following statement prevents tail call optimization. + volatile bool reach_this_point = true; + suppress_unused_warning(reach_this_point); +#endif + } + + bool terminate_on_exception(); // defined in global_control.cpp and ipc_server.cpp + + template + /*[[noreturn]]*/ void do_throw(F throw_func) { + if (terminate_on_exception()) { + do_throw_noexcept(throw_func); + } + throw_func(); + } + + #define DO_THROW(exc, init_args) do_throw( []{ throw exc init_args; } ); +#else /* !TBB_USE_EXCEPTIONS */ + #define PRINT_ERROR_AND_ABORT(exc_name, msg) \ + std::fprintf (stderr, "Exception %s with message %s would have been thrown, " \ + "if exception handling had not been disabled. Aborting.\n", exc_name, msg); \ + std::fflush(stderr); \ + std::abort(); + #define DO_THROW(exc, init_args) PRINT_ERROR_AND_ABORT(#exc, #init_args) +#endif /* !TBB_USE_EXCEPTIONS */ + +void throw_exception ( exception_id eid ) { + switch ( eid ) { + case exception_id::bad_alloc: DO_THROW(std::bad_alloc, ()); break; + case exception_id::bad_last_alloc: DO_THROW(bad_last_alloc, ()); break; + case exception_id::user_abort: DO_THROW( user_abort, () ); break; + case exception_id::nonpositive_step: DO_THROW(std::invalid_argument, ("Step must be positive") ); break; + case exception_id::out_of_range: DO_THROW(std::out_of_range, ("Index out of requested size range")); break; + case exception_id::reservation_length_error: DO_THROW(std::length_error, ("Attempt to exceed implementation defined length limits")); break; + case exception_id::missing_wait: DO_THROW(missing_wait, ()); break; + case exception_id::invalid_load_factor: DO_THROW(std::out_of_range, ("Invalid hash load factor")); break; + case exception_id::invalid_key: DO_THROW(std::out_of_range, ("invalid key")); break; + case exception_id::bad_tagged_msg_cast: DO_THROW(std::runtime_error, ("Illegal tagged_msg cast")); break; + case exception_id::unsafe_wait: DO_THROW(unsafe_wait, ("Unsafe to wait further")); break; + default: __TBB_ASSERT ( false, "Unknown exception ID" ); + } + __TBB_ASSERT(false, "Unreachable code"); +} + +/* The "what" should be fairly short, not more than about 128 characters. 
+ Because we control all the call sites to handle_perror, it is pointless + to bullet-proof it for very long strings. + + Design note: ADR put this routine off to the side in tbb_misc.cpp instead of + Task.cpp because the throw generates a pathetic lot of code, and ADR wanted + this large chunk of code to be placed on a cold page. */ +void handle_perror( int error_code, const char* what ) { + const int BUF_SIZE = 255; + char buf[BUF_SIZE + 1] = { 0 }; + std::strncat(buf, what, BUF_SIZE); + std::size_t buf_len = std::strlen(buf); + if (error_code) { + std::strncat(buf, ": ", BUF_SIZE - buf_len); + buf_len = std::strlen(buf); + std::strncat(buf, std::strerror(error_code), BUF_SIZE - buf_len); + buf_len = std::strlen(buf); + } + __TBB_ASSERT(buf_len <= BUF_SIZE && buf[buf_len] == 0, nullptr); +#if TBB_USE_EXCEPTIONS + do_throw([&buf] { throw std::runtime_error(buf); }); +#else + PRINT_ERROR_AND_ABORT( "runtime_error", buf); +#endif /* !TBB_USE_EXCEPTIONS */ +} + +#if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN +// Runtime detection and workaround for the GCC bug 62258. +// The problem is that std::rethrow_exception() does not increment a counter +// of active exceptions, causing std::uncaught_exception() to return a wrong value. +// The code is created after, and roughly reflects, the workaround +// at https://gcc.gnu.org/bugzilla/attachment.cgi?id=34683 + +void fix_broken_rethrow() { + struct gcc_eh_data { + void * caughtExceptions; + unsigned int uncaughtExceptions; + }; + gcc_eh_data* eh_data = punned_cast( abi::__cxa_get_globals() ); + ++eh_data->uncaughtExceptions; +} + +bool gcc_rethrow_exception_broken() { + bool is_broken; + __TBB_ASSERT( !std::uncaught_exception(), + "gcc_rethrow_exception_broken() must not be called when an exception is active" ); + try { + // Throw, catch, and rethrow an exception + try { + throw __TBB_GLIBCXX_VERSION; + } catch(...) { + std::rethrow_exception( std::current_exception() ); + } + } catch(...) { + // Check the bug presence + is_broken = std::uncaught_exception(); + } + if( is_broken ) fix_broken_rethrow(); + __TBB_ASSERT( !std::uncaught_exception(), nullptr); + return is_broken; +} +#else +void fix_broken_rethrow() {} +bool gcc_rethrow_exception_broken() { return false; } +#endif /* __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN */ + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/flow_graph.h b/third_party/tbb/flow_graph.h new file mode 100644 index 000000000..7b0343ddc --- /dev/null +++ b/third_party/tbb/flow_graph.h @@ -0,0 +1,3377 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_flow_graph_H +#define __TBB_flow_graph_H + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/memory" +#include "third_party/libcxx/type_traits" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/null_mutex.h" +#include "third_party/tbb/spin_rw_mutex.h" +#include "third_party/tbb/null_rw_mutex.h" +#include "third_party/tbb/detail/_pipeline_filters.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_aggregator.h" +#include "third_party/tbb/detail/_allocator_traits.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/profiling.h" +#include "third_party/tbb/task_arena.h" + +#if TBB_USE_PROFILING_TOOLS && ( __unix__ || __APPLE__ ) + #if __INTEL_COMPILER + // Disabled warning "routine is both inline and noinline" + #pragma warning (push) + #pragma warning( disable: 2196 ) + #endif + #define __TBB_NOINLINE_SYM __attribute__((noinline)) +#else + #define __TBB_NOINLINE_SYM +#endif + +#include "third_party/libcxx/tuple" +#include "third_party/libcxx/list" +#include "third_party/libcxx/queue" +#if __TBB_CPP20_CONCEPTS_PRESENT +// MISSING #include +#endif + +/** @file + \brief The graph related classes and functions + + There are some applications that best express dependencies as messages + passed between nodes in a graph. These messages may contain data or + simply act as signals that a predecessors has completed. The graph + class and its associated node classes can be used to express such + applications. +*/ + +namespace tbb { +namespace detail { + +namespace d1 { + +//! An enumeration the provides the two most common concurrency levels: unlimited and serial +enum concurrency { unlimited = 0, serial = 1 }; + +//! A generic null type +struct null_type {}; + +//! An empty class used for messages that mean "I'm done" +class continue_msg {}; + +} // namespace d1 + +#if __TBB_CPP20_CONCEPTS_PRESENT +namespace d0 { + +template +concept node_body_return_type = std::same_as || + std::convertible_to; + +// TODO: consider using std::invocable here +template +concept continue_node_body = std::copy_constructible && + requires( Body& body, const tbb::detail::d1::continue_msg& v ) { + { body(v) } -> node_body_return_type; + }; + +template +concept function_node_body = std::copy_constructible && + std::invocable && + node_body_return_type, Output>; + +template +concept join_node_function_object = std::copy_constructible && + std::invocable && + std::convertible_to, Key>; + +template +concept input_node_body = std::copy_constructible && + requires( Body& body, tbb::detail::d1::flow_control& fc ) { + { body(fc) } -> adaptive_same_as; + }; + +template +concept multifunction_node_body = std::copy_constructible && + std::invocable; + +template +concept sequencer = std::copy_constructible && + std::invocable && + std::convertible_to, std::size_t>; + +template +concept async_node_body = std::copy_constructible && + std::invocable; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +namespace d1 { + +//! 
Forward declaration section +template< typename T > class sender; +template< typename T > class receiver; +class continue_receiver; + +template< typename T, typename U > class limiter_node; // needed for resetting decrementer + +template class successor_cache; +template class broadcast_cache; +template class round_robin_cache; +template class predecessor_cache; +template class reservable_predecessor_cache; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +namespace order { +struct following; +struct preceding; +} +template struct node_set; +#endif + + +} // namespace d1 +} // namespace detail +} // namespace tbb + +//! The graph class +#include "third_party/tbb/detail/_flow_graph_impl.h" + +namespace tbb { +namespace detail { +namespace d1 { + +static inline std::pair order_tasks(graph_task* first, graph_task* second) { + if (second->priority > first->priority) + return std::make_pair(second, first); + return std::make_pair(first, second); +} + +// submit task if necessary. Returns the non-enqueued task if there is one. +static inline graph_task* combine_tasks(graph& g, graph_task* left, graph_task* right) { + // if no RHS task, don't change left. + if (right == nullptr) return left; + // right != nullptr + if (left == nullptr) return right; + if (left == SUCCESSFULLY_ENQUEUED) return right; + // left contains a task + if (right != SUCCESSFULLY_ENQUEUED) { + // both are valid tasks + auto tasks_pair = order_tasks(left, right); + spawn_in_graph_arena(g, *tasks_pair.first); + return tasks_pair.second; + } + return left; +} + +//! Pure virtual template class that defines a sender of messages of type T +template< typename T > +class sender { +public: + virtual ~sender() {} + + //! Request an item from the sender + virtual bool try_get( T & ) { return false; } + + //! Reserves an item in the sender + virtual bool try_reserve( T & ) { return false; } + + //! Releases the reserved item + virtual bool try_release( ) { return false; } + + //! Consumes the reserved item + virtual bool try_consume( ) { return false; } + +protected: + //! The output type of this sender + typedef T output_type; + + //! The successor type for this node + typedef receiver successor_type; + + //! Add a new successor to this node + virtual bool register_successor( successor_type &r ) = 0; + + //! Removes a successor from this node + virtual bool remove_successor( successor_type &r ) = 0; + + template + friend bool register_successor(sender& s, receiver& r); + + template + friend bool remove_successor (sender& s, receiver& r); +}; // class sender + +template +bool register_successor(sender& s, receiver& r) { + return s.register_successor(r); +} + +template +bool remove_successor(sender& s, receiver& r) { + return s.remove_successor(r); +} + +//! Pure virtual template class that defines a receiver of messages of type T +template< typename T > +class receiver { +public: + //! Destructor + virtual ~receiver() {} + + //! Put an item to the receiver + bool try_put( const T& t ) { + graph_task *res = try_put_task(t); + if (!res) return false; + if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); + return true; + } + + //! put item to successor; return task to run the successor if possible. +protected: + //! The input type of this receiver + typedef T input_type; + + //! 
The predecessor type for this node + typedef sender predecessor_type; + + template< typename R, typename B > friend class run_and_put_task; + template< typename X, typename Y > friend class broadcast_cache; + template< typename X, typename Y > friend class round_robin_cache; + virtual graph_task *try_put_task(const T& t) = 0; + virtual graph& graph_reference() const = 0; + + template friend class successor_cache; + virtual bool is_continue_receiver() { return false; } + + // TODO revamp: reconsider the inheritance and move node priority out of receiver + virtual node_priority_t priority() const { return no_priority; } + + //! Add a predecessor to the node + virtual bool register_predecessor( predecessor_type & ) { return false; } + + //! Remove a predecessor from the node + virtual bool remove_predecessor( predecessor_type & ) { return false; } + + template + friend bool register_predecessor(receiver& r, sender& s); + template + friend bool remove_predecessor (receiver& r, sender& s); +}; // class receiver + +template +bool register_predecessor(receiver& r, sender& s) { + return r.register_predecessor(s); +} + +template +bool remove_predecessor(receiver& r, sender& s) { + return r.remove_predecessor(s); +} + +//! Base class for receivers of completion messages +/** These receivers automatically reset, but cannot be explicitly waited on */ +class continue_receiver : public receiver< continue_msg > { +protected: + + //! Constructor + explicit continue_receiver( int number_of_predecessors, node_priority_t a_priority ) { + my_predecessor_count = my_initial_predecessor_count = number_of_predecessors; + my_current_count = 0; + my_priority = a_priority; + } + + //! Copy constructor + continue_receiver( const continue_receiver& src ) : receiver() { + my_predecessor_count = my_initial_predecessor_count = src.my_initial_predecessor_count; + my_current_count = 0; + my_priority = src.my_priority; + } + + //! Increments the trigger threshold + bool register_predecessor( predecessor_type & ) override { + spin_mutex::scoped_lock l(my_mutex); + ++my_predecessor_count; + return true; + } + + //! Decrements the trigger threshold + /** Does not check to see if the removal of the predecessor now makes the current count + exceed the new threshold. So removing a predecessor while the graph is active can cause + unexpected results. */ + bool remove_predecessor( predecessor_type & ) override { + spin_mutex::scoped_lock l(my_mutex); + --my_predecessor_count; + return true; + } + + //! The input type + typedef continue_msg input_type; + + //! The predecessor type for this node + typedef receiver::predecessor_type predecessor_type; + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + // execute body is supposed to be too small to create a task for. + graph_task* try_put_task( const input_type & ) override { + { + spin_mutex::scoped_lock l(my_mutex); + if ( ++my_current_count < my_predecessor_count ) + return SUCCESSFULLY_ENQUEUED; + else + my_current_count = 0; + } + graph_task* res = execute(); + return res? 
res : SUCCESSFULLY_ENQUEUED; + } + + spin_mutex my_mutex; + int my_predecessor_count; + int my_current_count; + int my_initial_predecessor_count; + node_priority_t my_priority; + // the friend declaration in the base class did not eliminate the "protected class" + // error in gcc 4.1.2 + template friend class limiter_node; + + virtual void reset_receiver( reset_flags f ) { + my_current_count = 0; + if (f & rf_clear_edges) { + my_predecessor_count = my_initial_predecessor_count; + } + } + + //! Does whatever should happen when the threshold is reached + /** This should be very fast or else spawn a task. This is + called while the sender is blocked in the try_put(). */ + virtual graph_task* execute() = 0; + template friend class successor_cache; + bool is_continue_receiver() override { return true; } + + node_priority_t priority() const override { return my_priority; } +}; // class continue_receiver + +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + template + K key_from_message( const T &t ) { + return t.key(); + } +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + +} // d1 +} // detail +} // tbb + +#include "third_party/tbb/detail/_flow_graph_trace_impl.h" +#include "third_party/tbb/detail/_hash_compare.h" + +namespace tbb { +namespace detail { +namespace d1 { + +#include "third_party/tbb/detail/_flow_graph_body_impl.h" +#include "third_party/tbb/detail/_flow_graph_cache_impl.h" +#include "third_party/tbb/detail/_flow_graph_types_impl.h" + +using namespace graph_policy_namespace; + +template +graph_iterator::graph_iterator(C *g, bool begin) : my_graph(g), current_node(nullptr) +{ + if (begin) current_node = my_graph->my_nodes; + //else it is an end iterator by default +} + +template +typename graph_iterator::reference graph_iterator::operator*() const { + __TBB_ASSERT(current_node, "graph_iterator at end"); + return *operator->(); +} + +template +typename graph_iterator::pointer graph_iterator::operator->() const { + return current_node; +} + +template +void graph_iterator::internal_forward() { + if (current_node) current_node = current_node->next; +} + +//! 
Constructs a graph with isolated task_group_context +inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { + prepare_task_arena(); + own_context = true; + cancelled = false; + caught_exception = false; + my_context = new (r1::cache_aligned_allocate(sizeof(task_group_context))) task_group_context(FLOW_TASKS); + fgt_graph(this); + my_is_active = true; +} + +inline graph::graph(task_group_context& use_this_context) : + my_wait_context(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { + prepare_task_arena(); + own_context = false; + cancelled = false; + caught_exception = false; + fgt_graph(this); + my_is_active = true; +} + +inline graph::~graph() { + wait_for_all(); + if (own_context) { + my_context->~task_group_context(); + r1::cache_aligned_deallocate(my_context); + } + delete my_task_arena; +} + +inline void graph::reserve_wait() { + my_wait_context.reserve(); + fgt_reserve_wait(this); +} + +inline void graph::release_wait() { + fgt_release_wait(this); + my_wait_context.release(); +} + +inline void graph::register_node(graph_node *n) { + n->next = nullptr; + { + spin_mutex::scoped_lock lock(nodelist_mutex); + n->prev = my_nodes_last; + if (my_nodes_last) my_nodes_last->next = n; + my_nodes_last = n; + if (!my_nodes) my_nodes = n; + } +} + +inline void graph::remove_node(graph_node *n) { + { + spin_mutex::scoped_lock lock(nodelist_mutex); + __TBB_ASSERT(my_nodes && my_nodes_last, "graph::remove_node: Error: no registered nodes"); + if (n->prev) n->prev->next = n->next; + if (n->next) n->next->prev = n->prev; + if (my_nodes_last == n) my_nodes_last = n->prev; + if (my_nodes == n) my_nodes = n->next; + } + n->prev = n->next = nullptr; +} + +inline void graph::reset( reset_flags f ) { + // reset context + deactivate_graph(*this); + + my_context->reset(); + cancelled = false; + caught_exception = false; + // reset all the nodes comprising the graph + for(iterator ii = begin(); ii != end(); ++ii) { + graph_node *my_p = &(*ii); + my_p->reset_node(f); + } + // Reattach the arena. Might be useful to run the graph in a particular task_arena + // while not limiting graph lifetime to a single task_arena::execute() call. + prepare_task_arena( /*reinit=*/true ); + activate_graph(*this); +} + +inline void graph::cancel() { + my_context->cancel_group_execution(); +} + +inline graph::iterator graph::begin() { return iterator(this, true); } + +inline graph::iterator graph::end() { return iterator(this, false); } + +inline graph::const_iterator graph::begin() const { return const_iterator(this, true); } + +inline graph::const_iterator graph::end() const { return const_iterator(this, false); } + +inline graph::const_iterator graph::cbegin() const { return const_iterator(this, true); } + +inline graph::const_iterator graph::cend() const { return const_iterator(this, false); } + +inline graph_node::graph_node(graph& g) : my_graph(g) { + my_graph.register_node(this); +} + +inline graph_node::~graph_node() { + my_graph.remove_node(this); +} + +#include "third_party/tbb/detail/_flow_graph_node_impl.h" + + +//! An executable node that acts as a source, i.e. it has no predecessors + +template < typename Output > + __TBB_requires(std::copyable) +class input_node : public graph_node, public sender< Output > { +public: + //! The type of the output message, which is complete + typedef Output output_type; + + //! 
The type of successors of this node + typedef typename sender::successor_type successor_type; + + // Input node has no input type + typedef null_type input_type; + + //! Constructor for a node with a successor + template< typename Body > + __TBB_requires(input_node_body) + __TBB_NOINLINE_SYM input_node( graph &g, Body body ) + : graph_node(g), my_active(false) + , my_body( new input_body_leaf< output_type, Body>(body) ) + , my_init_body( new input_body_leaf< output_type, Body>(body) ) + , my_successors(this), my_reserved(false), my_has_cached_item(false) + { + fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph, + static_cast *>(this), this->my_body); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(input_node_body) + input_node( const node_set& successors, Body body ) + : input_node(successors.graph_reference(), body) + { + make_edges(*this, successors); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM input_node( const input_node& src ) + : graph_node(src.my_graph), sender() + , my_active(false) + , my_body(src.my_init_body->clone()), my_init_body(src.my_init_body->clone()) + , my_successors(this), my_reserved(false), my_has_cached_item(false) + { + fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph, + static_cast *>(this), this->my_body); + } + + //! The destructor + ~input_node() { delete my_body; delete my_init_body; } + + //! Add a new successor to this node + bool register_successor( successor_type &r ) override { + spin_mutex::scoped_lock lock(my_mutex); + my_successors.register_successor(r); + if ( my_active ) + spawn_put(); + return true; + } + + //! Removes a successor from this node + bool remove_successor( successor_type &r ) override { + spin_mutex::scoped_lock lock(my_mutex); + my_successors.remove_successor(r); + return true; + } + + //! Request an item from the node + bool try_get( output_type &v ) override { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_reserved ) + return false; + + if ( my_has_cached_item ) { + v = my_cached_item; + my_has_cached_item = false; + return true; + } + // we've been asked to provide an item, but we have none. enqueue a task to + // provide one. + if ( my_active ) + spawn_put(); + return false; + } + + //! Reserves an item. + bool try_reserve( output_type &v ) override { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_reserved ) { + return false; + } + + if ( my_has_cached_item ) { + v = my_cached_item; + my_reserved = true; + return true; + } else { + return false; + } + } + + //! Release a reserved item. + /** true = item has been released and so remains in sender, dest must request or reserve future items */ + bool try_release( ) override { + spin_mutex::scoped_lock lock(my_mutex); + __TBB_ASSERT( my_reserved && my_has_cached_item, "releasing non-existent reservation" ); + my_reserved = false; + if(!my_successors.empty()) + spawn_put(); + return true; + } + + //! Consumes a reserved item + bool try_consume( ) override { + spin_mutex::scoped_lock lock(my_mutex); + __TBB_ASSERT( my_reserved && my_has_cached_item, "consuming non-existent reservation" ); + my_reserved = false; + my_has_cached_item = false; + if ( !my_successors.empty() ) { + spawn_put(); + } + return true; + } + + //! 
Activates a node that was created in the inactive state + void activate() { + spin_mutex::scoped_lock lock(my_mutex); + my_active = true; + if (!my_successors.empty()) + spawn_put(); + } + + template + Body copy_function_object() { + input_body &body_ref = *this->my_body; + return dynamic_cast< input_body_leaf & >(body_ref).get_body(); + } + +protected: + + //! resets the input_node to its initial state + void reset_node( reset_flags f) override { + my_active = false; + my_reserved = false; + my_has_cached_item = false; + + if(f & rf_clear_edges) my_successors.clear(); + if(f & rf_reset_bodies) { + input_body *tmp = my_init_body->clone(); + delete my_body; + my_body = tmp; + } + } + +private: + spin_mutex my_mutex; + bool my_active; + input_body *my_body; + input_body *my_init_body; + broadcast_cache< output_type > my_successors; + bool my_reserved; + bool my_has_cached_item; + output_type my_cached_item; + + // used by apply_body_bypass, can invoke body of node. + bool try_reserve_apply_body(output_type &v) { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_reserved ) { + return false; + } + if ( !my_has_cached_item ) { + flow_control control; + + fgt_begin_body( my_body ); + + my_cached_item = (*my_body)(control); + my_has_cached_item = !control.is_pipeline_stopped; + + fgt_end_body( my_body ); + } + if ( my_has_cached_item ) { + v = my_cached_item; + my_reserved = true; + return true; + } else { + return false; + } + } + + graph_task* create_put_task() { + small_object_allocator allocator{}; + typedef input_node_task_bypass< input_node > task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + return t; + } + + //! Spawns a task that applies the body + void spawn_put( ) { + if(is_graph_active(this->my_graph)) { + spawn_in_graph_arena(this->my_graph, *create_put_task()); + } + } + + friend class input_node_task_bypass< input_node >; + //! Applies the body. Returning SUCCESSFULLY_ENQUEUED okay; forward_task_bypass will handle it. + graph_task* apply_body_bypass( ) { + output_type v; + if ( !try_reserve_apply_body(v) ) + return nullptr; + + graph_task *last_task = my_successors.try_put_task(v); + if ( last_task ) + try_consume(); + else + try_release(); + return last_task; + } +}; // class input_node + +//! Implements a function node that supports Input -> Output +template + __TBB_requires(std::default_initializable && + std::copy_constructible && + std::copy_constructible) +class function_node + : public graph_node + , public function_input< Input, Output, Policy, cache_aligned_allocator > + , public function_output +{ + typedef cache_aligned_allocator internals_allocator; + +public: + typedef Input input_type; + typedef Output output_type; + typedef function_input input_impl_type; + typedef function_input_queue input_queue_type; + typedef function_output fOutput_type; + typedef typename input_impl_type::predecessor_type predecessor_type; + typedef typename fOutput_type::successor_type successor_type; + + using input_impl_type::my_predecessors; + + //! Constructor + // input_queue_type is allocated here, but destroyed in the function_input_base. + // TODO: pass the graph_buffer_policy to the function_input_base so it can all + // be done in one place. This would be an interface-breaking change. 
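+    // The constructor below attaches the node to graph g, limits the number of
+    // concurrent body invocations to `concurrency` (the `unlimited` and `serial`
+    // constants from the concurrency enum are the common choices), stores a copy
+    // of `body`, and optionally takes a Policy tag plus a node_priority_t used
+    // when spawning this node's tasks.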
+ template< typename Body > + __TBB_requires(function_node_body) + __TBB_NOINLINE_SYM function_node( graph &g, size_t concurrency, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority ) + : graph_node(g), input_impl_type(g, concurrency, body, a_priority), + fOutput_type(g) { + fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this), this->my_body ); + } + + template + __TBB_requires(function_node_body) + function_node( graph& g, size_t concurrency, Body body, node_priority_t a_priority ) + : function_node(g, concurrency, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(function_node_body) + function_node( const node_set& nodes, size_t concurrency, Body body, + Policy p = Policy(), node_priority_t a_priority = no_priority ) + : function_node(nodes.graph_reference(), concurrency, body, p, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(function_node_body) + function_node( const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority ) + : function_node(nodes, concurrency, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + //! Copy constructor + __TBB_NOINLINE_SYM function_node( const function_node& src ) : + graph_node(src.my_graph), + input_impl_type(src), + fOutput_type(src.my_graph) { + fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this), this->my_body ); + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + using input_impl_type::try_put_task; + + broadcast_cache &successors () override { return fOutput_type::my_successors; } + + void reset_node(reset_flags f) override { + input_impl_type::reset_function_input(f); + // TODO: use clear() instead. + if(f & rf_clear_edges) { + successors().clear(); + my_predecessors.clear(); + } + __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "function_node successors not empty"); + __TBB_ASSERT(this->my_predecessors.empty(), "function_node predecessors not empty"); + } + +}; // class function_node + +//! implements a function node that supports Input -> (set of outputs) +// Output is a tuple of output types. 
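+// A minimal usage sketch for multifunction_node (illustrative only; it is user
+// code assuming `using namespace tbb::flow;`, the variable names are examples,
+// and make_edge/output_port<N> are the helpers provided elsewhere by the flow
+// graph API). The body receives the input plus a tuple of output ports and may
+// put to any subset of them:
+//
+//     using mf_node = multifunction_node<int, std::tuple<int, int>>;
+//     graph g;
+//     mf_node splitter(g, unlimited,
+//         [](const int& v, mf_node::output_ports_type& ports) {
+//             if (v % 2 == 0) std::get<0>(ports).try_put(v);   // even values
+//             else            std::get<1>(ports).try_put(v);   // odd values
+//         });
+//     queue_node<int> evens(g), odds(g);
+//     make_edge(output_port<0>(splitter), evens);
+//     make_edge(output_port<1>(splitter), odds);
+//     splitter.try_put(3);
+//     g.wait_for_all();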
+template + __TBB_requires(std::default_initializable && + std::copy_constructible) +class multifunction_node : + public graph_node, + public multifunction_input + < + Input, + typename wrap_tuple_elements< + std::tuple_size::value, // #elements in tuple + multifunction_output, // wrap this around each element + Output // the tuple providing the types + >::type, + Policy, + cache_aligned_allocator + > +{ + typedef cache_aligned_allocator internals_allocator; + +protected: + static const int N = std::tuple_size::value; +public: + typedef Input input_type; + typedef null_type output_type; + typedef typename wrap_tuple_elements::type output_ports_type; + typedef multifunction_input< + input_type, output_ports_type, Policy, internals_allocator> input_impl_type; + typedef function_input_queue input_queue_type; +private: + using input_impl_type::my_predecessors; +public: + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node( + graph &g, size_t concurrency, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : graph_node(g), input_impl_type(g, concurrency, body, a_priority) { + fgt_multioutput_node_with_body( + CODEPTR(), FLOW_MULTIFUNCTION_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body + ); + } + + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority) + : multifunction_node(g, concurrency, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node(const node_set& nodes, size_t concurrency, Body body, + Policy p = Policy(), node_priority_t a_priority = no_priority) + : multifunction_node(nodes.graph_reference(), concurrency, body, p, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(multifunction_node_body) + __TBB_NOINLINE_SYM multifunction_node(const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority) + : multifunction_node(nodes, concurrency, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + __TBB_NOINLINE_SYM multifunction_node( const multifunction_node &other) : + graph_node(other.my_graph), input_impl_type(other) { + fgt_multioutput_node_with_body( CODEPTR(), FLOW_MULTIFUNCTION_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body ); + } + + // all the guts are in multifunction_input... +protected: + void reset_node(reset_flags f) override { input_impl_type::reset(f); } +}; // multifunction_node + +//! split_node: accepts a tuple as input, forwards each element of the tuple to its +// successors. The node has unlimited concurrency, so it does not reject inputs. 
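+// A minimal usage sketch for split_node (illustrative only; user code assuming
+// `using namespace tbb::flow;`, with example variable names and the usual
+// make_edge/output_port<N> helpers from the flow graph API):
+//
+//     graph g;
+//     split_node<std::tuple<int, double>> s(g);
+//     queue_node<int> ints(g);
+//     queue_node<double> doubles(g);
+//     make_edge(output_port<0>(s), ints);
+//     make_edge(output_port<1>(s), doubles);
+//     s.try_put(std::make_tuple(1, 2.5));   // 1 goes to ints, 2.5 to doubles
+//     g.wait_for_all();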
+template +class split_node : public graph_node, public receiver { + static const int N = std::tuple_size::value; + typedef receiver base_type; +public: + typedef TupleType input_type; + typedef typename wrap_tuple_elements< + N, // #elements in tuple + multifunction_output, // wrap this around each element + TupleType // the tuple providing the types + >::type output_ports_type; + + __TBB_NOINLINE_SYM explicit split_node(graph &g) + : graph_node(g), + my_output_ports(init_output_ports::call(g, my_output_ports)) + { + fgt_multioutput_node(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph, + static_cast *>(this), this->output_ports()); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_NOINLINE_SYM split_node(const node_set& nodes) : split_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + __TBB_NOINLINE_SYM split_node(const split_node& other) + : graph_node(other.my_graph), base_type(other), + my_output_ports(init_output_ports::call(other.my_graph, my_output_ports)) + { + fgt_multioutput_node(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph, + static_cast *>(this), this->output_ports()); + } + + output_ports_type &output_ports() { return my_output_ports; } + +protected: + graph_task *try_put_task(const TupleType& t) override { + // Sending split messages in parallel is not justified, as overheads would prevail. + // Also, we do not have successors here. So we just tell the task returned here is successful. + return emit_element::emit_this(this->my_graph, t, output_ports()); + } + void reset_node(reset_flags f) override { + if (f & rf_clear_edges) + clear_element::clear_this(my_output_ports); + + __TBB_ASSERT(!(f & rf_clear_edges) || clear_element::this_empty(my_output_ports), "split_node reset failed"); + } + graph& graph_reference() const override { + return my_graph; + } + +private: + output_ports_type my_output_ports; +}; + +//! Implements an executable node that supports continue_msg -> Output +template > + __TBB_requires(std::copy_constructible) +class continue_node : public graph_node, public continue_input, + public function_output { +public: + typedef continue_msg input_type; + typedef Output output_type; + typedef continue_input input_impl_type; + typedef function_output fOutput_type; + typedef typename input_impl_type::predecessor_type predecessor_type; + typedef typename fOutput_type::successor_type successor_type; + + //! 
Constructor for executable node with continue_msg -> Output + template + __TBB_requires(continue_node_body) + __TBB_NOINLINE_SYM continue_node( + graph &g, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : graph_node(g), input_impl_type( g, body, a_priority ), + fOutput_type(g) { + fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, + + static_cast *>(this), + static_cast *>(this), this->my_body ); + } + + template + __TBB_requires(continue_node_body) + continue_node( graph& g, Body body, node_priority_t a_priority ) + : continue_node(g, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, Body body, + Policy p = Policy(), node_priority_t a_priority = no_priority ) + : continue_node(nodes.graph_reference(), body, p, a_priority ) { + make_edges_in_order(nodes, *this); + } + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, Body body, node_priority_t a_priority) + : continue_node(nodes, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + //! Constructor for executable node with continue_msg -> Output + template + __TBB_requires(continue_node_body) + __TBB_NOINLINE_SYM continue_node( + graph &g, int number_of_predecessors, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : graph_node(g) + , input_impl_type(g, number_of_predecessors, body, a_priority), + fOutput_type(g) { + fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, + static_cast *>(this), + static_cast *>(this), this->my_body ); + } + + template + __TBB_requires(continue_node_body) + continue_node( graph& g, int number_of_predecessors, Body body, node_priority_t a_priority) + : continue_node(g, number_of_predecessors, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, int number_of_predecessors, + Body body, Policy p = Policy(), node_priority_t a_priority = no_priority ) + : continue_node(nodes.graph_reference(), number_of_predecessors, body, p, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(continue_node_body) + continue_node( const node_set& nodes, int number_of_predecessors, + Body body, node_priority_t a_priority ) + : continue_node(nodes, number_of_predecessors, body, Policy(), a_priority) {} +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM continue_node( const continue_node& src ) : + graph_node(src.my_graph), input_impl_type(src), + function_output(src.my_graph) { + fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, + static_cast *>(this), + static_cast *>(this), this->my_body ); + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + using input_impl_type::try_put_task; + broadcast_cache &successors () override { return fOutput_type::my_successors; } + + void reset_node(reset_flags f) override { + input_impl_type::reset_receiver(f); + if(f & rf_clear_edges)successors().clear(); + __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "continue_node not reset"); + } +}; // continue_node + +//! 
Forwards messages of type T to all successors +template +class broadcast_node : public graph_node, public receiver, public sender { +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; +private: + broadcast_cache my_successors; +public: + + __TBB_NOINLINE_SYM explicit broadcast_node(graph& g) : graph_node(g), my_successors(this) { + fgt_node( CODEPTR(), FLOW_BROADCAST_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + broadcast_node(const node_set& nodes) : broadcast_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM broadcast_node( const broadcast_node& src ) : broadcast_node(src.my_graph) {} + + //! Adds a successor + bool register_successor( successor_type &r ) override { + my_successors.register_successor( r ); + return true; + } + + //! Removes s as a successor + bool remove_successor( successor_type &r ) override { + my_successors.remove_successor( r ); + return true; + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! build a task to run the successor if possible. Default is old behavior. + graph_task *try_put_task(const T& t) override { + graph_task *new_task = my_successors.try_put_task(t); + if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; + return new_task; + } + + graph& graph_reference() const override { + return my_graph; + } + + void reset_node(reset_flags f) override { + if (f&rf_clear_edges) { + my_successors.clear(); + } + __TBB_ASSERT(!(f & rf_clear_edges) || my_successors.empty(), "Error resetting broadcast_node"); + } +}; // broadcast_node + +//! 
Forwards messages in arbitrary order +template +class buffer_node + : public graph_node + , public reservable_item_buffer< T, cache_aligned_allocator > + , public receiver, public sender +{ + typedef cache_aligned_allocator internals_allocator; + +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + typedef buffer_node class_type; + +protected: + typedef size_t size_type; + round_robin_cache< T, null_rw_mutex > my_successors; + + friend class forward_task_bypass< class_type >; + + enum op_type {reg_succ, rem_succ, req_item, res_item, rel_res, con_res, put_item, try_fwd_task + }; + + // implements the aggregator_operation concept + class buffer_operation : public aggregated_operation< buffer_operation > { + public: + char type; + T* elem; + graph_task* ltask; + successor_type *r; + + buffer_operation(const T& e, op_type t) : type(char(t)) + , elem(const_cast(&e)) , ltask(nullptr) + , r(nullptr) + {} + buffer_operation(op_type t) : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr) {} + }; + + bool forwarder_busy; + typedef aggregating_functor handler_type; + friend class aggregating_functor; + aggregator< handler_type, buffer_operation> my_aggregator; + + virtual void handle_operations(buffer_operation *op_list) { + handle_operations_impl(op_list, this); + } + + template + void handle_operations_impl(buffer_operation *op_list, derived_type* derived) { + __TBB_ASSERT(static_cast(derived) == this, "'this' is not a base class for derived"); + + buffer_operation *tmp = nullptr; + bool try_forwarding = false; + while (op_list) { + tmp = op_list; + op_list = op_list->next; + switch (tmp->type) { + case reg_succ: internal_reg_succ(tmp); try_forwarding = true; break; + case rem_succ: internal_rem_succ(tmp); break; + case req_item: internal_pop(tmp); break; + case res_item: internal_reserve(tmp); break; + case rel_res: internal_release(tmp); try_forwarding = true; break; + case con_res: internal_consume(tmp); try_forwarding = true; break; + case put_item: try_forwarding = internal_push(tmp); break; + case try_fwd_task: internal_forward_task(tmp); break; + } + } + + derived->order(); + + if (try_forwarding && !forwarder_busy) { + if(is_graph_active(this->my_graph)) { + forwarder_busy = true; + typedef forward_task_bypass task_type; + small_object_allocator allocator{}; + graph_task* new_task = allocator.new_object(graph_reference(), allocator, *this); + my_graph.reserve_wait(); + // tmp should point to the last item handled by the aggregator. This is the operation + // the handling thread enqueued. So modifying that record will be okay. + // TODO revamp: check that the issue is still present + // workaround for icc bug (at least 12.0 and 13.0) + // error: function "tbb::flow::interfaceX::combine_tasks" cannot be called with the given argument list + // argument types are: (graph, graph_task *, graph_task *) + graph_task *z = tmp->ltask; + graph &g = this->my_graph; + tmp->ltask = combine_tasks(g, z, new_task); // in case the op generated a task + } + } + } // handle_operations + + inline graph_task *grab_forwarding_task( buffer_operation &op_data) { + return op_data.ltask; + } + + inline bool enqueue_forwarding_task(buffer_operation &op_data) { + graph_task *ft = grab_forwarding_task(op_data); + if(ft) { + spawn_in_graph_arena(graph_reference(), *ft); + return true; + } + return false; + } + + //! 
This is executed by an enqueued task, the "forwarder" + virtual graph_task *forward_task() { + buffer_operation op_data(try_fwd_task); + graph_task *last_task = nullptr; + do { + op_data.status = WAIT; + op_data.ltask = nullptr; + my_aggregator.execute(&op_data); + + // workaround for icc bug + graph_task *xtask = op_data.ltask; + graph& g = this->my_graph; + last_task = combine_tasks(g, last_task, xtask); + } while (op_data.status ==SUCCEEDED); + return last_task; + } + + //! Register successor + virtual void internal_reg_succ(buffer_operation *op) { + __TBB_ASSERT(op->r, nullptr); + my_successors.register_successor(*(op->r)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + + //! Remove successor + virtual void internal_rem_succ(buffer_operation *op) { + __TBB_ASSERT(op->r, nullptr); + my_successors.remove_successor(*(op->r)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + +private: + void order() {} + + bool is_item_valid() { + return this->my_item_valid(this->my_tail - 1); + } + + void try_put_and_add_task(graph_task*& last_task) { + graph_task *new_task = my_successors.try_put_task(this->back()); + if (new_task) { + // workaround for icc bug + graph& g = this->my_graph; + last_task = combine_tasks(g, last_task, new_task); + this->destroy_back(); + } + } + +protected: + //! Tries to forward valid items to successors + virtual void internal_forward_task(buffer_operation *op) { + internal_forward_task_impl(op, this); + } + + template + void internal_forward_task_impl(buffer_operation *op, derived_type* derived) { + __TBB_ASSERT(static_cast(derived) == this, "'this' is not a base class for derived"); + + if (this->my_reserved || !derived->is_item_valid()) { + op->status.store(FAILED, std::memory_order_release); + this->forwarder_busy = false; + return; + } + // Try forwarding, giving each successor a chance + graph_task* last_task = nullptr; + size_type counter = my_successors.size(); + for (; counter > 0 && derived->is_item_valid(); --counter) + derived->try_put_and_add_task(last_task); + + op->ltask = last_task; // return task + if (last_task && !counter) { + op->status.store(SUCCEEDED, std::memory_order_release); + } + else { + op->status.store(FAILED, std::memory_order_release); + forwarder_busy = false; + } + } + + virtual bool internal_push(buffer_operation *op) { + __TBB_ASSERT(op->elem, nullptr); + this->push_back(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + return true; + } + + virtual void internal_pop(buffer_operation *op) { + __TBB_ASSERT(op->elem, nullptr); + if(this->pop_back(*(op->elem))) { + op->status.store(SUCCEEDED, std::memory_order_release); + } + else { + op->status.store(FAILED, std::memory_order_release); + } + } + + virtual void internal_reserve(buffer_operation *op) { + __TBB_ASSERT(op->elem, nullptr); + if(this->reserve_front(*(op->elem))) { + op->status.store(SUCCEEDED, std::memory_order_release); + } + else { + op->status.store(FAILED, std::memory_order_release); + } + } + + virtual void internal_consume(buffer_operation *op) { + this->consume_front(); + op->status.store(SUCCEEDED, std::memory_order_release); + } + + virtual void internal_release(buffer_operation *op) { + this->release_front(); + op->status.store(SUCCEEDED, std::memory_order_release); + } + +public: + //! 
Constructor + __TBB_NOINLINE_SYM explicit buffer_node( graph &g ) + : graph_node(g), reservable_item_buffer(), receiver(), + sender(), my_successors(this), forwarder_busy(false) + { + my_aggregator.initialize_handler(handler_type(this)); + fgt_node( CODEPTR(), FLOW_BUFFER_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + buffer_node(const node_set& nodes) : buffer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM buffer_node( const buffer_node& src ) : buffer_node(src.my_graph) {} + + // + // message sender implementation + // + + //! Adds a new successor. + /** Adds successor r to the list of successors; may forward tasks. */ + bool register_successor( successor_type &r ) override { + buffer_operation op_data(reg_succ); + op_data.r = &r; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return true; + } + + //! Removes a successor. + /** Removes successor r from the list of successors. + It also calls r.remove_predecessor(*this) to remove this node as a predecessor. */ + bool remove_successor( successor_type &r ) override { + // TODO revamp: investigate why full qualification is necessary here + tbb::detail::d1::remove_predecessor(r, *this); + buffer_operation op_data(rem_succ); + op_data.r = &r; + my_aggregator.execute(&op_data); + // even though this operation does not cause a forward, if we are the handler, and + // a forward is scheduled, we may be the first to reach this point after the aggregator, + // and so should check for the task. + (void)enqueue_forwarding_task(op_data); + return true; + } + + //! Request an item from the buffer_node + /** true = v contains the returned item
+ false = no item has been returned */ + bool try_get( T &v ) override { + buffer_operation op_data(req_item); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return (op_data.status==SUCCEEDED); + } + + //! Reserves an item. + /** false = no item can be reserved
+ true = an item is reserved */ + bool try_reserve( T &v ) override { + buffer_operation op_data(res_item); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return (op_data.status==SUCCEEDED); + } + + //! Release a reserved item. + /** true = item has been released and so remains in sender */ + bool try_release() override { + buffer_operation op_data(rel_res); + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return true; + } + + //! Consumes a reserved item. + /** true = item is removed from sender and reservation removed */ + bool try_consume() override { + buffer_operation op_data(con_res); + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return true; + } + +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! receive an item, return a task *if possible + graph_task *try_put_task(const T &t) override { + buffer_operation op_data(t, put_item); + my_aggregator.execute(&op_data); + graph_task *ft = grab_forwarding_task(op_data); + // sequencer_nodes can return failure (if an item has been previously inserted) + // We have to spawn the returned task if our own operation fails. + + if(ft && op_data.status ==FAILED) { + // we haven't succeeded queueing the item, but for some reason the + // call returned a task (if another request resulted in a successful + // forward this could happen.) Queue the task and reset the pointer. + spawn_in_graph_arena(graph_reference(), *ft); ft = nullptr; + } + else if(!ft && op_data.status ==SUCCEEDED) { + ft = SUCCESSFULLY_ENQUEUED; + } + return ft; + } + + graph& graph_reference() const override { + return my_graph; + } + +protected: + void reset_node( reset_flags f) override { + reservable_item_buffer::reset(); + // TODO: just clear structures + if (f&rf_clear_edges) { + my_successors.clear(); + } + forwarder_busy = false; + } +}; // buffer_node + +//! 
Forwards messages in FIFO order +template +class queue_node : public buffer_node { +protected: + typedef buffer_node base_type; + typedef typename base_type::size_type size_type; + typedef typename base_type::buffer_operation queue_operation; + typedef queue_node class_type; + +private: + template friend class buffer_node; + + bool is_item_valid() { + return this->my_item_valid(this->my_head); + } + + void try_put_and_add_task(graph_task*& last_task) { + graph_task *new_task = this->my_successors.try_put_task(this->front()); + if (new_task) { + // workaround for icc bug + graph& graph_ref = this->graph_reference(); + last_task = combine_tasks(graph_ref, last_task, new_task); + this->destroy_front(); + } + } + +protected: + void internal_forward_task(queue_operation *op) override { + this->internal_forward_task_impl(op, this); + } + + void internal_pop(queue_operation *op) override { + if ( this->my_reserved || !this->my_item_valid(this->my_head)){ + op->status.store(FAILED, std::memory_order_release); + } + else { + this->pop_front(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + } + void internal_reserve(queue_operation *op) override { + if (this->my_reserved || !this->my_item_valid(this->my_head)) { + op->status.store(FAILED, std::memory_order_release); + } + else { + this->reserve_front(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + } + } + void internal_consume(queue_operation *op) override { + this->consume_front(); + op->status.store(SUCCEEDED, std::memory_order_release); + } + +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + __TBB_NOINLINE_SYM explicit queue_node( graph &g ) : base_type(g) { + fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + queue_node( const node_set& nodes) : queue_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM queue_node( const queue_node& src) : base_type(src) { + fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + + +protected: + void reset_node( reset_flags f) override { + base_type::reset_node(f); + } +}; // queue_node + +//! Forwards messages in sequence order +template + __TBB_requires(std::copyable) +class sequencer_node : public queue_node { + function_body< T, size_t > *my_sequencer; + // my_sequencer should be a benign function and must be callable + // from a parallel context. Does this mean it needn't be reset? +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + template< typename Sequencer > + __TBB_requires(sequencer) + __TBB_NOINLINE_SYM sequencer_node( graph &g, const Sequencer& s ) : queue_node(g), + my_sequencer(new function_body_leaf< T, size_t, Sequencer>(s) ) { + fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(sequencer) + sequencer_node( const node_set& nodes, const Sequencer& s) + : sequencer_node(nodes.graph_reference(), s) { + make_edges_in_order(nodes, *this); + } +#endif + + //! 
Copy constructor + __TBB_NOINLINE_SYM sequencer_node( const sequencer_node& src ) : queue_node(src), + my_sequencer( src.my_sequencer->clone() ) { + fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + + //! Destructor + ~sequencer_node() { delete my_sequencer; } + +protected: + typedef typename buffer_node::size_type size_type; + typedef typename buffer_node::buffer_operation sequencer_operation; + +private: + bool internal_push(sequencer_operation *op) override { + size_type tag = (*my_sequencer)(*(op->elem)); +#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES + if (tag < this->my_head) { + // have already emitted a message with this tag + op->status.store(FAILED, std::memory_order_release); + return false; + } +#endif + // cannot modify this->my_tail now; the buffer would be inconsistent. + size_t new_tail = (tag+1 > this->my_tail) ? tag+1 : this->my_tail; + + if (this->size(new_tail) > this->capacity()) { + this->grow_my_array(this->size(new_tail)); + } + this->my_tail = new_tail; + + const op_stat res = this->place_item(tag, *(op->elem)) ? SUCCEEDED : FAILED; + op->status.store(res, std::memory_order_release); + return res ==SUCCEEDED; + } +}; // sequencer_node + +//! Forwards messages in priority order +template> +class priority_queue_node : public buffer_node { +public: + typedef T input_type; + typedef T output_type; + typedef buffer_node base_type; + typedef priority_queue_node class_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + __TBB_NOINLINE_SYM explicit priority_queue_node( graph &g, const Compare& comp = Compare() ) + : buffer_node(g), compare(comp), mark(0) { + fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + priority_queue_node(const node_set& nodes, const Compare& comp = Compare()) + : priority_queue_node(nodes.graph_reference(), comp) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + __TBB_NOINLINE_SYM priority_queue_node( const priority_queue_node &src ) + : buffer_node(src), mark(0) + { + fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +protected: + + void reset_node( reset_flags f) override { + mark = 0; + base_type::reset_node(f); + } + + typedef typename buffer_node::size_type size_type; + typedef typename buffer_node::item_type item_type; + typedef typename buffer_node::buffer_operation prio_operation; + + //! 
Tries to forward valid items to successors + void internal_forward_task(prio_operation *op) override { + this->internal_forward_task_impl(op, this); + } + + void handle_operations(prio_operation *op_list) override { + this->handle_operations_impl(op_list, this); + } + + bool internal_push(prio_operation *op) override { + prio_push(*(op->elem)); + op->status.store(SUCCEEDED, std::memory_order_release); + return true; + } + + void internal_pop(prio_operation *op) override { + // if empty or already reserved, don't pop + if ( this->my_reserved == true || this->my_tail == 0 ) { + op->status.store(FAILED, std::memory_order_release); + return; + } + + *(op->elem) = prio(); + op->status.store(SUCCEEDED, std::memory_order_release); + prio_pop(); + + } + + // pops the highest-priority item, saves copy + void internal_reserve(prio_operation *op) override { + if (this->my_reserved == true || this->my_tail == 0) { + op->status.store(FAILED, std::memory_order_release); + return; + } + this->my_reserved = true; + *(op->elem) = prio(); + reserved_item = *(op->elem); + op->status.store(SUCCEEDED, std::memory_order_release); + prio_pop(); + } + + void internal_consume(prio_operation *op) override { + op->status.store(SUCCEEDED, std::memory_order_release); + this->my_reserved = false; + reserved_item = input_type(); + } + + void internal_release(prio_operation *op) override { + op->status.store(SUCCEEDED, std::memory_order_release); + prio_push(reserved_item); + this->my_reserved = false; + reserved_item = input_type(); + } + +private: + template friend class buffer_node; + + void order() { + if (mark < this->my_tail) heapify(); + __TBB_ASSERT(mark == this->my_tail, "mark unequal after heapify"); + } + + bool is_item_valid() { + return this->my_tail > 0; + } + + void try_put_and_add_task(graph_task*& last_task) { + graph_task * new_task = this->my_successors.try_put_task(this->prio()); + if (new_task) { + // workaround for icc bug + graph& graph_ref = this->graph_reference(); + last_task = combine_tasks(graph_ref, last_task, new_task); + prio_pop(); + } + } + +private: + Compare compare; + size_type mark; + + input_type reserved_item; + + // in case a reheap has not been done after a push, check if the mark item is higher than the 0'th item + bool prio_use_tail() { + __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds before test"); + return mark < this->my_tail && compare(this->get_my_item(0), this->get_my_item(this->my_tail - 1)); + } + + // prio_push: checks that the item will fit, expand array if necessary, put at end + void prio_push(const T &src) { + if ( this->my_tail >= this->my_array_size ) + this->grow_my_array( this->my_tail + 1 ); + (void) this->place_item(this->my_tail, src); + ++(this->my_tail); + __TBB_ASSERT(mark < this->my_tail, "mark outside bounds after push"); + } + + // prio_pop: deletes highest priority item from the array, and if it is item + // 0, move last item to 0 and reheap. If end of array, just destroy and decrement tail + // and mark. Assumes the array has already been tested for emptiness; no failure. 
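+    // Note: `mark` tracks how much of the array is currently heapified; elements
+    // at indices [mark, my_tail) were pushed since the last heapify()/reheap().
+    // prio_use_tail() compares the newest such element against the heap root so
+    // that a pop can be served directly from the un-heapified tail when it wins.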
+ void prio_pop() { + if (prio_use_tail()) { + // there are newly pushed elements; last one higher than top + // copy the data + this->destroy_item(this->my_tail-1); + --(this->my_tail); + __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop"); + return; + } + this->destroy_item(0); + if(this->my_tail > 1) { + // push the last element down heap + __TBB_ASSERT(this->my_item_valid(this->my_tail - 1), nullptr); + this->move_item(0,this->my_tail - 1); + } + --(this->my_tail); + if(mark > this->my_tail) --mark; + if (this->my_tail > 1) // don't reheap for heap of size 1 + reheap(); + __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop"); + } + + const T& prio() { + return this->get_my_item(prio_use_tail() ? this->my_tail-1 : 0); + } + + // turn array into heap + void heapify() { + if(this->my_tail == 0) { + mark = 0; + return; + } + if (!mark) mark = 1; + for (; markmy_tail; ++mark) { // for each unheaped element + size_type cur_pos = mark; + input_type to_place; + this->fetch_item(mark,to_place); + do { // push to_place up the heap + size_type parent = (cur_pos-1)>>1; + if (!compare(this->get_my_item(parent), to_place)) + break; + this->move_item(cur_pos, parent); + cur_pos = parent; + } while( cur_pos ); + (void) this->place_item(cur_pos, to_place); + } + } + + // otherwise heapified array with new root element; rearrange to heap + void reheap() { + size_type cur_pos=0, child=1; + while (child < mark) { + size_type target = child; + if (child+1get_my_item(child), + this->get_my_item(child+1))) + ++target; + // target now has the higher priority child + if (compare(this->get_my_item(target), + this->get_my_item(cur_pos))) + break; + // swap + this->swap_items(cur_pos, target); + cur_pos = target; + child = (cur_pos<<1)+1; + } + } +}; // priority_queue_node + +//! Forwards messages only if the threshold has not been reached +/** This node forwards items until its threshold is reached. + It contains no buffering. If the downstream node rejects, the + message is dropped. */ +template< typename T, typename DecrementType=continue_msg > +class limiter_node : public graph_node, public receiver< T >, public sender< T > { +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + //TODO: There is a lack of predefined types for its controlling "decrementer" port. It should be fixed later. + +private: + size_t my_threshold; + size_t my_count; // number of successful puts + size_t my_tries; // number of active put attempts + size_t my_future_decrement; // number of active decrement + reservable_predecessor_cache< T, spin_mutex > my_predecessors; + spin_mutex my_mutex; + broadcast_cache< T > my_successors; + + //! 
The internal receiver< DecrementType > that adjusts the count + threshold_regulator< limiter_node, DecrementType > decrement; + + graph_task* decrement_counter( long long delta ) { + if ( delta > 0 && size_t(delta) > my_threshold ) { + delta = my_threshold; + } + + { + spin_mutex::scoped_lock lock(my_mutex); + if ( delta > 0 && size_t(delta) > my_count ) { + if( my_tries > 0 ) { + my_future_decrement += (size_t(delta) - my_count); + } + my_count = 0; + } + else if ( delta < 0 && size_t(-delta) > my_threshold - my_count ) { + my_count = my_threshold; + } + else { + my_count -= size_t(delta); // absolute value of delta is sufficiently small + } + __TBB_ASSERT(my_count <= my_threshold, "counter values are truncated to be inside the [0, threshold] interval"); + } + return forward_task(); + } + + // Let threshold_regulator call decrement_counter() + friend class threshold_regulator< limiter_node, DecrementType >; + + friend class forward_task_bypass< limiter_node >; + + bool check_conditions() { // always called under lock + return ( my_count + my_tries < my_threshold && !my_predecessors.empty() && !my_successors.empty() ); + } + + // only returns a valid task pointer or nullptr, never SUCCESSFULLY_ENQUEUED + graph_task* forward_task() { + input_type v; + graph_task* rval = nullptr; + bool reserved = false; + + { + spin_mutex::scoped_lock lock(my_mutex); + if ( check_conditions() ) + ++my_tries; + else + return nullptr; + } + + //SUCCESS + // if we can reserve and can put, we consume the reservation + // we increment the count and decrement the tries + if ( (my_predecessors.try_reserve(v)) == true ) { + reserved = true; + if ( (rval = my_successors.try_put_task(v)) != nullptr ) { + { + spin_mutex::scoped_lock lock(my_mutex); + ++my_count; + if ( my_future_decrement ) { + if ( my_count > my_future_decrement ) { + my_count -= my_future_decrement; + my_future_decrement = 0; + } + else { + my_future_decrement -= my_count; + my_count = 0; + } + } + --my_tries; + my_predecessors.try_consume(); + if ( check_conditions() ) { + if ( is_graph_active(this->my_graph) ) { + typedef forward_task_bypass> task_type; + small_object_allocator allocator{}; + graph_task* rtask = allocator.new_object( my_graph, allocator, *this ); + my_graph.reserve_wait(); + spawn_in_graph_arena(graph_reference(), *rtask); + } + } + } + return rval; + } + } + //FAILURE + //if we can't reserve, we decrement the tries + //if we can reserve but can't put, we decrement the tries and release the reservation + { + spin_mutex::scoped_lock lock(my_mutex); + --my_tries; + if (reserved) my_predecessors.try_release(); + if ( check_conditions() ) { + if ( is_graph_active(this->my_graph) ) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + __TBB_ASSERT(!rval, "Have two tasks to handle"); + return t; + } + } + return rval; + } + } + + void initialize() { + fgt_node( + CODEPTR(), FLOW_LIMITER_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(&decrement), + static_cast *>(this) + ); + } + +public: + //! 
Constructor + limiter_node(graph &g, size_t threshold) + : graph_node(g), my_threshold(threshold), my_count(0), my_tries(0), my_future_decrement(0), + my_predecessors(this), my_successors(this), decrement(this) + { + initialize(); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + limiter_node(const node_set& nodes, size_t threshold) + : limiter_node(nodes.graph_reference(), threshold) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor + limiter_node( const limiter_node& src ) : limiter_node(src.my_graph, src.my_threshold) {} + + //! The interface for accessing internal receiver< DecrementType > that adjusts the count + receiver& decrementer() { return decrement; } + + //! Replace the current successor with this new successor + bool register_successor( successor_type &r ) override { + spin_mutex::scoped_lock lock(my_mutex); + bool was_empty = my_successors.empty(); + my_successors.register_successor(r); + //spawn a forward task if this is the only successor + if ( was_empty && !my_predecessors.empty() && my_count + my_tries < my_threshold ) { + if ( is_graph_active(this->my_graph) ) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + spawn_in_graph_arena(graph_reference(), *t); + } + } + return true; + } + + //! Removes a successor from this node + /** r.remove_predecessor(*this) is also called. */ + bool remove_successor( successor_type &r ) override { + // TODO revamp: investigate why qualification is needed for remove_predecessor() call + tbb::detail::d1::remove_predecessor(r, *this); + my_successors.remove_successor(r); + return true; + } + + //! Adds src to the list of cached predecessors. + bool register_predecessor( predecessor_type &src ) override { + spin_mutex::scoped_lock lock(my_mutex); + my_predecessors.add( src ); + if ( my_count + my_tries < my_threshold && !my_successors.empty() && is_graph_active(this->my_graph) ) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + graph_task* t = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + spawn_in_graph_arena(graph_reference(), *t); + } + return true; + } + + //! Removes src from the list of cached predecessors. + bool remove_predecessor( predecessor_type &src ) override { + my_predecessors.remove( src ); + return true; + } + +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! Puts an item to this receiver + graph_task* try_put_task( const T &t ) override { + { + spin_mutex::scoped_lock lock(my_mutex); + if ( my_count + my_tries >= my_threshold ) + return nullptr; + else + ++my_tries; + } + + graph_task* rtask = my_successors.try_put_task(t); + if ( !rtask ) { // try_put_task failed. 
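A usage sketch may clarify the threshold/decrementer interplay described above (illustration only; the include path follows this tree's layout). At most `threshold` items are in flight between the source and the sink, and each continue_msg delivered to the decrementer frees one slot:

// Illustrative sketch: throttle a fast source with limiter_node.
#include "third_party/tbb/flow_graph.h"

int main() {
    using namespace tbb::flow;
    graph g;
    int i = 0;
    input_node<int> src(g, [&](tbb::flow_control& fc) -> int {
        if (i >= 100) { fc.stop(); return 0; }
        return i++;
    });
    limiter_node<int> limiter(g, /*threshold=*/4);
    function_node<int, continue_msg> sink(g, unlimited,
        [](int) { /* do work */ return continue_msg(); });
    make_edge(src, limiter);
    make_edge(limiter, sink);
    make_edge(sink, limiter.decrementer());  // each finished item frees a slot
    src.activate();
    g.wait_for_all();
    return 0;
}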
+ spin_mutex::scoped_lock lock(my_mutex); + --my_tries; + if (check_conditions() && is_graph_active(this->my_graph)) { + small_object_allocator allocator{}; + typedef forward_task_bypass> task_type; + rtask = allocator.new_object(my_graph, allocator, *this); + my_graph.reserve_wait(); + } + } + else { + spin_mutex::scoped_lock lock(my_mutex); + ++my_count; + if ( my_future_decrement ) { + if ( my_count > my_future_decrement ) { + my_count -= my_future_decrement; + my_future_decrement = 0; + } + else { + my_future_decrement -= my_count; + my_count = 0; + } + } + --my_tries; + } + return rtask; + } + + graph& graph_reference() const override { return my_graph; } + + void reset_node( reset_flags f ) override { + my_count = 0; + if ( f & rf_clear_edges ) { + my_predecessors.clear(); + my_successors.clear(); + } + else { + my_predecessors.reset(); + } + decrement.reset_receiver(f); + } +}; // limiter_node + +#include "third_party/tbb/detail/_flow_graph_join_impl.h" + +template class join_node; + +template +class join_node: public unfolded_join_node::value, reserving_port, OutputTuple, reserving> { +private: + static const int N = std::tuple_size::value; + typedef unfolded_join_node unfolded_type; +public: + typedef OutputTuple output_type; + typedef typename unfolded_type::input_ports_type input_ports_type; + __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_NOINLINE_SYM join_node(const node_set& nodes, reserving = reserving()) : join_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class join_node: public unfolded_join_node::value, queueing_port, OutputTuple, queueing> { +private: + static const int N = std::tuple_size::value; + typedef unfolded_join_node unfolded_type; +public: + typedef OutputTuple output_type; + typedef typename unfolded_type::input_ports_type input_ports_type; + __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_NOINLINE_SYM join_node(const node_set& nodes, queueing = queueing()) : join_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +#if __TBB_CPP20_CONCEPTS_PRESENT +// Helper function which is well-formed only if all of the elements in OutputTuple +// satisfies join_node_function_object +template +void join_node_function_objects_helper( std::index_sequence ) + requires (std::tuple_size_v == sizeof...(Functions)) && + (... 
&& join_node_function_object, K>); + +template +concept join_node_functions = requires { + join_node_function_objects_helper(std::make_index_sequence{}); +}; + +#endif + +// template for key_matching join_node +// tag_matching join_node is a specialization of key_matching, and is source-compatible. +template +class join_node > : public unfolded_join_node::value, + key_matching_port, OutputTuple, key_matching > { +private: + static const int N = std::tuple_size::value; + typedef unfolded_join_node > unfolded_type; +public: + typedef OutputTuple output_type; + typedef typename unfolded_type::input_ports_type input_ports_type; + +#if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING + join_node(graph &g) : unfolded_type(g) {} +#endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ + + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1) : unfolded_type(g, b0, b1) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2) : unfolded_type(g, b0, b1, b2) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3) : unfolded_type(g, b0, b1, b2, b3) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4) : + unfolded_type(g, b0, b1, b2, b3, b4) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#if __TBB_VARIADIC_MAX >= 6 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5) : + unfolded_type(g, b0, b1, b2, b3, b4, b5) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 7 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6) : + unfolded_type(g, b0, b1, b2, b3, b4, b5, b6) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 8 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6, + __TBB_B7 b7) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 9 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, 
__TBB_B6 b6, + __TBB_B7 b7, __TBB_B8 b8) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif +#if __TBB_VARIADIC_MAX >= 10 + template + __TBB_requires(join_node_functions) + __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6, + __TBB_B7 b7, __TBB_B8 b8, __TBB_B9 b9) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +#endif + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template < +#if (__clang_major__ == 3 && __clang_minor__ == 4) + // clang 3.4 misdeduces 'Args...' for 'node_set' while it can cope with template template parameter. + template class node_set, +#endif + typename... Args, typename... Bodies + > + __TBB_requires((sizeof...(Bodies) == 0) || join_node_functions) + __TBB_NOINLINE_SYM join_node(const node_set& nodes, Bodies... bodies) + : join_node(nodes.graph_reference(), bodies...) { + make_edges_in_order(nodes, *this); + } +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +// indexer node +#include "third_party/tbb/detail/_flow_graph_indexer_impl.h" + +// TODO: Implement interface with variadic template or tuple +template class indexer_node; + +//indexer node specializations +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 1; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } +}; + +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 2; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class indexer_node : public 
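A usage sketch for the key_matching specialization above (illustration only; the order/payment record types are invented for the example): each input port is given a function that extracts the key from its message type, and messages with equal keys are combined into one output tuple.

// Illustrative sketch: pair records from two streams by a shared integer key.
#include "third_party/tbb/flow_graph.h"
#include <iostream>
#include <string>
#include <tuple>

struct order   { int id; std::string item; };
struct payment { int id; double amount; };

int main() {
    using namespace tbb::flow;
    graph g;
    join_node<std::tuple<order, payment>, key_matching<int>> join(
        g,
        [](const order& o)   { return o.id; },
        [](const payment& p) { return p.id; });
    function_node<std::tuple<order, payment>, continue_msg> sink(
        g, unlimited,
        [](const std::tuple<order, payment>& t) {
            std::cout << std::get<0>(t).item << " paid\n";
            return continue_msg();
        });
    make_edge(join, sink);
    input_port<0>(join).try_put(order{7, "book"});
    input_port<1>(join).try_put(payment{7, 12.5});
    g.wait_for_all();
    return 0;
}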
unfolded_indexer_node > { +private: + static const int N = 3; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 4; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 5; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; + +#if __TBB_VARIADIC_MAX >= 6 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 6; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 6 + +#if __TBB_VARIADIC_MAX >= 
7 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 7; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 7 + +#if __TBB_VARIADIC_MAX >= 8 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 8; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 8 + +#if __TBB_VARIADIC_MAX >= 9 +template +class indexer_node : public unfolded_indexer_node > { +private: + static const int N = 9; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 9 + +#if __TBB_VARIADIC_MAX >= 10 +template +class indexer_node/*default*/ : public unfolded_indexer_node > { +private: + static const int N = 10; +public: + typedef std::tuple InputTuple; + typedef tagged_msg output_type; + typedef unfolded_indexer_node unfolded_type; + __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + // Copy constructor + __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { + fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, 
&this->my_graph, + this->input_ports(), static_cast< sender< output_type > *>(this) ); + } + +}; +#endif //variadic max 10 + +template< typename T > +inline void internal_make_edge( sender &p, receiver &s ) { + register_successor(p, s); + fgt_make_edge( &p, &s ); +} + +//! Makes an edge between a single predecessor and a single successor +template< typename T > +inline void make_edge( sender &p, receiver &s ) { + internal_make_edge( p, s ); +} + +//Makes an edge from port 0 of a multi-output predecessor to port 0 of a multi-input successor. +template< typename T, typename V, + typename = typename T::output_ports_type, typename = typename V::input_ports_type > +inline void make_edge( T& output, V& input) { + make_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports())); +} + +//Makes an edge from port 0 of a multi-output predecessor to a receiver. +template< typename T, typename R, + typename = typename T::output_ports_type > +inline void make_edge( T& output, receiver& input) { + make_edge(std::get<0>(output.output_ports()), input); +} + +//Makes an edge from a sender to port 0 of a multi-input successor. +template< typename S, typename V, + typename = typename V::input_ports_type > +inline void make_edge( sender& output, V& input) { + make_edge(output, std::get<0>(input.input_ports())); +} + +template< typename T > +inline void internal_remove_edge( sender &p, receiver &s ) { + remove_successor( p, s ); + fgt_remove_edge( &p, &s ); +} + +//! Removes an edge between a single predecessor and a single successor +template< typename T > +inline void remove_edge( sender &p, receiver &s ) { + internal_remove_edge( p, s ); +} + +//Removes an edge between port 0 of a multi-output predecessor and port 0 of a multi-input successor. +template< typename T, typename V, + typename = typename T::output_ports_type, typename = typename V::input_ports_type > +inline void remove_edge( T& output, V& input) { + remove_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports())); +} + +//Removes an edge between port 0 of a multi-output predecessor and a receiver. +template< typename T, typename R, + typename = typename T::output_ports_type > +inline void remove_edge( T& output, receiver& input) { + remove_edge(std::get<0>(output.output_ports()), input); +} +//Removes an edge between a sender and port 0 of a multi-input successor. +template< typename S, typename V, + typename = typename V::input_ports_type > +inline void remove_edge( sender& output, V& input) { + remove_edge(output, std::get<0>(input.input_ports())); +} + +//! Returns a copy of the body from a function or continue node +template< typename Body, typename Node > +Body copy_body( Node &n ) { + return n.template copy_function_object(); +} + +//composite_node +template< typename InputTuple, typename OutputTuple > class composite_node; + +template< typename... InputTypes, typename... OutputTypes> +class composite_node , std::tuple > : public graph_node { + +public: + typedef std::tuple< receiver&... > input_ports_type; + typedef std::tuple< sender&... 
> output_ports_type; + +private: + std::unique_ptr my_input_ports; + std::unique_ptr my_output_ports; + + static const size_t NUM_INPUTS = sizeof...(InputTypes); + static const size_t NUM_OUTPUTS = sizeof...(OutputTypes); + +protected: + void reset_node(reset_flags) override {} + +public: + composite_node( graph &g ) : graph_node(g) { + fgt_multiinput_multioutput_node( CODEPTR(), FLOW_COMPOSITE_NODE, this, &this->my_graph ); + } + + template + void set_external_ports(T1&& input_ports_tuple, T2&& output_ports_tuple) { + static_assert(NUM_INPUTS == std::tuple_size::value, "number of arguments does not match number of input ports"); + static_assert(NUM_OUTPUTS == std::tuple_size::value, "number of arguments does not match number of output ports"); + + fgt_internal_input_alias_helper::alias_port( this, input_ports_tuple); + fgt_internal_output_alias_helper::alias_port( this, output_ports_tuple); + + my_input_ports.reset( new input_ports_type(std::forward(input_ports_tuple)) ); + my_output_ports.reset( new output_ports_type(std::forward(output_ports_tuple)) ); + } + + template< typename... NodeTypes > + void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } + + template< typename... NodeTypes > + void add_nodes(const NodeTypes&... n) { add_nodes_impl(this, false, n...); } + + + input_ports_type& input_ports() { + __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports"); + return *my_input_ports; + } + + output_ports_type& output_ports() { + __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports"); + return *my_output_ports; + } +}; // class composite_node + +//composite_node with only input ports +template< typename... InputTypes> +class composite_node , std::tuple<> > : public graph_node { +public: + typedef std::tuple< receiver&... > input_ports_type; + +private: + std::unique_ptr my_input_ports; + static const size_t NUM_INPUTS = sizeof...(InputTypes); + +protected: + void reset_node(reset_flags) override {} + +public: + composite_node( graph &g ) : graph_node(g) { + fgt_composite( CODEPTR(), this, &g ); + } + + template + void set_external_ports(T&& input_ports_tuple) { + static_assert(NUM_INPUTS == std::tuple_size::value, "number of arguments does not match number of input ports"); + + fgt_internal_input_alias_helper::alias_port( this, input_ports_tuple); + + my_input_ports.reset( new input_ports_type(std::forward(input_ports_tuple)) ); + } + + template< typename... NodeTypes > + void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } + + template< typename... NodeTypes > + void add_nodes( const NodeTypes&... n) { add_nodes_impl(this, false, n...); } + + + input_ports_type& input_ports() { + __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports"); + return *my_input_ports; + } + +}; // class composite_node + +//composite_nodes with only output_ports +template +class composite_node , std::tuple > : public graph_node { +public: + typedef std::tuple< sender&... 
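A usage sketch for composite_node (illustration only; the `doubler` class is invented for the example): internal nodes are wired as usual, then exposed through set_external_ports so the composite behaves like any other node with input_port<N> and output_port<N>.

// Illustrative sketch: wrap a function_node and a queue_node in one composite.
#include "third_party/tbb/flow_graph.h"
#include <iostream>
#include <tuple>

using namespace tbb::flow;

class doubler : public composite_node<std::tuple<int>, std::tuple<int>> {
    typedef composite_node<std::tuple<int>, std::tuple<int>> base_type;
    function_node<int, int> times_two;
    queue_node<int> results;
public:
    doubler(graph& g)
        : base_type(g),
          times_two(g, unlimited, [](int v) { return 2 * v; }),
          results(g) {
        make_edge(times_two, results);
        base_type::set_external_ports(
            base_type::input_ports_type(times_two),
            base_type::output_ports_type(results));
    }
};

int main() {
    graph g;
    doubler d(g);
    function_node<int, continue_msg> printer(g, serial,
        [](int v) { std::cout << "doubled: " << v << "\n"; return continue_msg(); });
    make_edge(output_port<0>(d), printer);
    input_port<0>(d).try_put(21);  // prints "doubled: 42"
    g.wait_for_all();
    return 0;
}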
> output_ports_type; + +private: + std::unique_ptr my_output_ports; + static const size_t NUM_OUTPUTS = sizeof...(OutputTypes); + +protected: + void reset_node(reset_flags) override {} + +public: + __TBB_NOINLINE_SYM composite_node( graph &g ) : graph_node(g) { + fgt_composite( CODEPTR(), this, &g ); + } + + template + void set_external_ports(T&& output_ports_tuple) { + static_assert(NUM_OUTPUTS == std::tuple_size::value, "number of arguments does not match number of output ports"); + + fgt_internal_output_alias_helper::alias_port( this, output_ports_tuple); + + my_output_ports.reset( new output_ports_type(std::forward(output_ports_tuple)) ); + } + + template + void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } + + template + void add_nodes(const NodeTypes&... n) { add_nodes_impl(this, false, n...); } + + + output_ports_type& output_ports() { + __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports"); + return *my_output_ports; + } + +}; // class composite_node + +template +class async_body_base: no_assign { +public: + typedef Gateway gateway_type; + + async_body_base(gateway_type *gateway): my_gateway(gateway) { } + void set_gateway(gateway_type *gateway) { + my_gateway = gateway; + } + +protected: + gateway_type *my_gateway; +}; + +template +class async_body: public async_body_base { +private: + Body my_body; + +public: + typedef async_body_base base_type; + typedef Gateway gateway_type; + + async_body(const Body &body, gateway_type *gateway) + : base_type(gateway), my_body(body) { } + + void operator()( const Input &v, Ports & ) noexcept(noexcept(tbb::detail::invoke(my_body, v, std::declval()))) { + tbb::detail::invoke(my_body, v, *this->my_gateway); + } + + Body get_body() { return my_body; } +}; + +//! Implements async node +template < typename Input, typename Output, + typename Policy = queueing_lightweight > + __TBB_requires(std::default_initializable && std::copy_constructible) +class async_node + : public multifunction_node< Input, std::tuple< Output >, Policy >, public sender< Output > +{ + typedef multifunction_node< Input, std::tuple< Output >, Policy > base_type; + typedef multifunction_input< + Input, typename base_type::output_ports_type, Policy, cache_aligned_allocator> mfn_input_type; + +public: + typedef Input input_type; + typedef Output output_type; + typedef receiver receiver_type; + typedef receiver successor_type; + typedef sender predecessor_type; + typedef receiver_gateway gateway_type; + typedef async_body_base async_body_base_type; + typedef typename base_type::output_ports_type output_ports_type; + +private: + class receiver_gateway_impl: public receiver_gateway { + public: + receiver_gateway_impl(async_node* node): my_node(node) {} + void reserve_wait() override { + fgt_async_reserve(static_cast(my_node), &my_node->my_graph); + my_node->my_graph.reserve_wait(); + } + + void release_wait() override { + async_node* n = my_node; + graph* g = &n->my_graph; + g->release_wait(); + fgt_async_commit(static_cast(n), g); + } + + //! Implements gateway_type::try_put for an external activity to submit a message to FG + bool try_put(const Output &i) override { + return my_node->try_put_impl(i); + } + + private: + async_node* my_node; + } my_gateway; + + //The substitute of 'this' for member construction, to prevent compiler warnings + async_node* self() { return this; } + + //! 
Implements gateway_type::try_put for an external activity to submit a message to FG + bool try_put_impl(const Output &i) { + multifunction_output &port_0 = output_port<0>(*this); + broadcast_cache& port_successors = port_0.successors(); + fgt_async_try_put_begin(this, &port_0); + // TODO revamp: change to std::list + graph_task_list tasks; + bool is_at_least_one_put_successful = port_successors.gather_successful_try_puts(i, tasks); + __TBB_ASSERT( is_at_least_one_put_successful || tasks.empty(), + "Return status is inconsistent with the method operation." ); + + while( !tasks.empty() ) { + enqueue_in_graph_arena(this->my_graph, tasks.pop_front()); + } + fgt_async_try_put_end(this, &port_0); + return is_at_least_one_put_successful; + } + +public: + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node( + graph &g, size_t concurrency, + Body body, Policy = Policy(), node_priority_t a_priority = no_priority + ) : base_type( + g, concurrency, + async_body + (body, &my_gateway), a_priority ), my_gateway(self()) { + fgt_multioutput_node_with_body<1>( + CODEPTR(), FLOW_ASYNC_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body + ); + } + + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority) + : async_node(g, concurrency, body, Policy(), a_priority) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node( + const node_set& nodes, size_t concurrency, Body body, + Policy = Policy(), node_priority_t a_priority = no_priority ) + : async_node(nodes.graph_reference(), concurrency, body, a_priority) { + make_edges_in_order(nodes, *this); + } + + template + __TBB_requires(async_node_body) + __TBB_NOINLINE_SYM async_node(const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority) + : async_node(nodes, concurrency, body, Policy(), a_priority) {} +#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + + __TBB_NOINLINE_SYM async_node( const async_node &other ) : base_type(other), sender(), my_gateway(self()) { + static_cast(this->my_body->get_body_ptr())->set_gateway(&my_gateway); + static_cast(this->my_init_body->get_body_ptr())->set_gateway(&my_gateway); + + fgt_multioutput_node_with_body<1>( CODEPTR(), FLOW_ASYNC_NODE, + &this->my_graph, static_cast *>(this), + this->output_ports(), this->my_body ); + } + + gateway_type& gateway() { + return my_gateway; + } + + // Define sender< Output > + + //! Add a new successor to this node + bool register_successor(successor_type&) override { + __TBB_ASSERT(false, "Successors must be registered only via ports"); + return false; + } + + //! 
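A usage sketch for async_node and its gateway (illustration only; the detached squaring thread stands in for any external activity): reserve_wait keeps wait_for_all from returning while results are still pending outside the graph, and try_put on the gateway re-injects them.

// Illustrative sketch: hand work to a user-managed thread, return the result
// to the graph through the gateway.
#include "third_party/tbb/flow_graph.h"
#include <thread>

int main() {
    using namespace tbb::flow;
    graph g;
    typedef async_node<int, int> async_t;
    async_t async(g, unlimited,
        [](const int& input, async_t::gateway_type& gw) {
            gw.reserve_wait();                 // keep the graph alive
            std::thread([input, &gw] {
                gw.try_put(input * input);     // submit the result back
                gw.release_wait();
            }).detach();
        });
    function_node<int, continue_msg> sink(g, serial,
        [](int) { return continue_msg(); });
    make_edge(output_port<0>(async), sink);
    async.try_put(7);
    g.wait_for_all();                          // returns after release_wait()
    return 0;
}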
Removes a successor from this node + bool remove_successor(successor_type&) override { + __TBB_ASSERT(false, "Successors must be removed only via ports"); + return false; + } + + template + Body copy_function_object() { + typedef multifunction_body mfn_body_type; + typedef async_body async_body_type; + mfn_body_type &body_ref = *this->my_body; + async_body_type ab = *static_cast(dynamic_cast< multifunction_body_leaf & >(body_ref).get_body_ptr()); + return ab.get_body(); + } + +protected: + + void reset_node( reset_flags f) override { + base_type::reset_node(f); + } +}; + +#include "third_party/tbb/detail/_flow_graph_node_set_impl.h" + +template< typename T > +class overwrite_node : public graph_node, public receiver, public sender { +public: + typedef T input_type; + typedef T output_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + __TBB_NOINLINE_SYM explicit overwrite_node(graph &g) + : graph_node(g), my_successors(this), my_buffer_is_valid(false) + { + fgt_node( CODEPTR(), FLOW_OVERWRITE_NODE, &this->my_graph, + static_cast *>(this), static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + overwrite_node(const node_set& nodes) : overwrite_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor; doesn't take anything from src; default won't work + __TBB_NOINLINE_SYM overwrite_node( const overwrite_node& src ) : overwrite_node(src.my_graph) {} + + ~overwrite_node() {} + + bool register_successor( successor_type &s ) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid && is_graph_active( my_graph )) { + // We have a valid value that must be forwarded immediately. + bool ret = s.try_put( my_buffer ); + if ( ret ) { + // We add the successor that accepted our put + my_successors.register_successor( s ); + } else { + // In case of reservation a race between the moment of reservation and register_successor can appear, + // because failed reserve does not mean that register_successor is not ready to put a message immediately. + // We have some sort of infinite loop: reserving node tries to set pull state for the edge, + // but overwrite_node tries to return push state back. That is why we have to break this loop with task creation. + small_object_allocator allocator{}; + typedef register_predecessor_task task_type; + graph_task* t = allocator.new_object(graph_reference(), allocator, *this, s); + graph_reference().reserve_wait(); + spawn_in_graph_arena( my_graph, *t ); + } + } else { + // No valid value yet, just add as successor + my_successors.register_successor( s ); + } + return true; + } + + bool remove_successor( successor_type &s ) override { + spin_mutex::scoped_lock l( my_mutex ); + my_successors.remove_successor(s); + return true; + } + + bool try_get( input_type &v ) override { + spin_mutex::scoped_lock l( my_mutex ); + if ( my_buffer_is_valid ) { + v = my_buffer; + return true; + } + return false; + } + + //! Reserves an item + bool try_reserve( T &v ) override { + return try_get(v); + } + + //! Releases the reserved item + bool try_release() override { return true; } + + //! 
Consumes the reserved item + bool try_consume() override { return true; } + + bool is_valid() { + spin_mutex::scoped_lock l( my_mutex ); + return my_buffer_is_valid; + } + + void clear() { + spin_mutex::scoped_lock l( my_mutex ); + my_buffer_is_valid = false; + } + +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task* try_put_task( const input_type &v ) override { + spin_mutex::scoped_lock l( my_mutex ); + return try_put_task_impl(v); + } + + graph_task * try_put_task_impl(const input_type &v) { + my_buffer = v; + my_buffer_is_valid = true; + graph_task* rtask = my_successors.try_put_task(v); + if (!rtask) rtask = SUCCESSFULLY_ENQUEUED; + return rtask; + } + + graph& graph_reference() const override { + return my_graph; + } + + //! Breaks an infinite loop between the node reservation and register_successor call + struct register_predecessor_task : public graph_task { + register_predecessor_task( + graph& g, small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) + : graph_task(g, allocator), o(owner), s(succ) {}; + + task* execute(execution_data& ed) override { + // TODO revamp: investigate why qualification is needed for register_successor() call + using tbb::detail::d1::register_predecessor; + using tbb::detail::d1::register_successor; + if ( !register_predecessor(s, o) ) { + register_successor(o, s); + } + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + predecessor_type& o; + successor_type& s; + }; + + spin_mutex my_mutex; + broadcast_cache< input_type, null_rw_mutex > my_successors; + input_type my_buffer; + bool my_buffer_is_valid; + + void reset_node( reset_flags f) override { + my_buffer_is_valid = false; + if (f&rf_clear_edges) { + my_successors.clear(); + } + } +}; // overwrite_node + +template< typename T > +class write_once_node : public overwrite_node { +public: + typedef T input_type; + typedef T output_type; + typedef overwrite_node base_type; + typedef typename receiver::predecessor_type predecessor_type; + typedef typename sender::successor_type successor_type; + + //! Constructor + __TBB_NOINLINE_SYM explicit write_once_node(graph& g) : base_type(g) { + fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + template + write_once_node(const node_set& nodes) : write_once_node(nodes.graph_reference()) { + make_edges_in_order(nodes, *this); + } +#endif + + //! Copy constructor: call base class copy constructor + __TBB_NOINLINE_SYM write_once_node( const write_once_node& src ) : base_type(src) { + fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph), + static_cast *>(this), + static_cast *>(this) ); + } + +protected: + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + graph_task *try_put_task( const T &v ) override { + spin_mutex::scoped_lock l( this->my_mutex ); + return this->my_buffer_is_valid ? 
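A usage sketch contrasting overwrite_node with write_once_node (illustration only): the former keeps the most recent value and can be cleared, the latter keeps the first value it accepts and rejects later puts until cleared.

// Illustrative sketch of single-value buffering semantics.
#include "third_party/tbb/flow_graph.h"
#include <cassert>

int main() {
    using namespace tbb::flow;
    graph g;
    overwrite_node<int> latest(g);
    write_once_node<int> first(g);

    latest.try_put(1);
    latest.try_put(2);   // overwrites the buffered value
    first.try_put(1);
    first.try_put(2);    // rejected: a value is already buffered
    g.wait_for_all();

    int v = 0;
    assert(latest.try_get(v) && v == 2);  // try_get copies, does not consume
    assert(first.try_get(v) && v == 1);
    assert(latest.is_valid());
    latest.clear();                       // invalidate the buffer
    assert(!latest.is_valid());
    return 0;
}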
nullptr : this->try_put_task_impl(v); + } +}; // write_once_node + +inline void set_name(const graph& g, const char *name) { + fgt_graph_desc(&g, name); +} + +template +inline void set_name(const input_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const function_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const continue_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const broadcast_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const buffer_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const queue_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const sequencer_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const priority_queue_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const limiter_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const join_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const indexer_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const overwrite_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const write_once_node& node, const char *name) { + fgt_node_desc(&node, name); +} + +template +inline void set_name(const multifunction_node& node, const char *name) { + fgt_multioutput_node_desc(&node, name); +} + +template +inline void set_name(const split_node& node, const char *name) { + fgt_multioutput_node_desc(&node, name); +} + +template< typename InputTuple, typename OutputTuple > +inline void set_name(const composite_node& node, const char *name) { + fgt_multiinput_multioutput_node_desc(&node, name); +} + +template +inline void set_name(const async_node& node, const char *name) +{ + fgt_multioutput_node_desc(&node, name); +} +} // d1 +} // detail +} // tbb + + +// Include deduction guides for node classes +#include "third_party/tbb/detail/_flow_graph_nodes_deduction.h" + +namespace tbb { +namespace flow { +inline namespace v1 { + using detail::d1::receiver; + using detail::d1::sender; + + using detail::d1::serial; + using detail::d1::unlimited; + + using detail::d1::reset_flags; + using detail::d1::rf_reset_protocol; + using detail::d1::rf_reset_bodies; + using detail::d1::rf_clear_edges; + + using detail::d1::graph; + using detail::d1::graph_node; + using detail::d1::continue_msg; + + using detail::d1::input_node; + using detail::d1::function_node; + using detail::d1::multifunction_node; + using detail::d1::split_node; + using detail::d1::output_port; + using detail::d1::indexer_node; + using detail::d1::tagged_msg; + using detail::d1::cast_to; + using detail::d1::is_a; + using detail::d1::continue_node; + using detail::d1::overwrite_node; + using detail::d1::write_once_node; + using detail::d1::broadcast_node; + using detail::d1::buffer_node; + using detail::d1::queue_node; + using detail::d1::sequencer_node; + using detail::d1::priority_queue_node; + using detail::d1::limiter_node; + using namespace detail::d1::graph_policy_namespace; + using detail::d1::join_node; + using detail::d1::input_port; + using detail::d1::copy_body; + using 
detail::d1::make_edge; + using detail::d1::remove_edge; + using detail::d1::tag_value; + using detail::d1::composite_node; + using detail::d1::async_node; + using detail::d1::node_priority_t; + using detail::d1::no_priority; + +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET + using detail::d1::follows; + using detail::d1::precedes; + using detail::d1::make_node_set; + using detail::d1::make_edges; +#endif + +} // v1 +} // flow + + using detail::d1::flow_control; + +namespace profiling { + using detail::d1::set_name; +} // profiling + +} // tbb + + +#if TBB_USE_PROFILING_TOOLS && ( __unix__ || __APPLE__ ) + // We don't do pragma pop here, since it still gives warning on the USER side + #undef __TBB_NOINLINE_SYM +#endif + +#endif // __TBB_flow_graph_H diff --git a/third_party/tbb/flow_graph_abstractions.h b/third_party/tbb/flow_graph_abstractions.h new file mode 100644 index 000000000..87921c9f9 --- /dev/null +++ b/third_party/tbb/flow_graph_abstractions.h @@ -0,0 +1,52 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_flow_graph_abstractions_H +#define __TBB_flow_graph_abstractions_H + +namespace tbb { +namespace detail { +namespace d1 { + +//! Pure virtual template classes that define interfaces for async communication +class graph_proxy { +public: + //! Inform a graph that messages may come from outside, to prevent premature graph completion + virtual void reserve_wait() = 0; + + //! Inform a graph that a previous call to reserve_wait is no longer in effect + virtual void release_wait() = 0; + + virtual ~graph_proxy() {} +}; + +template +class receiver_gateway : public graph_proxy { +public: + //! Type of inputing data into FG. + typedef Input input_type; + + //! Submit signal from an asynchronous activity to FG. + virtual bool try_put(const input_type&) = 0; +}; + +} // d1 + + +} // detail +} // tbb +#endif diff --git a/third_party/tbb/global_control.cpp b/third_party/tbb/global_control.cpp new file mode 100644 index 000000000..8d84e61ca --- /dev/null +++ b/third_party/tbb/global_control.cpp @@ -0,0 +1,281 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/tbb/global_control.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/spin_mutex.h" + +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/set" + +namespace tbb { +namespace detail { +namespace r1 { + +//! Comparator for a set of global_control objects +struct control_storage_comparator { + bool operator()(const d1::global_control* lhs, const d1::global_control* rhs) const; +}; + +class control_storage { + friend struct global_control_impl; + friend std::size_t global_control_active_value(int); + friend void global_control_lock(); + friend void global_control_unlock(); + friend std::size_t global_control_active_value_unsafe(d1::global_control::parameter); +protected: + std::size_t my_active_value{0}; + std::set> my_list{}; + spin_mutex my_list_mutex{}; +public: + virtual std::size_t default_value() const = 0; + virtual void apply_active(std::size_t new_active) { + my_active_value = new_active; + } + virtual bool is_first_arg_preferred(std::size_t a, std::size_t b) const { + return a>b; // prefer max by default + } + virtual std::size_t active_value() { + spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call + return !my_list.empty() ? my_active_value : default_value(); + } + + std::size_t active_value_unsafe() { + return !my_list.empty() ? my_active_value : default_value(); + } +}; + +class alignas(max_nfs_size) allowed_parallelism_control : public control_storage { + std::size_t default_value() const override { + return max(1U, governor::default_num_threads()); + } + bool is_first_arg_preferred(std::size_t a, std::size_t b) const override { + return a= 1, nullptr); + // -1 to take external thread into account + threading_control::set_active_num_workers(my_active_value - 1); + } + std::size_t active_value() override { + spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call + if (my_list.empty()) { + return default_value(); + } + + // non-zero, if market is active + const std::size_t workers = threading_control::max_num_workers(); + // We can't exceed market's maximal number of workers. + // +1 to take external thread into account + return workers ? min(workers + 1, my_active_value) : my_active_value; + } +public: + std::size_t active_value_if_present() const { + return !my_list.empty() ? 
my_active_value : 0; + } +}; + +class alignas(max_nfs_size) stack_size_control : public control_storage { + std::size_t default_value() const override { +#if _WIN32_WINNT >= 0x0602 /* _WIN32_WINNT_WIN8 */ + static auto ThreadStackSizeDefault = [] { + ULONG_PTR hi, lo; + GetCurrentThreadStackLimits(&lo, &hi); + return hi - lo; + }(); + return ThreadStackSizeDefault; +#else + return ThreadStackSize; +#endif + } + void apply_active(std::size_t new_active) override { + control_storage::apply_active(new_active); +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + __TBB_ASSERT( false, "For Windows 8 Store* apps we must not set stack size" ); +#endif + } +}; + +class alignas(max_nfs_size) terminate_on_exception_control : public control_storage { + std::size_t default_value() const override { + return 0; + } +}; + +class alignas(max_nfs_size) lifetime_control : public control_storage { + bool is_first_arg_preferred(std::size_t, std::size_t) const override { + return false; // not interested + } + std::size_t default_value() const override { + return 0; + } + void apply_active(std::size_t new_active) override { + if (new_active == 1) { + // reserve the market reference + threading_control::register_lifetime_control(); + } else if (new_active == 0) { // new_active == 0 + threading_control::unregister_lifetime_control(/*blocking_terminate*/ false); + } + control_storage::apply_active(new_active); + } + +public: + bool is_empty() { + spin_mutex::scoped_lock lock(my_list_mutex); + return my_list.empty(); + } +}; + +static allowed_parallelism_control allowed_parallelism_ctl; +static stack_size_control stack_size_ctl; +static terminate_on_exception_control terminate_on_exception_ctl; +static lifetime_control lifetime_ctl; +static control_storage *controls[] = {&allowed_parallelism_ctl, &stack_size_ctl, &terminate_on_exception_ctl, &lifetime_ctl}; + +void global_control_lock() { + for (auto& ctl : controls) { + ctl->my_list_mutex.lock(); + } +} + +void global_control_unlock() { + int N = std::distance(std::begin(controls), std::end(controls)); + for (int i = N - 1; i >= 0; --i) { + controls[i]->my_list_mutex.unlock(); + } +} + +std::size_t global_control_active_value_unsafe(d1::global_control::parameter param) { + __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); + return controls[param]->active_value_unsafe(); +} + +//! 
Comparator for a set of global_control objects +inline bool control_storage_comparator::operator()(const d1::global_control* lhs, const d1::global_control* rhs) const { + __TBB_ASSERT_RELEASE(lhs->my_param < d1::global_control::parameter_max , nullptr); + return lhs->my_value < rhs->my_value || (lhs->my_value == rhs->my_value && lhs < rhs); +} + +bool terminate_on_exception() { + return d1::global_control::active_value(d1::global_control::terminate_on_exception) == 1; +} + +struct global_control_impl { +private: + static bool erase_if_present(control_storage* const c, d1::global_control& gc) { + auto it = c->my_list.find(&gc); + if (it != c->my_list.end()) { + c->my_list.erase(it); + return true; + } + return false; + } + +public: + + static void create(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + + spin_mutex::scoped_lock lock(c->my_list_mutex); + if (c->my_list.empty() || c->is_first_arg_preferred(gc.my_value, c->my_active_value)) { + // to guarantee that apply_active() is called with current active value, + // calls it here and in internal_destroy() under my_list_mutex + c->apply_active(gc.my_value); + } + c->my_list.insert(&gc); + } + + static void destroy(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + // Concurrent reading and changing global parameter is possible. + spin_mutex::scoped_lock lock(c->my_list_mutex); + __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle || !c->my_list.empty(), nullptr); + std::size_t new_active = (std::size_t)(-1), old_active = c->my_active_value; + + if (!erase_if_present(c, gc)) { + __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle , nullptr); + return; + } + if (c->my_list.empty()) { + __TBB_ASSERT(new_active == (std::size_t) - 1, nullptr); + new_active = c->default_value(); + } else { + new_active = (*c->my_list.begin())->my_value; + } + if (new_active != old_active) { + c->apply_active(new_active); + } + } + + static bool remove_and_check_if_empty(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + + spin_mutex::scoped_lock lock(c->my_list_mutex); + __TBB_ASSERT(!c->my_list.empty(), nullptr); + erase_if_present(c, gc); + return c->my_list.empty(); + } +#if TBB_USE_ASSERT + static bool is_present(d1::global_control& gc) { + __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); + control_storage* const c = controls[gc.my_param]; + + spin_mutex::scoped_lock lock(c->my_list_mutex); + auto it = c->my_list.find(&gc); + if (it != c->my_list.end()) { + return true; + } + return false; + } +#endif // TBB_USE_ASSERT +}; + +void __TBB_EXPORTED_FUNC create(d1::global_control& gc) { + global_control_impl::create(gc); +} +void __TBB_EXPORTED_FUNC destroy(d1::global_control& gc) { + global_control_impl::destroy(gc); +} + +bool remove_and_check_if_empty(d1::global_control& gc) { + return global_control_impl::remove_and_check_if_empty(gc); +} +#if TBB_USE_ASSERT +bool is_present(d1::global_control& gc) { + return global_control_impl::is_present(gc); +} +#endif // TBB_USE_ASSERT +std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int param) { + __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); + return controls[param]->active_value(); +} + +} // 
namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/global_control.h b/third_party/tbb/global_control.h new file mode 100644 index 000000000..9740b5700 --- /dev/null +++ b/third_party/tbb/global_control.h @@ -0,0 +1,201 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_global_control_H +#define __TBB_global_control_H + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_attach.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/new" // std::nothrow_t + +namespace tbb { +namespace detail { + +namespace d1 { +class global_control; +class task_scheduler_handle; +} + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC create(d1::global_control&); +TBB_EXPORT void __TBB_EXPORTED_FUNC destroy(d1::global_control&); +TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int); +struct global_control_impl; +struct control_storage_comparator; +void release_impl(d1::task_scheduler_handle& handle); +bool finalize_impl(d1::task_scheduler_handle& handle); +TBB_EXPORT void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle&, std::intptr_t mode); +} + +namespace d1 { + +class global_control { +public: + enum parameter { + max_allowed_parallelism, + thread_stack_size, + terminate_on_exception, + scheduler_handle, // not a public parameter + parameter_max // insert new parameters above this point + }; + + global_control(parameter p, std::size_t value) : + my_value(value), my_reserved(), my_param(p) { + suppress_unused_warning(my_reserved); + __TBB_ASSERT(my_param < parameter_max, "Invalid parameter"); +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + // For Windows 8 Store* apps it's impossible to set stack size + if (p==thread_stack_size) + return; +#elif __TBB_x86_64 && (_WIN32 || _WIN64) + if (p==thread_stack_size) + __TBB_ASSERT_RELEASE((unsigned)value == value, "Stack size is limited to unsigned int range"); +#endif + if (my_param==max_allowed_parallelism) + __TBB_ASSERT_RELEASE(my_value>0, "max_allowed_parallelism cannot be 0."); + r1::create(*this); + } + + ~global_control() { + __TBB_ASSERT(my_param < parameter_max, "Invalid parameter"); +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + // For Windows 8 Store* apps it's impossible to set stack size + if (my_param==thread_stack_size) + return; +#endif + r1::destroy(*this); + } + + static std::size_t active_value(parameter p) { + __TBB_ASSERT(p < parameter_max, "Invalid parameter"); + return r1::global_control_active_value((int)p); + } + +private: + std::size_t my_value; + std::intptr_t my_reserved; // TODO: substitution of global_control* not to break backward compatibility + parameter 
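+    // [Editorial note, not part of the upstream sources] A minimal usage sketch of the
+    // public interface declared above, assuming the tbb:: aliases that this header
+    // re-exports (global_control, active_value):
+    //
+    //     #include "third_party/tbb/global_control.h"
+    //
+    //     int main() {
+    //         // Cap TBB worker parallelism at 4 for the lifetime of `limit`.
+    //         tbb::global_control limit(tbb::global_control::max_allowed_parallelism, 4);
+    //         std::size_t active = tbb::global_control::active_value(
+    //             tbb::global_control::max_allowed_parallelism);   // 4 while `limit` is alive
+    //         (void)active;
+    //     }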
my_param; + + friend struct r1::global_control_impl; + friend struct r1::control_storage_comparator; +}; + +//! Finalization options. +//! Outside of the class to avoid extensive friendship. +static constexpr std::intptr_t release_nothrowing = 0; +static constexpr std::intptr_t finalize_nothrowing = 1; +static constexpr std::intptr_t finalize_throwing = 2; + +//! User side wrapper for a task scheduler lifetime control object +class task_scheduler_handle { +public: + //! Creates an empty task_scheduler_handle + task_scheduler_handle() = default; + + //! Creates an attached instance of task_scheduler_handle + task_scheduler_handle(attach) { + r1::get(*this); + } + + //! Release a reference if any + ~task_scheduler_handle() { + release(); + } + + //! No copy + task_scheduler_handle(const task_scheduler_handle& other) = delete; + task_scheduler_handle& operator=(const task_scheduler_handle& other) = delete; + + //! Move only + task_scheduler_handle(task_scheduler_handle&& other) noexcept { + std::swap(m_ctl, other.m_ctl); + } + task_scheduler_handle& operator=(task_scheduler_handle&& other) noexcept { + std::swap(m_ctl, other.m_ctl); + return *this; + }; + + //! Checks if the task_scheduler_handle is empty + explicit operator bool() const noexcept { + return m_ctl != nullptr; + } + + //! Release the reference and deactivate handle + void release() { + if (m_ctl != nullptr) { + r1::finalize(*this, release_nothrowing); + m_ctl = nullptr; + } + } + +private: + friend void r1::release_impl(task_scheduler_handle& handle); + friend bool r1::finalize_impl(task_scheduler_handle& handle); + friend void __TBB_EXPORTED_FUNC r1::get(task_scheduler_handle&); + + friend void finalize(task_scheduler_handle&); + friend bool finalize(task_scheduler_handle&, const std::nothrow_t&) noexcept; + + global_control* m_ctl{nullptr}; +}; + +#if TBB_USE_EXCEPTIONS +//! Waits for worker threads termination. Throws exception on error. +inline void finalize(task_scheduler_handle& handle) { + try_call([&] { + if (handle.m_ctl != nullptr) { + bool finalized = r1::finalize(handle, finalize_throwing); + __TBB_ASSERT_EX(finalized, "r1::finalize did not respect finalize_throwing ?"); + + } + }).on_completion([&] { + __TBB_ASSERT(!handle, "The handle should be empty after finalize"); + }); +} +#endif +//! Waits for worker threads termination. Returns false on error. +inline bool finalize(task_scheduler_handle& handle, const std::nothrow_t&) noexcept { + bool finalized = true; + if (handle.m_ctl != nullptr) { + finalized = r1::finalize(handle, finalize_nothrowing); + } + __TBB_ASSERT(!handle, "The handle should be empty after finalize"); + return finalized; +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::global_control; +using detail::d1::attach; +using detail::d1::finalize; +using detail::d1::task_scheduler_handle; +using detail::r1::unsafe_wait; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_global_control_H diff --git a/third_party/tbb/governor.cpp b/third_party/tbb/governor.cpp new file mode 100644 index 000000000..91f3db3a1 --- /dev/null +++ b/third_party/tbb/governor.cpp @@ -0,0 +1,580 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/main.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/dynamic_link.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/thread_dispatcher.h" + +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/global_control.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/info.h" + +#include "third_party/tbb/task_dispatcher.h" + +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/algorithm" + +namespace tbb { +namespace detail { +namespace r1 { + +void clear_address_waiter_table(); + +//! global_control.cpp contains definition +bool remove_and_check_if_empty(d1::global_control& gc); +bool is_present(d1::global_control& gc); + +namespace rml { +tbb_server* make_private_server( tbb_client& client ); +} // namespace rml + +namespace system_topology { + void destroy(); +} + +//------------------------------------------------------------------------ +// governor +//------------------------------------------------------------------------ + +void governor::acquire_resources () { +#if __TBB_USE_POSIX + int status = theTLS.create(auto_terminate); +#else + int status = theTLS.create(); +#endif + if( status ) + handle_perror(status, "TBB failed to initialize task scheduler TLS\n"); + detect_cpu_features(cpu_features); + + is_rethrow_broken = gcc_rethrow_exception_broken(); +} + +void governor::release_resources () { + theRMLServerFactory.close(); + destroy_process_mask(); + + __TBB_ASSERT(!(__TBB_InitOnce::initialization_done() && theTLS.get()), "TBB is unloaded while thread data still alive?"); + + int status = theTLS.destroy(); + if( status ) + runtime_warning("failed to destroy task scheduler TLS: %s", std::strerror(status)); + clear_address_waiter_table(); + + system_topology::destroy(); + dynamic_unlink_all(); +} + +rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) { + rml::tbb_server* server = nullptr; + if( !UsePrivateRML ) { + ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client ); + if( status != ::rml::factory::st_success ) { + UsePrivateRML = true; + runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status ); + } + } + if ( !server ) { + __TBB_ASSERT( UsePrivateRML, nullptr); + server = rml::make_private_server( client ); + } + __TBB_ASSERT( server, "Failed to create RML server" ); + return server; +} + +void governor::one_time_init() { + if ( !__TBB_InitOnce::initialization_done() ) { + DoOneTimeInitialization(); + } +} + +bool governor::does_client_join_workers(const rml::tbb_client &client) { + return ((const thread_dispatcher&)client).must_join_workers(); +} + +/* + There is no portable way to get stack base address in Posix, however the modern + Linux versions provide pthread_attr_np API 
that can be used to obtain thread's
+    stack size and base address. Unfortunately even this function does not provide
+    enough information for the main thread on IA-64 architecture (RSE spill area
+    and memory stack are allocated as two separate discontinuous chunks of memory),
+    and there is no portable way to discern the main and the secondary threads.
+    Thus for macOS* and IA-64 architecture for Linux* OS we use the TBB worker stack size for
+    all threads and use the current stack top as the stack base. This simplified
+    approach is based on the following assumptions:
+    1) If the default stack size is insufficient for the user app needs, the
+    required amount will be explicitly specified by the user at the point of the
+    TBB scheduler initialization (as an argument to tbb::task_scheduler_init
+    constructor).
+    2) When an external thread initializes the scheduler, it has enough space on its
+    stack. Here "enough" means "at least as much as worker threads have".
+    3) If the user app strives to conserve the memory by cutting stack size, it
+    should do this for TBB workers too (as in the #1).
+*/
+static std::uintptr_t get_stack_base(std::size_t stack_size) {
+    // Stacks are growing top-down. Highest address is called "stack base",
+    // and the lowest is "stack limit".
+#if __TBB_USE_WINAPI
+    suppress_unused_warning(stack_size);
+    NT_TIB* pteb = (NT_TIB*)NtCurrentTeb();
+    __TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB");
+    return reinterpret_cast<std::uintptr_t>(pteb->StackBase);
+#else
+    // There is no portable way to get stack base address in Posix, so we use
+    // non-portable method (on all modern Linux) or the simplified approach
+    // based on the common sense assumptions. The most important assumption
+    // is that the main thread's stack size is not less than that of other threads.
+
+    // Points to the lowest addressable byte of a stack.
+    void* stack_limit = nullptr;
+#if __linux__ && !__bg__
+    size_t np_stack_size = 0;
+    pthread_attr_t np_attr_stack;
+    if (0 == pthread_getattr_np(pthread_self(), &np_attr_stack)) {
+        if (0 == pthread_attr_getstack(&np_attr_stack, &stack_limit, &np_stack_size)) {
+            __TBB_ASSERT( &stack_limit > stack_limit, "stack size must be positive" );
+        }
+        pthread_attr_destroy(&np_attr_stack);
+    }
+#endif /* __linux__ */
+    std::uintptr_t stack_base{};
+    if (stack_limit) {
+        stack_base = reinterpret_cast<std::uintptr_t>(stack_limit) + stack_size;
+    } else {
+        // Use an anchor as a base stack address.
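+        // [Editorial note, not upstream code] "Anchor" means: the address of a local
+        // variable is, by construction, inside the current frame, so on a downward-growing
+        // stack it is a conservative stand-in for the unknown stack base. The same trick
+        // in isolation, as a hedged sketch:
+        //
+        //     std::uintptr_t approx_stack_top() {
+        //         int probe = 0;                                    // lives on this stack
+        //         return reinterpret_cast<std::uintptr_t>(&probe);  // ~ current stack top
+        //     }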
+        int anchor{};
+        stack_base = reinterpret_cast<std::uintptr_t>(&anchor);
+    }
+    return stack_base;
+#endif /* __TBB_USE_WINAPI */
+}
+
+#if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
+static void register_external_thread_destructor() {
+    struct thread_destructor {
+        ~thread_destructor() {
+            governor::terminate_external_thread();
+        }
+    };
+    // ~thread_destructor() will be called during the calling thread termination
+    static thread_local thread_destructor thr_destructor;
+}
+#endif // (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
+
+void governor::init_external_thread() {
+    one_time_init();
+    // Create new scheduler instance with arena
+    int num_slots = default_num_threads();
+    // TODO_REVAMP: support an external thread without an implicit arena
+    int num_reserved_slots = 1;
+    unsigned arena_priority_level = 1; // corresponds to tbb::task_arena::priority::normal
+    std::size_t stack_size = 0;
+    threading_control* thr_control = threading_control::register_public_reference();
+    arena& a = arena::create(thr_control, num_slots, num_reserved_slots, arena_priority_level);
+    // External thread always occupies the first slot
+    thread_data& td = *new(cache_aligned_allocate(sizeof(thread_data))) thread_data(0, false);
+    td.attach_arena(a, /*slot index*/ 0);
+    __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr);
+
+    stack_size = a.my_threading_control->worker_stack_size();
+    std::uintptr_t stack_base = get_stack_base(stack_size);
+    task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher();
+    td.enter_task_dispatcher(task_disp, calculate_stealing_threshold(stack_base, stack_size));
+
+    td.my_arena_slot->occupy();
+    thr_control->register_thread(td);
+    set_thread_data(td);
+#if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED
+    // The external thread destructor is called from dllMain but it is not available with a static build.
+    // Therefore, we need to register the current thread to call the destructor during thread termination.
+    register_external_thread_destructor();
+#endif
+}
+
+void governor::auto_terminate(void* tls) {
+    __TBB_ASSERT(get_thread_data_if_initialized() == nullptr ||
+                 get_thread_data_if_initialized() == tls, nullptr);
+    if (tls) {
+        thread_data* td = static_cast<thread_data*>(tls);
+
+        auto clear_tls = [td] {
+            td->~thread_data();
+            cache_aligned_deallocate(td);
+            clear_thread_data();
+        };
+
+        // Only external thread can be inside an arena during termination.
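+        // [Editorial sketch, not upstream code] User-level view of this lifecycle, assuming
+        // the public parallel_for header added by this patch: an ordinary std::thread that
+        // merely calls into TBB is set up lazily by init_external_thread() above and torn
+        // down by auto_terminate() when the thread exits -- no explicit init call is needed.
+        //
+        //     #include "third_party/tbb/parallel_for.h"
+        //     #include <thread>
+        //
+        //     void worker() {
+        //         // The first TBB call on this thread triggers init_external_thread().
+        //         tbb::parallel_for(0, 1000, [](int) { /* ... */ });
+        //     }   // thread exit reaches auto_terminate() via the TLS destructor
+        //
+        //     int main() { std::thread t(worker); t.join(); }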
+ if (td->my_arena_slot) { + arena* a = td->my_arena; + threading_control* thr_control = a->my_threading_control; + + // If the TLS slot is already cleared by OS or underlying concurrency + // runtime, restore its value to properly clean up arena + if (!is_thread_data_set(td)) { + set_thread_data(*td); + } + + a->my_observers.notify_exit_observers(td->my_last_observer, td->my_is_worker); + + td->leave_task_dispatcher(); + td->my_arena_slot->release(); + // Release an arena + a->on_thread_leaving(arena::ref_external); + + thr_control->unregister_thread(*td); + + // The tls should be cleared before market::release because + // market can destroy the tls key if we keep the last reference + clear_tls(); + + // If there was an associated arena, it added a public market reference + thr_control->unregister_public_reference(/* blocking terminate =*/ false); + } else { + clear_tls(); + } + } + __TBB_ASSERT(get_thread_data_if_initialized() == nullptr, nullptr); +} + +void governor::initialize_rml_factory () { + ::rml::factory::status_type res = theRMLServerFactory.open(); + UsePrivateRML = res != ::rml::factory::st_success; +} + +void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle& handle) { + handle.m_ctl = new(allocate_memory(sizeof(global_control))) global_control(global_control::scheduler_handle, 1); +} + +void release_impl(d1::task_scheduler_handle& handle) { + if (handle.m_ctl != nullptr) { + handle.m_ctl->~global_control(); + deallocate_memory(handle.m_ctl); + handle.m_ctl = nullptr; + } +} + +bool finalize_impl(d1::task_scheduler_handle& handle) { + __TBB_ASSERT_RELEASE(handle, "trying to finalize with null handle"); + __TBB_ASSERT(is_present(*handle.m_ctl), "finalize or release was already called on this object"); + + bool ok = true; // ok if threading_control does not exist yet + if (threading_control::is_present()) { + thread_data* td = governor::get_thread_data_if_initialized(); + if (td) { + task_dispatcher* task_disp = td->my_task_dispatcher; + __TBB_ASSERT(task_disp, nullptr); + if (task_disp->m_properties.outermost && !td->my_is_worker) { // is not inside a parallel region + governor::auto_terminate(td); + } + } + + if (remove_and_check_if_empty(*handle.m_ctl)) { + ok = threading_control::unregister_lifetime_control(/*blocking_terminate*/ true); + } else { + ok = false; + } + } + + return ok; +} + +bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle& handle, std::intptr_t mode) { + if (mode == d1::release_nothrowing) { + release_impl(handle); + return true; + } else { + bool ok = finalize_impl(handle); + // TODO: it is unsafe when finalize is called concurrently and further library unload + release_impl(handle); + if (mode == d1::finalize_throwing && !ok) { + throw_exception(exception_id::unsafe_wait); + } + return ok; + } +} + +#if __TBB_ARENA_BINDING + +#if __TBB_WEAK_SYMBOLS_PRESENT +#pragma weak __TBB_internal_initialize_system_topology +#pragma weak __TBB_internal_destroy_system_topology +#pragma weak __TBB_internal_allocate_binding_handler +#pragma weak __TBB_internal_deallocate_binding_handler +#pragma weak __TBB_internal_apply_affinity +#pragma weak __TBB_internal_restore_affinity +#pragma weak __TBB_internal_get_default_concurrency + +extern "C" { +void __TBB_internal_initialize_system_topology( + size_t groups_num, + int& numa_nodes_count, int*& numa_indexes_list, + int& core_types_count, int*& core_types_indexes_list +); +void __TBB_internal_destroy_system_topology( ); + +//TODO: consider renaming to `create_binding_handler` and `destroy_binding_handler` 
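+// [Editorial sketch, not upstream code] The release/finalize paths implemented above back
+// the public waiting-termination API; a typical use, assuming the tbb:: names exported by
+// global_control.h in this patch:
+//
+//     #include "third_party/tbb/global_control.h"
+//
+//     int main() {
+//         tbb::task_scheduler_handle handle{tbb::attach{}};
+//         // ... parallel work ...
+//         // Blocks until worker threads terminate; returns false if the wait cannot be
+//         // performed safely (see finalize_impl above).
+//         bool ok = tbb::finalize(handle, std::nothrow);
+//         (void)ok;
+//     }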
+binding_handler* __TBB_internal_allocate_binding_handler( int slot_num, int numa_id, int core_type_id, int max_threads_per_core ); +void __TBB_internal_deallocate_binding_handler( binding_handler* handler_ptr ); + +void __TBB_internal_apply_affinity( binding_handler* handler_ptr, int slot_num ); +void __TBB_internal_restore_affinity( binding_handler* handler_ptr, int slot_num ); + +int __TBB_internal_get_default_concurrency( int numa_id, int core_type_id, int max_threads_per_core ); +} +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +// Stubs that will be used if TBBbind library is unavailable. +static void dummy_destroy_system_topology ( ) { } +static binding_handler* dummy_allocate_binding_handler ( int, int, int, int ) { return nullptr; } +static void dummy_deallocate_binding_handler ( binding_handler* ) { } +static void dummy_apply_affinity ( binding_handler*, int ) { } +static void dummy_restore_affinity ( binding_handler*, int ) { } +static int dummy_get_default_concurrency( int, int, int ) { return governor::default_num_threads(); } + +// Handlers for communication with TBBbind +static void (*initialize_system_topology_ptr)( + size_t groups_num, + int& numa_nodes_count, int*& numa_indexes_list, + int& core_types_count, int*& core_types_indexes_list +) = nullptr; +static void (*destroy_system_topology_ptr)( ) = dummy_destroy_system_topology; + +static binding_handler* (*allocate_binding_handler_ptr)( int slot_num, int numa_id, int core_type_id, int max_threads_per_core ) + = dummy_allocate_binding_handler; +static void (*deallocate_binding_handler_ptr)( binding_handler* handler_ptr ) + = dummy_deallocate_binding_handler; +static void (*apply_affinity_ptr)( binding_handler* handler_ptr, int slot_num ) + = dummy_apply_affinity; +static void (*restore_affinity_ptr)( binding_handler* handler_ptr, int slot_num ) + = dummy_restore_affinity; +int (*get_default_concurrency_ptr)( int numa_id, int core_type_id, int max_threads_per_core ) + = dummy_get_default_concurrency; + +#if _WIN32 || _WIN64 || __unix__ +// Table describing how to link the handlers. +static const dynamic_link_descriptor TbbBindLinkTable[] = { + DLD(__TBB_internal_initialize_system_topology, initialize_system_topology_ptr), + DLD(__TBB_internal_destroy_system_topology, destroy_system_topology_ptr), + DLD(__TBB_internal_allocate_binding_handler, allocate_binding_handler_ptr), + DLD(__TBB_internal_deallocate_binding_handler, deallocate_binding_handler_ptr), + DLD(__TBB_internal_apply_affinity, apply_affinity_ptr), + DLD(__TBB_internal_restore_affinity, restore_affinity_ptr), + DLD(__TBB_internal_get_default_concurrency, get_default_concurrency_ptr) +}; + +static const unsigned LinkTableSize = sizeof(TbbBindLinkTable) / sizeof(dynamic_link_descriptor); + +#if TBB_USE_DEBUG +#define DEBUG_SUFFIX "_debug" +#else +#define DEBUG_SUFFIX +#endif /* TBB_USE_DEBUG */ + +#if _WIN32 || _WIN64 +#define LIBRARY_EXTENSION ".dll" +#define LIBRARY_PREFIX +#elif __unix__ +#define LIBRARY_EXTENSION __TBB_STRING(.so.3) +#define LIBRARY_PREFIX "lib" +#endif /* __unix__ */ + +#define TBBBIND_NAME LIBRARY_PREFIX "tbbbind" DEBUG_SUFFIX LIBRARY_EXTENSION +#define TBBBIND_2_0_NAME LIBRARY_PREFIX "tbbbind_2_0" DEBUG_SUFFIX LIBRARY_EXTENSION + +#define TBBBIND_2_5_NAME LIBRARY_PREFIX "tbbbind_2_5" DEBUG_SUFFIX LIBRARY_EXTENSION +#endif /* _WIN32 || _WIN64 || __unix__ */ + +// Representation of system hardware topology information on the TBB side. +// System topology may be initialized by third-party component (e.g. 
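+// [Editorial sketch, not upstream code] The stubs and link table above implement a
+// "stub until linked" scheme: every handler starts out pointing at a dummy and is
+// overwritten only if the optional tbbbind library is found. The generic shape of the
+// idea, with hypothetical names, using plain dlopen/dlsym:
+//
+//     #include <dlfcn.h>
+//
+//     static int fallback_query() { return -1; }        // safe default answer
+//     static int (*query_ptr)() = fallback_query;       // stub until linked
+//
+//     static void try_link_optional_lib() {
+//         if (void* lib = dlopen("liboptional.so", RTLD_LAZY | RTLD_LOCAL)) {  // hypothetical soname
+//             if (void* sym = dlsym(lib, "real_query"))                        // hypothetical symbol
+//                 query_ptr = reinterpret_cast<int (*)()>(sym);
+//         }
+//     }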
hwloc) +// or just filled in with default stubs. +namespace system_topology { + +constexpr int automatic = -1; + +static std::atomic initialization_state; + +namespace { +int numa_nodes_count = 0; +int* numa_nodes_indexes = nullptr; + +int core_types_count = 0; +int* core_types_indexes = nullptr; + +const char* load_tbbbind_shared_object() { +#if _WIN32 || _WIN64 || __unix__ +#if _WIN32 && !_WIN64 + // For 32-bit Windows applications, process affinity masks can only support up to 32 logical CPUs. + SYSTEM_INFO si; + GetNativeSystemInfo(&si); + if (si.dwNumberOfProcessors > 32) return nullptr; +#endif /* _WIN32 && !_WIN64 */ + for (const auto& tbbbind_version : {TBBBIND_2_5_NAME, TBBBIND_2_0_NAME, TBBBIND_NAME}) { + if (dynamic_link(tbbbind_version, TbbBindLinkTable, LinkTableSize, nullptr, DYNAMIC_LINK_LOCAL_BINDING)) { + return tbbbind_version; + } + } +#endif /* _WIN32 || _WIN64 || __unix__ */ + return nullptr; +} + +int processor_groups_num() { +#if _WIN32 + return NumberOfProcessorGroups(); +#else + // Stub to improve code readability by reducing number of the compile-time conditions + return 1; +#endif +} +} // internal namespace + +// Tries to load TBBbind library API, if success, gets NUMA topology information from it, +// in another case, fills NUMA topology by stubs. +void initialization_impl() { + governor::one_time_init(); + + if (const char* tbbbind_name = load_tbbbind_shared_object()) { + initialize_system_topology_ptr( + processor_groups_num(), + numa_nodes_count, numa_nodes_indexes, + core_types_count, core_types_indexes + ); + + PrintExtraVersionInfo("TBBBIND", tbbbind_name); + return; + } + + static int dummy_index = automatic; + + numa_nodes_count = 1; + numa_nodes_indexes = &dummy_index; + + core_types_count = 1; + core_types_indexes = &dummy_index; + + PrintExtraVersionInfo("TBBBIND", "UNAVAILABLE"); +} + +void initialize() { + atomic_do_once(initialization_impl, initialization_state); +} + +void destroy() { + destroy_system_topology_ptr(); +} +} // namespace system_topology + +binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core) { + system_topology::initialize(); + return allocate_binding_handler_ptr(slot_num, numa_id, core_type_id, max_threads_per_core); +} + +void destroy_binding_handler(binding_handler* handler_ptr) { + __TBB_ASSERT(deallocate_binding_handler_ptr, "tbbbind loading was not performed"); + deallocate_binding_handler_ptr(handler_ptr); +} + +void apply_affinity_mask(binding_handler* handler_ptr, int slot_index) { + __TBB_ASSERT(slot_index >= 0, "Negative thread index"); + __TBB_ASSERT(apply_affinity_ptr, "tbbbind loading was not performed"); + apply_affinity_ptr(handler_ptr, slot_index); +} + +void restore_affinity_mask(binding_handler* handler_ptr, int slot_index) { + __TBB_ASSERT(slot_index >= 0, "Negative thread index"); + __TBB_ASSERT(restore_affinity_ptr, "tbbbind loading was not performed"); + restore_affinity_ptr(handler_ptr, slot_index); +} + +unsigned __TBB_EXPORTED_FUNC numa_node_count() { + system_topology::initialize(); + return system_topology::numa_nodes_count; +} + +void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array) { + system_topology::initialize(); + std::memcpy(index_array, system_topology::numa_nodes_indexes, system_topology::numa_nodes_count * sizeof(int)); +} + +int __TBB_EXPORTED_FUNC numa_default_concurrency(int node_id) { + if (node_id >= 0) { + system_topology::initialize(); + int result = get_default_concurrency_ptr( + node_id, + 
/*core_type*/system_topology::automatic, + /*threads_per_core*/system_topology::automatic + ); + if (result > 0) return result; + } + return governor::default_num_threads(); +} + +unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t /*reserved*/) { + system_topology::initialize(); + return system_topology::core_types_count; +} + +void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t /*reserved*/) { + system_topology::initialize(); + std::memcpy(index_array, system_topology::core_types_indexes, system_topology::core_types_count * sizeof(int)); +} + +void constraints_assertion(d1::constraints c) { + bool is_topology_initialized = system_topology::initialization_state == do_once_state::initialized; + __TBB_ASSERT_RELEASE(c.max_threads_per_core == system_topology::automatic || c.max_threads_per_core > 0, + "Wrong max_threads_per_core constraints field value."); + + auto numa_nodes_begin = system_topology::numa_nodes_indexes; + auto numa_nodes_end = system_topology::numa_nodes_indexes + system_topology::numa_nodes_count; + __TBB_ASSERT_RELEASE( + c.numa_id == system_topology::automatic || + (is_topology_initialized && std::find(numa_nodes_begin, numa_nodes_end, c.numa_id) != numa_nodes_end), + "The constraints::numa_id value is not known to the library. Use tbb::info::numa_nodes() to get the list of possible values."); + + int* core_types_begin = system_topology::core_types_indexes; + int* core_types_end = system_topology::core_types_indexes + system_topology::core_types_count; + __TBB_ASSERT_RELEASE(c.core_type == system_topology::automatic || + (is_topology_initialized && std::find(core_types_begin, core_types_end, c.core_type) != core_types_end), + "The constraints::core_type value is not known to the library. Use tbb::info::core_types() to get the list of possible values."); +} + +int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t /*reserved*/) { + constraints_assertion(c); + + if (c.numa_id >= 0 || c.core_type >= 0 || c.max_threads_per_core > 0) { + system_topology::initialize(); + return get_default_concurrency_ptr(c.numa_id, c.core_type, c.max_threads_per_core); + } + return governor::default_num_threads(); +} + +int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints&, intptr_t /*reserved*/) { + return system_topology::automatic; +} +#endif /* __TBB_ARENA_BINDING */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/governor.h b/third_party/tbb/governor.h new file mode 100644 index 000000000..b1efe08c8 --- /dev/null +++ b/third_party/tbb/governor.h @@ -0,0 +1,157 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_governor_H +#define _TBB_governor_H + +#include "third_party/tbb/rml_tbb.h" + +#include "third_party/tbb/misc.h" // for AvailableHwConcurrency +#include "third_party/tbb/tls.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class market; +class thread_data; +class __TBB_InitOnce; + +#if __TBB_USE_ITT_NOTIFY +//! Defined in profiling.cpp +extern bool ITT_Present; +#endif + +typedef std::size_t stack_size_type; + +//------------------------------------------------------------------------ +// Class governor +//------------------------------------------------------------------------ + +//! The class handles access to the single instance of market, and to TLS to keep scheduler instances. +/** It also supports automatic on-demand initialization of the TBB scheduler. + The class contains only static data members and methods.*/ +class governor { +private: + friend class __TBB_InitOnce; + friend class thread_dispatcher; + friend class threading_control_impl; + + // TODO: consider using thread_local (measure performance and side effects) + //! TLS for scheduler instances associated with individual threads + static basic_tls theTLS; + + // TODO (TBB_REVAMP_TODO): reconsider constant names + static rml::tbb_factory theRMLServerFactory; + + static bool UsePrivateRML; + + // Flags for runtime-specific conditions + static cpu_features_type cpu_features; + static bool is_rethrow_broken; + + //! Create key for thread-local storage and initialize RML. + static void acquire_resources (); + + //! Destroy the thread-local storage key and deinitialize RML. + static void release_resources (); + + static rml::tbb_server* create_rml_server ( rml::tbb_client& ); + +public: + static unsigned default_num_threads () { + // Caches the maximal level of parallelism supported by the hardware + static unsigned num_threads = AvailableHwConcurrency(); + return num_threads; + } + static std::size_t default_page_size () { + // Caches the size of OS regular memory page + static std::size_t page_size = DefaultSystemPageSize(); + return page_size; + } + static void one_time_init(); + //! Processes scheduler initialization request (possibly nested) in an external thread + /** If necessary creates new instance of arena and/or local scheduler. + The auto_init argument specifies if the call is due to automatic initialization. **/ + static void init_external_thread(); + + //! The routine to undo automatic initialization. + /** The signature is written with void* so that the routine + can be the destructor argument to pthread_key_create. */ + static void auto_terminate(void* tls); + + //! Obtain the thread-local instance of the thread data. + /** If the scheduler has not been initialized yet, initialization is done automatically. + Note that auto-initialized scheduler instance is destroyed only when its thread terminates. **/ + static thread_data* get_thread_data() { + thread_data* td = theTLS.get(); + if (td) { + return td; + } + init_external_thread(); + td = theTLS.get(); + __TBB_ASSERT(td, nullptr); + return td; + } + + static void set_thread_data(thread_data& td) { + theTLS.set(&td); + } + + static void clear_thread_data() { + theTLS.set(nullptr); + } + + static thread_data* get_thread_data_if_initialized () { + return theTLS.get(); + } + + static bool is_thread_data_set(thread_data* td) { + return theTLS.get() == td; + } + + //! Undo automatic initialization if necessary; call when a thread exits. 
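+    // [Editorial note, not upstream code] default_num_threads() and default_page_size()
+    // above rely on C++11 "magic statics": the function-local static is initialized once,
+    // thread-safely, on first call, which is the entire caching scheme. A minimal model:
+    //
+    //     #include <thread>
+    //
+    //     unsigned cached_hw_concurrency() {
+    //         static const unsigned value = std::thread::hardware_concurrency(); // computed once
+    //         return value;                                                       // cheap afterwards
+    //     }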
+ static void terminate_external_thread() { + auto_terminate(get_thread_data_if_initialized()); + } + + static void initialize_rml_factory (); + + static bool does_client_join_workers (const rml::tbb_client &client); + + static bool speculation_enabled() { return cpu_features.rtm_enabled; } + +#if __TBB_WAITPKG_INTRINSICS_PRESENT + static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; } +#endif + + static bool rethrow_exception_broken() { return is_rethrow_broken; } + + static bool is_itt_present() { +#if __TBB_USE_ITT_NOTIFY + return ITT_Present; +#else + return false; +#endif + } +}; // class governor + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_governor_H */ diff --git a/third_party/tbb/info.h b/third_party/tbb/info.h new file mode 100644 index 000000000..b90d38bb2 --- /dev/null +++ b/third_party/tbb/info.h @@ -0,0 +1,126 @@ +// clang-format off +/* + Copyright (c) 2019-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_info_H +#define __TBB_info_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" + +#if __TBB_ARENA_BINDING +#include "third_party/libcxx/vector" +#include "third_party/libcxx/cstdint" + +namespace tbb { +namespace detail { + +namespace d1{ + +using numa_node_id = int; +using core_type_id = int; + +// TODO: consider version approach to resolve backward compatibility potential issues. +struct constraints { +#if !__TBB_CPP20_PRESENT + constraints(numa_node_id id = -1, int maximal_concurrency = -1) + : numa_id(id) + , max_concurrency(maximal_concurrency) + {} +#endif /*!__TBB_CPP20_PRESENT*/ + + constraints& set_numa_id(numa_node_id id) { + numa_id = id; + return *this; + } + constraints& set_max_concurrency(int maximal_concurrency) { + max_concurrency = maximal_concurrency; + return *this; + } + constraints& set_core_type(core_type_id id) { + core_type = id; + return *this; + } + constraints& set_max_threads_per_core(int threads_number) { + max_threads_per_core = threads_number; + return *this; + } + + numa_node_id numa_id = -1; + int max_concurrency = -1; + core_type_id core_type = -1; + int max_threads_per_core = -1; +}; + +} // namespace d1 + +namespace r1 { +TBB_EXPORT unsigned __TBB_EXPORTED_FUNC numa_node_count(); +TBB_EXPORT void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array); +TBB_EXPORT int __TBB_EXPORTED_FUNC numa_default_concurrency(int numa_id); + +// Reserved fields are required to save binary backward compatibility in case of future changes. +// They must be defined to 0 at this moment. 
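+// [Editorial sketch, not upstream code] The constraints struct above is a small fluent
+// builder; together with the inline helpers later in this header it can be used roughly
+// like this (the d1 spelling is shown for illustration -- applications normally reach
+// these types through the public task_arena/info aliases):
+//
+//     #include "third_party/tbb/info.h"
+//
+//     tbb::detail::d1::constraints c;
+//     c.set_numa_id(tbb::info::numa_nodes().front())
+//      .set_max_threads_per_core(1);
+//     int threads = tbb::info::default_concurrency(c);   // concurrency under those limits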
+TBB_EXPORT unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t reserved = 0); +TBB_EXPORT void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t reserved = 0); + +TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t reserved = 0); +TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints& c, intptr_t reserved = 0); +} // namespace r1 + +namespace d1 { + +inline std::vector numa_nodes() { + std::vector node_indices(r1::numa_node_count()); + r1::fill_numa_indices(node_indices.data()); + return node_indices; +} + +inline int default_concurrency(numa_node_id id = -1) { + return r1::numa_default_concurrency(id); +} + +inline std::vector core_types() { + std::vector core_type_indexes(r1::core_type_count()); + r1::fill_core_type_indices(core_type_indexes.data()); + return core_type_indexes; +} + +inline int default_concurrency(constraints c) { + if (c.max_concurrency > 0) { return c.max_concurrency; } + return r1::constraints_default_concurrency(c); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::numa_node_id; +using detail::d1::core_type_id; + +namespace info { +using detail::d1::numa_nodes; +using detail::d1::core_types; + +using detail::d1::default_concurrency; +} // namespace info +} // namespace v1 + +} // namespace tbb + +#endif /*__TBB_ARENA_BINDING*/ + +#endif /*__TBB_info_H*/ diff --git a/third_party/tbb/intrusive_list.h b/third_party/tbb/intrusive_list.h new file mode 100644 index 000000000..c0f1b19e2 --- /dev/null +++ b/third_party/tbb/intrusive_list.h @@ -0,0 +1,234 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_intrusive_list_H +#define _TBB_intrusive_list_H + +#include "third_party/tbb/detail/_intrusive_list_node.h" + +namespace tbb { +namespace detail { +namespace r1 { + +using d1::intrusive_list_node; + +//! List of element of type T, where T is derived from intrusive_list_node +/** The class is not thread safe. **/ +template +class intrusive_list_base { + //! Pointer to the head node + intrusive_list_node my_head; + + //! 
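+    // [Editorial sketch, not upstream code] Rough usage of the intrusive_list<T> wrapper
+    // defined later in this header (an internal helper, shown only for illustration):
+    // the element type embeds the links by deriving from intrusive_list_node, so putting
+    // it on a list allocates nothing.
+    //
+    //     struct job : public tbb::detail::d1::intrusive_list_node {   // hypothetical element
+    //         int id;
+    //     };
+    //
+    //     tbb::detail::r1::intrusive_list<job> list;   // does not own its elements
+    //     job j1; j1.id = 1;
+    //     list.push_front(j1);
+    //     for (job& j : list) { (void)j.id; /* visit */ }
+    //     list.remove(j1);                             // must unlink before j1 dies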
Number of list elements + std::size_t my_size; + + static intrusive_list_node& node ( T& item ) { return List::node(item); } + + static T& item ( intrusive_list_node* node ) { return List::item(node); } + + static const T& item( const intrusive_list_node* node ) { return List::item(node); } + + template + class iterator_impl { + static_assert(std::is_same::value || + std::is_same::value, + "Incorrect DereferenceType in iterator_impl"); + + using pointer_type = typename std::conditional::value, + intrusive_list_node*, + const intrusive_list_node*>::type; + + public: + iterator_impl() : my_pos(nullptr) {} + + iterator_impl( pointer_type pos ) : my_pos(pos) {} + + iterator_impl& operator++() { + my_pos = my_pos->my_next_node; + return *this; + } + + iterator_impl operator++( int ) { + iterator_impl it(*this); + ++*this; + return it; + } + + iterator_impl& operator--() { + my_pos = my_pos->my_prev_node; + return *this; + } + + iterator_impl operator--( int ) { + iterator_impl it(*this); + --*this; + return it; + } + + bool operator==( const iterator_impl& rhs ) const { + return my_pos == rhs.my_pos; + } + + bool operator!=( const iterator_impl& rhs ) const { + return my_pos != rhs.my_pos; + } + + DereferenceType& operator*() const { + return intrusive_list_base::item(my_pos); + } + + DereferenceType* operator->() const { + return &intrusive_list_base::item(my_pos); + } + private: + // Node the iterator points to at the moment + pointer_type my_pos; + }; // class iterator_impl + + void assert_ok () const { + __TBB_ASSERT( (my_head.my_prev_node == &my_head && !my_size) || + (my_head.my_next_node != &my_head && my_size >0), "intrusive_list_base corrupted" ); +#if TBB_USE_ASSERT >= 2 + std::size_t i = 0; + for ( intrusive_list_node *n = my_head.my_next_node; n != &my_head; n = n->my_next_node ) + ++i; + __TBB_ASSERT( my_size == i, "Wrong size" ); +#endif /* TBB_USE_ASSERT >= 2 */ + } + +public: + using iterator = iterator_impl; + using const_iterator = iterator_impl; + + intrusive_list_base () : my_size(0) { + my_head.my_prev_node = &my_head; + my_head.my_next_node = &my_head; + } + + bool empty () const { return my_head.my_next_node == &my_head; } + + std::size_t size () const { return my_size; } + + iterator begin () { return iterator(my_head.my_next_node); } + + iterator end () { return iterator(&my_head); } + + const_iterator begin () const { return const_iterator(my_head.my_next_node); } + + const_iterator end () const { return const_iterator(&my_head); } + + void push_front ( T& val ) { + __TBB_ASSERT( node(val).my_prev_node == &node(val) && node(val).my_next_node == &node(val), + "Object with intrusive list node can be part of only one intrusive list simultaneously" ); + // An object can be part of only one intrusive list at the given moment via the given node member + node(val).my_prev_node = &my_head; + node(val).my_next_node = my_head.my_next_node; + my_head.my_next_node->my_prev_node = &node(val); + my_head.my_next_node = &node(val); + ++my_size; + assert_ok(); + } + + void remove( T& val ) { + __TBB_ASSERT( node(val).my_prev_node != &node(val) && node(val).my_next_node != &node(val), "Element to remove is not in the list" ); + __TBB_ASSERT( node(val).my_prev_node->my_next_node == &node(val) && node(val).my_next_node->my_prev_node == &node(val), "Element to remove is not in the list" ); + --my_size; + node(val).my_next_node->my_prev_node = node(val).my_prev_node; + node(val).my_prev_node->my_next_node = node(val).my_next_node; +#if TBB_USE_ASSERT + node(val).my_prev_node = 
node(val).my_next_node = &node(val); +#endif + assert_ok(); + } + + iterator erase ( iterator it ) { + T& val = *it; + ++it; + remove( val ); + return it; + } + +}; // intrusive_list_base + +#if __TBB_TODO +// With standard compliant compilers memptr_intrusive_list could be named simply intrusive_list, +// and inheritance based intrusive_list version would become its partial specialization. +// Here are the corresponding declarations: + +struct dummy_intrusive_list_item { intrusive_list_node my_node; }; + +template +class intrusive_list : public intrusive_list_base, T>; + +template +class intrusive_list + : public intrusive_list_base, T>; + +#endif /* __TBB_TODO */ + +//! Double linked list of items of type T containing a member of type intrusive_list_node. +/** NodePtr is a member pointer to the node data field. Class U is either T or + a base class of T containing the node member. Default values exist for the sake + of a partial specialization working with inheritance case. + + The list does not have ownership of its items. Its purpose is to avoid dynamic + memory allocation when forming lists of existing objects. + + The class is not thread safe. **/ +template +class memptr_intrusive_list : public intrusive_list_base, T> +{ + friend class intrusive_list_base, T>; + + static intrusive_list_node& node ( T& val ) { return val.*NodePtr; } + + static T& item ( intrusive_list_node* node ) { + // Cannot use __TBB_offsetof (and consequently __TBB_get_object_ref) macro + // with *NodePtr argument because gcc refuses to interpret pasted "->" and "*" + // as member pointer dereferencing operator, and explicit usage of ## in + // __TBB_offsetof implementation breaks operations with normal member names. + return *reinterpret_cast((char*)node - ((ptrdiff_t)&(reinterpret_cast(0x1000)->*NodePtr) - 0x1000)); + } + + static const T& item( const intrusive_list_node* node ) { + return item(const_cast(node)); + } + +}; // intrusive_list + +//! Double linked list of items of type T that is derived from intrusive_list_node class. +/** The list does not have ownership of its items. Its purpose is to avoid dynamic + memory allocation when forming lists of existing objects. + + The class is not thread safe. **/ +template +class intrusive_list : public intrusive_list_base, T> +{ + friend class intrusive_list_base, T>; + + static intrusive_list_node& node ( T& val ) { return val; } + + static T& item ( intrusive_list_node* node ) { return *static_cast(node); } + + static const T& item( const intrusive_list_node* node ) { return *static_cast(node); } +}; // intrusive_list + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_intrusive_list_H */ diff --git a/third_party/tbb/itt_notify.cpp b/third_party/tbb/itt_notify.cpp new file mode 100644 index 000000000..fe9325490 --- /dev/null +++ b/third_party/tbb/itt_notify.cpp @@ -0,0 +1,70 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#if __TBB_USE_ITT_NOTIFY + +#if _WIN32||_WIN64 + #ifndef UNICODE + #define UNICODE + #endif +#else + #pragma weak dlopen + #pragma weak dlsym + #pragma weak dlerror +#endif /* WIN */ + +#if __TBB_BUILD + +extern "C" void ITT_DoOneTimeInitialization(); +#define __itt_init_ittlib_name(x,y) (ITT_DoOneTimeInitialization(), true) + +#elif __TBBMALLOC_BUILD + +extern "C" void MallocInitializeITT(); +#define __itt_init_ittlib_name(x,y) (MallocInitializeITT(), true) + +#else +#error This file is expected to be used for either TBB or TBB allocator build. +#endif // __TBB_BUILD + +// MISSING #include "tools_api/ittnotify_static.c" + +namespace tbb { +namespace detail { +namespace r1 { + +/** This extra proxy method is necessary since __itt_init_lib is declared as static **/ +int __TBB_load_ittnotify() { +#if !(_WIN32||_WIN64) + // tool_api crashes without dlopen, check that it's present. Common case + // for lack of dlopen is static binaries, i.e. ones build with -static. + if (dlopen == nullptr) + return 0; +#endif + return __itt_init_ittlib(nullptr, // groups for: + (__itt_group_id)(__itt_group_sync // prepare/cancel/acquired/releasing + | __itt_group_thread // name threads + | __itt_group_stitch // stack stitching + | __itt_group_structure + )); +} + +} //namespace r1 +} //namespace detail +} // namespace tbb + +#endif /* __TBB_USE_ITT_NOTIFY */ diff --git a/third_party/tbb/itt_notify.h b/third_party/tbb/itt_notify.h new file mode 100644 index 000000000..eba910d27 --- /dev/null +++ b/third_party/tbb/itt_notify.h @@ -0,0 +1,118 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_ITT_NOTIFY +#define _TBB_ITT_NOTIFY + +#include "third_party/tbb/detail/_config.h" + +#if __TBB_USE_ITT_NOTIFY + +#if _WIN32||_WIN64 + #ifndef UNICODE + #define UNICODE + #endif +#endif /* WIN */ + +#ifndef INTEL_ITTNOTIFY_API_PRIVATE +#define INTEL_ITTNOTIFY_API_PRIVATE +#endif + +// MISSING #include "tools_api/ittnotify.h" +// MISSING #include "tools_api/legacy/ittnotify.h" +extern "C" void __itt_fini_ittlib(void); +extern "C" void __itt_release_resources(void); + +#if _WIN32||_WIN64 + #undef _T +#endif /* WIN */ + +#endif /* __TBB_USE_ITT_NOTIFY */ + +#if !ITT_CALLER_NULL +#define ITT_CALLER_NULL ((__itt_caller)0) +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +//! Unicode support +#if (_WIN32||_WIN64) + //! Unicode character type. Always wchar_t on Windows. + /** We do not use typedefs from Windows TCHAR family to keep consistence of TBB coding style. **/ + using tchar = wchar_t; + //! Standard Windows macro to markup the string literals. + #define _T(string_literal) L ## string_literal +#else /* !WIN */ + using tchar = char; + //! Standard Windows style macro to markup the string literals. + #define _T(string_literal) string_literal +#endif /* !WIN */ + +//! Display names of internal synchronization types +extern const tchar + *SyncType_Scheduler; +//! 
Display names of internal synchronization components/scenarios +extern const tchar + *SyncObj_ContextsList + ; + +#if __TBB_USE_ITT_NOTIFY +// const_cast() is necessary to cast off volatility +#define ITT_NOTIFY(name,obj) __itt_##name(const_cast(static_cast(obj))) +#define ITT_THREAD_SET_NAME(name) __itt_thread_set_name(name) +#define ITT_FINI_ITTLIB() __itt_fini_ittlib() +#define ITT_RELEASE_RESOURCES() __itt_release_resources() +#define ITT_SYNC_CREATE(obj, type, name) __itt_sync_create((void*)(obj), type, name, 2) +#define ITT_STACK_CREATE(obj) obj = __itt_stack_caller_create() +#define ITT_STACK_DESTROY(obj) (obj!=nullptr) ? __itt_stack_caller_destroy(static_cast<__itt_caller>(obj)) : ((void)0) +#define ITT_CALLEE_ENTER(cond, t, obj) if(cond) {\ + __itt_stack_callee_enter(static_cast<__itt_caller>(obj));\ + __itt_sync_acquired(t);\ + } +#define ITT_CALLEE_LEAVE(cond, obj) (cond) ? __itt_stack_callee_leave(static_cast<__itt_caller>(obj)) : ((void)0) + +#define ITT_TASK_GROUP(obj,name,parent) r1::itt_make_task_group(d1::ITT_DOMAIN_MAIN,(void*)(obj),ALGORITHM,(void*)(parent),(parent!=nullptr) ? ALGORITHM : FLOW_NULL,name) +#define ITT_TASK_BEGIN(obj,name,id) r1::itt_task_begin(d1::ITT_DOMAIN_MAIN,(void*)(id),ALGORITHM,(void*)(obj),ALGORITHM,name) +#define ITT_TASK_END r1::itt_task_end(d1::ITT_DOMAIN_MAIN) + + +#else /* !__TBB_USE_ITT_NOTIFY */ + +#define ITT_NOTIFY(name,obj) ((void)0) +#define ITT_THREAD_SET_NAME(name) ((void)0) +#define ITT_FINI_ITTLIB() ((void)0) +#define ITT_RELEASE_RESOURCES() ((void)0) +#define ITT_SYNC_CREATE(obj, type, name) ((void)0) +#define ITT_STACK_CREATE(obj) ((void)0) +#define ITT_STACK_DESTROY(obj) ((void)0) +#define ITT_CALLEE_ENTER(cond, t, obj) ((void)0) +#define ITT_CALLEE_LEAVE(cond, obj) ((void)0) +#define ITT_TASK_GROUP(type,name,parent) ((void)0) +#define ITT_TASK_BEGIN(type,name,id) ((void)0) +#define ITT_TASK_END ((void)0) + +#endif /* !__TBB_USE_ITT_NOTIFY */ + +int __TBB_load_ittnotify(); + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_ITT_NOTIFY */ diff --git a/third_party/tbb/mailbox.h b/third_party/tbb/mailbox.h new file mode 100644 index 000000000..3cb2f0646 --- /dev/null +++ b/third_party/tbb/mailbox.h @@ -0,0 +1,247 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_mailbox_H +#define _TBB_mailbox_H + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +#include "third_party/tbb/scheduler_common.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +struct task_proxy : public d1::task { + static const intptr_t pool_bit = 1<<0; + static const intptr_t mailbox_bit = 1<<1; + static const intptr_t location_mask = pool_bit | mailbox_bit; + /* All but two low-order bits represent a (task*). + Two low-order bits mean: + 1 = proxy is/was/will be in task pool + 2 = proxy is/was/will be in mailbox */ + std::atomic task_and_tag; + + //! 
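+    // [Editorial note, not upstream code] task_and_tag above packs a task* together with the
+    // two location bits; the helpers task_ptr()/is_shared() below recover the pieces. Spelled
+    // out:
+    //
+    //     d1::task* t = /* ... */;
+    //     intptr_t tat = reinterpret_cast<intptr_t>(t) | task_proxy::pool_bit | task_proxy::mailbox_bit;
+    //     d1::task* back = task_proxy::task_ptr(tat);      // tag bits stripped
+    //     bool shared    = task_proxy::is_shared(tat);     // true: still in pool and mailbox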
Pointer to next task_proxy in a mailbox + std::atomic next_in_mailbox; + + //! Mailbox to which this was mailed. + mail_outbox* outbox; + + //! Task affinity id which is referenced + d1::slot_id slot; + + d1::small_object_allocator allocator; + + //! True if the proxy is stored both in its sender's pool and in the destination mailbox. + static bool is_shared ( intptr_t tat ) { + return (tat & location_mask) == location_mask; + } + + //! Returns a pointer to the encapsulated task or nullptr. + static task* task_ptr ( intptr_t tat ) { + return (task*)(tat & ~location_mask); + } + + //! Returns a pointer to the encapsulated task or nullptr, and frees proxy if necessary. + template + inline task* extract_task () { + // __TBB_ASSERT( prefix().extra_state == es_task_proxy, "Normal task misinterpreted as a proxy?" ); + intptr_t tat = task_and_tag.load(std::memory_order_acquire); + __TBB_ASSERT( tat == from_bit || (is_shared(tat) && task_ptr(tat)), + "Proxy's tag cannot specify both locations if the proxy " + "was retrieved from one of its original locations" ); + if ( tat != from_bit ) { + const intptr_t cleaner_bit = location_mask & ~from_bit; + // Attempt to transition the proxy to the "empty" state with + // cleaner_bit specifying entity responsible for its eventual freeing. + // Explicit cast to void* is to work around a seeming ICC 11.1 bug. + if ( task_and_tag.compare_exchange_strong(tat, cleaner_bit) ) { + // Successfully grabbed the task, and left new owner with the job of freeing the proxy + return task_ptr(tat); + } + } + // Proxied task has already been claimed from another proxy location. + __TBB_ASSERT( task_and_tag.load(std::memory_order_relaxed) == from_bit, "Empty proxy cannot contain non-zero task pointer" ); + return nullptr; + } + + task* execute(d1::execution_data&) override { + __TBB_ASSERT_RELEASE(false, nullptr); + return nullptr; + } + task* cancel(d1::execution_data&) override { + __TBB_ASSERT_RELEASE(false, nullptr); + return nullptr; + } +}; // struct task_proxy + +//! Internal representation of mail_outbox, without padding. +class unpadded_mail_outbox { +protected: + typedef std::atomic atomic_proxy_ptr; + + //! Pointer to first task_proxy in mailbox, or nullptr if box is empty. + atomic_proxy_ptr my_first; + + //! Pointer to pointer that will point to next item in the queue. Never nullptr. + std::atomic my_last; + + //! Owner of mailbox is not executing a task, and has drained its own task pool. + std::atomic my_is_idle; +}; + +// TODO: - consider moving to arena slot +//! Class representing where mail is put. +/** Padded to occupy a cache line. */ +class mail_outbox : padded { + + task_proxy* internal_pop( isolation_type isolation ) { + task_proxy* curr = my_first.load(std::memory_order_acquire); + if ( !curr ) + return nullptr; + atomic_proxy_ptr* prev_ptr = &my_first; + if ( isolation != no_isolation ) { + while ( task_accessor::isolation(*curr) != isolation ) { + prev_ptr = &curr->next_in_mailbox; + // The next_in_mailbox should be read with acquire to guarantee (*curr) consistency. + curr = curr->next_in_mailbox.load(std::memory_order_acquire); + if ( !curr ) + return nullptr; + } + } + // There is a first item in the mailbox. See if there is a second. + // The next_in_mailbox should be read with acquire to guarantee (*second) consistency. + if ( task_proxy* second = curr->next_in_mailbox.load(std::memory_order_acquire) ) { + // There are at least two items, so first item can be popped easily. 
+ prev_ptr->store(second, std::memory_order_relaxed); + } else { + // There is only one item. Some care is required to pop it. + + prev_ptr->store(nullptr, std::memory_order_relaxed); + atomic_proxy_ptr* expected = &curr->next_in_mailbox; + if ( my_last.compare_exchange_strong( expected, prev_ptr ) ) { + // Successfully transitioned mailbox from having one item to having none. + __TBB_ASSERT( !curr->next_in_mailbox.load(std::memory_order_relaxed), nullptr); + } else { + // Some other thread updated my_last but has not filled in first->next_in_mailbox + // Wait until first item points to second item. + atomic_backoff backoff; + // The next_in_mailbox should be read with acquire to guarantee (*second) consistency. + while ( !(second = curr->next_in_mailbox.load(std::memory_order_acquire)) ) backoff.pause(); + prev_ptr->store( second, std::memory_order_relaxed); + } + } + assert_pointer_valid(curr); + return curr; + } +public: + friend class mail_inbox; + + //! Push task_proxy onto the mailbox queue of another thread. + /** Implementation is wait-free. */ + void push( task_proxy* t ) { + assert_pointer_valid(t); + t->next_in_mailbox.store(nullptr, std::memory_order_relaxed); + atomic_proxy_ptr* const link = my_last.exchange(&t->next_in_mailbox); + // Logically, the release fence is not required because the exchange above provides the + // release-acquire semantic that guarantees that (*t) will be consistent when another thread + // loads the link atomic. However, C++11 memory model guarantees consistency of(*t) only + // when the same atomic is used for synchronization. + link->store(t, std::memory_order_release); + } + + //! Return true if mailbox is empty + bool empty() { + return my_first.load(std::memory_order_relaxed) == nullptr; + } + + //! Construct *this as a mailbox from zeroed memory. + /** Raise assertion if *this is not previously zeroed, or sizeof(this) is wrong. + This method is provided instead of a full constructor since we know the object + will be constructed in zeroed memory. */ + void construct() { + __TBB_ASSERT( sizeof(*this)==max_nfs_size, nullptr ); + __TBB_ASSERT( !my_first.load(std::memory_order_relaxed), nullptr ); + __TBB_ASSERT( !my_last.load(std::memory_order_relaxed), nullptr ); + __TBB_ASSERT( !my_is_idle.load(std::memory_order_relaxed), nullptr ); + my_last = &my_first; + suppress_unused_warning(pad); + } + + //! Drain the mailbox + void drain() { + // No fences here because other threads have already quit. + for( ; task_proxy* t = my_first; ) { + my_first.store(t->next_in_mailbox, std::memory_order_relaxed); + t->allocator.delete_object(t); + } + } + + //! True if thread that owns this mailbox is looking for work. + bool recipient_is_idle() { + return my_is_idle.load(std::memory_order_relaxed); + } +}; // class mail_outbox + +//! Class representing source of mail. +class mail_inbox { + //! Corresponding sink where mail that we receive will be put. + mail_outbox* my_putter; +public: + //! Construct unattached inbox + mail_inbox() : my_putter(nullptr) {} + + //! Attach inbox to a corresponding outbox. + void attach( mail_outbox& putter ) { + my_putter = &putter; + } + //! Detach inbox from its outbox + void detach() { + __TBB_ASSERT(my_putter,"not attached"); + my_putter = nullptr; + } + //! Get next piece of mail, or nullptr if mailbox is empty. + task_proxy* pop( isolation_type isolation ) { + return my_putter->internal_pop( isolation ); + } + //! Return true if mailbox is empty + bool empty() { + return my_putter->empty(); + } + //! 
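+    // [Editorial sketch, not upstream code] mail_outbox::push() above is the classic wait-free
+    // intrusive MPSC enqueue: a producer only exchanges the tail link and then publishes the
+    // node with a release store. Stripped to its essentials:
+    //
+    //     #include <atomic>
+    //
+    //     struct node { std::atomic<node*> next{nullptr}; };
+    //
+    //     struct mpsc_queue {
+    //         std::atomic<node*> first{nullptr};
+    //         std::atomic<std::atomic<node*>*> last{&first};
+    //         void push(node* n) {                            // wait-free for every producer
+    //             n->next.store(nullptr, std::memory_order_relaxed);
+    //             std::atomic<node*>* link = last.exchange(&n->next);
+    //             link->store(n, std::memory_order_release);  // publish *n to the consumer
+    //         }
+    //     };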
Indicate whether thread that reads this mailbox is idle. + /** Raises assertion failure if mailbox is redundantly marked as not idle. */ + void set_is_idle( bool value ) { + if( my_putter ) { + __TBB_ASSERT( my_putter->my_is_idle.load(std::memory_order_relaxed) || value, "attempt to redundantly mark mailbox as not idle" ); + my_putter->my_is_idle.store(value, std::memory_order_relaxed); + } + } + //! Indicate whether thread that reads this mailbox is idle. + bool is_idle_state ( bool value ) const { + return !my_putter || my_putter->my_is_idle.load(std::memory_order_relaxed) == value; + } +}; // class mail_inbox + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_mailbox_H */ diff --git a/third_party/tbb/main.cpp b/third_party/tbb/main.cpp new file mode 100644 index 000000000..734913362 --- /dev/null +++ b/third_party/tbb/main.cpp @@ -0,0 +1,172 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/tbb/main.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/environment.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/misc.h" +#include "third_party/tbb/itt_notify.h" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// Begin shared data layout. +// The following global data items are mostly read-only after initialization. +//------------------------------------------------------------------------ + +//------------------------------------------------------------------------ +// governor data +basic_tls governor::theTLS; +rml::tbb_factory governor::theRMLServerFactory; +bool governor::UsePrivateRML; +bool governor::is_rethrow_broken; + +//------------------------------------------------------------------------ +// threading_control data +threading_control* threading_control::g_threading_control; +threading_control::global_mutex_type threading_control::g_threading_control_mutex; + +//------------------------------------------------------------------------ +// context propagation data +context_state_propagation_mutex_type the_context_state_propagation_mutex; +std::atomic the_context_state_propagation_epoch{}; + +//------------------------------------------------------------------------ +// One time initialization data + +//! Counter of references to global shared resources such as TLS. +std::atomic __TBB_InitOnce::count{}; + +std::atomic_flag __TBB_InitOnce::InitializationLock = ATOMIC_FLAG_INIT; + +//! Flag that is set to true after one-time initializations are done. +std::atomic __TBB_InitOnce::InitializationDone{}; + +#if __TBB_USE_ITT_NOTIFY +//! 
Defined in profiling.cpp +extern bool ITT_Present; +void ITT_DoUnsafeOneTimeInitialization(); +#endif + +#if !(_WIN32||_WIN64) || __TBB_SOURCE_DIRECTLY_INCLUDED +static __TBB_InitOnce __TBB_InitOnceHiddenInstance; +#endif + +#if TBB_USE_ASSERT +std::atomic the_observer_proxy_count; + +struct check_observer_proxy_count { + ~check_observer_proxy_count() { + if (the_observer_proxy_count != 0) { + runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count)); + } + } +}; +// The proxy count checker shall be defined after __TBB_InitOnceHiddenInstance to check the count +// after auto termination. +static check_observer_proxy_count the_check_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ + +//------------------------------------------------------------------------ +// __TBB_InitOnce +//------------------------------------------------------------------------ + +void __TBB_InitOnce::add_ref() { + if( ++count==1 ) + governor::acquire_resources(); +} + +void __TBB_InitOnce::remove_ref() { + int k = --count; + __TBB_ASSERT(k>=0,"removed __TBB_InitOnce ref that was not added?"); + if( k==0 ) { + governor::release_resources(); + ITT_FINI_ITTLIB(); + ITT_RELEASE_RESOURCES(); + } +} + +//------------------------------------------------------------------------ +// One-time Initializations +//------------------------------------------------------------------------ + +//! Defined in cache_aligned_allocator.cpp +void initialize_cache_aligned_allocator(); + +//! Performs thread-safe lazy one-time general TBB initialization. +void DoOneTimeInitialization() { + __TBB_InitOnce::lock(); + // No fence required for load of InitializationDone, because we are inside a critical section. + if( !__TBB_InitOnce::InitializationDone ) { + __TBB_InitOnce::add_ref(); + if( GetBoolEnvironmentVariable("TBB_VERSION") ) + PrintVersion(); + bool itt_present = false; +#if __TBB_USE_ITT_NOTIFY + ITT_DoUnsafeOneTimeInitialization(); + itt_present = ITT_Present; +#endif /* __TBB_USE_ITT_NOTIFY */ + initialize_cache_aligned_allocator(); + governor::initialize_rml_factory(); + // Force processor groups support detection + governor::default_num_threads(); + // Force OS regular page size detection + governor::default_page_size(); + PrintExtraVersionInfo( "TOOLS SUPPORT", itt_present ? "enabled" : "disabled" ); + __TBB_InitOnce::InitializationDone = true; + } + __TBB_InitOnce::unlock(); +} + +#if (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED +//! Windows "DllMain" that handles startup and shutdown of dynamic library. +extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID lpvReserved ) { + switch( reason ) { + case DLL_PROCESS_ATTACH: + __TBB_InitOnce::add_ref(); + break; + case DLL_PROCESS_DETACH: + // Since THREAD_DETACH is not called for the main thread, call auto-termination + // here as well - but not during process shutdown (due to risk of a deadlock). + if ( lpvReserved == nullptr ) { // library unload + governor::terminate_external_thread(); + } + __TBB_InitOnce::remove_ref(); + // It is assumed that InitializationDone is not set after DLL_PROCESS_DETACH, + // and thus no race on InitializationDone is possible. + if ( __TBB_InitOnce::initialization_done() ) { + // Remove reference that we added in DoOneTimeInitialization. 
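// Illustration: the shape of the one-time initialization above (a global spin
// lock plus a "done" flag that is only written inside the critical section),
// reduced to a minimal sketch. The names init_lock, init_done and
// do_expensive_setup are placeholders for this sketch, not TBB entities.
// Kept under "#if 0" so it does not affect this translation unit.
#if 0
#include <atomic>
#include <cstdio>

static std::atomic_flag init_lock = ATOMIC_FLAG_INIT;
static std::atomic<bool> init_done{false};

static void do_expensive_setup() { std::puts("initialized"); }   // stands in for the real work

void ensure_initialized() {
    if (init_done.load(std::memory_order_acquire))
        return;                                       // fast path: already initialized
    while (init_lock.test_and_set(std::memory_order_acquire)) { /* spin */ }
    if (!init_done.load(std::memory_order_relaxed)) { // no fence needed inside the critical section
        do_expensive_setup();
        init_done.store(true, std::memory_order_release);
    }
    init_lock.clear(std::memory_order_release);
}
#endif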
+ __TBB_InitOnce::remove_ref(); + } + break; + case DLL_THREAD_DETACH: + governor::terminate_external_thread(); + break; + } + return true; +} +#endif /* (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/main.h b/third_party/tbb/main.h new file mode 100644 index 000000000..c23f34bc5 --- /dev/null +++ b/third_party/tbb/main.h @@ -0,0 +1,100 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_main_H +#define _TBB_main_H + +#include "third_party/tbb/governor.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +void DoOneTimeInitialization(); + +//------------------------------------------------------------------------ +// __TBB_InitOnce +//------------------------------------------------------------------------ + +// TODO (TBB_REVAMP_TODO): consider better names +//! Class that supports TBB initialization. +/** It handles acquisition and release of global resources (e.g. TLS) during startup and shutdown, + as well as synchronization for DoOneTimeInitialization. */ +class __TBB_InitOnce { + friend void DoOneTimeInitialization(); + friend void ITT_DoUnsafeOneTimeInitialization(); + + static std::atomic count; + + //! Platform specific code to acquire resources. + static void acquire_resources(); + + //! Platform specific code to release resources. + static void release_resources(); + + //! Specifies if the one-time initializations has been done. + static std::atomic InitializationDone; + + //! Global initialization lock + /** Scenarios are possible when tools interop has to be initialized before the + TBB itself. This imposes a requirement that the global initialization lock + has to support valid static initialization, and does not issue any tool + notifications in any build mode. **/ + static std::atomic_flag InitializationLock; + +public: + static void lock() { + tbb::detail::atomic_backoff backoff; + while( InitializationLock.test_and_set() ) backoff.pause(); + } + + static void unlock() { InitializationLock.clear(std::memory_order_release); } + + static bool initialization_done() { return InitializationDone.load(std::memory_order_acquire); } + + //! Add initial reference to resources. + /** We assume that dynamic loading of the library prevents any other threads + from entering the library until this constructor has finished running. **/ + __TBB_InitOnce() { add_ref(); } + + //! Remove the initial reference to resources. + /** This is not necessarily the last reference if other threads are still running. **/ + ~__TBB_InitOnce() { + governor::terminate_external_thread(); // TLS dtor not called for the main thread + remove_ref(); + // We assume that InitializationDone is not set after file-scope destructors + // start running, and thus no race on InitializationDone is possible. 
+ if ( initialization_done() ) { + // Remove an extra reference that was added in DoOneTimeInitialization. + remove_ref(); + } + } + //! Add reference to resources. If first reference added, acquire the resources. + static void add_ref(); + + //! Remove reference to resources. If last reference removed, release the resources. + static void remove_ref(); + +}; // class __TBB_InitOnce + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_main_H */ diff --git a/third_party/tbb/market.cpp b/third_party/tbb/market.cpp new file mode 100644 index 000000000..80a22b960 --- /dev/null +++ b/third_party/tbb/market.cpp @@ -0,0 +1,140 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/arena.h" +#include "third_party/tbb/market.h" + +#include "third_party/libcxx/algorithm" // std::find + +namespace tbb { +namespace detail { +namespace r1 { + + +class tbb_permit_manager_client : public pm_client { +public: + tbb_permit_manager_client(arena& a) : pm_client(a) {} + + void set_allotment(unsigned allotment) { + my_arena.set_allotment(allotment); + } +}; + +//------------------------------------------------------------------------ +// market +//------------------------------------------------------------------------ + +market::market(unsigned workers_soft_limit) + : my_num_workers_soft_limit(workers_soft_limit) +{} + +pm_client* market::create_client(arena& a) { + return new (cache_aligned_allocate(sizeof(tbb_permit_manager_client))) tbb_permit_manager_client(a); +} + +void market::register_client(pm_client* c) { + mutex_type::scoped_lock lock(my_mutex); + my_clients[c->priority_level()].push_back(c); +} + +void market::unregister_and_destroy_client(pm_client& c) { + { + mutex_type::scoped_lock lock(my_mutex); + auto& clients = my_clients[c.priority_level()]; + auto it = std::find(clients.begin(), clients.end(), &c); + __TBB_ASSERT(it != clients.end(), "Destroying of an unregistered client"); + clients.erase(it); + } + + auto client = static_cast(&c); + client->~tbb_permit_manager_client(); + cache_aligned_deallocate(client); +} + +void market::update_allotment() { + int effective_soft_limit = my_mandatory_num_requested > 0 && my_num_workers_soft_limit == 0 ? 
1 : my_num_workers_soft_limit; + int max_workers = min(my_total_demand, effective_soft_limit); + __TBB_ASSERT(max_workers >= 0, nullptr); + + int unassigned_workers = max_workers; + int assigned = 0; + int carry = 0; + unsigned max_priority_level = num_priority_levels; + for (unsigned list_idx = 0; list_idx < num_priority_levels; ++list_idx ) { + int assigned_per_priority = min(my_priority_level_demand[list_idx], unassigned_workers); + unassigned_workers -= assigned_per_priority; + // We use reverse iterator there to serve last added clients first + for (auto it = my_clients[list_idx].rbegin(); it != my_clients[list_idx].rend(); ++it) { + tbb_permit_manager_client& client = static_cast(**it); + if (client.max_workers() == 0) { + client.set_allotment(0); + continue; + } + + if (max_priority_level == num_priority_levels) { + max_priority_level = list_idx; + } + + int allotted = 0; + if (my_num_workers_soft_limit == 0) { + __TBB_ASSERT(max_workers == 0 || max_workers == 1, nullptr); + allotted = client.min_workers() > 0 && assigned < max_workers ? 1 : 0; + } else { + int tmp = client.max_workers() * assigned_per_priority + carry; + allotted = tmp / my_priority_level_demand[list_idx]; + carry = tmp % my_priority_level_demand[list_idx]; + __TBB_ASSERT(allotted <= client.max_workers(), nullptr); + } + client.set_allotment(allotted); + client.set_top_priority(list_idx == max_priority_level); + assigned += allotted; + } + } + __TBB_ASSERT(assigned == max_workers, nullptr); +} + +void market::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex); + if (my_num_workers_soft_limit != soft_limit) { + my_num_workers_soft_limit = soft_limit; + update_allotment(); + } +} + +void market::adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) { + __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); + + int delta{}; + { + mutex_type::scoped_lock lock(my_mutex); + // Update client's state + delta = c.update_request(mandatory_delta, workers_delta); + + // Update market's state + my_total_demand += delta; + my_priority_level_demand[c.priority_level()] += delta; + my_mandatory_num_requested += mandatory_delta; + + update_allotment(); + } + + notify_thread_request(delta); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/market.h b/third_party/tbb/market.h new file mode 100644 index 000000000..0367a19c4 --- /dev/null +++ b/third_party/tbb/market.h @@ -0,0 +1,79 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_market_H +#define _TBB_market_H + +#include "third_party/tbb/rw_mutex.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/task_arena.h" + +#include "third_party/tbb/permit_manager.h" +#include "third_party/tbb/pm_client.h" + +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/vector" + +namespace tbb { +namespace detail { +namespace r1 { + +class market : public permit_manager { +public: + market(unsigned soft_limit); + + pm_client* create_client(arena& a) override; + void register_client(pm_client* client) override; + void unregister_and_destroy_client(pm_client& c) override; + + //! Request that arena's need in workers should be adjusted. + void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) override; + + //! Set number of active workers + void set_active_num_workers(int soft_limit) override; +private: + //! Recalculates the number of workers assigned to each arena in the list. + void update_allotment(); + + //! Keys for the arena map array. The lower the value the higher priority of the arena list. + static constexpr unsigned num_priority_levels = d1::num_priority_levels; + + using mutex_type = d1::rw_mutex; + mutex_type my_mutex; + + //! Current application-imposed limit on the number of workers + int my_num_workers_soft_limit; + + //! Number of workers that were requested by all arenas on all priority levels + int my_total_demand{0}; + + //! Number of workers that were requested by arenas per single priority list item + int my_priority_level_demand[num_priority_levels] = {0}; + + //! How many times mandatory concurrency was requested from the market + int my_mandatory_num_requested{0}; + + //! Per priority list of registered arenas + using clients_container_type = std::vector>; + clients_container_type my_clients[num_priority_levels]; +}; // class market + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_market_H */ diff --git a/third_party/tbb/memory_pool.h b/third_party/tbb/memory_pool.h new file mode 100644 index 000000000..14f2393e0 --- /dev/null +++ b/third_party/tbb/memory_pool.h @@ -0,0 +1,273 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_memory_pool_H +#define __TBB_memory_pool_H + +#if !TBB_PREVIEW_MEMORY_POOL +#error Set TBB_PREVIEW_MEMORY_POOL to include memory_pool.h +#endif +/** @file */ + +#include "third_party/tbb/scalable_allocator.h" + +#include "third_party/libcxx/new" // std::bad_alloc +#include "third_party/libcxx/stdexcept" // std::runtime_error, std::invalid_argument +#include "third_party/libcxx/utility" // std::forward + + +#if __TBB_EXTRA_DEBUG +#define __TBBMALLOC_ASSERT ASSERT +#else +#define __TBBMALLOC_ASSERT(a,b) ((void)0) +#endif + +namespace tbb { +namespace detail { +namespace d1 { + +//! 
Base of thread-safe pool allocator for variable-size requests +class pool_base : no_copy { + // Pool interface is separate from standard allocator classes because it has + // to maintain internal state, no copy or assignment. Move and swap are possible. +public: + //! Reset pool to reuse its memory (free all objects at once) + void recycle() { rml::pool_reset(my_pool); } + + //! The "malloc" analogue to allocate block of memory of size bytes + void *malloc(size_t size) { return rml::pool_malloc(my_pool, size); } + + //! The "free" analogue to discard a previously allocated piece of memory. + void free(void* ptr) { rml::pool_free(my_pool, ptr); } + + //! The "realloc" analogue complementing pool_malloc. + // Enables some low-level optimization possibilities + void *realloc(void* ptr, size_t size) { + return rml::pool_realloc(my_pool, ptr, size); + } + +protected: + //! destroy pool - must be called in a child class + void destroy() { rml::pool_destroy(my_pool); } + + rml::MemoryPool *my_pool; +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Workaround for erroneous "unreferenced parameter" warning in method destroy. + #pragma warning (push) + #pragma warning (disable: 4100) +#endif + +//! Meets "allocator" requirements of ISO C++ Standard, Section 20.1.5 +/** @ingroup memory_allocation */ +template +class memory_pool_allocator { +protected: + typedef P pool_type; + pool_type *my_pool; + template + friend class memory_pool_allocator; + template + friend bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b); + template + friend bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b); +public: + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + template struct rebind { + typedef memory_pool_allocator other; + }; + + explicit memory_pool_allocator(pool_type &pool) throw() : my_pool(&pool) {} + memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + template + memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + + //! Allocate space for n objects. + pointer allocate( size_type n, const void* /*hint*/ = nullptr) { + pointer p = static_cast( my_pool->malloc( n*sizeof(value_type) ) ); + if (!p) + throw_exception(std::bad_alloc()); + return p; + } + //! Free previously allocated block of memory. + void deallocate( pointer p, size_type ) { + my_pool->free(p); + } + //! Largest value for which method allocate might succeed. + size_type max_size() const throw() { + size_type max = static_cast(-1) / sizeof (value_type); + return (max > 0 ? max : 1); + } + //! Copy-construct value at location pointed to by p. + + template + void construct(U *p, Args&&... args) + { ::new((void *)p) U(std::forward(args)...); } + + //! Destroy value at location pointed to by p. + void destroy( pointer p ) { p->~value_type(); } + +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning (pop) +#endif // warning 4100 is back + +//! 
Analogous to std::allocator, as defined in ISO C++ Standard, Section 20.4.1 +/** @ingroup memory_allocation */ +template +class memory_pool_allocator { +public: + typedef P pool_type; + typedef void* pointer; + typedef const void* const_pointer; + typedef void value_type; + template struct rebind { + typedef memory_pool_allocator other; + }; + + explicit memory_pool_allocator( pool_type &pool) throw() : my_pool(&pool) {} + memory_pool_allocator( const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + template + memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + +protected: + pool_type *my_pool; + template + friend class memory_pool_allocator; + template + friend bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b); + template + friend bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b); +}; + +template +inline bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b) {return a.my_pool==b.my_pool;} + +template +inline bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b) {return a.my_pool!=b.my_pool;} + +//! Thread-safe growable pool allocator for variable-size requests +template +class memory_pool : public pool_base { + Alloc my_alloc; // TODO: base-class optimization + static void *allocate_request(intptr_t pool_id, size_t & bytes); + static int deallocate_request(intptr_t pool_id, void*, size_t raw_bytes); + +public: + //! construct pool with underlying allocator + explicit memory_pool(const Alloc &src = Alloc()); + + //! destroy pool + ~memory_pool() { destroy(); } // call the callbacks first and destroy my_alloc latter +}; + +class fixed_pool : public pool_base { + void *my_buffer; + size_t my_size; + inline static void *allocate_request(intptr_t pool_id, size_t & bytes); + +public: + //! construct pool with underlying allocator + inline fixed_pool(void *buf, size_t size); + //! destroy pool + ~fixed_pool() { destroy(); } +}; + +//////////////// Implementation /////////////// + +template +memory_pool::memory_pool(const Alloc &src) : my_alloc(src) { + rml::MemPoolPolicy args(allocate_request, deallocate_request, + sizeof(typename Alloc::value_type)); + rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool); + if (res!=rml::POOL_OK) + throw_exception(std::runtime_error("Can't create pool")); +} +template +void *memory_pool::allocate_request(intptr_t pool_id, size_t & bytes) { + memory_pool &self = *reinterpret_cast*>(pool_id); + const size_t unit_size = sizeof(typename Alloc::value_type); + __TBBMALLOC_ASSERT( 0 == bytes%unit_size, nullptr); + void *ptr; +#if TBB_USE_EXCEPTIONS + try { +#endif + ptr = self.my_alloc.allocate( bytes/unit_size ); +#if TBB_USE_EXCEPTIONS + } catch(...) { + return nullptr; + } +#endif + return ptr; +} +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + // Workaround for erroneous "unreachable code" warning in the template below. 
+ // Specific for VC++ 17-18 compiler + #pragma warning (push) + #pragma warning (disable: 4702) +#endif +template +int memory_pool::deallocate_request(intptr_t pool_id, void* raw_ptr, size_t raw_bytes) { + memory_pool &self = *reinterpret_cast*>(pool_id); + const size_t unit_size = sizeof(typename Alloc::value_type); + __TBBMALLOC_ASSERT( 0 == raw_bytes%unit_size, nullptr); + self.my_alloc.deallocate( static_cast(raw_ptr), raw_bytes/unit_size ); + return 0; +} +#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED + #pragma warning (pop) +#endif +inline fixed_pool::fixed_pool(void *buf, size_t size) : my_buffer(buf), my_size(size) { + if (!buf || !size) + // TODO: improve support for mode with exceptions disabled + throw_exception(std::invalid_argument("Zero in parameter is invalid")); + rml::MemPoolPolicy args(allocate_request, nullptr, size, /*fixedPool=*/true); + rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool); + if (res!=rml::POOL_OK) + throw_exception(std::runtime_error("Can't create pool")); +} +inline void *fixed_pool::allocate_request(intptr_t pool_id, size_t & bytes) { + fixed_pool &self = *reinterpret_cast(pool_id); + __TBBMALLOC_ASSERT(0 != self.my_size, "The buffer must not be used twice."); + bytes = self.my_size; + self.my_size = 0; // remember that buffer has been used + return self.my_buffer; +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::memory_pool_allocator; +using detail::d1::memory_pool; +using detail::d1::fixed_pool; +} // inline namepspace v1 +} // namespace tbb + +#undef __TBBMALLOC_ASSERT +#endif// __TBB_memory_pool_H diff --git a/third_party/tbb/misc.cpp b/third_party/tbb/misc.cpp new file mode 100644 index 000000000..5358252b8 --- /dev/null +++ b/third_party/tbb/misc.cpp @@ -0,0 +1,176 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Source file for miscellaneous entities that are infrequently referenced by +// an executing program. + +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_machine.h" + +#include "third_party/tbb/version.h" + +#include "third_party/tbb/misc.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here. 
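// Illustration: how the memory_pool.h preview API introduced above might be
// used from application code. Assumes the second template parameter of
// memory_pool_allocator defaults to pool_base, as in upstream oneTBB, and
// that TBB_PREVIEW_MEMORY_POOL is defined before the include (the header
// errors out otherwise). Kept under "#if 0"; it is not part of this file.
#if 0
#define TBB_PREVIEW_MEMORY_POOL 1
#include "third_party/tbb/memory_pool.h"
#include <vector>

int main() {
    static char arena[1 << 20];
    tbb::fixed_pool pool(arena, sizeof(arena));      // every allocation is carved out of `arena`
    {
        using pool_alloc = tbb::memory_pool_allocator<int>;
        pool_alloc a(pool);
        std::vector<int, pool_alloc> v(a);           // the vector's storage comes from the pool
        for (int i = 0; i < 1000; ++i)
            v.push_back(i);
    }                                                // v returns its blocks to the pool
    pool.recycle();                                  // or release everything in one shot
    return 0;
}
#endif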
+#include "third_party/tbb/concurrent_monitor_mutex.h" + +#include "third_party/libcxx/cstdio" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/stdexcept" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdarg" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif + +#if !_WIN32 +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" // sysconf(_SC_PAGESIZE) +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// governor data +//------------------------------------------------------------------------ +cpu_features_type governor::cpu_features; + +//------------------------------------------------------------------------ +// concurrent_monitor_mutex data +//------------------------------------------------------------------------ +#if !__TBB_USE_FUTEX +std::mutex concurrent_monitor_mutex::my_init_mutex; +#endif + + +size_t DefaultSystemPageSize() { +#if _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +#else + return sysconf(_SC_PAGESIZE); +#endif +} + +/** The leading "\0" is here so that applying "strings" to the binary delivers a clean result. */ +static const char VersionString[] = "\0" TBB_VERSION_STRINGS; + +static bool PrintVersionFlag = false; + +void PrintVersion() { + PrintVersionFlag = true; + std::fputs(VersionString+1,stderr); +} + +void PrintExtraVersionInfo( const char* category, const char* format, ... ) { + if( PrintVersionFlag ) { + char str[1024]; std::memset(str, 0, 1024); + va_list args; va_start(args, format); + // Note: correct vsnprintf definition obtained from tbb_assert_impl.h + std::vsnprintf( str, 1024-1, format, args); + va_end(args); + std::fprintf(stderr, "oneTBB: %s\t%s\n", category, str ); + } +} + +//! check for transaction support. +#if _MSC_VER +// MISSING #include // for __cpuid +#endif + +#if __TBB_x86_32 || __TBB_x86_64 +void check_cpuid(int leaf, int sub_leaf, int registers[4]) { +#if _MSC_VER + __cpuidex(registers, leaf, sub_leaf); +#else + int reg_eax = 0; + int reg_ebx = 0; + int reg_ecx = 0; + int reg_edx = 0; +#if __TBB_x86_32 && __PIC__ + // On 32-bit systems with position-independent code GCC fails to work around the stuff in EBX + // register. We help it using backup and restore. 
+ __asm__("mov %%ebx, %%esi\n\t" + "cpuid\n\t" + "xchg %%ebx, %%esi" + : "=a"(reg_eax), "=S"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx) + : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx + ); +#else + __asm__("cpuid" + : "=a"(reg_eax), "=b"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx) + : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx + ); +#endif + registers[0] = reg_eax; + registers[1] = reg_ebx; + registers[2] = reg_ecx; + registers[3] = reg_edx; +#endif +} +#endif + +void detect_cpu_features(cpu_features_type& cpu_features) { + suppress_unused_warning(cpu_features); +#if __TBB_x86_32 || __TBB_x86_64 + const int rtm_ebx_mask = 1 << 11; + const int waitpkg_ecx_mask = 1 << 5; + int registers[4] = {0}; + + // Check RTM and WAITPKG + check_cpuid(7, 0, registers); + cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0; + cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0; +#endif /* (__TBB_x86_32 || __TBB_x86_64) */ +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/misc.h b/third_party/tbb/misc.h new file mode 100644 index 000000000..7f8ab5038 --- /dev/null +++ b/third_party/tbb/misc.h @@ -0,0 +1,298 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_tbb_misc_H +#define _TBB_tbb_misc_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" + +#if __TBB_ARENA_BINDING +#include "third_party/tbb/info.h" +#endif /*__TBB_ARENA_BINDING*/ + +#if __unix__ +#include "libc/intrin/newbie.h" +#include "libc/calls/calls.h" +#include "libc/calls/struct/rlimit.h" +#include "libc/calls/struct/rusage.h" +#include "libc/calls/sysparam.h" +#include "libc/calls/weirdtypes.h" +#include "libc/limits.h" +#include "libc/sysv/consts/endian.h" +#include "libc/sysv/consts/prio.h" +#include "libc/sysv/consts/rlim.h" +#include "libc/sysv/consts/rlimit.h" +#include "libc/sysv/consts/rusage.h" // __FreeBSD_version +#if __FreeBSD_version >= 701000 +// MISSING #include +#endif +#endif + +#include "third_party/libcxx/atomic" + +// Does the operating system have a system call to pin a thread to a set of OS processors? +#define __TBB_OS_AFFINITY_SYSCALL_PRESENT ((__linux__ && !__ANDROID__) || (__FreeBSD_version >= 701000)) +// On IBM* Blue Gene* CNK nodes, the affinity API has restrictions that prevent its usability for TBB, +// and also sysconf(_SC_NPROCESSORS_ONLN) already takes process affinity into account. +#define __TBB_USE_OS_AFFINITY_SYSCALL (__TBB_OS_AFFINITY_SYSCALL_PRESENT && !__bg__) + +namespace tbb { +namespace detail { +namespace r1 { + +void runtime_warning(const char* format, ... 
); + +#if __TBB_ARENA_BINDING +class task_arena; +class task_scheduler_observer; +#endif /*__TBB_ARENA_BINDING*/ + +const std::size_t MByte = 1024*1024; + +#if __TBB_USE_WINAPI +// The Microsoft Documentation about Thread Stack Size states that +// "The default stack reservation size used by the linker is 1 MB" +const std::size_t ThreadStackSize = 1*MByte; +#else +const std::size_t ThreadStackSize = (sizeof(uintptr_t) <= 4 ? 2 : 4 )*MByte; +#endif + +#ifndef __TBB_HardwareConcurrency + +//! Returns maximal parallelism level supported by the current OS configuration. +int AvailableHwConcurrency(); + +#else + +inline int AvailableHwConcurrency() { + int n = __TBB_HardwareConcurrency(); + return n > 0 ? n : 1; // Fail safety strap +} +#endif /* __TBB_HardwareConcurrency */ + +//! Returns OS regular memory page size +size_t DefaultSystemPageSize(); + +//! Returns number of processor groups in the current OS configuration. +/** AvailableHwConcurrency must be called at least once before calling this method. **/ +int NumberOfProcessorGroups(); + +#if _WIN32||_WIN64 + +//! Retrieves index of processor group containing processor with the given index +int FindProcessorGroupIndex ( int processorIndex ); + +//! Affinitizes the thread to the specified processor group +void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ); + +#endif /* _WIN32||_WIN64 */ + +//! Prints TBB version information on stderr +void PrintVersion(); + +//! Prints arbitrary extra TBB version information on stderr +void PrintExtraVersionInfo( const char* category, const char* format, ... ); + +//! A callback routine to print RML version information on stderr +void PrintRMLVersionInfo( void* arg, const char* server_info ); + +// For TBB compilation only; not to be used in public headers +#if defined(min) || defined(max) +#undef min +#undef max +#endif + +//! Utility template function returning lesser of the two values. +/** Provided here to avoid including not strict safe .\n + In case operands cause signed/unsigned or size mismatch warnings it is caller's + responsibility to do the appropriate cast before calling the function. **/ +template +T min ( const T& val1, const T& val2 ) { + return val1 < val2 ? val1 : val2; +} + +//! Utility template function returning greater of the two values. +/** Provided here to avoid including not strict safe .\n + In case operands cause signed/unsigned or size mismatch warnings it is caller's + responsibility to do the appropriate cast before calling the function. **/ +template +T max ( const T& val1, const T& val2 ) { + return val1 < val2 ? val2 : val1; +} + +//! Utility helper structure to ease overload resolution +template struct int_to_type {}; + +//------------------------------------------------------------------------ +// FastRandom +//------------------------------------------------------------------------ + +//! A fast random number generator. +/** Uses linear congruential method. */ +class FastRandom { +private: + unsigned x, c; + static const unsigned a = 0x9e3779b1; // a big prime number +public: + //! Get a random number. + unsigned short get() { + return get(x); + } + //! Get a random number for the given seed; update the seed for next use. + unsigned short get( unsigned& seed ) { + unsigned short r = (unsigned short)(seed>>16); + __TBB_ASSERT(c&1, "c must be odd for big rng period"); + seed = seed*a+c; + return r; + } + //! Construct a random number generator. 
+ FastRandom( void* unique_ptr ) { init(uintptr_t(unique_ptr)); } + + template + void init( T seed ) { + init(seed,int_to_type()); + } + void init( uint64_t seed , int_to_type<8> ) { + init(uint32_t((seed>>32)+seed), int_to_type<4>()); + } + void init( uint32_t seed, int_to_type<4> ) { + // threads use different seeds for unique sequences + c = (seed|1)*0xba5703f5; // c must be odd, shuffle by a prime number + x = c^(seed>>1); // also shuffle x for the first get() invocation + } +}; + +//------------------------------------------------------------------------ +// Atomic extensions +//------------------------------------------------------------------------ + +//! Atomically replaces value of dst with newValue if they satisfy condition of compare predicate +/** Return value semantics is the same as for CAS. **/ +template +T1 atomic_update(std::atomic& dst, T1 newValue, Pred compare) { + T1 oldValue = dst.load(std::memory_order_acquire); + while ( compare(oldValue, newValue) ) { + if ( dst.compare_exchange_strong(oldValue, newValue) ) + break; + } + return oldValue; +} + +#if __TBB_USE_OS_AFFINITY_SYSCALL + #if __linux__ + typedef cpu_set_t basic_mask_t; + #elif __FreeBSD_version >= 701000 + typedef cpuset_t basic_mask_t; + #else + #error affinity_helper is not implemented in this OS + #endif + class affinity_helper : no_copy { + basic_mask_t* threadMask; + int is_changed; + public: + affinity_helper() : threadMask(nullptr), is_changed(0) {} + ~affinity_helper(); + void protect_affinity_mask( bool restore_process_mask ); + void dismiss(); + }; + void destroy_process_mask(); +#else + class affinity_helper : no_copy { + public: + void protect_affinity_mask( bool ) {} + }; + inline void destroy_process_mask(){} +#endif /* __TBB_USE_OS_AFFINITY_SYSCALL */ + +struct cpu_features_type { + bool rtm_enabled{false}; + bool waitpkg_enabled{false}; +}; + +void detect_cpu_features(cpu_features_type& cpu_features); + +#if __TBB_ARENA_BINDING +class binding_handler; + +binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core); +void destroy_binding_handler(binding_handler* handler_ptr); +void apply_affinity_mask(binding_handler* handler_ptr, int slot_num); +void restore_affinity_mask(binding_handler* handler_ptr, int slot_num); + +#endif /*__TBB_ARENA_BINDING*/ + +// RTM specific section +// abort code for mutexes that detect a conflict with another thread. +enum { + speculation_not_supported = 0x00, + speculation_transaction_aborted = 0x01, + speculation_can_retry = 0x02, + speculation_memadd_conflict = 0x04, + speculation_buffer_overflow = 0x08, + speculation_breakpoint_hit = 0x10, + speculation_nested_abort = 0x20, + speculation_xabort_mask = 0xFF000000, + speculation_xabort_shift = 24, + speculation_xabort_not_free = 0xFF, // The value (0xFF) below comes from the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual 12.4.5 lock not free + speculation_successful_begin = 0xFFFFFFFF, + speculation_retry = speculation_transaction_aborted + | speculation_can_retry + | speculation_memadd_conflict +}; + +// We suppose that successful transactions are sequentially ordered and +// do not require additional memory fences around them. +// Technically it can be achieved only if xbegin has implicit +// acquire memory semantics an xend/xabort has release memory semantics on compiler and hardware level. 
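// Illustration: a typical use of the atomic_update helper above, keeping a
// high-water mark across threads. peak_depth and note_depth are placeholder
// names for this sketch; atomic_update itself lives in tbb::detail::r1.
// Kept under "#if 0" so it does not affect this translation unit.
#if 0
#include <atomic>
#include <cstddef>

std::atomic<std::size_t> peak_depth{0};

void note_depth(std::size_t depth) {
    // The predicate returns true while the stored value should still be
    // replaced, so the CAS loop inside atomic_update keeps the maximum seen.
    atomic_update(peak_depth, depth,
                  [](std::size_t current, std::size_t candidate) { return current < candidate; });
}
#endif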
+// See the article: https://arxiv.org/pdf/1710.04839.pdf +static inline unsigned int begin_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + return _xbegin(); +#else + return speculation_not_supported; // return unsuccessful code +#endif +} + +static inline void end_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + _xend(); +#endif +} + +static inline void abort_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + _xabort(speculation_xabort_not_free); +#endif +} + +#if TBB_USE_ASSERT +static inline unsigned char is_in_transaction() { +#if __TBB_TSX_INTRINSICS_PRESENT + return _xtest(); +#else + return 0; +#endif +} +#endif // TBB_USE_ASSERT + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_tbb_misc_H */ diff --git a/third_party/tbb/misc_ex.cpp b/third_party/tbb/misc_ex.cpp new file mode 100644 index 000000000..feb0ad05f --- /dev/null +++ b/third_party/tbb/misc_ex.cpp @@ -0,0 +1,457 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Source file for miscellaneous entities that are infrequently referenced by +// an executing program, and implementation of which requires dynamic linking. + +#include "third_party/tbb/misc.h" + +#if !defined(__TBB_HardwareConcurrency) + +#include "third_party/tbb/dynamic_link.h" +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/fmt/fmt.h" +#include "libc/stdio/dprintf.h" +#include "libc/stdio/stdio.h" +#include "libc/stdio/temp.h" +#include "third_party/musl/tempnam.h" +#include "libc/limits.h" +#include "libc/sysv/consts/_posix.h" +#include "libc/sysv/consts/iov.h" +#include "libc/sysv/consts/limits.h" +#include "libc/sysv/consts/xopen.h" +#include "libc/thread/thread.h" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#if __TBB_WIN8UI_SUPPORT +#include "third_party/libcxx/thread" +#endif +#else +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" +#if __unix__ +#if __linux__ +#include "libc/calls/calls.h" +#include "libc/calls/struct/sysinfo.h" +#endif +#include "third_party/libcxx/cstring" 
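// Illustration: a simplified lock-elision sketch built on the transaction
// helpers declared in misc.h (begin_transaction / end_transaction /
// abort_transaction). This is not the actual rtm_mutex implementation; the
// flag, the retry policy and the names are assumptions for this sketch only.
// Kept under "#if 0" so it does not affect this translation unit.
#if 0
#include <atomic>

using namespace tbb::detail::r1;                     // where the helpers live

static std::atomic<bool> fallback_busy{false};       // fallback spin flag (hypothetical)
static long shared_value = 0;                        // state guarded by the elided lock (hypothetical)

void elided_increment() {
    if (begin_transaction() == speculation_successful_begin) {
        // Speculative path: abort if somebody holds the fallback lock, so the
        // two paths can never run concurrently.
        if (fallback_busy.load(std::memory_order_relaxed))
            abort_transaction();
        ++shared_value;
        end_transaction();                           // commit
        return;
    }
    // TSX unavailable or the transaction aborted: take the fallback lock.
    while (fallback_busy.exchange(true, std::memory_order_acquire)) { /* spin */ }
    ++shared_value;
    fallback_busy.store(false, std::memory_order_release);
}
#endif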
+#include "libc/calls/calls.h" +#include "libc/calls/struct/cpuset.h" +#include "libc/calls/struct/sched_param.h" +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/sched.h" +#include "third_party/libcxx/cerrno" +#elif __sun +#include "libc/calls/calls.h" +#include "libc/calls/struct/sysinfo.h" +#elif __FreeBSD__ +#include "third_party/libcxx/cerrno" +#include "third_party/libcxx/cstring" +#include "libc/intrin/newbie.h" +#include "libc/calls/calls.h" +#include "libc/calls/struct/rlimit.h" +#include "libc/calls/struct/rusage.h" +#include "libc/calls/sysparam.h" +#include "libc/calls/weirdtypes.h" +#include "libc/limits.h" +#include "libc/sysv/consts/endian.h" +#include "libc/sysv/consts/prio.h" +#include "libc/sysv/consts/rlim.h" +#include "libc/sysv/consts/rlimit.h" +#include "libc/sysv/consts/rusage.h" // Required by +// MISSING #include +#endif +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_USE_OS_AFFINITY_SYSCALL + +#if __unix__ +// Handlers for interoperation with libiomp +static int (*libiomp_try_restoring_original_mask)(); +// Table for mapping to libiomp entry points +static const dynamic_link_descriptor iompLinkTable[] = { + DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask ) +}; +#endif + +static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) { +#if __FreeBSD__ || __NetBSD__ || __OpenBSD__ + if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) +#else /* __unix__ */ + if( sched_setaffinity( 0, maskSize, threadMask ) ) +#endif + // Here and below the error severity is lowered from critical level + // because it may happen during TBB library unload because of not + // waiting for workers to complete (current RML policy, to be fixed). + // handle_perror( errno, "setaffinity syscall" ); + runtime_warning( "setaffinity syscall failed" ); +} + +static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) { +#if __FreeBSD__ || __NetBSD__ || __OpenBSD__ + if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) +#else /* __unix__ */ + if( sched_getaffinity( 0, maskSize, threadMask ) ) +#endif + runtime_warning( "getaffinity syscall failed" ); +} + +static basic_mask_t* process_mask; +static int num_masks; + +void destroy_process_mask() { + delete [] process_mask; + process_mask = nullptr; +} + +#define curMaskSize sizeof(basic_mask_t) * num_masks +affinity_helper::~affinity_helper() { + if( threadMask ) { + if( is_changed ) { + set_thread_affinity_mask( curMaskSize, threadMask ); + } + delete [] threadMask; + } +} +void affinity_helper::protect_affinity_mask( bool restore_process_mask ) { + if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity? + threadMask = new basic_mask_t [num_masks]; + std::memset( threadMask, 0, curMaskSize ); + get_thread_affinity_mask( curMaskSize, threadMask ); + if( restore_process_mask ) { + __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" ); + is_changed = memcmp( process_mask, threadMask, curMaskSize ); + if( is_changed ) + set_thread_affinity_mask( curMaskSize, process_mask ); + } else { + // Assume that the mask will be changed by the caller. 
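// Illustration: the intended RAII use of affinity_helper (declared in misc.h).
// protect_affinity_mask() snapshots the current thread mask; the destructor
// restores it unless dismiss() is called. The function name below is a
// placeholder for this sketch. Kept under "#if 0".
#if 0
void run_with_temporary_affinity() {
    affinity_helper guard;
    guard.protect_affinity_mask(/*restore_process_mask=*/false);  // snapshot; we intend to change the mask
    // ... pin the thread to some CPUs and run the work ...
}                                                                 // leaving the scope restores the snapshot
#endif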
+ is_changed = 1; + } + } +} +void affinity_helper::dismiss() { + delete [] threadMask; + threadMask = nullptr; + is_changed = 0; +} +#undef curMaskSize + +static std::atomic hardware_concurrency_info; + +static int theNumProcs; + +static void initialize_hardware_concurrency_info () { + int err; + int availableProcs = 0; + int numMasks = 1; + int maxProcs = sysconf(_SC_NPROCESSORS_ONLN); + basic_mask_t* processMask; + const std::size_t BasicMaskSize = sizeof(basic_mask_t); + for (;;) { + const int curMaskSize = BasicMaskSize * numMasks; + processMask = new basic_mask_t[numMasks]; + std::memset( processMask, 0, curMaskSize ); +#if __FreeBSD__ || __NetBSD__ || __OpenBSD__ + // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask + err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask ); + if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 ) + break; +#else /* __unix__ */ + int pid = getpid(); + err = sched_getaffinity( pid, curMaskSize, processMask ); + if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 ) + break; +#endif + delete[] processMask; + numMasks <<= 1; + } + if ( !err ) { + // We have found the mask size and captured the process affinity mask into processMask. + num_masks = numMasks; // do here because it's needed for affinity_helper to work +#if __unix__ + // For better coexistence with libiomp which might have changed the mask already, + // check for its presence and ask it to restore the mask. + dynamic_link_handle libhandle; + if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) { + // We have found the symbol provided by libiomp5 for restoring original thread affinity. + affinity_helper affhelp; + affhelp.protect_affinity_mask( /*restore_process_mask=*/false ); + if ( libiomp_try_restoring_original_mask()==0 ) { + // Now we have the right mask to capture, restored by libiomp. + const int curMaskSize = BasicMaskSize * numMasks; + std::memset( processMask, 0, curMaskSize ); + get_thread_affinity_mask( curMaskSize, processMask ); + } else + affhelp.dismiss(); // thread mask has not changed + dynamic_unlink( libhandle ); + // Destructor of affinity_helper restores the thread mask (unless dismissed). + } +#endif + for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) { + for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) { + if ( CPU_ISSET( i, processMask + m ) ) + ++availableProcs; + } + } + process_mask = processMask; + } + else { + // Failed to get the process affinity mask; assume the whole machine can be used. + availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs; + delete[] processMask; + } + theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap + __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr); +} + +int AvailableHwConcurrency() { + atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); + return theNumProcs; +} + +/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */ +#elif __ANDROID__ + +// Work-around for Android that reads the correct number of available CPUs since system calls are unreliable. 
+// Format of "present" file is: ([-|],)+ +int AvailableHwConcurrency() { + FILE *fp = fopen("/sys/devices/system/cpu/present", "r"); + if (fp == nullptr) return 1; + int num_args, lower, upper, num_cpus=0; + while ((num_args = fscanf(fp, "%u-%u", &lower, &upper)) != EOF) { + switch(num_args) { + case 2: num_cpus += upper - lower + 1; break; + case 1: num_cpus += 1; break; + } + fscanf(fp, ","); + } + return (num_cpus > 0) ? num_cpus : 1; +} + +#elif defined(_SC_NPROCESSORS_ONLN) + +int AvailableHwConcurrency() { + int n = sysconf(_SC_NPROCESSORS_ONLN); + return (n > 0) ? n : 1; +} + +#elif _WIN32||_WIN64 + +static std::atomic hardware_concurrency_info; + +static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff; + +// Statically allocate an array for processor group information. +// Windows 7 supports maximum 4 groups, but let's look ahead a little. +static const WORD MaxProcessorGroups = 64; + +struct ProcessorGroupInfo { + DWORD_PTR mask; ///< Affinity mask covering the whole group + int numProcs; ///< Number of processors in the group + int numProcsRunningTotal; ///< Subtotal of processors in this and preceding groups + + //! Total number of processor groups in the system + static int NumGroups; + + //! Index of the group with a slot reserved for the first external thread + /** In the context of multiple processor groups support current implementation + defines "the first external thread" as the first thread to invoke + AvailableHwConcurrency(). + + TODO: Implement a dynamic scheme remapping workers depending on the pending + external threads affinity. **/ + static int HoleIndex; +}; + +int ProcessorGroupInfo::NumGroups = 1; +int ProcessorGroupInfo::HoleIndex = 0; + +ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups]; +int calculate_numa[MaxProcessorGroups]; //Array needed for FindProcessorGroupIndex to calculate Processor Group when number of threads > number of cores to distribute threads evenly between processor groups +int numaSum; +struct TBB_GROUP_AFFINITY { + DWORD_PTR Mask; + WORD Group; + WORD Reserved[3]; +}; + +static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr; +static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr; +static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread, + const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff ); +static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* ); + +static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = { + DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount) + , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount) + , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity) + , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity) +}; + +static void initialize_hardware_concurrency_info () { + suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS); +#if __TBB_WIN8UI_SUPPORT + // For these applications processor groups info is unavailable + // Setting up a number of processors for one processor group + theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency(); +#else /* __TBB_WIN8UI_SUPPORT */ + dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable, + sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) ); + SYSTEM_INFO si; + GetNativeSystemInfo(&si); + DWORD_PTR pam, sam, m = 1; + GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam ); + int nproc = 0; + for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) { + if ( 
pam & m ) + ++nproc; + } + __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr); + // By default setting up a number of processors for one processor group + theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc; + // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present + if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) { + // The process does not have restricting affinity mask and multiple processor groups are possible + ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount(); + __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr); + // Fail safety bootstrap. Release versions will limit available concurrency + // level, while debug ones would assert. + if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups ) + ProcessorGroupInfo::NumGroups = MaxProcessorGroups; + if ( ProcessorGroupInfo::NumGroups > 1 ) { + TBB_GROUP_AFFINITY ga; + if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) ) + ProcessorGroupInfo::HoleIndex = ga.Group; + int nprocs = 0; + int min_procs = INT_MAX; + for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) { + ProcessorGroupInfo &pgi = theProcessorGroups[i]; + pgi.numProcs = (int)TBB_GetActiveProcessorCount(i); + if (pgi.numProcs < min_procs) min_procs = pgi.numProcs; //Finding the minimum number of processors in the Processor Groups + calculate_numa[i] = pgi.numProcs; + __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr); + pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1; + pgi.numProcsRunningTotal = nprocs += pgi.numProcs; + } + __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr); + + calculate_numa[0] = (calculate_numa[0] / min_procs)-1; + for (WORD i = 1; i < ProcessorGroupInfo::NumGroups; ++i) { + calculate_numa[i] = calculate_numa[i-1] + (calculate_numa[i] / min_procs); + } + + numaSum = calculate_numa[ProcessorGroupInfo::NumGroups - 1]; + + } + + } +#endif /* __TBB_WIN8UI_SUPPORT */ + + PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups); + if (ProcessorGroupInfo::NumGroups>1) + for (int i=0; i= theProcessorGroups[current_grp_idx].numProcs && procIdx < theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { + procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; + do { + current_grp_idx = (current_grp_idx + 1) % (ProcessorGroupInfo::NumGroups); + procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; + + } while (procIdx >= 0); + } + else if (procIdx >= theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { + int temp_grp_index = 0; + procIdx = procIdx - theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; + procIdx = procIdx % (numaSum+1); //ProcIdx to stay between 0 and numaSum + + while (procIdx - calculate_numa[temp_grp_index] > 0) { + temp_grp_index = (temp_grp_index + 1) % ProcessorGroupInfo::NumGroups; + } + current_grp_idx = temp_grp_index; + } + __TBB_ASSERT(current_grp_idx < ProcessorGroupInfo::NumGroups, nullptr); + + return current_grp_idx; +} + +void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) { + __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" ); + if ( !TBB_SetThreadGroupAffinity ) + return; + TBB_GROUP_AFFINITY 
ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} }; + TBB_SetThreadGroupAffinity( hThread, &ga, nullptr); +} + +int AvailableHwConcurrency() { + atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); + return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; +} + +/* End of _WIN32||_WIN64 implementation */ +#else + #error AvailableHwConcurrency is not implemented for this OS +#endif + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* !__TBB_HardwareConcurrency */ diff --git a/third_party/tbb/mutex.h b/third_party/tbb/mutex.h new file mode 100644 index 000000000..791ba7798 --- /dev/null +++ b/third_party/tbb/mutex.h @@ -0,0 +1,94 @@ +// clang-format off +/* + Copyright (c) 2021-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_mutex_H +#define __TBB_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_scoped_lock.h" +#include "third_party/tbb/detail/_waitable_atomic.h" +#include "third_party/tbb/detail/_mutex_common.h" +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +namespace d1 { + +class mutex { +public: + //! Constructors + mutex() { + create_itt_sync(this, "tbb::mutex", ""); + }; + + //! Destructor + ~mutex() = default; + + //! No Copy + mutex(const mutex&) = delete; + mutex& operator=(const mutex&) = delete; + + using scoped_lock = unique_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + /** Spin if the lock is taken */ + void lock() { + call_itt_notify(prepare, this); + while (!try_lock()) { + my_flag.wait(true, /* context = */ 0, std::memory_order_relaxed); + } + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + bool result = !my_flag.load(std::memory_order_relaxed) && !my_flag.exchange(true); + if (result) { + call_itt_notify(acquired, this); + } + return result; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + // We need Write Read memory barrier before notify that reads the waiter list. + // In C++ only full fence covers this type of barrier. 
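+        // exchange() (a sequentially consistent read-modify-write) supplies that
+        // full fence; a plain store with release ordering would not.
+        // notify_one_relaxed() then wakes at most one thread parked in lock()'s
+        // my_flag.wait() loop above.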
+ my_flag.exchange(false); + my_flag.notify_one_relaxed(); + } + +private: + waitable_atomic my_flag{0}; +}; // class mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::mutex; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_mutex_H diff --git a/third_party/tbb/null_mutex.h b/third_party/tbb/null_mutex.h new file mode 100644 index 000000000..d0c3cfb99 --- /dev/null +++ b/third_party/tbb/null_mutex.h @@ -0,0 +1,81 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_null_mutex_H +#define __TBB_null_mutex_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A mutex which does nothing +/** A null_mutex does no operation and simulates success. + @ingroup synchronization */ +class null_mutex { +public: + //! Constructors + constexpr null_mutex() noexcept = default; + + //! Destructor + ~null_mutex() = default; + + //! No Copy + null_mutex(const null_mutex&) = delete; + null_mutex& operator=(const null_mutex&) = delete; + + //! Represents acquisition of a mutex. + class scoped_lock { + public: + //! Constructors + constexpr scoped_lock() noexcept = default; + scoped_lock(null_mutex&) {} + + //! Destructor + ~scoped_lock() = default; + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + void acquire(null_mutex&) {} + bool try_acquire(null_mutex&) { return true; } + void release() {} + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = true; + static constexpr bool is_fair_mutex = true; + + void lock() {} + bool try_lock() { return true; } + void unlock() {} +}; // class null_mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::null_mutex; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_null_mutex_H */ diff --git a/third_party/tbb/null_rw_mutex.h b/third_party/tbb/null_rw_mutex.h new file mode 100644 index 000000000..d8dff0488 --- /dev/null +++ b/third_party/tbb/null_rw_mutex.h @@ -0,0 +1,88 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_null_rw_mutex_H +#define __TBB_null_rw_mutex_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! A rw mutex which does nothing +/** A null_rw_mutex is a rw mutex that does nothing and simulates successful operation. + @ingroup synchronization */ +class null_rw_mutex { +public: + //! Constructors + constexpr null_rw_mutex() noexcept = default; + + //! Destructor + ~null_rw_mutex() = default; + + //! No Copy + null_rw_mutex(const null_rw_mutex&) = delete; + null_rw_mutex& operator=(const null_rw_mutex&) = delete; + + //! Represents acquisition of a mutex. + class scoped_lock { + public: + //! Constructors + constexpr scoped_lock() noexcept = default; + scoped_lock(null_rw_mutex&, bool = true) {} + + //! Destructor + ~scoped_lock() = default; + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + void acquire(null_rw_mutex&, bool = true) {} + bool try_acquire(null_rw_mutex&, bool = true) { return true; } + void release() {} + bool upgrade_to_writer() { return true; } + bool downgrade_to_reader() { return true; } + + bool is_writer() const { return true; } + }; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = true; + static constexpr bool is_fair_mutex = true; + + void lock() {} + bool try_lock() { return true; } + void unlock() {} + void lock_shared() {} + bool try_lock_shared() { return true; } + void unlock_shared() {} +}; // class null_rw_mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::null_rw_mutex; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_null_rw_mutex_H */ diff --git a/third_party/tbb/observer_proxy.cpp b/third_party/tbb/observer_proxy.cpp new file mode 100644 index 000000000..463f57809 --- /dev/null +++ b/third_party/tbb/observer_proxy.cpp @@ -0,0 +1,320 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/tbb/observer_proxy.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/main.h" +#include "third_party/tbb/thread_data.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +#if TBB_USE_ASSERT +extern std::atomic the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ + +observer_proxy::observer_proxy( d1::task_scheduler_observer& tso ) + : my_ref_count(1), my_list(nullptr), my_next(nullptr), my_prev(nullptr), my_observer(&tso) +{ +#if TBB_USE_ASSERT + ++the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ +} + +observer_proxy::~observer_proxy() { + __TBB_ASSERT( !my_ref_count, "Attempt to destroy proxy still in use" ); + poison_value(my_ref_count); + poison_pointer(my_prev); + poison_pointer(my_next); +#if TBB_USE_ASSERT + --the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ +} + +void observer_list::clear() { + { + scoped_lock lock(mutex(), /*is_writer=*/true); + observer_proxy *next = my_head.load(std::memory_order_relaxed); + while ( observer_proxy *p = next ) { + next = p->my_next; + // Both proxy p and observer p->my_observer (if non-null) are guaranteed + // to be alive while the list is locked. + d1::task_scheduler_observer *obs = p->my_observer; + // Make sure that possible concurrent observer destruction does not + // conflict with the proxy list cleanup. + if (!obs || !(p = obs->my_proxy.exchange(nullptr))) { + continue; + } + // accessing 'obs' after detaching of obs->my_proxy leads to the race with observer destruction + __TBB_ASSERT(!next || p == next->my_prev, nullptr); + __TBB_ASSERT(is_alive(p->my_ref_count), "Observer's proxy died prematurely"); + __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed) == 1, "Reference for observer is missing"); + poison_pointer(p->my_observer); + remove(p); + --p->my_ref_count; + delete p; + } + } + + // If observe(false) is called concurrently with the destruction of the arena, + // need to wait until all proxies are removed. 
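+    // Re-acquiring the reader lock on every iteration lets a concurrent
+    // observe(false) take the writer lock in between and finish detaching its
+    // proxy, while atomic_backoff keeps this polling loop from spinning hard.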
+ for (atomic_backoff backoff; ; backoff.pause()) { + scoped_lock lock(mutex(), /*is_writer=*/false); + if (my_head.load(std::memory_order_relaxed) == nullptr) { + break; + } + } + + __TBB_ASSERT(my_head.load(std::memory_order_relaxed) == nullptr && my_tail.load(std::memory_order_relaxed) == nullptr, nullptr); +} + +void observer_list::insert( observer_proxy* p ) { + scoped_lock lock(mutex(), /*is_writer=*/true); + if (my_head.load(std::memory_order_relaxed)) { + p->my_prev = my_tail.load(std::memory_order_relaxed); + my_tail.load(std::memory_order_relaxed)->my_next = p; + } else { + my_head.store(p, std::memory_order_relaxed); + } + my_tail.store(p, std::memory_order_relaxed); +} + +void observer_list::remove(observer_proxy* p) { + __TBB_ASSERT(my_head.load(std::memory_order_relaxed), "Attempt to remove an item from an empty list"); + __TBB_ASSERT(!my_tail.load(std::memory_order_relaxed)->my_next, "Last item's my_next must be nullptr"); + if (p == my_tail.load(std::memory_order_relaxed)) { + __TBB_ASSERT(!p->my_next, nullptr); + my_tail.store(p->my_prev, std::memory_order_relaxed); + } else { + __TBB_ASSERT(p->my_next, nullptr); + p->my_next->my_prev = p->my_prev; + } + if (p == my_head.load(std::memory_order_relaxed)) { + __TBB_ASSERT(!p->my_prev, nullptr); + my_head.store(p->my_next, std::memory_order_relaxed); + } else { + __TBB_ASSERT(p->my_prev, nullptr); + p->my_prev->my_next = p->my_next; + } + __TBB_ASSERT((my_head.load(std::memory_order_relaxed) && my_tail.load(std::memory_order_relaxed)) || + (!my_head.load(std::memory_order_relaxed) && !my_tail.load(std::memory_order_relaxed)), nullptr); +} + +void observer_list::remove_ref(observer_proxy* p) { + std::uintptr_t r = p->my_ref_count.load(std::memory_order_acquire); + __TBB_ASSERT(is_alive(r), nullptr); + while (r > 1) { + if (p->my_ref_count.compare_exchange_strong(r, r - 1)) { + return; + } + } + __TBB_ASSERT(r == 1, nullptr); + // Reference count might go to zero + { + // Use lock to avoid resurrection by a thread concurrently walking the list + observer_list::scoped_lock lock(mutex(), /*is_writer=*/true); + r = --p->my_ref_count; + if (!r) { + remove(p); + } + } + __TBB_ASSERT(r || !p->my_ref_count, nullptr); + if (!r) { + delete p; + } +} + +void observer_list::do_notify_entry_observers(observer_proxy*& last, bool worker) { + // Pointer p marches though the list from last (exclusively) to the end. + observer_proxy* p = last, * prev = p; + for (;;) { + d1::task_scheduler_observer* tso = nullptr; + // Hold lock on list only long enough to advance to the next proxy in the list. + { + scoped_lock lock(mutex(), /*is_writer=*/false); + do { + if (p) { + // We were already processing the list. + if (observer_proxy* q = p->my_next) { + if (p == prev) { + remove_ref_fast(prev); // sets prev to nullptr if successful + } + p = q; + } else { + // Reached the end of the list. + if (p == prev) { + // Keep the reference as we store the 'last' pointer in scheduler + __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)) >= 1 + (p->my_observer ? 
1 : 0), nullptr); + } else { + // The last few proxies were empty + __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)), nullptr); + ++p->my_ref_count; + if (prev) { + lock.release(); + remove_ref(prev); + } + } + last = p; + return; + } + } else { + // Starting pass through the list + p = my_head.load(std::memory_order_relaxed); + if (!p) { + return; + } + } + tso = p->my_observer; + } while (!tso); + ++p->my_ref_count; + ++tso->my_busy_count; + } + __TBB_ASSERT(!prev || p != prev, nullptr); + // Release the proxy pinned before p + if (prev) { + remove_ref(prev); + } + // Do not hold any locks on the list while calling user's code. + // Do not intercept any exceptions that may escape the callback so that + // they are either handled by the TBB scheduler or passed to the debugger. + tso->on_scheduler_entry(worker); + __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed), nullptr); + intptr_t bc = --tso->my_busy_count; + __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed"); + prev = p; + } +} + +void observer_list::do_notify_exit_observers(observer_proxy* last, bool worker) { + // Pointer p marches though the list from the beginning to last (inclusively). + observer_proxy* p = nullptr, * prev = nullptr; + for (;;) { + d1::task_scheduler_observer* tso = nullptr; + // Hold lock on list only long enough to advance to the next proxy in the list. + { + scoped_lock lock(mutex(), /*is_writer=*/false); + do { + if (p) { + // We were already processing the list. + if (p != last) { + __TBB_ASSERT(p->my_next, "List items before 'last' must have valid my_next pointer"); + if (p == prev) + remove_ref_fast(prev); // sets prev to nullptr if successful + p = p->my_next; + } else { + // remove the reference from the last item + remove_ref_fast(p); + if (p) { + lock.release(); + if (p != prev && prev) { + remove_ref(prev); + } + remove_ref(p); + } + return; + } + } else { + // Starting pass through the list + p = my_head.load(std::memory_order_relaxed); + __TBB_ASSERT(p, "Nonzero 'last' must guarantee that the global list is non-empty"); + } + tso = p->my_observer; + } while (!tso); + // The item is already refcounted + if (p != last) // the last is already referenced since entry notification + ++p->my_ref_count; + ++tso->my_busy_count; + } + __TBB_ASSERT(!prev || p != prev, nullptr); + if (prev) + remove_ref(prev); + // Do not hold any locks on the list while calling user's code. + // Do not intercept any exceptions that may escape the callback so that + // they are either handled by the TBB scheduler or passed to the debugger. 
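+        // my_busy_count is what observe(false) spin-waits on, so the observer
+        // object cannot be destroyed while this callback is still executing.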
+ tso->on_scheduler_exit(worker); + __TBB_ASSERT(p->my_ref_count || p == last, nullptr); + intptr_t bc = --tso->my_busy_count; + __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed"); + prev = p; + } +} + +void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer &tso, bool enable) { + if( enable ) { + if( !tso.my_proxy.load(std::memory_order_relaxed) ) { + observer_proxy* p = new observer_proxy(tso); + tso.my_proxy.store(p, std::memory_order_relaxed); + tso.my_busy_count.store(0, std::memory_order_relaxed); + + thread_data* td = governor::get_thread_data_if_initialized(); + if (p->my_observer->my_task_arena == nullptr) { + if (!(td && td->my_arena)) { + td = governor::get_thread_data(); + } + __TBB_ASSERT(__TBB_InitOnce::initialization_done(), nullptr); + __TBB_ASSERT(td && td->my_arena, nullptr); + p->my_list = &td->my_arena->my_observers; + } else { + d1::task_arena* ta = p->my_observer->my_task_arena; + arena* a = ta->my_arena.load(std::memory_order_acquire); + if (a == nullptr) { // Avoid recursion during arena initialization + ta->initialize(); + a = ta->my_arena.load(std::memory_order_relaxed); + } + __TBB_ASSERT(a != nullptr, nullptr); + p->my_list = &a->my_observers; + } + p->my_list->insert(p); + // Notify newly activated observer and other pending ones if it belongs to current arena + if (td && td->my_arena && &td->my_arena->my_observers == p->my_list) { + p->my_list->notify_entry_observers(td->my_last_observer, td->my_is_worker); + } + } + } else { + // Make sure that possible concurrent proxy list cleanup does not conflict + // with the observer destruction here. + if ( observer_proxy* proxy = tso.my_proxy.exchange(nullptr) ) { + // List destruction should not touch this proxy after we've won the above interlocked exchange. + __TBB_ASSERT( proxy->my_observer == &tso, nullptr); + __TBB_ASSERT( is_alive(proxy->my_ref_count.load(std::memory_order_relaxed)), "Observer's proxy died prematurely" ); + __TBB_ASSERT( proxy->my_ref_count.load(std::memory_order_relaxed) >= 1, "reference for observer missing" ); + observer_list &list = *proxy->my_list; + { + // Ensure that none of the list walkers relies on observer pointer validity + observer_list::scoped_lock lock(list.mutex(), /*is_writer=*/true); + proxy->my_observer = nullptr; + // Proxy may still be held by other threads (to track the last notified observer) + if( !--proxy->my_ref_count ) {// nobody can increase it under exclusive lock + list.remove(proxy); + __TBB_ASSERT( !proxy->my_ref_count, nullptr); + delete proxy; + } + } + spin_wait_until_eq(tso.my_busy_count, 0); // other threads are still accessing the callback + } + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/observer_proxy.h b/third_party/tbb/observer_proxy.h new file mode 100644 index 000000000..78a692423 --- /dev/null +++ b/third_party/tbb/observer_proxy.h @@ -0,0 +1,153 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_observer_proxy_H +#define __TBB_observer_proxy_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_aligned_space.h" + +#include "third_party/tbb/task_scheduler_observer.h" +#include "third_party/tbb/spin_rw_mutex.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class observer_list { + friend class arena; + + // Mutex is wrapped with aligned_space to shut up warnings when its destructor + // is called while threads are still using it. + typedef aligned_space my_mutex_type; + + //! Pointer to the head of this list. + std::atomic my_head{nullptr}; + + //! Pointer to the tail of this list. + std::atomic my_tail{nullptr}; + + //! Mutex protecting this list. + my_mutex_type my_mutex; + + //! Back-pointer to the arena this list belongs to. + arena* my_arena; + + //! Decrement refcount of the proxy p if there are other outstanding references. + /** In case of success sets p to nullptr. Must be invoked from under the list lock. **/ + inline static void remove_ref_fast( observer_proxy*& p ); + + //! Implements notify_entry_observers functionality. + void do_notify_entry_observers( observer_proxy*& last, bool worker ); + + //! Implements notify_exit_observers functionality. + void do_notify_exit_observers( observer_proxy* last, bool worker ); + +public: + observer_list () = default; + + //! Removes and destroys all observer proxies from the list. + /** Cannot be used concurrently with other methods. **/ + void clear (); + + //! Add observer proxy to the tail of the list. + void insert ( observer_proxy* p ); + + //! Remove observer proxy from the list. + void remove ( observer_proxy* p ); + + //! Decrement refcount of the proxy and destroy it if necessary. + /** When refcount reaches zero removes the proxy from the list and destructs it. **/ + void remove_ref( observer_proxy* p ); + + //! Type of the scoped lock for the reader-writer mutex associated with the list. + typedef spin_rw_mutex::scoped_lock scoped_lock; + + //! Accessor to the reader-writer mutex associated with the list. + spin_rw_mutex& mutex () { return my_mutex.begin()[0]; } + + //! Call entry notifications on observers added after last was notified. + /** Updates last to become the last notified observer proxy (in the global list) + or leaves it to be nullptr. The proxy has its refcount incremented. **/ + inline void notify_entry_observers( observer_proxy*& last, bool worker ); + + //! Call exit notifications on last and observers added before it. + inline void notify_exit_observers( observer_proxy*& last, bool worker ); +}; // class observer_list + +//! Wrapper for an observer object +/** To maintain shared lists of observers the scheduler first wraps each observer + object into a proxy so that a list item remained valid even after the corresponding + proxy object is destroyed by the user code. **/ +class observer_proxy { + friend class d1::task_scheduler_observer; + friend class observer_list; + friend void observe(d1::task_scheduler_observer&, bool); + //! Reference count used for garbage collection. + /** 1 for reference from my task_scheduler_observer. + 1 for each task dispatcher's last observer pointer. + No accounting for neighbors in the shared list. */ + std::atomic my_ref_count; + //! Reference to the list this observer belongs to. + observer_list* my_list; + //! Pointer to next observer in the list specified by my_head. + /** nullptr for the last item in the list. **/ + observer_proxy* my_next; + //! 
Pointer to the previous observer in the list specified by my_head. + /** For the head of the list points to the last item. **/ + observer_proxy* my_prev; + //! Associated observer + d1::task_scheduler_observer* my_observer; + + //! Constructs proxy for the given observer and adds it to the specified list. + observer_proxy( d1::task_scheduler_observer& ); + + ~observer_proxy(); +}; // class observer_proxy + +void observer_list::remove_ref_fast( observer_proxy*& p ) { + if( p->my_observer ) { + // Can decrement refcount quickly, as it cannot drop to zero while under the lock. + std::uintptr_t r = --p->my_ref_count; + __TBB_ASSERT_EX( r, nullptr); + p = nullptr; + } else { + // Use slow form of refcount decrementing, after the lock is released. + } +} + +void observer_list::notify_entry_observers(observer_proxy*& last, bool worker) { + if (last == my_tail.load(std::memory_order_relaxed)) + return; + do_notify_entry_observers(last, worker); +} + +void observer_list::notify_exit_observers( observer_proxy*& last, bool worker ) { + if (last == nullptr) { + return; + } + __TBB_ASSERT(!is_poisoned(last), nullptr); + do_notify_exit_observers( last, worker ); + __TBB_ASSERT(last != nullptr, nullptr); + poison_pointer(last); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_observer_proxy_H */ diff --git a/third_party/tbb/parallel_for.h b/third_party/tbb/parallel_for.h new file mode 100644 index 000000000..de7c4166c --- /dev/null +++ b/third_party/tbb/parallel_for.h @@ -0,0 +1,470 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_for_H +#define __TBB_parallel_for_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/profiling.h" + +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/new" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +template +concept parallel_for_body = std::copy_constructible && std::invocable&, Range&>; + +template +concept parallel_for_index = std::constructible_from && + std::copyable && + requires( const std::remove_reference_t& lhs, const std::remove_reference_t& rhs ) { + { lhs < rhs } -> adaptive_same_as; + { lhs - rhs } -> std::convertible_to; + { lhs + (rhs - lhs) } -> std::convertible_to; + }; + +template +concept parallel_for_function = std::invocable&, Index>; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { + +//! 
Task type used in parallel_for +/** @ingroup algorithms */ +template +struct start_for : public task { + Range my_range; + const Body my_body; + node* my_parent; + + typename Partitioner::task_partition_type my_partition; + small_object_allocator my_allocator; + + task* execute(execution_data&) override; + task* cancel(execution_data&) override; + void finalize(const execution_data&); + + //! Constructor for root task. + start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : + my_range(range), + my_body(body), + my_parent(nullptr), + my_partition(partitioner), + my_allocator(alloc) {} + //! Splitting constructor used to generate children. + /** parent_ becomes left child. Newly constructed object is right child. */ + start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : + my_range(parent_.my_range, get_range_split_object(split_obj)), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split_obj), + my_allocator(alloc) {} + //! Construct right child from the given range as response to the demand. + /** parent_ remains left child. Newly constructed object is right child. */ + start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : + my_range(r), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split()), + my_allocator(alloc) + { + my_partition.align_depth( d ); + } + static void run(const Range& range, const Body& body, Partitioner& partitioner) { + task_group_context context(PARALLEL_FOR); + run(range, body, partitioner, context); + } + + static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) { + if ( !range.empty() ) { + small_object_allocator alloc{}; + start_for& for_task = *alloc.new_object(range, body, partitioner, alloc); + + // defer creation of the wait node until task allocation succeeds + wait_node wn; + for_task.my_parent = &wn; + execute_and_wait(for_task, context, wn.m_wait, context); + } + } + //! Run body for range, serves as callback for partitioner + void run_body( Range &r ) { + tbb::detail::invoke(my_body, r); + } + + //! spawn right task, serves as callback for partitioner + void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { + offer_work_impl(ed, *this, split_obj); + } + + //! spawn right task, serves as callback for partitioner + void offer_work(const Range& r, depth_t d, execution_data& ed) { + offer_work_impl(ed, *this, r, d); + } + +private: + template + void offer_work_impl(execution_data& ed, Args&&... constructor_args) { + // New right child + small_object_allocator alloc{}; + start_for& right_child = *alloc.new_object(ed, std::forward(constructor_args)..., alloc); + + // New root node as a continuation and ref count. Left and right child attach to the new parent. + right_child.my_parent = my_parent = alloc.new_object(ed, my_parent, 2, alloc); + // Spawn the right sibling + right_child.spawn_self(ed); + } + + void spawn_self(execution_data& ed) { + my_partition.spawn_task(*this, *context(ed)); + } +}; + +//! 
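+// Summary of the machinery above: run() allocates the root start_for with a
+// small_object_allocator and blocks on a wait_node; the partitioner calls back
+// into offer_work(), which allocates a right child plus a new parent node with
+// a reference count of 2; finalize() below then folds that tree back up as the
+// children complete and returns the memory to the small object pool.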
fold the tree and deallocate the task +template +void start_for::finalize(const execution_data& ed) { + // Get the current parent and allocator an object destruction + node* parent = my_parent; + auto allocator = my_allocator; + // Task execution finished - destroy it + this->~start_for(); + // Unwind the tree decrementing the parent`s reference count + + fold_tree(parent, ed); + allocator.deallocate(this, ed); + +} + +//! execute task for parallel_for +template +task* start_for::execute(execution_data& ed) { + if (!is_same_affinity(ed)) { + my_partition.note_affinity(execution_slot(ed)); + } + my_partition.check_being_stolen(*this, ed); + my_partition.execute(*this, my_range, ed); + finalize(ed); + return nullptr; +} + +//! cancel task for parallel_for +template +task* start_for::cancel(execution_data& ed) { + finalize(ed); + return nullptr; +} + +//! Calls the function with values from range [begin, end) with a step provided +template +class parallel_for_body_wrapper : detail::no_assign { + const Function &my_func; + const Index my_begin; + const Index my_step; +public: + parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step ) + : my_func(_func), my_begin(_begin), my_step(_step) {} + + void operator()( const blocked_range& r ) const { + // A set of local variables to help the compiler with vectorization of the following loop. + Index b = r.begin(); + Index e = r.end(); + Index ms = my_step; + Index k = my_begin + b*ms; + +#if __INTEL_COMPILER +#pragma ivdep +#if __TBB_ASSERT_ON_VECTORIZATION_FAILURE +#pragma vector always assert +#endif +#endif + for ( Index i = b; i < e; ++i, k += ms ) { + tbb::detail::invoke(my_func, k); + } + } +}; + +// Requirements on Range concept are documented in blocked_range.h + +/** \page parallel_for_body_req Requirements on parallel_for body + Class \c Body implementing the concept of parallel_for body must define: + - \code Body::Body( const Body& ); \endcode Copy constructor + - \code Body::~Body(); \endcode Destructor + - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r. +**/ + +/** \name parallel_for + See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/ +//@{ + +//! Parallel iteration over range with default partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body ) { + start_for::run(range,body,__TBB_DEFAULT_PARTITIONER()); +} + +//! Parallel iteration over range with simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with auto_partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with static_partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with affinity_partitioner. 
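+// Illustrative usage, not part of the upstream sources (identifiers are
+// placeholders): the range form is normally called with a blocked_range and a
+// lambda, optionally followed by one of the partitioners handled above.
+//
+//     #include "third_party/tbb/parallel_for.h"
+//     #include "third_party/tbb/blocked_range.h"
+//
+//     void scale(float* a, std::size_t n, float k) {
+//         tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n),
+//             [=](const tbb::blocked_range<std::size_t>& r) {
+//                 for (std::size_t i = r.begin(); i != r.end(); ++i)
+//                     a[i] *= k;
+//             },
+//             tbb::static_partitioner{});
+//     }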
+/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) { + start_for::run(range,body,partitioner); +} + +//! Parallel iteration over range with default partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, task_group_context& context ) { + start_for::run(range, body, __TBB_DEFAULT_PARTITIONER(), context); +} + +//! Parallel iteration over range with simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) { + start_for::run(range, body, partitioner, context); +} + +//! Parallel iteration over range with auto_partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) { + start_for::run(range, body, partitioner, context); +} + +//! Parallel iteration over range with static_partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) { + start_for::run(range, body, partitioner, context); +} + +//! Parallel iteration over range with affinity_partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_for_body) +void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) { + start_for::run(range,body,partitioner, context); +} + +//! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner +template +void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) { + if (step <= 0 ) + throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument + else if (first < last) { + // Above "else" avoids "potential divide by zero" warning on some platforms + Index end = Index(last - first - 1ul) / step + Index(1); + blocked_range range(static_cast(0), end); + parallel_for_body_wrapper body(f, first, step); + parallel_for(range, body, partitioner); + } +} + +//! Parallel iteration over a range of integers with a step provided and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f) { + parallel_for_impl(first, last, step, f, auto_partitioner()); +} +//! Parallel iteration over a range of integers with a step provided and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} +//! 
Parallel iteration over a range of integers with a step provided and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} +//! Parallel iteration over a range of integers with a step provided and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} +//! Parallel iteration over a range of integers with a step provided and affinity partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) { + parallel_for_impl(first, last, step, f, partitioner); +} + +//! Parallel iteration over a range of integers with a default step value and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f) { + parallel_for_impl(first, last, static_cast(1), f, auto_partitioner()); +} +//! Parallel iteration over a range of integers with a default step value and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} +//! Parallel iteration over a range of integers with a default step value and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} +//! Parallel iteration over a range of integers with a default step value and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} +//! Parallel iteration over a range of integers with a default step value and affinity partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) { + parallel_for_impl(first, last, static_cast(1), f, partitioner); +} + +//! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner +template +void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) { + if (step <= 0 ) + throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument + else if (first < last) { + // Above "else" avoids "potential divide by zero" warning on some platforms + Index end = (last - first - Index(1)) / step + Index(1); + blocked_range range(static_cast(0), end); + parallel_for_body_wrapper body(f, first, step); + parallel_for(range, body, partitioner, context); + } +} + +//! 
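+// Illustrative usage, not part of the upstream sources (callables are
+// placeholders): the index form takes a half-open range [first, last) and an
+// optional step.
+//
+//     tbb::parallel_for(0, 1000, [&](int i) { out[i] = compute(i); });   // step 1
+//     tbb::parallel_for(0, 1000, 4, [&](int i) { compute(i); });         // i = 0, 4, 8, ...
+//
+// parallel_for_impl above maps this onto a blocked_range of iteration numbers:
+// for first = 0, last = 10, step = 3 the computed end is (10 - 0 - 1)/3 + 1 = 4,
+// so the body sees i = 0, 3, 6, 9.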
Parallel iteration over a range of integers with explicit step, task group context, and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) { + parallel_for_impl(first, last, step, f, auto_partitioner(), context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} +//! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, step, f, partitioner, context); +} + +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, auto_partitioner(), context); +} +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +//! 
Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +//! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner +template + __TBB_requires(parallel_for_index && parallel_for_function) +void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { + parallel_for_impl(first, last, static_cast(1), f, partitioner, context); +} +// @} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::parallel_for; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_parallel_for_H */ diff --git a/third_party/tbb/parallel_for_each.h b/third_party/tbb/parallel_for_each.h new file mode 100644 index 000000000..a4752c69a --- /dev/null +++ b/third_party/tbb/parallel_for_each.h @@ -0,0 +1,682 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_for_each_H +#define __TBB_parallel_for_each_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/tbb/parallel_for.h" +#include "third_party/tbb/task_group.h" // task_group_context + +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { +template +class feeder; + +} // namespace d1 +inline namespace d0 { + +template +concept parallel_for_each_body = std::invocable&, ItemType&&> || + std::invocable&, ItemType&&, tbb::detail::d1::feeder&>; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d2 { +template class feeder_impl; +} // namespace d2 + +namespace d1 { +//! Class the user supplied algorithm body uses to add new tasks +template +class feeder { + feeder() {} + feeder(const feeder&) = delete; + void operator=( const feeder&) = delete; + + virtual ~feeder () {} + virtual void internal_add_copy(const Item& item) = 0; + virtual void internal_add_move(Item&& item) = 0; + + template friend class d2::feeder_impl; +public: + //! Add a work item to a running parallel_for_each. 
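+    //! Safe to call from inside the body while the algorithm is running: each
+    //! added item is spawned as an additional task and is processed before the
+    //! enclosing parallel_for_each returns.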
+ void add(const Item& item) {internal_add_copy(item);} + void add(Item&& item) {internal_add_move(std::move(item));} +}; + +} // namespace d1 + +namespace d2 { +using namespace tbb::detail::d1; +/** Selects one of the two possible forms of function call member operator. + @ingroup algorithms **/ +template +struct parallel_for_each_operator_selector { +public: + template + static auto call(const Body& body, ItemArg&& item, FeederArg*) + -> decltype(tbb::detail::invoke(body, std::forward(item)), void()) { + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Suppression of Microsoft non-standard extension warnings + #pragma warning (push) + #pragma warning (disable: 4239) + #endif + + tbb::detail::invoke(body, std::forward(item)); + + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #pragma warning (pop) + #endif + } + + template + static auto call(const Body& body, ItemArg&& item, FeederArg* feeder) + -> decltype(tbb::detail::invoke(body, std::forward(item), *feeder), void()) { + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Suppression of Microsoft non-standard extension warnings + #pragma warning (push) + #pragma warning (disable: 4239) + #endif + __TBB_ASSERT(feeder, "Feeder was not created but should be"); + + tbb::detail::invoke(body, std::forward(item), *feeder); + + #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #pragma warning (pop) + #endif + } +}; + +template +struct feeder_item_task: public task { + using feeder_type = feeder_impl; + + template + feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc) : + item(std::forward(input_item)), + my_feeder(feeder), + my_allocator(alloc) + {} + + void finalize(const execution_data& ed) { + my_feeder.my_wait_context.release(); + my_allocator.delete_object(this, ed); + } + + //! Hack for resolve ambiguity between calls to the body with and without moving the stored copy + //! Executing body with moving the copy should have higher priority + using first_priority = int; + using second_priority = double; + + template + static auto call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, first_priority) + -> decltype(parallel_for_each_operator_selector::call(call_body, std::move(call_item), &call_feeder), void()) + { + parallel_for_each_operator_selector::call(call_body, std::move(call_item), &call_feeder); + } + + template + static void call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, second_priority) { + parallel_for_each_operator_selector::call(call_body, call_item, &call_feeder); + } + + task* execute(execution_data& ed) override { + call(my_feeder.my_body, item, my_feeder, first_priority{}); + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + Item item; + feeder_type& my_feeder; + small_object_allocator my_allocator; +}; // class feeder_item_task + +/** Implements new task adding procedure. 
+ @ingroup algorithms **/ +template +class feeder_impl : public feeder { + // Avoiding use of copy constructor in a virtual method if the type does not support it + void internal_add_copy_impl(std::true_type, const Item& item) { + using feeder_task = feeder_item_task; + small_object_allocator alloc; + auto task = alloc.new_object(item, *this, alloc); + + my_wait_context.reserve(); + spawn(*task, my_execution_context); + } + + void internal_add_copy_impl(std::false_type, const Item&) { + __TBB_ASSERT(false, "Overloading for r-value reference doesn't work or it's not movable and not copyable object"); + } + + void internal_add_copy(const Item& item) override { + internal_add_copy_impl(typename std::is_copy_constructible::type(), item); + } + + void internal_add_move(Item&& item) override { + using feeder_task = feeder_item_task; + small_object_allocator alloc{}; + auto task = alloc.new_object(std::move(item), *this, alloc); + + my_wait_context.reserve(); + spawn(*task, my_execution_context); + } +public: + feeder_impl(const Body& body, wait_context& w_context, task_group_context &context) + : my_body(body), + my_wait_context(w_context) + , my_execution_context(context) + {} + + const Body& my_body; + wait_context& my_wait_context; + task_group_context& my_execution_context; +}; // class feeder_impl + +/** Execute computation under one element of the range + @ingroup algorithms **/ +template +struct for_each_iteration_task: public task { + using feeder_type = feeder_impl; + + for_each_iteration_task(Iterator input_item_ptr, const Body& body, feeder_impl* feeder_ptr, wait_context& wait_context) : + item_ptr(input_item_ptr), my_body(body), my_feeder_ptr(feeder_ptr), parent_wait_context(wait_context) + {} + + void finalize() { + parent_wait_context.release(); + } + + task* execute(execution_data&) override { + parallel_for_each_operator_selector::call(my_body, *item_ptr, my_feeder_ptr); + finalize(); + return nullptr; + } + + task* cancel(execution_data&) override { + finalize(); + return nullptr; + } + + Iterator item_ptr; + const Body& my_body; + feeder_impl* my_feeder_ptr; + wait_context& parent_wait_context; +}; // class for_each_iteration_task + +// Helper to get the type of the iterator to the internal sequence of copies +// If the element can be passed to the body as an rvalue - this iterator should be move_iterator +template +struct input_iteration_task_iterator_helper { + // For input iterators we pass const lvalue reference to the body + // It is prohibited to take non-constant lvalue references for input iterators + using type = const Item*; +}; + +template +struct input_iteration_task_iterator_helper::call(std::declval(), + std::declval(), + std::declval*>()))>> +{ + using type = std::move_iterator; +}; + +/** Split one block task to several(max_block_size) iteration tasks for input iterators + @ingroup algorithms **/ +template +struct input_block_handling_task : public task { + static constexpr size_t max_block_size = 4; + + using feeder_type = feeder_impl; + using iteration_task_iterator_type = typename input_iteration_task_iterator_helper::type; + using iteration_task = for_each_iteration_task; + + input_block_handling_task(wait_context& root_wait_context, task_group_context& e_context, + const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) + :my_size(0), my_wait_context(0), my_root_wait_context(root_wait_context), + my_execution_context(e_context), my_allocator(alloc) + { + auto item_it = block_iteration_space.begin(); + for (auto* it = task_pool.begin(); it 
!= task_pool.end(); ++it) { + new (it) iteration_task(iteration_task_iterator_type(item_it++), body, feeder_ptr, my_wait_context); + } + } + + void finalize(const execution_data& ed) { + my_root_wait_context.release(); + my_allocator.delete_object(this, ed); + } + + task* execute(execution_data& ed) override { + __TBB_ASSERT( my_size > 0, "Negative size was passed to task"); + for (std::size_t counter = 1; counter < my_size; ++counter) { + my_wait_context.reserve(); + spawn(*(task_pool.begin() + counter), my_execution_context); + } + my_wait_context.reserve(); + execute_and_wait(*task_pool.begin(), my_execution_context, + my_wait_context, my_execution_context); + + // deallocate current task after children execution + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + ~input_block_handling_task() { + for(std::size_t counter = 0; counter < max_block_size; ++counter) { + (task_pool.begin() + counter)->~iteration_task(); + if (counter < my_size) { + (block_iteration_space.begin() + counter)->~Item(); + } + } + } + + aligned_space block_iteration_space; + aligned_space task_pool; + std::size_t my_size; + wait_context my_wait_context; + wait_context& my_root_wait_context; + task_group_context& my_execution_context; + small_object_allocator my_allocator; +}; // class input_block_handling_task + +/** Split one block task to several(max_block_size) iteration tasks for forward iterators + @ingroup algorithms **/ +template +struct forward_block_handling_task : public task { + static constexpr size_t max_block_size = 4; + + using iteration_task = for_each_iteration_task; + + forward_block_handling_task(Iterator first, std::size_t size, + wait_context& w_context, task_group_context& e_context, + const Body& body, feeder_impl* feeder_ptr, + small_object_allocator& alloc) + : my_size(size), my_wait_context(0), my_root_wait_context(w_context), + my_execution_context(e_context), my_allocator(alloc) + { + auto* task_it = task_pool.begin(); + for (std::size_t i = 0; i < size; i++) { + new (task_it++) iteration_task(first, body, feeder_ptr, my_wait_context); + ++first; + } + } + + void finalize(const execution_data& ed) { + my_root_wait_context.release(); + my_allocator.delete_object(this, ed); + } + + task* execute(execution_data& ed) override { + __TBB_ASSERT( my_size > 0, "Negative size was passed to task"); + for(std::size_t counter = 1; counter < my_size; ++counter) { + my_wait_context.reserve(); + spawn(*(task_pool.begin() + counter), my_execution_context); + } + my_wait_context.reserve(); + execute_and_wait(*task_pool.begin(), my_execution_context, + my_wait_context, my_execution_context); + + // deallocate current task after children execution + finalize(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } + + ~forward_block_handling_task() { + for(std::size_t counter = 0; counter < my_size; ++counter) { + (task_pool.begin() + counter)->~iteration_task(); + } + } + + aligned_space task_pool; + std::size_t my_size; + wait_context my_wait_context; + wait_context& my_root_wait_context; + task_group_context& my_execution_context; + small_object_allocator my_allocator; +}; // class forward_block_handling_task + +/** Body for parallel_for algorithm. + * Allows to redirect operations under random access iterators range to the parallel_for algorithm. 
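+ * (This wrapper implements the random access strategy; input and forward
+ * iterators are handled instead by the block tasks above, which peel off chunks
+ * of at most max_block_size == 4 items at a time.)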
+ @ingroup algorithms **/ +template +class parallel_for_body_wrapper { + Iterator my_first; + const Body& my_body; + feeder_impl* my_feeder_ptr; +public: + parallel_for_body_wrapper(Iterator first, const Body& body, feeder_impl* feeder_ptr) + : my_first(first), my_body(body), my_feeder_ptr(feeder_ptr) {} + + void operator()(tbb::blocked_range range) const { +#if __INTEL_COMPILER +#pragma ivdep +#endif + for (std::size_t count = range.begin(); count != range.end(); count++) { + parallel_for_each_operator_selector::call(my_body, *(my_first + count), + my_feeder_ptr); + } + } +}; // class parallel_for_body_wrapper + + +/** Helper for getting iterators tag including inherited custom tags + @ingroup algorithms */ +template +using tag = typename std::iterator_traits::iterator_category; + +template +using iterator_tag_dispatch = typename + std::conditional< + std::is_base_of>::value, + std::random_access_iterator_tag, + typename std::conditional< + std::is_base_of>::value, + std::forward_iterator_tag, + std::input_iterator_tag + >::type + >::type; + +template +using feeder_is_required = tbb::detail::void_t(), + std::declval::reference>(), + std::declval&>()))>; + +// Creates feeder object only if the body can accept it +template +struct feeder_holder { + feeder_holder( wait_context&, task_group_context&, const Body& ) {} + + feeder_impl* feeder_ptr() { return nullptr; } +}; // class feeder_holder + +template +class feeder_holder> { +public: + feeder_holder( wait_context& w_context, task_group_context& context, const Body& body ) + : my_feeder(body, w_context, context) {} + + feeder_impl* feeder_ptr() { return &my_feeder; } +private: + feeder_impl my_feeder; +}; // class feeder_holder + +template +class for_each_root_task_base : public task { +public: + for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context& w_context, task_group_context& e_context) + : my_first(first), my_last(last), my_wait_context(w_context), my_execution_context(e_context), + my_body(body), my_feeder_holder(my_wait_context, my_execution_context, my_body) + { + my_wait_context.reserve(); + } +private: + task* cancel(execution_data&) override { + this->my_wait_context.release(); + return nullptr; + } +protected: + Iterator my_first; + Iterator my_last; + wait_context& my_wait_context; + task_group_context& my_execution_context; + const Body& my_body; + feeder_holder my_feeder_holder; +}; // class for_each_root_task_base + +/** parallel_for_each algorithm root task - most generic version + * Splits input range to blocks + @ingroup algorithms **/ +template > +class for_each_root_task : public for_each_root_task_base +{ + using base_type = for_each_root_task_base; +public: + using base_type::base_type; +private: + task* execute(execution_data& ed) override { + using block_handling_type = input_block_handling_task; + + if (this->my_first == this->my_last) { + this->my_wait_context.release(); + return nullptr; + } + + this->my_wait_context.reserve(); + small_object_allocator alloc{}; + auto block_handling_task = alloc.new_object(ed, this->my_wait_context, this->my_execution_context, + this->my_body, this->my_feeder_holder.feeder_ptr(), + alloc); + + auto* block_iterator = block_handling_task->block_iteration_space.begin(); + for (; !(this->my_first == this->my_last) && block_handling_task->my_size < block_handling_type::max_block_size; ++this->my_first) { + // Move semantics are automatically used when supported by the iterator + new (block_iterator++) Item(*this->my_first); + 
++block_handling_task->my_size; + } + + // Do not access this after spawn to avoid races + spawn(*this, this->my_execution_context); + return block_handling_task; + } +}; // class for_each_root_task - most generic implementation + +/** parallel_for_each algorithm root task - forward iterator based specialization + * Splits input range to blocks + @ingroup algorithms **/ +template +class for_each_root_task + : public for_each_root_task_base +{ + using base_type = for_each_root_task_base; +public: + using base_type::base_type; +private: + task* execute(execution_data& ed) override { + using block_handling_type = forward_block_handling_task; + if (this->my_first == this->my_last) { + this->my_wait_context.release(); + return nullptr; + } + + std::size_t block_size{0}; + Iterator first_block_element = this->my_first; + for (; !(this->my_first == this->my_last) && block_size < block_handling_type::max_block_size; ++this->my_first) { + ++block_size; + } + + this->my_wait_context.reserve(); + small_object_allocator alloc{}; + auto block_handling_task = alloc.new_object(ed, first_block_element, block_size, + this->my_wait_context, this->my_execution_context, + this->my_body, this->my_feeder_holder.feeder_ptr(), alloc); + + // Do not access this after spawn to avoid races + spawn(*this, this->my_execution_context); + return block_handling_task; + } +}; // class for_each_root_task - forward iterator based specialization + +/** parallel_for_each algorithm root task - random access iterator based specialization + * Splits input range to blocks + @ingroup algorithms **/ +template +class for_each_root_task + : public for_each_root_task_base +{ + using base_type = for_each_root_task_base; +public: + using base_type::base_type; +private: + task* execute(execution_data&) override { + tbb::parallel_for( + tbb::blocked_range(0, std::distance(this->my_first, this->my_last)), + parallel_for_body_wrapper(this->my_first, this->my_body, this->my_feeder_holder.feeder_ptr()) + , this->my_execution_context + ); + + this->my_wait_context.release(); + return nullptr; + } +}; // class for_each_root_task - random access iterator based specialization + +/** Helper for getting item type. If item type can be deduced from feeder - got it from feeder, + if feeder is generic - got item type from range. + @ingroup algorithms */ +template +auto feeder_argument_parser(void (Body::*)(Item, feeder&) const) -> FeederArg; + +template +decltype(feeder_argument_parser(&Body::operator())) get_item_type_impl(int); // for (T, feeder) +template Item get_item_type_impl(...); // stub + +template +using get_item_type = decltype(get_item_type_impl(0)); + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +using feeder_item_type = std::remove_cvref_t>; + +template +concept parallel_for_each_iterator_body = + parallel_for_each_body, feeder_item_type>>; + +template +concept parallel_for_each_range_body = + parallel_for_each_body, feeder_item_type>>; +#endif + +/** Implements parallel iteration over a range. 
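+
+    A hedged usage sketch (not part of the upstream sources) of the public overloads that
+    funnel into this helper; the container v and the bound 100 are hypothetical:
+    \code
+    std::vector<int> v = {1, 2, 3};
+
+    // Plain element-wise form.
+    tbb::parallel_for_each(v.begin(), v.end(), [](int& x) { x *= 2; });
+
+    // Feeder form: the body may add extra work items while iterating.
+    tbb::parallel_for_each(v, [](int x, tbb::feeder<int>& feeder) {
+        if (x < 100) feeder.add(x + 1);
+    });
+    \endcode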
+ @ingroup algorithms */ +template +void run_parallel_for_each( Iterator first, Iterator last, const Body& body, task_group_context& context) +{ + if (!(first == last)) { + using ItemType = get_item_type::value_type>; + wait_context w_context(0); + + for_each_root_task root_task(first, last, body, w_context, context); + + execute_and_wait(root_task, context, w_context, context); + } +} + +/** \page parallel_for_each_body_req Requirements on parallel_for_each body + Class \c Body implementing the concept of parallel_for_each body must define: + - \code + B::operator()( + cv_item_type item, + feeder& feeder + ) const + + OR + + B::operator()( cv_item_type& item ) const + \endcode Process item. + May be invoked concurrently for the same \c this but different \c item. + + - \code item_type( const item_type& ) \endcode + Copy a work item. + - \code ~item_type() \endcode Destroy a work item +**/ + +/** \name parallel_for_each + See also requirements on \ref parallel_for_each_body_req "parallel_for_each Body". **/ +//@{ +//! Parallel iteration over a range, with optional addition of more work. +/** @ingroup algorithms */ +template + __TBB_requires(std::input_iterator && parallel_for_each_iterator_body) +void parallel_for_each(Iterator first, Iterator last, const Body& body) { + task_group_context context(PARALLEL_FOR_EACH); + run_parallel_for_each(first, last, body, context); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(Range& rng, const Body& body) { + parallel_for_each(std::begin(rng), std::end(rng), body); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(const Range& rng, const Body& body) { + parallel_for_each(std::begin(rng), std::end(rng), body); +} + +//! Parallel iteration over a range, with optional addition of more work and user-supplied context +/** @ingroup algorithms */ +template + __TBB_requires(std::input_iterator && parallel_for_each_iterator_body) +void parallel_for_each(Iterator first, Iterator last, const Body& body, task_group_context& context) { + run_parallel_for_each(first, last, body, context); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(Range& rng, const Body& body, task_group_context& context) { + parallel_for_each(std::begin(rng), std::end(rng), body, context); +} + +template + __TBB_requires(container_based_sequence && parallel_for_each_range_body) +void parallel_for_each(const Range& rng, const Body& body, task_group_context& context) { + parallel_for_each(std::begin(rng), std::end(rng), body, context); +} + +} // namespace d2 +} // namespace detail +//! @endcond +//@} + +inline namespace v1 { +using detail::d2::parallel_for_each; +using detail::d1::feeder; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_parallel_for_each_H */ diff --git a/third_party/tbb/parallel_invoke.h b/third_party/tbb/parallel_invoke.h new file mode 100644 index 000000000..899c57cba --- /dev/null +++ b/third_party/tbb/parallel_invoke.h @@ -0,0 +1,228 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef __TBB_parallel_invoke_H
+#define __TBB_parallel_invoke_H
+
+#include "third_party/tbb/detail/_config.h"
+#include "third_party/tbb/detail/_namespace_injection.h"
+#include "third_party/tbb/detail/_exception.h"
+#include "third_party/tbb/detail/_task.h"
+#include "third_party/tbb/detail/_template_helpers.h"
+#include "third_party/tbb/detail/_small_object_pool.h"
+
+#include "third_party/tbb/task_group.h"
+
+#include "third_party/libcxx/tuple"
+#include "third_party/libcxx/atomic"
+#include "third_party/libcxx/utility"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+//! Simple task object, executing user method
+template <typename Function, typename WaitObject>
+struct function_invoker : public task {
+    function_invoker(const Function& function, WaitObject& wait_ctx) :
+        my_function(function),
+        parent_wait_ctx(wait_ctx)
+    {}
+
+    task* execute(execution_data& ed) override {
+        my_function();
+        parent_wait_ctx.release(ed);
+        call_itt_task_notify(destroy, this);
+        return nullptr;
+    }
+
+    task* cancel(execution_data& ed) override {
+        parent_wait_ctx.release(ed);
+        return nullptr;
+    }
+
+    const Function& my_function;
+    WaitObject& parent_wait_ctx;
+}; // struct function_invoker
+
+//! Task object for managing subroots in trinary task trees.
+// Endowed with additional synchronization logic (compatible with wait object interfaces) to support
+// continuation passing execution. This task spawns two function_invoker tasks for the second and third
+// functors and then executes the first functor itself. Only the functor that finishes last destroys
+// and deallocates the subroot task.
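+//
+// An illustrative sketch (not part of the upstream sources) of the user-facing call that
+// exercises this machinery. With more than three callables, groups of three are packed into
+// invoke_subroot_task objects and the remaining (at most three) callables run through plain
+// function_invoker tasks:
+//
+//     int a = 0, b = 0, c = 0;
+//     tbb::parallel_invoke(
+//         [&] { a = 1; },
+//         [&] { b = 2; },
+//         [&] { c = 3; });
+//     // a, b and c have all been assigned by the time parallel_invoke returns.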
+template +struct invoke_subroot_task : public task { + wait_context& root_wait_ctx; + std::atomic ref_count{0}; + bool child_spawned = false; + + const F1& self_invoked_functor; + function_invoker> f2_invoker; + function_invoker> f3_invoker; + + task_group_context& my_execution_context; + small_object_allocator my_allocator; + + invoke_subroot_task(const F1& f1, const F2& f2, const F3& f3, wait_context& wait_ctx, task_group_context& context, + small_object_allocator& alloc) : + root_wait_ctx(wait_ctx), + self_invoked_functor(f1), + f2_invoker(f2, *this), + f3_invoker(f3, *this), + my_execution_context(context), + my_allocator(alloc) + { + root_wait_ctx.reserve(); + } + + void finalize(const execution_data& ed) { + root_wait_ctx.release(); + + my_allocator.delete_object(this, ed); + } + + void release(const execution_data& ed) { + __TBB_ASSERT(ref_count > 0, nullptr); + call_itt_task_notify(releasing, this); + if( --ref_count == 0 ) { + call_itt_task_notify(acquired, this); + finalize(ed); + } + } + + task* execute(execution_data& ed) override { + ref_count.fetch_add(3, std::memory_order_relaxed); + spawn(f3_invoker, my_execution_context); + spawn(f2_invoker, my_execution_context); + self_invoked_functor(); + + release(ed); + return nullptr; + } + + task* cancel(execution_data& ed) override { + if( ref_count > 0 ) { // detect children spawn + release(ed); + } else { + finalize(ed); + } + return nullptr; + } +}; // struct subroot_task + +class invoke_root_task { +public: + invoke_root_task(wait_context& wc) : my_wait_context(wc) {} + void release(const execution_data&) { + my_wait_context.release(); + } +private: + wait_context& my_wait_context; +}; + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1) { + root_wait_ctx.reserve(1); + invoke_root_task root(root_wait_ctx); + function_invoker invoker1(f1, root); + + execute_and_wait(invoker1, context, root_wait_ctx, context); +} + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2) { + root_wait_ctx.reserve(2); + invoke_root_task root(root_wait_ctx); + function_invoker invoker1(f1, root); + function_invoker invoker2(f2, root); + + spawn(invoker1, context); + execute_and_wait(invoker2, context, root_wait_ctx, context); +} + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2, const F3& f3) { + root_wait_ctx.reserve(3); + invoke_root_task root(root_wait_ctx); + function_invoker invoker1(f1, root); + function_invoker invoker2(f2, root); + function_invoker invoker3(f3, root); + + //TODO: implement sub root for two tasks (measure performance) + spawn(invoker1, context); + spawn(invoker2, context); + execute_and_wait(invoker3, context, root_wait_ctx, context); +} + +template +void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, + const F1& f1, const F2& f2, const F3& f3, const Fs&... fs) { + small_object_allocator alloc{}; + auto sub_root = alloc.new_object>(f1, f2, f3, root_wait_ctx, context, alloc); + spawn(*sub_root, context); + + invoke_recursive_separation(root_wait_ctx, context, fs...); +} + +template +void parallel_invoke_impl(task_group_context& context, const Fs&... 
fs) { + static_assert(sizeof...(Fs) >= 2, "Parallel invoke may be called with at least two callable"); + wait_context root_wait_ctx{0}; + + invoke_recursive_separation(root_wait_ctx, context, fs...); +} + +template +void parallel_invoke_impl(const F1& f1, const Fs&... fs) { + static_assert(sizeof...(Fs) >= 1, "Parallel invoke may be called with at least two callable"); + task_group_context context(PARALLEL_INVOKE); + wait_context root_wait_ctx{0}; + + invoke_recursive_separation(root_wait_ctx, context, fs..., f1); +} + +//! Passes last argument of variadic pack as first for handling user provided task_group_context +template +struct invoke_helper; + +template +struct invoke_helper, T, Fs...> : invoke_helper, Fs...> {}; + +template +struct invoke_helper, T> { + void operator()(Fs&&... args, T&& t) { + parallel_invoke_impl(std::forward(t), std::forward(args)...); + } +}; + +//! Parallel execution of several function objects +// We need to pass parameter pack through forwarding reference, +// since this pack may contain task_group_context that must be passed via lvalue non-const reference +template +void parallel_invoke(Fs&&... fs) { + invoke_helper, Fs...>()(std::forward(fs)...); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::parallel_invoke; +} // namespace v1 + +} // namespace tbb +#endif /* __TBB_parallel_invoke_H */ diff --git a/third_party/tbb/parallel_pipeline.cpp b/third_party/tbb/parallel_pipeline.cpp new file mode 100644 index 000000000..dd0b4b651 --- /dev/null +++ b/third_party/tbb/parallel_pipeline.cpp @@ -0,0 +1,472 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/parallel_pipeline.h" +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/tls.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +namespace tbb { +namespace detail { +namespace r1 { + +void handle_perror(int error_code, const char* aux_info); + +using Token = unsigned long; + +//! A processing pipeline that applies filters to items. +/** @ingroup algorithms */ +class pipeline { + friend void parallel_pipeline(d1::task_group_context&, std::size_t, const d1::filter_node&); +public: + + //! Construct empty pipeline. + pipeline(d1::task_group_context& cxt, std::size_t max_token) : + my_context(cxt), + first_filter(nullptr), + last_filter(nullptr), + input_tokens(Token(max_token)), + end_of_input(false), + wait_ctx(0) { + __TBB_ASSERT( max_token>0, "pipeline::run must have at least one token" ); + } + + ~pipeline(); + + //! Add filter to end of pipeline. + void add_filter( d1::base_filter& ); + + //! 
Traverse tree of fitler-node in-order and add filter for each leaf + void fill_pipeline(const d1::filter_node& root) { + if( root.left && root.right ) { + fill_pipeline(*root.left); + fill_pipeline(*root.right); + } + else { + __TBB_ASSERT(!root.left && !root.right, "tree should be full"); + add_filter(*root.create_filter()); + } + } + +private: + friend class stage_task; + friend class base_filter; + friend void set_end_of_input(d1::base_filter& bf); + + task_group_context& my_context; + + //! Pointer to first filter in the pipeline. + d1::base_filter* first_filter; + + //! Pointer to last filter in the pipeline. + d1::base_filter* last_filter; + + //! Number of idle tokens waiting for input stage. + std::atomic input_tokens; + + //! False until flow_control::stop() is called. + std::atomic end_of_input; + + d1::wait_context wait_ctx; +}; + +//! This structure is used to store task information in an input buffer +struct task_info { + void* my_object = nullptr; + //! Invalid unless a task went through an ordered stage. + Token my_token = 0; + //! False until my_token is set. + bool my_token_ready = false; + //! True if my_object is valid. + bool is_valid = false; + //! Set to initial state (no object, no token) + void reset() { + my_object = nullptr; + my_token = 0; + my_token_ready = false; + is_valid = false; + } +}; + +//! A buffer of input items for a filter. +/** Each item is a task_info, inserted into a position in the buffer corresponding to a Token. */ +class input_buffer { + friend class base_filter; + friend class stage_task; + friend class pipeline; + friend void set_end_of_input(d1::base_filter& bf); + + using size_type = Token; + + //! Array of deferred tasks that cannot yet start executing. + task_info* array; + + //! Size of array + /** Always 0 or a power of 2 */ + size_type array_size; + + //! Lowest token that can start executing. + /** All prior Token have already been seen. */ + Token low_token; + + //! Serializes updates. + spin_mutex array_mutex; + + //! Resize "array". + /** Caller is responsible to acquiring a lock on "array_mutex". */ + void grow( size_type minimum_size ); + + //! Initial size for "array" + /** Must be a power of 2 */ + static const size_type initial_buffer_size = 4; + + //! Used for out of order buffer, and for assigning my_token if is_ordered and my_token not already assigned + Token high_token; + + //! True for ordered filter, false otherwise. + const bool is_ordered; + + //! for parallel filters that accepts nullptrs, thread-local flag for reaching end_of_input + using end_of_input_tls_t = basic_tls; + end_of_input_tls_t end_of_input_tls; + bool end_of_input_tls_allocated; // no way to test pthread creation of TLS + +public: + input_buffer(const input_buffer&) = delete; + input_buffer& operator=(const input_buffer&) = delete; + + //! Construct empty buffer. + input_buffer( bool ordered) : + array(nullptr), + array_size(0), + low_token(0), + high_token(0), + is_ordered(ordered), + end_of_input_tls(), + end_of_input_tls_allocated(false) { + grow(initial_buffer_size); + __TBB_ASSERT( array, nullptr ); + } + + //! Destroy the buffer. + ~input_buffer() { + __TBB_ASSERT( array, nullptr ); + cache_aligned_allocator().deallocate(array,array_size); + poison_pointer( array ); + if( end_of_input_tls_allocated ) { + destroy_my_tls(); + } + } + + //! Define order when the first filter is serial_in_order. + Token get_ordered_token(){ + return high_token++; + } + + //! Put a token into the buffer. 
+ /** If task information was placed into buffer, returns true; + otherwise returns false, informing the caller to create and spawn a task. + */ + bool try_put_token( task_info& info ) { + info.is_valid = true; + spin_mutex::scoped_lock lock( array_mutex ); + Token token; + if( is_ordered ) { + if( !info.my_token_ready ) { + info.my_token = high_token++; + info.my_token_ready = true; + } + token = info.my_token; + } else + token = high_token++; + __TBB_ASSERT( (long)(token-low_token)>=0, nullptr ); + if( token!=low_token ) { + // Trying to put token that is beyond low_token. + // Need to wait until low_token catches up before dispatching. + if( token-low_token>=array_size ) + grow( token-low_token+1 ); + ITT_NOTIFY( sync_releasing, this ); + array[token&(array_size-1)] = info; + return true; + } + return false; + } + + //! Note that processing of a token is finished. + /** Fires up processing of the next token, if processing was deferred. */ + // Uses template to avoid explicit dependency on stage_task. + template + void try_to_spawn_task_for_next_token(StageTask& spawner, d1::execution_data& ed) { + task_info wakee; + { + spin_mutex::scoped_lock lock( array_mutex ); + // Wake the next task + task_info& item = array[++low_token & (array_size-1)]; + ITT_NOTIFY( sync_acquired, this ); + wakee = item; + item.is_valid = false; + } + if( wakee.is_valid ) + spawner.spawn_stage_task(wakee, ed); + } + + // end_of_input signal for parallel_pipeline, parallel input filters with 0 tokens allowed. + void create_my_tls() { + int status = end_of_input_tls.create(); + if(status) + handle_perror(status, "TLS not allocated for filter"); + end_of_input_tls_allocated = true; + } + void destroy_my_tls() { + int status = end_of_input_tls.destroy(); + if(status) + handle_perror(status, "Failed to destroy filter TLS"); + } + bool my_tls_end_of_input() { + return end_of_input_tls.get() != nullptr; + } + void set_my_tls_end_of_input() { + end_of_input_tls.set(this); + } +}; + +void input_buffer::grow( size_type minimum_size ) { + size_type old_size = array_size; + size_type new_size = old_size ? 2*old_size : initial_buffer_size; + while( new_size().allocate(new_size); + task_info* old_array = array; + for( size_type i=0; i().deallocate(old_array,old_size); +} + +class stage_task : public d1::task, public task_info { +private: + friend class pipeline; + pipeline& my_pipeline; + d1::base_filter* my_filter; + d1::small_object_allocator m_allocator; + //! True if this task has not yet read the input. + bool my_at_start; + + //! True if this can be executed again. + bool execute_filter(d1::execution_data& ed); + + //! Spawn task if token is available. + void try_spawn_stage_task(d1::execution_data& ed) { + ITT_NOTIFY( sync_releasing, &my_pipeline.input_tokens ); + if( (my_pipeline.input_tokens.fetch_sub(1, std::memory_order_release)) > 1 ) { + d1::small_object_allocator alloc{}; + r1::spawn( *alloc.new_object(ed, my_pipeline, alloc ), my_pipeline.my_context ); + } + } + +public: + + //! Construct stage_task for first stage in a pipeline. + /** Such a stage has not read any input yet. */ + stage_task(pipeline& pipeline, d1::small_object_allocator& alloc ) : + my_pipeline(pipeline), + my_filter(pipeline.first_filter), + m_allocator(alloc), + my_at_start(true) + { + task_info::reset(); + my_pipeline.wait_ctx.reserve(); + } + //! Construct stage_task for a subsequent stage in a pipeline. 
+ stage_task(pipeline& pipeline, d1::base_filter* filter, const task_info& info, d1::small_object_allocator& alloc) : + task_info(info), + my_pipeline(pipeline), + my_filter(filter), + m_allocator(alloc), + my_at_start(false) + { + my_pipeline.wait_ctx.reserve(); + } + //! Roughly equivalent to the constructor of input stage task + void reset() { + task_info::reset(); + my_filter = my_pipeline.first_filter; + my_at_start = true; + } + void finalize(d1::execution_data& ed) { + m_allocator.delete_object(this, ed); + } + //! The virtual task execution method + task* execute(d1::execution_data& ed) override { + if(!execute_filter(ed)) { + finalize(ed); + return nullptr; + } + return this; + } + task* cancel(d1::execution_data& ed) override { + finalize(ed); + return nullptr; + } + + ~stage_task() override { + if ( my_filter && my_object ) { + my_filter->finalize(my_object); + my_object = nullptr; + } + my_pipeline.wait_ctx.release(); + } + //! Creates and spawns stage_task from task_info + void spawn_stage_task(const task_info& info, d1::execution_data& ed) { + d1::small_object_allocator alloc{}; + stage_task* clone = alloc.new_object(ed, my_pipeline, my_filter, info, alloc); + r1::spawn(*clone, my_pipeline.my_context); + } +}; + +bool stage_task::execute_filter(d1::execution_data& ed) { + __TBB_ASSERT( !my_at_start || !my_object, "invalid state of task" ); + if( my_at_start ) { + if( my_filter->is_serial() ) { + my_object = (*my_filter)(my_object); + if( my_object || ( my_filter->object_may_be_null() && !my_pipeline.end_of_input.load(std::memory_order_relaxed)) ) { + if( my_filter->is_ordered() ) { + my_token = my_filter->my_input_buffer->get_ordered_token(); + my_token_ready = true; + } + if( !my_filter->next_filter_in_pipeline ) { // we're only filter in pipeline + reset(); + return true; + } else { + try_spawn_stage_task(ed); + } + } else { + my_pipeline.end_of_input.store(true, std::memory_order_relaxed); + return false; + } + } else /*not is_serial*/ { + if ( my_pipeline.end_of_input.load(std::memory_order_relaxed) ) { + return false; + } + + try_spawn_stage_task(ed); + + my_object = (*my_filter)(my_object); + if( !my_object && (!my_filter->object_may_be_null() || my_filter->my_input_buffer->my_tls_end_of_input()) ){ + my_pipeline.end_of_input.store(true, std::memory_order_relaxed); + return false; + } + } + my_at_start = false; + } else { + my_object = (*my_filter)(my_object); + if( my_filter->is_serial() ) + my_filter->my_input_buffer->try_to_spawn_task_for_next_token(*this, ed); + } + my_filter = my_filter->next_filter_in_pipeline; + if( my_filter ) { + // There is another filter to execute. + if( my_filter->is_serial() ) { + // The next filter must execute tokens when they are available (in order for serial_in_order) + if( my_filter->my_input_buffer->try_put_token(*this) ){ + my_filter = nullptr; // To prevent deleting my_object twice if exception occurs + return false; + } + } + } else { + // Reached end of the pipe. + std::size_t ntokens_avail = my_pipeline.input_tokens.fetch_add(1, std::memory_order_acquire); + + if( ntokens_avail>0 // Only recycle if there is one available token + || my_pipeline.end_of_input.load(std::memory_order_relaxed) ) { + return false; // No need to recycle for new input + } + ITT_NOTIFY( sync_acquired, &my_pipeline.input_tokens ); + // Recycle as an input stage task. 
+ reset(); + } + return true; +} + +pipeline::~pipeline() { + while( first_filter ) { + d1::base_filter* f = first_filter; + if( input_buffer* b = f->my_input_buffer ) { + b->~input_buffer(); + deallocate_memory(b); + } + first_filter = f->next_filter_in_pipeline; + f->~base_filter(); + deallocate_memory(f); + } +} + +void pipeline::add_filter( d1::base_filter& new_fitler ) { + __TBB_ASSERT( new_fitler.next_filter_in_pipeline==d1::base_filter::not_in_pipeline(), "filter already part of pipeline?" ); + new_fitler.my_pipeline = this; + if ( first_filter == nullptr ) + first_filter = &new_fitler; + else + last_filter->next_filter_in_pipeline = &new_fitler; + new_fitler.next_filter_in_pipeline = nullptr; + last_filter = &new_fitler; + if( new_fitler.is_serial() ) { + new_fitler.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( new_fitler.is_ordered() ); + } else { + if( first_filter == &new_fitler && new_fitler.object_may_be_null() ) { + //TODO: buffer only needed to hold TLS; could improve + new_fitler.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( /*is_ordered*/false ); + new_fitler.my_input_buffer->create_my_tls(); + } + } +} + +void __TBB_EXPORTED_FUNC parallel_pipeline(d1::task_group_context& cxt, std::size_t max_token, const d1::filter_node& fn) { + pipeline pipe(cxt, max_token); + + pipe.fill_pipeline(fn); + + d1::small_object_allocator alloc{}; + stage_task& st = *alloc.new_object(pipe, alloc); + + // Start execution of tasks + r1::execute_and_wait(st, cxt, pipe.wait_ctx, cxt); +} + +void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter& bf) { + __TBB_ASSERT(bf.my_input_buffer, nullptr); + __TBB_ASSERT(bf.object_may_be_null(), nullptr); + if(bf.is_serial() ) { + bf.my_pipeline->end_of_input.store(true, std::memory_order_relaxed); + } else { + __TBB_ASSERT(bf.my_input_buffer->end_of_input_tls_allocated, nullptr); + bf.my_input_buffer->set_my_tls_end_of_input(); + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/parallel_pipeline.h b/third_party/tbb/parallel_pipeline.h new file mode 100644 index 000000000..3cc24afe4 --- /dev/null +++ b/third_party/tbb/parallel_pipeline.h @@ -0,0 +1,154 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_pipeline_H +#define __TBB_parallel_pipeline_H + +#include "third_party/tbb/detail/_pipeline_filters.h" +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/task_group.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void __TBB_EXPORTED_FUNC parallel_pipeline(task_group_context&, std::size_t, const d1::filter_node&); +} + +namespace d1 { + +enum class filter_mode : unsigned int +{ + //! 
processes multiple items in parallel and in no particular order + parallel = base_filter::filter_is_out_of_order, + //! processes items one at a time; all such filters process items in the same order + serial_in_order = base_filter::filter_is_serial, + //! processes items one at a time and in no particular order + serial_out_of_order = base_filter::filter_is_serial | base_filter::filter_is_out_of_order +}; +//! Class representing a chain of type-safe pipeline filters +/** @ingroup algorithms */ +template +class filter { + filter_node_ptr my_root; + filter( filter_node_ptr root ) : my_root(root) {} + friend void parallel_pipeline( size_t, const filter&, task_group_context& ); + template + friend filter make_filter( filter_mode, const Body& ); + template + friend filter operator&( const filter&, const filter& ); +public: + filter() = default; + filter( const filter& rhs ) : my_root(rhs.my_root) {} + filter( filter&& rhs ) : my_root(std::move(rhs.my_root)) {} + + void operator=(const filter& rhs) { + my_root = rhs.my_root; + } + void operator=( filter&& rhs ) { + my_root = std::move(rhs.my_root); + } + + template + filter( filter_mode mode, const Body& body ) : + my_root( new(r1::allocate_memory(sizeof(filter_node_leaf))) + filter_node_leaf(static_cast(mode), body) ) { + } + + filter& operator&=( const filter& right ) { + *this = *this & right; + return *this; + } + + void clear() { + // Like operator= with filter() on right side. + my_root = nullptr; + } +}; + +//! Create a filter to participate in parallel_pipeline +/** @ingroup algorithms */ +template +filter make_filter( filter_mode mode, const Body& body ) { + return filter_node_ptr( new(r1::allocate_memory(sizeof(filter_node_leaf))) + filter_node_leaf(static_cast(mode), body) ); +} + +//! Create a filter to participate in parallel_pipeline +/** @ingroup algorithms */ +template +filter, filter_output> make_filter( filter_mode mode, const Body& body ) { + return make_filter, filter_output>(mode, body); +} + +//! Composition of filters left and right. +/** @ingroup algorithms */ +template +filter operator&( const filter& left, const filter& right ) { + __TBB_ASSERT(left.my_root,"cannot use default-constructed filter as left argument of '&'"); + __TBB_ASSERT(right.my_root,"cannot use default-constructed filter as right argument of '&'"); + return filter_node_ptr( new (r1::allocate_memory(sizeof(filter_node))) filter_node(left.my_root,right.my_root) ); +} + +#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT +template +filter(filter_mode, Body) +->filter, filter_output>; +#endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT + +//! Parallel pipeline over chain of filters with user-supplied context. +/** @ingroup algorithms **/ +inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter& filter_chain, task_group_context& context) { + r1::parallel_pipeline(context, max_number_of_live_tokens, *filter_chain.my_root); +} + +//! Parallel pipeline over chain of filters. +/** @ingroup algorithms **/ +inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter& filter_chain) { + task_group_context context; + parallel_pipeline(max_number_of_live_tokens, filter_chain, context); +} + +//! Parallel pipeline over sequence of filters. +/** @ingroup algorithms **/ +template +void parallel_pipeline(size_t max_number_of_live_tokens, + const F1& filter1, + const F2& filter2, + FiltersContext&&... 
filters) { + parallel_pipeline(max_number_of_live_tokens, filter1 & filter2, std::forward(filters)...); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 +{ +using detail::d1::parallel_pipeline; +using detail::d1::filter; +using detail::d1::make_filter; +using detail::d1::filter_mode; +using detail::d1::flow_control; +} +} // tbb + +#endif /* __TBB_parallel_pipeline_H */ diff --git a/third_party/tbb/parallel_reduce.h b/third_party/tbb/parallel_reduce.h new file mode 100644 index 000000000..1fc549ce1 --- /dev/null +++ b/third_party/tbb/parallel_reduce.h @@ -0,0 +1,772 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_reduce_H +#define __TBB_parallel_reduce_H + +#include "third_party/libcxx/new" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_range_common.h" + +#include "third_party/tbb/task_group.h" // task_group_context +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +template +concept parallel_reduce_body = splittable && + requires( Body& body, const Range& range, Body& rhs ) { + body(range); + body.join(rhs); + }; + +template +concept parallel_reduce_function = std::invocable&, + const Range&, const Value&> && + std::convertible_to&, + const Range&, const Value&>, + Value>; + +template +concept parallel_reduce_combine = std::invocable&, + const Value&, const Value&> && + std::convertible_to&, + const Value&, const Value&>, + Value>; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { + +//! Tree node type for parallel_reduce. +/** @ingroup algorithms */ +//TODO: consider folding tree via bypass execution(instead of manual folding) +// for better cancellation and critical tasks handling (performance measurements required). +template +struct reduction_tree_node : public tree_node { + tbb::detail::aligned_space zombie_space; + Body& left_body; + bool has_right_zombie{false}; + + reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) : + tree_node{parent, ref_count, alloc}, + left_body(input_left_body) /* gcc4.8 bug - braced-initialization doesn't work for class members of reference type */ + {} + + void join(task_group_context* context) { + if (has_right_zombie && !context->is_group_execution_cancelled()) + left_body.join(*zombie_space.begin()); + } + + ~reduction_tree_node() { + if( has_right_zombie ) zombie_space.begin()->~Body(); + } +}; + +//! Task type used to split the work of parallel_reduce. 
+/** @ingroup algorithms */ +template +struct start_reduce : public task { + Range my_range; + Body* my_body; + node* my_parent; + + typename Partitioner::task_partition_type my_partition; + small_object_allocator my_allocator; + bool is_right_child; + + task* execute(execution_data&) override; + task* cancel(execution_data&) override; + void finalize(const execution_data&); + + using tree_node_type = reduction_tree_node; + + //! Constructor reduce root task. + start_reduce( const Range& range, Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : + my_range(range), + my_body(&body), + my_parent(nullptr), + my_partition(partitioner), + my_allocator(alloc), + is_right_child(false) {} + //! Splitting constructor used to generate children. + /** parent_ becomes left child. Newly constructed object is right child. */ + start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : + my_range(parent_.my_range, get_range_split_object(split_obj)), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split_obj), + my_allocator(alloc), + is_right_child(true) + { + parent_.is_right_child = false; + } + //! Construct right child from the given range as response to the demand. + /** parent_ remains left child. Newly constructed object is right child. */ + start_reduce( start_reduce& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : + my_range(r), + my_body(parent_.my_body), + my_parent(nullptr), + my_partition(parent_.my_partition, split()), + my_allocator(alloc), + is_right_child(true) + { + my_partition.align_depth( d ); + parent_.is_right_child = false; + } + static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) { + if ( !range.empty() ) { + wait_node wn; + small_object_allocator alloc{}; + auto reduce_task = alloc.new_object(range, body, partitioner, alloc); + reduce_task->my_parent = &wn; + execute_and_wait(*reduce_task, context, wn.m_wait, context); + } + } + static void run(const Range& range, Body& body, Partitioner& partitioner) { + // Bound context prevents exceptions from body to affect nesting or sibling algorithms, + // and allows users to handle exceptions safely by wrapping parallel_reduce in the try-block. + task_group_context context(PARALLEL_REDUCE); + run(range, body, partitioner, context); + } + //! Run body for range, serves as callback for partitioner + void run_body( Range &r ) { + tbb::detail::invoke(*my_body, r); + } + + //! spawn right task, serves as callback for partitioner + void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { + offer_work_impl(ed, *this, split_obj); + } + //! spawn right task, serves as callback for partitioner + void offer_work(const Range& r, depth_t d, execution_data& ed) { + offer_work_impl(ed, *this, r, d); + } + +private: + template + void offer_work_impl(execution_data& ed, Args&&... args) { + small_object_allocator alloc{}; + // New right child + auto right_child = alloc.new_object(ed, std::forward(args)..., alloc); + + // New root node as a continuation and ref count. Left and right child attach to the new parent. + right_child->my_parent = my_parent = alloc.new_object(ed, my_parent, 2, *my_body, alloc); + + // Spawn the right sibling + right_child->spawn_self(ed); + } + + void spawn_self(execution_data& ed) { + my_partition.spawn_task(*this, *context(ed)); + } +}; + +//! 
fold the tree and deallocate the task +template +void start_reduce::finalize(const execution_data& ed) { + // Get the current parent and wait object before an object destruction + node* parent = my_parent; + auto allocator = my_allocator; + // Task execution finished - destroy it + this->~start_reduce(); + // Unwind the tree decrementing the parent`s reference count + fold_tree(parent, ed); + allocator.deallocate(this, ed); +} + +//! Execute parallel_reduce task +template +task* start_reduce::execute(execution_data& ed) { + if (!is_same_affinity(ed)) { + my_partition.note_affinity(execution_slot(ed)); + } + my_partition.check_being_stolen(*this, ed); + + // The acquire barrier synchronizes the data pointed with my_body if the left + // task has already finished. + __TBB_ASSERT(my_parent, nullptr); + if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) { + tree_node_type* parent_ptr = static_cast(my_parent); + my_body = static_cast(new( parent_ptr->zombie_space.begin() ) Body(*my_body, split())); + parent_ptr->has_right_zombie = true; + } + __TBB_ASSERT(my_body != nullptr, "Incorrect body value"); + + my_partition.execute(*this, my_range, ed); + + finalize(ed); + return nullptr; +} + +//! Cancel parallel_reduce task +template +task* start_reduce::cancel(execution_data& ed) { + finalize(ed); + return nullptr; +} + +//! Tree node type for parallel_deterministic_reduce. +/** @ingroup algorithms */ +template +struct deterministic_reduction_tree_node : public tree_node { + Body right_body; + Body& left_body; + + deterministic_reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) : + tree_node{parent, ref_count, alloc}, + right_body{input_left_body, detail::split()}, + left_body(input_left_body) + {} + + void join(task_group_context* context) { + if (!context->is_group_execution_cancelled()) + left_body.join(right_body); + } +}; + +//! Task type used to split the work of parallel_deterministic_reduce. +/** @ingroup algorithms */ +template +struct start_deterministic_reduce : public task { + Range my_range; + Body& my_body; + node* my_parent; + + typename Partitioner::task_partition_type my_partition; + small_object_allocator my_allocator; + + task* execute(execution_data&) override; + task* cancel(execution_data&) override; + void finalize(const execution_data&); + + using tree_node_type = deterministic_reduction_tree_node; + + //! Constructor deterministic_reduce root task. + start_deterministic_reduce( const Range& range, Partitioner& partitioner, Body& body, small_object_allocator& alloc ) : + my_range(range), + my_body(body), + my_parent(nullptr), + my_partition(partitioner), + my_allocator(alloc) {} + //! Splitting constructor used to generate children. + /** parent_ becomes left child. Newly constructed object is right child. 
*/ + start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body, + small_object_allocator& alloc ) : + my_range(parent_.my_range, get_range_split_object(split_obj)), + my_body(body), + my_parent(nullptr), + my_partition(parent_.my_partition, split_obj), + my_allocator(alloc) {} + static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) { + if ( !range.empty() ) { + wait_node wn; + small_object_allocator alloc{}; + auto deterministic_reduce_task = + alloc.new_object(range, partitioner, body, alloc); + deterministic_reduce_task->my_parent = &wn; + execute_and_wait(*deterministic_reduce_task, context, wn.m_wait, context); + } + } + static void run(const Range& range, Body& body, Partitioner& partitioner) { + // Bound context prevents exceptions from body to affect nesting or sibling algorithms, + // and allows users to handle exceptions safely by wrapping parallel_deterministic_reduce + // in the try-block. + task_group_context context(PARALLEL_REDUCE); + run(range, body, partitioner, context); + } + //! Run body for range, serves as callback for partitioner + void run_body( Range &r ) { + tbb::detail::invoke(my_body, r); + } + //! Spawn right task, serves as callback for partitioner + void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { + offer_work_impl(ed, *this, split_obj); + } +private: + template + void offer_work_impl(execution_data& ed, Args&&... args) { + small_object_allocator alloc{}; + // New root node as a continuation and ref count. Left and right child attach to the new parent. Split the body. + auto new_tree_node = alloc.new_object(ed, my_parent, 2, my_body, alloc); + + // New right child + auto right_child = alloc.new_object(ed, std::forward(args)..., new_tree_node->right_body, alloc); + + right_child->my_parent = my_parent = new_tree_node; + + // Spawn the right sibling + right_child->spawn_self(ed); + } + + void spawn_self(execution_data& ed) { + my_partition.spawn_task(*this, *context(ed)); + } +}; + +//! Fold the tree and deallocate the task +template +void start_deterministic_reduce::finalize(const execution_data& ed) { + // Get the current parent and wait object before an object destruction + node* parent = my_parent; + + auto allocator = my_allocator; + // Task execution finished - destroy it + this->~start_deterministic_reduce(); + // Unwind the tree decrementing the parent`s reference count + fold_tree(parent, ed); + allocator.deallocate(this, ed); +} + +//! Execute parallel_deterministic_reduce task +template +task* start_deterministic_reduce::execute(execution_data& ed) { + if (!is_same_affinity(ed)) { + my_partition.note_affinity(execution_slot(ed)); + } + my_partition.check_being_stolen(*this, ed); + + my_partition.execute(*this, my_range, ed); + + finalize(ed); + return nullptr; +} + +//! Cancel parallel_deterministic_reduce task +template +task* start_deterministic_reduce::cancel(execution_data& ed) { + finalize(ed); + return nullptr; +} + + +//! Auxiliary class for parallel_reduce; for internal use only. +/** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body" + using given \ref parallel_reduce_lambda_req "anonymous function objects". 
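+
+    An illustrative sketch (not part of the upstream sources) of the functional form that
+    this adaptor enables; v is a hypothetical std::vector<int>:
+    \code
+    int total = tbb::parallel_reduce(
+        tbb::blocked_range<std::size_t>(0, v.size()),
+        0,                                               // identity element
+        [&](const tbb::blocked_range<std::size_t>& r, int running) {
+            for (std::size_t i = r.begin(); i != r.end(); ++i)
+                running += v[i];
+            return running;                              // partial result for this sub-range
+        },
+        [](int x, int y) { return x + y; });             // join two partial results
+    \endcode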
+ **/ +/** @ingroup algorithms */ +template +class lambda_reduce_body { +//TODO: decide if my_real_body, my_reduction, and my_identity_element should be copied or referenced +// (might require some performance measurements) + + const Value& my_identity_element; + const RealBody& my_real_body; + const Reduction& my_reduction; + Value my_value; + lambda_reduce_body& operator= ( const lambda_reduce_body& other ); +public: + lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction ) + : my_identity_element(identity) + , my_real_body(body) + , my_reduction(reduction) + , my_value(identity) + { } + lambda_reduce_body( const lambda_reduce_body& other ) = default; + lambda_reduce_body( lambda_reduce_body& other, tbb::split ) + : my_identity_element(other.my_identity_element) + , my_real_body(other.my_real_body) + , my_reduction(other.my_reduction) + , my_value(other.my_identity_element) + { } + void operator()(Range& range) { + my_value = tbb::detail::invoke(my_real_body, range, const_cast(my_value)); + } + void join( lambda_reduce_body& rhs ) { + my_value = tbb::detail::invoke(my_reduction, const_cast(my_value), + const_cast(rhs.my_value)); + } + Value result() const { + return my_value; + } +}; + + +// Requirements on Range concept are documented in blocked_range.h + +/** \page parallel_reduce_body_req Requirements on parallel_reduce body + Class \c Body implementing the concept of parallel_reduce body must define: + - \code Body::Body( Body&, split ); \endcode Splitting constructor. + Must be able to run concurrently with operator() and method \c join + - \code Body::~Body(); \endcode Destructor + - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r + and accumulating the result + - \code void Body::join( Body& b ); \endcode Join results. + The result in \c b should be merged into the result of \c this +**/ + +/** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions) + TO BE DOCUMENTED +**/ + +/** \name parallel_reduce + See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/ +//@{ + +//! Parallel iteration with reduction and default partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body ) { + start_reduce::run( range, body, __TBB_DEFAULT_PARTITIONER() ); +} + +//! Parallel iteration with reduction and simple_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! Parallel iteration with reduction and auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! Parallel iteration with reduction and static_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! 
Parallel iteration with reduction and affinity_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) { + start_reduce::run( range, body, partitioner ); +} + +//! Parallel iteration with reduction, default partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, task_group_context& context ) { + start_reduce::run( range, body, __TBB_DEFAULT_PARTITIONER(), context ); +} + +//! Parallel iteration with reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} + +//! Parallel iteration with reduction, auto_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} + +//! Parallel iteration with reduction, static_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} + +//! Parallel iteration with reduction, affinity_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) { + start_reduce::run( range, body, partitioner, context ); +} +/** parallel_reduce overloads that work with anonymous function objects + (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/ + +//! Parallel iteration with reduction and default partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const __TBB_DEFAULT_PARTITIONER> + ::run(range, body, __TBB_DEFAULT_PARTITIONER() ); + return body.result(); +} + +//! Parallel iteration with reduction and simple_partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const simple_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const simple_partitioner> + ::run(range, body, partitioner ); + return body.result(); +} + +//! 
Parallel iteration with reduction and auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const auto_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const auto_partitioner> + ::run( range, body, partitioner ); + return body.result(); +} + +//! Parallel iteration with reduction and static_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const static_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const static_partitioner> + ::run( range, body, partitioner ); + return body.result(); +} + +//! Parallel iteration with reduction and affinity_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + affinity_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,affinity_partitioner> + ::run( range, body, partitioner ); + return body.result(); +} + +//! Parallel iteration with reduction, default partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const __TBB_DEFAULT_PARTITIONER> + ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context ); + return body.result(); +} + +//! Parallel iteration with reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const simple_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const simple_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! Parallel iteration with reduction, auto_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const auto_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const auto_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! 
Parallel iteration with reduction, static_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const static_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,const static_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! Parallel iteration with reduction, affinity_partitioner and user-supplied context +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + affinity_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_reduce,affinity_partitioner> + ::run( range, body, partitioner, context ); + return body.result(); +} + +//! Parallel iteration with deterministic reduction and default simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body ) { + start_deterministic_reduce::run(range, body, simple_partitioner()); +} + +//! Parallel iteration with deterministic reduction and simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) { + start_deterministic_reduce::run(range, body, partitioner); +} + +//! Parallel iteration with deterministic reduction and static partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) { + start_deterministic_reduce::run(range, body, partitioner); +} + +//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) { + start_deterministic_reduce::run( range, body, simple_partitioner(), context ); +} + +//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) { + start_deterministic_reduce::run(range, body, partitioner, context); +} + +//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_body) +void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) { + start_deterministic_reduce::run(range, body, partitioner, context); +} + +/** parallel_reduce overloads that work with anonymous function objects + (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/ + +//! 
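The overloads that take a Body reference, including the deterministic variants that follow, expect an imperative body: it is split for concurrent subranges, accumulates into itself, and partial results are merged with join(). A minimal sketch with illustrative names:

#include "third_party/tbb/parallel_reduce.h"
#include "third_party/tbb/blocked_range.h"
#include <climits>
#include <cstddef>

struct MinBody {
    const int* data;
    int value = INT_MAX;
    explicit MinBody(const int* d) : data(d) {}
    MinBody(MinBody& other, tbb::split) : data(other.data) {}  // splitting constructor
    void operator()(const tbb::blocked_range<std::size_t>& r) {
        for (std::size_t i = r.begin(); i != r.end(); ++i)
            if (data[i] < value) value = data[i];              // accumulate into *this
    }
    void join(MinBody& rhs) { if (rhs.value < value) value = rhs.value; }
};

int parallel_min(const int* d, std::size_t n) {
    MinBody body(d);
    tbb::parallel_reduce(tbb::blocked_range<std::size_t>(0, n), body);
    return body.value;   // the same body type also satisfies parallel_deterministic_reduce
}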
Parallel iteration with deterministic reduction and default simple partitioner. +// TODO: consider making static_partitioner the default +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) { + return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner()); +} + +//! Parallel iteration with deterministic reduction and simple partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const simple_partitioner> + ::run(range, body, partitioner); + return body.result(); +} + +//! Parallel iteration with deterministic reduction and static partitioner. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const static_partitioner> + ::run(range, body, partitioner); + return body.result(); +} + +//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + task_group_context& context ) { + return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner(), context); +} + +//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context. +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const simple_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const simple_partitioner> + ::run(range, body, partitioner, context); + return body.result(); +} + +//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context. 
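The deterministic variants above accept only simple_partitioner or static_partitioner, which keeps the split/join tree independent of work stealing; with a deterministic body this makes, for example, floating-point sums reproducible from run to run. A sketch under that assumption (reproducible_sum is an illustrative name):

#include "third_party/tbb/parallel_reduce.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>
#include <vector>

double reproducible_sum(const std::vector<double>& v) {
    return tbb::parallel_deterministic_reduce(
        tbb::blocked_range<std::size_t>(0, v.size(), /*grainsize=*/4096),
        0.0,
        [&](const tbb::blocked_range<std::size_t>& r, double acc) {
            for (std::size_t i = r.begin(); i != r.end(); ++i) acc += v[i];
            return acc;
        },
        [](double a, double b) { return a + b; });   // defaults to simple_partitioner
}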
+/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_reduce_function && + parallel_reduce_combine) +Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, + const static_partitioner& partitioner, task_group_context& context ) { + lambda_reduce_body body(identity, real_body, reduction); + start_deterministic_reduce, const static_partitioner> + ::run(range, body, partitioner, context); + return body.result(); +} +//@} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::parallel_reduce; +using detail::d1::parallel_deterministic_reduce; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb +#endif /* __TBB_parallel_reduce_H */ diff --git a/third_party/tbb/parallel_scan.h b/third_party/tbb/parallel_scan.h new file mode 100644 index 000000000..dba033af8 --- /dev/null +++ b/third_party/tbb/parallel_scan.h @@ -0,0 +1,631 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_parallel_scan_H +#define __TBB_parallel_scan_H + +#include "third_party/libcxx/functional" + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" + +#include "third_party/tbb/profiling.h" +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/task_group.h" + +namespace tbb { +namespace detail { +namespace d1 { + +//! Used to indicate that the initial scan is being performed. +/** @ingroup algorithms */ +struct pre_scan_tag { + static bool is_final_scan() {return false;} + operator bool() {return is_final_scan();} +}; + +//! Used to indicate that the final scan is being performed. +/** @ingroup algorithms */ +struct final_scan_tag { + static bool is_final_scan() {return true;} + operator bool() {return is_final_scan();} +}; + +template +struct sum_node; + +#if __TBB_CPP20_CONCEPTS_PRESENT +} // namespace d1 +namespace d0 { + +template +concept parallel_scan_body = splittable && + requires( Body& body, const Range& range, Body& other ) { + body(range, tbb::detail::d1::pre_scan_tag{}); + body(range, tbb::detail::d1::final_scan_tag{}); + body.reverse_join(other); + body.assign(other); + }; + +template +concept parallel_scan_function = std::invocable&, + const Range&, const Value&, bool> && + std::convertible_to&, + const Range&, const Value&, bool>, + Value>; + +template +concept parallel_scan_combine = std::invocable&, + const Value&, const Value&> && + std::convertible_to&, + const Value&, const Value&>, + Value>; + +} // namespace d0 +namespace d1 { +#endif // __TBB_CPP20_CONCEPTS_PRESENT + +//! 
Performs final scan for a leaf +/** @ingroup algorithms */ +template +struct final_sum : public task { +private: + using sum_node_type = sum_node; + Body m_body; + aligned_space m_range; + //! Where to put result of last subrange, or nullptr if not last subrange. + Body* m_stuff_last; + + wait_context& m_wait_context; + sum_node_type* m_parent = nullptr; +public: + small_object_allocator m_allocator; + final_sum( Body& body, wait_context& w_o, small_object_allocator& alloc ) : + m_body(body, split()), m_wait_context(w_o), m_allocator(alloc) { + poison_pointer(m_stuff_last); + } + + final_sum( final_sum& sum, small_object_allocator& alloc ) : + m_body(sum.m_body, split()), m_wait_context(sum.m_wait_context), m_allocator(alloc) { + poison_pointer(m_stuff_last); + } + + ~final_sum() { + m_range.begin()->~Range(); + } + void finish_construction( sum_node_type* parent, const Range& range, Body* stuff_last ) { + __TBB_ASSERT( m_parent == nullptr, nullptr ); + m_parent = parent; + new( m_range.begin() ) Range(range); + m_stuff_last = stuff_last; + } +private: + sum_node_type* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + sum_node_type* finalize(const execution_data& ed){ + sum_node_type* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } + +public: + task* execute(execution_data& ed) override { + m_body( *m_range.begin(), final_scan_tag() ); + if( m_stuff_last ) + m_stuff_last->assign(m_body); + + return finalize(ed); + } + task* cancel(execution_data& ed) override { + return finalize(ed); + } + template + void operator()( const Range& r, Tag tag ) { + m_body( r, tag ); + } + void reverse_join( final_sum& a ) { + m_body.reverse_join(a.m_body); + } + void reverse_join( Body& body ) { + m_body.reverse_join(body); + } + void assign_to( Body& body ) { + body.assign(m_body); + } + void self_destroy(const execution_data& ed) { + m_allocator.delete_object(this, ed); + } +}; + +//! Split work to be done in the scan. +/** @ingroup algorithms */ +template +struct sum_node : public task { +private: + using final_sum_type = final_sum; +public: + final_sum_type *m_incoming; + final_sum_type *m_body; + Body *m_stuff_last; +private: + final_sum_type *m_left_sum; + sum_node *m_left; + sum_node *m_right; + bool m_left_is_final; + Range m_range; + wait_context& m_wait_context; + sum_node* m_parent; + small_object_allocator m_allocator; +public: + std::atomic ref_count{0}; + sum_node( const Range range, bool left_is_final_, sum_node* parent, wait_context& w_o, small_object_allocator& alloc ) : + m_stuff_last(nullptr), + m_left_sum(nullptr), + m_left(nullptr), + m_right(nullptr), + m_left_is_final(left_is_final_), + m_range(range), + m_wait_context(w_o), + m_parent(parent), + m_allocator(alloc) + { + if( m_parent ) + m_parent->ref_count.fetch_add(1); + // Poison fields that will be set by second pass. 
+ poison_pointer(m_body); + poison_pointer(m_incoming); + } + + ~sum_node() { + if (m_parent) + m_parent->ref_count.fetch_sub(1); + } +private: + sum_node* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + task* create_child( const Range& range, final_sum_type& body, sum_node* child, final_sum_type* incoming, Body* stuff_last ) { + if( child ) { + __TBB_ASSERT( is_poisoned(child->m_body) && is_poisoned(child->m_incoming), nullptr ); + child->prepare_for_execution(body, incoming, stuff_last); + return child; + } else { + body.finish_construction(this, range, stuff_last); + return &body; + } + } + + sum_node* finalize(const execution_data& ed) { + sum_node* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } + +public: + void prepare_for_execution(final_sum_type& body, final_sum_type* incoming, Body *stuff_last) { + this->m_body = &body; + this->m_incoming = incoming; + this->m_stuff_last = stuff_last; + } + task* execute(execution_data& ed) override { + if( m_body ) { + if( m_incoming ) + m_left_sum->reverse_join( *m_incoming ); + task* right_child = this->create_child(Range(m_range,split()), *m_left_sum, m_right, m_left_sum, m_stuff_last); + task* left_child = m_left_is_final ? nullptr : this->create_child(m_range, *m_body, m_left, m_incoming, nullptr); + ref_count = (left_child != nullptr) + (right_child != nullptr); + m_body = nullptr; + if( left_child ) { + spawn(*right_child, *ed.context); + return left_child; + } else { + return right_child; + } + } else { + return finalize(ed); + } + } + task* cancel(execution_data& ed) override { + return finalize(ed); + } + void self_destroy(const execution_data& ed) { + m_allocator.delete_object(this, ed); + } + template + friend struct start_scan; + + template + friend struct finish_scan; +}; + +//! 
Combine partial results +/** @ingroup algorithms */ +template +struct finish_scan : public task { +private: + using sum_node_type = sum_node; + using final_sum_type = final_sum; + final_sum_type** const m_sum_slot; + sum_node_type*& m_return_slot; + small_object_allocator m_allocator; +public: + std::atomic m_right_zombie; + sum_node_type& m_result; + std::atomic ref_count{2}; + finish_scan* m_parent; + wait_context& m_wait_context; + task* execute(execution_data& ed) override { + __TBB_ASSERT( m_result.ref_count.load() == static_cast((m_result.m_left!=nullptr)+(m_result.m_right!=nullptr)), nullptr ); + if( m_result.m_left ) + m_result.m_left_is_final = false; + final_sum_type* right_zombie = m_right_zombie.load(std::memory_order_acquire); + if( right_zombie && m_sum_slot ) + (*m_sum_slot)->reverse_join(*m_result.m_left_sum); + __TBB_ASSERT( !m_return_slot, nullptr ); + if( right_zombie || m_result.m_right ) { + m_return_slot = &m_result; + } else { + m_result.self_destroy(ed); + } + if( right_zombie && !m_sum_slot && !m_result.m_right ) { + right_zombie->self_destroy(ed); + m_right_zombie.store(nullptr, std::memory_order_relaxed); + } + return finalize(ed); + } + task* cancel(execution_data& ed) override { + return finalize(ed); + } + finish_scan(sum_node_type*& return_slot, final_sum_type** sum, sum_node_type& result_, finish_scan* parent, wait_context& w_o, small_object_allocator& alloc) : + m_sum_slot(sum), + m_return_slot(return_slot), + m_allocator(alloc), + m_right_zombie(nullptr), + m_result(result_), + m_parent(parent), + m_wait_context(w_o) + { + __TBB_ASSERT( !m_return_slot, nullptr ); + } +private: + finish_scan* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + finish_scan* finalize(const execution_data& ed) { + finish_scan* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } +}; + +//! Initial task to split the work +/** @ingroup algorithms */ +template +struct start_scan : public task { +private: + using sum_node_type = sum_node; + using final_sum_type = final_sum; + using finish_pass1_type = finish_scan; + std::reference_wrapper m_return_slot; + Range m_range; + std::reference_wrapper m_body; + typename Partitioner::partition_type m_partition; + /** Non-null if caller is requesting total. 
*/ + final_sum_type** m_sum_slot; + bool m_is_final; + bool m_is_right_child; + + finish_pass1_type* m_parent; + small_object_allocator m_allocator; + wait_context& m_wait_context; + + finish_pass1_type* release_parent() { + call_itt_task_notify(releasing, m_parent); + if (m_parent) { + auto parent = m_parent; + m_parent = nullptr; + if (parent->ref_count.fetch_sub(1) == 1) { + return parent; + } + } + else + m_wait_context.release(); + return nullptr; + } + + finish_pass1_type* finalize( const execution_data& ed ) { + finish_pass1_type* next_task = release_parent(); + m_allocator.delete_object(this, ed); + return next_task; + } + +public: + task* execute( execution_data& ) override; + task* cancel( execution_data& ed ) override { + return finalize(ed); + } + start_scan( sum_node_type*& return_slot, start_scan& parent, small_object_allocator& alloc ) : + m_return_slot(return_slot), + m_range(parent.m_range,split()), + m_body(parent.m_body), + m_partition(parent.m_partition,split()), + m_sum_slot(parent.m_sum_slot), + m_is_final(parent.m_is_final), + m_is_right_child(true), + m_parent(parent.m_parent), + m_allocator(alloc), + m_wait_context(parent.m_wait_context) + { + __TBB_ASSERT( !m_return_slot, nullptr ); + parent.m_is_right_child = false; + } + + start_scan( sum_node_type*& return_slot, const Range& range, final_sum_type& body, const Partitioner& partitioner, wait_context& w_o, small_object_allocator& alloc ) : + m_return_slot(return_slot), + m_range(range), + m_body(body), + m_partition(partitioner), + m_sum_slot(nullptr), + m_is_final(true), + m_is_right_child(false), + m_parent(nullptr), + m_allocator(alloc), + m_wait_context(w_o) + { + __TBB_ASSERT( !m_return_slot, nullptr ); + } + + static void run( const Range& range, Body& body, const Partitioner& partitioner ) { + if( !range.empty() ) { + task_group_context context(PARALLEL_SCAN); + + using start_pass1_type = start_scan; + sum_node_type* root = nullptr; + wait_context w_ctx{1}; + small_object_allocator alloc{}; + + auto& temp_body = *alloc.new_object(body, w_ctx, alloc); + temp_body.reverse_join(body); + + auto& pass1 = *alloc.new_object(/*m_return_slot=*/root, range, temp_body, partitioner, w_ctx, alloc); + + execute_and_wait(pass1, context, w_ctx, context); + if( root ) { + root->prepare_for_execution(temp_body, nullptr, &body); + w_ctx.reserve(); + execute_and_wait(*root, context, w_ctx, context); + } else { + temp_body.assign_to(body); + temp_body.finish_construction(nullptr, range, nullptr); + alloc.delete_object(&temp_body); + } + } + } +}; + +template +task* start_scan::execute( execution_data& ed ) { + // Inspecting m_parent->result.left_sum would ordinarily be a race condition. + // But we inspect it only if we are not a stolen task, in which case we + // know that task assigning to m_parent->result.left_sum has completed. 
+ __TBB_ASSERT(!m_is_right_child || m_parent, "right child is never an orphan"); + bool treat_as_stolen = m_is_right_child && (is_stolen(ed) || &m_body.get()!=m_parent->m_result.m_left_sum); + if( treat_as_stolen ) { + // Invocation is for right child that has been really stolen or needs to be virtually stolen + small_object_allocator alloc{}; + final_sum_type* right_zombie = alloc.new_object(m_body, alloc); + m_parent->m_right_zombie.store(right_zombie, std::memory_order_release); + m_body = *right_zombie; + m_is_final = false; + } + task* next_task = nullptr; + if( (m_is_right_child && !treat_as_stolen) || !m_range.is_divisible() || m_partition.should_execute_range(ed) ) { + if( m_is_final ) + m_body(m_range, final_scan_tag()); + else if( m_sum_slot ) + m_body(m_range, pre_scan_tag()); + if( m_sum_slot ) + *m_sum_slot = &m_body.get(); + __TBB_ASSERT( !m_return_slot, nullptr ); + + next_task = finalize(ed); + } else { + small_object_allocator alloc{}; + auto result = alloc.new_object(m_range,/*m_left_is_final=*/m_is_final, m_parent? &m_parent->m_result: nullptr, m_wait_context, alloc); + + auto new_parent = alloc.new_object(m_return_slot, m_sum_slot, *result, m_parent, m_wait_context, alloc); + m_parent = new_parent; + + // Split off right child + auto& right_child = *alloc.new_object(/*m_return_slot=*/result->m_right, *this, alloc); + + spawn(right_child, *ed.context); + + m_sum_slot = &result->m_left_sum; + m_return_slot = result->m_left; + + __TBB_ASSERT( !m_return_slot, nullptr ); + next_task = this; + } + return next_task; +} + +template +class lambda_scan_body { + Value m_sum_slot; + const Value& identity_element; + const Scan& m_scan; + const ReverseJoin& m_reverse_join; +public: + void operator=(const lambda_scan_body&) = delete; + lambda_scan_body(const lambda_scan_body&) = default; + + lambda_scan_body( const Value& identity, const Scan& scan, const ReverseJoin& rev_join ) + : m_sum_slot(identity) + , identity_element(identity) + , m_scan(scan) + , m_reverse_join(rev_join) {} + + lambda_scan_body( lambda_scan_body& b, split ) + : m_sum_slot(b.identity_element) + , identity_element(b.identity_element) + , m_scan(b.m_scan) + , m_reverse_join(b.m_reverse_join) {} + + template + void operator()( const Range& r, Tag tag ) { + m_sum_slot = tbb::detail::invoke(m_scan, r, m_sum_slot, tag); + } + + void reverse_join( lambda_scan_body& a ) { + m_sum_slot = tbb::detail::invoke(m_reverse_join, a.m_sum_slot, m_sum_slot); + } + + void assign( lambda_scan_body& b ) { + m_sum_slot = b.m_sum_slot; + } + + Value result() const { + return m_sum_slot; + } +}; + +// Requirements on Range concept are documented in blocked_range.h + +/** \page parallel_scan_body_req Requirements on parallel_scan body + Class \c Body implementing the concept of parallel_scan body must define: + - \code Body::Body( Body&, split ); \endcode Splitting constructor. + Split \c b so that \c this and \c b can accumulate separately + - \code Body::~Body(); \endcode Destructor + - \code void Body::operator()( const Range& r, pre_scan_tag ); \endcode + Preprocess iterations for range \c r + - \code void Body::operator()( const Range& r, final_scan_tag ); \endcode + Do final processing for iterations of range \c r + - \code void Body::reverse_join( Body& a ); \endcode + Merge preprocessing state of \c a into \c this, where \c a was + created earlier from \c b by b's splitting constructor +**/ + +/** \name parallel_scan + See also requirements on \ref range_req "Range" and \ref parallel_scan_body_req "parallel_scan Body". 
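A minimal body satisfying the requirements listed above: a prefix (running) sum in which the pre_scan pass only accumulates and the final_scan pass also writes results out. Names are illustrative:

#include "third_party/tbb/parallel_scan.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>

struct PrefixSumBody {
    const int* in;
    int* out;
    int sum = 0;
    PrefixSumBody(const int* i, int* o) : in(i), out(o) {}
    PrefixSumBody(PrefixSumBody& b, tbb::split) : in(b.in), out(b.out) {}  // accumulate separately
    template <typename Tag>
    void operator()(const tbb::blocked_range<std::size_t>& r, Tag) {
        int running = sum;
        for (std::size_t i = r.begin(); i != r.end(); ++i) {
            running += in[i];
            if (Tag::is_final_scan()) out[i] = running;   // only the final pass stores
        }
        sum = running;
    }
    void reverse_join(PrefixSumBody& left) { sum = left.sum + sum; }  // `left` covers earlier iterations
    void assign(PrefixSumBody& b) { sum = b.sum; }
};

void prefix_sum(const int* in, int* out, std::size_t n) {
    PrefixSumBody body(in, out);
    tbb::parallel_scan(tbb::blocked_range<std::size_t>(0, n), body);
    // body.sum now holds the grand total.
}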
**/ +//@{ + +//! Parallel prefix with default partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_body) +void parallel_scan( const Range& range, Body& body ) { + start_scan::run(range,body,__TBB_DEFAULT_PARTITIONER()); +} + +//! Parallel prefix with simple_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_body) +void parallel_scan( const Range& range, Body& body, const simple_partitioner& partitioner ) { + start_scan::run(range, body, partitioner); +} + +//! Parallel prefix with auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_body) +void parallel_scan( const Range& range, Body& body, const auto_partitioner& partitioner ) { + start_scan::run(range, body, partitioner); +} + +//! Parallel prefix with default partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_function && + parallel_scan_combine) +Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join ) { + lambda_scan_body body(identity, scan, reverse_join); + parallel_scan(range, body, __TBB_DEFAULT_PARTITIONER()); + return body.result(); +} + +//! Parallel prefix with simple_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_function && + parallel_scan_combine) +Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join, + const simple_partitioner& partitioner ) { + lambda_scan_body body(identity, scan, reverse_join); + parallel_scan(range, body, partitioner); + return body.result(); +} + +//! Parallel prefix with auto_partitioner +/** @ingroup algorithms **/ +template + __TBB_requires(tbb_range && parallel_scan_function && + parallel_scan_combine) +Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join, + const auto_partitioner& partitioner ) { + lambda_scan_body body(identity, scan, reverse_join); + parallel_scan(range, body, partitioner); + return body.result(); +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::parallel_scan; + using detail::d1::pre_scan_tag; + using detail::d1::final_scan_tag; +} // namespace v1 + +} // namespace tbb + +#endif /* __TBB_parallel_scan_H */ diff --git a/third_party/tbb/parallel_sort.h b/third_party/tbb/parallel_sort.h new file mode 100644 index 000000000..b089b9d99 --- /dev/null +++ b/third_party/tbb/parallel_sort.h @@ -0,0 +1,289 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
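The functional parallel_scan overloads above express the same computation with an identity, a scan functor (whose bool argument distinguishes the final pass), and a combiner. A short sketch with illustrative names:

#include "third_party/tbb/parallel_scan.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>
#include <vector>

int prefix_sum(const std::vector<int>& in, std::vector<int>& out) {
    return tbb::parallel_scan(
        tbb::blocked_range<std::size_t>(0, in.size()),
        0,                                                        // identity
        [&](const tbb::blocked_range<std::size_t>& r, int sum, bool is_final) {
            for (std::size_t i = r.begin(); i != r.end(); ++i) {
                sum += in[i];
                if (is_final) out[i] = sum;
            }
            return sum;
        },
        [](int left, int right) { return left + right; });        // combine partial sums
}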
+*/ + +#ifndef __TBB_parallel_sort_H +#define __TBB_parallel_sort_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/parallel_for.h" +#include "third_party/tbb/blocked_range.h" +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/iterator" +#include "third_party/libcxx/functional" +#include "third_party/libcxx/cstddef" + +namespace tbb { +namespace detail { +#if __TBB_CPP20_CONCEPTS_PRESENT +inline namespace d0 { + +// TODO: consider using std::strict_weak_order concept +template +concept compare = requires( const std::remove_reference_t& comp, typename std::iterator_traits::reference value ) { + // Forward via iterator_traits::reference + { comp(typename std::iterator_traits::reference(value), + typename std::iterator_traits::reference(value)) } -> std::convertible_to; +}; + +// Inspired by std::__PartiallyOrderedWith exposition only concept +template +concept less_than_comparable = requires( const std::remove_reference_t& lhs, + const std::remove_reference_t& rhs ) { + { lhs < rhs } -> boolean_testable; +}; + +} // namespace d0 +#endif // __TBB_CPP20_CONCEPTS_PRESENT +namespace d1 { + +//! Range used in quicksort to split elements into subranges based on a value. +/** The split operation selects a splitter and places all elements less than or equal + to the value in the first range and the remaining elements in the second range. + @ingroup algorithms */ +template +class quick_sort_range { + std::size_t median_of_three( const RandomAccessIterator& array, std::size_t l, std::size_t m, std::size_t r ) const { + return comp(array[l], array[m]) ? ( comp(array[m], array[r]) ? m : ( comp(array[l], array[r]) ? r : l ) ) + : ( comp(array[r], array[m]) ? m : ( comp(array[r], array[l]) ? r : l ) ); + } + + std::size_t pseudo_median_of_nine( const RandomAccessIterator& array, const quick_sort_range& range ) const { + std::size_t offset = range.size / 8u; + return median_of_three(array, + median_of_three(array, 0 , offset, offset * 2), + median_of_three(array, offset * 3, offset * 4, offset * 5), + median_of_three(array, offset * 6, offset * 7, range.size - 1)); + + } + + std::size_t split_range( quick_sort_range& range ) { + RandomAccessIterator array = range.begin; + RandomAccessIterator first_element = range.begin; + std::size_t m = pseudo_median_of_nine(array, range); + if( m != 0 ) std::iter_swap(array, array + m); + + std::size_t i = 0; + std::size_t j = range.size; + // Partition interval [i + 1,j - 1] with key *first_element. + for(;;) { + __TBB_ASSERT( i < j, nullptr ); + // Loop must terminate since array[l] == *first_element. + do { + --j; + __TBB_ASSERT( i <= j, "bad ordering relation?" ); + } while( comp(*first_element, array[j]) ); + do { + __TBB_ASSERT( i <= j, nullptr ); + if( i == j ) goto partition; + ++i; + } while( comp(array[i], *first_element) ); + if( i == j ) goto partition; + std::iter_swap(array + i, array + j); + } +partition: + // Put the partition key were it belongs + std::iter_swap(array + j, first_element); + // array[l..j) is less or equal to key. + // array(j..r) is greater or equal to key. 
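A worked illustration of the pivot selection above (a sketch, not library code): median_of_three orders three probes with the comparator, and pseudo_median_of_nine takes the median of three such medians drawn from the front, middle, and back of the range, which makes a pathological pivot much less likely.

#include <cstddef>

// Same selection logic as median_of_three above, specialized to int and operator<.
static std::size_t median_of_three_idx(const int* a, std::size_t l, std::size_t m, std::size_t r) {
    return a[l] < a[m] ? (a[m] < a[r] ? m : (a[l] < a[r] ? r : l))
                       : (a[r] < a[m] ? m : (a[r] < a[l] ? r : l));
}

// For a strictly descending array of 80 elements, offset = 80/8 = 10, so the
// probe triples are {0,10,20}, {30,40,50}, {60,70,79}; each yields its middle
// index, and the median of indices {10,40,70} is 40, an element near the true
// median, so the resulting split is close to even.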
+ // array[j] is equal to key + i = j + 1; + std::size_t new_range_size = range.size - i; + range.size = j; + return new_range_size; + } + +public: + quick_sort_range() = default; + quick_sort_range( const quick_sort_range& ) = default; + void operator=( const quick_sort_range& ) = delete; + + static constexpr std::size_t grainsize = 500; + const Compare& comp; + std::size_t size; + RandomAccessIterator begin; + + quick_sort_range( RandomAccessIterator begin_, std::size_t size_, const Compare& comp_ ) : + comp(comp_), size(size_), begin(begin_) {} + + bool empty() const { return size == 0; } + bool is_divisible() const { return size >= grainsize; } + + quick_sort_range( quick_sort_range& range, split ) + : comp(range.comp) + , size(split_range(range)) + // +1 accounts for the pivot element, which is at its correct place + // already and, therefore, is not included into subranges. + , begin(range.begin + range.size + 1) {} +}; + +//! Body class used to test if elements in a range are presorted +/** @ingroup algorithms */ +template +class quick_sort_pretest_body { + const Compare& comp; + task_group_context& context; + +public: + quick_sort_pretest_body() = default; + quick_sort_pretest_body( const quick_sort_pretest_body& ) = default; + void operator=( const quick_sort_pretest_body& ) = delete; + + quick_sort_pretest_body( const Compare& _comp, task_group_context& _context ) : comp(_comp), context(_context) {} + + void operator()( const blocked_range& range ) const { + RandomAccessIterator my_end = range.end(); + + int i = 0; + //TODO: consider using std::is_sorted() for each 64 iterations (requires performance measurements) + for( RandomAccessIterator k = range.begin(); k != my_end; ++k, ++i ) { + if( i % 64 == 0 && context.is_group_execution_cancelled() ) break; + + // The k - 1 is never out-of-range because the first chunk starts at begin+serial_cutoff+1 + if( comp(*(k), *(k - 1)) ) { + context.cancel_group_execution(); + break; + } + } + } +}; + +//! Body class used to sort elements in a range that is smaller than the grainsize. +/** @ingroup algorithms */ +template +struct quick_sort_body { + void operator()( const quick_sort_range& range ) const { + std::sort(range.begin, range.begin + range.size, range.comp); + } +}; + +//! Method to perform parallel_for based quick sort. +/** @ingroup algorithms */ +template +void do_parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { + parallel_for(quick_sort_range(begin, end - begin, comp), + quick_sort_body(), + auto_partitioner()); +} + +//! Wrapper method to initiate the sort by calling parallel_for. +/** @ingroup algorithms */ +template +void parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { + task_group_context my_context(PARALLEL_SORT); + constexpr int serial_cutoff = 9; + + __TBB_ASSERT( begin + serial_cutoff < end, "min_parallel_size is smaller than serial cutoff?" 
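The pretest body above relies on cooperative cancellation: workers scan disjoint chunks, and the first one to see out-of-order neighbours cancels the whole group so the remaining chunks stop early. A generic sketch of that pattern (probably_sorted is an illustrative name):

#include "third_party/tbb/parallel_for.h"
#include "third_party/tbb/blocked_range.h"
#include "third_party/tbb/task_group.h"
#include <cstddef>
#include <vector>

bool probably_sorted(const std::vector<int>& v) {
    if (v.size() < 2) return true;
    tbb::task_group_context ctx;
    tbb::parallel_for(
        tbb::blocked_range<std::size_t>(1, v.size()),
        [&](const tbb::blocked_range<std::size_t>& r) {
            for (std::size_t i = r.begin(); i != r.end(); ++i) {
                if (ctx.is_group_execution_cancelled()) return;   // someone already failed
                if (v[i] < v[i - 1]) { ctx.cancel_group_execution(); return; }
            }
        },
        tbb::auto_partitioner(), ctx);
    return !ctx.is_group_execution_cancelled();   // cancelled == misordered pair found
}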
); + RandomAccessIterator k = begin; + for( ; k != begin + serial_cutoff; ++k ) { + if( comp(*(k + 1), *k) ) { + do_parallel_quick_sort(begin, end, comp); + return; + } + } + + // Check is input range already sorted + parallel_for(blocked_range(k + 1, end), + quick_sort_pretest_body(comp, my_context), + auto_partitioner(), + my_context); + + if( my_context.is_group_execution_cancelled() ) + do_parallel_quick_sort(begin, end, comp); +} + +/** \page parallel_sort_iter_req Requirements on iterators for parallel_sort + Requirements on the iterator type \c It and its value type \c T for \c parallel_sort: + + - \code void iter_swap( It a, It b ) \endcode Swaps the values of the elements the given + iterators \c a and \c b are pointing to. \c It should be a random access iterator. + + - \code bool Compare::operator()( const T& x, const T& y ) \endcode True if x comes before y; +**/ + +/** \name parallel_sort + See also requirements on \ref parallel_sort_iter_req "iterators for parallel_sort". **/ +//@{ + +#if __TBB_CPP20_CONCEPTS_PRESENT +template +using iter_value_type = typename std::iterator_traits::value_type; + +template +using range_value_type = typename std::iterator_traits>::value_type; +#endif + +//! Sorts the data in [begin,end) using the given comparator +/** The compare function object is used for all comparisons between elements during sorting. + The compare object must define a bool operator() function. + @ingroup algorithms **/ +template + __TBB_requires(std::random_access_iterator && + compare && + std::movable>) +void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { + constexpr int min_parallel_size = 500; + if( end > begin ) { + if( end - begin < min_parallel_size ) { + std::sort(begin, end, comp); + } else { + parallel_quick_sort(begin, end, comp); + } + } +} + +//! Sorts the data in [begin,end) with a default comparator \c std::less +/** @ingroup algorithms **/ +template + __TBB_requires(std::random_access_iterator && + less_than_comparable> && + std::movable>) +void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end ) { + parallel_sort(begin, end, std::less::value_type>()); +} + +//! Sorts the data in rng using the given comparator +/** @ingroup algorithms **/ +template + __TBB_requires(container_based_sequence && + compare> && + std::movable>) +void parallel_sort( Range&& rng, const Compare& comp ) { + parallel_sort(std::begin(rng), std::end(rng), comp); +} + +//! Sorts the data in rng with a default comparator \c std::less +/** @ingroup algorithms **/ +template + __TBB_requires(container_based_sequence && + less_than_comparable> && + std::movable>) +void parallel_sort( Range&& rng ) { + parallel_sort(std::begin(rng), std::end(rng)); +} +//@} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::parallel_sort; +} // namespace v1 +} // namespace tbb + +#endif /*__TBB_parallel_sort_H*/ diff --git a/third_party/tbb/partitioner.h b/third_party/tbb/partitioner.h new file mode 100644 index 000000000..25a300028 --- /dev/null +++ b/third_party/tbb/partitioner.h @@ -0,0 +1,682 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
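Usage sketch for the public entry points above: sorting with an iterator pair, with a custom comparator, and with a whole container (include paths follow this tree's layout):

#include "third_party/tbb/parallel_sort.h"
#include <functional>
#include <vector>

void sort_examples(std::vector<int>& v) {
    tbb::parallel_sort(v.begin(), v.end());                        // iterator pair, std::less
    tbb::parallel_sort(v.begin(), v.end(), std::greater<int>());   // custom comparator
    tbb::parallel_sort(v);                                         // container overload
}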
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_partitioner_H +#define __TBB_partitioner_H + +#ifndef __TBB_INITIAL_CHUNKS +// initial task divisions per thread +#define __TBB_INITIAL_CHUNKS 2 +#endif +#ifndef __TBB_RANGE_POOL_CAPACITY +// maximum number of elements in range pool +#define __TBB_RANGE_POOL_CAPACITY 8 +#endif +#ifndef __TBB_INIT_DEPTH +// initial value for depth of range pool +#define __TBB_INIT_DEPTH 5 +#endif +#ifndef __TBB_DEMAND_DEPTH_ADD +// when imbalance is found range splits this value times more +#define __TBB_DEMAND_DEPTH_ADD 1 +#endif + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_range_common.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/task_group.h" // task_group_context +#include "third_party/tbb/task_arena.h" + +#include "third_party/libcxx/algorithm" +#include "third_party/libcxx/atomic" +#include "third_party/libcxx/type_traits" + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + #pragma warning (push) + #pragma warning (disable: 4244) +#endif + +namespace tbb { +namespace detail { + +namespace d1 { +class auto_partitioner; +class simple_partitioner; +class static_partitioner; +class affinity_partitioner; +class affinity_partition_type; +class affinity_partitioner_base; + +inline std::size_t get_initial_auto_partitioner_divisor() { + const std::size_t factor = 4; + return factor * static_cast(max_concurrency()); +} + +//! Defines entry point for affinity partitioner into oneTBB run-time library. +class affinity_partitioner_base: no_copy { + friend class affinity_partitioner; + friend class affinity_partition_type; + //! Array that remembers affinities of tree positions to affinity_id. + /** nullptr if my_size==0. */ + slot_id* my_array; + //! Number of elements in my_array. + std::size_t my_size; + //! Zeros the fields. + affinity_partitioner_base() : my_array(nullptr), my_size(0) {} + //! Deallocates my_array. + ~affinity_partitioner_base() { resize(0); } + //! Resize my_array. + /** Retains values if resulting size is the same. */ + void resize(unsigned factor) { + // Check factor to avoid asking for number of workers while there might be no arena. + unsigned max_threads_in_arena = static_cast(max_concurrency()); + std::size_t new_size = factor ? factor * max_threads_in_arena : 0; + if (new_size != my_size) { + if (my_array) { + r1::cache_aligned_deallocate(my_array); + // Following two assignments must be done here for sake of exception safety. 
+ my_array = nullptr; + my_size = 0; + } + if (new_size) { + my_array = static_cast(r1::cache_aligned_allocate(new_size * sizeof(slot_id))); + std::fill_n(my_array, new_size, no_slot); + my_size = new_size; + } + } + } +}; + +template struct start_for; +template struct start_scan; +template struct start_reduce; +template struct start_deterministic_reduce; + +struct node { + node* my_parent{}; + std::atomic m_ref_count{}; + + node() = default; + node(node* parent, int ref_count) : + my_parent{parent}, m_ref_count{ref_count} { + __TBB_ASSERT(ref_count > 0, "The ref count must be positive"); + } +}; + +struct wait_node : node { + wait_node() : node{ nullptr, 1 } {} + wait_context m_wait{1}; +}; + +//! Join task node that contains shared flag for stealing feedback +struct tree_node : public node { + small_object_allocator m_allocator; + std::atomic m_child_stolen{false}; + + tree_node(node* parent, int ref_count, small_object_allocator& alloc) + : node{parent, ref_count} + , m_allocator{alloc} {} + + void join(task_group_context*) {/*dummy, required only for reduction algorithms*/}; + + template + static void mark_task_stolen(Task &t) { + std::atomic &flag = static_cast(t.my_parent)->m_child_stolen; +#if TBB_USE_PROFILING_TOOLS + // Threading tools respect lock prefix but report false-positive data-race via plain store + flag.exchange(true); +#else + flag.store(true, std::memory_order_relaxed); +#endif // TBB_USE_PROFILING_TOOLS + } + template + static bool is_peer_stolen(Task &t) { + return static_cast(t.my_parent)->m_child_stolen.load(std::memory_order_relaxed); + } +}; + +// Context used to check cancellation state during reduction join process +template +void fold_tree(node* n, const execution_data& ed) { + for (;;) { + __TBB_ASSERT(n, nullptr); + __TBB_ASSERT(n->m_ref_count.load(std::memory_order_relaxed) > 0, "The refcount must be positive."); + call_itt_task_notify(releasing, n); + if (--n->m_ref_count > 0) { + return; + } + node* parent = n->my_parent; + if (!parent) { + break; + }; + + call_itt_task_notify(acquired, n); + TreeNodeType* self = static_cast(n); + self->join(ed.context); + self->m_allocator.delete_object(self, ed); + n = parent; + } + // Finish parallel for execution when the root (last node) is reached + static_cast(n)->m_wait.release(); +} + +//! Depth is a relative depth of recursive division inside a range pool. Relative depth allows +//! infinite absolute depth of the recursion for heavily unbalanced workloads with range represented +//! by a number that cannot fit into machine word. +typedef unsigned char depth_t; + +//! Range pool stores ranges of type T in a circular buffer with MaxCapacity +template +class range_vector { + depth_t my_head; + depth_t my_tail; + depth_t my_size; + depth_t my_depth[MaxCapacity]; // relative depths of stored ranges + tbb::detail::aligned_space my_pool; + +public: + //! initialize via first range in pool + range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) { + my_depth[0] = 0; + new( static_cast(my_pool.begin()) ) T(elem);//TODO: std::move? + } + ~range_vector() { + while( !empty() ) pop_back(); + } + bool empty() const { return my_size == 0; } + depth_t size() const { return my_size; } + //! Populates range pool via ranges up to max depth or while divisible + //! max_depth starts from 0, e.g. 
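A simplified sketch (not the library code) of the bottom-up folding pattern fold_tree implements above: each finished child decrements its parent's reference count, and whichever child brings the count to zero continues folding upward until the root's waiter is released.

#include <atomic>

struct RefNode {                               // illustrative stand-in for `node`
    RefNode* parent = nullptr;
    std::atomic<int> ref_count{0};
};

template <typename Join, typename Release>
void fold_upward(RefNode* n, Join join_and_free, Release release_root) {
    for (;;) {
        if (--n->ref_count > 0) return;        // a sibling still runs; it will continue the fold
        RefNode* parent = n->parent;
        if (!parent) break;                    // reached the root
        join_and_free(n);                      // merge this subtree's result into the parent
        n = parent;
    }
    release_root(n);                           // wake the thread waiting on the whole tree
}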
value 2 makes 3 ranges in the pool up to two 1/4 pieces + void split_to_fill(depth_t max_depth) { + while( my_size < MaxCapacity && is_divisible(max_depth) ) { + depth_t prev = my_head; + my_head = (my_head + 1) % MaxCapacity; + new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move? + my_pool.begin()[prev].~T(); // instead of assignment + new(my_pool.begin()+prev) T(my_pool.begin()[my_head], detail::split()); // do 'inverse' split + my_depth[my_head] = ++my_depth[prev]; + my_size++; + } + } + void pop_back() { + __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size"); + my_pool.begin()[my_head].~T(); + my_size--; + my_head = (my_head + MaxCapacity - 1) % MaxCapacity; + } + void pop_front() { + __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size"); + my_pool.begin()[my_tail].~T(); + my_size--; + my_tail = (my_tail + 1) % MaxCapacity; + } + T& back() { + __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size"); + return my_pool.begin()[my_head]; + } + T& front() { + __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size"); + return my_pool.begin()[my_tail]; + } + //! similarly to front(), returns depth of the first range in the pool + depth_t front_depth() { + __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size"); + return my_depth[my_tail]; + } + depth_t back_depth() { + __TBB_ASSERT(my_size > 0, "range_vector::back_depth() with empty size"); + return my_depth[my_head]; + } + bool is_divisible(depth_t max_depth) { + return back_depth() < max_depth && back().is_divisible(); + } +}; + +//! Provides default methods for partition objects and common algorithm blocks. +template +struct partition_type_base { + typedef detail::split split_type; + // decision makers + void note_affinity( slot_id ) {} + template + bool check_being_stolen(Task&, const execution_data&) { return false; } // part of old should_execute_range() + template split_type get_split() { return split(); } + Partition& self() { return *static_cast(this); } // CRTP helper + + template + void work_balance(StartType &start, Range &range, const execution_data&) { + start.run_body( range ); // static partitioner goes here + } + + template + void execute(StartType &start, Range &range, execution_data& ed) { + // The algorithm in a few words ([]-denotes calls to decision methods of partitioner): + // [If this task is stolen, adjust depth and divisions if necessary, set flag]. + // If range is divisible { + // Spread the work while [initial divisions left]; + // Create trap task [if necessary]; + // } + // If not divisible or [max depth is reached], execute, else do the range pool part + if ( range.is_divisible() ) { + if ( self().is_divisible() ) { + do { // split until is divisible + typename Partition::split_type split_obj = self().template get_split(); + start.offer_work( split_obj, ed ); + } while ( range.is_divisible() && self().is_divisible() ); + } + } + self().work_balance(start, range, ed); + } +}; + +//! Provides default splitting strategy for partition objects. +template +struct adaptive_mode : partition_type_base { + typedef Partition my_partition; + std::size_t my_divisor; + // For affinity_partitioner, my_divisor indicates the number of affinity array indices the task reserves. + // A task which has only one index must produce the right split without reserved index in order to avoid + // it to be overwritten in note_affinity() of the created (right) task. + // I.e. 
a task created deeper than the affinity array can remember must not save its affinity (LIFO order) + static const unsigned factor = 1; + adaptive_mode() : my_divisor(get_initial_auto_partitioner_divisor() / 4 * my_partition::factor) {} + adaptive_mode(adaptive_mode &src, split) : my_divisor(do_split(src, split())) {} + adaptive_mode(adaptive_mode&, const proportional_split&) : my_divisor(0) + { + // left blank as my_divisor gets overridden in the successors' constructors + } + /*! Override do_split methods in order to specify splitting strategy */ + std::size_t do_split(adaptive_mode &src, split) { + return src.my_divisor /= 2u; + } +}; + + +//! Provides proportional splitting strategy for partition objects +template +struct proportional_mode : adaptive_mode { + typedef Partition my_partition; + using partition_type_base::self; // CRTP helper to get access to derived classes + + proportional_mode() : adaptive_mode() {} + proportional_mode(proportional_mode &src, split) : adaptive_mode(src, split()) {} + proportional_mode(proportional_mode &src, const proportional_split& split_obj) + : adaptive_mode(src, split_obj) + { + self().my_divisor = do_split(src, split_obj); + } + std::size_t do_split(proportional_mode &src, const proportional_split& split_obj) { + std::size_t portion = split_obj.right() * my_partition::factor; + portion = (portion + my_partition::factor/2) & (0ul - my_partition::factor); + src.my_divisor -= portion; + return portion; + } + bool is_divisible() { // part of old should_execute_range() + return self().my_divisor > my_partition::factor; + } + template + proportional_split get_split() { + // Create the proportion from partitioner internal resources (threads) that would be used: + // - into proportional_mode constructor to split the partitioner + // - if Range supports the proportional_split constructor it would use proposed proportion, + // otherwise, the tbb::proportional_split object will be implicitly (for Range implementor) + // casted to tbb::split + + std::size_t n = self().my_divisor / my_partition::factor; + std::size_t right = n / 2; + std::size_t left = n - right; + return proportional_split(left, right); + } +}; + +static std::size_t get_initial_partition_head() { + int current_index = tbb::this_task_arena::current_thread_index(); + if (current_index == tbb::task_arena::not_initialized) + current_index = 0; + return size_t(current_index); +} + +//! Provides default linear indexing of partitioner's sequence +template +struct linear_affinity_mode : proportional_mode { + std::size_t my_head; + std::size_t my_max_affinity; + using proportional_mode::self; + linear_affinity_mode() : proportional_mode(), my_head(get_initial_partition_head()), + my_max_affinity(self().my_divisor) {} + linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode(src, split()) + , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} + linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode(src, split_obj) + , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} + void spawn_task(task& t, task_group_context& ctx) { + if (self().my_divisor) { + spawn(t, ctx, slot_id(my_head)); + } else { + spawn(t, ctx); + } + } +}; + +static bool is_stolen_task(const execution_data& ed) { + return execution_slot(ed) != original_slot(ed); +} + +/*! 
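Two small worked notes on the arithmetic above, assuming a power-of-two factor (as affinity_partition_type asserts further below): the adaptive divisor simply halves on every split (do_split returns src.my_divisor /= 2u), and the proportional portion is rounded to the nearest multiple of the factor, since 0ul - factor equals ~(factor - 1).

#include <cstddef>

// Minimal sketch of the rounding in proportional_mode::do_split.
constexpr std::size_t round_to_factor(std::size_t portion, std::size_t factor) {
    return (portion + factor / 2) & (0ul - factor);   // nearest multiple, ties round up
}
static_assert(round_to_factor(40, 16) == 48, "tie rounds up");
static_assert(round_to_factor(52, 16) == 48, "rounds to nearest multiple");
static_assert(round_to_factor(60, 16) == 64, "rounds to nearest multiple");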
Determine work-balance phase implementing splitting & stealing actions */ +template +struct dynamic_grainsize_mode : Mode { + using Mode::self; + enum { + begin = 0, + run, + pass + } my_delay; + depth_t my_max_depth; + static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY; + dynamic_grainsize_mode(): Mode() + , my_delay(begin) + , my_max_depth(__TBB_INIT_DEPTH) {} + dynamic_grainsize_mode(dynamic_grainsize_mode& p, split) + : Mode(p, split()) + , my_delay(pass) + , my_max_depth(p.my_max_depth) {} + dynamic_grainsize_mode(dynamic_grainsize_mode& p, const proportional_split& split_obj) + : Mode(p, split_obj) + , my_delay(begin) + , my_max_depth(p.my_max_depth) {} + template + bool check_being_stolen(Task &t, const execution_data& ed) { // part of old should_execute_range() + if( !(self().my_divisor / Mode::my_partition::factor) ) { // if not from the top P tasks of binary tree + self().my_divisor = 1; // TODO: replace by on-stack flag (partition_state's member)? + if( is_stolen_task(ed) && t.my_parent->m_ref_count >= 2 ) { // runs concurrently with the left task +#if __TBB_USE_OPTIONAL_RTTI + // RTTI is available, check whether the cast is valid + // TODO: TBB_REVAMP_TODO __TBB_ASSERT(dynamic_cast(t.m_parent), 0); + // correctness of the cast relies on avoiding the root task for which: + // - initial value of my_divisor != 0 (protected by separate assertion) + // - is_stolen_task() always returns false for the root task. +#endif + tree_node::mark_task_stolen(t); + if( !my_max_depth ) my_max_depth++; + my_max_depth += __TBB_DEMAND_DEPTH_ADD; + return true; + } + } + return false; + } + depth_t max_depth() { return my_max_depth; } + void align_depth(depth_t base) { + __TBB_ASSERT(base <= my_max_depth, nullptr); + my_max_depth -= base; + } + template + void work_balance(StartType &start, Range &range, execution_data& ed) { + if( !range.is_divisible() || !self().max_depth() ) { + start.run_body( range ); + } + else { // do range pool + range_vector range_pool(range); + do { + range_pool.split_to_fill(self().max_depth()); // fill range pool + if( self().check_for_demand( start ) ) { + if( range_pool.size() > 1 ) { + start.offer_work( range_pool.front(), range_pool.front_depth(), ed ); + range_pool.pop_front(); + continue; + } + if( range_pool.is_divisible(self().max_depth()) ) // was not enough depth to fork a task + continue; // note: next split_to_fill() should split range at least once + } + start.run_body( range_pool.back() ); + range_pool.pop_back(); + } while( !range_pool.empty() && !ed.context->is_group_execution_cancelled() ); + } + } + template + bool check_for_demand(Task& t) { + if ( pass == my_delay ) { + if ( self().my_divisor > 1 ) // produce affinitized tasks while they have slot in array + return true; // do not do my_max_depth++ here, but be sure range_pool is splittable once more + else if ( self().my_divisor && my_max_depth ) { // make balancing task + self().my_divisor = 0; // once for each task; depth will be decreased in align_depth() + return true; + } + else if ( tree_node::is_peer_stolen(t) ) { + my_max_depth += __TBB_DEMAND_DEPTH_ADD; + return true; + } + } else if( begin == my_delay ) { + my_delay = pass; + } + return false; + } +}; + +class auto_partition_type: public dynamic_grainsize_mode > { +public: + auto_partition_type( const auto_partitioner& ) { + my_divisor *= __TBB_INITIAL_CHUNKS; + } + auto_partition_type( auto_partition_type& src, split) + : dynamic_grainsize_mode >(src, split()) {} + bool is_divisible() { // part of old should_execute_range() 
+ if( my_divisor > 1 ) return true; + if( my_divisor && my_max_depth ) { // can split the task. TODO: on-stack flag instead + // keep same fragmentation while splitting for the local task pool + my_max_depth--; + my_divisor = 0; // decrease max_depth once per task + return true; + } else return false; + } + template + bool check_for_demand(Task& t) { + if (tree_node::is_peer_stolen(t)) { + my_max_depth += __TBB_DEMAND_DEPTH_ADD; + return true; + } else return false; + } + void spawn_task(task& t, task_group_context& ctx) { + spawn(t, ctx); + } +}; + +class simple_partition_type: public partition_type_base { +public: + simple_partition_type( const simple_partitioner& ) {} + simple_partition_type( const simple_partition_type&, split ) {} + //! simplified algorithm + template + void execute(StartType &start, Range &range, execution_data& ed) { + split_type split_obj = split(); // start.offer_work accepts split_type as reference + while( range.is_divisible() ) + start.offer_work( split_obj, ed ); + start.run_body( range ); + } + void spawn_task(task& t, task_group_context& ctx) { + spawn(t, ctx); + } +}; + +class static_partition_type : public linear_affinity_mode { +public: + typedef detail::proportional_split split_type; + static_partition_type( const static_partitioner& ) {} + static_partition_type( static_partition_type& p, const proportional_split& split_obj ) + : linear_affinity_mode(p, split_obj) {} +}; + +class affinity_partition_type : public dynamic_grainsize_mode > { + static const unsigned factor_power = 4; // TODO: get a unified formula based on number of computing units + slot_id* my_array; +public: + static const unsigned factor = 1 << factor_power; // number of slots in affinity array per task + typedef detail::proportional_split split_type; + affinity_partition_type( affinity_partitioner_base& ap ) { + __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" ); + ap.resize(factor); + my_array = ap.my_array; + my_max_depth = factor_power + 1; + __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, nullptr ); + } + affinity_partition_type(affinity_partition_type& p, split) + : dynamic_grainsize_mode >(p, split()) + , my_array(p.my_array) {} + affinity_partition_type(affinity_partition_type& p, const proportional_split& split_obj) + : dynamic_grainsize_mode >(p, split_obj) + , my_array(p.my_array) {} + void note_affinity(slot_id id) { + if( my_divisor ) + my_array[my_head] = id; + } + void spawn_task(task& t, task_group_context& ctx) { + if (my_divisor) { + if (!my_array[my_head]) { + // TODO: consider new ideas with my_array for both affinity and static partitioner's, then code reuse + spawn(t, ctx, slot_id(my_head / factor)); + } else { + spawn(t, ctx, my_array[my_head]); + } + } else { + spawn(t, ctx); + } + } +}; + +//! A simple partitioner +/** Divides the range until the range is not divisible. 
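The slot array and note_affinity() above are what make affinity_partitioner useful at the user level: reusing one partitioner object across repeated loops over the same data replays the recorded slot assignments, so iterations tend to land on threads whose caches already hold that data. A sketch (sweep is an illustrative name):

#include "third_party/tbb/parallel_for.h"
#include "third_party/tbb/partitioner.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>
#include <vector>

void sweep(std::vector<float>& a, int sweeps) {
    tbb::affinity_partitioner ap;               // must outlive and be reused by every call
    for (int s = 0; s < sweeps; ++s) {
        tbb::parallel_for(
            tbb::blocked_range<std::size_t>(0, a.size()),
            [&](const tbb::blocked_range<std::size_t>& r) {
                for (std::size_t i = r.begin(); i != r.end(); ++i)
                    a[i] = 0.5f * a[i] + 1.0f;  // same elements touched every sweep
            },
            ap);                                // same partitioner object each time
    }
}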
+ @ingroup algorithms */ +class simple_partitioner { +public: + simple_partitioner() {} +private: + template friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef simple_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef simple_partition_type::split_type split_type; + + // for parallel_scan only + class partition_type { + public: + bool should_execute_range(const execution_data& ) {return false;} + partition_type( const simple_partitioner& ) {} + partition_type( const partition_type&, split ) {} + }; +}; + +//! An auto partitioner +/** The range is initial divided into several large chunks. + Chunks are further subdivided into smaller pieces if demand detected and they are divisible. + @ingroup algorithms */ +class auto_partitioner { +public: + auto_partitioner() {} + +private: + template friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef auto_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef auto_partition_type::split_type split_type; + + //! Backward-compatible partition for auto and affinity partition objects. + class partition_type { + size_t num_chunks; + static const size_t VICTIM_CHUNKS = 4; + public: + bool should_execute_range(const execution_data& ed) { + if( num_chunks friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef static_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef static_partition_type::split_type split_type; +}; + +//! An affinity partitioner +class affinity_partitioner : affinity_partitioner_base { +public: + affinity_partitioner() {} + +private: + template friend struct start_for; + template friend struct start_reduce; + template friend struct start_deterministic_reduce; + template friend struct start_scan; + // new implementation just extends existing interface + typedef affinity_partition_type task_partition_type; + // TODO: consider to make split_type public + typedef affinity_partition_type::split_type split_type; +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +// Partitioners +using detail::d1::auto_partitioner; +using detail::d1::simple_partitioner; +using detail::d1::static_partitioner; +using detail::d1::affinity_partitioner; +// Split types +using detail::split; +using detail::proportional_split; +} // namespace v1 + +} // namespace tbb + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + #pragma warning (pop) +#endif // warning 4244 is back + +#undef __TBB_INITIAL_CHUNKS +#undef __TBB_RANGE_POOL_CAPACITY +#undef __TBB_INIT_DEPTH + +#endif /* __TBB_partitioner_H */ diff --git a/third_party/tbb/permit_manager.h b/third_party/tbb/permit_manager.h new file mode 100644 index 000000000..80f32daf4 --- /dev/null +++ b/third_party/tbb/permit_manager.h @@ -0,0 +1,61 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
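The partitioner classes above are what user code actually passes to the algorithms; the choice selects the chunking policy for an otherwise identical loop. In this sketch, simple_partitioner splits down to the range's grainsize, auto_partitioner (the default) adapts chunk sizes at run time, and static_partitioner hands out an even upfront distribution with no further load balancing.

#include "third_party/tbb/parallel_for.h"
#include "third_party/tbb/partitioner.h"
#include "third_party/tbb/blocked_range.h"
#include <cstddef>

void scale(float* a, std::size_t n) {
    auto body = [&](const tbb::blocked_range<std::size_t>& r) {
        for (std::size_t i = r.begin(); i != r.end(); ++i) a[i] *= 2.0f;
    };
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n, /*grainsize=*/1024),
                      body, tbb::simple_partitioner());
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n), body);   // auto_partitioner by default
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n), body, tbb::static_partitioner());
}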
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_permit_manager_H +#define _TBB_permit_manager_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/thread_request_serializer.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class pm_client; + +class permit_manager : no_copy { +public: + virtual ~permit_manager() {} + virtual pm_client* create_client(arena& a) = 0; + virtual void register_client(pm_client* client) = 0; + virtual void unregister_and_destroy_client(pm_client& c) = 0; + + virtual void set_active_num_workers(int soft_limit) = 0; + virtual void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) = 0; + + void set_thread_request_observer(thread_request_observer& tr_observer) { + __TBB_ASSERT(!my_thread_request_observer, "set_thread_request_observer was called already?"); + my_thread_request_observer = &tr_observer; + } +protected: + void notify_thread_request(int delta) { + __TBB_ASSERT(my_thread_request_observer, "set_thread_request_observer was not called?"); + if (delta) { + my_thread_request_observer->update(delta); + } + } +private: + thread_request_observer* my_thread_request_observer{nullptr}; +}; + + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_permit_manager_H diff --git a/third_party/tbb/pm_client.h b/third_party/tbb/pm_client.h new file mode 100644 index 000000000..877e1d2a9 --- /dev/null +++ b/third_party/tbb/pm_client.h @@ -0,0 +1,71 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
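As a reading aid: permit_manager's contract is that a single thread_request_observer is installed once and then receives only nonzero demand deltas. A minimal standalone sketch of that hand-off follows, with hypothetical toy types rather than the real r1::arena / r1::pm_client machinery.

// Hypothetical stand-ins; they mirror only the observer hand-off, not TBB's real types.
class toy_demand_observer {
public:
    virtual ~toy_demand_observer() = default;
    virtual void update(int delta) = 0;   // counterpart of thread_request_observer::update
};

class toy_permit_manager {
    toy_demand_observer* my_observer = nullptr;
public:
    void set_observer(toy_demand_observer& obs) { my_observer = &obs; } // installed exactly once
    void adjust_demand(int workers_delta) {
        // Counterpart of notify_thread_request(): forward only nonzero changes.
        if (workers_delta && my_observer)
            my_observer->update(workers_delta);
    }
};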
+*/ + +#ifndef _TBB_pm_client_H +#define _TBB_pm_client_H + +#include "third_party/tbb/arena.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class pm_client { +public: + pm_client(arena& a) : my_arena(a) {} + + unsigned priority_level() { + return my_arena.priority_level(); + } + + void set_top_priority(bool b) { + my_arena.set_top_priority(b); + } + + int min_workers() const { + return my_min_workers; + } + + int max_workers() const { + return my_max_workers; + } + + int update_request(int mandatory_delta, int workers_delta) { + auto min_max_workers = my_arena.update_request(mandatory_delta, workers_delta); + int delta = min_max_workers.second - my_max_workers; + set_workers(min_max_workers.first, min_max_workers.second); + return delta; + } + +protected: + void set_workers(int mn_w, int mx_w) { + __TBB_ASSERT(mn_w >= 0, nullptr); + __TBB_ASSERT(mx_w >= 0, nullptr); + my_min_workers = mn_w; + my_max_workers = mx_w; + } + + arena& my_arena; + int my_min_workers{0}; + int my_max_workers{0}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_pm_client_H diff --git a/third_party/tbb/private_server.cpp b/third_party/tbb/private_server.cpp new file mode 100644 index 000000000..8b7a758bd --- /dev/null +++ b/third_party/tbb/private_server.cpp @@ -0,0 +1,437 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/mutex.h" + +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/rml_thread_monitor.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" + + +namespace tbb { +namespace detail { +namespace r1 { +namespace rml { + +using rml::internal::thread_monitor; +typedef thread_monitor::handle_type thread_handle; + +class private_server; + +class private_worker: no_copy { +private: + //! State in finite-state machine that controls the worker. + /** State diagram: + init --> starting --> normal + | | | + | V | + \------> quit <------/ + */ + enum state_t { + //! *this is initialized + st_init, + //! *this has associated thread that is starting up. + st_starting, + //! Associated thread is doing normal life sequence. + st_normal, + //! Associated thread has ended normal life sequence and promises to never touch *this again. + st_quit + }; + std::atomic my_state; + + //! Associated server + private_server& my_server; + + //! Associated client + tbb_client& my_client; + + //! index used for avoiding the 64K aliasing problem + const std::size_t my_index; + + //! Monitor for sleeping when there is no work to do. + /** The invariant that holds for sleeping workers is: + "my_slack<=0 && my_state==st_normal && I am on server's list of asleep threads" */ + thread_monitor my_thread_monitor; + + //! Handle of the OS thread associated with this worker + thread_handle my_handle; + + //! 
Link for list of workers that are sleeping or have no associated thread. + private_worker* my_next; + + friend class private_server; + + //! Actions executed by the associated thread + void run() noexcept; + + //! Wake up associated thread (or launch a thread if there is none) + void wake_or_launch(); + + //! Called by a thread (usually not the associated thread) to commence termination. + void start_shutdown(); + + static __RML_DECL_THREAD_ROUTINE thread_routine( void* arg ); + + static void release_handle(thread_handle my_handle, bool join); + +protected: + private_worker( private_server& server, tbb_client& client, const std::size_t i ) : + my_state(st_init), my_server(server), my_client(client), my_index(i), + my_handle(), my_next() + {} +}; + +static const std::size_t cache_line_size = tbb::detail::max_nfs_size; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress overzealous compiler warnings about uninstantiable class + #pragma warning(push) + #pragma warning(disable:4510 4610) +#endif +class padded_private_worker: public private_worker { + char pad[cache_line_size - sizeof(private_worker)%cache_line_size]; +public: + padded_private_worker( private_server& server, tbb_client& client, const std::size_t i ) + : private_worker(server,client,i) { suppress_unused_warning(pad); } +}; +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +class private_server: public tbb_server, no_copy { +private: + tbb_client& my_client; + //! Maximum number of threads to be created. + /** Threads are created lazily, so maximum might not actually be reached. */ + const tbb_client::size_type my_n_thread; + + //! Stack size for each thread. */ + const std::size_t my_stack_size; + + //! Number of jobs that could use their associated thread minus number of active threads. + /** If negative, indicates oversubscription. + If positive, indicates that more threads should run. + Can be lowered asynchronously, but must be raised only while holding my_asleep_list_mutex, + because raising it impacts the invariant for sleeping threads. */ + std::atomic my_slack; + + //! Counter used to determine when to delete this. + std::atomic my_ref_count; + + padded_private_worker* my_thread_array; + + //! List of workers that are asleep or committed to sleeping until notified by another thread. + std::atomic my_asleep_list_root; + + //! Protects my_asleep_list_root + typedef mutex asleep_list_mutex_type; + asleep_list_mutex_type my_asleep_list_mutex; + +#if TBB_USE_ASSERT + std::atomic my_net_slack_requests; +#endif /* TBB_USE_ASSERT */ + + //! Wake up to two sleeping workers, if there are any sleeping. + /** The call is used to propagate a chain reaction where each thread wakes up two threads, + which in turn each wake up two threads, etc. */ + void propagate_chain_reaction() { + // First test of a double-check idiom. Second test is inside wake_some(0). + if( my_asleep_list_root.load(std::memory_order_relaxed) ) + wake_some(0); + } + + //! Try to add t to list of sleeping workers + bool try_insert_in_asleep_list( private_worker& t ); + + //! Equivalent of adding additional_slack to my_slack and waking up to 2 threads if my_slack permits. 
+ void wake_some( int additional_slack ); + + ~private_server() override; + + void remove_server_ref() { + if( --my_ref_count==0 ) { + my_client.acknowledge_close_connection(); + this->~private_server(); + tbb::cache_aligned_allocator().deallocate( this, 1 ); + } + } + + friend class private_worker; +public: + private_server( tbb_client& client ); + + version_type version() const override { + return 0; + } + + void request_close_connection( bool /*exiting*/ ) override { + for( std::size_t i=0; i=2 && !__MINGW64__ +// ensure that stack is properly aligned for TBB threads +__attribute__((force_align_arg_pointer)) +#endif +__RML_DECL_THREAD_ROUTINE private_worker::thread_routine( void* arg ) { + private_worker* self = static_cast(arg); + AVOID_64K_ALIASING( self->my_index ); + self->run(); + // return 0 instead of nullptr due to the difference in the type __RML_DECL_THREAD_ROUTINE on various OSs + return 0; +} +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) +#endif + +void private_worker::release_handle(thread_handle handle, bool join) { + if (join) + thread_monitor::join(handle); + else + thread_monitor::detach_thread(handle); +} + +void private_worker::start_shutdown() { + __TBB_ASSERT(my_state.load(std::memory_order_relaxed) != st_quit, "The quit state is expected to be set only once"); + + // `acq` to acquire my_handle + // `rel` to release market state + state_t prev_state = my_state.exchange(st_quit, std::memory_order_acq_rel); + + if (prev_state == st_init) { + // Perform action that otherwise would be performed by associated thread when it quits. + my_server.remove_server_ref(); + } else { + __TBB_ASSERT(prev_state == st_normal || prev_state == st_starting, nullptr); + // May have invalidated invariant for sleeping, so wake up the thread. + // Note that the notify() here occurs without maintaining invariants for my_slack. + // It does not matter, because my_state==st_quit overrides checking of my_slack. + my_thread_monitor.notify(); + // Do not need release handle in st_init state, + // because in this case the thread wasn't started yet. + // For st_starting release is done at launch site. + if (prev_state == st_normal) + release_handle(my_handle, governor::does_client_join_workers(my_client)); + } +} + +void private_worker::run() noexcept { + my_server.propagate_chain_reaction(); + + // Transiting to st_normal here would require setting my_handle, + // which would create race with the launching thread and + // complications in handle management on Windows. + + ::rml::job& j = *my_client.create_one_job(); + // memory_order_seq_cst to be strictly ordered after thread_monitor::wait on the next iteration + while( my_state.load(std::memory_order_seq_cst)!=st_quit ) { + if( my_server.my_slack.load(std::memory_order_acquire)>=0 ) { + my_client.process(j); + } else if( my_server.try_insert_in_asleep_list(*this) ) { + my_thread_monitor.wait(); + __TBB_ASSERT(my_state.load(std::memory_order_relaxed) == st_quit || !my_next, "Thread monitor missed a spurious wakeup?" 
); + my_server.propagate_chain_reaction(); + } + } + my_client.cleanup(j); + + ++my_server.my_slack; + my_server.remove_server_ref(); +} + +inline void private_worker::wake_or_launch() { + state_t state = my_state.load(std::memory_order_relaxed); + + switch (state) { + case st_starting: + __TBB_fallthrough; + case st_normal: + __TBB_ASSERT(!my_next, "Should not wake a thread while it's still in asleep list"); + my_thread_monitor.notify(); + break; + case st_init: + if (my_state.compare_exchange_strong(state, st_starting)) { + // after this point, remove_server_ref() must be done by created thread +#if __TBB_USE_WINAPI + // Win thread_monitor::launch is designed on the assumption that the workers thread id go from 1 to Hard limit set by TBB market::global_market + const std::size_t worker_idx = my_server.my_n_thread - this->my_index; + my_handle = thread_monitor::launch(thread_routine, this, my_server.my_stack_size, &worker_idx); +#elif __TBB_USE_POSIX + { + affinity_helper fpa; + fpa.protect_affinity_mask( /*restore_process_mask=*/true); + my_handle = thread_monitor::launch(thread_routine, this, my_server.my_stack_size); + // Implicit destruction of fpa resets original affinity mask. + } +#endif /* __TBB_USE_POSIX */ + state = st_starting; + if (!my_state.compare_exchange_strong(state, st_normal)) { + // Do shutdown during startup. my_handle can't be released + // by start_shutdown, because my_handle value might be not set yet + // at time of transition from st_starting to st_quit. + __TBB_ASSERT(state == st_quit, nullptr); + release_handle(my_handle, governor::does_client_join_workers(my_client)); + } + } + break; + default: + __TBB_ASSERT(state == st_quit, nullptr); + } +} + +//------------------------------------------------------------------------ +// Methods of private_server +//------------------------------------------------------------------------ +private_server::private_server( tbb_client& client ) : + my_client(client), + my_n_thread(client.max_job_count()), + my_stack_size(client.min_stack_size()), + my_slack(0), + my_ref_count(my_n_thread+1), + my_thread_array(nullptr), + my_asleep_list_root(nullptr) +#if TBB_USE_ASSERT + , my_net_slack_requests(0) +#endif /* TBB_USE_ASSERT */ +{ + my_thread_array = tbb::cache_aligned_allocator().allocate( my_n_thread ); + for( std::size_t i=0; imy_next = my_asleep_list_root.load(std::memory_order_relaxed); + my_asleep_list_root.store(t, std::memory_order_relaxed); + } +} + +private_server::~private_server() { + __TBB_ASSERT( my_net_slack_requests==0, nullptr); + for( std::size_t i=my_n_thread; i--; ) + my_thread_array[i].~padded_private_worker(); + tbb::cache_aligned_allocator().deallocate( my_thread_array, my_n_thread ); + tbb::detail::poison_pointer( my_thread_array ); +} + +inline bool private_server::try_insert_in_asleep_list( private_worker& t ) { + asleep_list_mutex_type::scoped_lock lock; + if( !lock.try_acquire(my_asleep_list_mutex) ) + return false; + // Contribute to slack under lock so that if another takes that unit of slack, + // it sees us sleeping on the list and wakes us up. 
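    // Reading of the loop that follows: my_slack < 0 means there are more active
    // workers than available work, so this worker may park itself. While that holds,
    // it raises my_slack by one with a CAS (it is about to stop being active), links
    // itself onto the asleep list under the lock, and returns true so run() will block
    // on its monitor. If my_slack is already >= 0, the loop is skipped and false is
    // returned, keeping the worker active.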
+ auto expected = my_slack.load(std::memory_order_relaxed); + while (expected < 0) { + if (my_slack.compare_exchange_strong(expected, expected + 1)) { + t.my_next = my_asleep_list_root.load(std::memory_order_relaxed); + my_asleep_list_root.store(&t, std::memory_order_relaxed); + return true; + } + } + + return false; +} + +void private_server::wake_some( int additional_slack ) { + __TBB_ASSERT( additional_slack>=0, nullptr ); + private_worker* wakee[2]; + private_worker**w = wakee; + + if (additional_slack) { + // Contribute our unused slack to my_slack. + my_slack += additional_slack; + } + + int allotted_slack = 0; + while (allotted_slack < 2) { + // Chain reaction; Try to claim unit of slack + int old = my_slack.load(std::memory_order_relaxed); + do { + if (old <= 0) goto done; + } while (!my_slack.compare_exchange_strong(old, old - 1)); + ++allotted_slack; + } +done: + + if (allotted_slack) { + asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); + auto root = my_asleep_list_root.load(std::memory_order_relaxed); + while( root && wmy_next; + } + my_asleep_list_root.store(root, std::memory_order_relaxed); + if(allotted_slack) { + // Contribute our unused slack to my_slack. + my_slack += allotted_slack; + } + } + while( w>wakee ) { + private_worker* ww = *--w; + ww->my_next = nullptr; + ww->wake_or_launch(); + } +} + +void private_server::adjust_job_count_estimate( int delta ) { +#if TBB_USE_ASSERT + my_net_slack_requests+=delta; +#endif /* TBB_USE_ASSERT */ + if( delta<0 ) { + my_slack+=delta; + } else if( delta>0 ) { + wake_some( delta ); + } +} + +//! Factory method called from task.cpp to create a private_server. +tbb_server* make_private_server( tbb_client& client ) { + return new( tbb::cache_aligned_allocator().allocate(1) ) private_server(client); +} + +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/profiling.cpp b/third_party/tbb/profiling.cpp new file mode 100644 index 000000000..21ed67b53 --- /dev/null +++ b/third_party/tbb/profiling.cpp @@ -0,0 +1,268 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
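The chain-reaction wakeup in private_server.cpp above hinges on claiming units of my_slack one at a time; here is a standalone sketch of that claim idiom (hypothetical helper name, none of the RML bookkeeping).

#include "third_party/libcxx/atomic"

// Succeeds only while the counter is positive, atomically taking one unit;
// mirrors the "Try to claim unit of slack" loop inside wake_some().
inline bool try_claim_one(std::atomic<int>& slack) {
    int old = slack.load(std::memory_order_relaxed);
    do {
        if (old <= 0)
            return false;              // nothing to claim
    } while (!slack.compare_exchange_strong(old, old - 1));
    return true;                       // caller may now wake one sleeping worker
}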
+*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/tbb/main.h" +#include "third_party/tbb/itt_notify.h" + +#include "third_party/tbb/profiling.h" + +#include "libc/mem/alg.h" +#include "libc/mem/mem.h" +#include "libc/str/str.h" + +namespace tbb { +namespace detail { +namespace r1 { + +#if __TBB_USE_ITT_NOTIFY +bool ITT_Present; +static std::atomic ITT_InitializationDone; + +static __itt_domain *tbb_domains[d1::ITT_NUM_DOMAINS] = {}; + +struct resource_string { + const char *str; + __itt_string_handle *itt_str_handle; +}; + +// +// populate resource strings +// +#define TBB_STRING_RESOURCE( index_name, str ) { str, nullptr }, +static resource_string strings_for_itt[] = { + #include "third_party/tbb/detail/_string_resource.h" + { "num_resource_strings", nullptr } +}; +#undef TBB_STRING_RESOURCE + +static __itt_string_handle* ITT_get_string_handle(std::uintptr_t idx) { + __TBB_ASSERT(idx < NUM_STRINGS, "string handle out of valid range"); + return idx < NUM_STRINGS ? strings_for_itt[idx].itt_str_handle : nullptr; +} + +static void ITT_init_domains() { + tbb_domains[d1::ITT_DOMAIN_MAIN] = __itt_domain_create( _T("tbb") ); + tbb_domains[d1::ITT_DOMAIN_MAIN]->flags = 1; + tbb_domains[d1::ITT_DOMAIN_FLOW] = __itt_domain_create( _T("tbb.flow") ); + tbb_domains[d1::ITT_DOMAIN_FLOW]->flags = 1; + tbb_domains[d1::ITT_DOMAIN_ALGO] = __itt_domain_create( _T("tbb.algorithm") ); + tbb_domains[d1::ITT_DOMAIN_ALGO]->flags = 1; +} + +static void ITT_init_strings() { + for ( std::uintptr_t i = 0; i < NUM_STRINGS; ++i ) { +#if _WIN32||_WIN64 + strings_for_itt[i].itt_str_handle = __itt_string_handle_createA( strings_for_itt[i].str ); +#else + strings_for_itt[i].itt_str_handle = __itt_string_handle_create( strings_for_itt[i].str ); +#endif + } +} + +static void ITT_init() { + ITT_init_domains(); + ITT_init_strings(); +} + +/** Thread-unsafe lazy one-time initialization of tools interop. + Used by both dummy handlers and general TBB one-time initialization routine. **/ +void ITT_DoUnsafeOneTimeInitialization () { + // Double check ITT_InitializationDone is necessary because the first check + // in ITT_DoOneTimeInitialization is not guarded with the __TBB_InitOnce lock. + if ( !ITT_InitializationDone ) { + ITT_Present = (__TBB_load_ittnotify()!=0); + if (ITT_Present) ITT_init(); + ITT_InitializationDone = true; + } +} + +/** Thread-safe lazy one-time initialization of tools interop. + Used by dummy handlers only. 
**/ +extern "C" +void ITT_DoOneTimeInitialization() { + if ( !ITT_InitializationDone ) { + __TBB_InitOnce::lock(); + ITT_DoUnsafeOneTimeInitialization(); + __TBB_InitOnce::unlock(); + } +} + +void create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname) { + ITT_SYNC_CREATE(ptr, objtype, objname); +} + +void call_itt_notify(int t, void *ptr) { + switch (t) { + case 0: ITT_NOTIFY(sync_prepare, ptr); break; + case 1: ITT_NOTIFY(sync_cancel, ptr); break; + case 2: ITT_NOTIFY(sync_acquired, ptr); break; + case 3: ITT_NOTIFY(sync_releasing, ptr); break; + case 4: ITT_NOTIFY(sync_destroy, ptr); break; + } +} + +void itt_set_sync_name(void* obj, const tchar* name) { + __itt_sync_rename(obj, name); +} + +const __itt_id itt_null_id = { 0, 0, 0 }; + +static inline __itt_domain* get_itt_domain(d1::itt_domain_enum idx) { + if (tbb_domains[idx] == nullptr) { + ITT_DoOneTimeInitialization(); + } + return tbb_domains[idx]; +} + +static inline void itt_id_make(__itt_id* id, void* addr, unsigned long long extra) { + *id = __itt_id_make(addr, extra); +} + +static inline void itt_id_create(const __itt_domain* domain, __itt_id id) { + __itt_id_create(domain, id); +} + +void itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index) { + if (__itt_domain* d = get_itt_domain(domain)) { + __itt_id group_id = itt_null_id; + __itt_id parent_id = itt_null_id; + itt_id_make(&group_id, group, group_extra); + itt_id_create(d, group_id); + if (parent) { + itt_id_make(&parent_id, parent, parent_extra); + } + __itt_string_handle* n = ITT_get_string_handle(name_index); + __itt_task_group(d, group_id, parent_id, n); + } +} + +void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, const char *value ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id id = itt_null_id; + itt_id_make( &id, addr, addr_extra ); + __itt_string_handle *k = ITT_get_string_handle(key); + size_t value_length = strlen( value ); +#if _WIN32||_WIN64 + __itt_metadata_str_addA(d, id, k, value, value_length); +#else + __itt_metadata_str_add(d, id, k, value, value_length); +#endif + } +} + +void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, void *value ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id id = itt_null_id; + itt_id_make( &id, addr, addr_extra ); + __itt_string_handle *k = ITT_get_string_handle(key); +#if __TBB_x86_32 + __itt_metadata_add(d, id, k, __itt_metadata_u32, 1, value); +#else + __itt_metadata_add(d, id, k, __itt_metadata_u64, 1, value); +#endif + } +} + +void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void *addr0, unsigned long long addr0_extra, + itt_relation relation, void *addr1, unsigned long long addr1_extra ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id id0 = itt_null_id; + __itt_id id1 = itt_null_id; + itt_id_make( &id0, addr0, addr0_extra ); + itt_id_make( &id1, addr1, addr1_extra ); + __itt_relation_add( d, id0, (__itt_relation)relation, id1 ); + } +} + +void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index) { + if (__itt_domain* d = get_itt_domain(domain)) { + __itt_id task_id = itt_null_id; + __itt_id 
parent_id = itt_null_id; + if (task) { + itt_id_make(&task_id, task, task_extra); + } + if (parent) { + itt_id_make(&parent_id, parent, parent_extra); + } + __itt_string_handle* n = ITT_get_string_handle(name_index); + __itt_task_begin(d, task_id, parent_id, n); + } +} + +void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain) { + if (__itt_domain* d = get_itt_domain(domain)) { + __itt_task_end(d); + } +} + +void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void *region, unsigned long long region_extra, + void *parent, unsigned long long parent_extra, string_resource_index /* name_index */ ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id region_id = itt_null_id; + __itt_id parent_id = itt_null_id; + itt_id_make( ®ion_id, region, region_extra ); + if ( parent ) { + itt_id_make( &parent_id, parent, parent_extra ); + } + __itt_region_begin( d, region_id, parent_id, nullptr ); + } +} + +void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void *region, unsigned long long region_extra ) { + if ( __itt_domain *d = get_itt_domain( domain ) ) { + __itt_id region_id = itt_null_id; + itt_id_make( ®ion_id, region, region_extra ); + __itt_region_end( d, region_id ); + } +} + +#else +void create_itt_sync(void* /*ptr*/, const tchar* /*objtype*/, const tchar* /*objname*/) {} +void call_itt_notify(int /*t*/, void* /*ptr*/) {} +void itt_set_sync_name(void* /*obj*/, const tchar* /*name*/) {} +void itt_make_task_group(d1::itt_domain_enum /*domain*/, void* /*group*/, unsigned long long /*group_extra*/, + void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/) {} +void itt_metadata_str_add(d1::itt_domain_enum /*domain*/, void* /*addr*/, unsigned long long /*addr_extra*/, + string_resource_index /*key*/, const char* /*value*/ ) { } +void itt_metadata_ptr_add(d1::itt_domain_enum /*domain*/, void * /*addr*/, unsigned long long /*addr_extra*/, + string_resource_index /*key*/, void * /*value*/ ) {} +void itt_relation_add(d1::itt_domain_enum /*domain*/, void* /*addr0*/, unsigned long long /*addr0_extra*/, + itt_relation /*relation*/, void* /*addr1*/, unsigned long long /*addr1_extra*/ ) { } +void itt_task_begin(d1::itt_domain_enum /*domain*/, void* /*task*/, unsigned long long /*task_extra*/, + void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { } +void itt_task_end(d1::itt_domain_enum /*domain*/ ) { } +void itt_region_begin(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/, + void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { } +void itt_region_end(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/ ) { } +#endif /* __TBB_USE_ITT_NOTIFY */ + +const tchar + *SyncType_Scheduler = _T("%Constant") + ; +const tchar + *SyncObj_ContextsList = _T("TBB Scheduler") + ; +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/profiling.h b/third_party/tbb/profiling.h new file mode 100644 index 000000000..707df8ce4 --- /dev/null +++ b/third_party/tbb/profiling.h @@ -0,0 +1,259 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
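profiling.cpp above and profiling.h below expand the same TBB_STRING_RESOURCE list (detail/_string_resource.h) twice: once into the strings_for_itt[] table and once into the string_resource_index enum. A minimal standalone illustration of that X-macro idea, using a hypothetical color list and a list-macro instead of TBB's re-included header:

// One entry per item; each use site supplies its own expansion of X.
#define MY_COLOR_LIST(X) \
    X(COLOR_RED,   "red")   \
    X(COLOR_GREEN, "green") \
    X(COLOR_BLUE,  "blue")

// Expansion 1: an enum of indices (cf. string_resource_index).
#define AS_ENUM(name, str) name,
enum color_index { MY_COLOR_LIST(AS_ENUM) NUM_COLORS };
#undef AS_ENUM

// Expansion 2: a parallel string table indexed by the enum (cf. strings_for_itt[]).
#define AS_STRING(name, str) str,
static const char* color_names[] = { MY_COLOR_LIST(AS_STRING) "num_colors" };
#undef AS_STRING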
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_profiling_H +#define __TBB_profiling_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/libcxx/cstdint" + +#include "third_party/libcxx/string" + +namespace tbb { +namespace detail { +inline namespace d0 { + // include list of index names + #define TBB_STRING_RESOURCE(index_name,str) index_name, + enum string_resource_index : std::uintptr_t { + #include "third_party/tbb/detail/_string_resource.h" + NUM_STRINGS + }; + #undef TBB_STRING_RESOURCE + + enum itt_relation + { + __itt_relation_is_unknown = 0, + __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */ + __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */ + __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */ + __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */ + __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */ + __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */ + __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */ + }; + +//! Unicode support +#if (_WIN32||_WIN64) + //! Unicode character type. Always wchar_t on Windows. 
+ using tchar = wchar_t; +#else /* !WIN */ + using tchar = char; +#endif /* !WIN */ + +} // namespace d0 +} // namespace detail +} // namespace tbb + +#include "third_party/libcxx/atomic" +#if _WIN32||_WIN64 +#include "libc/calls/calls.h" +#include "libc/calls/termios.h" +#include "libc/fmt/conv.h" +#include "libc/limits.h" +#include "libc/mem/alg.h" +#include "libc/mem/alloca.h" +#include "libc/mem/mem.h" +#include "libc/runtime/runtime.h" +#include "libc/stdio/dprintf.h" +#include "libc/stdio/rand.h" +#include "libc/stdio/temp.h" +#include "libc/str/str.h" +#include "libc/sysv/consts/exit.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/rand48.h" /* mbstowcs_s */ +#endif +// Need these to work regardless of tools support +namespace tbb { +namespace detail { +namespace d1 { + enum notify_type {prepare=0, cancel, acquired, releasing, destroy}; + enum itt_domain_enum { ITT_DOMAIN_FLOW=0, ITT_DOMAIN_MAIN=1, ITT_DOMAIN_ALGO=2, ITT_NUM_DOMAINS }; +} // namespace d1 + +namespace r1 { + TBB_EXPORT void __TBB_EXPORTED_FUNC call_itt_notify(int t, void* ptr); + TBB_EXPORT void __TBB_EXPORTED_FUNC create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_set_sync_name(void* obj, const tchar* name); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra, + string_resource_index key, const char* value); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra, + string_resource_index key, void* value); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void* addr0, unsigned long long addr0_extra, + itt_relation relation, void* addr1, unsigned long long addr1_extra); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void* region, unsigned long long region_extra, + void* parent, unsigned long long parent_extra, string_resource_index /* name_index */); + TBB_EXPORT void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void* region, unsigned long long region_extra); +} // namespace r1 + +namespace d1 { +#if TBB_USE_PROFILING_TOOLS && (_WIN32||_WIN64) + inline std::size_t multibyte_to_widechar(wchar_t* wcs, const char* mbs, std::size_t bufsize) { + std::size_t len; + mbstowcs_s(&len, wcs, bufsize, mbs, _TRUNCATE); + return len; // mbstowcs_s counts null terminator + } +#endif + +#if TBB_USE_PROFILING_TOOLS + inline void create_itt_sync(void *ptr, const char *objtype, const char *objname) { +#if (_WIN32||_WIN64) + std::size_t len_type = multibyte_to_widechar(nullptr, objtype, 0); + wchar_t *type = new wchar_t[len_type]; + multibyte_to_widechar(type, objtype, len_type); + std::size_t len_name = multibyte_to_widechar(nullptr, objname, 0); + wchar_t *name = new wchar_t[len_name]; + multibyte_to_widechar(name, objname, len_name); +#else // WIN + const char *type = objtype; + const 
char *name = objname; +#endif + r1::create_itt_sync(ptr, type, name); + +#if (_WIN32||_WIN64) + delete[] type; + delete[] name; +#endif // WIN + } + +// Distinguish notifications on task for reducing overheads +#if TBB_USE_PROFILING_TOOLS == 2 + inline void call_itt_task_notify(d1::notify_type t, void *ptr) { + r1::call_itt_notify(static_cast(t), ptr); + } +#else + inline void call_itt_task_notify(d1::notify_type, void *) {} +#endif // TBB_USE_PROFILING_TOOLS + + inline void call_itt_notify(d1::notify_type t, void *ptr) { + r1::call_itt_notify(static_cast(t), ptr); + } + +#if (_WIN32||_WIN64) && !__MINGW32__ + inline void itt_set_sync_name(void* obj, const wchar_t* name) { + r1::itt_set_sync_name(obj, name); + } + inline void itt_set_sync_name(void* obj, const char* name) { + std::size_t len_name = multibyte_to_widechar(nullptr, name, 0); + wchar_t *obj_name = new wchar_t[len_name]; + multibyte_to_widechar(obj_name, name, len_name); + r1::itt_set_sync_name(obj, obj_name); + delete[] obj_name; + } +#else + inline void itt_set_sync_name( void* obj, const char* name) { + r1::itt_set_sync_name(obj, name); + } +#endif //WIN + + inline void itt_make_task_group(itt_domain_enum domain, void* group, unsigned long long group_extra, + void* parent, unsigned long long parent_extra, string_resource_index name_index) { + r1::itt_make_task_group(domain, group, group_extra, parent, parent_extra, name_index); + } + + inline void itt_metadata_str_add( itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, const char *value ) { + r1::itt_metadata_str_add( domain, addr, addr_extra, key, value ); + } + + inline void register_node_addr(itt_domain_enum domain, void *addr, unsigned long long addr_extra, + string_resource_index key, void *value) { + r1::itt_metadata_ptr_add(domain, addr, addr_extra, key, value); + } + + inline void itt_relation_add( itt_domain_enum domain, void *addr0, unsigned long long addr0_extra, + itt_relation relation, void *addr1, unsigned long long addr1_extra ) { + r1::itt_relation_add( domain, addr0, addr0_extra, relation, addr1, addr1_extra ); + } + + inline void itt_task_begin( itt_domain_enum domain, void *task, unsigned long long task_extra, + void *parent, unsigned long long parent_extra, string_resource_index name_index ) { + r1::itt_task_begin( domain, task, task_extra, parent, parent_extra, name_index ); + } + + inline void itt_task_end( itt_domain_enum domain ) { + r1::itt_task_end( domain ); + } + + inline void itt_region_begin( itt_domain_enum domain, void *region, unsigned long long region_extra, + void *parent, unsigned long long parent_extra, string_resource_index name_index ) { + r1::itt_region_begin( domain, region, region_extra, parent, parent_extra, name_index ); + } + + inline void itt_region_end( itt_domain_enum domain, void *region, unsigned long long region_extra ) { + r1::itt_region_end( domain, region, region_extra ); + } +#else + inline void create_itt_sync(void* /*ptr*/, const char* /*objtype*/, const char* /*objname*/) {} + + inline void call_itt_notify(notify_type /*t*/, void* /*ptr*/) {} + + inline void call_itt_task_notify(notify_type /*t*/, void* /*ptr*/) {} +#endif // TBB_USE_PROFILING_TOOLS + +#if TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) +class event { +/** This class supports user event traces through itt. + Common use-case is tagging data flow graph tasks (data-id) + and visualization by Intel Advisor Flow Graph Analyzer (FGA) **/ +// TODO: Replace implementation by itt user event api. 
+ + const std::string my_name; + + static void emit_trace(const std::string &input) { + itt_metadata_str_add( ITT_DOMAIN_FLOW, nullptr, FLOW_NULL, USER_EVENT, ( "FGA::DATAID::" + input ).c_str() ); + } + +public: + event(const std::string &input) + : my_name( input ) + { } + + void emit() { + emit_trace(my_name); + } + + static void emit(const std::string &description) { + emit_trace(description); + } + +}; +#else // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) +// Using empty struct if user event tracing is disabled: +struct event { + event(const std::string &) { } + + void emit() { } + + static void emit(const std::string &) { } +}; +#endif // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) +} // namespace d1 +} // namespace detail + +namespace profiling { + using detail::d1::event; +} +} // namespace tbb + + +#endif /* __TBB_profiling_H */ diff --git a/third_party/tbb/queuing_mutex.h b/third_party/tbb/queuing_mutex.h new file mode 100644 index 000000000..0636b667e --- /dev/null +++ b/third_party/tbb/queuing_mutex.h @@ -0,0 +1,193 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_queuing_mutex_H +#define __TBB_queuing_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +//! Queuing mutex with local-only spinning. +/** @ingroup synchronization */ +class queuing_mutex { +public: + //! Construct unacquired mutex. + queuing_mutex() noexcept { + create_itt_sync(this, "tbb::queuing_mutex", ""); + }; + + queuing_mutex(const queuing_mutex&) = delete; + queuing_mutex& operator=(const queuing_mutex&) = delete; + + //! The scoped locking pattern + /** It helps to avoid the common problem of forgetting to release lock. + It also nicely provides the "node" for queuing locks. */ + class scoped_lock { + //! Reset fields to mean "no lock held". + void reset() { + m_mutex = nullptr; + } + + public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + scoped_lock() = default; + + //! Acquire lock on given mutex. + scoped_lock(queuing_mutex& m) { + acquire(m); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if (m_mutex) release(); + } + + //! No Copy + scoped_lock( const scoped_lock& ) = delete; + scoped_lock& operator=( const scoped_lock& ) = delete; + + //! Acquire lock on given mutex. + void acquire( queuing_mutex& m ) { + __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex"); + + // Must set all fields before the exchange, because once the + // exchange executes, *this becomes accessible to other threads. 
+ m_mutex = &m; + m_next.store(nullptr, std::memory_order_relaxed); + m_going.store(0U, std::memory_order_relaxed); + + // x86 compare exchange operation always has a strong fence + // "sending" the fields initialized above to other processors. + scoped_lock* pred = m.q_tail.exchange(this); + if (pred) { + call_itt_notify(prepare, &m); + __TBB_ASSERT(pred->m_next.load(std::memory_order_relaxed) == nullptr, "the predecessor has another successor!"); + + pred->m_next.store(this, std::memory_order_release); + spin_wait_while_eq(m_going, 0U); + } + call_itt_notify(acquired, &m); + + } + + //! Acquire lock on given mutex if free (i.e. non-blocking) + bool try_acquire( queuing_mutex& m ) { + __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex"); + + // Must set all fields before the compare_exchange_strong, because once the + // compare_exchange_strong executes, *this becomes accessible to other threads. + m_next.store(nullptr, std::memory_order_relaxed); + m_going.store(0U, std::memory_order_relaxed); + + scoped_lock* expected = nullptr; + // The compare_exchange_strong must have release semantics, because we are + // "sending" the fields initialized above to other processors. + // x86 compare exchange operation always has a strong fence + if (!m.q_tail.compare_exchange_strong(expected, this, std::memory_order_acq_rel)) + return false; + + m_mutex = &m; + + call_itt_notify(acquired, &m); + return true; + } + + //! Release lock. + void release() + { + __TBB_ASSERT(this->m_mutex, "no lock acquired"); + + call_itt_notify(releasing, this->m_mutex); + + if (m_next.load(std::memory_order_relaxed) == nullptr) { + scoped_lock* expected = this; + if (m_mutex->q_tail.compare_exchange_strong(expected, nullptr)) { + // this was the only item in the queue, and the queue is now empty. + reset(); + return; + } + // Someone in the queue + spin_wait_while_eq(m_next, nullptr); + } + m_next.load(std::memory_order_acquire)->m_going.store(1U, std::memory_order_release); + + reset(); + } + + private: + //! The pointer to the mutex owned, or nullptr if not holding a mutex. + queuing_mutex* m_mutex{nullptr}; + + //! The pointer to the next competitor for a mutex + std::atomic m_next{nullptr}; + + //! The local spin-wait variable + /** Inverted (0 - blocked, 1 - acquired the mutex) for the sake of + zero-initialization. Defining it as an entire word instead of + a byte seems to help performance slightly. */ + std::atomic m_going{0U}; + }; + + // Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = true; + +private: + //! 
The last competitor requesting the lock + std::atomic q_tail{nullptr}; + +}; + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(queuing_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(queuing_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif //WIN +#else +inline void set_name(queuing_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(queuing_mutex&, const wchar_t*) {} +#endif //WIN +#endif +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::queuing_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#endif /* __TBB_queuing_mutex_H */ diff --git a/third_party/tbb/queuing_rw_mutex.cpp b/third_party/tbb/queuing_rw_mutex.cpp new file mode 100644 index 000000000..675484a40 --- /dev/null +++ b/third_party/tbb/queuing_rw_mutex.cpp @@ -0,0 +1,618 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/** Before making any changes in the implementation, please emulate algorithmic changes + with SPIN tool using /tools/spin_models/ReaderWriterMutex.pml. + There could be some code looking as "can be restructured" but its structure does matter! */ + +#include "third_party/tbb/queuing_rw_mutex.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/itt_notify.h" + +namespace tbb { +namespace detail { +namespace r1 { + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + #pragma warning (push) + #pragma warning (disable: 4311 4312) +#endif + +//! A view of a T* with additional functionality for twiddling low-order bits. 
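A usage sketch for the queuing_mutex defined above; the struct, counter, and name string are illustrative. Each scoped_lock doubles as the caller's queue node, so acquisition is FIFO and every waiter spins only on its own flag.

#include "third_party/tbb/queuing_mutex.h"

struct hit_counter {
    tbb::queuing_mutex mtx;
    long hits = 0;
    hit_counter() { tbb::profiling::set_name(mtx, "hit_counter::mtx"); } // label shown in ITT traces
    void bump() {
        tbb::queuing_mutex::scoped_lock lock(mtx);   // enqueue and spin on our own node
        ++hits;
    }                                                // released when lock leaves scope
    bool try_bump() {
        tbb::queuing_mutex::scoped_lock lock;
        if (!lock.try_acquire(mtx))
            return false;                            // contended: do not block
        ++hits;
        return true;
    }
};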
+template +class tricky_atomic_pointer { +public: + using word = uintptr_t; + + static T* fetch_add( std::atomic& location, word addend, std::memory_order memory_order ) { + return reinterpret_cast(location.fetch_add(addend, memory_order)); + } + + static T* exchange( std::atomic& location, T* value, std::memory_order memory_order ) { + return reinterpret_cast(location.exchange(reinterpret_cast(value), memory_order)); + } + + static T* compare_exchange_strong( std::atomic& obj, const T* expected, const T* desired, std::memory_order memory_order ) { + word expd = reinterpret_cast(expected); + obj.compare_exchange_strong(expd, reinterpret_cast(desired), memory_order); + return reinterpret_cast(expd); + } + + static void store( std::atomic& location, const T* value, std::memory_order memory_order ) { + location.store(reinterpret_cast(value), memory_order); + } + + static T* load( std::atomic& location, std::memory_order memory_order ) { + return reinterpret_cast(location.load(memory_order)); + } + + static void spin_wait_while_eq(const std::atomic& location, const T* value) { + tbb::detail::d0::spin_wait_while_eq(location, reinterpret_cast(value) ); + } + + T* & ref; + tricky_atomic_pointer( T*& original ) : ref(original) {}; + tricky_atomic_pointer(const tricky_atomic_pointer&) = delete; + tricky_atomic_pointer& operator=(const tricky_atomic_pointer&) = delete; + T* operator&( const word operand2 ) const { + return reinterpret_cast( reinterpret_cast(ref) & operand2 ); + } + T* operator|( const word operand2 ) const { + return reinterpret_cast( reinterpret_cast(ref) | operand2 ); + } +}; + +using tricky_pointer = tricky_atomic_pointer; + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + #pragma warning (pop) +#endif + +//! Flag bits in a state_t that specify information about a locking request. +enum state_t_flags : unsigned char { + STATE_NONE = 0, + STATE_WRITER = 1<<0, + STATE_READER = 1<<1, + STATE_READER_UNBLOCKNEXT = 1<<2, + STATE_ACTIVEREADER = 1<<3, + STATE_UPGRADE_REQUESTED = 1<<4, + STATE_UPGRADE_WAITING = 1<<5, + STATE_UPGRADE_LOSER = 1<<6, + STATE_COMBINED_WAITINGREADER = STATE_READER | STATE_READER_UNBLOCKNEXT, + STATE_COMBINED_READER = STATE_COMBINED_WAITINGREADER | STATE_ACTIVEREADER, + STATE_COMBINED_UPGRADING = STATE_UPGRADE_WAITING | STATE_UPGRADE_LOSER +}; + +static const unsigned char RELEASED = 0; +static const unsigned char ACQUIRED = 1; + +struct queuing_rw_mutex_impl { + //! Try to acquire the internal lock + /** Returns true if lock was successfully acquired. */ + static bool try_acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + auto expected = RELEASED; + return s.my_internal_lock.compare_exchange_strong(expected, ACQUIRED); + } + + //! Acquire the internal lock + static void acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + // Usually, we would use the test-test-and-set idiom here, with exponential backoff. + // But so far, experiments indicate there is no value in doing so here. + while( !try_acquire_internal_lock(s) ) { + machine_pause(1); + } + } + + //! Release the internal lock + static void release_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + s.my_internal_lock.store(RELEASED, std::memory_order_release); + } + + //! Wait for internal lock to be released + static void wait_for_release_of_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) + { + spin_wait_until_eq(s.my_internal_lock, RELEASED); + } + + //! 
A helper function + static void unblock_or_wait_on_internal_lock(d1::queuing_rw_mutex::scoped_lock& s, uintptr_t flag ) { + if( flag ) { + wait_for_release_of_internal_lock(s); + } + else { + release_internal_lock(s); + } + } + + //! Mask for low order bit of a pointer. + static const tricky_pointer::word FLAG = 0x1; + + static uintptr_t get_flag( d1::queuing_rw_mutex::scoped_lock* ptr ) { + return reinterpret_cast(ptr) & FLAG; + } + + //------------------------------------------------------------------------ + // Methods of queuing_rw_mutex::scoped_lock + //------------------------------------------------------------------------ + + //! A method to acquire queuing_rw_mutex lock + static void acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) + { + __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex"); + + // Must set all fields before the exchange, because once the + // exchange executes, *this becomes accessible to other threads. + s.my_mutex = &m; + s.my_prev.store(0U, std::memory_order_relaxed); + s.my_next.store(0U, std::memory_order_relaxed); + s.my_going.store(0U, std::memory_order_relaxed); + s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_READER), std::memory_order_relaxed); + s.my_internal_lock.store(RELEASED, std::memory_order_relaxed); + + + // The CAS must have release semantics, because we are + // "sending" the fields initialized above to other actors. + // We need acquire semantics, because we are acquiring the predecessor (or mutex if no predecessor) + queuing_rw_mutex::scoped_lock* predecessor = m.q_tail.exchange(&s, std::memory_order_acq_rel); + + if( write ) { // Acquiring for write + + if( predecessor ) { + ITT_NOTIFY(sync_prepare, s.my_mutex); + predecessor = tricky_pointer(predecessor) & ~FLAG; + __TBB_ASSERT( !predecessor->my_next, "the predecessor has another successor!"); + tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release); + // We are acquiring the mutex + spin_wait_until_eq(s.my_going, 1U, std::memory_order_acquire); + } + + } else { // Acquiring for read + #if __TBB_USE_ITT_NOTIFY + bool sync_prepare_done = false; + #endif + if( predecessor ) { + unsigned char pred_state{}; + __TBB_ASSERT( !s.my_prev.load(std::memory_order_relaxed), "the predecessor is already set" ); + if( tricky_pointer(predecessor) & FLAG ) { + /* this is only possible if predecessor is an upgrading reader and it signals us to wait */ + pred_state = STATE_UPGRADE_WAITING; + predecessor = tricky_pointer(predecessor) & ~FLAG; + } else { + // Load predecessor->my_state now, because once predecessor->my_next becomes + // non-null, we must assume that *predecessor might be destroyed. + pred_state = predecessor->my_state.load(std::memory_order_relaxed); + if (pred_state == STATE_READER) { + // Notify the previous reader to unblock us. + predecessor->my_state.compare_exchange_strong(pred_state, STATE_READER_UNBLOCKNEXT, std::memory_order_relaxed); + } + if (pred_state == STATE_ACTIVEREADER) { // either we initially read it or CAS failed + // Active reader means that the predecessor already acquired the mutex and cannot notify us. + // Therefore, we need to acquire the mutex ourselves by re-reading predecessor state. + (void)predecessor->my_state.load(std::memory_order_acquire); + } + } + tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); + __TBB_ASSERT( !( tricky_pointer(predecessor) & FLAG ), "use of corrupted pointer!" 
); + __TBB_ASSERT( !predecessor->my_next.load(std::memory_order_relaxed), "the predecessor has another successor!"); + tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release); + if( pred_state != STATE_ACTIVEREADER ) { + #if __TBB_USE_ITT_NOTIFY + sync_prepare_done = true; + ITT_NOTIFY(sync_prepare, s.my_mutex); + #endif + // We are acquiring the mutex + spin_wait_until_eq(s.my_going, 1U, std::memory_order_acquire); + } + } + + // The protected state must have been acquired here before it can be further released to any other reader(s): + unsigned char old_state = STATE_READER; + // When this reader is signaled by previous actor it acquires the mutex. + // We need to build happens-before relation with all other coming readers that will read our ACTIVEREADER + // without blocking on my_going. Therefore, we need to publish ACTIVEREADER with release semantics. + // On fail it is relaxed, because we will build happens-before on my_going. + s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_release, std::memory_order_relaxed); + if( old_state!=STATE_READER ) { +#if __TBB_USE_ITT_NOTIFY + if( !sync_prepare_done ) + ITT_NOTIFY(sync_prepare, s.my_mutex); +#endif + // Failed to become active reader -> need to unblock the next waiting reader first + __TBB_ASSERT( s.my_state.load(std::memory_order_relaxed)==STATE_READER_UNBLOCKNEXT, "unexpected state" ); + spin_wait_while_eq(s.my_next, 0U, std::memory_order_acquire); + /* my_state should be changed before unblocking the next otherwise it might finish + and another thread can get our old state and left blocked */ + s.my_state.store(STATE_ACTIVEREADER, std::memory_order_relaxed); + tricky_pointer::load(s.my_next, std::memory_order_relaxed)->my_going.store(1U, std::memory_order_release); + } + __TBB_ASSERT(s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER, "unlocked reader is active reader"); + } + + ITT_NOTIFY(sync_acquired, s.my_mutex); + } + + //! A method to acquire queuing_rw_mutex if it is free + static bool try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) + { + __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex"); + + if( m.q_tail.load(std::memory_order_relaxed) ) + return false; // Someone already took the lock + + // Must set all fields before the exchange, because once the + // exchange executes, *this becomes accessible to other threads. + s.my_prev.store(0U, std::memory_order_relaxed); + s.my_next.store(0U, std::memory_order_relaxed); + s.my_going.store(0U, std::memory_order_relaxed); // TODO: remove dead assignment? + s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_ACTIVEREADER), std::memory_order_relaxed); + s.my_internal_lock.store(RELEASED, std::memory_order_relaxed); + + // The CAS must have release semantics, because we are + // "sending" the fields initialized above to other actors. + // We need acquire semantics, because we are acquiring the mutex + d1::queuing_rw_mutex::scoped_lock* expected = nullptr; + if (!m.q_tail.compare_exchange_strong(expected, &s, std::memory_order_acq_rel)) + return false; // Someone already took the lock + s.my_mutex = &m; + ITT_NOTIFY(sync_acquired, s.my_mutex); + return true; + } + + //! 
A method to release queuing_rw_mutex lock + static void release(d1::queuing_rw_mutex::scoped_lock& s) { + __TBB_ASSERT(s.my_mutex!=nullptr, "no lock acquired"); + + ITT_NOTIFY(sync_releasing, s.my_mutex); + + if( s.my_state.load(std::memory_order_relaxed) == STATE_WRITER ) { // Acquired for write + + // The logic below is the same as "writerUnlock", but elides + // "return" from the middle of the routine. + // In the statement below, acquire semantics of reading my_next is required + // so that following operations with fields of my_next are safe. + d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + if( !next ) { + d1::queuing_rw_mutex::scoped_lock* expected = &s; + // Release mutex on success otherwise wait for successor publication + if( s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, + std::memory_order_release, std::memory_order_relaxed) ) + { + // this was the only item in the queue, and the queue is now empty. + goto done; + } + spin_wait_while_eq(s.my_next, 0U, std::memory_order_relaxed); + next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + } + next->my_going.store(2U, std::memory_order_relaxed); // protect next queue node from being destroyed too early + // If the next is STATE_UPGRADE_WAITING, it is expected to acquire all other released readers via release + // sequence in next->my_state. In that case, we need to preserve release sequence in next->my_state + // contributed by other reader. So, there are two approaches not to break the release sequence: + // 1. Use read-modify-write (exchange) operation to store with release the UPGRADE_LOSER state; + // 2. Acquire the release sequence and store the sequence and UPGRADE_LOSER state. + // The second approach seems better on x86 because it does not involve interlocked operations. + // Therefore, we read next->my_state with acquire while it is not required for else branch to get the + // release sequence. + if( next->my_state.load(std::memory_order_acquire)==STATE_UPGRADE_WAITING ) { + // the next waiting for upgrade means this writer was upgraded before. + acquire_internal_lock(s); + // Responsibility transition, the one who reads uncorrupted my_prev will do release. + // Guarantee that above store of 2 into next->my_going happens-before resetting of next->my_prev + d1::queuing_rw_mutex::scoped_lock* tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release); + // Pass the release sequence that we acquired with the above load of next->my_state. + next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_release); + // We are releasing the mutex + next->my_going.store(1U, std::memory_order_release); + unblock_or_wait_on_internal_lock(s, get_flag(tmp)); + } else { + // next->state cannot be STATE_UPGRADE_REQUESTED + __TBB_ASSERT( next->my_state.load(std::memory_order_relaxed) & (STATE_COMBINED_WAITINGREADER | STATE_WRITER), "unexpected state" ); + __TBB_ASSERT( !( next->my_prev.load(std::memory_order_relaxed) & FLAG ), "use of corrupted pointer!" ); + // Guarantee that above store of 2 into next->my_going happens-before resetting of next->my_prev + tricky_pointer::store(next->my_prev, nullptr, std::memory_order_release); + // We are releasing the mutex + next->my_going.store(1U, std::memory_order_release); + } + + } else { // Acquired for read + // The basic idea it to build happens-before relation with left and right readers via prev and next. 
In addition, + // the first reader should acquire the left (prev) signal and propagate to right (next). To simplify, we always + // build happens-before relation between left and right (left is happened before right). + queuing_rw_mutex::scoped_lock *tmp = nullptr; + retry: + // Addition to the original paper: Mark my_prev as in use + queuing_rw_mutex::scoped_lock *predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire); + + if( predecessor ) { + if( !(try_acquire_internal_lock(*predecessor)) ) + { + // Failed to acquire the lock on predecessor. The predecessor either unlinks or upgrades. + // In the second case, it could or could not know my "in use" flag - need to check + // Responsibility transition, the one who reads uncorrupted my_prev will do release. + tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor) | FLAG, predecessor, std::memory_order_acquire); + if( !(tricky_pointer(tmp) & FLAG) ) { + __TBB_ASSERT(tricky_pointer::load(s.my_prev, std::memory_order_relaxed) != (tricky_pointer(predecessor) | FLAG), nullptr); + // Now owner of predecessor is waiting for _us_ to release its lock + release_internal_lock(*predecessor); + } + // else the "in use" flag is back -> the predecessor didn't get it and will release itself; nothing to do + + tmp = nullptr; + goto retry; + } + __TBB_ASSERT(predecessor && predecessor->my_internal_lock.load(std::memory_order_relaxed)==ACQUIRED, "predecessor's lock is not acquired"); + tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); + acquire_internal_lock(s); + + tricky_pointer::store(predecessor->my_next, nullptr, std::memory_order_release); + + d1::queuing_rw_mutex::scoped_lock* expected = &s; + if( !tricky_pointer::load(s.my_next, std::memory_order_acquire) && !s.my_mutex->q_tail.compare_exchange_strong(expected, predecessor, std::memory_order_release) ) { + spin_wait_while_eq( s.my_next, 0U, std::memory_order_acquire ); + } + __TBB_ASSERT( !(s.my_next.load(std::memory_order_relaxed) & FLAG), "use of corrupted pointer" ); + + // my_next is acquired either with load or spin_wait. 
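+            // If a successor exists, the two stores in the branch below splice this node out of the
+            // queue: the successor's my_prev is redirected to our predecessor, and the predecessor's
+            // my_next to our successor.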
+ if(d1::queuing_rw_mutex::scoped_lock *const l_next = tricky_pointer::load(s.my_next, std::memory_order_relaxed) ) { // I->next != nil, TODO: rename to next after clearing up and adapting the n in the comment two lines below + // Equivalent to I->next->prev = I->prev but protected against (prev[n]&FLAG)!=0 + tmp = tricky_pointer::exchange(l_next->my_prev, predecessor, std::memory_order_release); + // I->prev->next = I->next; + __TBB_ASSERT(tricky_pointer::load(s.my_prev, std::memory_order_relaxed)==predecessor, nullptr); + predecessor->my_next.store(s.my_next.load(std::memory_order_relaxed), std::memory_order_release); + } + // Safe to release in the order opposite to acquiring which makes the code simpler + release_internal_lock(*predecessor); + + } else { // No predecessor when we looked + acquire_internal_lock(s); // "exclusiveLock(&I->EL)" + d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + if( !next ) { + d1::queuing_rw_mutex::scoped_lock* expected = &s; + // Release mutex on success otherwise wait for successor publication + if( !s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, + std::memory_order_release, std::memory_order_relaxed) ) + { + spin_wait_while_eq( s.my_next, 0U, std::memory_order_relaxed ); + next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + } else { + goto unlock_self; + } + } + next->my_going.store(2U, std::memory_order_relaxed); + // Responsibility transition, the one who reads uncorrupted my_prev will do release. + tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release); + next->my_going.store(1U, std::memory_order_release); + } + unlock_self: + unblock_or_wait_on_internal_lock(s, get_flag(tmp)); + } + done: + // Lifetime synchronization, no need to build happens-before relation + spin_wait_while_eq( s.my_going, 2U, std::memory_order_relaxed ); + + s.initialize(); + } + + static bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) { + if ( s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER ) return true; // Already a reader + + ITT_NOTIFY(sync_releasing, s.my_mutex); + d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + if( !next ) { + s.my_state.store(STATE_READER, std::memory_order_seq_cst); + // the following load of q_tail must not be reordered with setting STATE_READER above + if( &s == s.my_mutex->q_tail.load(std::memory_order_seq_cst) ) { + unsigned char old_state = STATE_READER; + // When this reader is signaled by previous actor it acquires the mutex. + // We need to build happens-before relation with all other coming readers that will read our ACTIVEREADER + // without blocking on my_going. Therefore, we need to publish ACTIVEREADER with release semantics. + // On fail it is relaxed, because we will build happens-before on my_going. + s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_release, std::memory_order_relaxed); + if( old_state==STATE_READER ) + return true; // Downgrade completed + } + /* wait for the next to register */ + spin_wait_while_eq(s.my_next, 0U, std::memory_order_relaxed); + next = tricky_pointer::load(s.my_next, std::memory_order_acquire); + } + + __TBB_ASSERT( next, "still no successor at this point!" 
); + if( next->my_state.load(std::memory_order_relaxed) & STATE_COMBINED_WAITINGREADER ) + next->my_going.store(1U, std::memory_order_release); + // If the next is STATE_UPGRADE_WAITING, it is expected to acquire all other released readers via release + // sequence in next->my_state. In that case, we need to preserve release sequence in next->my_state + // contributed by other reader. So, there are two approaches not to break the release sequence: + // 1. Use read-modify-write (exchange) operation to store with release the UPGRADE_LOSER state; + // 2. Acquire the release sequence and store the sequence and UPGRADE_LOSER state. + // The second approach seems better on x86 because it does not involve interlocked operations. + // Therefore, we read next->my_state with acquire while it is not required for else branch to get the + // release sequence. + else if( next->my_state.load(std::memory_order_acquire)==STATE_UPGRADE_WAITING ) + // the next waiting for upgrade means this writer was upgraded before. + // To safe release sequence on next->my_state read it with acquire + next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_release); + s.my_state.store(STATE_ACTIVEREADER, std::memory_order_release); + return true; + } + + static bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) { + if (s.my_state.load(std::memory_order_relaxed) == STATE_WRITER) { + // Already a writer + return true; + } + + __TBB_ASSERT(s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER, "only active reader can be updated"); + + queuing_rw_mutex::scoped_lock* tmp{}; + queuing_rw_mutex::scoped_lock* me = &s; + + ITT_NOTIFY(sync_releasing, s.my_mutex); + // Publish ourselves into my_state that other UPGRADE_WAITING actors can acquire our state. + s.my_state.store(STATE_UPGRADE_REQUESTED, std::memory_order_release); + requested: + __TBB_ASSERT( !(s.my_next.load(std::memory_order_relaxed) & FLAG), "use of corrupted pointer!" ); + acquire_internal_lock(s); + d1::queuing_rw_mutex::scoped_lock* expected = &s; + if( !s.my_mutex->q_tail.compare_exchange_strong(expected, tricky_pointer(me)|FLAG, std::memory_order_acq_rel) ) { + spin_wait_while_eq( s.my_next, 0U, std::memory_order_relaxed ); + queuing_rw_mutex::scoped_lock * next; + next = tricky_pointer::fetch_add(s.my_next, FLAG, std::memory_order_acquire); + // While we were READER the next READER might reach STATE_UPGRADE_WAITING state. + // Therefore, it did not build happens before relation with us and we need to acquire the + // next->my_state to build the happens before relation ourselves + unsigned short n_state = next->my_state.load(std::memory_order_acquire); + /* the next reader can be blocked by our state. the best thing to do is to unblock it */ + if( n_state & STATE_COMBINED_WAITINGREADER ) + next->my_going.store(1U, std::memory_order_release); + // Responsibility transition, the one who reads uncorrupted my_prev will do release. 
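+            // (Throughout this file, FLAG is a tag bit OR-ed into a neighbour pointer to mark it
+            // 'in use'; a pointer with the tag still set is what the asserts call a "corrupted" pointer.)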
+ tmp = tricky_pointer::exchange(next->my_prev, &s, std::memory_order_release); + unblock_or_wait_on_internal_lock(s, get_flag(tmp)); + if( n_state & (STATE_COMBINED_READER | STATE_UPGRADE_REQUESTED) ) { + // save next|FLAG for simplicity of following comparisons + tmp = tricky_pointer(next)|FLAG; + for( atomic_backoff b; tricky_pointer::load(s.my_next, std::memory_order_relaxed)==tmp; b.pause() ) { + if( s.my_state.load(std::memory_order_acquire) & STATE_COMBINED_UPGRADING ) { + if( tricky_pointer::load(s.my_next, std::memory_order_acquire)==tmp ) + tricky_pointer::store(s.my_next, next, std::memory_order_relaxed); + goto waiting; + } + } + __TBB_ASSERT(tricky_pointer::load(s.my_next, std::memory_order_relaxed) != (tricky_pointer(next)|FLAG), nullptr); + goto requested; + } else { + __TBB_ASSERT( n_state & (STATE_WRITER | STATE_UPGRADE_WAITING), "unexpected state"); + __TBB_ASSERT( (tricky_pointer(next)|FLAG) == tricky_pointer::load(s.my_next, std::memory_order_relaxed), nullptr); + tricky_pointer::store(s.my_next, next, std::memory_order_relaxed); + } + } else { + /* We are in the tail; whoever comes next is blocked by q_tail&FLAG */ + release_internal_lock(s); + } // if( this != my_mutex->q_tail... ) + { + unsigned char old_state = STATE_UPGRADE_REQUESTED; + // If we reach STATE_UPGRADE_WAITING state we do not build happens-before relation with READER on + // left. We delegate this responsibility to READER on left when it try upgrading. Therefore, we are releasing + // on success. + // Otherwise, on fail, we already acquired the next->my_state. + s.my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_release, std::memory_order_relaxed); + } + waiting: + __TBB_ASSERT( !( s.my_next.load(std::memory_order_relaxed) & FLAG ), "use of corrupted pointer!" ); + __TBB_ASSERT( s.my_state & STATE_COMBINED_UPGRADING, "wrong state at upgrade waiting_retry" ); + __TBB_ASSERT( me==&s, nullptr ); + ITT_NOTIFY(sync_prepare, s.my_mutex); + /* if no one was blocked by the "corrupted" q_tail, turn it back */ + expected = tricky_pointer(me)|FLAG; + s.my_mutex->q_tail.compare_exchange_strong(expected, &s, std::memory_order_release); + queuing_rw_mutex::scoped_lock * predecessor; + // Mark my_prev as 'in use' to prevent predecessor from releasing + predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire); + if( predecessor ) { + bool success = try_acquire_internal_lock(*predecessor); + { + // While the predecessor pointer (my_prev) is in use (FLAG is set), we can safely update the node`s state. + // Corrupted pointer transitions responsibility to release the predecessor`s node on us. + unsigned char old_state = STATE_UPGRADE_REQUESTED; + // Try to build happens before with the upgrading READER on left. If fail, the predecessor state is not + // important for us because it will acquire our state. + predecessor->my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_release, + std::memory_order_relaxed); + } + if( !success ) { + // Responsibility transition, the one who reads uncorrupted my_prev will do release. 
+ tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor)|FLAG, predecessor, std::memory_order_acquire); + if( tricky_pointer(tmp) & FLAG ) { + tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor); + predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed); + } else { + // TODO: spin_wait condition seems never reachable + tricky_pointer::spin_wait_while_eq(s.my_prev, tricky_pointer(predecessor)|FLAG); + release_internal_lock(*predecessor); + } + } else { + tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); + release_internal_lock(*predecessor); + tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor); + predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed); + } + if( predecessor ) + goto waiting; + } else { + tricky_pointer::store(s.my_prev, nullptr, std::memory_order_relaxed); + } + __TBB_ASSERT( !predecessor && !s.my_prev, nullptr ); + + // additional lifetime issue prevention checks + // wait for the successor to finish working with my fields + wait_for_release_of_internal_lock(s); + // now wait for the predecessor to finish working with my fields + spin_wait_while_eq( s.my_going, 2U ); + + bool result = ( s.my_state != STATE_UPGRADE_LOSER ); + s.my_state.store(STATE_WRITER, std::memory_order_relaxed); + s.my_going.store(1U, std::memory_order_relaxed); + + ITT_NOTIFY(sync_acquired, s.my_mutex); + return result; + } + + static bool is_writer(const d1::queuing_rw_mutex::scoped_lock& m) { + return m.my_state.load(std::memory_order_relaxed) == STATE_WRITER; + } + + static void construct(d1::queuing_rw_mutex& m) { + suppress_unused_warning(m); + ITT_SYNC_CREATE(&m, _T("tbb::queuing_rw_mutex"), _T("")); + } +}; + +void __TBB_EXPORTED_FUNC acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { + queuing_rw_mutex_impl::acquire(m, s, write); +} + +bool __TBB_EXPORTED_FUNC try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { + return queuing_rw_mutex_impl::try_acquire(m, s, write); +} + +void __TBB_EXPORTED_FUNC release(d1::queuing_rw_mutex::scoped_lock& s) { + queuing_rw_mutex_impl::release(s); +} + +bool __TBB_EXPORTED_FUNC upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) { + return queuing_rw_mutex_impl::upgrade_to_writer(s); +} + +bool __TBB_EXPORTED_FUNC is_writer(const d1::queuing_rw_mutex::scoped_lock& s) { + return queuing_rw_mutex_impl::is_writer(s); +} + +bool __TBB_EXPORTED_FUNC downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) { + return queuing_rw_mutex_impl::downgrade_to_reader(s); +} + +void __TBB_EXPORTED_FUNC construct(d1::queuing_rw_mutex& m) { + queuing_rw_mutex_impl::construct(m); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/queuing_rw_mutex.h b/third_party/tbb/queuing_rw_mutex.h new file mode 100644 index 000000000..4c9368b1b --- /dev/null +++ b/third_party/tbb/queuing_rw_mutex.h @@ -0,0 +1,208 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_queuing_rw_mutex_H +#define __TBB_queuing_rw_mutex_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { +struct queuing_rw_mutex_impl; +} +namespace d1 { + +//! Queuing reader-writer mutex with local-only spinning. +/** Adapted from Krieger, Stumm, et al. pseudocode at + https://www.researchgate.net/publication/221083709_A_Fair_Fast_Scalable_Reader-Writer_Lock + @ingroup synchronization */ +class queuing_rw_mutex { + friend r1::queuing_rw_mutex_impl; +public: + //! Construct unacquired mutex. + queuing_rw_mutex() noexcept { + create_itt_sync(this, "tbb::queuing_rw_mutex", ""); + } + + //! Destructor asserts if the mutex is acquired, i.e. q_tail is non-null + ~queuing_rw_mutex() { + __TBB_ASSERT(q_tail.load(std::memory_order_relaxed) == nullptr, "destruction of an acquired mutex"); + } + + //! No Copy + queuing_rw_mutex(const queuing_rw_mutex&) = delete; + queuing_rw_mutex& operator=(const queuing_rw_mutex&) = delete; + + //! The scoped locking pattern + /** It helps to avoid the common problem of forgetting to release lock. + It also nicely provides the "node" for queuing locks. */ + class scoped_lock { + friend r1::queuing_rw_mutex_impl; + //! Initialize fields to mean "no lock held". + void initialize() { + my_mutex = nullptr; + my_internal_lock.store(0, std::memory_order_relaxed); + my_going.store(0, std::memory_order_relaxed); +#if TBB_USE_ASSERT + my_state = 0xFF; // Set to invalid state + my_next.store(reinterpret_cast(reinterpret_cast(-1)), std::memory_order_relaxed); + my_prev.store(reinterpret_cast(reinterpret_cast(-1)), std::memory_order_relaxed); +#endif /* TBB_USE_ASSERT */ + } + + public: + //! Construct lock that has not acquired a mutex. + /** Equivalent to zero-initialization of *this. */ + scoped_lock() {initialize();} + + //! Acquire lock on given mutex. + scoped_lock( queuing_rw_mutex& m, bool write=true ) { + initialize(); + acquire(m,write); + } + + //! Release lock (if lock is held). + ~scoped_lock() { + if( my_mutex ) release(); + } + + //! No Copy + scoped_lock(const scoped_lock&) = delete; + scoped_lock& operator=(const scoped_lock&) = delete; + + //! Acquire lock on given mutex. + void acquire( queuing_rw_mutex& m, bool write=true ); + + //! Acquire lock on given mutex if free (i.e. non-blocking) + bool try_acquire( queuing_rw_mutex& m, bool write=true ); + + //! Release lock. + void release(); + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade_to_writer(); + + //! Downgrade writer to become a reader. + bool downgrade_to_reader(); + + bool is_writer() const; + + private: + //! The pointer to the mutex owned, or nullptr if not holding a mutex. + queuing_rw_mutex* my_mutex; + + //! The 'pointer' to the previous and next competitors for a mutex + std::atomic my_prev; + std::atomic my_next; + + using state_t = unsigned char ; + + //! State of the request: reader, writer, active reader, other service states + std::atomic my_state; + + //! 
The local spin-wait variable + /** Corresponds to "spin" in the pseudocode but inverted for the sake of zero-initialization */ + std::atomic my_going; + + //! A tiny internal lock + std::atomic my_internal_lock; + }; + + // Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = true; + +private: + //! The last competitor requesting the lock + std::atomic q_tail{nullptr}; +}; +#if TBB_USE_PROFILING_TOOLS +inline void set_name(queuing_rw_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(queuing_rw_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif //WIN +#else +inline void set_name(queuing_rw_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(queuing_rw_mutex&, const wchar_t*) {} +#endif //WIN +#endif +} // namespace d1 + +namespace r1 { +TBB_EXPORT void acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool); +TBB_EXPORT bool try_acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool); +TBB_EXPORT void release(d1::queuing_rw_mutex::scoped_lock&); +TBB_EXPORT bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock&); +TBB_EXPORT bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock&); +TBB_EXPORT bool is_writer(const d1::queuing_rw_mutex::scoped_lock&); +} // namespace r1 + +namespace d1 { + + +inline void queuing_rw_mutex::scoped_lock::acquire(queuing_rw_mutex& m,bool write) { + r1::acquire(m, *this, write); +} + +inline bool queuing_rw_mutex::scoped_lock::try_acquire(queuing_rw_mutex& m, bool write) { + return r1::try_acquire(m, *this, write); +} + +inline void queuing_rw_mutex::scoped_lock::release() { + r1::release(*this); +} + +inline bool queuing_rw_mutex::scoped_lock::upgrade_to_writer() { + return r1::upgrade_to_writer(*this); +} + +inline bool queuing_rw_mutex::scoped_lock::downgrade_to_reader() { + return r1::downgrade_to_reader(*this); +} + +inline bool queuing_rw_mutex::scoped_lock::is_writer() const { + return r1::is_writer(*this); +} +} // namespace d1 + +} // namespace detail + +inline namespace v1 { +using detail::d1::queuing_rw_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#endif /* __TBB_queuing_rw_mutex_H */ diff --git a/third_party/tbb/rml_base.h b/third_party/tbb/rml_base.h new file mode 100644 index 000000000..f903c39e1 --- /dev/null +++ b/third_party/tbb/rml_base.h @@ -0,0 +1,182 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Header guard and namespace names follow rml conventions. 
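To make the reader/writer interface declared in queuing_rw_mutex.h above concrete, here is a minimal usage sketch. It is illustrative only and not part of the patch: the include path follows this tree's layout, and table_mutex, table_size, read_size, and grow_if_empty are hypothetical names.

#include "third_party/tbb/queuing_rw_mutex.h"

// Hypothetical shared state guarded by a queuing_rw_mutex.
static tbb::queuing_rw_mutex table_mutex;
static int table_size = 0;

int read_size() {
    // Shared (reader) lock: pass write = false. The destructor releases the lock.
    tbb::queuing_rw_mutex::scoped_lock lock(table_mutex, /*write=*/false);
    return table_size;
}

void grow_if_empty() {
    tbb::queuing_rw_mutex::scoped_lock lock(table_mutex, /*write=*/false);
    if (table_size == 0) {
        // upgrade_to_writer() returns false when the lock had to be released
        // and re-acquired, so the guarded condition must be re-checked.
        if (!lock.upgrade_to_writer() && table_size != 0)
            return;
        table_size = 16;
        lock.downgrade_to_reader();  // drop back to shared ownership
    }
}

Because the mutex is queue-based and fair (is_fair_mutex is true above), each waiter spins only on its own queue node; that local spinning is what the my_going/my_prev/my_next machinery in queuing_rw_mutex.cpp implements.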
+ +#ifndef __RML_rml_base_H +#define __RML_rml_base_H + +#include "third_party/libcxx/cstddef" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif /* _WIN32||_WIN64 */ + +#ifdef RML_PURE_VIRTUAL_HANDLER +#define RML_PURE(T) {RML_PURE_VIRTUAL_HANDLER(); return (T)0;} +#else +#define RML_PURE(T) = 0; +#endif + +namespace rml { + +class server; + +class versioned_object { +public: + //! A version number + typedef unsigned version_type; + + virtual ~versioned_object() {} + + //! Get version of this object + /** The version number is incremented when a incompatible change is introduced. + The version number is invariant for the lifetime of the object. */ + virtual version_type version() const RML_PURE(version_type) + +}; + +//! Represents a client's job for an execution context. +/** A job object is constructed by the client. + Not derived from versioned_object because version is same as for client. */ +class job { + friend class server; +}; + +//! Information that client provides to server when asking for a server. +/** The instance must endure at least until acknowledge_close_connection is called. */ +class client: public versioned_object { +public: + //! Typedef for convenience of derived classes in other namespaces. + typedef ::rml::job job; + + //! Index of a job in a job pool + typedef unsigned size_type; + + //! Maximum number of threads that client can exploit profitably if nothing else is running on the machine. + /** The returned value should remain invariant for the lifetime of the connection. [idempotent] */ + virtual size_type max_job_count() const RML_PURE(size_type) + + //! Minimum stack size for each job. 0 means to use default stack size. [idempotent] + virtual std::size_t min_stack_size() const RML_PURE(std::size_t) + + //! Server calls this routine when it needs client to create a job object. + virtual job* create_one_job() RML_PURE(job*) + + //! Acknowledge that all jobs have been cleaned up. + /** Called by server in response to request_close_connection + after cleanup(job) has been called for each job. */ + virtual void acknowledge_close_connection() RML_PURE(void) + + //! Inform client that server is done with *this. + /** Client should destroy the job. + Not necessarily called by execution context represented by *this. + Never called while any other thread is working on the job. */ + virtual void cleanup( job& ) RML_PURE(void) + + // In general, we should not add new virtual methods, because that would + // break derived classes. Think about reserving some vtable slots. +}; + +// Information that server provides to client. +// Virtual functions are routines provided by the server for the client to call. +class server: public versioned_object { +public: + //! Typedef for convenience of derived classes. + typedef ::rml::job job; + +#if _WIN32||_WIN64 + typedef void* execution_resource_t; +#endif + + //! Request that connection to server be closed. 
+ /** Causes each job associated with the client to have its cleanup method called, + possibly by a thread different than the thread that created the job. + This method can return before all cleanup methods return. + Actions that have to wait after all cleanup methods return should be part of + client::acknowledge_close_connection. + Pass true as exiting if request_close_connection() is called because exit() is + called. In that case, it is the client's responsibility to make sure all threads + are terminated. In all other cases, pass false. */ + virtual void request_close_connection( bool exiting = false ) = 0; + + //! Called by client thread when it reaches a point where it cannot make progress until other threads do. + virtual void yield() = 0; + + //! Called by client to indicate a change in the number of non-RML threads that are running. + /** This is a performance hint to the RML to adjust how many threads it should let run + concurrently. The delta is the change in the number of non-RML threads that are running. + For example, a value of 1 means the client has started running another thread, and a value + of -1 indicates that the client has blocked or terminated one of its threads. */ + virtual void independent_thread_number_changed( int delta ) = 0; + + //! Default level of concurrency for which RML strives when there are no non-RML threads running. + /** Normally, the value is the hardware concurrency minus one. + The "minus one" accounts for the thread created by main(). */ + virtual unsigned default_concurrency() const = 0; +}; + +class factory { +public: + //! status results + enum status_type { + st_success=0, + st_connection_exists, + st_not_found, + st_incompatible + }; + +protected: + //! Pointer to routine that waits for server to indicate when client can close itself. + status_type (*my_wait_to_close_routine)( factory& ); + +public: + //! Library handle for use by RML. +#if _WIN32||_WIN64 + HMODULE library_handle; +#else + void* library_handle; +#endif /* _WIN32||_WIN64 */ + + //! Special marker to keep dll from being unloaded prematurely + static const std::size_t c_dont_unload = 1; +}; + +//! Typedef for callback functions to print server info +typedef void (*server_info_callback_t)( void* arg, const char* server_info ); + +} // namespace rml + +#endif /* __RML_rml_base_H */ diff --git a/third_party/tbb/rml_tbb.cpp b/third_party/tbb/rml_tbb.cpp new file mode 100644 index 000000000..a08ad0ecd --- /dev/null +++ b/third_party/tbb/rml_tbb.cpp @@ -0,0 +1,113 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/detail/_assert.h" + +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/dynamic_link.h" + +namespace tbb { +namespace detail { +namespace r1 { +namespace rml { + +#define MAKE_SERVER(x) DLD(__TBB_make_rml_server,x) +#define GET_INFO(x) DLD(__TBB_call_with_my_server_info,x) +#define SERVER tbb_server +#define CLIENT tbb_client +#define FACTORY tbb_factory + +#if __TBB_WEAK_SYMBOLS_PRESENT + #pragma weak __TBB_make_rml_server + #pragma weak __TBB_call_with_my_server_info + extern "C" { + ::rml::factory::status_type __TBB_make_rml_server( rml::tbb_factory& f, rml::tbb_server*& server, rml::tbb_client& client ); + void __TBB_call_with_my_server_info( ::rml::server_info_callback_t cb, void* arg ); + } +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +#if TBB_USE_DEBUG +#define DEBUG_SUFFIX "_debug" +#else +#define DEBUG_SUFFIX +#endif /* TBB_USE_DEBUG */ + +// RML_SERVER_NAME is the name of the RML server library. +#if _WIN32 || _WIN64 +#define RML_SERVER_NAME "irml" DEBUG_SUFFIX ".dll" +#elif __APPLE__ +#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".1.dylib" +#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX +#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so" +#elif __unix__ +#define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so.1" +#else +#error Unknown OS +#endif + +const ::rml::versioned_object::version_type CLIENT_VERSION = 2; + +#if __TBB_WEAK_SYMBOLS_PRESENT + #pragma weak __RML_open_factory + #pragma weak __RML_close_factory + extern "C" { + ::rml::factory::status_type __RML_open_factory ( ::rml::factory&, ::rml::versioned_object::version_type&, ::rml::versioned_object::version_type ); + void __RML_close_factory( ::rml::factory& f ); + } +#endif /* __TBB_WEAK_SYMBOLS_PRESENT */ + +::rml::factory::status_type FACTORY::open() { + // Failure of following assertion indicates that factory is already open, or not zero-inited. + __TBB_ASSERT_EX( !library_handle, nullptr); + status_type (*open_factory_routine)( factory&, version_type&, version_type ); + dynamic_link_descriptor server_link_table[4] = { + DLD(__RML_open_factory,open_factory_routine), + MAKE_SERVER(my_make_server_routine), + DLD(__RML_close_factory,my_wait_to_close_routine), + GET_INFO(my_call_with_server_info_routine), + }; + status_type result; + if ( dynamic_link( RML_SERVER_NAME, server_link_table, 4, &library_handle ) ) { + version_type server_version; + result = (*open_factory_routine)( *this, server_version, CLIENT_VERSION ); + // server_version can be checked here for incompatibility if necessary. + } else { + library_handle = nullptr; + result = st_not_found; + } + return result; +} + +void FACTORY::close() { + if ( library_handle ) + (*my_wait_to_close_routine)(*this); + if ( (size_t)library_handle>FACTORY::c_dont_unload ) { + dynamic_unlink(library_handle); + library_handle = nullptr; + } +} + +::rml::factory::status_type FACTORY::make_server( SERVER*& s, CLIENT& c) { + // Failure of following assertion means that factory was not successfully opened. 
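+    // (my_make_server_routine is one of the entry points bound by dynamic_link() in FACTORY::open(),
+    // so it is expected to be non-null only after a successful open().)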
+ __TBB_ASSERT_EX( my_make_server_routine, nullptr); + return (*my_make_server_routine)(*this,s,c); +} + +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/rml_tbb.h b/third_party/tbb/rml_tbb.h new file mode 100644 index 000000000..dd571af47 --- /dev/null +++ b/third_party/tbb/rml_tbb.h @@ -0,0 +1,95 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Header guard and namespace names follow TBB conventions. + +#ifndef __TBB_rml_tbb_H +#define __TBB_rml_tbb_H + +#include "third_party/tbb/version.h" +#include "third_party/tbb/rml_base.h" + +namespace tbb { +namespace detail { +namespace r1 { +namespace rml { + +//------------------------------------------------------------------------ +// Classes instantiated by the server +//------------------------------------------------------------------------ + +//! Represents a set of oneTBB worker threads provided by the server. +class tbb_server: public ::rml::server { +public: + //! Inform server of adjustments in the number of workers that the client can profitably use. + virtual void adjust_job_count_estimate( int delta ) = 0; + +#if _WIN32 || _WIN64 + //! Inform server of a oneTBB external thread. + virtual void register_external_thread( execution_resource_t& v ) = 0; + + //! Inform server that the oneTBB external thread is done with its work. + virtual void unregister_external_thread( execution_resource_t v ) = 0; +#endif /* _WIN32||_WIN64 */ +}; + +//------------------------------------------------------------------------ +// Classes instantiated by the client +//------------------------------------------------------------------------ + +class tbb_client: public ::rml::client { +public: + //! Defined by TBB to steal a task and execute it. + /** Called by server when it wants an execution context to do some TBB work. + The method should return when it is okay for the thread to yield indefinitely. */ + virtual void process( job& ) RML_PURE(void) +}; + +/** Client must ensure that instance is zero-inited, typically by being a file-scope object. */ +class tbb_factory: public ::rml::factory { + + //! Pointer to routine that creates an RML server. + status_type (*my_make_server_routine)( tbb_factory&, tbb_server*&, tbb_client& ); + + //! Pointer to routine that calls callback function with server version info. + void (*my_call_with_server_info_routine)( ::rml::server_info_callback_t cb, void* arg ); + +public: + typedef ::rml::versioned_object::version_type version_type; + typedef tbb_client client_type; + typedef tbb_server server_type; + + //! Open factory. + /** Dynamically links against RML library. + Returns st_success, st_incompatible, or st_not_found. */ + status_type open(); + + //! Factory method to be called by client to create a server object. + /** Factory must be open. + Returns st_success, or st_incompatible . */ + status_type make_server( server_type*&, client_type& ); + + //! 
Close factory + void close(); +}; + +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /*__TBB_rml_tbb_H */ diff --git a/third_party/tbb/rml_thread_monitor.h b/third_party/tbb/rml_thread_monitor.h new file mode 100644 index 000000000..5b8a4d4d4 --- /dev/null +++ b/third_party/tbb/rml_thread_monitor.h @@ -0,0 +1,277 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// All platform-specific threading support is encapsulated here. */ + +#ifndef __RML_thread_monitor_H +#define __RML_thread_monitor_H + +#if __TBB_USE_WINAPI +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +// MISSING #include +#include "libc/mem/mem.h" //_alloca +#include "third_party/tbb/misc.h" // support for processor groups +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) +#include "third_party/libcxx/thread" +#endif +#elif __TBB_USE_POSIX +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#include "third_party/libcxx/cstring" +#include "third_party/libcxx/cstdlib" +#include "libc/calls/calls.h" +#include "libc/calls/struct/timespec.h" +#include "libc/calls/struct/timeval.h" +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/sysv/consts/sched.h" +#include "libc/sysv/consts/timer.h" +#include "libc/time/struct/tm.h" +#include "libc/time/time.h" +#else +#error Unsupported platform +#endif +#include "third_party/libcxx/cstdio" + +#include "third_party/tbb/detail/_template_helpers.h" + +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/semaphore.h" + +// All platform-specific threading support is in this header. + +#if (_WIN32||_WIN64)&&!__TBB_ipf +// Deal with 64K aliasing. The formula for "offset" is a Fibonacci hash function, +// which has the desirable feature of spreading out the offsets fairly evenly +// without knowing the total number of offsets, and furthermore unlikely to +// accidentally cancel out other 64K aliasing schemes that Microsoft might implement later. +// See Knuth Vol 3. "Theorem S" for details on Fibonacci hashing. +// The second statement is really does need "volatile", otherwise the compiler might remove the _alloca. 
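+// For example (illustrative arithmetic): worker index 0 yields offset (0+1)*40503 % 65536 = 40503,
+// index 1 yields 81006 % 65536 = 15470, index 2 yields 121509 % 65536 = 55973, and so on.
+// Since 40503/65536 is roughly the golden-ratio fraction 0.618, successive workers' stack tops are
+// spread nearly evenly across the 64K aliasing window.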
+#define AVOID_64K_ALIASING(idx) \ + std::size_t offset = (idx+1) * 40503U % (1U<<16); \ + void* volatile sink_for_alloca = _alloca(offset); \ + __TBB_ASSERT_EX(sink_for_alloca, "_alloca failed"); +#else +// Linux thread allocators avoid 64K aliasing. +#define AVOID_64K_ALIASING(idx) tbb::detail::suppress_unused_warning(idx) +#endif /* _WIN32||_WIN64 */ + +namespace tbb { +namespace detail { +namespace r1 { + +// Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info +void handle_perror(int error_code, const char* aux_info); + +namespace rml { +namespace internal { + +#if __TBB_USE_ITT_NOTIFY +static const ::tbb::detail::r1::tchar *SyncType_RML = _T("%Constant"); +static const ::tbb::detail::r1::tchar *SyncObj_ThreadMonitor = _T("RML Thr Monitor"); +#endif /* __TBB_USE_ITT_NOTIFY */ + +//! Monitor with limited two-phase commit form of wait. +/** At most one thread should wait on an instance at a time. */ +class thread_monitor { +public: + thread_monitor() { + ITT_SYNC_CREATE(&my_sema, SyncType_RML, SyncObj_ThreadMonitor); + } + ~thread_monitor() {} + + //! Notify waiting thread + /** Can be called by any thread. */ + void notify(); + + //! Wait for notification + void wait(); + +#if __TBB_USE_WINAPI + typedef HANDLE handle_type; + + #define __RML_DECL_THREAD_ROUTINE unsigned WINAPI + typedef unsigned (WINAPI *thread_routine_type)(void*); + + //! Launch a thread + static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const size_t* worker_index = nullptr ); + +#elif __TBB_USE_POSIX + typedef pthread_t handle_type; + + #define __RML_DECL_THREAD_ROUTINE void* + typedef void*(*thread_routine_type)(void*); + + //! Launch a thread + static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size ); +#endif /* __TBB_USE_POSIX */ + + //! Join thread + static void join(handle_type handle); + + //! Detach thread + static void detach_thread(handle_type handle); +private: + // The protection from double notification of the binary semaphore + std::atomic my_notified{ false }; + binary_semaphore my_sema; +#if __TBB_USE_POSIX + static void check( int error_code, const char* routine ); +#endif +}; + +#if __TBB_USE_WINAPI + +#ifndef STACK_SIZE_PARAM_IS_A_RESERVATION +#define STACK_SIZE_PARAM_IS_A_RESERVATION 0x00010000 +#endif + +// _beginthreadex API is not available in Windows 8 Store* applications, so use std::thread instead +#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) +inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_function, void* arg, std::size_t, const std::size_t*) { +//TODO: check that exception thrown from std::thread is not swallowed silently + std::thread* thread_tmp=new std::thread(thread_function, arg); + return thread_tmp->native_handle(); +} +#else +inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const std::size_t* worker_index ) { + unsigned thread_id; + int number_of_processor_groups = ( worker_index ) ? NumberOfProcessorGroups() : 0; + unsigned create_flags = ( number_of_processor_groups > 1 ) ? 
CREATE_SUSPENDED : 0; + HANDLE h = (HANDLE)_beginthreadex( nullptr, unsigned(stack_size), thread_routine, arg, STACK_SIZE_PARAM_IS_A_RESERVATION | create_flags, &thread_id ); + if( !h ) { + handle_perror(0, "thread_monitor::launch: _beginthreadex failed\n"); + } + if ( number_of_processor_groups > 1 ) { + MoveThreadIntoProcessorGroup( h, FindProcessorGroupIndex( static_cast(*worker_index) ) ); + ResumeThread( h ); + } + return h; +} +#endif //__TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) + +void thread_monitor::join(handle_type handle) { +#if TBB_USE_ASSERT + DWORD res = +#endif + WaitForSingleObjectEx(handle, INFINITE, FALSE); + __TBB_ASSERT( res==WAIT_OBJECT_0, nullptr); +#if TBB_USE_ASSERT + BOOL val = +#endif + CloseHandle(handle); + __TBB_ASSERT( val, nullptr); +} + +void thread_monitor::detach_thread(handle_type handle) { +#if TBB_USE_ASSERT + BOOL val = +#endif + CloseHandle(handle); + __TBB_ASSERT( val, nullptr); +} + +#endif /* __TBB_USE_WINAPI */ + +#if __TBB_USE_POSIX +inline void thread_monitor::check( int error_code, const char* routine ) { + if( error_code ) { + handle_perror(error_code, routine); + } +} + +inline thread_monitor::handle_type thread_monitor::launch( void* (*thread_routine)(void*), void* arg, std::size_t stack_size ) { + // FIXME - consider more graceful recovery than just exiting if a thread cannot be launched. + // Note that there are some tricky situations to deal with, such that the thread is already + // grabbed as part of an OpenMP team. + pthread_attr_t s; + check(pthread_attr_init( &s ), "pthread_attr_init has failed"); + if( stack_size>0 ) + check(pthread_attr_setstacksize( &s, stack_size ), "pthread_attr_setstack_size has failed" ); + + // pthread_create(2) can spuriously fail with EAGAIN. We retry + // max_num_tries times with progressively longer wait times. + pthread_t handle; + const int max_num_tries = 20; + int error = EAGAIN; + + for (int i = 0; i < max_num_tries && error == EAGAIN; i++) { + if (i != 0) { + // Wait i milliseconds + struct timespec ts = {0, i * 1000 * 1000}; + nanosleep(&ts, NULL); + } + error = pthread_create(&handle, &s, thread_routine, arg); + } + + if (error) + handle_perror(error, "pthread_create has failed"); + + check( pthread_attr_destroy( &s ), "pthread_attr_destroy has failed" ); + return handle; +} + +void thread_monitor::join(handle_type handle) { + check(pthread_join(handle, nullptr), "pthread_join has failed"); +} + +void thread_monitor::detach_thread(handle_type handle) { + check(pthread_detach(handle), "pthread_detach has failed"); +} +#endif /* __TBB_USE_POSIX */ + +inline void thread_monitor::notify() { + // Check that the semaphore is not notified twice + if (!my_notified.exchange(true, std::memory_order_release)) { + my_sema.V(); + } +} + +inline void thread_monitor::wait() { + my_sema.P(); + // memory_order_seq_cst is required here to be ordered with + // further load checking shutdown state + my_notified.store(false, std::memory_order_seq_cst); +} + +} // namespace internal +} // namespace rml +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __RML_thread_monitor_H */ diff --git a/third_party/tbb/rtm_mutex.cpp b/third_party/tbb/rtm_mutex.cpp new file mode 100644 index 000000000..04328689a --- /dev/null +++ b/third_party/tbb/rtm_mutex.cpp @@ -0,0 +1,122 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_rtm_mutex.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + + +struct rtm_mutex_impl { + // maximum number of times to retry + // TODO: experiment on retry values. + static constexpr int retry_threshold = 10; + using transaction_result_type = decltype(begin_transaction()); + + //! Release speculative mutex + static void release(d1::rtm_mutex::scoped_lock& s) { + switch(s.m_transaction_state) { + case d1::rtm_mutex::rtm_state::rtm_transacting: + __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating"); + end_transaction(); + s.m_mutex = nullptr; + break; + case d1::rtm_mutex::rtm_state::rtm_real: + s.m_mutex->unlock(); + s.m_mutex = nullptr; + break; + case d1::rtm_mutex::rtm_state::rtm_none: + __TBB_ASSERT(false, "mutex is not locked, but in release"); + break; + default: + __TBB_ASSERT(false, "invalid m_transaction_state"); + } + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_none; + } + + //! Acquire lock on the given mutex. + static void acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) { + __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, "scoped_lock already in transaction"); + if(governor::speculation_enabled()) { + int num_retries = 0; + transaction_result_type abort_code = 0; + do { + if(m.m_flag.load(std::memory_order_acquire)) { + if(only_speculate) return; + spin_wait_while_eq(m.m_flag, true); + } + // _xbegin returns -1 on success or the abort code, so capture it + if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) + { + // started speculation + if(m.m_flag.load(std::memory_order_relaxed)) { + abort_transaction(); + } + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_transacting; + // Don not wrap the following assignment to a function, + // because it can abort the transaction in debug. Need mutex for release(). + s.m_mutex = &m; + return; // successfully started speculation + } + ++num_retries; + } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold)); + } + + if(only_speculate) return; + s.m_mutex = &m; + s.m_mutex->lock(); + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real; + } + + //! Try to acquire lock on the given mutex. + static bool try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) { + acquire(m, s, /*only_speculate=*/true); + if (s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_transacting) { + return true; + } + __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, nullptr); + // transacting acquire failed. 
try_lock the real mutex + if (m.try_lock()) { + s.m_mutex = &m; + s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real; + return true; + } + return false; + } +}; + +void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) { + rtm_mutex_impl::acquire(m, s, only_speculate); +} +bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) { + return rtm_mutex_impl::try_acquire(m, s); +} +void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock& s) { + rtm_mutex_impl::release(s); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/rtm_rw_mutex.cpp b/third_party/tbb/rtm_rw_mutex.cpp new file mode 100644 index 000000000..9e57652e6 --- /dev/null +++ b/third_party/tbb/rtm_rw_mutex.cpp @@ -0,0 +1,272 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_rtm_rw_mutex.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/misc.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +struct rtm_rw_mutex_impl { + // maximum number of times to retry + // TODO: experiment on retry values. + static constexpr int retry_threshold_read = 10; + static constexpr int retry_threshold_write = 10; + using transaction_result_type = decltype(begin_transaction()); + + //! Release speculative mutex + static void release(d1::rtm_rw_mutex::scoped_lock& s) { + switch(s.m_transaction_state) { + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer: + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader: + __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating"); + end_transaction(); + s.m_mutex = nullptr; + break; + case d1::rtm_rw_mutex::rtm_type::rtm_real_reader: + __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag set but read lock acquired"); + s.m_mutex->unlock_shared(); + s.m_mutex = nullptr; + break; + case d1::rtm_rw_mutex::rtm_type::rtm_real_writer: + __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag unset but write lock acquired"); + s.m_mutex->write_flag.store(false, std::memory_order_relaxed); + s.m_mutex->unlock(); + s.m_mutex = nullptr; + break; + case d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex: + __TBB_ASSERT(false, "rtm_not_in_mutex, but in release"); + break; + default: + __TBB_ASSERT(false, "invalid m_transaction_state"); + } + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex; + } + + //! Acquire write lock on the given mutex. 
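+    // only_speculate: when true, return without falling back to the real lock if a transaction
+    // cannot be started (this is how try_acquire_writer() probes for speculative acquisition).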
+ static void acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction"); + if(governor::speculation_enabled()) { + int num_retries = 0; + transaction_result_type abort_code = 0; + do { + if(m.m_state.load(std::memory_order_acquire)) { + if(only_speculate) return; + spin_wait_until_eq(m.m_state, d1::rtm_rw_mutex::state_type(0)); + } + // _xbegin returns -1 on success or the abort code, so capture it + if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) + { + // started speculation + if(m.m_state.load(std::memory_order_relaxed)) { // add spin_rw_mutex to read-set. + // reader or writer grabbed the lock, so abort. + abort_transaction(); + } + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer; + // Don not wrap the following assignment to a function, + // because it can abort the transaction in debug. Need mutex for release(). + s.m_mutex = &m; + return; // successfully started speculation + } + ++num_retries; + } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_write)); + } + + if(only_speculate) return; + s.m_mutex = &m; // should apply a real try_lock... + s.m_mutex->lock(); // kill transactional writers + __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After acquire for write, write_flag already true"); + m.write_flag.store(true, std::memory_order_relaxed); // kill transactional readers + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; + } + + //! Acquire read lock on given mutex. + // only_speculate : true if we are doing a try_acquire. If true and we fail to speculate, don't + // really acquire the lock, return and do a try_acquire on the contained spin_rw_mutex. If + // the lock is already held by a writer, just return. + static void acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction"); + if(governor::speculation_enabled()) { + int num_retries = 0; + transaction_result_type abort_code = 0; + do { + // if in try_acquire, and lock is held as writer, don't attempt to speculate. + if(m.write_flag.load(std::memory_order_acquire)) { + if(only_speculate) return; + spin_wait_while_eq(m.write_flag, true); + } + // _xbegin returns -1 on success or the abort code, so capture it + if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) + { + // started speculation + if(m.write_flag.load(std::memory_order_relaxed)) { // add write_flag to read-set. + abort_transaction(); // writer grabbed the lock, so abort. + } + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader; + // Don not wrap the following assignment to a function, + // because it can abort the transaction in debug. Need mutex for release(). + s.m_mutex = &m; + return; // successfully started speculation + } + // fallback path + // retry only if there is any hope of getting into a transaction soon + // Retry in the following cases (from Section 8.3.5 of + // Intel(R) Architecture Instruction Set Extensions Programming Reference): + // 1. abort caused by XABORT instruction (bit 0 of EAX register is set) + // 2. the transaction may succeed on a retry (bit 1 of EAX register is set) + // 3. 
if another logical processor conflicted with a memory address + // that was part of the transaction that aborted (bit 2 of EAX register is set) + // That is, retry if (abort_code & 0x7) is non-zero + ++num_retries; + } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_read)); + } + + if(only_speculate) return; + s.m_mutex = &m; + s.m_mutex->lock_shared(); + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; + } + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + static bool upgrade(d1::rtm_rw_mutex::scoped_lock& s) { + switch(s.m_transaction_state) { + case d1::rtm_rw_mutex::rtm_type::rtm_real_reader: { + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; + bool no_release = s.m_mutex->upgrade(); + __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "After upgrade, write_flag already true"); + s.m_mutex->write_flag.store(true, std::memory_order_relaxed); + return no_release; + } + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader: { + d1::rtm_rw_mutex& m = *s.m_mutex; + if(m.m_state.load(std::memory_order_acquire)) { // add spin_rw_mutex to read-set. + // Real reader or writer holds the lock; so commit the read and re-acquire for write. + release(s); + acquire_writer(m, s, false); + return false; + } else + { + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer; + return true; + } + } + default: + __TBB_ASSERT(false, "Invalid state for upgrade"); + return false; + } + } + + //! Downgrade writer to a reader. + static bool downgrade(d1::rtm_rw_mutex::scoped_lock& s) { + switch (s.m_transaction_state) { + case d1::rtm_rw_mutex::rtm_type::rtm_real_writer: + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; + __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "Before downgrade write_flag not true"); + s.m_mutex->write_flag.store(false, std::memory_order_relaxed); + s.m_mutex->downgrade(); + return true; + case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer: + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader; + return true; + default: + __TBB_ASSERT(false, "Invalid state for downgrade"); + return false; + } + } + + //! Try to acquire write lock on the given mutex. + // There may be reader(s) which acquired the spin_rw_mutex, as well as possibly + // transactional reader(s). If this is the case, the acquire will fail, and assigning + // write_flag will kill the transactors. So we only assign write_flag if we have successfully + // acquired the lock. + static bool try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + acquire_writer(m, s, /*only_speculate=*/true); + if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer) { + return true; + } + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, nullptr); + // transacting write acquire failed. try_lock the real mutex + if (m.try_lock()) { + s.m_mutex = &m; + // only shoot down readers if we're not transacting ourselves + __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After try_acquire_writer, write_flag already true"); + m.write_flag.store(true, std::memory_order_relaxed); + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; + return true; + } + return false; + } + + //! Try to acquire read lock on the given mutex. 
+ static bool try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + // speculatively acquire the lock. If this fails, do try_lock_shared on the spin_rw_mutex. + acquire_reader(m, s, /*only_speculate=*/true); + if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader) { + return true; + } + __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, nullptr); + // transacting read acquire failed. try_lock_shared the real mutex + if (m.try_lock_shared()) { + s.m_mutex = &m; + s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; + return true; + } + return false; + } +}; + +void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + rtm_rw_mutex_impl::acquire_writer(m, s, only_speculate); +} +//! Internal acquire read lock. +// only_speculate == true if we're doing a try_lock, else false. +void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { + rtm_rw_mutex_impl::acquire_reader(m, s, only_speculate); +} +//! Internal upgrade reader to become a writer. +bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::upgrade(s); +} +//! Internal downgrade writer to become a reader. +bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::downgrade(s); +} +//! Internal try_acquire write lock. +bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::try_acquire_writer(m, s); +} +//! Internal try_acquire read lock. +bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { + return rtm_rw_mutex_impl::try_acquire_reader(m, s); +} +//! Internal release lock. +void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock& s) { + rtm_rw_mutex_impl::release(s); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + + diff --git a/third_party/tbb/rw_mutex.h b/third_party/tbb/rw_mutex.h new file mode 100644 index 000000000..d156a0c60 --- /dev/null +++ b/third_party/tbb/rw_mutex.h @@ -0,0 +1,217 @@ +// clang-format off +/* + Copyright (c) 2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_rw_mutex_H +#define __TBB_rw_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_waitable_atomic.h" +#include "third_party/tbb/detail/_scoped_lock.h" +#include "third_party/tbb/detail/_mutex_common.h" +#include "third_party/tbb/profiling.h" + +namespace tbb { +namespace detail { +namespace d1 { + +class rw_mutex { +public: + //! Constructors + rw_mutex() noexcept : m_state(0) { + create_itt_sync(this, "tbb::rw_mutex", ""); + } + + //! Destructor + ~rw_mutex() { + __TBB_ASSERT(!m_state.load(std::memory_order_relaxed), "destruction of an acquired mutex"); + } + + //! 
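[Editor's note] The speculative acquire/try-acquire paths above all follow the same lock-elision shape: start a hardware transaction, touch the lock word so it joins the transaction's read-set, abort if a real owner is present, and fall back to the real lock after a bounded number of retries. The following stand-alone sketch shows that shape with the Intel RTM intrinsics from <immintrin.h>; it is a toy stand-in, not the TBB implementation, and assumes an RTM-capable CPU and compilation with -mrtm.

    // Minimal lock-elision sketch (illustrative only; assumes RTM support and -mrtm).
    #include <immintrin.h>
    #include <atomic>
    #include <mutex>

    std::mutex fallback_lock;
    std::atomic<bool> lock_held{false};   // mirror of the fallback lock's state

    template <typename F>
    void elided_critical_section(F&& body) {
        for (int retry = 0; retry < 3; ++retry) {
            unsigned status = _xbegin();
            if (status == _XBEGIN_STARTED) {
                if (lock_held.load(std::memory_order_relaxed))  // lock word joins the read-set
                    _xabort(0xff);                              // a real owner exists: abort
                body();
                _xend();                                        // commit the transaction
                return;
            }
            if (!(status & _XABORT_RETRY)) break;               // no point retrying
        }
        // Fallback: take the real lock; the store also aborts concurrent transactions.
        std::lock_guard<std::mutex> g(fallback_lock);
        lock_held.store(true, std::memory_order_relaxed);
        body();
        lock_held.store(false, std::memory_order_relaxed);
    }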
No Copy + rw_mutex(const rw_mutex&) = delete; + rw_mutex& operator=(const rw_mutex&) = delete; + + using scoped_lock = rw_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + void lock() { + call_itt_notify(prepare, this); + while (!try_lock()) { + if (!(m_state.load(std::memory_order_relaxed) & WRITER_PENDING)) { // no pending writers + m_state |= WRITER_PENDING; + } + + auto wakeup_condition = [&] { return !(m_state.load(std::memory_order_relaxed) & BUSY); }; + adaptive_wait_on_address(this, wakeup_condition, WRITER_CONTEXT); + } + + call_itt_notify(acquired, this); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + // for a writer: only possible to acquire if no active readers or writers + // Use relaxed memory fence is OK here because + // Acquire memory fence guaranteed by compare_exchange_strong() + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & BUSY)) { // no readers, no writers; mask is 1..1101 + if (m_state.compare_exchange_strong(s, WRITER)) { + call_itt_notify(acquired, this); + return true; // successfully stored writer flag + } + } + return false; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + state_type curr_state = (m_state &= READERS | WRITER_PENDING); // Returns current state + + if (curr_state & WRITER_PENDING) { + r1::notify_by_address(this, WRITER_CONTEXT); + } else { + // It's possible that WRITER sleeps without WRITER_PENDING, + // because other thread might clear this bit at upgrade() + r1::notify_by_address_all(this); + } + } + + //! Lock shared ownership mutex + void lock_shared() { + call_itt_notify(prepare, this); + while (!try_lock_shared()) { + state_type has_writer = WRITER | WRITER_PENDING; + auto wakeup_condition = [&] { return !(m_state.load(std::memory_order_relaxed) & has_writer); }; + adaptive_wait_on_address(this, wakeup_condition, READER_CONTEXT); + } + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state of a read lock: no readers"); + } + + //! Try lock shared ownership mutex + bool try_lock_shared() { + // for a reader: acquire if no active or waiting writers + // Use relaxed memory fence is OK here because + // Acquire memory fence guaranteed by fetch_add() + state_type has_writer = WRITER | WRITER_PENDING; + if (!(m_state.load(std::memory_order_relaxed) & has_writer)) { + if (m_state.fetch_add(ONE_READER) & has_writer) { + m_state -= ONE_READER; + r1::notify_by_address(this, WRITER_CONTEXT); + } else { + call_itt_notify(acquired, this); + return true; // successfully stored increased number of readers + } + } + return false; + } + + //! Unlock shared ownership mutex + void unlock_shared() { + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state of a read lock: no readers"); + call_itt_notify(releasing, this); + + state_type curr_state = (m_state -= ONE_READER); // Returns current state + + if (curr_state & (WRITER_PENDING)) { + r1::notify_by_address(this, WRITER_CONTEXT); + } else { + // It's possible that WRITER sleeps without WRITER_PENDING, + // because other thread might clear this bit at upgrade() + r1::notify_by_address_all(this); + } + } + +private: + /** Internal non ISO C++ standard API **/ + //! This API is used through the scoped_lock class + + //! Upgrade reader to become a writer. 
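[Editor's note] The lock word above packs a writer bit, a writer-pending bit, and a reader count into a single integer: try_lock is one compare-exchange from "not busy" to WRITER, and try_lock_shared is an optimistic fetch_add of ONE_READER that is rolled back if a writer slipped in. A compact stand-alone model of just that arithmetic (same bit layout, no waiting or notification machinery):

    #include <atomic>
    #include <cstdint>

    // Same bit layout as rw_mutex::m_state above; only the try_* arithmetic is modeled.
    class tiny_rw_state {
        using state_t = std::intptr_t;
        static constexpr state_t WRITER = 1, WRITER_PENDING = 2, ONE_READER = 4;
        static constexpr state_t READERS = ~(WRITER | WRITER_PENDING);
        static constexpr state_t BUSY = WRITER | READERS;
        std::atomic<state_t> state{0};
    public:
        bool try_lock() {                       // writer: CAS only when nobody is inside
            state_t s = state.load(std::memory_order_relaxed);
            return !(s & BUSY) && state.compare_exchange_strong(s, s | WRITER);
        }
        void unlock() { state &= READERS | WRITER_PENDING; }   // drop the WRITER bit
        bool try_lock_shared() {                // reader: optimistic increment, undo on writer
            if (state.load(std::memory_order_relaxed) & (WRITER | WRITER_PENDING)) return false;
            if (state.fetch_add(ONE_READER) & (WRITER | WRITER_PENDING)) {
                state -= ONE_READER;            // a writer appeared: roll back
                return false;
            }
            return true;
        }
        void unlock_shared() { state -= ONE_READER; }
    };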
+ /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade() { + state_type s = m_state.load(std::memory_order_relaxed); + __TBB_ASSERT(s & READERS, "invalid state before upgrade: no readers "); + // Check and set writer-pending flag. + // Required conditions: either no pending writers, or we are the only reader + // (with multiple readers and pending writer, another upgrade could have been requested) + while ((s & READERS) == ONE_READER || !(s & WRITER_PENDING)) { + if (m_state.compare_exchange_strong(s, s | WRITER | WRITER_PENDING)) { + auto wakeup_condition = [&] { return (m_state.load(std::memory_order_relaxed) & READERS) == ONE_READER; }; + while ((m_state.load(std::memory_order_relaxed) & READERS) != ONE_READER) { + adaptive_wait_on_address(this, wakeup_condition, WRITER_CONTEXT); + } + + __TBB_ASSERT((m_state.load(std::memory_order_relaxed) & (WRITER_PENDING|WRITER)) == (WRITER_PENDING | WRITER), + "invalid state when upgrading to writer"); + // Both new readers and writers are blocked at this time + m_state -= (ONE_READER + WRITER_PENDING); + return true; // successfully upgraded + } + } + // Slow reacquire + unlock_shared(); + lock(); + return false; + } + + //! Downgrade writer to a reader + void downgrade() { + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & WRITER, nullptr), + call_itt_notify(releasing, this); + m_state += (ONE_READER - WRITER); + + if (!(m_state & WRITER_PENDING)) { + r1::notify_by_address(this, READER_CONTEXT); + } + + __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state after downgrade: no readers"); + } + + using state_type = std::intptr_t; + static constexpr state_type WRITER = 1; + static constexpr state_type WRITER_PENDING = 2; + static constexpr state_type READERS = ~(WRITER | WRITER_PENDING); + static constexpr state_type ONE_READER = 4; + static constexpr state_type BUSY = WRITER | READERS; + + using context_type = std::uintptr_t; + static constexpr context_type WRITER_CONTEXT = 0; + static constexpr context_type READER_CONTEXT = 1; + friend scoped_lock; + //! State of lock + /** Bit 0 = writer is holding lock + Bit 1 = request by a writer to acquire lock (hint to readers to wait) + Bit 2..N = number of readers holding lock */ + std::atomic m_state; +}; // class rw_mutex + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::rw_mutex; +} // namespace v1 + +} // namespace tbb + +#endif // __TBB_rw_mutex_H diff --git a/third_party/tbb/scalable_allocator.h b/third_party/tbb/scalable_allocator.h new file mode 100644 index 000000000..d6f6b9c60 --- /dev/null +++ b/third_party/tbb/scalable_allocator.h @@ -0,0 +1,338 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
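[Editor's note] The upgrade()/downgrade() protocol above surfaces at the public API through the scoped-lock interfaces. A usage sketch with tbb::spin_rw_mutex, whose scoped_lock has long offered upgrade_to_writer(); the cache, key type, and compute() helper are made up for illustration:

    #include <tbb/spin_rw_mutex.h>
    #include <map>

    int compute(int k) { return k * k; }   // stand-in for the real work

    tbb::spin_rw_mutex cache_mutex;        // hypothetical shared cache and its lock
    std::map<int, int> cache;

    int lookup_or_insert(int key) {
        tbb::spin_rw_mutex::scoped_lock lock(cache_mutex, /*write=*/false);  // start as reader
        auto it = cache.find(key);
        if (it != cache.end()) return it->second;
        // Not found: try to become a writer. If upgrade_to_writer() returns false,
        // the lock was released and re-acquired, so another thread may have inserted.
        if (!lock.upgrade_to_writer()) {
            it = cache.find(key);
            if (it != cache.end()) return it->second;
        }
        return cache.emplace(key, compute(key)).first->second;
    }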
+*/ + +#ifndef __TBB_scalable_allocator_H +#define __TBB_scalable_allocator_H + +#ifdef __cplusplus +// MISSING #include "oneapi/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +// MISSING #include "oneapi/tbb/detail/_namespace_injection.h" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/utility" +#include "third_party/libcxx/new" /* std::bad_alloc() */ +#else +// MISSING #include "oneapi/tbb/detail/_export.h" + /* Need ptrdiff_t and size_t from here. */ +#if !defined(_MSC_VER) || defined(__clang__) +#include "libc/inttypes.h" +#include "libc/limits.h" +#include "libc/literal.h" /* Need intptr_t from here. */ +#endif +#endif + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +// MISSING #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#if _MSC_VER + #define __TBB_EXPORTED_FUNC __cdecl +#else + #define __TBB_EXPORTED_FUNC +#endif + +/** The "malloc" analogue to allocate block of memory of size bytes. + * @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_malloc(size_t size); + +/** The "free" analogue to discard a previously allocated piece of memory. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void __TBB_EXPORTED_FUNC scalable_free(void* ptr); + +/** The "realloc" analogue complementing scalable_malloc. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_realloc(void* ptr, size_t size); + +/** The "calloc" analogue complementing scalable_malloc. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_calloc(size_t nobj, size_t size); + +/** The "posix_memalign" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_posix_memalign(void** memptr, size_t alignment, size_t size); + +/** The "_aligned_malloc" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_aligned_malloc(size_t size, size_t alignment); + +/** The "_aligned_realloc" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_aligned_realloc(void* ptr, size_t size, size_t alignment); + +/** The "_aligned_free" analogue. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT void __TBB_EXPORTED_FUNC scalable_aligned_free(void* ptr); + +/** The analogue of _msize/malloc_size/malloc_usable_size. + Returns the usable size of a memory block previously allocated by scalable_*, + or 0 (zero) if ptr does not point to such a block. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT size_t __TBB_EXPORTED_FUNC scalable_msize(void* ptr); + +/* Results for scalable_allocation_* functions */ +typedef enum { + TBBMALLOC_OK, + TBBMALLOC_INVALID_PARAM, + TBBMALLOC_UNSUPPORTED, + TBBMALLOC_NO_MEMORY, + TBBMALLOC_NO_EFFECT +} ScalableAllocationResult; + +/* Setting TBB_MALLOC_USE_HUGE_PAGES environment variable to 1 enables huge pages. + scalable_allocation_mode call has priority over environment variable. */ +typedef enum { + TBBMALLOC_USE_HUGE_PAGES, /* value turns using huge pages on and off */ + /* deprecated, kept for backward compatibility only */ + USE_HUGE_PAGES = TBBMALLOC_USE_HUGE_PAGES, + /* try to limit memory consumption value (Bytes), clean internal buffers + if limit is exceeded, but not prevents from requesting memory from OS */ + TBBMALLOC_SET_SOFT_HEAP_LIMIT, + /* Lower bound for the size (Bytes), that is interpreted as huge + * and not released during regular cleanup operations. 
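[Editor's note] A quick sketch of how the C-level entry points declared above are typically used; it assumes the program links against the TBB scalable allocator library that provides these exported symbols.

    #include <tbb/scalable_allocator.h>
    #include <cstdio>

    int main() {
        void* p = scalable_malloc(1024);                          // "malloc" analogue
        if (!p) return 1;
        std::printf("usable size: %zu\n", scalable_msize(p));     // at least 1024
        p = scalable_realloc(p, 4096);                            // grow the block (may return null)
        scalable_free(p);                                         // "free" analogue

        void* aligned = scalable_aligned_malloc(256, 64);         // 64-byte aligned block
        scalable_aligned_free(aligned);
        return 0;
    }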
*/ + TBBMALLOC_SET_HUGE_SIZE_THRESHOLD +} AllocationModeParam; + +/** Set TBB allocator-specific allocation modes. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_allocation_mode(int param, intptr_t value); + +typedef enum { + /* Clean internal allocator buffers for all threads. + Returns TBBMALLOC_NO_EFFECT if no buffers cleaned, + TBBMALLOC_OK if some memory released from buffers. */ + TBBMALLOC_CLEAN_ALL_BUFFERS, + /* Clean internal allocator buffer for current thread only. + Return values same as for TBBMALLOC_CLEAN_ALL_BUFFERS. */ + TBBMALLOC_CLEAN_THREAD_BUFFERS +} ScalableAllocationCmd; + +/** Call TBB allocator-specific commands. + @ingroup memory_allocation */ +TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_allocation_command(int cmd, void *param); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#ifdef __cplusplus + +//! The namespace rml contains components of low-level memory pool interface. +namespace rml { +class MemoryPool; + +typedef void *(*rawAllocType)(std::intptr_t pool_id, std::size_t &bytes); +// returns non-zero in case of error +typedef int (*rawFreeType)(std::intptr_t pool_id, void* raw_ptr, std::size_t raw_bytes); + +struct MemPoolPolicy { + enum { + TBBMALLOC_POOL_VERSION = 1 + }; + + rawAllocType pAlloc; + rawFreeType pFree; + // granularity of pAlloc allocations. 0 means default used. + std::size_t granularity; + int version; + // all memory consumed at 1st pAlloc call and never returned, + // no more pAlloc calls after 1st + unsigned fixedPool : 1, + // memory consumed but returned only at pool termination + keepAllMemory : 1, + reserved : 30; + + MemPoolPolicy(rawAllocType pAlloc_, rawFreeType pFree_, + std::size_t granularity_ = 0, bool fixedPool_ = false, + bool keepAllMemory_ = false) : + pAlloc(pAlloc_), pFree(pFree_), granularity(granularity_), version(TBBMALLOC_POOL_VERSION), + fixedPool(fixedPool_), keepAllMemory(keepAllMemory_), + reserved(0) {} +}; + +// enums have same values as appropriate enums from ScalableAllocationResult +// TODO: use ScalableAllocationResult in pool_create directly +enum MemPoolError { + // pool created successfully + POOL_OK = TBBMALLOC_OK, + // invalid policy parameters found + INVALID_POLICY = TBBMALLOC_INVALID_PARAM, + // requested pool policy is not supported by allocator library + UNSUPPORTED_POLICY = TBBMALLOC_UNSUPPORTED, + // lack of memory during pool creation + NO_MEMORY = TBBMALLOC_NO_MEMORY, + // action takes no effect + NO_EFFECT = TBBMALLOC_NO_EFFECT +}; + +TBBMALLOC_EXPORT MemPoolError pool_create_v1(std::intptr_t pool_id, const MemPoolPolicy *policy, + rml::MemoryPool **pool); + +TBBMALLOC_EXPORT bool pool_destroy(MemoryPool* memPool); +TBBMALLOC_EXPORT void *pool_malloc(MemoryPool* memPool, std::size_t size); +TBBMALLOC_EXPORT void *pool_realloc(MemoryPool* memPool, void *object, std::size_t size); +TBBMALLOC_EXPORT void *pool_aligned_malloc(MemoryPool* mPool, std::size_t size, std::size_t alignment); +TBBMALLOC_EXPORT void *pool_aligned_realloc(MemoryPool* mPool, void *ptr, std::size_t size, std::size_t alignment); +TBBMALLOC_EXPORT bool pool_reset(MemoryPool* memPool); +TBBMALLOC_EXPORT bool pool_free(MemoryPool *memPool, void *object); +TBBMALLOC_EXPORT MemoryPool *pool_identify(void *object); +TBBMALLOC_EXPORT std::size_t pool_msize(MemoryPool *memPool, void *object); + +} // namespace rml + +namespace tbb { +namespace detail { +namespace d1 { + +// keep throw in a separate function to prevent code bloat +template +void throw_exception(const E 
&e) { +#if TBB_USE_EXCEPTIONS + throw e; +#else + suppress_unused_warning(e); +#endif +} + +template +class scalable_allocator { +public: + using value_type = T; + using propagate_on_container_move_assignment = std::true_type; + + //! Always defined for TBB containers + using is_always_equal = std::true_type; + + scalable_allocator() = default; + template scalable_allocator(const scalable_allocator&) noexcept {} + + //! Allocate space for n objects. + __TBB_nodiscard T* allocate(std::size_t n) { + T* p = static_cast(scalable_malloc(n * sizeof(value_type))); + if (!p) { + throw_exception(std::bad_alloc()); + } + return p; + } + + //! Free previously allocated block of memory + void deallocate(T* p, std::size_t) { + scalable_free(p); + } + +#if TBB_ALLOCATOR_TRAITS_BROKEN + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using difference_type = std::ptrdiff_t; + using size_type = std::size_t; + template struct rebind { + using other = scalable_allocator; + }; + //! Largest value for which method allocate might succeed. + size_type max_size() const noexcept { + size_type absolutemax = static_cast(-1) / sizeof (value_type); + return (absolutemax > 0 ? absolutemax : 1); + } + template + void construct(U *p, Args&&... args) + { ::new((void *)p) U(std::forward(args)...); } + void destroy(pointer p) { p->~value_type(); } + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } +#endif // TBB_ALLOCATOR_TRAITS_BROKEN + +}; + +#if TBB_ALLOCATOR_TRAITS_BROKEN + template<> + class scalable_allocator { + public: + using pointer = void*; + using const_pointer = const void*; + using value_type = void; + template struct rebind { + using other = scalable_allocator; + }; + }; +#endif + +template +inline bool operator==(const scalable_allocator&, const scalable_allocator&) noexcept { return true; } + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +inline bool operator!=(const scalable_allocator&, const scalable_allocator&) noexcept { return false; } +#endif + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +//! C++17 memory resource implementation for scalable allocator +//! ISO C++ Section 23.12.2 +class scalable_resource_impl : public std::pmr::memory_resource { +private: + void* do_allocate(std::size_t bytes, std::size_t alignment) override { + void* p = scalable_aligned_malloc(bytes, alignment); + if (!p) { + throw_exception(std::bad_alloc()); + } + return p; + } + + void do_deallocate(void* ptr, std::size_t /*bytes*/, std::size_t /*alignment*/) override { + scalable_free(ptr); + } + + //! Memory allocated by one instance of scalable_resource_impl could be deallocated by any + //! other instance of this class + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { + return this == &other || +#if __TBB_USE_OPTIONAL_RTTI + dynamic_cast(&other) != nullptr; +#else + false; +#endif + } +}; + +//! 
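[Editor's note] The allocator template above plugs straight into standard containers; because allocations come from per-thread pools, concurrent pushes from different threads avoid contending on one global heap lock. A minimal usage sketch (rebinding for node types is handled by std::allocator_traits):

    #include <tbb/scalable_allocator.h>
    #include <vector>
    #include <map>

    // Containers whose buffers and nodes come from the scalable allocator.
    std::vector<int, tbb::scalable_allocator<int>> v;
    std::map<int, double, std::less<int>,
             tbb::scalable_allocator<std::pair<const int, double>>> m;

    void fill() {
        for (int i = 0; i < 1000; ++i) { v.push_back(i); m[i] = i * 0.5; }
    }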
Global scalable allocator memory resource provider +inline std::pmr::memory_resource* scalable_memory_resource() noexcept { + static tbb::detail::d1::scalable_resource_impl scalable_res; + return &scalable_res; +} + +#endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::scalable_allocator; +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +using detail::d1::scalable_memory_resource; +#endif +} // namespace v1 + +} // namespace tbb + +#endif /* __cplusplus */ + +#endif /* __TBB_scalable_allocator_H */ diff --git a/third_party/tbb/scheduler_common.h b/third_party/tbb/scheduler_common.h new file mode 100644 index 000000000..8a0496bd6 --- /dev/null +++ b/third_party/tbb/scheduler_common.h @@ -0,0 +1,599 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_scheduler_common_H +#define _TBB_scheduler_common_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_machine.h" +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/co_context.h" +#include "third_party/tbb/misc.h" +#include "third_party/tbb/governor.h" + +#ifndef __TBB_SCHEDULER_MUTEX_TYPE +#define __TBB_SCHEDULER_MUTEX_TYPE tbb::spin_mutex +#endif +// TODO: add conditional inclusion based on specified type +#include "third_party/tbb/spin_mutex.h" +#include "third_party/tbb/mutex.h" + +#if TBB_USE_ASSERT +#include "third_party/libcxx/atomic" +#endif + +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/exception" +#include "third_party/libcxx/memory" // unique_ptr + +//! Mutex type for global locks in the scheduler +using scheduler_mutex_type = __TBB_SCHEDULER_MUTEX_TYPE; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Workaround for overzealous compiler warnings + // These particular warnings are so ubiquitous that no attempt is made to narrow + // the scope of the warnings. 
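[Editor's note] When C++17 polymorphic allocators are available (i.e. the build sees __TBB_CPP17_MEMORY_RESOURCE_PRESENT and the standard library ships <memory_resource>), scalable_memory_resource() lets std::pmr containers draw from the same allocator without changing their types. A sketch under those assumptions:

    #include <tbb/scalable_allocator.h>
    #include <memory_resource>
    #include <vector>
    #include <string>

    void pmr_demo() {
        std::pmr::memory_resource* res = tbb::scalable_memory_resource();
        std::pmr::vector<std::pmr::string> names{res};   // allocations go through tbbmalloc
        names.emplace_back("hello");
        names.emplace_back("world");
    }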
+ #pragma warning (disable: 4100 4127 4312 4244 4267 4706) +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +class arena; +class mail_inbox; +class mail_outbox; +class market; +class observer_proxy; + +enum task_stream_accessor_type { front_accessor = 0, back_nonnull_accessor }; +template class task_stream; + +using isolation_type = std::intptr_t; +constexpr isolation_type no_isolation = 0; + +struct cache_aligned_deleter { + template + void operator() (T* ptr) const { + ptr->~T(); + cache_aligned_deallocate(ptr); + } +}; + +template +using cache_aligned_unique_ptr = std::unique_ptr; + +template +cache_aligned_unique_ptr make_cache_aligned_unique(Args&& ...args) { + return cache_aligned_unique_ptr(new (cache_aligned_allocate(sizeof(T))) T(std::forward(args)...)); +} + +//------------------------------------------------------------------------ +// Extended execute data +//------------------------------------------------------------------------ + +//! Execute data used on a task dispatcher side, reflects a current execution state +struct execution_data_ext : d1::execution_data { + task_dispatcher* task_disp{}; + isolation_type isolation{}; + d1::wait_context* wait_ctx{}; +}; + +//------------------------------------------------------------------------ +// Task accessor +//------------------------------------------------------------------------ + +//! Interpretation of reserved task fields inside a task dispatcher +struct task_accessor { + static constexpr std::uint64_t proxy_task_trait = 1; + static constexpr std::uint64_t resume_task_trait = 2; + static d1::task_group_context*& context(d1::task& t) { + task_group_context** tgc = reinterpret_cast(&t.m_reserved[0]); + return *tgc; + } + static isolation_type& isolation(d1::task& t) { + isolation_type* tag = reinterpret_cast(&t.m_reserved[2]); + return *tag; + } + static void set_proxy_trait(d1::task& t) { + // TODO: refactor proxy tasks not to work on uninitialized memory. + //__TBB_ASSERT((t.m_version_and_traits & proxy_task_trait) == 0, nullptr); + t.m_version_and_traits |= proxy_task_trait; + } + static bool is_proxy_task(d1::task& t) { + return (t.m_version_and_traits & proxy_task_trait) != 0; + } + static void set_resume_trait(d1::task& t) { + __TBB_ASSERT((t.m_version_and_traits & resume_task_trait) == 0, nullptr); + t.m_version_and_traits |= resume_task_trait; + } + static bool is_resume_task(d1::task& t) { + return (t.m_version_and_traits & resume_task_trait) != 0; + } +}; + +//------------------------------------------------------------------------ +//! Extended variant of the standard offsetof macro +/** The standard offsetof macro is not sufficient for TBB as it can be used for + POD-types only. The constant 0x1000 (not nullptr) is necessary to appease GCC. **/ +#define __TBB_offsetof(class_name, member_name) \ + ((ptrdiff_t)&(reinterpret_cast(0x1000)->member_name) - 0x1000) + +//! Returns address of the object containing a member with the given name and address +#define __TBB_get_object_ref(class_name, member_name, member_addr) \ + (*reinterpret_cast((char*)member_addr - __TBB_offsetof(class_name, member_name))) + +//! 
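[Editor's note] make_cache_aligned_unique above combines three standard pieces: an aligned raw allocation, placement-new construction, and a unique_ptr whose deleter runs the destructor and returns the raw block. A generic stand-alone version of the same pattern, using over-aligned operator new in place of cache_aligned_allocate and assuming a 64-byte cache line (error handling on a throwing constructor is elided):

    #include <memory>
    #include <new>
    #include <utility>
    #include <cstddef>

    constexpr std::size_t cache_line = 64;   // assumed cache-line size for this sketch

    struct aligned_deleter {
        template <typename T>
        void operator()(T* p) const {
            p->~T();                                             // run the destructor
            ::operator delete(p, std::align_val_t(cache_line));  // release the raw block
        }
    };

    template <typename T, typename... Args>
    std::unique_ptr<T, aligned_deleter> make_aligned_unique(Args&&... args) {
        void* raw = ::operator new(sizeof(T), std::align_val_t(cache_line));
        return std::unique_ptr<T, aligned_deleter>(new (raw) T(std::forward<Args>(args)...));
    }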
Helper class for tracking floating point context and task group context switches +/** Assuming presence of an itt collector, in addition to keeping track of floating + point context, this class emits itt events to indicate begin and end of task group + context execution **/ +template +class context_guard_helper { + const d1::task_group_context* curr_ctx; + d1::cpu_ctl_env guard_cpu_ctl_env; + d1::cpu_ctl_env curr_cpu_ctl_env; +public: + context_guard_helper() : curr_ctx(nullptr) { + guard_cpu_ctl_env.get_env(); + curr_cpu_ctl_env = guard_cpu_ctl_env; + } + ~context_guard_helper() { + if (curr_cpu_ctl_env != guard_cpu_ctl_env) + guard_cpu_ctl_env.set_env(); + if (report_tasks && curr_ctx) + ITT_TASK_END; + } + // The function is called from bypass dispatch loop on the hot path. + // Consider performance issues when refactoring. + void set_ctx(const d1::task_group_context* ctx) { + if (!ctx) + return; + const d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx->my_cpu_ctl_env); + // Compare the FPU settings directly because the context can be reused between parallel algorithms. + if (*ctl != curr_cpu_ctl_env) { + curr_cpu_ctl_env = *ctl; + curr_cpu_ctl_env.set_env(); + } + if (report_tasks && ctx != curr_ctx) { + // if task group context was active, report end of current execution frame. + if (curr_ctx) + ITT_TASK_END; + // reporting begin of new task group context execution frame. + // using address of task group context object to group tasks (parent). + // id of task execution frame is nullptr and reserved for future use. + ITT_TASK_BEGIN(ctx, ctx->my_name, nullptr); + curr_ctx = ctx; + } + } +#if _WIN64 + void restore_default() { + if (curr_cpu_ctl_env != guard_cpu_ctl_env) { + guard_cpu_ctl_env.set_env(); + curr_cpu_ctl_env = guard_cpu_ctl_env; + } + } +#endif // _WIN64 +}; + +#if (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) +#if _MSC_VER +#pragma intrinsic(__rdtsc) +#endif +inline std::uint64_t machine_time_stamp() { +#if __INTEL_COMPILER + return _rdtsc(); +#elif _MSC_VER + return __rdtsc(); +#else + std::uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=d"(hi), "=a"(lo)); + return (std::uint64_t(hi) << 32) | lo; +#endif +} + +inline void prolonged_pause_impl() { + // Assumption based on practice: 1000-2000 ticks seems to be a suitable invariant for the + // majority of platforms. Currently, skip platforms that define __TBB_STEALING_PAUSE + // because these platforms require very careful tuning. + std::uint64_t prev = machine_time_stamp(); + const std::uint64_t finish = prev + 1000; + atomic_backoff backoff; + do { + backoff.bounded_pause(); + std::uint64_t curr = machine_time_stamp(); + if (curr <= prev) + // Possibly, the current logical thread is moved to another hardware thread or overflow is occurred. + break; + prev = curr; + } while (prev < finish); +} +#else +inline void prolonged_pause_impl() { +#ifdef __TBB_ipf + static const long PauseTime = 1500; +#else + static const long PauseTime = 80; +#endif + // TODO IDEA: Update PauseTime adaptively? + machine_pause(PauseTime); +} +#endif + +inline void prolonged_pause() { +#if __TBB_WAITPKG_INTRINSICS_PRESENT + if (governor::wait_package_enabled()) { + std::uint64_t time_stamp = machine_time_stamp(); + // _tpause function directs the processor to enter an implementation-dependent optimized state + // until the Time Stamp Counter reaches or exceeds the value specified in second parameter. + // Constant "700" is ticks to wait for. 
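[Editor's note] prolonged_pause_impl above bounds the spin by the time-stamp counter rather than by an iteration count, so the wait length is roughly constant in wall-clock terms across CPU generations. A stripped-down stand-alone version of the same idea, for GCC/Clang on x86 only, reusing the rdtsc inline assembly shown above and _mm_pause to relax the core between probes:

    #include <immintrin.h>   // _mm_pause
    #include <cstdint>

    inline std::uint64_t read_tsc() {
        std::uint32_t hi, lo;
        __asm__ __volatile__("rdtsc" : "=d"(hi), "=a"(lo));
        return (std::uint64_t(hi) << 32) | lo;
    }

    // Spin for roughly `ticks` TSC increments, pausing the core between probes.
    inline void pause_for_ticks(std::uint64_t ticks) {
        std::uint64_t prev = read_tsc();
        const std::uint64_t finish = prev + ticks;
        for (;;) {
            _mm_pause();
            std::uint64_t curr = read_tsc();
            if (curr <= prev || curr >= finish) break;  // done, migrated, or counter wrapped
            prev = curr;
        }
    }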
+ // First parameter 0 selects between a lower power (cleared) or faster wakeup (set) optimized state. + _tpause(0, time_stamp + 700); + } + else +#endif + prolonged_pause_impl(); +} + +// TODO: investigate possibility to work with number of CPU cycles +// because for different configurations this number of pauses + yields +// will be calculated in different amount of CPU cycles +// for example use rdtsc for it +class stealing_loop_backoff { + const int my_pause_threshold; + const int my_yield_threshold; + int my_pause_count; + int my_yield_count; +public: + // my_yield_threshold = 100 is an experimental value. Ideally, once we start calling __TBB_Yield(), + // the time spent spinning before calling out_of_work() should be approximately + // the time it takes for a thread to be woken up. Doing so would guarantee that we do + // no worse than 2x the optimal spin time. Or perhaps a time-slice quantum is the right amount. + stealing_loop_backoff(int num_workers, int yields_multiplier) + : my_pause_threshold{ 2 * (num_workers + 1) } +#if __APPLE__ + // threshold value tuned separately for macOS due to high cost of sched_yield there + , my_yield_threshold{10 * yields_multiplier} +#else + , my_yield_threshold{100 * yields_multiplier} +#endif + , my_pause_count{} + , my_yield_count{} + {} + bool pause() { + prolonged_pause(); + if (my_pause_count++ >= my_pause_threshold) { + my_pause_count = my_pause_threshold; + d0::yield(); + if (my_yield_count++ >= my_yield_threshold) { + my_yield_count = my_yield_threshold; + return true; + } + } + return false; + } + void reset_wait() { + my_pause_count = my_yield_count = 0; + } +}; + +//------------------------------------------------------------------------ +// Exception support +//------------------------------------------------------------------------ +//! Task group state change propagation global epoch +/** Together with generic_scheduler::my_context_state_propagation_epoch forms + cross-thread signaling mechanism that allows to avoid locking at the hot path + of normal execution flow. + + When a descendant task group context is registered or unregistered, the global + and local epochs are compared. If they differ, a state change is being propagated, + and thus registration/deregistration routines take slower branch that may block + (at most one thread of the pool can be blocked at any moment). Otherwise the + control path is lock-free and fast. **/ +extern std::atomic the_context_state_propagation_epoch; + +//! Mutex guarding state change propagation across task groups forest. +/** Also protects modification of related data structures. **/ +typedef scheduler_mutex_type context_state_propagation_mutex_type; +extern context_state_propagation_mutex_type the_context_state_propagation_mutex; + +class tbb_exception_ptr { + std::exception_ptr my_ptr; +public: + static tbb_exception_ptr* allocate() noexcept; + + //! Destroys this objects + /** Note that objects of this type can be created only by the allocate() method. **/ + void destroy() noexcept; + + //! Throws the contained exception . 
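[Editor's note] tbb_exception_ptr below is a thin wrapper over std::exception_ptr: the scheduler captures an exception on the thread where a task threw and rethrows it on the thread that waits. The underlying standard mechanism, shown in isolation:

    #include <exception>
    #include <stdexcept>
    #include <thread>
    #include <cstdio>

    int main() {
        std::exception_ptr captured;
        std::thread worker([&] {
            try {
                throw std::runtime_error("failed inside the worker");
            } catch (...) {
                captured = std::current_exception();   // capture on the throwing thread
            }
        });
        worker.join();
        try {
            if (captured) std::rethrow_exception(captured);  // rethrow on the waiting thread
        } catch (const std::exception& e) {
            std::puts(e.what());
        }
    }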
+ void throw_self(); + +private: + tbb_exception_ptr(const std::exception_ptr& src) : my_ptr(src) {} +}; // class tbb_exception_ptr + +//------------------------------------------------------------------------ +// Debugging support +//------------------------------------------------------------------------ + +#if TBB_USE_ASSERT +static const std::uintptr_t venom = tbb::detail::select_size_t_constant<0xDEADBEEFU, 0xDDEEAADDDEADBEEFULL>::value; + +inline void poison_value(std::uintptr_t& val) { val = venom; } + +inline void poison_value(std::atomic& val) { val.store(venom, std::memory_order_relaxed); } + +/** Expected to be used in assertions only, thus no empty form is defined. **/ +inline bool is_alive(std::uintptr_t v) { return v != venom; } + +/** Logically, this method should be a member of class task. + But we do not want to publish it, so it is here instead. */ +inline void assert_task_valid(const d1::task* t) { + assert_pointer_valid(t); +} +#else /* !TBB_USE_ASSERT */ + +/** In contrast to debug version poison_value() is a macro here because + the variable used as its argument may be undefined in release builds. **/ +#define poison_value(g) ((void)0) + +inline void assert_task_valid(const d1::task*) {} + +#endif /* !TBB_USE_ASSERT */ + +struct suspend_point_type { +#if __TBB_RESUMABLE_TASKS + //! The arena related to this task_dispatcher + arena* m_arena{ nullptr }; + //! The random for the resume task + FastRandom m_random; + //! The flag is raised when the original owner should return to this task dispatcher. + std::atomic m_is_owner_recalled{ false }; + //! Inicates if the resume task should be placed to the critical task stream. + bool m_is_critical{ false }; + //! Associated coroutine + co_context m_co_context; + //! Supend point before resume + suspend_point_type* m_prev_suspend_point{nullptr}; + + // Possible state transitions: + // A -> S -> N -> A + // A -> N -> S -> N -> A + enum class stack_state { + active, // some thread is working with this stack + suspended, // no thread is working with this stack + notified // some thread tried to resume this stack + }; + + //! The flag required to protect suspend finish and resume call + std::atomic m_stack_state{stack_state::active}; + + void resume(suspend_point_type* sp) { + __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) != stack_state::suspended, "The stack is expected to be active"); + + sp->m_prev_suspend_point = this; + + // Do not access sp after resume + m_co_context.resume(sp->m_co_context); + __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) != stack_state::active, nullptr); + + finilize_resume(); + } + + void finilize_resume() { + m_stack_state.store(stack_state::active, std::memory_order_relaxed); + // Set the suspended state for the stack that we left. If the state is already notified, it means that + // someone already tried to resume our previous stack but failed. So, we need to resume it. + // m_prev_suspend_point might be nullptr when destroying co_context based on threads + if (m_prev_suspend_point && m_prev_suspend_point->m_stack_state.exchange(stack_state::suspended) == stack_state::notified) { + r1::resume(m_prev_suspend_point); + } + m_prev_suspend_point = nullptr; + } + + bool try_notify_resume() { + // Check that stack is already suspended. Return false if not yet. 
+ return m_stack_state.exchange(stack_state::notified) == stack_state::suspended; + } + + void recall_owner() { + __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) == stack_state::suspended, nullptr); + m_stack_state.store(stack_state::notified, std::memory_order_relaxed); + m_is_owner_recalled.store(true, std::memory_order_release); + } + + struct resume_task final : public d1::task { + task_dispatcher& m_target; + explicit resume_task(task_dispatcher& target) : m_target(target) { + task_accessor::set_resume_trait(*this); + } + d1::task* execute(d1::execution_data& ed) override; + d1::task* cancel(d1::execution_data&) override { + __TBB_ASSERT(false, "The resume task cannot be canceled"); + return nullptr; + } + } m_resume_task; + + suspend_point_type(arena* a, std::size_t stack_size, task_dispatcher& target); +#endif /*__TBB_RESUMABLE_TASKS */ +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) +// structure was padded due to alignment specifier +#pragma warning( push ) +#pragma warning( disable: 4324 ) +#endif + +class alignas (max_nfs_size) task_dispatcher { +public: + // TODO: reconsider low level design to better organize dependencies and files. + friend class thread_data; + friend class arena_slot; + friend class nested_arena_context; + friend class delegated_task; + friend struct base_waiter; + + //! The list of possible post resume actions. + enum class post_resume_action { + invalid, + register_waiter, + cleanup, + notify, + none + }; + + //! The data of the current thread attached to this task_dispatcher + thread_data* m_thread_data{ nullptr }; + + //! The current execution data + execution_data_ext m_execute_data_ext; + + //! Properties + struct properties { + bool outermost{ true }; + bool fifo_tasks_allowed{ true }; + bool critical_task_allowed{ true }; + } m_properties; + + //! Position in the call stack when stealing is still allowed. + std::uintptr_t m_stealing_threshold{}; + + //! Suspend point (null if this task dispatcher has been never suspended) + suspend_point_type* m_suspend_point{ nullptr }; + + //! Attempt to get a task from the mailbox. + /** Gets a task only if it has not been executed by its sender or a thief + that has stolen it from the sender's task pool. Otherwise returns nullptr. + This method is intended to be used only by the thread extracting the proxy + from its mailbox. (In contrast to local task pool, mailbox can be read only + by its owner). 
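[Editor's note] The active/suspended/notified triple above resolves the race between a thread finishing a suspend and another thread trying to resume it: the resumer exchanges the state to notified and proceeds only if the old value was suspended, while the suspender exchanges to suspended and, if it sees notified, knows a resume already arrived and must replay it. A toy model of just that handshake (not the TBB types):

    #include <atomic>

    enum class stack_state { active, suspended, notified };

    struct toy_suspend_point {
        std::atomic<stack_state> state{stack_state::active};

        // Called by a thread that wants to resume this stack. Returns true only if
        // the stack had already finished suspending; otherwise the suspender will
        // observe `notified` and perform the resume itself.
        bool try_notify_resume() {
            return state.exchange(stack_state::notified) == stack_state::suspended;
        }

        // Called by the suspending thread once it has switched away from the stack.
        // Returns true if a resume request raced in and must be carried out now.
        bool finish_suspend() {
            return state.exchange(stack_state::suspended) == stack_state::notified;
        }
    };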
**/ + d1::task* get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation); + + d1::task* get_critical_task(d1::task*, execution_data_ext&, isolation_type, bool); + + template + d1::task* receive_or_steal_task(thread_data& tls, execution_data_ext& ed, Waiter& waiter, + isolation_type isolation, bool outermost, bool criticality_absence); + + template + d1::task* local_wait_for_all(d1::task * t, Waiter& waiter); + + task_dispatcher(const task_dispatcher&) = delete; + + bool can_steal(); +public: + task_dispatcher(arena* a); + + ~task_dispatcher() { + if (m_suspend_point) { + m_suspend_point->~suspend_point_type(); + cache_aligned_deallocate(m_suspend_point); + } + poison_pointer(m_thread_data); + poison_pointer(m_suspend_point); + } + + template + d1::task* local_wait_for_all(d1::task* t, Waiter& waiter); + + bool allow_fifo_task(bool new_state) { + bool old_state = m_properties.fifo_tasks_allowed; + m_properties.fifo_tasks_allowed = new_state; + return old_state; + } + + isolation_type set_isolation(isolation_type isolation) { + isolation_type prev = m_execute_data_ext.isolation; + m_execute_data_ext.isolation = isolation; + return prev; + } + + thread_data& get_thread_data() { + __TBB_ASSERT(m_thread_data, nullptr); + return *m_thread_data; + } + + static void execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx); + + void set_stealing_threshold(std::uintptr_t stealing_threshold) { + bool assert_condition = (stealing_threshold == 0 && m_stealing_threshold != 0) || + (stealing_threshold != 0 && m_stealing_threshold == 0); + __TBB_ASSERT_EX( assert_condition, nullptr ); + m_stealing_threshold = stealing_threshold; + } + + d1::task* get_inbox_or_critical_task(execution_data_ext&, mail_inbox&, isolation_type, bool); + d1::task* get_stream_or_critical_task(execution_data_ext&, arena&, task_stream&, + unsigned& /*hint_for_stream*/, isolation_type, + bool /*critical_allowed*/); + d1::task* steal_or_get_critical(execution_data_ext&, arena&, unsigned /*arena_index*/, FastRandom&, + isolation_type, bool /*critical_allowed*/); + +#if __TBB_RESUMABLE_TASKS + /* [[noreturn]] */ void co_local_wait_for_all() noexcept; + void suspend(suspend_callback_type suspend_callback, void* user_callback); + void internal_suspend(); + void do_post_resume_action(); + + bool resume(task_dispatcher& target); + suspend_point_type* get_suspend_point(); + void init_suspend_point(arena* a, std::size_t stack_size); + friend void internal_resume(suspend_point_type*); + void recall_point(); +#endif /* __TBB_RESUMABLE_TASKS */ +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) +#pragma warning( pop ) +#endif + +inline std::uintptr_t calculate_stealing_threshold(std::uintptr_t base, std::size_t stack_size) { + __TBB_ASSERT(stack_size != 0, "Stack size cannot be zero"); + __TBB_ASSERT(base > stack_size / 2, "Stack anchor calculation overflow"); + return base - stack_size / 2; +} + +struct task_group_context_impl { + static void destroy(d1::task_group_context&); + static void initialize(d1::task_group_context&); + static void register_with(d1::task_group_context&, thread_data*); + static void bind_to_impl(d1::task_group_context&, thread_data*); + static void bind_to(d1::task_group_context&, thread_data*); + static void propagate_task_group_state(d1::task_group_context&, std::atomic d1::task_group_context::*, d1::task_group_context&, uint32_t); + static bool cancel_group_execution(d1::task_group_context&); + static bool is_group_execution_cancelled(const 
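[Editor's note] calculate_stealing_threshold above anchors the limit at the middle of the worker's stack: stealing, which can nest arbitrarily deep task execution, is refused once the stack pointer drops below base - stack_size/2, leaving headroom for the task bodies themselves. A stand-alone illustration of the same test, using the address of a local variable as an approximation of the current stack position and assuming the usual downward-growing stack:

    #include <cstdint>
    #include <cstddef>

    inline std::uintptr_t stack_position() {
        int probe;                                   // address of a local ~ current stack depth
        return reinterpret_cast<std::uintptr_t>(&probe);
    }

    // `base` is captured near the top of the thread's stack (e.g. at thread start).
    inline bool may_steal(std::uintptr_t base, std::size_t stack_size) {
        const std::uintptr_t threshold = base - stack_size / 2;  // same formula as above
        return stack_position() > threshold;         // stacks grow downward on mainstream ABIs
    }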
d1::task_group_context&); + static void reset(d1::task_group_context&); + static void capture_fp_settings(d1::task_group_context&); + static void copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src); +}; + + +//! Forward declaration for scheduler entities +bool gcc_rethrow_exception_broken(); +void fix_broken_rethrow(); +//! Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info +void handle_perror(int error_code, const char* aux_info); + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_scheduler_common_H */ diff --git a/third_party/tbb/semaphore.cpp b/third_party/tbb/semaphore.cpp new file mode 100644 index 000000000..a1ac96b3d --- /dev/null +++ b/third_party/tbb/semaphore.cpp @@ -0,0 +1,93 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/semaphore.h" +#if __TBB_USE_SRWLOCK +#include "third_party/tbb/dynamic_link.h" // Refers to src/tbb, not include/tbb +// MISSING #include "tbb_misc.h" +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +// TODO: For new win UI port, we can use SRWLock API without dynamic_link etc. +#if __TBB_USE_SRWLOCK + +static std::atomic concmon_module_inited; + +void WINAPI init_binsem_using_event( SRWLOCK* h_ ) +{ + srwl_or_handle* shptr = (srwl_or_handle*) h_; + shptr->h = CreateEventEx( nullptr, nullptr, 0, EVENT_ALL_ACCESS|SEMAPHORE_ALL_ACCESS ); +} + +void WINAPI acquire_binsem_using_event( SRWLOCK* h_ ) +{ + srwl_or_handle* shptr = (srwl_or_handle*) h_; + WaitForSingleObjectEx( shptr->h, INFINITE, FALSE ); +} + +void WINAPI release_binsem_using_event( SRWLOCK* h_ ) +{ + srwl_or_handle* shptr = (srwl_or_handle*) h_; + SetEvent( shptr->h ); +} + +static void (WINAPI *__TBB_init_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&init_binsem_using_event; +static void (WINAPI *__TBB_acquire_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&acquire_binsem_using_event; +static void (WINAPI *__TBB_release_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&release_binsem_using_event; + +//! Table describing the how to link the handlers. 
+static const dynamic_link_descriptor SRWLLinkTable[] = { + DLD(InitializeSRWLock, __TBB_init_binsem), + DLD(AcquireSRWLockExclusive, __TBB_acquire_binsem), + DLD(ReleaseSRWLockExclusive, __TBB_release_binsem) +}; + +inline void init_concmon_module() +{ + __TBB_ASSERT( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event, nullptr); + if( dynamic_link( "Kernel32.dll", SRWLLinkTable, sizeof(SRWLLinkTable)/sizeof(dynamic_link_descriptor) ) ) { + __TBB_ASSERT( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event, nullptr); + __TBB_ASSERT( (uintptr_t)__TBB_acquire_binsem!=(uintptr_t)&acquire_binsem_using_event, nullptr); + __TBB_ASSERT( (uintptr_t)__TBB_release_binsem!=(uintptr_t)&release_binsem_using_event, nullptr); + } +} + +binary_semaphore::binary_semaphore() { + atomic_do_once( &init_concmon_module, concmon_module_inited ); + + __TBB_init_binsem( &my_sem.lock ); + if( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event ) + P(); +} + +binary_semaphore::~binary_semaphore() { + if( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event ) + CloseHandle( my_sem.h ); +} + +void binary_semaphore::P() { __TBB_acquire_binsem( &my_sem.lock ); } + +void binary_semaphore::V() { __TBB_release_binsem( &my_sem.lock ); } + +#endif /* __TBB_USE_SRWLOCK */ + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/semaphore.h b/third_party/tbb/semaphore.h new file mode 100644 index 000000000..281d18516 --- /dev/null +++ b/third_party/tbb/semaphore.h @@ -0,0 +1,331 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
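[Editor's note] The SRWLock path above is an instance of a general pattern: function pointers start out aimed at portable fallbacks and are retargeted at better OS primitives if those symbols can be resolved at run time. A POSIX-flavoured sketch of the same idea with dlopen/dlsym (the library name libfastwait.so and the symbol fast_wait are made up for illustration; link with -ldl where required):

    #include <dlfcn.h>

    static void do_wait_fallback() { /* portable implementation */ }
    static void (*do_wait)() = &do_wait_fallback;    // default: the fallback

    static void init_fast_path() {
        // Try to resolve an optional, faster implementation from a shared library.
        if (void* lib = dlopen("libfastwait.so", RTLD_LAZY)) {       // hypothetical library
            if (void* sym = dlsym(lib, "fast_wait")) {               // hypothetical symbol
                do_wait = reinterpret_cast<void (*)()>(sym);         // retarget the pointer
            }
        }
    }

    int main() {
        init_fast_path();   // in TBB the equivalent step is guarded by atomic_do_once
        do_wait();          // callers never care which implementation won
    }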
+*/ + +#ifndef __TBB_semaphore_H +#define __TBB_semaphore_H + +#include "third_party/tbb/detail/_utils.h" + +#if _WIN32||_WIN64 +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#elif __APPLE__ +// MISSING #include +#else +#include "libc/thread/semaphore.h" +#ifdef TBB_USE_DEBUG +#include "third_party/libcxx/cerrno" +#endif +#endif /*_WIN32||_WIN64*/ + +#include "third_party/libcxx/atomic" + +#if __unix__ +#if defined(__has_include) +#define __TBB_has_include __has_include +#else +#define __TBB_has_include(x) 0 +#endif + +/* Futex definitions */ +#include "libc/calls/calls.h" +#include "libc/calls/weirdtypes.h" +#include "libc/runtime/pathconf.h" +#include "libc/runtime/runtime.h" +#include "libc/runtime/sysconf.h" +#include "libc/sysv/consts/f.h" +#include "libc/sysv/consts/fileno.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/ok.h" +#include "libc/time/time.h" +#include "third_party/getopt/getopt.h" +#include "third_party/musl/crypt.h" +#include "third_party/musl/lockf.h" +#if defined(__linux__) || __TBB_has_include() +#include "libc/stdio/syscall.h" +#endif + +#if defined(SYS_futex) + +/* This section is included for Linux and some other systems that may support futexes.*/ + +#define __TBB_USE_FUTEX 1 + +/* +If available, use typical headers where futex API is defined. While Linux and OpenBSD +are known to provide such headers, other systems might have them as well. +*/ +#if defined(__linux__) || __TBB_has_include() +#include "libc/sysv/consts/futex.h" +#include "libc/sysv/consts/nr.h" +#elif defined(__OpenBSD__) || __TBB_has_include() +// MISSING #include +#endif + +#include "third_party/libcxx/climits" +#include "third_party/libcxx/cerrno" + +/* +Some systems might not define the macros or use different names. In such case we expect +the actual parameter values to match Linux: 0 for wait, 1 for wake. 
+*/ +#if defined(FUTEX_WAIT_PRIVATE) +#define __TBB_FUTEX_WAIT FUTEX_WAIT_PRIVATE +#elif defined(FUTEX_WAIT) +#define __TBB_FUTEX_WAIT FUTEX_WAIT +#else +#define __TBB_FUTEX_WAIT 0 +#endif + +#if defined(FUTEX_WAKE_PRIVATE) +#define __TBB_FUTEX_WAKE FUTEX_WAKE_PRIVATE +#elif defined(FUTEX_WAKE) +#define __TBB_FUTEX_WAKE FUTEX_WAKE +#else +#define __TBB_FUTEX_WAKE 1 +#endif + +#endif // SYS_futex +#endif // __unix__ + +namespace tbb { +namespace detail { +namespace r1 { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Futex implementation +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if __TBB_USE_FUTEX + +static inline int futex_wait( void *futex, int comparand ) { + int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAIT, comparand, nullptr, nullptr, 0); +#if TBB_USE_ASSERT + int e = errno; + __TBB_ASSERT(r == 0 || r == EWOULDBLOCK || (r == -1 && (e == EAGAIN || e == EINTR)), "futex_wait failed."); +#endif /* TBB_USE_ASSERT */ + return r; +} + +static inline int futex_wakeup_one( void *futex ) { + int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAKE, 1, nullptr, nullptr, 0); + __TBB_ASSERT(r == 0 || r == 1, "futex_wakeup_one: more than one thread woken up?"); + return r; +} + +// Additional possible methods that are not required right now +// static inline int futex_wakeup_all( void *futex ) { +// int r = ::syscall( SYS_futex,futex,__TBB_FUTEX_WAKE,INT_MAX,nullptr,nullptr,0 ); +// __TBB_ASSERT( r>=0, "futex_wakeup_all: error in waking up threads" ); +// return r; +// } + +#endif // __TBB_USE_FUTEX + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#if _WIN32||_WIN64 +typedef LONG sem_count_t; +//! Edsger Dijkstra's counting semaphore +class semaphore : no_copy { + static const int max_semaphore_cnt = MAXLONG; +public: + //! ctor + semaphore(size_t start_cnt_ = 0) {init_semaphore(start_cnt_);} + //! dtor + ~semaphore() {CloseHandle( sem );} + //! wait/acquire + void P() {WaitForSingleObjectEx( sem, INFINITE, FALSE );} + //! post/release + void V() {ReleaseSemaphore( sem, 1, nullptr);} +private: + HANDLE sem; + void init_semaphore(size_t start_cnt_) { + sem = CreateSemaphoreEx( nullptr, LONG(start_cnt_), max_semaphore_cnt, nullptr, 0, SEMAPHORE_ALL_ACCESS ); + } +}; +#elif __APPLE__ +//! Edsger Dijkstra's counting semaphore +class semaphore : no_copy { +public: + //! ctor + semaphore(int start_cnt_ = 0) { my_sem = dispatch_semaphore_create(start_cnt_); } + //! dtor + ~semaphore() { dispatch_release(my_sem); } + //! wait/acquire + void P() { + std::intptr_t ret = dispatch_semaphore_wait(my_sem, DISPATCH_TIME_FOREVER); + __TBB_ASSERT_EX(ret == 0, "dispatch_semaphore_wait() failed"); + } + //! post/release + void V() { dispatch_semaphore_signal(my_sem); } +private: + dispatch_semaphore_t my_sem; +}; +#else /* Linux/Unix */ +typedef uint32_t sem_count_t; +//! Edsger Dijkstra's counting semaphore +class semaphore : no_copy { +public: + //! ctor + semaphore(int start_cnt_ = 0 ) { init_semaphore( start_cnt_ ); } + + //! dtor + ~semaphore() { + int ret = sem_destroy( &sem ); + __TBB_ASSERT_EX( !ret, nullptr); + } + //! wait/acquire + void P() { + while( sem_wait( &sem )!=0 ) + __TBB_ASSERT( errno==EINTR, nullptr); + } + //! 
post/release + void V() { sem_post( &sem ); } +private: + sem_t sem; + void init_semaphore(int start_cnt_) { + int ret = sem_init( &sem, /*shared among threads*/ 0, start_cnt_ ); + __TBB_ASSERT_EX( !ret, nullptr); + } +}; +#endif /* _WIN32||_WIN64 */ + + +//! for performance reasons, we want specialized binary_semaphore +#if _WIN32||_WIN64 +#if !__TBB_USE_SRWLOCK +//! binary_semaphore for concurrent_monitor +class binary_semaphore : no_copy { +public: + //! ctor + binary_semaphore() { my_sem = CreateEventEx( nullptr, nullptr, 0, EVENT_ALL_ACCESS ); } + //! dtor + ~binary_semaphore() { CloseHandle( my_sem ); } + //! wait/acquire + void P() { WaitForSingleObjectEx( my_sem, INFINITE, FALSE ); } + //! post/release + void V() { SetEvent( my_sem ); } +private: + HANDLE my_sem; +}; +#else /* __TBB_USE_SRWLOCK */ + +union srwl_or_handle { + SRWLOCK lock; + HANDLE h; +}; + +//! binary_semaphore for concurrent_monitor +class binary_semaphore : no_copy { +public: + //! ctor + binary_semaphore(); + //! dtor + ~binary_semaphore(); + //! wait/acquire + void P(); + //! post/release + void V(); +private: + srwl_or_handle my_sem; +}; +#endif /* !__TBB_USE_SRWLOCK */ +#elif __APPLE__ +//! binary_semaphore for concurrent monitor +using binary_semaphore = semaphore; +#else /* Linux/Unix */ + +#if __TBB_USE_FUTEX +class binary_semaphore : no_copy { +// The implementation is equivalent to the "Mutex, Take 3" one +// in the paper "Futexes Are Tricky" by Ulrich Drepper +public: + //! ctor + binary_semaphore() { my_sem = 1; } + //! dtor + ~binary_semaphore() {} + //! wait/acquire + void P() { + int s = 0; + if( !my_sem.compare_exchange_strong( s, 1 ) ) { + if( s!=2 ) + s = my_sem.exchange( 2 ); + while( s!=0 ) { // This loop deals with spurious wakeup + futex_wait( &my_sem, 2 ); + s = my_sem.exchange( 2 ); + } + } + } + //! post/release + void V() { + __TBB_ASSERT( my_sem.load(std::memory_order_relaxed)>=1, "multiple V()'s in a row?" ); + if( my_sem.exchange( 0 )==2 ) + futex_wakeup_one( &my_sem ); + } +private: + std::atomic my_sem; // 0 - open; 1 - closed, no waits; 2 - closed, possible waits +}; +#else +typedef uint32_t sem_count_t; +//! binary_semaphore for concurrent monitor +class binary_semaphore : no_copy { +public: + //! ctor + binary_semaphore() { + int ret = sem_init( &my_sem, /*shared among threads*/ 0, 0 ); + __TBB_ASSERT_EX( !ret, nullptr); + } + //! dtor + ~binary_semaphore() { + int ret = sem_destroy( &my_sem ); + __TBB_ASSERT_EX( !ret, nullptr); + } + //! wait/acquire + void P() { + while( sem_wait( &my_sem )!=0 ) + __TBB_ASSERT( errno==EINTR, nullptr); + } + //! post/release + void V() { sem_post( &my_sem ); } +private: + sem_t my_sem; +}; +#endif /* __TBB_USE_FUTEX */ +#endif /* _WIN32||_WIN64 */ + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_semaphore_H */ diff --git a/third_party/tbb/small_object_pool.cpp b/third_party/tbb/small_object_pool.cpp new file mode 100644 index 000000000..74a970d9d --- /dev/null +++ b/third_party/tbb/small_object_pool.cpp @@ -0,0 +1,155 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
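[Editor's note] The futex-based binary_semaphore above implements the "take 3" mutex from Drepper's "Futexes Are Tricky": 0 = open, 1 = closed with no waiters, 2 = closed with possible waiters, and the wake syscall is issued only when the released value was 2. The same protocol can be written portably with C++20 std::atomic wait/notify standing in for the raw futex calls; this is a sketch of the algorithm, not the TBB code:

    #include <atomic>

    // Drepper-style binary semaphore; atomic wait/notify_one replace futex_wait/wake.
    class binary_sem {
        std::atomic<int> state{0};   // 0 open, 1 closed (no waiters), 2 closed (maybe waiters)
    public:
        void acquire() {             // P()
            int s = 0;
            if (!state.compare_exchange_strong(s, 1)) {
                if (s != 2) s = state.exchange(2);
                while (s != 0) {             // loop handles spurious wakeups
                    state.wait(2);           // sleep while the value is still 2
                    s = state.exchange(2);
                }
            }
        }
        void release() {             // V()
            if (state.exchange(0) == 2)      // someone may be sleeping
                state.notify_one();
        }
    };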
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/task_dispatcher.h" + +#include "third_party/libcxx/cstddef" + +namespace tbb { +namespace detail { +namespace r1 { + +small_object_pool_impl::small_object* const small_object_pool_impl::dead_public_list = + reinterpret_cast(1); + +void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes, const d1::execution_data& ed) { + auto& tls = static_cast(ed).task_disp->get_thread_data(); + auto pool = tls.my_small_object_pool; + return pool->allocate_impl(allocator, number_of_bytes); +} + +void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes) { + // TODO: optimize if the allocator contains a valid pool. + auto tls = governor::get_thread_data(); + auto pool = tls->my_small_object_pool; + return pool->allocate_impl(allocator, number_of_bytes); +} + +void* small_object_pool_impl::allocate_impl(d1::small_object_pool*& allocator, std::size_t number_of_bytes) +{ + small_object* obj{nullptr}; + + if (number_of_bytes <= small_object_size) { + if (m_private_list) { + obj = m_private_list; + m_private_list = m_private_list->next; + } else if (m_public_list.load(std::memory_order_relaxed)) { + // No fence required for read of my_public_list above, because std::atomic::exchange() has a fence. + obj = m_public_list.exchange(nullptr); + __TBB_ASSERT( obj, "another thread emptied the my_public_list" ); + m_private_list = obj->next; + } else { + obj = new (cache_aligned_allocate(small_object_size)) small_object{nullptr}; + ++m_private_counter; + } + } else { + obj = new (cache_aligned_allocate(number_of_bytes)) small_object{nullptr}; + } + allocator = this; + + // Return uninitialized memory for further construction on user side. 
+ obj->~small_object(); + return obj; +} + +void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes) { + auto pool = static_cast(&allocator); + auto tls = governor::get_thread_data(); + pool->deallocate_impl(ptr, number_of_bytes, *tls); +} + +void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes, const d1::execution_data& ed) { + auto& tls = static_cast(ed).task_disp->get_thread_data(); + auto pool = static_cast(&allocator); + pool->deallocate_impl(ptr, number_of_bytes, tls); +} + +void small_object_pool_impl::deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td) { + __TBB_ASSERT(ptr != nullptr, "pointer to deallocate should not be null"); + __TBB_ASSERT(number_of_bytes >= sizeof(small_object), "number of bytes should be at least sizeof(small_object)"); + + if (number_of_bytes <= small_object_size) { + auto obj = new (ptr) small_object{nullptr}; + if (td.my_small_object_pool == this) { + obj->next = m_private_list; + m_private_list = obj; + } else { + auto old_public_list = m_public_list.load(std::memory_order_relaxed); + + for (;;) { + if (old_public_list == dead_public_list) { + obj->~small_object(); + cache_aligned_deallocate(obj); + if (++m_public_counter == 0) + { + this->~small_object_pool_impl(); + cache_aligned_deallocate(this); + } + break; + } + obj->next = old_public_list; + if (m_public_list.compare_exchange_strong(old_public_list, obj)) { + break; + } + } + } + } else { + cache_aligned_deallocate(ptr); + } +} + +std::int64_t small_object_pool_impl::cleanup_list(small_object* list) +{ + std::int64_t removed_count{}; + + while (list) { + small_object* current = list; + list = list->next; + current->~small_object(); + cache_aligned_deallocate(current); + ++removed_count; + } + return removed_count; +} + +void small_object_pool_impl::destroy() +{ + // clean up private list and subtract the removed count from private counter + m_private_counter -= cleanup_list(m_private_list); + // Grab public list and place dead mark + small_object* public_list = m_public_list.exchange(dead_public_list); + // clean up public list and subtract from private (intentionally) counter + m_private_counter -= cleanup_list(public_list); + __TBB_ASSERT(m_private_counter >= 0, "Private counter may not be less than 0"); + // Equivalent to fetch_sub(m_private_counter) - m_private_counter. But we need to do it + // atomically with operator-= not to access m_private_counter after the subtraction. + auto new_value = m_public_counter -= m_private_counter; + // check if this method is responsible to clean up the resources + if (new_value == 0) { + this->~small_object_pool_impl(); + cache_aligned_deallocate(this); + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/small_object_pool_impl.h b/third_party/tbb/small_object_pool_impl.h new file mode 100644 index 000000000..7478880a9 --- /dev/null +++ b/third_party/tbb/small_object_pool_impl.h @@ -0,0 +1,60 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_small_object_pool_impl_H +#define __TBB_small_object_pool_impl_H + +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_utils.h" + +#include "third_party/libcxx/cstddef" +#include "third_party/libcxx/cstdint" +#include "third_party/libcxx/atomic" + + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_data; + +class small_object_pool_impl : public d1::small_object_pool +{ + static constexpr std::size_t small_object_size = 256; + struct small_object { + small_object* next; + }; + static small_object* const dead_public_list; +public: + void* allocate_impl(small_object_pool*& allocator, std::size_t number_of_bytes); + void deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td); + void destroy(); +private: + static std::int64_t cleanup_list(small_object* list); + ~small_object_pool_impl() = default; +private: + alignas(max_nfs_size) small_object* m_private_list; + std::int64_t m_private_counter{}; + alignas(max_nfs_size) std::atomic m_public_list; + std::atomic m_public_counter{}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* __TBB_small_object_pool_impl_H */ diff --git a/third_party/tbb/spin_mutex.h b/third_party/tbb/spin_mutex.h new file mode 100644 index 000000000..69d1047bb --- /dev/null +++ b/third_party/tbb/spin_mutex.h @@ -0,0 +1,135 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_spin_mutex_H +#define __TBB_spin_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_scoped_lock.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +#if __TBB_TSX_INTRINSICS_PRESENT +class rtm_mutex; +#endif + +/** A spin_mutex is a low-level synchronization primitive. + While locked, it causes the waiting threads to spin in a loop until the lock is released. + It should be used only for locking short critical sections + (typically less than 20 instructions) when fairness is not an issue. + If zero-initialized, the mutex is considered unheld. + @ingroup synchronization */ +class spin_mutex { +public: + //! Constructors + spin_mutex() noexcept : m_flag(false) { + create_itt_sync(this, "tbb::spin_mutex", ""); + }; + + //! Destructor + ~spin_mutex() = default; + + //! 
No Copy + spin_mutex(const spin_mutex&) = delete; + spin_mutex& operator=(const spin_mutex&) = delete; + + using scoped_lock = unique_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = false; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + /** Spin if the lock is taken */ + void lock() { + atomic_backoff backoff; + call_itt_notify(prepare, this); + while (m_flag.exchange(true)) backoff.pause(); + call_itt_notify(acquired, this); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + bool result = !m_flag.exchange(true); + if (result) { + call_itt_notify(acquired, this); + } + return result; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + m_flag.store(false, std::memory_order_release); + } + +protected: + std::atomic m_flag; +}; // class spin_mutex + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(spin_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(spin_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif //WIN +#else +inline void set_name(spin_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(spin_mutex&, const wchar_t*) {} +#endif // WIN +#endif +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::spin_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#include "third_party/tbb/detail/_rtm_mutex.h" + +namespace tbb { +inline namespace v1 { +#if __TBB_TSX_INTRINSICS_PRESENT + using speculative_spin_mutex = detail::d1::rtm_mutex; +#else + using speculative_spin_mutex = detail::d1::spin_mutex; +#endif +} +} + +#endif /* __TBB_spin_mutex_H */ + diff --git a/third_party/tbb/spin_rw_mutex.h b/third_party/tbb/spin_rw_mutex.h new file mode 100644 index 000000000..71cbdf7ec --- /dev/null +++ b/third_party/tbb/spin_rw_mutex.h @@ -0,0 +1,230 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_spin_rw_mutex_H +#define __TBB_spin_rw_mutex_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_mutex_common.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_scoped_lock.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace d1 { + +#if __TBB_TSX_INTRINSICS_PRESENT +class rtm_rw_mutex; +#endif + +//! Fast, unfair, spinning reader-writer lock with backoff and writer-preference +/** @ingroup synchronization */ +class spin_rw_mutex { +public: + //! Constructors + spin_rw_mutex() noexcept : m_state(0) { + create_itt_sync(this, "tbb::spin_rw_mutex", ""); + } + + //! 
Destructor + ~spin_rw_mutex() { + __TBB_ASSERT(!m_state, "destruction of an acquired mutex"); + } + + //! No Copy + spin_rw_mutex(const spin_rw_mutex&) = delete; + spin_rw_mutex& operator=(const spin_rw_mutex&) = delete; + + using scoped_lock = rw_scoped_lock; + + //! Mutex traits + static constexpr bool is_rw_mutex = true; + static constexpr bool is_recursive_mutex = false; + static constexpr bool is_fair_mutex = false; + + //! Acquire lock + void lock() { + call_itt_notify(prepare, this); + for (atomic_backoff backoff; ; backoff.pause()) { + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & BUSY)) { // no readers, no writers + if (m_state.compare_exchange_strong(s, WRITER)) + break; // successfully stored writer flag + backoff.reset(); // we could be very close to complete op. + } else if (!(s & WRITER_PENDING)) { // no pending writers + m_state |= WRITER_PENDING; + } + } + call_itt_notify(acquired, this); + } + + //! Try acquiring lock (non-blocking) + /** Return true if lock acquired; false otherwise. */ + bool try_lock() { + // for a writer: only possible to acquire if no active readers or writers + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & BUSY)) { // no readers, no writers; mask is 1..1101 + if (m_state.compare_exchange_strong(s, WRITER)) { + call_itt_notify(acquired, this); + return true; // successfully stored writer flag + } + } + return false; + } + + //! Release lock + void unlock() { + call_itt_notify(releasing, this); + m_state &= READERS; + } + + //! Lock shared ownership mutex + void lock_shared() { + call_itt_notify(prepare, this); + for (atomic_backoff b; ; b.pause()) { + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & (WRITER | WRITER_PENDING))) { // no writer or write requests + state_type prev_state = m_state.fetch_add(ONE_READER); + if (!(prev_state & WRITER)) { + break; // successfully stored increased number of readers + } + // writer got there first, undo the increment + m_state -= ONE_READER; + } + } + call_itt_notify(acquired, this); + __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers"); + } + + //! Try lock shared ownership mutex + bool try_lock_shared() { + // for a reader: acquire if no active or waiting writers + state_type s = m_state.load(std::memory_order_relaxed); + if (!(s & (WRITER | WRITER_PENDING))) { // no writers + state_type prev_state = m_state.fetch_add(ONE_READER); + if (!(prev_state & WRITER)) { // got the lock + call_itt_notify(acquired, this); + return true; // successfully stored increased number of readers + } + // writer got there first, undo the increment + m_state -= ONE_READER; + } + return false; + } + + //! Unlock shared ownership mutex + void unlock_shared() { + __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers"); + call_itt_notify(releasing, this); + m_state -= ONE_READER; + } + +protected: + /** Internal non ISO C++ standard API **/ + //! This API is used through the scoped_lock class + + //! Upgrade reader to become a writer. + /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ + bool upgrade() { + state_type s = m_state.load(std::memory_order_relaxed); + __TBB_ASSERT(s & READERS, "invalid state before upgrade: no readers "); + // Check and set writer-pending flag. 
+ // Required conditions: either no pending writers, or we are the only reader + // (with multiple readers and pending writer, another upgrade could have been requested) + while ((s & READERS) == ONE_READER || !(s & WRITER_PENDING)) { + if (m_state.compare_exchange_strong(s, s | WRITER | WRITER_PENDING)) { + atomic_backoff backoff; + while ((m_state.load(std::memory_order_relaxed) & READERS) != ONE_READER) backoff.pause(); + __TBB_ASSERT((m_state & (WRITER_PENDING|WRITER)) == (WRITER_PENDING | WRITER), "invalid state when upgrading to writer"); + // Both new readers and writers are blocked at this time + m_state -= (ONE_READER + WRITER_PENDING); + return true; // successfully upgraded + } + } + // Slow reacquire + unlock_shared(); + lock(); + return false; + } + + //! Downgrade writer to a reader + void downgrade() { + call_itt_notify(releasing, this); + m_state += (ONE_READER - WRITER); + __TBB_ASSERT(m_state & READERS, "invalid state after downgrade: no readers"); + } + + using state_type = std::intptr_t; + static constexpr state_type WRITER = 1; + static constexpr state_type WRITER_PENDING = 2; + static constexpr state_type READERS = ~(WRITER | WRITER_PENDING); + static constexpr state_type ONE_READER = 4; + static constexpr state_type BUSY = WRITER | READERS; + friend scoped_lock; + //! State of lock + /** Bit 0 = writer is holding lock + Bit 1 = request by a writer to acquire lock (hint to readers to wait) + Bit 2..N = number of readers holding lock */ + std::atomic m_state; +}; // class spin_rw_mutex + +#if TBB_USE_PROFILING_TOOLS +inline void set_name(spin_rw_mutex& obj, const char* name) { + itt_set_sync_name(&obj, name); +} +#if (_WIN32||_WIN64) +inline void set_name(spin_rw_mutex& obj, const wchar_t* name) { + itt_set_sync_name(&obj, name); +} +#endif // WIN +#else +inline void set_name(spin_rw_mutex&, const char*) {} +#if (_WIN32||_WIN64) +inline void set_name(spin_rw_mutex&, const wchar_t*) {} +#endif // WIN +#endif +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::spin_rw_mutex; +} // namespace v1 +namespace profiling { + using detail::d1::set_name; +} +} // namespace tbb + +#include "third_party/tbb/detail/_rtm_rw_mutex.h" + +namespace tbb { +inline namespace v1 { +#if __TBB_TSX_INTRINSICS_PRESENT + using speculative_spin_rw_mutex = detail::d1::rtm_rw_mutex; +#else + using speculative_spin_rw_mutex = detail::d1::spin_rw_mutex; +#endif +} +} + +#endif /* __TBB_spin_rw_mutex_H */ + diff --git a/third_party/tbb/task.cpp b/third_party/tbb/task.cpp new file mode 100644 index 000000000..c40017376 --- /dev/null +++ b/third_party/tbb/task.cpp @@ -0,0 +1,228 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Do not include task.h directly. 
Use scheduler_common.h instead +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/task_dispatcher.h" +#include "third_party/tbb/waiters.h" +#include "third_party/tbb/itt_notify.h" + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/partitioner.h" +#include "third_party/tbb/task.h" + +#include "third_party/libcxx/cstring" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// resumable tasks +//------------------------------------------------------------------------ +#if __TBB_RESUMABLE_TASKS + +void suspend(suspend_callback_type suspend_callback, void* user_callback) { + thread_data& td = *governor::get_thread_data(); + td.my_task_dispatcher->suspend(suspend_callback, user_callback); + // Do not access td after suspend. +} + +void resume(suspend_point_type* sp) { + assert_pointers_valid(sp, sp->m_arena); + task_dispatcher& task_disp = sp->m_resume_task.m_target; + + if (sp->try_notify_resume()) { + // TODO: remove this work-around + // Prolong the arena's lifetime while all coroutines are alive + // (otherwise the arena can be destroyed while some tasks are suspended). + arena& a = *sp->m_arena; + a.my_references += arena::ref_worker; + + if (task_disp.m_properties.critical_task_allowed) { + // The target is not in the process of executing critical task, so the resume task is not critical. + a.my_resume_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random)); + } else { + #if __TBB_PREVIEW_CRITICAL_TASKS + // The target is in the process of executing critical task, so the resume task is critical. + a.my_critical_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random)); + #endif + } + // Do not access target after that point. + a.advertise_new_work(); + // Release our reference to my_arena. + a.on_thread_leaving(arena::ref_worker); + } + +} + +suspend_point_type* current_suspend_point() { + thread_data& td = *governor::get_thread_data(); + return td.my_task_dispatcher->get_suspend_point(); +} + +task_dispatcher& create_coroutine(thread_data& td) { + // We may have some task dispatchers cached + task_dispatcher* task_disp = td.my_arena->my_co_cache.pop(); + if (!task_disp) { + void* ptr = cache_aligned_allocate(sizeof(task_dispatcher)); + task_disp = new(ptr) task_dispatcher(td.my_arena); + task_disp->init_suspend_point(td.my_arena, td.my_arena->my_threading_control->worker_stack_size()); + } + // Prolong the arena's lifetime until all coroutines is alive + // (otherwise the arena can be destroyed while some tasks are suspended). + // TODO: consider behavior if there are more than 4K external references. + td.my_arena->my_references += arena::ref_external; + return *task_disp; +} + +void task_dispatcher::internal_suspend() { + __TBB_ASSERT(m_thread_data != nullptr, nullptr); + + arena_slot* slot = m_thread_data->my_arena_slot; + __TBB_ASSERT(slot != nullptr, nullptr); + + task_dispatcher& default_task_disp = slot->default_task_dispatcher(); + // TODO: simplify the next line, e.g. is_task_dispatcher_recalled( task_dispatcher& ) + bool is_recalled = default_task_disp.get_suspend_point()->m_is_owner_recalled.load(std::memory_order_acquire); + task_dispatcher& target = is_recalled ? 
default_task_disp : create_coroutine(*m_thread_data); + + resume(target); + + if (m_properties.outermost) { + recall_point(); + } +} + +void task_dispatcher::suspend(suspend_callback_type suspend_callback, void* user_callback) { + __TBB_ASSERT(suspend_callback != nullptr, nullptr); + __TBB_ASSERT(user_callback != nullptr, nullptr); + suspend_callback(user_callback, get_suspend_point()); + + __TBB_ASSERT(m_thread_data != nullptr, nullptr); + __TBB_ASSERT(m_thread_data->my_post_resume_action == post_resume_action::none, nullptr); + __TBB_ASSERT(m_thread_data->my_post_resume_arg == nullptr, nullptr); + internal_suspend(); +} + +bool task_dispatcher::resume(task_dispatcher& target) { + // Do not create non-trivial objects on the stack of this function. They might never be destroyed + { + thread_data* td = m_thread_data; + __TBB_ASSERT(&target != this, "We cannot resume to ourself"); + __TBB_ASSERT(td != nullptr, "This task dispatcher must be attach to a thread data"); + __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher"); + + // Change the task dispatcher + td->detach_task_dispatcher(); + td->attach_task_dispatcher(target); + } + __TBB_ASSERT(m_suspend_point != nullptr, "Suspend point must be created"); + __TBB_ASSERT(target.m_suspend_point != nullptr, "Suspend point must be created"); + // Swap to the target coroutine. + + m_suspend_point->resume(target.m_suspend_point); + // Pay attention that m_thread_data can be changed after resume + if (m_thread_data) { + thread_data* td = m_thread_data; + __TBB_ASSERT(td != nullptr, "This task dispatcher must be attach to a thread data"); + __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher"); + do_post_resume_action(); + + // Remove the recall flag if the thread in its original task dispatcher + arena_slot* slot = td->my_arena_slot; + __TBB_ASSERT(slot != nullptr, nullptr); + if (this == slot->my_default_task_dispatcher) { + __TBB_ASSERT(m_suspend_point != nullptr, nullptr); + m_suspend_point->m_is_owner_recalled.store(false, std::memory_order_relaxed); + } + return true; + } + return false; +} + +void task_dispatcher::do_post_resume_action() { + thread_data* td = m_thread_data; + switch (td->my_post_resume_action) { + case post_resume_action::register_waiter: + { + __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); + static_cast(td->my_post_resume_arg)->notify(); + break; + } + case post_resume_action::cleanup: + { + __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); + task_dispatcher* to_cleanup = static_cast(td->my_post_resume_arg); + // Release coroutine's reference to my_arena + td->my_arena->on_thread_leaving(arena::ref_external); + // Cache the coroutine for possible later re-usage + td->my_arena->my_co_cache.push(to_cleanup); + break; + } + case post_resume_action::notify: + { + __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); + suspend_point_type* sp = static_cast(td->my_post_resume_arg); + sp->recall_owner(); + // Do not access sp because it can be destroyed after recall + + auto is_our_suspend_point = [sp] (market_context ctx) { + return std::uintptr_t(sp) == ctx.my_uniq_addr; + }; + td->my_arena->get_waiting_threads_monitor().notify(is_our_suspend_point); + break; + } + default: + __TBB_ASSERT(td->my_post_resume_action == post_resume_action::none, "Unknown post resume action"); + __TBB_ASSERT(td->my_post_resume_arg == nullptr, "The post 
resume argument should not be set"); + } + td->clear_post_resume_action(); +} + +#else + +void suspend(suspend_callback_type, void*) { + __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); +} + +void resume(suspend_point_type*) { + __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); +} + +suspend_point_type* current_suspend_point() { + __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); + return nullptr; +} + +#endif /* __TBB_RESUMABLE_TASKS */ + +void notify_waiters(std::uintptr_t wait_ctx_addr) { + auto is_related_wait_ctx = [&] (market_context context) { + return wait_ctx_addr == context.my_uniq_addr; + }; + + governor::get_thread_data()->my_arena->get_waiting_threads_monitor().notify(is_related_wait_ctx); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/task.h b/third_party/tbb/task.h new file mode 100644 index 000000000..691c18341 --- /dev/null +++ b/third_party/tbb/task.h @@ -0,0 +1,38 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_task_H +#define __TBB_task_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_task.h" + +namespace tbb { +inline namespace v1 { +namespace task { +#if __TBB_RESUMABLE_TASKS + using detail::d1::suspend_point; + using detail::d1::resume; + using detail::d1::suspend; +#endif /* __TBB_RESUMABLE_TASKS */ + using detail::d1::current_context; +} // namespace task +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_task_H */ diff --git a/third_party/tbb/task_arena.h b/third_party/tbb/task_arena.h new file mode 100644 index 000000000..2b3fbda53 --- /dev/null +++ b/third_party/tbb/task_arena.h @@ -0,0 +1,500 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_task_arena_H +#define __TBB_task_arena_H + +#include "third_party/tbb/detail/_config.h" + +#include "third_party/tbb/detail/_aligned_space.h" +#include "third_party/tbb/detail/_attach.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_task.h" + +#include "third_party/tbb/detail/_task_handle.h" + +#if __TBB_ARENA_BINDING +#include "third_party/tbb/info.h" +#endif /*__TBB_ARENA_BINDING*/ + +namespace tbb { +namespace detail { + +namespace d1 { + +template +class task_arena_function : public delegate_base { + F &my_func; + aligned_space my_return_storage; + bool my_constructed{false}; + // The function should be called only once. + bool operator()() const override { + new (my_return_storage.begin()) R(my_func()); + return true; + } +public: + task_arena_function(F& f) : my_func(f) {} + // The function can be called only after operator() and only once. + R consume_result() { + my_constructed = true; + return std::move(*(my_return_storage.begin())); + } + ~task_arena_function() override { + if (my_constructed) { + my_return_storage.begin()->~R(); + } + } +}; + +template +class task_arena_function : public delegate_base { + F &my_func; + bool operator()() const override { + my_func(); + return true; + } +public: + task_arena_function(F& f) : my_func(f) {} + void consume_result() const {} + + friend class task_arena_base; +}; + +class task_arena_base; +class task_scheduler_observer; +} // namespace d1 + +namespace r1 { +class arena; +struct task_arena_impl; + +TBB_EXPORT void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool); +TBB_EXPORT void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::task_arena_base&); +TBB_EXPORT int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base*); +TBB_EXPORT void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base& d, std::intptr_t); + +TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_arena_base*); +TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_group_context&, d1::task_arena_base*); +TBB_EXPORT void __TBB_EXPORTED_FUNC submit(d1::task&, d1::task_group_context&, arena*, std::uintptr_t); +} // namespace r1 + +namespace d2 { +inline void enqueue_impl(task_handle&& th, d1::task_arena_base* ta) { + __TBB_ASSERT(th != nullptr, "Attempt to schedule empty task_handle"); + + auto& ctx = task_handle_accessor::ctx_of(th); + + // Do not access th after release + r1::enqueue(*task_handle_accessor::release(th), ctx, ta); +} +} //namespace d2 + +namespace d1 { + +static constexpr unsigned num_priority_levels = 3; +static constexpr int priority_stride = INT_MAX / (num_priority_levels + 1); + +class task_arena_base { + friend struct r1::task_arena_impl; + friend void r1::observe(d1::task_scheduler_observer&, bool); +public: + enum class priority : int { + low = 1 * priority_stride, + normal = 2 * priority_stride, + high = 3 * priority_stride + }; +#if __TBB_ARENA_BINDING + using constraints = tbb::detail::d1::constraints; +#endif /*__TBB_ARENA_BINDING*/ +protected: + //! Special settings + intptr_t my_version_and_traits; + + std::atomic my_initialization_state; + + //! 
nullptr if not currently initialized. + std::atomic my_arena; + static_assert(sizeof(std::atomic) == sizeof(r1::arena*), + "To preserve backward compatibility we need the equal size of an atomic pointer and a pointer"); + + //! Concurrency level for deferred initialization + int my_max_concurrency; + + //! Reserved slots for external threads + unsigned my_num_reserved_slots; + + //! Arena priority + priority my_priority; + + //! The NUMA node index to which the arena will be attached + numa_node_id my_numa_id; + + //! The core type index to which arena will be attached + core_type_id my_core_type; + + //! Number of threads per core + int my_max_threads_per_core; + + // Backward compatibility checks. + core_type_id core_type() const { + return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_core_type : automatic; + } + int max_threads_per_core() const { + return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_max_threads_per_core : automatic; + } + + enum { + default_flags = 0 + , core_type_support_flag = 1 + }; + + task_arena_base(int max_concurrency, unsigned reserved_for_masters, priority a_priority) + : my_version_and_traits(default_flags | core_type_support_flag) + , my_initialization_state(do_once_state::uninitialized) + , my_arena(nullptr) + , my_max_concurrency(max_concurrency) + , my_num_reserved_slots(reserved_for_masters) + , my_priority(a_priority) + , my_numa_id(automatic) + , my_core_type(automatic) + , my_max_threads_per_core(automatic) + {} + +#if __TBB_ARENA_BINDING + task_arena_base(const constraints& constraints_, unsigned reserved_for_masters, priority a_priority) + : my_version_and_traits(default_flags | core_type_support_flag) + , my_initialization_state(do_once_state::uninitialized) + , my_arena(nullptr) + , my_max_concurrency(constraints_.max_concurrency) + , my_num_reserved_slots(reserved_for_masters) + , my_priority(a_priority) + , my_numa_id(constraints_.numa_id) + , my_core_type(constraints_.core_type) + , my_max_threads_per_core(constraints_.max_threads_per_core) + {} +#endif /*__TBB_ARENA_BINDING*/ +public: + //! Typedef for number of threads that is automatic. + static const int automatic = -1; + static const int not_initialized = -2; +}; + +template +R isolate_impl(F& f) { + task_arena_function func(f); + r1::isolate_within_arena(func, /*isolation*/ 0); + return func.consume_result(); +} + +template +class enqueue_task : public task { + small_object_allocator m_allocator; + const F m_func; + + void finalize(const execution_data& ed) { + m_allocator.delete_object(this, ed); + } + task* execute(execution_data& ed) override { + m_func(); + finalize(ed); + return nullptr; + } + task* cancel(execution_data&) override { + __TBB_ASSERT_RELEASE(false, "Unhandled exception from enqueue task is caught"); + return nullptr; + } +public: + enqueue_task(const F& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(f) {} + enqueue_task(F&& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(std::move(f)) {} +}; + +template +void enqueue_impl(F&& f, task_arena_base* ta) { + small_object_allocator alloc{}; + r1::enqueue(*alloc.new_object::type>>(std::forward(f), alloc), ta); +} +/** 1-to-1 proxy representation class of scheduler's arena + * Constructors set up settings only, real construction is deferred till the first method invocation + * Destructor only removes one of the references to the inner arena representation. 
+ * Final destruction happens when all the references (and the work) are gone. + */ +class task_arena : public task_arena_base { + + void mark_initialized() { + __TBB_ASSERT( my_arena.load(std::memory_order_relaxed), "task_arena initialization is incomplete" ); + my_initialization_state.store(do_once_state::initialized, std::memory_order_release); + } + + template + R execute_impl(F& f) { + initialize(); + task_arena_function func(f); + r1::execute(*this, func); + return func.consume_result(); + } +public: + //! Creates task_arena with certain concurrency limits + /** Sets up settings only, real construction is deferred till the first method invocation + * @arg max_concurrency specifies total number of slots in arena where threads work + * @arg reserved_for_masters specifies number of slots to be used by external threads only. + * Value of 1 is default and reflects behavior of implicit arenas. + **/ + task_arena(int max_concurrency_ = automatic, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + : task_arena_base(max_concurrency_, reserved_for_masters, a_priority) + {} + +#if __TBB_ARENA_BINDING + //! Creates task arena pinned to certain NUMA node + task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + : task_arena_base(constraints_, reserved_for_masters, a_priority) + {} + + //! Copies settings from another task_arena + task_arena(const task_arena &s) // copy settings but not the reference or instance + : task_arena_base( + constraints{} + .set_numa_id(s.my_numa_id) + .set_max_concurrency(s.my_max_concurrency) + .set_core_type(s.my_core_type) + .set_max_threads_per_core(s.my_max_threads_per_core) + , s.my_num_reserved_slots, s.my_priority) + {} +#else + //! Copies settings from another task_arena + task_arena(const task_arena& a) // copy settings but not the reference or instance + : task_arena_base(a.my_max_concurrency, a.my_num_reserved_slots, a.my_priority) + {} +#endif /*__TBB_ARENA_BINDING*/ + + //! Tag class used to indicate the "attaching" constructor + struct attach {}; + + //! Creates an instance of task_arena attached to the current arena of the thread + explicit task_arena( attach ) + : task_arena_base(automatic, 1, priority::normal) // use default settings if attach fails + { + if (r1::attach(*this)) { + mark_initialized(); + } + } + + //! Creates an instance of task_arena attached to the current arena of the thread + explicit task_arena(d1::attach) + : task_arena(attach{}) + {} + + //! Forces allocation of the resources for the task_arena as specified in constructor arguments + void initialize() { + atomic_do_once([this]{ r1::initialize(*this); }, my_initialization_state); + } + + //! 
Overrides concurrency level and forces initialization of internal representation + void initialize(int max_concurrency_, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + { + __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); + if( !is_active() ) { + my_max_concurrency = max_concurrency_; + my_num_reserved_slots = reserved_for_masters; + my_priority = a_priority; + r1::initialize(*this); + mark_initialized(); + } + } + +#if __TBB_ARENA_BINDING + void initialize(constraints constraints_, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal) + { + __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); + if( !is_active() ) { + my_numa_id = constraints_.numa_id; + my_max_concurrency = constraints_.max_concurrency; + my_core_type = constraints_.core_type; + my_max_threads_per_core = constraints_.max_threads_per_core; + my_num_reserved_slots = reserved_for_masters; + my_priority = a_priority; + r1::initialize(*this); + mark_initialized(); + } + } +#endif /*__TBB_ARENA_BINDING*/ + + //! Attaches this instance to the current arena of the thread + void initialize(attach) { + // TODO: decide if this call must be thread-safe + __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); + if( !is_active() ) { + if ( !r1::attach(*this) ) { + r1::initialize(*this); + } + mark_initialized(); + } + } + + //! Attaches this instance to the current arena of the thread + void initialize(d1::attach) { + initialize(attach{}); + } + + //! Removes the reference to the internal arena representation. + //! Not thread safe wrt concurrent invocations of other methods. + void terminate() { + if( is_active() ) { + r1::terminate(*this); + my_initialization_state.store(do_once_state::uninitialized, std::memory_order_relaxed); + } + } + + //! Removes the reference to the internal arena representation, and destroys the external object. + //! Not thread safe wrt concurrent invocations of other methods. + ~task_arena() { + terminate(); + } + + //! Returns true if the arena is active (initialized); false otherwise. + //! The name was chosen to match a task_scheduler_init method with the same semantics. + bool is_active() const { + return my_initialization_state.load(std::memory_order_acquire) == do_once_state::initialized; + } + + //! Enqueues a task into the arena to process a functor, and immediately returns. + //! Does not require the calling thread to join the arena + + template + void enqueue(F&& f) { + initialize(); + enqueue_impl(std::forward(f), this); + } + + //! Enqueues a task into the arena to process a functor wrapped in task_handle, and immediately returns. + //! Does not require the calling thread to join the arena + void enqueue(d2::task_handle&& th) { + initialize(); + d2::enqueue_impl(std::move(th), this); + } + + //! Joins the arena and executes a mutable functor, then returns + //! If not possible to join, wraps the functor into a task, enqueues it and waits for task completion + //! Can decrement the arena demand for workers, causing a worker to leave and free a slot to the calling thread + //! Since C++11, the method returns the value returned by functor (prior to C++11 it returns void). + template + auto execute(F&& f) -> decltype(f()) { + return execute_impl(f); + } + +#if __TBB_EXTRA_DEBUG + //! 
Returns my_num_reserved_slots + int debug_reserved_slots() const { + // Handle special cases inside the library + return my_num_reserved_slots; + } + + //! Returns my_max_concurrency + int debug_max_concurrency() const { + // Handle special cases inside the library + return my_max_concurrency; + } + + //! Wait for all work in the arena to be completed + //! Even submitted by other application threads + //! Joins arena if/when possible (in the same way as execute()) + void debug_wait_until_empty() { + initialize(); + r1::wait(*this); + } +#endif //__TBB_EXTRA_DEBUG + + //! Returns the maximal number of threads that can work inside the arena + int max_concurrency() const { + // Handle special cases inside the library + return (my_max_concurrency > 1) ? my_max_concurrency : r1::max_concurrency(this); + } + + friend void submit(task& t, task_arena& ta, task_group_context& ctx, bool as_critical) { + __TBB_ASSERT(ta.is_active(), nullptr); + call_itt_task_notify(releasing, &t); + r1::submit(t, ctx, ta.my_arena.load(std::memory_order_relaxed), as_critical ? 1 : 0); + } +}; + +//! Executes a mutable functor in isolation within the current task arena. +//! Since C++11, the method returns the value returned by functor (prior to C++11 it returns void). +template +inline auto isolate(F&& f) -> decltype(f()) { + return isolate_impl(f); +} + +//! Returns the index, aka slot number, of the calling thread in its current arena +inline int current_thread_index() { + slot_id idx = r1::execution_slot(nullptr); + return idx == slot_id(-1) ? task_arena_base::not_initialized : int(idx); +} + +#if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +inline bool is_inside_task() { + return nullptr != current_context(); +} +#endif //__TBB_PREVIEW_TASK_GROUP_EXTENSIONS + +//! Returns the maximal number of threads that can work inside the arena +inline int max_concurrency() { + return r1::max_concurrency(nullptr); +} + +inline void enqueue(d2::task_handle&& th) { + d2::enqueue_impl(std::move(th), nullptr); +} + +template +inline void enqueue(F&& f) { + enqueue_impl(std::forward(f), nullptr); +} + +using r1::submit; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::task_arena; +using detail::d1::attach; + +#if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +using detail::d1::is_inside_task; +#endif + +namespace this_task_arena { +using detail::d1::current_thread_index; +using detail::d1::max_concurrency; +using detail::d1::isolate; + +using detail::d1::enqueue; +} // namespace this_task_arena + +} // inline namespace v1 + +} // namespace tbb +#endif /* __TBB_task_arena_H */ diff --git a/third_party/tbb/task_dispatcher.cpp b/third_party/tbb/task_dispatcher.cpp new file mode 100644 index 000000000..0ab1cb9ac --- /dev/null +++ b/third_party/tbb/task_dispatcher.cpp @@ -0,0 +1,245 @@ +// clang-format off +/* + Copyright (c) 2020-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/task_dispatcher.h" +#include "third_party/tbb/waiters.h" + +namespace tbb { +namespace detail { +namespace r1 { + +static inline void spawn_and_notify(d1::task& t, arena_slot* slot, arena* a) { + slot->spawn(t); + a->advertise_new_work(); + // TODO: TBB_REVAMP_TODO slot->assert_task_pool_valid(); +} + +void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx) { + thread_data* tls = governor::get_thread_data(); + task_group_context_impl::bind_to(ctx, tls); + arena* a = tls->my_arena; + arena_slot* slot = tls->my_arena_slot; + // Capture current context + task_accessor::context(t) = &ctx; + // Mark isolation + task_accessor::isolation(t) = tls->my_task_dispatcher->m_execute_data_ext.isolation; + spawn_and_notify(t, slot, a); +} + +void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id) { + thread_data* tls = governor::get_thread_data(); + task_group_context_impl::bind_to(ctx, tls); + arena* a = tls->my_arena; + arena_slot* slot = tls->my_arena_slot; + execution_data_ext& ed = tls->my_task_dispatcher->m_execute_data_ext; + + // Capture context + task_accessor::context(t) = &ctx; + // Mark isolation + task_accessor::isolation(t) = ed.isolation; + + if ( id != d1::no_slot && id != tls->my_arena_index && id < a->my_num_slots) { + // Allocate proxy task + d1::small_object_allocator alloc{}; + auto proxy = alloc.new_object(static_cast(ed)); + // Mark as a proxy + task_accessor::set_proxy_trait(*proxy); + // Mark isolation for the proxy task + task_accessor::isolation(*proxy) = ed.isolation; + // Deallocation hint (tls) from the task allocator + proxy->allocator = alloc; + proxy->slot = id; + proxy->outbox = &a->mailbox(id); + // Mark proxy as present in both locations (sender's task pool and destination mailbox) + proxy->task_and_tag = intptr_t(&t) | task_proxy::location_mask; + // Mail the proxy - after this point t may be destroyed by another thread at any moment. + proxy->outbox->push(proxy); + // Spawn proxy to the local task pool + spawn_and_notify(*proxy, slot, a); + } else { + spawn_and_notify(t, slot, a); + } +} + +void __TBB_EXPORTED_FUNC submit(d1::task& t, d1::task_group_context& ctx, arena* a, std::uintptr_t as_critical) { + suppress_unused_warning(as_critical); + assert_pointer_valid(a); + thread_data& tls = *governor::get_thread_data(); + + // TODO revamp: for each use case investigate neccesity to make this call + task_group_context_impl::bind_to(ctx, &tls); + task_accessor::context(t) = &ctx; + // TODO revamp: consider respecting task isolation if this call is being made by external thread + task_accessor::isolation(t) = tls.my_task_dispatcher->m_execute_data_ext.isolation; + + // TODO: consider code refactoring when lane selection mechanism is unified. + + if ( tls.is_attached_to(a) ) { + arena_slot* slot = tls.my_arena_slot; +#if __TBB_PREVIEW_CRITICAL_TASKS + if( as_critical ) { + a->my_critical_task_stream.push( &t, subsequent_lane_selector(slot->critical_hint()) ); + } else +#endif + { + slot->spawn(t); + } + } else { + random_lane_selector lane_selector{tls.my_random}; +#if !__TBB_PREVIEW_CRITICAL_TASKS + suppress_unused_warning(as_critical); +#else + if ( as_critical ) { + a->my_critical_task_stream.push( &t, lane_selector ); + } else +#endif + { + // Avoid joining the arena the thread is not currently in. + a->my_fifo_task_stream.push( &t, lane_selector ); + } + } + // It is assumed that some thread will explicitly wait in the arena the task is submitted + // into. 
Therefore, no need to utilize mandatory concurrency here. + a->advertise_new_work(); +} + +void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) { + task_accessor::context(t) = &t_ctx; + task_dispatcher::execute_and_wait(&t, wait_ctx, w_ctx); +} + +void __TBB_EXPORTED_FUNC wait(d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) { + // Enter the task dispatch loop without a task + task_dispatcher::execute_and_wait(nullptr, wait_ctx, w_ctx); +} + +d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data* ed) { + if (ed) { + const execution_data_ext* ed_ext = static_cast(ed); + assert_pointers_valid(ed_ext->task_disp, ed_ext->task_disp->m_thread_data); + return ed_ext->task_disp->m_thread_data->my_arena_index; + } else { + thread_data* td = governor::get_thread_data_if_initialized(); + return td ? td->my_arena_index : d1::slot_id(-1); + } +} + +d1::task_group_context* __TBB_EXPORTED_FUNC current_context() { + thread_data* td = governor::get_thread_data(); + assert_pointers_valid(td, td->my_task_dispatcher); + + task_dispatcher* task_disp = td->my_task_dispatcher; + if (task_disp->m_properties.outermost) { + // No one task is executed, so no execute_data. + return nullptr; + } else { + return td->my_task_dispatcher->m_execute_data_ext.context; + } +} + +void task_dispatcher::execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) { + // Get an associated task dispatcher + thread_data* tls = governor::get_thread_data(); + __TBB_ASSERT(tls->my_task_dispatcher != nullptr, nullptr); + task_dispatcher& local_td = *tls->my_task_dispatcher; + + // TODO: factor out the binding to execute_and_wait_impl + if (t) { + task_group_context_impl::bind_to(*task_accessor::context(*t), tls); + // Propagate the isolation to the task executed without spawn. + task_accessor::isolation(*t) = tls->my_task_dispatcher->m_execute_data_ext.isolation; + } + + // Waiting on special object tied to a waiting thread. 
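+    // The waiter couples this thread's wait_context with its arena: inside
+    // local_wait_for_all() it is consulted (continue_execution/pause) to decide whether to
+    // keep dispatching tasks or to back off until wait_ctx reports completion.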
+ external_waiter waiter{ *tls->my_arena, wait_ctx }; + t = local_td.local_wait_for_all(t, waiter); + __TBB_ASSERT_EX(t == nullptr, "External waiter must not leave dispatch loop with a task"); + + // The external thread couldn't exit the dispatch loop in an idle state + if (local_td.m_thread_data->my_inbox.is_idle_state(true)) { + local_td.m_thread_data->my_inbox.set_is_idle(false); + } + + auto exception = w_ctx.my_exception.load(std::memory_order_acquire); + if (exception) { + __TBB_ASSERT(w_ctx.is_group_execution_cancelled(), "The task group context with an exception should be canceled."); + exception->throw_self(); + } +} + +#if __TBB_RESUMABLE_TASKS + +#if _WIN32 +/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* addr) noexcept +#else +/* [[noreturn]] */ void co_local_wait_for_all(unsigned hi, unsigned lo) noexcept +#endif +{ +#if !_WIN32 + std::uintptr_t addr = lo; + __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr); + addr += std::uintptr_t(std::uint64_t(hi) << 32); +#endif + task_dispatcher& task_disp = *reinterpret_cast(addr); + assert_pointers_valid(task_disp.m_thread_data, task_disp.m_thread_data->my_arena); + task_disp.set_stealing_threshold(task_disp.m_thread_data->my_arena->calculate_stealing_threshold()); + __TBB_ASSERT(task_disp.can_steal(), nullptr); + task_disp.co_local_wait_for_all(); + // This code is unreachable +} + +/* [[noreturn]] */ void task_dispatcher::co_local_wait_for_all() noexcept { + // Do not create non-trivial objects on the stack of this function. They will never be destroyed. + assert_pointer_valid(m_thread_data); + + m_suspend_point->finilize_resume(); + // Basically calls the user callback passed to the tbb::task::suspend function + do_post_resume_action(); + + // Endless loop here because coroutine could be reused + d1::task* resume_task{}; + do { + arena* a = m_thread_data->my_arena; + coroutine_waiter waiter(*a); + resume_task = local_wait_for_all(nullptr, waiter); + assert_task_valid(resume_task); + __TBB_ASSERT(this == m_thread_data->my_task_dispatcher, nullptr); + + m_thread_data->set_post_resume_action(post_resume_action::cleanup, this); + + } while (resume(static_cast(resume_task)->m_target)); + // This code might be unreachable +} + +d1::suspend_point task_dispatcher::get_suspend_point() { + if (m_suspend_point == nullptr) { + assert_pointer_valid(m_thread_data); + // 0 means that we attach this task dispatcher to the current stack + init_suspend_point(m_thread_data->my_arena, 0); + } + assert_pointer_valid(m_suspend_point); + return m_suspend_point; +} +void task_dispatcher::init_suspend_point(arena* a, std::size_t stack_size) { + __TBB_ASSERT(m_suspend_point == nullptr, nullptr); + m_suspend_point = new(cache_aligned_allocate(sizeof(suspend_point_type))) + suspend_point_type(a, stack_size, *this); +} +#endif /* __TBB_RESUMABLE_TASKS */ +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/task_dispatcher.h b/third_party/tbb/task_dispatcher.h new file mode 100644 index 000000000..4bcbbf66e --- /dev/null +++ b/third_party/tbb/task_dispatcher.h @@ -0,0 +1,469 @@ +// clang-format off +/* + Copyright (c) 2020-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_task_dispatcher_H +#define _TBB_task_dispatcher_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/global_control.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/waiters.h" +#include "third_party/tbb/arena_slot.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/threading_control.h" + +#include "third_party/libcxx/atomic" + +#if !__TBB_CPU_CTL_ENV_PRESENT +#include "libc/runtime/fenv.h" // +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +inline d1::task* get_self_recall_task(arena_slot& slot) { + suppress_unused_warning(slot); + d1::task* t = nullptr; +#if __TBB_RESUMABLE_TASKS + suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point; + if (sp && sp->m_is_owner_recalled.load(std::memory_order_acquire)) { + t = &sp->m_resume_task; + __TBB_ASSERT(sp->m_resume_task.m_target.m_thread_data == nullptr, nullptr); + } +#endif /* __TBB_RESUMABLE_TASKS */ + return t; +} + +// Defined in exception.cpp +/*[[noreturn]]*/void do_throw_noexcept(void (*throw_exception)()) noexcept; + +//------------------------------------------------------------------------ +// Suspend point +//------------------------------------------------------------------------ +#if __TBB_RESUMABLE_TASKS + +inline d1::task* suspend_point_type::resume_task::execute(d1::execution_data& ed) { + execution_data_ext& ed_ext = static_cast(ed); + + if (ed_ext.wait_ctx) { + thread_control_monitor::resume_context monitor_node{{std::uintptr_t(ed_ext.wait_ctx), nullptr}, ed_ext, m_target}; + // The wait_ctx is present only in external_waiter. In that case we leave the current stack + // in the abandoned state to resume when waiting completes. + thread_data* td = ed_ext.task_disp->m_thread_data; + td->set_post_resume_action(task_dispatcher::post_resume_action::register_waiter, &monitor_node); + + thread_control_monitor& wait_list = td->my_arena->get_waiting_threads_monitor(); + + if (wait_list.wait([&] { return !ed_ext.wait_ctx->continue_execution(); }, monitor_node)) { + return nullptr; + } + + td->clear_post_resume_action(); + r1::resume(ed_ext.task_disp->get_suspend_point()); + } else { + // If wait_ctx is null, it can be only a worker thread on outermost level because + // coroutine_waiter interrupts bypass loop before the resume_task execution. 
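+        // The 'notify' action is performed after the coroutine switch (see
+        // do_post_resume_action() in task.cpp): it recalls the owner of this suspend point
+        // and then wakes any thread waiting for it in the arena's thread_control_monitor.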
+ ed_ext.task_disp->m_thread_data->set_post_resume_action(task_dispatcher::post_resume_action::notify, + ed_ext.task_disp->get_suspend_point()); + } + // Do not access this task because it might be destroyed + ed_ext.task_disp->resume(m_target); + return nullptr; +} + +inline suspend_point_type::suspend_point_type(arena* a, size_t stack_size, task_dispatcher& task_disp) + : m_arena(a) + , m_random(this) + , m_co_context(stack_size, &task_disp) + , m_resume_task(task_disp) +{ + assert_pointer_valid(m_arena); + assert_pointer_valid(m_arena->my_default_ctx); + task_accessor::context(m_resume_task) = m_arena->my_default_ctx; + task_accessor::isolation(m_resume_task) = no_isolation; + // Initialize the itt_caller for the context of the resume task. + // It will be bound to the stack of the first suspend call. + task_group_context_impl::bind_to(*task_accessor::context(m_resume_task), task_disp.m_thread_data); +} + +#endif /* __TBB_RESUMABLE_TASKS */ + +//------------------------------------------------------------------------ +// Task Dispatcher +//------------------------------------------------------------------------ +inline task_dispatcher::task_dispatcher(arena* a) { + m_execute_data_ext.context = a->my_default_ctx; + m_execute_data_ext.task_disp = this; +} + +inline bool task_dispatcher::can_steal() { + __TBB_ASSERT(m_stealing_threshold != 0, nullptr); + stack_anchor_type anchor{}; + return reinterpret_cast(&anchor) > m_stealing_threshold; +} + +inline d1::task* task_dispatcher::get_inbox_or_critical_task( + execution_data_ext& ed, mail_inbox& inbox, isolation_type isolation, bool critical_allowed) +{ + if (inbox.empty()) + return nullptr; + d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed); + if (result) + return result; + // Check if there are tasks mailed to this thread via task-to-thread affinity mechanism. + result = get_mailbox_task(inbox, ed, isolation); + // There is a race with a thread adding a new task (possibly with suitable isolation) + // to our mailbox, so the below conditions might result in a false positive. + // Then set_is_idle(false) allows that task to be stolen; it's OK. + if (isolation != no_isolation && !result && !inbox.empty() && inbox.is_idle_state(true)) { + // We have proxy tasks in our mailbox but the isolation blocks their execution. + // So publish the proxy tasks in mailbox to be available for stealing from owner's task pool. 
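+        // (Each proxy was registered in both the sender's task pool and this mailbox; see
+        // the slot_id overload of spawn() in task_dispatcher.cpp.)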
+ inbox.set_is_idle( false ); + } + return result; +} + +inline d1::task* task_dispatcher::get_stream_or_critical_task( + execution_data_ext& ed, arena& a, task_stream& stream, unsigned& hint, + isolation_type isolation, bool critical_allowed) +{ + if (stream.empty()) + return nullptr; + d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed); + if (result) + return result; + return a.get_stream_task(stream, hint); +} + +inline d1::task* task_dispatcher::steal_or_get_critical( + execution_data_ext& ed, arena& a, unsigned arena_index, FastRandom& random, + isolation_type isolation, bool critical_allowed) +{ + if (d1::task* t = a.steal_task(arena_index, random, ed, isolation)) { + ed.context = task_accessor::context(*t); + ed.isolation = task_accessor::isolation(*t); + return get_critical_task(t, ed, isolation, critical_allowed); + } + return nullptr; +} + +template +d1::task* task_dispatcher::receive_or_steal_task( + thread_data& tls, execution_data_ext& ed, Waiter& waiter, isolation_type isolation, + bool fifo_allowed, bool critical_allowed) +{ + __TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr); + // Task to return + d1::task* t = nullptr; + // Get tls data (again) + arena& a = *tls.my_arena; + arena_slot& slot = *tls.my_arena_slot; + unsigned arena_index = tls.my_arena_index; + mail_inbox& inbox = tls.my_inbox; + task_stream& resume_stream = a.my_resume_task_stream; + unsigned& resume_hint = slot.hint_for_resume_stream; + task_stream& fifo_stream = a.my_fifo_task_stream; + unsigned& fifo_hint = slot.hint_for_fifo_stream; + + waiter.reset_wait(); + // Thread is in idle state now + inbox.set_is_idle(true); + + bool stealing_is_allowed = can_steal(); + + // Stealing loop mailbox/enqueue/other_slots + for (;;) { + __TBB_ASSERT(t == nullptr, nullptr); + // Check if the resource manager requires our arena to relinquish some threads + // For the external thread restore idle state to true after dispatch loop + if (!waiter.continue_execution(slot, t)) { + __TBB_ASSERT(t == nullptr, nullptr); + break; + } + // Start searching + if (t != nullptr) { + // continue_execution returned a task + } + else if ((t = get_inbox_or_critical_task(ed, inbox, isolation, critical_allowed))) { + // Successfully got the task from mailbox or critical task + } + else if ((t = get_stream_or_critical_task(ed, a, resume_stream, resume_hint, isolation, critical_allowed))) { + // Successfully got the resume or critical task + } + else if (fifo_allowed && isolation == no_isolation + && (t = get_stream_or_critical_task(ed, a, fifo_stream, fifo_hint, isolation, critical_allowed))) { + // Checked if there are tasks in starvation-resistant stream. Only allowed at the outermost dispatch level without isolation. + } + else if (stealing_is_allowed + && (t = steal_or_get_critical(ed, a, arena_index, tls.my_random, isolation, critical_allowed))) { + // Stole a task from a random arena slot + } + else { + t = get_critical_task(t, ed, isolation, critical_allowed); + } + + if (t != nullptr) { + ed.context = task_accessor::context(*t); + ed.isolation = task_accessor::isolation(*t); + a.my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker); + break; // Stealing success, end of stealing attempt + } + // Nothing to do, pause a little. 
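+        // Illustrative sketch (editor-added, not upstream TBB code): waiter.pause()
+        // implementations typically do an exponential spin-then-yield backoff before
+        // blocking on the arena's sleep monitor. The generic shape of that idea
+        // (names are hypothetical; uses std::this_thread::yield from <thread>):
+        //
+        //   struct toy_backoff {
+        //       int spins = 1;
+        //       static constexpr int spin_limit = 16;
+        //       void pause() {
+        //           if (spins <= spin_limit) {
+        //               for (int i = 0; i < spins; ++i) { /* cpu pause/relax */ }
+        //               spins *= 2;                 // spin a little longer next time
+        //           } else {
+        //               std::this_thread::yield();  // then start yielding the time slice
+        //           }
+        //       }
+        //   };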
+ waiter.pause(slot); + } // end of nonlocal task retrieval loop + + __TBB_ASSERT(is_alive(a.my_guard), nullptr); + if (inbox.is_idle_state(true)) { + inbox.set_is_idle(false); + } + return t; +} + +template +d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) { + assert_pointer_valid(m_thread_data); + __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr); + + // Guard an outer/default execution state + struct dispatch_loop_guard { + task_dispatcher& task_disp; + execution_data_ext old_execute_data_ext; + properties old_properties; + + ~dispatch_loop_guard() { + task_disp.m_execute_data_ext = old_execute_data_ext; + task_disp.m_properties = old_properties; + + __TBB_ASSERT(task_disp.m_thread_data && governor::is_thread_data_set(task_disp.m_thread_data), nullptr); + __TBB_ASSERT(task_disp.m_thread_data->my_task_dispatcher == &task_disp, nullptr); + } + } dl_guard{ *this, m_execute_data_ext, m_properties }; + + // The context guard to track fp setting and itt tasks. + context_guard_helper context_guard; + + // Current isolation context + const isolation_type isolation = dl_guard.old_execute_data_ext.isolation; + + // Critical work inflection point. Once turned false current execution context has taken + // critical task on the previous stack frame and cannot take more until that critical path is + // finished. + bool critical_allowed = dl_guard.old_properties.critical_task_allowed; + + // Extended execution data that is used for dispatching. + // Base version is passed to the task::execute method. + execution_data_ext& ed = m_execute_data_ext; + ed.context = t ? task_accessor::context(*t) : nullptr; + ed.original_slot = m_thread_data->my_arena_index; + ed.affinity_slot = d1::no_slot; + ed.task_disp = this; + ed.wait_ctx = waiter.wait_ctx(); + + m_properties.outermost = false; + m_properties.fifo_tasks_allowed = false; + + t = get_critical_task(t, ed, isolation, critical_allowed); + if (t && m_thread_data->my_inbox.is_idle_state(true)) { + // The thread has a work to do. Therefore, marking its inbox as not idle so that + // affinitized tasks can be stolen from it. + m_thread_data->my_inbox.set_is_idle(false); + } + + // Infinite exception loop + for (;;) { + try { + // Main execution loop + do { + // We assume that bypass tasks are from the same task group. + context_guard.set_ctx(ed.context); + // Inner level evaluates tasks coming from nesting loops and those returned + // by just executed tasks (bypassing spawn or enqueue calls). + while (t != nullptr) { + assert_task_valid(t); + assert_pointer_valid(ed.context); + __TBB_ASSERT(ed.context->my_state == d1::task_group_context::state::bound || + ed.context->my_state == d1::task_group_context::state::isolated, nullptr); + __TBB_ASSERT(m_thread_data->my_inbox.is_idle_state(false), nullptr); + __TBB_ASSERT(task_accessor::is_resume_task(*t) || isolation == no_isolation || isolation == ed.isolation, nullptr); + // Check premature leave + if (Waiter::postpone_execution(*t)) { + __TBB_ASSERT(task_accessor::is_resume_task(*t) && dl_guard.old_properties.outermost, + "Currently, the bypass loop can be interrupted only for resume task on outermost level"); + return t; + } + // Copy itt_caller to a stack because the context might be destroyed after t->execute. 
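+                    // Illustrative sketch (editor-added, not upstream TBB code): this inner
+                    // loop implements task bypass: execute() may return the next task to run,
+                    // which is dispatched immediately on this thread without a round trip
+                    // through the task pool. A task written against d1::task (the scheme that
+                    // function_task in task_group.h follows) looks roughly like:
+                    //
+                    //   struct chained_task : tbb::detail::d1::task {
+                    //       tbb::detail::d1::task* next = nullptr;  // hypothetical successor
+                    //       tbb::detail::d1::task* execute(tbb::detail::d1::execution_data&) override {
+                    //           do_work();          // placeholder for the task body
+                    //           return next;        // nullptr ends the bypass chain
+                    //       }
+                    //       tbb::detail::d1::task* cancel(tbb::detail::d1::execution_data&) override {
+                    //           return nullptr;
+                    //       }
+                    //   };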
+ void* itt_caller = ed.context->my_itt_caller; + suppress_unused_warning(itt_caller); + + ITT_CALLEE_ENTER(ITTPossible, t, itt_caller); + + if (ed.context->is_group_execution_cancelled()) { + t = t->cancel(ed); + } else { + t = t->execute(ed); + } + + ITT_CALLEE_LEAVE(ITTPossible, itt_caller); + + // The task affinity in execution data is set for affinitized tasks. + // So drop it after the task execution. + ed.affinity_slot = d1::no_slot; + // Reset task owner id for bypassed task + ed.original_slot = m_thread_data->my_arena_index; + t = get_critical_task(t, ed, isolation, critical_allowed); + } + __TBB_ASSERT(m_thread_data && governor::is_thread_data_set(m_thread_data), nullptr); + __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr); + // When refactoring, pay attention that m_thread_data can be changed after t->execute() + __TBB_ASSERT(m_thread_data->my_arena_slot != nullptr, nullptr); + arena_slot& slot = *m_thread_data->my_arena_slot; + if (!waiter.continue_execution(slot, t)) { + break; + } + // Retrieve the task from local task pool + if (t || (slot.is_task_pool_published() && (t = slot.get_task(ed, isolation)))) { + __TBB_ASSERT(ed.original_slot == m_thread_data->my_arena_index, nullptr); + ed.context = task_accessor::context(*t); + ed.isolation = task_accessor::isolation(*t); + continue; + } + // Retrieve the task from global sources + t = receive_or_steal_task( + *m_thread_data, ed, waiter, isolation, dl_guard.old_properties.fifo_tasks_allowed, + critical_allowed + ); + } while (t != nullptr); // main dispatch loop + break; // Exit exception loop; + } catch (...) { + if (global_control::active_value(global_control::terminate_on_exception) == 1) { + do_throw_noexcept([] { throw; }); + } + if (ed.context->cancel_group_execution()) { + /* We are the first to signal cancellation, so store the exception that caused it. */ + ed.context->my_exception.store(tbb_exception_ptr::allocate(), std::memory_order_release); + } + } + } // Infinite exception loop + __TBB_ASSERT(t == nullptr, nullptr); + + +#if __TBB_RESUMABLE_TASKS + if (dl_guard.old_properties.outermost) { + recall_point(); + } +#endif /* __TBB_RESUMABLE_TASKS */ + + return nullptr; +} + +#if __TBB_RESUMABLE_TASKS +inline void task_dispatcher::recall_point() { + if (this != &m_thread_data->my_arena_slot->default_task_dispatcher()) { + __TBB_ASSERT(m_suspend_point != nullptr, nullptr); + __TBB_ASSERT(m_suspend_point->m_is_owner_recalled.load(std::memory_order_relaxed) == false, nullptr); + + m_thread_data->set_post_resume_action(post_resume_action::notify, get_suspend_point()); + internal_suspend(); + + if (m_thread_data->my_inbox.is_idle_state(true)) { + m_thread_data->my_inbox.set_is_idle(false); + } + } +} +#endif /* __TBB_RESUMABLE_TASKS */ + +#if __TBB_PREVIEW_CRITICAL_TASKS +inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext& ed, isolation_type isolation, bool critical_allowed) { + __TBB_ASSERT( critical_allowed || !m_properties.critical_task_allowed, nullptr ); + + if (!critical_allowed) { + // The stack is already in the process of critical path execution. Cannot take another + // critical work until finish with the current one. 
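+        // Editor-added condensed sketch (not upstream code) of the discipline enforced
+        // below: at most one critical task may be taken per stack frame, so nested
+        // dispatch loops cannot recurse on the critical path. try_pop_critical and
+        // respawn are hypothetical stand-ins for the arena calls used below:
+        //
+        //   if (!critical_allowed) return t;              // a frame below owns the critical path
+        //   if (task* c = arena.try_pop_critical()) {
+        //       if (t) respawn(t);                        // the bypassed task is re-spawned, not lost
+        //       properties.critical_task_allowed = false; // forbid nested critical work
+        //       return c;                                 // run the critical task first
+        //   }
+        //   properties.critical_task_allowed = true;      // queue drained: re-enable for nested frames
+        //   return t;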
+ __TBB_ASSERT(!m_properties.critical_task_allowed, nullptr); + return t; + } + + assert_pointers_valid(m_thread_data, m_thread_data->my_arena, m_thread_data->my_arena_slot); + thread_data& td = *m_thread_data; + arena& a = *td.my_arena; + arena_slot& slot = *td.my_arena_slot; + + d1::task* crit_t = a.get_critical_task(slot.hint_for_critical_stream, isolation); + if (crit_t != nullptr) { + assert_task_valid(crit_t); + if (t != nullptr) { + assert_pointer_valid(ed.context); + r1::spawn(*t, *ed.context); + } + ed.context = task_accessor::context(*crit_t); + ed.isolation = task_accessor::isolation(*crit_t); + + // We cannot execute more than one critical task on the same stack. + // In other words, we prevent nested critical tasks. + m_properties.critical_task_allowed = false; + + // TODO: add a test that the observer is called when critical task is taken. + a.my_observers.notify_entry_observers(td.my_last_observer, td.my_is_worker); + t = crit_t; + } else { + // Was unable to find critical work in the queue. Allow inspecting the queue in nested + // invocations. Handles the case when critical task has been just completed. + m_properties.critical_task_allowed = true; + } + return t; +} +#else +inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext&, isolation_type, bool /*critical_allowed*/) { + return t; +} +#endif + +inline d1::task* task_dispatcher::get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation) { + while (task_proxy* const tp = my_inbox.pop(isolation)) { + if (d1::task* result = tp->extract_task()) { + ed.original_slot = (unsigned short)(-2); + ed.affinity_slot = ed.task_disp->m_thread_data->my_arena_index; + return result; + } + // We have exclusive access to the proxy, and can destroy it. + tp->allocator.delete_object(tp, ed); + } + return nullptr; +} + +template +d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter) { + if (governor::is_itt_present()) { + return local_wait_for_all(t, waiter); + } else { + return local_wait_for_all(t, waiter); + } +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_task_dispatcher_H + diff --git a/third_party/tbb/task_group.h b/third_party/tbb/task_group.h new file mode 100644 index 000000000..70d0bccd8 --- /dev/null +++ b/third_party/tbb/task_group.h @@ -0,0 +1,747 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_task_group_H +#define __TBB_task_group_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/detail/_assert.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_template_helpers.h" +#include "third_party/tbb/detail/_exception.h" +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/detail/_small_object_pool.h" +#include "third_party/tbb/detail/_intrusive_list_node.h" +#include "third_party/tbb/detail/_task_handle.h" + +#include "third_party/tbb/profiling.h" + +#include "third_party/libcxx/type_traits" + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress warning: structure was padded due to alignment specifier + #pragma warning(push) + #pragma warning(disable:4324) +#endif + +namespace tbb { +namespace detail { + +namespace d1 { +class delegate_base; +class task_arena_base; +class task_group_context; +class task_group_base; +} + +namespace r1 { +// Forward declarations +class tbb_exception_ptr; +class cancellation_disseminator; +class thread_data; +class task_dispatcher; +template +class context_guard_helper; +struct task_arena_impl; +class context_list; + +TBB_EXPORT void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&); +TBB_EXPORT void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base&, std::intptr_t); + +TBB_EXPORT void __TBB_EXPORTED_FUNC initialize(d1::task_group_context&); +TBB_EXPORT void __TBB_EXPORTED_FUNC destroy(d1::task_group_context&); +TBB_EXPORT void __TBB_EXPORTED_FUNC reset(d1::task_group_context&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context&); +TBB_EXPORT bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context&); +TBB_EXPORT void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context&); + +struct task_group_context_impl; +} + +namespace d2 { + +namespace { +template +d1::task* task_ptr_or_nullptr(F&& f); +} + +template +class function_task : public task_handle_task { + //TODO: apply empty base optimization here + const F m_func; + +private: + d1::task* execute(d1::execution_data& ed) override { + __TBB_ASSERT(ed.context == &this->ctx(), "The task group context should be used for all tasks"); + task* res = task_ptr_or_nullptr(m_func); + finalize(&ed); + return res; + } + d1::task* cancel(d1::execution_data& ed) override { + finalize(&ed); + return nullptr; + } +public: + template + function_task(FF&& f, d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : task_handle_task{wo, ctx, alloc}, + m_func(std::forward(f)) {} +}; + +#if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +namespace { + template + d1::task* task_ptr_or_nullptr_impl(std::false_type, F&& f){ + task_handle th = std::forward(f)(); + return task_handle_accessor::release(th); + } + + template + d1::task* task_ptr_or_nullptr_impl(std::true_type, F&& f){ + std::forward(f)(); + return nullptr; + } + + template + d1::task* task_ptr_or_nullptr(F&& f){ + using is_void_t = std::is_void< + decltype(std::forward(f)()) + >; + + return task_ptr_or_nullptr_impl(is_void_t{}, std::forward(f)); + } +} +#else +namespace { + template + d1::task* task_ptr_or_nullptr(F&& f){ + std::forward(f)(); + return nullptr; + } +} // namespace +#endif // __TBB_PREVIEW_TASK_GROUP_EXTENSIONS +} // namespace d2 + +namespace d1 { + +// This structure is left here for backward compatibility check +struct context_list_node { + std::atomic prev{}; + std::atomic next{}; +}; 
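+
+// Illustrative usage sketch (editor-added, not upstream TBB code) for the classes
+// defined below; tbb::task_group, tbb::task_group_context, and tbb::canceled are all
+// exported from this header:
+//
+//   bool run_cancellable_work() {
+//       tbb::task_group_context ctx(tbb::task_group_context::isolated);
+//       tbb::task_group tg(ctx);
+//       tg.run([] { /* long-running work */ });
+//       // ... later, possibly from another thread:
+//       ctx.cancel_group_execution();        // request cancellation of the whole group
+//       return tg.wait() == tbb::canceled;   // wait() reports canceled vs. complete
+//   }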
+ +//! Used to form groups of tasks +/** @ingroup task_scheduling + The context services explicit cancellation requests from user code, and unhandled + exceptions intercepted during tasks execution. Intercepting an exception results + in generating internal cancellation requests (which is processed in exactly the + same way as external ones). + + The context is associated with one or more root tasks and defines the cancellation + group that includes all the descendants of the corresponding root task(s). Association + is established when a context object is passed as an argument to the task::allocate_root() + method. See task_group_context::task_group_context for more details. + + The context can be bound to another one, and other contexts can be bound to it, + forming a tree-like structure: parent -> this -> children. Arrows here designate + cancellation propagation direction. If a task in a cancellation group is cancelled + all the other tasks in this group and groups bound to it (as children) get cancelled too. +**/ +class task_group_context : no_copy { +public: + enum traits_type { + fp_settings = 1 << 1, + concurrent_wait = 1 << 2, + default_traits = 0 + }; + enum kind_type { + isolated, + bound + }; +private: + //! Space for platform-specific FPU settings. + /** Must only be accessed inside TBB binaries, and never directly in user + code or inline methods. */ + std::uint64_t my_cpu_ctl_env; + + //! Specifies whether cancellation was requested for this task group. + std::atomic my_cancellation_requested; + + //! Versioning for run-time checks and behavioral traits of the context. + enum class task_group_context_version : std::uint8_t { + unused = 1 // ensure that new versions, if any, will not clash with previously used ones + }; + task_group_context_version my_version; + + //! The context traits. + struct context_traits { + bool fp_settings : 1; + bool concurrent_wait : 1; + bool bound : 1; + bool reserved1 : 1; + bool reserved2 : 1; + bool reserved3 : 1; + bool reserved4 : 1; + bool reserved5 : 1; + } my_traits; + + static_assert(sizeof(context_traits) == 1, "Traits shall fit into one byte."); + + static constexpr std::uint8_t may_have_children = 1; + //! The context internal state (currently only may_have_children). + std::atomic my_may_have_children; + + enum class state : std::uint8_t { + created, + locked, + isolated, + bound, + dead, + proxy = std::uint8_t(-1) //the context is not the real one, but proxy to other one + }; + + //! The synchronization machine state to manage lifetime. + std::atomic my_state; + + union { + //! Pointer to the context of the parent cancellation group. nullptr for isolated contexts. + task_group_context* my_parent; + + //! Pointer to the actual context 'this' context represents a proxy of. + task_group_context* my_actual_context; + }; + + //! Thread data instance that registered this context in its list. + r1::context_list* my_context_list; + static_assert(sizeof(std::atomic) == sizeof(r1::context_list*), "To preserve backward compatibility these types should have the same size"); + + //! Used to form the thread specific list of contexts without additional memory allocation. + /** A context is included into the list of the current thread when its binding to + its parent happens. Any context can be present in the list of one thread only. **/ + intrusive_list_node my_node; + static_assert(sizeof(intrusive_list_node) == sizeof(context_list_node), "To preserve backward compatibility these types should have the same size"); + + //! 
Pointer to the container storing exception being propagated across this task group. + std::atomic my_exception; + static_assert(sizeof(std::atomic) == sizeof(r1::tbb_exception_ptr*), + "backward compatibility check"); + + //! Used to set and maintain stack stitching point for Intel Performance Tools. + void* my_itt_caller; + + //! Description of algorithm for scheduler based instrumentation. + string_resource_index my_name; + + char padding[max_nfs_size + - sizeof(std::uint64_t) // my_cpu_ctl_env + - sizeof(std::atomic) // my_cancellation_requested + - sizeof(std::uint8_t) // my_version + - sizeof(context_traits) // my_traits + - sizeof(std::atomic) // my_state + - sizeof(std::atomic) // my_state + - sizeof(task_group_context*) // my_parent + - sizeof(r1::context_list*) // my_context_list + - sizeof(intrusive_list_node) // my_node + - sizeof(std::atomic) // my_exception + - sizeof(void*) // my_itt_caller + - sizeof(string_resource_index) // my_name + ]; + + task_group_context(context_traits t, string_resource_index name) + : my_version{task_group_context_version::unused}, my_name{name} + { + my_traits = t; // GCC4.8 issues warning list initialization for bitset (missing-field-initializers) + r1::initialize(*this); + } + + task_group_context(task_group_context* actual_context) + : my_version{task_group_context_version::unused} + , my_state{state::proxy} + , my_actual_context{actual_context} + { + __TBB_ASSERT(my_actual_context, "Passed pointer value points to nothing."); + my_name = actual_context->my_name; + + // no need to initialize 'this' context as it acts as a proxy for my_actual_context, which + // initialization is a user-side responsibility. + } + + static context_traits make_traits(kind_type relation_with_parent, std::uintptr_t user_traits) { + context_traits ct; + ct.fp_settings = (user_traits & fp_settings) == fp_settings; + ct.concurrent_wait = (user_traits & concurrent_wait) == concurrent_wait; + ct.bound = relation_with_parent == bound; + ct.reserved1 = ct.reserved2 = ct.reserved3 = ct.reserved4 = ct.reserved5 = false; + return ct; + } + + bool is_proxy() const { + return my_state.load(std::memory_order_relaxed) == state::proxy; + } + + task_group_context& actual_context() noexcept { + if (is_proxy()) { + __TBB_ASSERT(my_actual_context, "Actual task_group_context is not set."); + return *my_actual_context; + } + return *this; + } + + const task_group_context& actual_context() const noexcept { + if (is_proxy()) { + __TBB_ASSERT(my_actual_context, "Actual task_group_context is not set."); + return *my_actual_context; + } + return *this; + } + +public: + //! Default & binding constructor. + /** By default a bound context is created. That is this context will be bound + (as child) to the context of the currently executing task . Cancellation + requests passed to the parent context are propagated to all the contexts + bound to it. Similarly priority change is propagated from the parent context + to its children. + + If task_group_context::isolated is used as the argument, then the tasks associated + with this context will never be affected by events in any other context. + + Creating isolated contexts involve much less overhead, but they have limited + utility. Normally when an exception occurs in an algorithm that has nested + ones running, it is desirably to have all the nested algorithms cancelled + as well. Such a behavior requires nested algorithms to use bound contexts. + + There is one good place where using isolated algorithms is beneficial. It is + an external thread. 
That is if a particular algorithm is invoked directly from + the external thread (not from a TBB task), supplying it with explicitly + created isolated context will result in a faster algorithm startup. + + VERSIONING NOTE: + Implementation(s) of task_group_context constructor(s) cannot be made + entirely out-of-line because the run-time version must be set by the user + code. This will become critically important for binary compatibility, if + we ever have to change the size of the context object. **/ + + task_group_context(kind_type relation_with_parent = bound, + std::uintptr_t t = default_traits) + : task_group_context(make_traits(relation_with_parent, t), CUSTOM_CTX) {} + + // Custom constructor for instrumentation of oneTBB algorithm + task_group_context(string_resource_index name ) + : task_group_context(make_traits(bound, default_traits), name) {} + + // Do not introduce any logic on user side since it might break state propagation assumptions + ~task_group_context() { + // When 'this' serves as a proxy, the initialization does not happen - nor should the + // destruction. + if (!is_proxy()) + { + r1::destroy(*this); + } + } + + //! Forcefully reinitializes the context after the task tree it was associated with is completed. + /** Because the method assumes that all the tasks that used to be associated with + this context have already finished, calling it while the context is still + in use somewhere in the task hierarchy leads to undefined behavior. + + IMPORTANT: This method is not thread safe! + + The method does not change the context's parent if it is set. **/ + void reset() { + r1::reset(actual_context()); + } + + //! Initiates cancellation of all tasks in this cancellation group and its subordinate groups. + /** \return false if cancellation has already been requested, true otherwise. + + Note that canceling never fails. When false is returned, it just means that + another thread (or this one) has already sent cancellation request to this + context or to one of its ancestors (if this context is bound). It is guaranteed + that when this method is concurrently called on the same not yet cancelled + context, true will be returned by one and only one invocation. **/ + bool cancel_group_execution() { + return r1::cancel_group_execution(actual_context()); + } + + //! Returns true if the context received cancellation request. + bool is_group_execution_cancelled() { + return r1::is_group_execution_cancelled(actual_context()); + } + +#if __TBB_FP_CONTEXT + //! Captures the current FPU control settings to the context. + /** Because the method assumes that all the tasks that used to be associated with + this context have already finished, calling it while the context is still + in use somewhere in the task hierarchy leads to undefined behavior. + + IMPORTANT: This method is not thread safe! + + The method does not change the FPU control settings of the context's parent. **/ + void capture_fp_settings() { + r1::capture_fp_settings(actual_context()); + } +#endif + + //! Returns the user visible context trait + std::uintptr_t traits() const { + std::uintptr_t t{}; + const task_group_context& ctx = actual_context(); + t |= ctx.my_traits.fp_settings ? fp_settings : 0; + t |= ctx.my_traits.concurrent_wait ? 
concurrent_wait : 0; + return t; + } +private: + //// TODO: cleanup friends + friend class r1::cancellation_disseminator; + friend class r1::thread_data; + friend class r1::task_dispatcher; + template + friend class r1::context_guard_helper; + friend struct r1::task_arena_impl; + friend struct r1::task_group_context_impl; + friend class task_group_base; +}; // class task_group_context + +static_assert(sizeof(task_group_context) == 128, "Wrong size of task_group_context"); + +enum task_group_status { + not_complete, + complete, + canceled +}; + +class task_group; +class structured_task_group; +#if TBB_PREVIEW_ISOLATED_TASK_GROUP +class isolated_task_group; +#endif + +template +class function_task : public task { + const F m_func; + wait_context& m_wait_ctx; + small_object_allocator m_allocator; + + void finalize(const execution_data& ed) { + // Make a local reference not to access this after destruction. + wait_context& wo = m_wait_ctx; + // Copy allocator to the stack + auto allocator = m_allocator; + // Destroy user functor before release wait. + this->~function_task(); + wo.release(); + + allocator.deallocate(this, ed); + } + task* execute(execution_data& ed) override { + task* res = d2::task_ptr_or_nullptr(m_func); + finalize(ed); + return res; + } + task* cancel(execution_data& ed) override { + finalize(ed); + return nullptr; + } +public: + function_task(const F& f, wait_context& wo, small_object_allocator& alloc) + : m_func(f) + , m_wait_ctx(wo) + , m_allocator(alloc) {} + + function_task(F&& f, wait_context& wo, small_object_allocator& alloc) + : m_func(std::move(f)) + , m_wait_ctx(wo) + , m_allocator(alloc) {} +}; + +template +class function_stack_task : public task { + const F& m_func; + wait_context& m_wait_ctx; + + void finalize() { + m_wait_ctx.release(); + } + task* execute(execution_data&) override { + task* res = d2::task_ptr_or_nullptr(m_func); + finalize(); + return res; + } + task* cancel(execution_data&) override { + finalize(); + return nullptr; + } +public: + function_stack_task(const F& f, wait_context& wo) : m_func(f), m_wait_ctx(wo) {} +}; + +class task_group_base : no_copy { +protected: + wait_context m_wait_ctx; + task_group_context m_context; + + template + task_group_status internal_run_and_wait(const F& f) { + function_stack_task t{ f, m_wait_ctx }; + m_wait_ctx.reserve(); + bool cancellation_status = false; + try_call([&] { + execute_and_wait(t, context(), m_wait_ctx, context()); + }).on_completion([&] { + // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = context().is_group_execution_cancelled(); + context().reset(); + }); + return cancellation_status ? canceled : complete; + } + + task_group_status internal_run_and_wait(d2::task_handle&& h) { + __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); + + using acs = d2::task_handle_accessor; + __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); + + bool cancellation_status = false; + try_call([&] { + execute_and_wait(*acs::release(h), context(), m_wait_ctx, context()); + }).on_completion([&] { + // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = context().is_group_execution_cancelled(); + context().reset(); + }); + return cancellation_status ? 
canceled : complete; + } + + template + task* prepare_task(F&& f) { + m_wait_ctx.reserve(); + small_object_allocator alloc{}; + return alloc.new_object::type>>(std::forward(f), m_wait_ctx, alloc); + } + + task_group_context& context() noexcept { + return m_context.actual_context(); + } + + template + d2::task_handle prepare_task_handle(F&& f) { + m_wait_ctx.reserve(); + small_object_allocator alloc{}; + using function_task_t = d2::function_task::type>; + d2::task_handle_task* function_task_p = alloc.new_object(std::forward(f), m_wait_ctx, context(), alloc); + + return d2::task_handle_accessor::construct(function_task_p); + } + +public: + task_group_base(uintptr_t traits = 0) + : m_wait_ctx(0) + , m_context(task_group_context::bound, task_group_context::default_traits | traits) + {} + + task_group_base(task_group_context& ctx) + : m_wait_ctx(0) + , m_context(&ctx) + {} + + ~task_group_base() noexcept(false) { + if (m_wait_ctx.continue_execution()) { +#if __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT + bool stack_unwinding_in_progress = std::uncaught_exceptions() > 0; +#else + bool stack_unwinding_in_progress = std::uncaught_exception(); +#endif + // Always attempt to do proper cleanup to avoid inevitable memory corruption + // in case of missing wait (for the sake of better testability & debuggability) + if (!context().is_group_execution_cancelled()) + cancel(); + d1::wait(m_wait_ctx, context()); + if (!stack_unwinding_in_progress) + throw_exception(exception_id::missing_wait); + } + } + + task_group_status wait() { + bool cancellation_status = false; + try_call([&] { + d1::wait(m_wait_ctx, context()); + }).on_completion([&] { + // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = m_context.is_group_execution_cancelled(); + context().reset(); + }); + return cancellation_status ? 
canceled : complete; + } + + void cancel() { + context().cancel_group_execution(); + } +}; // class task_group_base + +class task_group : public task_group_base { +public: + task_group() : task_group_base(task_group_context::concurrent_wait) {} + task_group(task_group_context& ctx) : task_group_base(ctx) {} + + template + void run(F&& f) { + spawn(*prepare_task(std::forward(f)), context()); + } + + void run(d2::task_handle&& h) { + __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); + + using acs = d2::task_handle_accessor; + __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); + + spawn(*acs::release(h), context()); + } + + template + d2::task_handle defer(F&& f) { + return prepare_task_handle(std::forward(f)); + + } + + template + task_group_status run_and_wait(const F& f) { + return internal_run_and_wait(f); + } + + task_group_status run_and_wait(d2::task_handle&& h) { + return internal_run_and_wait(std::move(h)); + } +}; // class task_group + +#if TBB_PREVIEW_ISOLATED_TASK_GROUP +class spawn_delegate : public delegate_base { + task* task_to_spawn; + task_group_context& context; + bool operator()() const override { + spawn(*task_to_spawn, context); + return true; + } +public: + spawn_delegate(task* a_task, task_group_context& ctx) + : task_to_spawn(a_task), context(ctx) + {} +}; + +class wait_delegate : public delegate_base { + bool operator()() const override { + status = tg.wait(); + return true; + } +protected: + task_group& tg; + task_group_status& status; +public: + wait_delegate(task_group& a_group, task_group_status& tgs) + : tg(a_group), status(tgs) {} +}; + +template +class run_wait_delegate : public wait_delegate { + F& func; + bool operator()() const override { + status = tg.run_and_wait(func); + return true; + } +public: + run_wait_delegate(task_group& a_group, F& a_func, task_group_status& tgs) + : wait_delegate(a_group, tgs), func(a_func) {} +}; + +class isolated_task_group : public task_group { + intptr_t this_isolation() { + return reinterpret_cast(this); + } +public: + isolated_task_group() : task_group() {} + + isolated_task_group(task_group_context& ctx) : task_group(ctx) {} + + template + void run(F&& f) { + spawn_delegate sd(prepare_task(std::forward(f)), context()); + r1::isolate_within_arena(sd, this_isolation()); + } + + void run(d2::task_handle&& h) { + __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); + + using acs = d2::task_handle_accessor; + __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); + + spawn_delegate sd(acs::release(h), context()); + r1::isolate_within_arena(sd, this_isolation()); + } + + template + task_group_status run_and_wait( const F& f ) { + task_group_status result = not_complete; + run_wait_delegate rwd(*this, f, result); + r1::isolate_within_arena(rwd, this_isolation()); + __TBB_ASSERT(result != not_complete, "premature exit from wait?"); + return result; + } + + task_group_status wait() { + task_group_status result = not_complete; + wait_delegate wd(*this, result); + r1::isolate_within_arena(wd, this_isolation()); + __TBB_ASSERT(result != not_complete, "premature exit from wait?"); + return result; + } +}; // class isolated_task_group +#endif // TBB_PREVIEW_ISOLATED_TASK_GROUP + +inline bool is_current_task_group_canceling() { + task_group_context* ctx = current_context(); + return ctx ? 
ctx->is_group_execution_cancelled() : false; +} + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::task_group_context; +using detail::d1::task_group; +#if TBB_PREVIEW_ISOLATED_TASK_GROUP +using detail::d1::isolated_task_group; +#endif + +using detail::d1::task_group_status; +using detail::d1::not_complete; +using detail::d1::complete; +using detail::d1::canceled; + +using detail::d1::is_current_task_group_canceling; +using detail::r1::missing_wait; + +using detail::d2::task_handle; +} + +} // namespace tbb + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning(pop) // 4324 warning +#endif + +#endif // __TBB_task_group_H diff --git a/third_party/tbb/task_group_context.cpp b/third_party/tbb/task_group_context.cpp new file mode 100644 index 000000000..4f91c54e0 --- /dev/null +++ b/third_party/tbb/task_group_context.cpp @@ -0,0 +1,359 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/tbb_allocator.h" +#include "third_party/tbb/task_group.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/itt_notify.h" +#include "third_party/tbb/task_dispatcher.h" + +#include "third_party/libcxx/type_traits" + +namespace tbb { +namespace detail { +namespace r1 { + +//------------------------------------------------------------------------ +// tbb_exception_ptr +//------------------------------------------------------------------------ +tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept { + tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr)); + return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr; +} + +void tbb_exception_ptr::destroy() noexcept { + this->~tbb_exception_ptr(); + deallocate_memory(this); +} + +void tbb_exception_ptr::throw_self() { + if (governor::rethrow_exception_broken()) fix_broken_rethrow(); + std::rethrow_exception(my_ptr); +} + +//------------------------------------------------------------------------ +// task_group_context +//------------------------------------------------------------------------ + +void task_group_context_impl::destroy(d1::task_group_context& ctx) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + + if (ctx.my_context_list != nullptr) { + __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) == d1::task_group_context::state::bound, nullptr); + // The owner can be destroyed at any moment. Access the associate data with caution. 
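+        // Illustrative sketch (editor-added, not upstream TBB code): my_node is an
+        // intrusive list node, so removal is pointer surgery on the node itself; the
+        // real context_list additionally synchronizes with its owner thread. Generic
+        // shape of the unlink step:
+        //
+        //   struct node { node* prev; node* next; };
+        //   void unlink(node& n) {
+        //       n.prev->next = n.next;        // splice the node out of the chain
+        //       n.next->prev = n.prev;
+        //       n.prev = n.next = nullptr;
+        //   }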
+ ctx.my_context_list->remove(ctx.my_node); + } + d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx.my_cpu_ctl_env); +#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER + suppress_unused_warning(ctl); +#endif + ctl->~cpu_ctl_env(); + + auto exception = ctx.my_exception.load(std::memory_order_relaxed); + if (exception) { + exception->destroy(); + } + ITT_STACK_DESTROY(ctx.my_itt_caller); + + poison_pointer(ctx.my_parent); + poison_pointer(ctx.my_context_list); + poison_pointer(ctx.my_node.my_next_node); + poison_pointer(ctx.my_node.my_prev_node); + poison_pointer(ctx.my_exception); + poison_pointer(ctx.my_itt_caller); + + ctx.my_state.store(d1::task_group_context::state::dead, std::memory_order_release); +} + +void task_group_context_impl::initialize(d1::task_group_context& ctx) { + ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr); + + ctx.my_node.my_next_node = &ctx.my_node; + ctx.my_node.my_prev_node = &ctx.my_node; + ctx.my_cpu_ctl_env = 0; + ctx.my_cancellation_requested = 0; + ctx.my_may_have_children.store(0, std::memory_order_relaxed); + // Set the created state to bound at the first usage. + ctx.my_state.store(d1::task_group_context::state::created, std::memory_order_relaxed); + ctx.my_parent = nullptr; + ctx.my_context_list = nullptr; + ctx.my_exception.store(nullptr, std::memory_order_relaxed); + ctx.my_itt_caller = nullptr; + + static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit to uint64_t"); + d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env; + if (ctx.my_traits.fp_settings) + ctl->get_env(); +} + +void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + __TBB_ASSERT(td, nullptr); + ctx.my_context_list = td->my_context_list; + + ctx.my_context_list->push_front(ctx.my_node); +} + +void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) == d1::task_group_context::state::locked, "The context can be bound only under the lock."); + __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding"); + + ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context; + __TBB_ASSERT(ctx.my_parent, nullptr); + + // Inherit FPU settings only if the context has not captured FPU settings yet. + if (!ctx.my_traits.fp_settings) + copy_fp_settings(ctx, *ctx.my_parent); + + // Condition below prevents unnecessary thrashing parent context's cache line + if (ctx.my_parent->my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) { + ctx.my_parent->my_may_have_children.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below + } + if (ctx.my_parent->my_parent) { + // Even if this context were made accessible for state change propagation + // (by placing store_with_release(td->my_context_list_state.head.my_next, &ctx.my_node) + // above), it still could be missed if state propagation from a grand-ancestor + // was underway concurrently with binding. + // Speculative propagation from the parent together with epoch counters + // detecting possibility of such a race allow to avoid taking locks when + // there is no contention. 
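+        // Illustrative sketch (editor-added, not upstream TBB code): the epoch check
+        // below is an optimistic-validation pattern; in generic form (names are
+        // hypothetical):
+        //
+        //   std::uintptr_t snapshot = epoch.load(std::memory_order_acquire);
+        //   copy_state_from_parent();                   // speculative, unsynchronized copy
+        //   publish_self();                             // full fence: visible to propagators
+        //   if (snapshot != global_epoch.load(std::memory_order_relaxed)) {
+        //       std::lock_guard<std::mutex> lock(propagation_mutex);
+        //       copy_state_from_parent();               // repeat safely under the lock
+        //   }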
+ + // Acquire fence is necessary to prevent reordering subsequent speculative + // loads of parent state data out of the scope where epoch counters comparison + // can reliably validate it. + uintptr_t local_count_snapshot = ctx.my_parent->my_context_list->epoch.load(std::memory_order_acquire); + // Speculative propagation of parent's state. The speculation will be + // validated by the epoch counters check further on. + ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed); + register_with(ctx, td); // Issues full fence + + // If no state propagation was detected by the following condition, the above + // full fence guarantees that the parent had correct state during speculative + // propagation before the fence. Otherwise the propagation from parent is + // repeated under the lock. + if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) { + // Another thread may be propagating state change right now. So resort to lock. + context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex); + ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + } else { + register_with(ctx, td); // Issues full fence + // As we do not have grand-ancestors, concurrent state propagation (if any) + // may originate only from the parent context, and thus it is safe to directly + // copy the state from it. + ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed); + } +} + +void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) { + d1::task_group_context::state state = ctx.my_state.load(std::memory_order_acquire); + if (state <= d1::task_group_context::state::locked) { + if (state == d1::task_group_context::state::created && +#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910 + ((std::atomic::type>&)ctx.my_state).compare_exchange_strong( + (typename std::underlying_type::type&)state, + (typename std::underlying_type::type)d1::task_group_context::state::locked) +#else + ctx.my_state.compare_exchange_strong(state, d1::task_group_context::state::locked) +#endif + ) { + // If we are in the outermost task dispatch loop of an external thread, then + // there is nothing to bind this context to, and we skip the binding part + // treating the context as isolated. 
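+                // Editor-added note (not upstream code): my_state is a small one-shot state
+                // machine, created -> locked -> (bound | isolated) -> dead; "locked" is
+                // claimed by the compare_exchange above so that exactly one thread performs
+                // the binding while the others spin. The same idiom in generic form
+                // (st is a hypothetical enum):
+                //
+                //   std::atomic<st> state{st::created};
+                //   st expected = st::created;
+                //   if (state.compare_exchange_strong(expected, st::locked)) {
+                //       initialize_once();                                 // sole winner
+                //       state.store(st::ready, std::memory_order_release);
+                //   }
+                //   while (state.load(std::memory_order_acquire) == st::locked) { /* spin */ }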
+                __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
+                d1::task_group_context::state release_state{};
+                if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
+                    if (!ctx.my_traits.fp_settings) {
+                        copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
+                    }
+                    release_state = d1::task_group_context::state::isolated;
+                } else {
+                    bind_to_impl(ctx, td);
+                    release_state = d1::task_group_context::state::bound;
+                }
+                ITT_STACK_CREATE(ctx.my_itt_caller);
+                ctx.my_state.store(release_state, std::memory_order_release);
+            }
+            spin_wait_while_eq(ctx.my_state, d1::task_group_context::state::locked);
+        }
+    __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) != d1::task_group_context::state::created, nullptr);
+    __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) != d1::task_group_context::state::locked, nullptr);
+}
+
+void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<std::uint32_t> d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) {
+    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
+    /* 1. if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state):
+       Nothing to do, whether descending from "src" or not, so no need to scan.
+       Hopefully this happens often thanks to earlier invocations.
+       This optimization is enabled by LIFO order in the context lists:
+       - new contexts are bound to the beginning of lists;
+       - descendants are newer than ancestors;
+       - earlier invocations are therefore likely to "paint" long chains.
+       2. if (&ctx != &src):
+       This clause is disjunct from the traversal below, which skips src entirely.
+       Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
+       Such interference is probably not frequent enough to aim for optimisation by writing new_state again (to make the other thread back down).
+       Letting the other thread prevail may also be fairer.
+    */
+    if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state && &ctx != &src) {
+        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != nullptr; ancestor = ancestor->my_parent) {
+            if (ancestor == &src) {
+                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
+                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
+                break;
+            }
+        }
+    }
+}
+
+bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
+    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
+    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
+    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
+        // This task group and any descendants have already been canceled.
+        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
+        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
+        return false;
+    }
+    governor::get_thread_data()->my_arena->my_threading_control->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
+    return true;
+}
+
+bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
+    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
+}
+
+// IMPORTANT: It is assumed that this method is not used concurrently!
+void task_group_context_impl::reset(d1::task_group_context& ctx) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + //! TODO: Add assertion that this context does not have children + // No fences are necessary since this context can be accessed from another thread + // only after stealing happened (which means necessary fences were used). + + auto exception = ctx.my_exception.load(std::memory_order_relaxed); + if (exception) { + exception->destroy(); + ctx.my_exception.store(nullptr, std::memory_order_relaxed); + } + ctx.my_cancellation_requested = 0; +} + +// IMPORTANT: It is assumed that this method is not used concurrently! +void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + //! TODO: Add assertion that this context does not have children + // No fences are necessary since this context can be accessed from another thread + // only after stealing happened (which means necessary fences were used). + d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx.my_cpu_ctl_env); + if (!ctx.my_traits.fp_settings) { + ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env; + ctx.my_traits.fp_settings = true; + } + ctl->get_env(); +} + +void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) { + __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr); + __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings."); + __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings."); + + const d1::cpu_ctl_env* src_ctl = reinterpret_cast(&src.my_cpu_ctl_env); + new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl); + ctx.my_traits.fp_settings = true; +} + +/* + Comments: + +1. The premise of the cancellation support implementation is that cancellations are + not part of the hot path of the program execution. Therefore all changes in its + implementation in order to reduce the overhead of the cancellation control flow + should be done only in ways that do not increase overhead of the normal execution. + + In general, contexts are used by all threads and their descendants are created in + different threads as well. In order to minimize impact of the cross-thread tree + maintenance (first of all because of the synchronization), the tree of contexts + is split into pieces, each of which is handled by a single thread. Such pieces + are represented as lists of contexts, members of which are contexts that were + bound to their parents in the given thread. + + The context tree maintenance and cancellation propagation algorithms are designed + in such a manner that cross-thread access to a context list will take place only + when cancellation signal is sent (by user or when an exception happens), and + synchronization is necessary only then. Thus the normal execution flow (without + exceptions and cancellation) remains free from any synchronization done on + behalf of exception handling and cancellation support. + +2. Consider parallel cancellations at the different levels of the context tree: + + Ctx1 <- Cancelled by Thread1 |- Thread2 started processing + | | + Ctx2 |- Thread1 started processing + | T1 |- Thread2 finishes and syncs up local counters + Ctx3 <- Cancelled by Thread2 | + | |- Ctx5 is bound to Ctx2 + Ctx4 | + T2 |- Thread1 reaches Ctx2 + + Thread-propagator of each cancellation increments global counter. 
However the thread + propagating the cancellation from the outermost context (Thread1) may be the last + to finish. Which means that the local counters may be synchronized earlier (by Thread2, + at Time1) than it propagated cancellation into Ctx2 (at time Time2). If a new context + (Ctx5) is created and bound to Ctx2 between Time1 and Time2, checking its parent only + (Ctx2) may result in cancellation request being lost. + + This issue is solved by doing the whole propagation under the lock. + + If we need more concurrency while processing parallel cancellations, we could try + the following modification of the propagation algorithm: + + advance global counter and remember it + for each thread: + scan thread's list of contexts + for each thread: + sync up its local counter only if the global counter has not been changed + + However this version of the algorithm requires more analysis and verification. +*/ + +void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) { + task_group_context_impl::initialize(ctx); +} +void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) { + task_group_context_impl::destroy(ctx); +} +void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) { + task_group_context_impl::reset(ctx); +} +bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) { + return task_group_context_impl::cancel_group_execution(ctx); +} +bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) { + return task_group_context_impl::is_group_execution_cancelled(ctx); +} +void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) { + task_group_context_impl::capture_fp_settings(ctx); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + diff --git a/third_party/tbb/task_scheduler_observer.h b/third_party/tbb/task_scheduler_observer.h new file mode 100644 index 000000000..4c3d31e79 --- /dev/null +++ b/third_party/tbb/task_scheduler_observer.h @@ -0,0 +1,117 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_task_scheduler_observer_H +#define __TBB_task_scheduler_observer_H + +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/tbb/task_arena.h" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { + +namespace d1 { +class task_scheduler_observer; +} + +namespace r1 { +class observer_proxy; +class observer_list; + +//! Enable or disable observation +/** For local observers the method can be used only when the current thread +has the task scheduler initialized or is attached to an arena. +Repeated calls with the same state are no-ops. **/ +TBB_EXPORT void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool state = true); +} + +namespace d1 { +class task_scheduler_observer { + friend class r1::observer_proxy; + friend class r1::observer_list; + friend void r1::observe(d1::task_scheduler_observer&, bool); + + //! Pointer to the proxy holding this observer. 
+ /** Observers are proxied by the scheduler to maintain persistent lists of them. **/ + std::atomic my_proxy{ nullptr }; + + //! Counter preventing the observer from being destroyed while in use by the scheduler. + /** Valid only when observation is on. **/ + std::atomic my_busy_count{ 0 }; + + //! Contains task_arena pointer + task_arena* my_task_arena{ nullptr }; +public: + //! Returns true if observation is enabled, false otherwise. + bool is_observing() const { return my_proxy.load(std::memory_order_relaxed) != nullptr; } + + //! Entry notification + /** Invoked from inside observe(true) call and whenever a worker enters the arena + this observer is associated with. If a thread is already in the arena when + the observer is activated, the entry notification is called before it + executes the first stolen task. **/ + virtual void on_scheduler_entry( bool /*is_worker*/ ) {} + + //! Exit notification + /** Invoked from inside observe(false) call and whenever a worker leaves the + arena this observer is associated with. **/ + virtual void on_scheduler_exit( bool /*is_worker*/ ) {} + + //! Construct local or global observer in inactive state (observation disabled). + /** For a local observer entry/exit notifications are invoked whenever a worker + thread joins/leaves the arena of the observer's owner thread. If a thread is + already in the arena when the observer is activated, the entry notification is + called before it executes the first stolen task. **/ + explicit task_scheduler_observer() = default; + + //! Construct local observer for a given arena in inactive state (observation disabled). + /** entry/exit notifications are invoked whenever a thread joins/leaves arena. + If a thread is already in the arena when the observer is activated, the entry notification + is called before it executes the first stolen task. **/ + explicit task_scheduler_observer(task_arena& a) : my_task_arena(&a) {} + + /** Destructor protects instance of the observer from concurrent notification. + It is recommended to disable observation before destructor of a derived class starts, + otherwise it can lead to concurrent notification callback on partly destroyed object **/ + virtual ~task_scheduler_observer() { + if (my_proxy.load(std::memory_order_acquire)) { + observe(false); + } + } + + //! Enable or disable observation + /** Warning: concurrent invocations of this method are not safe. + Repeated calls with the same state are no-ops. **/ + void observe(bool state = true) { + if( state && !my_proxy.load(std::memory_order_relaxed) ) { + __TBB_ASSERT( my_busy_count.load(std::memory_order_relaxed) == 0, "Inconsistent state of task_scheduler_observer instance"); + } + r1::observe(*this, state); + } +}; + +} // namespace d1 +} // namespace detail + +inline namespace v1 { + using detail::d1::task_scheduler_observer; +} +} // namespace tbb + + +#endif /* __TBB_task_scheduler_observer_H */ diff --git a/third_party/tbb/task_stream.h b/third_party/tbb/task_stream.h new file mode 100644 index 000000000..54d84446c --- /dev/null +++ b/third_party/tbb/task_stream.h @@ -0,0 +1,287 @@ +// clang-format off +/* + Copyright (c) 2005-2022 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_task_stream_H +#define _TBB_task_stream_H + +//! This file is a possible future replacement for the task_stream class implemented in +//! task_stream.h. It refactors the code and extends task_stream capabilities by moving lane +//! management during operations on caller side. Despite the fact that new implementation should not +//! affect performance of the original task stream, analysis on this subject was not made at the +//! time it was developed. In addition, it is not clearly seen at the moment that this container +//! would be suitable for critical tasks due to linear time complexity on its operations. + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/cache_aligned_allocator.h" +#include "third_party/tbb/mutex.h" + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/misc.h" // for FastRandom + +#include "third_party/libcxx/deque" +#include "third_party/libcxx/climits" +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +//! Essentially, this is just a pair of a queue and a mutex to protect the queue. +/** The reason std::pair is not used is that the code would look less clean + if field names were replaced with 'first' and 'second'. **/ +template< typename T, typename mutex_t > +struct alignas(max_nfs_size) queue_and_mutex { + typedef std::deque< T, cache_aligned_allocator > queue_base_t; + + queue_base_t my_queue{}; + mutex_t my_mutex{}; +}; + +using population_t = uintptr_t; +const population_t one = 1; + +inline void set_one_bit( std::atomic& dest, int pos ) { + __TBB_ASSERT( pos>=0, nullptr); + __TBB_ASSERT( pos& dest, int pos ) { + __TBB_ASSERT( pos>=0, nullptr); + __TBB_ASSERT( pos=0, nullptr); + __TBB_ASSERT( pos +class task_stream_accessor : no_copy { +protected: + using lane_t = queue_and_mutex ; + d1::task* get_item( lane_t::queue_base_t& queue ) { + d1::task* result = queue.front(); + queue.pop_front(); + return result; + } +}; + +template<> +class task_stream_accessor< back_nonnull_accessor > : no_copy { +protected: + using lane_t = queue_and_mutex ; + d1::task* get_item( lane_t::queue_base_t& queue ) { + d1::task* result = nullptr; + __TBB_ASSERT(!queue.empty(), nullptr); + // Isolated task can put zeros in queue see look_specific + do { + result = queue.back(); + queue.pop_back(); + } while ( !result && !queue.empty() ); + return result; + } +}; + +//! The container for "fairness-oriented" aka "enqueued" tasks. +template +class task_stream : public task_stream_accessor< accessor > { + using lane_t = typename task_stream_accessor::lane_t; + std::atomic population{}; + lane_t* lanes{nullptr}; + unsigned N{}; + +public: + task_stream() = default; + + void initialize( unsigned n_lanes ) { + const unsigned max_lanes = sizeof(population_t) * CHAR_BIT; + + N = n_lanes >= max_lanes ? max_lanes : n_lanes > 2 ? 
1 << (tbb::detail::log2(n_lanes - 1) + 1) : 2; + __TBB_ASSERT( N == max_lanes || (N >= n_lanes && ((N - 1) & N) == 0), "number of lanes miscalculated" ); + __TBB_ASSERT( N <= sizeof(population_t) * CHAR_BIT, nullptr); + lanes = static_cast(cache_aligned_allocate(sizeof(lane_t) * N)); + for (unsigned i = 0; i < N; ++i) { + new (lanes + i) lane_t; + } + __TBB_ASSERT( !population.load(std::memory_order_relaxed), nullptr); + } + + ~task_stream() { + if (lanes) { + for (unsigned i = 0; i < N; ++i) { + lanes[i].~lane_t(); + } + cache_aligned_deallocate(lanes); + } + } + + //! Push a task into a lane. Lane selection is performed by passed functor. + template + void push(d1::task* source, const lane_selector_t& next_lane ) { + bool succeed = false; + unsigned lane = 0; + do { + lane = next_lane( /*out_of=*/N ); + __TBB_ASSERT( lane < N, "Incorrect lane index." ); + } while( ! (succeed = try_push( source, lane )) ); + } + + //! Try finding and popping a task using passed functor for lane selection. Last used lane is + //! updated inside lane selector. + template + d1::task* pop( const lane_selector_t& next_lane ) { + d1::task* popped = nullptr; + unsigned lane = 0; + for (atomic_backoff b; !empty() && !popped; b.pause()) { + lane = next_lane( /*out_of=*/N); + __TBB_ASSERT(lane < N, "Incorrect lane index."); + popped = try_pop(lane); + } + return popped; + } + + //! Try finding and popping a related task. + d1::task* pop_specific( unsigned& last_used_lane, isolation_type isolation ) { + d1::task* result = nullptr; + // Lane selection is round-robin in backward direction. + unsigned idx = last_used_lane & (N-1); + do { + if( is_bit_set( population.load(std::memory_order_relaxed), idx ) ) { + lane_t& lane = lanes[idx]; + mutex::scoped_lock lock; + if( lock.try_acquire(lane.my_mutex) && !lane.my_queue.empty() ) { + result = look_specific( lane.my_queue, isolation ); + if( lane.my_queue.empty() ) + clear_one_bit( population, idx ); + if( result ) + break; + } + } + idx=(idx-1)&(N-1); + } while( !empty() && idx != last_used_lane ); + last_used_lane = idx; + return result; + } + + //! Checks existence of a task. + bool empty() { + return !population.load(std::memory_order_relaxed); + } + +private: + //! Returns true on successful push, otherwise - false. + bool try_push(d1::task* source, unsigned lane_idx ) { + mutex::scoped_lock lock; + if( lock.try_acquire( lanes[lane_idx].my_mutex ) ) { + lanes[lane_idx].my_queue.push_back( source ); + set_one_bit( population, lane_idx ); // TODO: avoid atomic op if the bit is already set + return true; + } + return false; + } + + //! Returns pointer to task on successful pop, otherwise - nullptr. + d1::task* try_pop( unsigned lane_idx ) { + if( !is_bit_set( population.load(std::memory_order_relaxed), lane_idx ) ) + return nullptr; + d1::task* result = nullptr; + lane_t& lane = lanes[lane_idx]; + mutex::scoped_lock lock; + if( lock.try_acquire( lane.my_mutex ) && !lane.my_queue.empty() ) { + result = this->get_item( lane.my_queue ); + if( lane.my_queue.empty() ) + clear_one_bit( population, lane_idx ); + } + return result; + } + + // TODO: unify '*_specific' logic with 'pop' methods above + d1::task* look_specific( typename lane_t::queue_base_t& queue, isolation_type isolation ) { + __TBB_ASSERT( !queue.empty(), nullptr); + // TODO: add a worst-case performance test and consider an alternative container with better + // performance for isolation search. 
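+ // The loop below walks the lane from its back end toward the front looking for a
+ // task whose isolation tag matches the requested one. A match found at the very back
+ // is popped outright; a match found deeper inside the deque is replaced with a
+ // nullptr placeholder, which the back_nonnull_accessor specialization of get_item()
+ // skips later.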
+ typename lane_t::queue_base_t::iterator curr = queue.end(); + do { + // TODO: consider logic from get_task to simplify the code. + d1::task* result = *--curr; + if( result && task_accessor::isolation(*result) == isolation ) { + if( queue.end() - curr == 1 ) + queue.pop_back(); // a little of housekeeping along the way + else + *curr = nullptr; // grabbing task with the same isolation + // TODO: move one of the container's ends instead if the task has been found there + return result; + } + } while( curr != queue.begin() ); + return nullptr; + } + +}; // task_stream + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_task_stream_H */ diff --git a/third_party/tbb/tbb.h b/third_party/tbb/tbb.h new file mode 100644 index 000000000..f83d7791f --- /dev/null +++ b/third_party/tbb/tbb.h @@ -0,0 +1,75 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_tbb_H +#define __TBB_tbb_H + +/** + This header bulk-includes declarations or definitions of all the functionality + provided by TBB (save for tbbmalloc and 3rd party dependent headers). + + If you use only a few TBB constructs, consider including specific headers only. + Any header listed below can be included independently of others. 
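+
+ For example, a translation unit that only needs a parallel loop can include
+ "third_party/tbb/parallel_for.h" (and, if needed, "third_party/tbb/blocked_range.h")
+ directly, rather than pulling in this umbrella header.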
+**/ + +// MISSING #include "oneapi/tbb/blocked_range.h" +// MISSING #include "oneapi/tbb/blocked_range2d.h" +// MISSING #include "oneapi/tbb/blocked_range3d.h" +#if TBB_PREVIEW_BLOCKED_RANGE_ND +// MISSING #include "tbb/blocked_rangeNd.h" +#endif +// MISSING #include "oneapi/tbb/cache_aligned_allocator.h" +// MISSING #include "oneapi/tbb/combinable.h" +// MISSING #include "oneapi/tbb/concurrent_hash_map.h" +#if TBB_PREVIEW_CONCURRENT_LRU_CACHE +// MISSING #include "tbb/concurrent_lru_cache.h" +#endif +// MISSING #include "oneapi/tbb/collaborative_call_once.h" +// MISSING #include "oneapi/tbb/concurrent_priority_queue.h" +// MISSING #include "oneapi/tbb/concurrent_queue.h" +// MISSING #include "oneapi/tbb/concurrent_unordered_map.h" +// MISSING #include "oneapi/tbb/concurrent_unordered_set.h" +// MISSING #include "oneapi/tbb/concurrent_map.h" +// MISSING #include "oneapi/tbb/concurrent_set.h" +// MISSING #include "oneapi/tbb/concurrent_vector.h" +// MISSING #include "oneapi/tbb/enumerable_thread_specific.h" +// MISSING #include "oneapi/tbb/flow_graph.h" +// MISSING #include "oneapi/tbb/global_control.h" +// MISSING #include "oneapi/tbb/info.h" +// MISSING #include "oneapi/tbb/null_mutex.h" +// MISSING #include "oneapi/tbb/null_rw_mutex.h" +// MISSING #include "oneapi/tbb/parallel_for.h" +// MISSING #include "oneapi/tbb/parallel_for_each.h" +// MISSING #include "oneapi/tbb/parallel_invoke.h" +// MISSING #include "oneapi/tbb/parallel_pipeline.h" +// MISSING #include "oneapi/tbb/parallel_reduce.h" +// MISSING #include "oneapi/tbb/parallel_scan.h" +// MISSING #include "oneapi/tbb/parallel_sort.h" +// MISSING #include "oneapi/tbb/partitioner.h" +// MISSING #include "oneapi/tbb/queuing_mutex.h" +// MISSING #include "oneapi/tbb/queuing_rw_mutex.h" +// MISSING #include "oneapi/tbb/spin_mutex.h" +// MISSING #include "oneapi/tbb/spin_rw_mutex.h" +// MISSING #include "oneapi/tbb/task.h" +// MISSING #include "oneapi/tbb/task_arena.h" +// MISSING #include "oneapi/tbb/task_group.h" +// MISSING #include "oneapi/tbb/task_scheduler_observer.h" +// MISSING #include "oneapi/tbb/tbb_allocator.h" +// MISSING #include "oneapi/tbb/tick_count.h" +// MISSING #include "oneapi/tbb/version.h" + +#endif /* __TBB_tbb_H */ diff --git a/third_party/tbb/tbb.mk b/third_party/tbb/tbb.mk new file mode 100644 index 000000000..b565bf3d8 --- /dev/null +++ b/third_party/tbb/tbb.mk @@ -0,0 +1,43 @@ +#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐ +#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘ + +PKGS += THIRD_PARTY_TBB + +THIRD_PARTY_TBB_ARTIFACTS += THIRD_PARTY_TBB_A +THIRD_PARTY_TBB = $(THIRD_PARTY_TBB_A_DEPS) $(THIRD_PARTY_TBB_A) +THIRD_PARTY_TBB_A = o/$(MODE)/third_party/tbb/tbb.a +THIRD_PARTY_TBB_FILES := $(wildcard third_party/tbb/*) $(wildcard third_party/tbb/detail/*) +THIRD_PARTY_TBB_HDRS = $(filter %.h,$(THIRD_PARTY_TBB_FILES)) +THIRD_PARTY_TBB_SRCS = $(filter %.cpp,$(THIRD_PARTY_TBB_FILES)) +THIRD_PARTY_TBB_OBJS = $(THIRD_PARTY_TBB_SRCS:%.cpp=o/$(MODE)/%.o) + +# Use this to debug +# $(info $$THIRD_PARTY_TBB_HDRS is [${THIRD_PARTY_TBB_HDRS}]) + +THIRD_PARTY_TBB_CHECKS = \ + $(THIRD_PARTY_TBB_A).pkg \ + $(THIRD_PARTY_TBB_HDRS:%=o/$(MODE)/%.ok) + +THIRD_PARTY_TBB_A_DIRECTDEPS = \ + THIRD_PARTY_LIBCXX + +THIRD_PARTY_TBB_A_DEPS := \ + $(call uniq,$(foreach x,$(THIRD_PARTY_TBB_A_DIRECTDEPS),$($(x)))) + +$(THIRD_PARTY_TBB_A): \ + third_party/tbb/ \ + $(THIRD_PARTY_TBB_A).pkg \ + $(THIRD_PARTY_TBB_OBJS) + +$(THIRD_PARTY_TBB_A).pkg: \ + $(THIRD_PARTY_TBB_OBJS) \ + $(foreach 
x,$(THIRD_PARTY_TBB_A_DIRECTDEPS),$($(x)_A).pkg) + +THIRD_PARTY_TBB_LIBS = $(THIRD_PARTY_TBB_A) + +$(THIRD_PARTY_TBB_OBJS): $(BUILD_FILES) third_party/tbb/tbb.mk + +.PHONY: o/$(MODE)/third_party/tbb +o/$(MODE)/third_party/tbb: \ + $(THIRD_PARTY_TBB_CHECKS) \ + $(THIRD_PARTY_TBB_A) diff --git a/third_party/tbb/tbb.rc b/third_party/tbb/tbb.rc new file mode 100644 index 000000000..a60744cfd --- /dev/null +++ b/third_party/tbb/tbb.rc @@ -0,0 +1,75 @@ +// clang-format off +// Copyright (c) 2005-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///////////////////////////////////////////////////////////////////////////// +// +// Includes +// +// MISSING #include +// MISSING #include "../../include/oneapi/tbb/version.h" + +///////////////////////////////////////////////////////////////////////////// +// Neutral resources + +#ifdef _WIN32 +LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL +#pragma code_page(1252) +#endif //_WIN32 + +///////////////////////////////////////////////////////////////////////////// +// +// Version +// +#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH +#define TBB_VERSION TBB_VERSION_STRING + +VS_VERSION_INFO VERSIONINFO + FILEVERSION TBB_VERNUMBERS + PRODUCTVERSION TBB_VERNUMBERS + FILEFLAGSMASK 0x17L +#ifdef _DEBUG + FILEFLAGS 0x1L +#else + FILEFLAGS 0x0L +#endif + FILEOS 0x40004L + FILETYPE 0x2L + FILESUBTYPE 0x0L +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "000004b0" + BEGIN + VALUE "CompanyName", "Intel Corporation\0" + VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" + VALUE "FileVersion", TBB_VERSION "\0" + VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalTrademarks", "\0" +#ifndef TBB_USE_DEBUG + VALUE "OriginalFilename", "tbb12.dll\0" +#else + VALUE "OriginalFilename", "tbb12_debug.dll\0" +#endif + VALUE "ProductName", "oneAPI Threading Building Blocks (oneTBB)\0" + VALUE "ProductVersion", TBB_VERSION "\0" + VALUE "PrivateBuild", "\0" + VALUE "SpecialBuild", "\0" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x0, 1200 + END +END diff --git a/third_party/tbb/tbb_allocator.h b/third_party/tbb/tbb_allocator.h new file mode 100644 index 000000000..0284dfb89 --- /dev/null +++ b/third_party/tbb/tbb_allocator.h @@ -0,0 +1,127 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_tbb_allocator_H +#define __TBB_tbb_allocator_H + +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/detail/_namespace_injection.h" +#include "third_party/libcxx/cstdlib" +#include "third_party/libcxx/utility" + +#if __TBB_CPP17_MEMORY_RESOURCE_PRESENT +// MISSING #include +#endif + +namespace tbb { +namespace detail { + +namespace r1 { +TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size); +TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate_memory(void* p); +TBB_EXPORT bool __TBB_EXPORTED_FUNC is_tbbmalloc_used(); +} + +namespace d1 { + +template +class tbb_allocator { +public: + using value_type = T; + using propagate_on_container_move_assignment = std::true_type; + + //! Always defined for TBB containers (supported since C++17 for std containers) + using is_always_equal = std::true_type; + + //! Specifies current allocator + enum malloc_type { + scalable, + standard + }; + + tbb_allocator() = default; + template tbb_allocator(const tbb_allocator&) noexcept {} + + //! Allocate space for n objects. + __TBB_nodiscard T* allocate(std::size_t n) { + return static_cast(r1::allocate_memory(n * sizeof(value_type))); + } + + //! Free previously allocated block of memory. + void deallocate(T* p, std::size_t) { + r1::deallocate_memory(p); + } + + //! Returns current allocator + static malloc_type allocator_type() { + return r1::is_tbbmalloc_used() ? standard : scalable; + } + +#if TBB_ALLOCATOR_TRAITS_BROKEN + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using difference_type = std::ptrdiff_t; + using size_type = std::size_t; + template struct rebind { + using other = tbb_allocator; + }; + //! Largest value for which method allocate might succeed. + size_type max_size() const noexcept { + size_type max = ~(std::size_t(0)) / sizeof(value_type); + return (max > 0 ? max : 1); + } + template + void construct(U *p, Args&&... args) + { ::new (p) U(std::forward(args)...); } + void destroy( pointer p ) { p->~value_type(); } + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } +#endif // TBB_ALLOCATOR_TRAITS_BROKEN +}; + +#if TBB_ALLOCATOR_TRAITS_BROKEN + template<> + class tbb_allocator { + public: + using pointer = void*; + using const_pointer = const void*; + using value_type = void; + template struct rebind { + using other = tbb_allocator; + }; + }; +#endif + +template +inline bool operator==(const tbb_allocator&, const tbb_allocator&) noexcept { return true; } + +#if !__TBB_CPP20_COMPARISONS_PRESENT +template +inline bool operator!=(const tbb_allocator&, const tbb_allocator&) noexcept { return false; } +#endif + +} // namespace d1 +} // namespace detail + +inline namespace v1 { +using detail::d1::tbb_allocator; +} // namespace v1 +} // namespace tbb + +#endif /* __TBB_tbb_allocator_H */ diff --git a/third_party/tbb/tbbmalloc_proxy.h b/third_party/tbb/tbbmalloc_proxy.h new file mode 100644 index 000000000..cf262a207 --- /dev/null +++ b/third_party/tbb/tbbmalloc_proxy.h @@ -0,0 +1,66 @@ +// clang-format off +/* + Copyright (c) 2005-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/* +Replacing the standard memory allocation routines in Microsoft* C/C++ RTL +(malloc/free, global new/delete, etc.) with the TBB memory allocator. + +Include the following header to a source of any binary which is loaded during +application startup + +// MISSING #include "oneapi/tbb/tbbmalloc_proxy.h" + +or add following parameters to the linker options for the binary which is +loaded during application startup. It can be either exe-file or dll. + +For win32 +tbbmalloc_proxy.lib /INCLUDE:"___TBB_malloc_proxy" +win64 +tbbmalloc_proxy.lib /INCLUDE:"__TBB_malloc_proxy" +*/ + +#ifndef __TBB_tbbmalloc_proxy_H +#define __TBB_tbbmalloc_proxy_H + +#if _MSC_VER + +#ifdef _DEBUG + #pragma comment(lib, "tbbmalloc_proxy_debug.lib") +#else + #pragma comment(lib, "tbbmalloc_proxy.lib") +#endif + +#if defined(_WIN64) + #pragma comment(linker, "/include:__TBB_malloc_proxy") +#else + #pragma comment(linker, "/include:___TBB_malloc_proxy") +#endif + +#else +/* Primarily to support MinGW */ + +extern "C" void __TBB_malloc_proxy(); +struct __TBB_malloc_proxy_caller { + __TBB_malloc_proxy_caller() { __TBB_malloc_proxy(); } +} volatile __TBB_malloc_proxy_helper_object; + +#endif // _MSC_VER + +/* Public Windows API */ +extern "C" int TBB_malloc_replacement_log(char *** function_replacement_log_ptr); + +#endif //__TBB_tbbmalloc_proxy_H diff --git a/third_party/tbb/thread_control_monitor.h b/third_party/tbb/thread_control_monitor.h new file mode 100644 index 000000000..06e2755af --- /dev/null +++ b/third_party/tbb/thread_control_monitor.h @@ -0,0 +1,117 @@ +// clang-format off +/* + Copyright (c) 2021-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_thread_control_monitor_H +#define __TBB_thread_control_monitor_H + +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/scheduler_common.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +struct market_context { + market_context() = default; + + market_context(std::uintptr_t first_addr, arena* a) : + my_uniq_addr(first_addr), my_arena_addr(a) + {} + + std::uintptr_t my_uniq_addr{0}; + arena* my_arena_addr{nullptr}; +}; + +#if __TBB_RESUMABLE_TASKS +class resume_node : public wait_node { + using base_type = wait_node; +public: + resume_node(market_context ctx, execution_data_ext& ed_ext, task_dispatcher& target) + : base_type(ctx), my_curr_dispatcher(ed_ext.task_disp), my_target_dispatcher(&target) + , my_suspend_point(my_curr_dispatcher->get_suspend_point()) + {} + + ~resume_node() override { + if (this->my_skipped_wakeup) { + spin_wait_until_eq(this->my_notify_calls, 1); + } + + poison_pointer(my_curr_dispatcher); + poison_pointer(my_target_dispatcher); + poison_pointer(my_suspend_point); + } + + void init() override { + base_type::init(); + } + + void wait() override { + my_curr_dispatcher->resume(*my_target_dispatcher); + __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?"); + } + + void reset() override { + base_type::reset(); + spin_wait_until_eq(this->my_notify_calls, 1); + my_notify_calls.store(0, std::memory_order_relaxed); + } + + // notify is called (perhaps, concurrently) twice from: + // - concurrent_monitor::notify + // - post_resume_action::register_waiter + // The second notify is called after thread switches the stack + // (Because we can not call resume while the stack is occupied) + // We need calling resume only when both notifications are performed. + void notify() override { + if (++my_notify_calls == 2) { + r1::resume(my_suspend_point); + } + } + +private: + friend class thread_data; + friend struct suspend_point_type::resume_task; + task_dispatcher* my_curr_dispatcher; + task_dispatcher* my_target_dispatcher; + suspend_point_type* my_suspend_point; + std::atomic my_notify_calls{0}; +}; +#endif // __TBB_RESUMABLE_TASKS + +class thread_control_monitor : public concurrent_monitor_base { + using base_type = concurrent_monitor_base; +public: + using base_type::base_type; + + ~thread_control_monitor() { + destroy(); + } + + /** per-thread descriptor for concurrent_monitor */ + using thread_context = sleep_node; +#if __TBB_RESUMABLE_TASKS + using resume_context = resume_node; +#endif +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_thread_control_monitor_H diff --git a/third_party/tbb/thread_data.h b/third_party/tbb/thread_data.h new file mode 100644 index 000000000..638e87e2f --- /dev/null +++ b/third_party/tbb/thread_data.h @@ -0,0 +1,260 @@ +// clang-format off +/* + Copyright (c) 2020-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef __TBB_thread_data_H +#define __TBB_thread_data_H + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/task.h" + +#include "third_party/tbb/rml_base.h" // rml::job + +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/concurrent_monitor.h" +#include "third_party/tbb/mailbox.h" +#include "third_party/tbb/misc.h" // FastRandom +#include "third_party/tbb/small_object_pool_impl.h" +#include "third_party/tbb/intrusive_list.h" + +#include "third_party/libcxx/atomic" + +namespace tbb { +namespace detail { +namespace r1 { + +class task; +class arena_slot; +class task_group_context; +class task_dispatcher; +class thread_dispatcher_client; + +class context_list : public intrusive_list { +public: + bool orphaned{false}; + + //! Last state propagation epoch known to this thread + /** Together with the_context_state_propagation_epoch constitute synchronization protocol + that keeps hot path of task group context construction destruction mostly + lock-free. + When local epoch equals the global one, the state of task group contexts + registered with this thread is consistent with that of the task group trees + they belong to. **/ + std::atomic epoch{}; + + //! Mutex protecting access to the list of task group contexts. + d1::mutex m_mutex{}; + + void destroy() { + this->~context_list(); + cache_aligned_deallocate(this); + } + + void remove(d1::intrusive_list_node& val) { + mutex::scoped_lock lock(m_mutex); + + intrusive_list::remove(val); + + if (orphaned && empty()) { + lock.release(); + destroy(); + } + } + + void push_front(d1::intrusive_list_node& val) { + mutex::scoped_lock lock(m_mutex); + + intrusive_list::push_front(val); + } + + void orphan() { + mutex::scoped_lock lock(m_mutex); + + orphaned = true; + if (empty()) { + lock.release(); + destroy(); + } + } +}; + +//------------------------------------------------------------------------ +// Thread Data +//------------------------------------------------------------------------ +class thread_data : public ::rml::job + , public d1::intrusive_list_node + , no_copy { +public: + thread_data(unsigned short index, bool is_worker) + : my_arena_index{ index } + , my_is_worker{ is_worker } + , my_task_dispatcher{ nullptr } + , my_arena{ nullptr } + , my_last_client{ nullptr } + , my_arena_slot{} + , my_random{ this } + , my_last_observer{ nullptr } + , my_small_object_pool{new (cache_aligned_allocate(sizeof(small_object_pool_impl))) small_object_pool_impl{}} + , my_context_list(new (cache_aligned_allocate(sizeof(context_list))) context_list{}) +#if __TBB_RESUMABLE_TASKS + , my_post_resume_action{ task_dispatcher::post_resume_action::none } + , my_post_resume_arg{nullptr} +#endif /* __TBB_RESUMABLE_TASKS */ + { + ITT_SYNC_CREATE(&my_context_list->m_mutex, SyncType_Scheduler, SyncObj_ContextsList); + } + + ~thread_data() { + my_context_list->orphan(); + my_small_object_pool->destroy(); + poison_pointer(my_task_dispatcher); + poison_pointer(my_arena); + poison_pointer(my_arena_slot); + poison_pointer(my_last_observer); + poison_pointer(my_small_object_pool); + poison_pointer(my_context_list); +#if __TBB_RESUMABLE_TASKS + poison_pointer(my_post_resume_arg); +#endif /* __TBB_RESUMABLE_TASKS */ + } + + void attach_arena(arena& a, std::size_t index); + bool is_attached_to(arena*); + void attach_task_dispatcher(task_dispatcher&); + void detach_task_dispatcher(); + void enter_task_dispatcher(task_dispatcher& task_disp, std::uintptr_t stealing_threshold); + void 
leave_task_dispatcher(); + void propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, uint32_t new_state); + + //! Index of the arena slot the scheduler occupies now, or occupied last time + unsigned short my_arena_index; + + //! Indicates if the thread is created by RML + const bool my_is_worker; + + //! The current task dipsatcher + task_dispatcher* my_task_dispatcher; + + //! The arena that I own (if external thread) or am servicing at the moment (if worker) + arena* my_arena; + + thread_dispatcher_client* my_last_client; + + //! Pointer to the slot in the arena we own at the moment + arena_slot* my_arena_slot; + + //! The mailbox (affinity mechanism) the current thread attached to + mail_inbox my_inbox; + + //! The random generator + FastRandom my_random; + + //! Last observer in the observers list processed on this slot + observer_proxy* my_last_observer; + + //! Pool of small object for fast task allocation + small_object_pool_impl* my_small_object_pool; + + context_list* my_context_list; +#if __TBB_RESUMABLE_TASKS + //! Suspends the current coroutine (task_dispatcher). + void suspend(void* suspend_callback, void* user_callback); + + //! Resumes the target task_dispatcher. + void resume(task_dispatcher& target); + + //! Set post resume action to perform after resume. + void set_post_resume_action(task_dispatcher::post_resume_action pra, void* arg) { + __TBB_ASSERT(my_post_resume_action == task_dispatcher::post_resume_action::none, "The Post resume action must not be set"); + __TBB_ASSERT(!my_post_resume_arg, "The post resume action must not have an argument"); + my_post_resume_action = pra; + my_post_resume_arg = arg; + } + + void clear_post_resume_action() { + my_post_resume_action = task_dispatcher::post_resume_action::none; + my_post_resume_arg = nullptr; + } + + //! The post resume action requested after the swap contexts. + task_dispatcher::post_resume_action my_post_resume_action; + + //! The post resume action argument. + void* my_post_resume_arg; +#endif /* __TBB_RESUMABLE_TASKS */ + + //! The default context + // TODO: consider using common default context because it is used only to simplify + // cancellation check. 
+ d1::task_group_context my_default_context; +}; + +inline void thread_data::attach_arena(arena& a, std::size_t index) { + my_arena = &a; + my_arena_index = static_cast(index); + my_arena_slot = a.my_slots + index; + // Read the current slot mail_outbox and attach it to the mail_inbox (remove inbox later maybe) + my_inbox.attach(my_arena->mailbox(index)); +} + +inline bool thread_data::is_attached_to(arena* a) { return my_arena == a; } + +inline void thread_data::attach_task_dispatcher(task_dispatcher& task_disp) { + __TBB_ASSERT(my_task_dispatcher == nullptr, nullptr); + __TBB_ASSERT(task_disp.m_thread_data == nullptr, nullptr); + task_disp.m_thread_data = this; + my_task_dispatcher = &task_disp; +} + +inline void thread_data::detach_task_dispatcher() { + __TBB_ASSERT(my_task_dispatcher != nullptr, nullptr); + __TBB_ASSERT(my_task_dispatcher->m_thread_data == this, nullptr); + my_task_dispatcher->m_thread_data = nullptr; + my_task_dispatcher = nullptr; +} + +inline void thread_data::enter_task_dispatcher(task_dispatcher& task_disp, std::uintptr_t stealing_threshold) { + task_disp.set_stealing_threshold(stealing_threshold); + attach_task_dispatcher(task_disp); +} + +inline void thread_data::leave_task_dispatcher() { + my_task_dispatcher->set_stealing_threshold(0); + detach_task_dispatcher(); +} + +inline void thread_data::propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) { + mutex::scoped_lock lock(my_context_list->m_mutex); + // Acquire fence is necessary to ensure that the subsequent node->my_next load + // returned the correct value in case it was just inserted in another thread. + // The fence also ensures visibility of the correct ctx.my_parent value. + for (context_list::iterator it = my_context_list->begin(); it != my_context_list->end(); ++it) { + d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, &(*it)); + if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state) + task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state); + } + // Sync up local propagation epoch with the global one. Release fence prevents + // reordering of possible store to *mptr_state after the sync point. + my_context_list->epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // __TBB_thread_data_H + diff --git a/third_party/tbb/thread_dispatcher.cpp b/third_party/tbb/thread_dispatcher.cpp new file mode 100644 index 000000000..6562d8d10 --- /dev/null +++ b/third_party/tbb/thread_dispatcher.cpp @@ -0,0 +1,225 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/thread_dispatcher.h" +#include "third_party/tbb/threading_control.h" + +namespace tbb { +namespace detail { +namespace r1 { + +thread_dispatcher::thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size) + : my_threading_control(tc) + , my_num_workers_hard_limit(hard_limit) + , my_stack_size(stack_size) +{ + my_server = governor::create_rml_server( *this ); + __TBB_ASSERT( my_server, "Failed to create RML server" ); +} + +thread_dispatcher::~thread_dispatcher() { + poison_pointer(my_server); +} + +thread_dispatcher_client* thread_dispatcher::select_next_client(thread_dispatcher_client* hint) { + unsigned next_client_priority_level = num_priority_levels; + if (hint) { + next_client_priority_level = hint->priority_level(); + } + + for (unsigned idx = 0; idx < next_client_priority_level; ++idx) { + if (!my_client_list[idx].empty()) { + return &*my_client_list[idx].begin(); + } + } + + return hint; +} + +thread_dispatcher_client* thread_dispatcher::create_client(arena& a) { + return new (cache_aligned_allocate(sizeof(thread_dispatcher_client))) thread_dispatcher_client(a, my_clients_aba_epoch); +} + + +void thread_dispatcher::register_client(thread_dispatcher_client* client) { + client_list_mutex_type::scoped_lock lock(my_list_mutex); + insert_client(*client); +} + +bool thread_dispatcher::try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority) { + __TBB_ASSERT(client, nullptr); + // we hold reference to the server, so market cannot be destroyed at any moment here + __TBB_ASSERT(!is_poisoned(my_server), nullptr); + my_list_mutex.lock(); + for (auto& it : my_client_list[priority]) { + if (client == &it) { + if (it.get_aba_epoch() == aba_epoch) { + // Client is alive + // Acquire my_references to sync with threads that just left the arena + // Pay attention that references should be read before workers_requested because + // if references is no zero some other thread might call adjust_demand and lead to + // a race over workers_requested + if (!client->references() && !client->has_request()) { + // Client is abandoned. Destroy it. + remove_client(*client); + ++my_clients_aba_epoch; + + my_list_mutex.unlock(); + destroy_client(client); + + return true; + } + } + break; + } + } + my_list_mutex.unlock(); + return false; +} + +void thread_dispatcher::destroy_client(thread_dispatcher_client* client) { + client->~thread_dispatcher_client(); + cache_aligned_deallocate(client); +} + +// Should be called under lock +void thread_dispatcher::insert_client(thread_dispatcher_client& client) { + __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); + my_client_list[client.priority_level()].push_front(client); + + __TBB_ASSERT(!my_next_client || my_next_client->priority_level() < num_priority_levels, nullptr); + my_next_client = select_next_client(my_next_client); +} + +// Should be called under lock +void thread_dispatcher::remove_client(thread_dispatcher_client& client) { + __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); + my_client_list[client.priority_level()].remove(client); + + if (my_next_client == &client) { + my_next_client = nullptr; + } + my_next_client = select_next_client(my_next_client); +} + +bool thread_dispatcher::is_client_alive(thread_dispatcher_client* client) { + if (!client) { + return false; + } + + // Still cannot access internals of the client since the object itself might be destroyed. 
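+ // The liveness check is therefore done purely by pointer identity: walk every
+ // priority list (my_list_mutex is expected to be held by the caller, see
+ // client_in_need) and report the client alive only if its address is still present.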
+ for (auto& priority_list : my_client_list) { + for (auto& c : priority_list) { + if (client == &c) { + return true; + } + } + } + return false; +} + +thread_dispatcher_client* thread_dispatcher::client_in_need(client_list_type* clients, thread_dispatcher_client* hint) { + // TODO: make sure client with higher priority returned only if there are available slots in it. + hint = select_next_client(hint); + if (!hint) { + return nullptr; + } + + client_list_type::iterator it = hint; + unsigned curr_priority_level = hint->priority_level(); + __TBB_ASSERT(it != clients[curr_priority_level].end(), nullptr); + do { + thread_dispatcher_client& t = *it; + if (++it == clients[curr_priority_level].end()) { + do { + ++curr_priority_level %= num_priority_levels; + } while (clients[curr_priority_level].empty()); + it = clients[curr_priority_level].begin(); + } + if (t.try_join()) { + return &t; + } + } while (it != hint); + return nullptr; +} + +thread_dispatcher_client* thread_dispatcher::client_in_need(thread_dispatcher_client* prev) { + client_list_mutex_type::scoped_lock lock(my_list_mutex, /*is_writer=*/false); + if (is_client_alive(prev)) { + return client_in_need(my_client_list, prev); + } + return client_in_need(my_client_list, my_next_client); +} + +void thread_dispatcher::adjust_job_count_estimate(int delta) { + my_server->adjust_job_count_estimate(delta); +} + +void thread_dispatcher::release(bool blocking_terminate) { + my_join_workers = blocking_terminate; + my_server->request_close_connection(); +} + +void thread_dispatcher::process(job& j) { + thread_data& td = static_cast(j); + // td.my_last_client can be dead. Don't access it until client_in_need is called + thread_dispatcher_client* client = td.my_last_client; + for (int i = 0; i < 2; ++i) { + while ((client = client_in_need(client)) ) { + td.my_last_client = client; + client->process(td); + } + // Workers leave thread_dispatcher because there is no client in need. It can happen earlier than + // adjust_job_count_estimate() decreases my_slack and RML can put this thread to sleep. + // It might result in a busy-loop checking for my_slack<0 and calling this method instantly. + // the yield refines this spinning. + if ( !i ) { + yield(); + } + } +} + + +//! Used when RML asks for join mode during workers termination. +bool thread_dispatcher::must_join_workers() const { return my_join_workers; } + +//! Returns the requested stack size of worker threads. 
+std::size_t thread_dispatcher::worker_stack_size() const { return my_stack_size; } + +void thread_dispatcher::acknowledge_close_connection() { + my_threading_control.destroy(); +} + +::rml::job* thread_dispatcher::create_one_job() { + unsigned short index = ++my_first_unused_worker_idx; + __TBB_ASSERT(index > 0, nullptr); + ITT_THREAD_SET_NAME(_T("TBB Worker Thread")); + // index serves as a hint decreasing conflicts between workers when they migrate between arenas + thread_data* td = new (cache_aligned_allocate(sizeof(thread_data))) thread_data{ index, true }; + __TBB_ASSERT(index <= my_num_workers_hard_limit, nullptr); + my_threading_control.register_thread(*td); + return td; +} + +void thread_dispatcher::cleanup(job& j) { + my_threading_control.unregister_thread(static_cast(j)); + governor::auto_terminate(&j); +} + +} // namespace r1 +} // namespace detail +} // namespace tbb diff --git a/third_party/tbb/thread_dispatcher.h b/third_party/tbb/thread_dispatcher.h new file mode 100644 index 000000000..85f3d4766 --- /dev/null +++ b/third_party/tbb/thread_dispatcher.h @@ -0,0 +1,107 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef _TBB_thread_dispatcher_H +#define _TBB_thread_dispatcher_H + +#include "third_party/tbb/detail/_config.h" +#include "third_party/tbb/detail/_utils.h" +#include "third_party/tbb/rw_mutex.h" +#include "third_party/tbb/task_arena.h" + +#include "third_party/tbb/arena.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_data.h" +#include "third_party/tbb/rml_tbb.h" +#include "third_party/tbb/thread_dispatcher_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class threading_control_impl; + +class thread_dispatcher : no_copy, rml::tbb_client { + using client_list_type = intrusive_list; + using client_list_mutex_type = d1::rw_mutex; +public: + thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size); + ~thread_dispatcher(); + + thread_dispatcher_client* create_client(arena& a); + void register_client(thread_dispatcher_client* client); + bool try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority); + + void adjust_job_count_estimate(int delta); + void release(bool blocking_terminate); + void process(job& j) override; + //! Used when RML asks for join mode during workers termination. + bool must_join_workers() const; + //! Returns the requested stack size of worker threads. 
+ std::size_t worker_stack_size() const; + +private: + version_type version () const override { return 0; } + unsigned max_job_count () const override { return my_num_workers_hard_limit; } + std::size_t min_stack_size () const override { return worker_stack_size(); } + void cleanup(job& j) override; + void acknowledge_close_connection() override; + ::rml::job* create_one_job() override; + + thread_dispatcher_client* select_next_client(thread_dispatcher_client* hint); + void destroy_client(thread_dispatcher_client* client); + void insert_client(thread_dispatcher_client& client); + void remove_client(thread_dispatcher_client& client); + bool is_client_alive(thread_dispatcher_client* client); + thread_dispatcher_client* client_in_need(client_list_type* clients, thread_dispatcher_client* hint); + thread_dispatcher_client* client_in_need(thread_dispatcher_client* prev); + + friend class threading_control_impl; + static constexpr unsigned num_priority_levels = d1::num_priority_levels; + client_list_mutex_type my_list_mutex; + client_list_type my_client_list[num_priority_levels]; + + thread_dispatcher_client* my_next_client{nullptr}; + + //! Shutdown mode + bool my_join_workers{false}; + + threading_control& my_threading_control; + + //! ABA prevention marker to assign to newly created clients + std::atomic my_clients_aba_epoch{0}; + + //! Maximal number of workers allowed for use by the underlying resource manager + /** It can't be changed after thread_dispatcher creation. **/ + unsigned my_num_workers_hard_limit{0}; + + //! Stack size of worker threads + std::size_t my_stack_size{0}; + + //! First unused index of worker + /** Used to assign indices to the new workers coming from RML **/ + std::atomic my_first_unused_worker_idx{0}; + + //! Pointer to the RML server object that services this TBB instance. + rml::tbb_server* my_server{nullptr}; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_dispatcher_H diff --git a/third_party/tbb/thread_dispatcher_client.h b/third_party/tbb/thread_dispatcher_client.h new file mode 100644 index 000000000..7c95b5118 --- /dev/null +++ b/third_party/tbb/thread_dispatcher_client.h @@ -0,0 +1,65 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_thread_dispatcher_client_H +#define _TBB_thread_dispatcher_client_H + +#include "third_party/tbb/detail/_intrusive_list_node.h" +#include "third_party/tbb/arena.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_dispatcher_client : public d1::intrusive_list_node /* Need for list in thread pool */ { +public: + thread_dispatcher_client(arena& a, std::uint64_t aba_epoch) : my_arena(a), my_aba_epoch(aba_epoch) {} + + // Interface of communication with thread pool + bool try_join() { + return my_arena.try_join(); + } + void process(thread_data& td) { + my_arena.process(td); + } + + unsigned priority_level() { + return my_arena.priority_level(); + } + + std::uint64_t get_aba_epoch() { + return my_aba_epoch; + } + + unsigned references() { + return my_arena.references(); + } + + bool has_request() { + return my_arena.has_request(); + } + +private: + arena& my_arena; + std::uint64_t my_aba_epoch; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_dispatcher_client_H diff --git a/third_party/tbb/thread_request_serializer.cpp b/third_party/tbb/thread_request_serializer.cpp new file mode 100644 index 000000000..534f720ce --- /dev/null +++ b/third_party/tbb/thread_request_serializer.cpp @@ -0,0 +1,139 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/misc.h" +#include "third_party/tbb/thread_request_serializer.h" + +namespace tbb { +namespace detail { +namespace r1 { + +thread_request_serializer::thread_request_serializer(thread_dispatcher& td, int soft_limit) + : my_thread_dispatcher(td) + , my_soft_limit(soft_limit) +{} + +void thread_request_serializer::update(int delta) { + constexpr std::uint64_t delta_mask = (pending_delta_base << 1) - 1; + constexpr std::uint64_t counter_value = delta_mask + 1; + + int prev_pending_delta = my_pending_delta.fetch_add(counter_value + delta); + + // There is a pseudo request aggregator, so only thread that see pending_delta_base in my_pending_delta + // Will enter to critical section and call adjust_job_count_estimate + if (prev_pending_delta == pending_delta_base) { + delta = int(my_pending_delta.exchange(pending_delta_base) & delta_mask) - int(pending_delta_base); + mutex_type::scoped_lock lock(my_mutex); + my_total_request += delta; + delta = limit_delta(delta, my_soft_limit, my_total_request); + my_thread_dispatcher.adjust_job_count_estimate(delta); + } +} + +void thread_request_serializer::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex); + int delta = soft_limit - my_soft_limit; + delta = limit_delta(delta, my_total_request, soft_limit); + my_thread_dispatcher.adjust_job_count_estimate(delta); + my_soft_limit = soft_limit; +} + +int thread_request_serializer::limit_delta(int delta, int limit, int new_value) { + // This method can be described with such pseudocode: + // bool above_limit = prev_value >= limit && new_value >= limit; + // bool below_limit = prev_value <= limit && new_value <= limit; + // enum request_type { ABOVE_LIMIT, CROSS_LIMIT, BELOW_LIMIT }; + // request = above_limit ? ABOVE_LIMIT : below_limit ? BELOW_LIMIT : CROSS_LIMIT; + + // switch (request) { + // case ABOVE_LIMIT: + // delta = 0; + // case CROSS_LIMIT: + // delta = delta > 0 ? 
limit - prev_value : new_value - limit; + // case BELOW_LIMIT: + // // No changes to delta + // } + + int prev_value = new_value - delta; + + // actual new_value and prev_value cannot exceed the limit + new_value = min(limit, new_value); + prev_value = min(limit, prev_value); + return new_value - prev_value; +} + + +thread_request_serializer_proxy::thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit) : my_serializer(td, soft_limit) +{} + +void thread_request_serializer_proxy::register_mandatory_request(int mandatory_delta) { + if (mandatory_delta != 0) { + mutex_type::scoped_lock lock(my_mutex, /* is_write = */ false); + int prev_value = my_num_mandatory_requests.fetch_add(mandatory_delta); + + const bool should_try_enable = mandatory_delta > 0 && prev_value == 0; + const bool should_try_disable = mandatory_delta < 0 && prev_value == 1; + + if (should_try_enable) { + enable_mandatory_concurrency(lock); + } else if (should_try_disable) { + disable_mandatory_concurrency(lock); + } + } +} + +void thread_request_serializer_proxy::set_active_num_workers(int soft_limit) { + mutex_type::scoped_lock lock(my_mutex, /* is_write = */ true); + + if (soft_limit != 0) { + my_is_mandatory_concurrency_enabled = false; + my_serializer.set_active_num_workers(soft_limit); + } else { + if (my_num_mandatory_requests > 0 && !my_is_mandatory_concurrency_enabled) { + my_is_mandatory_concurrency_enabled = true; + my_serializer.set_active_num_workers(1); + } + } +} + +void thread_request_serializer_proxy::update(int delta) { my_serializer.update(delta); } + +void thread_request_serializer_proxy::enable_mandatory_concurrency(mutex_type::scoped_lock& lock) { + lock.upgrade_to_writer(); + bool still_should_enable = my_num_mandatory_requests.load(std::memory_order_relaxed) > 0 && + !my_is_mandatory_concurrency_enabled && my_serializer.is_no_workers_avaliable(); + + if (still_should_enable) { + my_is_mandatory_concurrency_enabled = true; + my_serializer.set_active_num_workers(1); + } +} + +void thread_request_serializer_proxy::disable_mandatory_concurrency(mutex_type::scoped_lock& lock) { + lock.upgrade_to_writer(); + bool still_should_disable = my_num_mandatory_requests.load(std::memory_order_relaxed) <= 0 && + my_is_mandatory_concurrency_enabled && !my_serializer.is_no_workers_avaliable(); + + if (still_should_disable) { + my_is_mandatory_concurrency_enabled = false; + my_serializer.set_active_num_workers(0); + } +} + +} // r1 +} // detail +} // tbb diff --git a/third_party/tbb/thread_request_serializer.h b/third_party/tbb/thread_request_serializer.h new file mode 100644 index 000000000..2c633853a --- /dev/null +++ b/third_party/tbb/thread_request_serializer.h @@ -0,0 +1,83 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_thread_serializer_handlers_H +#define _TBB_thread_serializer_handlers_H + +#include "third_party/tbb/mutex.h" +#include "third_party/tbb/rw_mutex.h" + +#include "third_party/tbb/thread_dispatcher.h" + +namespace tbb { +namespace detail { +namespace r1 { + +class thread_request_observer { +protected: + virtual ~thread_request_observer() {} +public: + virtual void update(int delta) = 0; +}; + + +class thread_request_serializer : public thread_request_observer { + using mutex_type = d1::mutex; +public: + thread_request_serializer(thread_dispatcher& td, int soft_limit); + void set_active_num_workers(int soft_limit); + bool is_no_workers_avaliable() { return my_soft_limit == 0; } + +private: + friend class thread_request_serializer_proxy; + void update(int delta) override; + static int limit_delta(int delta, int limit, int new_value); + + thread_dispatcher& my_thread_dispatcher; + int my_soft_limit{ 0 }; + int my_total_request{ 0 }; + // my_pending_delta is set to pending_delta_base to have ability to hold negative values + // consider increase base since thead number will be bigger than 1 << 15 + static constexpr std::uint64_t pending_delta_base = 1 << 15; + std::atomic my_pending_delta{ pending_delta_base }; + mutex_type my_mutex; +}; + +// Handles mandatory concurrency i.e. enables worker threads for enqueue tasks +class thread_request_serializer_proxy : public thread_request_observer { + using mutex_type = d1::rw_mutex; +public: + thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit); + void register_mandatory_request(int mandatory_delta); + void set_active_num_workers(int soft_limit); + +private: + void update(int delta) override; + void enable_mandatory_concurrency(mutex_type::scoped_lock& lock); + void disable_mandatory_concurrency(mutex_type::scoped_lock& lock); + + std::atomic my_num_mandatory_requests{0}; + bool my_is_mandatory_concurrency_enabled{false}; + thread_request_serializer my_serializer; + mutex_type my_mutex; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_thread_serializer_handlers_H diff --git a/third_party/tbb/threading_control.cpp b/third_party/tbb/threading_control.cpp new file mode 100644 index 000000000..9f48853ed --- /dev/null +++ b/third_party/tbb/threading_control.cpp @@ -0,0 +1,392 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/threading_control.h" +#include "third_party/tbb/permit_manager.h" +#include "third_party/tbb/market.h" +#include "third_party/tbb/thread_dispatcher.h" +#include "third_party/tbb/governor.h" +#include "third_party/tbb/thread_dispatcher_client.h" + +namespace tbb { +namespace detail { +namespace r1 { + +// ---------------------------------------- threading_control_impl -------------------------------------------------------------- + +std::size_t global_control_active_value_unsafe(d1::global_control::parameter); + +std::pair threading_control_impl::calculate_workers_limits() { + // Expecting that 4P is suitable for most applications. + // Limit to 2P for large thread number. + // TODO: ask RML for max concurrency and possibly correct hard_limit + unsigned factor = governor::default_num_threads() <= 128 ? 4 : 2; + + // The requested number of threads is intentionally not considered in + // computation of the hard limit, in order to separate responsibilities + // and avoid complicated interactions between global_control and task_scheduler_init. + // The threading control guarantees that at least 256 threads might be created. + unsigned workers_app_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism); + unsigned workers_hard_limit = max(max(factor * governor::default_num_threads(), 256u), workers_app_limit); + unsigned workers_soft_limit = calc_workers_soft_limit(workers_hard_limit); + + return std::make_pair(workers_soft_limit, workers_hard_limit); +} + +unsigned threading_control_impl::calc_workers_soft_limit(unsigned workers_hard_limit) { + unsigned workers_soft_limit{}; + unsigned soft_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism); + + // if user set no limits (yet), use default value + workers_soft_limit = soft_limit != 0 ? 
+
+    if (workers_soft_limit >= workers_hard_limit) {
+        workers_soft_limit = workers_hard_limit - 1;
+    }
+
+    return workers_soft_limit;
+}
+
+cache_aligned_unique_ptr<permit_manager> threading_control_impl::make_permit_manager(unsigned workers_soft_limit) {
+    return make_cache_aligned_unique<market>(workers_soft_limit);
+}
+
+cache_aligned_unique_ptr<thread_dispatcher> threading_control_impl::make_thread_dispatcher(threading_control& tc,
+                                                                                           unsigned workers_soft_limit,
+                                                                                           unsigned workers_hard_limit)
+{
+    stack_size_type stack_size = global_control_active_value_unsafe(global_control::thread_stack_size);
+
+    cache_aligned_unique_ptr<thread_dispatcher> td =
+        make_cache_aligned_unique<thread_dispatcher>(tc, workers_hard_limit, stack_size);
+    // This check relies on the fact that for shared RML default_concurrency == max_concurrency
+    if (!governor::UsePrivateRML && td->my_server->default_concurrency() < workers_soft_limit) {
+        runtime_warning("RML might limit the number of workers to %u while %u is requested.\n",
+                        td->my_server->default_concurrency(), workers_soft_limit);
+    }
+
+    return td;
+}
+
+threading_control_impl::threading_control_impl(threading_control* tc) {
+    unsigned workers_soft_limit{}, workers_hard_limit{};
+    std::tie(workers_soft_limit, workers_hard_limit) = calculate_workers_limits();
+
+    my_permit_manager = make_permit_manager(workers_soft_limit);
+    my_thread_dispatcher = make_thread_dispatcher(*tc, workers_soft_limit, workers_hard_limit);
+    my_thread_request_serializer =
+        make_cache_aligned_unique<thread_request_serializer_proxy>(*my_thread_dispatcher, workers_soft_limit);
+    my_permit_manager->set_thread_request_observer(*my_thread_request_serializer);
+
+    my_cancellation_disseminator = make_cache_aligned_unique<cancellation_disseminator>();
+    my_waiting_threads_monitor = make_cache_aligned_unique<thread_control_monitor>();
+}
+
+void threading_control_impl::release(bool blocking_terminate) {
+    my_thread_dispatcher->release(blocking_terminate);
+}
+
+void threading_control_impl::set_active_num_workers(unsigned soft_limit) {
+    __TBB_ASSERT(soft_limit <= my_thread_dispatcher->my_num_workers_hard_limit, nullptr);
+    my_thread_request_serializer->set_active_num_workers(soft_limit);
+    my_permit_manager->set_active_num_workers(soft_limit);
+}
+
+threading_control_client threading_control_impl::create_client(arena& a) {
+    pm_client* pm_client = my_permit_manager->create_client(a);
+    thread_dispatcher_client* td_client = my_thread_dispatcher->create_client(a);
+
+    return threading_control_client{pm_client, td_client};
+}
+
+threading_control_impl::client_snapshot threading_control_impl::prepare_client_destruction(threading_control_client client) {
+    auto td_client = client.get_thread_dispatcher_client();
+    return {td_client->get_aba_epoch(), td_client->priority_level(), td_client, client.get_pm_client()};
+}
+
+bool threading_control_impl::try_destroy_client(threading_control_impl::client_snapshot snapshot) {
+    if (my_thread_dispatcher->try_unregister_client(snapshot.my_td_client, snapshot.aba_epoch, snapshot.priority_level)) {
+        my_permit_manager->unregister_and_destroy_client(*snapshot.my_pm_client);
+        return true;
+    }
+    return false;
+}
+
+void threading_control_impl::publish_client(threading_control_client tc_client) {
+    my_permit_manager->register_client(tc_client.get_pm_client());
+    my_thread_dispatcher->register_client(tc_client.get_thread_dispatcher_client());
+}
+
+void threading_control_impl::register_thread(thread_data& td) {
+    my_cancellation_disseminator->register_thread(td);
+}
+void threading_control_impl::unregister_thread(thread_data& td) {
my_cancellation_disseminator->unregister_thread(td); +} + +void threading_control_impl::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state) +{ + my_cancellation_disseminator->propagate_task_group_state(mptr_state, src, new_state); +} + +std::size_t threading_control_impl::worker_stack_size() { + return my_thread_dispatcher->worker_stack_size(); +} + +unsigned threading_control_impl::max_num_workers() { + return my_thread_dispatcher->my_num_workers_hard_limit; +} + +void threading_control_impl::adjust_demand(threading_control_client tc_client, int mandatory_delta, int workers_delta) { + auto& c = *tc_client.get_pm_client(); + my_thread_request_serializer->register_mandatory_request(mandatory_delta); + my_permit_manager->adjust_demand(c, mandatory_delta, workers_delta); +} + +thread_control_monitor& threading_control_impl::get_waiting_threads_monitor() { + return *my_waiting_threads_monitor; +} + +// ---------------------------------------- threading_control ------------------------------------------------------------------- + +// Defined in global_control.cpp +void global_control_lock(); +void global_control_unlock(); + +void threading_control::add_ref(bool is_public) { + ++my_ref_count; + if (is_public) { + my_public_ref_count++; + } +} + +bool threading_control::remove_ref(bool is_public) { + if (is_public) { + __TBB_ASSERT(g_threading_control == this, "Global threading control instance was destroyed prematurely?"); + __TBB_ASSERT(my_public_ref_count.load(std::memory_order_relaxed), nullptr); + --my_public_ref_count; + } + + bool is_last_ref = --my_ref_count == 0; + if (is_last_ref) { + __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), nullptr); + g_threading_control = nullptr; + } + + return is_last_ref; +} + +threading_control* threading_control::get_threading_control(bool is_public) { + threading_control* control = g_threading_control; + if (control) { + control->add_ref(is_public); + } + + return control; +} + +threading_control* threading_control::create_threading_control() { + // Global control should be locked before threading_control_impl + global_control_lock(); + + threading_control* thr_control{ nullptr }; + try_call([&] { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + + thr_control = get_threading_control(/*public = */ true); + if (thr_control == nullptr) { + thr_control = new (cache_aligned_allocate(sizeof(threading_control))) threading_control(/*public_ref = */ 1, /*private_ref = */ 1); + thr_control->my_pimpl = make_cache_aligned_unique(thr_control); + + __TBB_InitOnce::add_ref(); + + if (global_control_active_value_unsafe(global_control::scheduler_handle)) { + ++thr_control->my_public_ref_count; + ++thr_control->my_ref_count; + } + + g_threading_control = thr_control; + } + }).on_exception([&] { + global_control_unlock(); + + cache_aligned_deleter deleter{}; + deleter(thr_control); + }); + + global_control_unlock(); + return thr_control; +} + +void threading_control::destroy () { + cache_aligned_deleter deleter; + deleter(this); + __TBB_InitOnce::remove_ref(); +} + +void threading_control::wait_last_reference(global_mutex_type::scoped_lock& lock) { + while (my_public_ref_count.load(std::memory_order_relaxed) == 1 && my_ref_count.load(std::memory_order_relaxed) > 1) { + lock.release(); + // To guarantee that request_close_connection() is called by the last external thread, we need to wait till all + // references are released. 
Re-read my_public_ref_count to limit waiting if new external threads are created. + // Theoretically, new private references to the threading control can be added during waiting making it potentially + // endless. + // TODO: revise why the weak scheduler needs threading control's pointer and try to remove this wait. + // Note that the threading control should know about its schedulers for cancellation/exception/priority propagation, + // see e.g. task_group_context::cancel_group_execution() + while (my_public_ref_count.load(std::memory_order_acquire) == 1 && my_ref_count.load(std::memory_order_acquire) > 1) { + yield(); + } + lock.acquire(g_threading_control_mutex); + } +} + +bool threading_control::release(bool is_public, bool blocking_terminate) { + bool do_release = false; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + if (blocking_terminate) { + __TBB_ASSERT(is_public, "Only an object with a public reference can request the blocking terminate"); + wait_last_reference(lock); + } + do_release = remove_ref(is_public); + } + + if (do_release) { + __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), "No public references must remain if we remove the threading control."); + // inform RML that blocking termination is required + my_pimpl->release(blocking_terminate); + return blocking_terminate; + } + return false; +} + +threading_control::threading_control(unsigned public_ref, unsigned ref) : my_public_ref_count(public_ref), my_ref_count(ref) +{} + +threading_control* threading_control::register_public_reference() { + threading_control* control{nullptr}; + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + control = get_threading_control(/*public = */ true); + if (!control) { + // We are going to create threading_control_impl, we should acquire mutexes in right order + lock.release(); + control = create_threading_control(); + } + + return control; +} + +bool threading_control::unregister_public_reference(bool blocking_terminate) { + __TBB_ASSERT(g_threading_control, "Threading control should exist until last public reference"); + __TBB_ASSERT(g_threading_control->my_public_ref_count.load(std::memory_order_relaxed), nullptr); + return g_threading_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate); +} + +threading_control_client threading_control::create_client(arena& a) { + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + add_ref(/*public = */ false); + } + + return my_pimpl->create_client(a); +} + +void threading_control::publish_client(threading_control_client client) { + return my_pimpl->publish_client(client); +} + +threading_control::client_snapshot threading_control::prepare_client_destruction(threading_control_client client) { + return my_pimpl->prepare_client_destruction(client); +} + +bool threading_control::try_destroy_client(threading_control::client_snapshot deleter) { + bool res = my_pimpl->try_destroy_client(deleter); + if (res) { + release(/*public = */ false, /*blocking_terminate = */ false); + } + return res; +} + +void threading_control::set_active_num_workers(unsigned soft_limit) { + threading_control* thr_control = get_threading_control(/*public = */ false); + if (thr_control != nullptr) { + thr_control->my_pimpl->set_active_num_workers(soft_limit); + thr_control->release(/*is_public=*/false, /*blocking_terminate=*/false); + } +} + +bool threading_control::is_present() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return g_threading_control != 
nullptr; +} + +bool threading_control::register_lifetime_control() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return get_threading_control(/*public = */ true) != nullptr; +} + +bool threading_control::unregister_lifetime_control(bool blocking_terminate) { + threading_control* thr_control{nullptr}; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + thr_control = g_threading_control; + } + + bool released{true}; + if (thr_control) { + released = thr_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate); + } + + return released; +} + +void threading_control::register_thread(thread_data& td) { + my_pimpl->register_thread(td); +} + +void threading_control::unregister_thread(thread_data& td) { + my_pimpl->unregister_thread(td); +} + +void threading_control::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, + d1::task_group_context& src, uint32_t new_state) +{ + my_pimpl->propagate_task_group_state(mptr_state, src, new_state); +} + +std::size_t threading_control::worker_stack_size() { + return my_pimpl->worker_stack_size(); +} + +unsigned threading_control::max_num_workers() { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + return g_threading_control ? g_threading_control->my_pimpl->max_num_workers() : 0; +} + +void threading_control::adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta) { + my_pimpl->adjust_demand(client, mandatory_delta, workers_delta); +} + +thread_control_monitor& threading_control::get_waiting_threads_monitor() { + return my_pimpl->get_waiting_threads_monitor(); +} + +} // r1 +} // detail +} // tbb diff --git a/third_party/tbb/threading_control.h b/third_party/tbb/threading_control.h new file mode 100644 index 000000000..b42c9dd42 --- /dev/null +++ b/third_party/tbb/threading_control.h @@ -0,0 +1,153 @@ +// clang-format off +/* + Copyright (c) 2022-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+#ifndef _TBB_threading_control_H
+#define _TBB_threading_control_H
+
+#include "third_party/tbb/mutex.h"
+#include "third_party/tbb/global_control.h"
+
+#include "third_party/tbb/threading_control_client.h"
+#include "third_party/tbb/intrusive_list.h"
+#include "third_party/tbb/main.h"
+#include "third_party/tbb/permit_manager.h"
+#include "third_party/tbb/pm_client.h"
+#include "third_party/tbb/thread_dispatcher.h"
+#include "third_party/tbb/cancellation_disseminator.h"
+#include "third_party/tbb/thread_request_serializer.h"
+#include "third_party/tbb/scheduler_common.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class arena;
+class thread_data;
+
+class threading_control;
+
+class threading_control_impl {
+public:
+    threading_control_impl(threading_control*);
+
+public:
+    void release(bool blocking_terminate);
+
+    threading_control_client create_client(arena& a);
+    void publish_client(threading_control_client client);
+
+    struct client_snapshot {
+        std::uint64_t aba_epoch;
+        unsigned priority_level;
+        thread_dispatcher_client* my_td_client;
+        pm_client* my_pm_client;
+    };
+
+    client_snapshot prepare_client_destruction(threading_control_client client);
+    bool try_destroy_client(client_snapshot deleter);
+
+    void register_thread(thread_data& td);
+    void unregister_thread(thread_data& td);
+    void propagate_task_group_state(std::atomic<uint32_t> d1::task_group_context::*mptr_state,
+                                    d1::task_group_context& src, uint32_t new_state);
+
+    void set_active_num_workers(unsigned soft_limit);
+    std::size_t worker_stack_size();
+    unsigned max_num_workers();
+
+    void adjust_demand(threading_control_client, int mandatory_delta, int workers_delta);
+
+    thread_control_monitor& get_waiting_threads_monitor();
+
+private:
+    static unsigned calc_workers_soft_limit(unsigned workers_hard_limit);
+    static std::pair<unsigned, unsigned> calculate_workers_limits();
+    static cache_aligned_unique_ptr<permit_manager> make_permit_manager(unsigned workers_soft_limit);
+    static cache_aligned_unique_ptr<thread_dispatcher> make_thread_dispatcher(threading_control& control,
+                                                                              unsigned workers_soft_limit,
+                                                                              unsigned workers_hard_limit);
+
+    // TODO: Consider allocating one chunk of memory and constructing the objects in it
+    cache_aligned_unique_ptr<permit_manager> my_permit_manager{nullptr};
+    cache_aligned_unique_ptr<thread_dispatcher> my_thread_dispatcher{nullptr};
+    cache_aligned_unique_ptr<thread_request_serializer_proxy> my_thread_request_serializer{nullptr};
+    cache_aligned_unique_ptr<cancellation_disseminator> my_cancellation_disseminator{nullptr};
+    cache_aligned_unique_ptr<thread_control_monitor> my_waiting_threads_monitor{nullptr};
+};
+
+
+class threading_control {
+    using global_mutex_type = d1::mutex;
+public:
+    using client_snapshot = threading_control_impl::client_snapshot;
+
+    static threading_control* register_public_reference();
+    static bool unregister_public_reference(bool blocking_terminate);
+
+    static bool is_present();
+    static void set_active_num_workers(unsigned soft_limit);
+    static bool register_lifetime_control();
+    static bool unregister_lifetime_control(bool blocking_terminate);
+
+    threading_control_client create_client(arena& a);
+    void publish_client(threading_control_client client);
+    client_snapshot prepare_client_destruction(threading_control_client client);
+    bool try_destroy_client(client_snapshot deleter);
+
+    void register_thread(thread_data& td);
+    void unregister_thread(thread_data& td);
+    void propagate_task_group_state(std::atomic<uint32_t> d1::task_group_context::*mptr_state,
+                                    d1::task_group_context& src, uint32_t new_state);
+
+    std::size_t worker_stack_size();
+    static unsigned max_num_workers();
+
+    void adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta);
+
+    thread_control_monitor& get_waiting_threads_monitor();
+
+private:
+    threading_control(unsigned public_ref, unsigned ref);
+    void add_ref(bool is_public);
+    bool remove_ref(bool is_public);
+
+    static threading_control* get_threading_control(bool is_public);
+    static threading_control* create_threading_control();
+
+    bool release(bool is_public, bool blocking_terminate);
+    void wait_last_reference(global_mutex_type::scoped_lock& lock);
+    void destroy();
+
+    friend class thread_dispatcher;
+
+    static threading_control* g_threading_control;
+    //! Mutex guarding creation/destruction of g_threading_control, insertions/deletions in my_arenas, and cancellation propagation
+    static global_mutex_type g_threading_control_mutex;
+
+    cache_aligned_unique_ptr<threading_control_impl> my_pimpl{nullptr};
+    //! Count of external threads attached
+    std::atomic<unsigned> my_public_ref_count{0};
+    //! Reference count controlling threading_control object lifetime
+    std::atomic<unsigned> my_ref_count{0};
+};
+
+} // r1
+} // detail
+} // tbb
+
+
+#endif // _TBB_threading_control_H
diff --git a/third_party/tbb/threading_control_client.h b/third_party/tbb/threading_control_client.h
new file mode 100644
index 000000000..941a9de3f
--- /dev/null
+++ b/third_party/tbb/threading_control_client.h
@@ -0,0 +1,59 @@
+// clang-format off
+/*
+    Copyright (c) 2022-2023 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef _TBB_threading_control_client_H
+#define _TBB_threading_control_client_H
+
+#include "third_party/tbb/detail/_assert.h"
+
+namespace tbb {
+namespace detail {
+namespace r1 {
+
+class pm_client;
+class thread_dispatcher_client;
+
+class threading_control_client {
+public:
+    threading_control_client() = default;
+    threading_control_client(const threading_control_client&) = default;
+    threading_control_client& operator=(const threading_control_client&) = default;
+
+    threading_control_client(pm_client* p, thread_dispatcher_client* t) : my_pm_client(p), my_thread_dispatcher_client(t) {
+        __TBB_ASSERT(my_pm_client, nullptr);
+        __TBB_ASSERT(my_thread_dispatcher_client, nullptr);
+    }
+
+    pm_client* get_pm_client() {
+        return my_pm_client;
+    }
+
+    thread_dispatcher_client* get_thread_dispatcher_client() {
+        return my_thread_dispatcher_client;
+    }
+
+private:
+    pm_client* my_pm_client{nullptr};
+    thread_dispatcher_client* my_thread_dispatcher_client{nullptr};
+};
+
+
+}
+}
+}
+
+#endif // _TBB_threading_control_client_H
diff --git a/third_party/tbb/tick_count.h b/third_party/tbb/tick_count.h
new file mode 100644
index 000000000..37880a9c4
--- /dev/null
+++ b/third_party/tbb/tick_count.h
@@ -0,0 +1,100 @@
+// clang-format off
+/*
+    Copyright (c) 2005-2021 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+#ifndef __TBB_tick_count_H
+#define __TBB_tick_count_H
+
+#include "third_party/libcxx/chrono"
+
+#include "third_party/tbb/detail/_namespace_injection.h"
+
+namespace tbb {
+namespace detail {
+namespace d1 {
+
+
+//! Absolute timestamp
+/** @ingroup timing */
+class tick_count {
+public:
+    using clock_type = typename std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                                 std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
+
+    //! Relative time interval.
+    class interval_t : public clock_type::duration {
+    public:
+        //! Construct a time interval representing zero time duration
+        interval_t() : clock_type::duration(clock_type::duration::zero()) {}
+
+        //! Construct a time interval representing sec seconds time duration
+        explicit interval_t( double sec )
+            : clock_type::duration(std::chrono::duration_cast<clock_type::duration>(std::chrono::duration<double>(sec))) {}
+
+        //! Return the length of a time interval in seconds
+        double seconds() const {
+            return std::chrono::duration_cast<std::chrono::duration<double>>(*this).count();
+        }
+
+        //! Extract the intervals from the tick_counts and subtract them.
+        friend interval_t operator-( const tick_count& t1, const tick_count& t0 );
+
+        //! Add two intervals.
+        friend interval_t operator+( const interval_t& i, const interval_t& j ) {
+            return interval_t(std::chrono::operator+(i, j));
+        }
+
+        //! Subtract two intervals.
+        friend interval_t operator-( const interval_t& i, const interval_t& j ) {
+            return interval_t(std::chrono::operator-(i, j));
+        }
+
+    private:
+        explicit interval_t( clock_type::duration value_ ) : clock_type::duration(value_) {}
+    };
+
+    tick_count() = default;
+
+    //! Return current time.
+    static tick_count now() {
+        return clock_type::now();
+    }
+
+    //! Subtract two timestamps to get the time interval between them
+    friend interval_t operator-( const tick_count& t1, const tick_count& t0 ) {
+        return tick_count::interval_t(t1.my_time_point - t0.my_time_point);
+    }
+
+    //! Return the resolution of the clock in seconds per tick.
+    static double resolution() {
+        return static_cast<double>(interval_t::period::num) / interval_t::period::den;
+    }
+
+private:
+    clock_type::time_point my_time_point;
+    tick_count( clock_type::time_point tp ) : my_time_point(tp) {}
+};
+
+} // namespace d1
+} // namespace detail
+
+inline namespace v1 {
+    using detail::d1::tick_count;
+} // namespace v1
+
+} // namespace tbb
+
+#endif /* __TBB_tick_count_H */
diff --git a/third_party/tbb/tls.h b/third_party/tbb/tls.h
new file mode 100644
index 000000000..7a143a915
--- /dev/null
+++ b/third_party/tbb/tls.h
@@ -0,0 +1,103 @@
+// clang-format off
+/*
+    Copyright (c) 2005-2022 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/ + +#ifndef _TBB_tls_H +#define _TBB_tls_H + +#include "third_party/tbb/detail/_config.h" + +#if __TBB_USE_POSIX +#include "libc/calls/weirdtypes.h" +#include "libc/sysv/consts/clock.h" +#include "libc/thread/thread.h" +#include "libc/thread/thread2.h" +#else /* assume __TBB_USE_WINAPI */ +#include "libc/nt/accounting.h" +#include "libc/nt/automation.h" +#include "libc/nt/console.h" +#include "libc/nt/debug.h" +#include "libc/nt/dll.h" +#include "libc/nt/enum/keyaccess.h" +#include "libc/nt/enum/regtype.h" +#include "libc/nt/errors.h" +#include "libc/nt/events.h" +#include "libc/nt/files.h" +#include "libc/nt/ipc.h" +#include "libc/nt/memory.h" +#include "libc/nt/paint.h" +#include "libc/nt/process.h" +#include "libc/nt/registry.h" +#include "libc/nt/synchronization.h" +#include "libc/nt/thread.h" +#include "libc/nt/windows.h" +#include "libc/nt/winsock.h" +#endif + +namespace tbb { +namespace detail { +namespace r1 { + +typedef void (*tls_dtor_t)(void*); + +//! Basic cross-platform wrapper class for TLS operations. +template +class basic_tls { +#if __TBB_USE_POSIX + typedef pthread_key_t tls_key_t; +public: + int create( tls_dtor_t dtor = nullptr ) { + return pthread_key_create(&my_key, dtor); + } + int destroy() { return pthread_key_delete(my_key); } + void set( T value ) { pthread_setspecific(my_key, (void*)value); } + T get() { return (T)pthread_getspecific(my_key); } +#else /* __TBB_USE_WINAPI */ + typedef DWORD tls_key_t; +public: +#if !__TBB_WIN8UI_SUPPORT + int create() { + tls_key_t tmp = TlsAlloc(); + if( tmp==TLS_OUT_OF_INDEXES ) + return TLS_OUT_OF_INDEXES; + my_key = tmp; + return 0; + } + int destroy() { TlsFree(my_key); my_key=0; return 0; } + void set( T value ) { TlsSetValue(my_key, (LPVOID)value); } + T get() { return (T)TlsGetValue(my_key); } +#else /*!__TBB_WIN8UI_SUPPORT*/ + int create() { + tls_key_t tmp = FlsAlloc(nullptr); + if( tmp== (DWORD)0xFFFFFFFF ) + return (DWORD)0xFFFFFFFF; + my_key = tmp; + return 0; + } + int destroy() { FlsFree(my_key); my_key=0; return 0; } + void set( T value ) { FlsSetValue(my_key, (LPVOID)value); } + T get() { return (T)FlsGetValue(my_key); } +#endif /* !__TBB_WIN8UI_SUPPORT */ +#endif /* __TBB_USE_WINAPI */ +private: + tls_key_t my_key; +}; + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif /* _TBB_tls_H */ diff --git a/third_party/tbb/version.cpp b/third_party/tbb/version.cpp new file mode 100644 index 000000000..d86164b2b --- /dev/null +++ b/third_party/tbb/version.cpp @@ -0,0 +1,27 @@ +// clang-format off +/* + Copyright (c) 2020-2021 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "third_party/tbb/version.h" + +extern "C" int TBB_runtime_interface_version() { + return TBB_INTERFACE_VERSION; +} + +extern "C" const char* TBB_runtime_version() { + static const char version_str[] = TBB_VERSION_STRING; + return version_str; +} diff --git a/third_party/tbb/version.h b/third_party/tbb/version.h new file mode 100644 index 000000000..eae21b2c4 --- /dev/null +++ b/third_party/tbb/version.h @@ -0,0 +1,115 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_version_H +#define __TBB_version_H + +// Exclude all includes during .rc files compilation +#ifndef RC_INVOKED + #include "third_party/tbb/detail/_config.h" + #include "third_party/tbb/detail/_namespace_injection.h" +#else + #define __TBB_STRING_AUX(x) #x + #define __TBB_STRING(x) __TBB_STRING_AUX(x) +#endif + +// Product version +#define TBB_VERSION_MAJOR 2021 +// Update version +#define TBB_VERSION_MINOR 10 +// "Patch" version for custom releases +#define TBB_VERSION_PATCH 0 +// Suffix string +#define __TBB_VERSION_SUFFIX "" +// Full official version string +#define TBB_VERSION_STRING __TBB_STRING(TBB_VERSION_MAJOR) "." __TBB_STRING(TBB_VERSION_MINOR) __TBB_VERSION_SUFFIX + +// OneAPI oneTBB specification version +#define ONETBB_SPEC_VERSION "1.0" +// Full interface version +#define TBB_INTERFACE_VERSION 12100 +// Major interface version +#define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) +// Minor interface version +#define TBB_INTERFACE_VERSION_MINOR (TBB_INTERFACE_VERSION%1000/10) + +// The binary compatibility version +// To be used in SONAME, manifests, etc. +#define __TBB_BINARY_VERSION 12 + +//! 
TBB_VERSION support +#ifndef ENDL +#define ENDL "\n" +#endif + +//TBB_REVAMP_TODO: consider enabling version_string.ver generation +//TBB_REVAMP_TODO: // MISSING #include "version_string.ver" + +#define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" ONETBB_SPEC_VERSION ENDL +#define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING ENDL +#define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) ENDL + +#ifndef TBB_USE_DEBUG + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" ENDL +#elif TBB_USE_DEBUG==0 + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" ENDL +#elif TBB_USE_DEBUG==1 + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" ENDL +#elif TBB_USE_DEBUG==2 + #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" ENDL +#else + #error Unexpected value for TBB_USE_DEBUG +#endif + +#ifndef TBB_USE_ASSERT + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" ENDL +#elif TBB_USE_ASSERT==0 + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" ENDL +#elif TBB_USE_ASSERT==1 + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" ENDL +#elif TBB_USE_ASSERT==2 + #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" ENDL +#else + #error Unexpected value for TBB_USE_ASSERT +#endif + +#define TBB_VERSION_STRINGS_P(N) \ + __TBB_ONETBB_SPEC_VERSION(N) \ + __TBB_VERSION_NUMBER(N) \ + __TBB_INTERFACE_VERSION_NUMBER(N) \ + __TBB_VERSION_USE_DEBUG(N) \ + __TBB_VERSION_USE_ASSERT(N) + +#define TBB_VERSION_STRINGS TBB_VERSION_STRINGS_P(oneTBB) +#define TBBMALLOC_VERSION_STRINGS TBB_VERSION_STRINGS_P(TBBmalloc) + +//! The function returns the version string for the Intel(R) oneAPI Threading Building Blocks (oneTBB) +//! shared library being used. +/** + * The returned pointer is an address of a string in the shared library. + * It can be different than the TBB_VERSION_STRING obtained at compile time. + */ +extern "C" TBB_EXPORT const char* __TBB_EXPORTED_FUNC TBB_runtime_version(); + +//! The function returns the interface version of the oneTBB shared library being used. +/** + * The returned version is determined at runtime, not at compile/link time. + * It can be different than the value of TBB_INTERFACE_VERSION obtained at compile time. + */ +extern "C" TBB_EXPORT int __TBB_EXPORTED_FUNC TBB_runtime_interface_version(); + +#endif // __TBB_version_H diff --git a/third_party/tbb/waiters.h b/third_party/tbb/waiters.h new file mode 100644 index 000000000..d9ca0467b --- /dev/null +++ b/third_party/tbb/waiters.h @@ -0,0 +1,202 @@ +// clang-format off +/* + Copyright (c) 2005-2023 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_waiters_H +#define _TBB_waiters_H + +#include "third_party/tbb/detail/_task.h" +#include "third_party/tbb/scheduler_common.h" +#include "third_party/tbb/arena.h" +#include "third_party/tbb/threading_control.h" + +namespace tbb { +namespace detail { +namespace r1 { + +inline d1::task* get_self_recall_task(arena_slot& slot); + +class waiter_base { +public: + waiter_base(arena& a, int yields_multiplier = 1) : my_arena(a), my_backoff(int(a.my_num_slots), yields_multiplier) {} + + bool pause() { + if (my_backoff.pause()) { + my_arena.out_of_work(); + return true; + } + + return false; + } + + void reset_wait() { + my_backoff.reset_wait(); + } + +protected: + arena& my_arena; + stealing_loop_backoff my_backoff; +}; + +class outermost_worker_waiter : public waiter_base { +public: + using waiter_base::waiter_base; + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + + if (is_worker_should_leave(slot)) { + // Leave dispatch loop + return false; + } + + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot&) { + waiter_base::pause(); + } + + + d1::wait_context* wait_ctx() { + return nullptr; + } + + static bool postpone_execution(d1::task&) { + return false; + } + +private: + using base_type = waiter_base; + + bool is_worker_should_leave(arena_slot& slot) const { + bool is_top_priority_arena = my_arena.is_top_priority(); + bool is_task_pool_empty = slot.task_pool.load(std::memory_order_relaxed) == EmptyTaskPool; + + if (is_top_priority_arena) { + // Worker in most priority arena do not leave arena, until all work in task_pool is done + if (is_task_pool_empty && my_arena.is_recall_requested()) { + return true; + } + } else { + if (my_arena.is_recall_requested()) { + // If worker has work in task pool, we must notify other threads, + // because can appear missed wake up of other threads + if (!is_task_pool_empty) { + my_arena.advertise_new_work(); + } + return true; + } + } + + return false; + } +}; + +class sleep_waiter : public waiter_base { +protected: + using waiter_base::waiter_base; + + template + void sleep(std::uintptr_t uniq_tag, Pred wakeup_condition) { + my_arena.get_waiting_threads_monitor().wait(wakeup_condition, + market_context{uniq_tag, &my_arena}); + } +}; + +class external_waiter : public sleep_waiter { +public: + external_waiter(arena& a, d1::wait_context& wo) + : sleep_waiter(a, /*yields_multiplier*/10), my_wait_ctx(wo) + {} + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + if (!my_wait_ctx.continue_execution()) + return false; + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot&) { + if (!sleep_waiter::pause()) { + return; + } + + auto wakeup_condition = [&] { return !my_arena.is_empty() || !my_wait_ctx.continue_execution(); }; + + sleep(std::uintptr_t(&my_wait_ctx), wakeup_condition); + my_backoff.reset_wait(); + } + + d1::wait_context* wait_ctx() { + return &my_wait_ctx; + } + + static bool postpone_execution(d1::task&) { + return false; + } + +private: + d1::wait_context& my_wait_ctx; +}; + +#if __TBB_RESUMABLE_TASKS + +class coroutine_waiter : public sleep_waiter { +public: + using sleep_waiter::sleep_waiter; + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot& slot) { + if (!sleep_waiter::pause()) { + return; + } + + suspend_point_type* sp = 
slot.default_task_dispatcher().m_suspend_point; + + auto wakeup_condition = [&] { return !my_arena.is_empty() || sp->m_is_owner_recalled.load(std::memory_order_relaxed); }; + + sleep(std::uintptr_t(sp), wakeup_condition); + my_backoff.reset_wait(); + } + + void reset_wait() { + my_backoff.reset_wait(); + } + + d1::wait_context* wait_ctx() { + return nullptr; + } + + static bool postpone_execution(d1::task& t) { + return task_accessor::is_resume_task(t); + } +}; + +#endif // __TBB_RESUMABLE_TASKS + +} // namespace r1 +} // namespace detail +} // namespace tbb + +#endif // _TBB_waiters_H diff --git a/third_party/third_party.mk b/third_party/third_party.mk index 29062ce2f..56295ab3d 100644 --- a/third_party/third_party.mk +++ b/third_party/third_party.mk @@ -17,6 +17,7 @@ o/$(MODE)/third_party: \ o/$(MODE)/third_party/radpajama \ o/$(MODE)/third_party/hiredis \ o/$(MODE)/third_party/libcxx \ + o/$(MODE)/third_party/tbb \ o/$(MODE)/third_party/linenoise \ o/$(MODE)/third_party/lua \ o/$(MODE)/third_party/lz4cli \
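
Notes on the added sources, with small illustrative C++ sketches.

The pending_delta_base constant in thread_request_serializer.h biases a signed running delta so it can be stored in an unsigned atomic. The sketch below shows only that encoding idea; the names (delta_base, biased_delta, add_delta, take_pending) and the exchange-based drain are illustrative and are not the library's implementation.

#include <atomic>
#include <cstdint>

// Offset-encode a signed delta in an unsigned atomic (illustrative only).
constexpr std::uint64_t delta_base = 1 << 15;
std::atomic<std::uint64_t> biased_delta{delta_base};

void add_delta(int delta) {
    // Unsigned wraparound makes this behave like signed addition.
    biased_delta.fetch_add(static_cast<std::uint64_t>(delta), std::memory_order_relaxed);
}

int take_pending() {
    // Swap back to the neutral base and recover the accumulated signed value.
    std::uint64_t raw = biased_delta.exchange(delta_base, std::memory_order_relaxed);
    return static_cast<int>(static_cast<std::int64_t>(raw) - static_cast<std::int64_t>(delta_base));
}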
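
The soft/hard worker-limit arithmetic in threading_control_impl::calculate_workers_limits() and calc_workers_soft_limit() is easier to follow outside the class. worker_limits below is a hypothetical standalone restatement: the machine concurrency and the active max_allowed_parallelism value are passed in as parameters instead of being read through governor and global_control.

#include <algorithm>
#include <utility>

// P is the default concurrency, app_limit the active max_allowed_parallelism (0 if unset).
std::pair<unsigned, unsigned> worker_limits(unsigned P, unsigned app_limit) {
    unsigned factor = P <= 128 ? 4 : 2;                     // 4P for small machines, 2P for large ones
    unsigned hard = std::max(std::max(factor * P, 256u), app_limit);
    unsigned soft = app_limit != 0 ? app_limit - 1 : P - 1; // soft limit counts workers, excluding the external thread
    if (soft >= hard) soft = hard - 1;
    return {soft, hard};
}
// e.g. worker_limits(16, 0) -> {15, 256}; worker_limits(16, 8) -> {7, 256}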
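
tick_count.h is the public timing helper; typical usage, assuming the vendored include path used throughout this patch:

#include <cstdio>
#include "third_party/tbb/tick_count.h"

int main() {
    tbb::tick_count t0 = tbb::tick_count::now();
    // ... code being timed ...
    tbb::tick_count t1 = tbb::tick_count::now();
    std::printf("elapsed: %.6f s (clock resolution: %g s)\n",
                (t1 - t0).seconds(), tbb::tick_count::resolution());
}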
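
basic_tls in tls.h hides pthread keys (POSIX) and TLS/FLS slots (Windows) behind one create/destroy/set/get surface. A minimal internal-style sketch; tls_example and the stored pointer are hypothetical, not library code.

#include "third_party/tbb/tls.h"

// Hypothetical per-thread slot; real r1 code owns such slots statically.
static tbb::detail::r1::basic_tls<void*> g_slot;

void tls_example(void* per_thread_value) {
    g_slot.create();               // allocate the key; returns 0 on success
    g_slot.set(per_thread_value);  // visible only to the calling thread
    void* seen = g_slot.get();     // reads this thread's value back
    (void)seen;
    g_slot.destroy();              // release the key once no thread needs it
}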
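
version.h separates the compile-time macros (TBB_VERSION_STRING, TBB_INTERFACE_VERSION) from the runtime queries implemented in version.cpp (TBB_runtime_version, TBB_runtime_interface_version); comparing the two is the usual way to detect a header/runtime mismatch.

#include <cstdio>
#include "third_party/tbb/version.h"

int main() {
    std::printf("headers: %s (interface %d)\n", TBB_VERSION_STRING, TBB_INTERFACE_VERSION);
    std::printf("runtime: %s (interface %d)\n",
                TBB_runtime_version(), TBB_runtime_interface_version());
    if (TBB_runtime_interface_version() < TBB_INTERFACE_VERSION) {
        std::printf("warning: the oneTBB runtime is older than the headers\n");
    }
    return 0;
}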